diff --git "a/checkpoint-3253/trainer_state.json" "b/checkpoint-3253/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3253/trainer_state.json" @@ -0,0 +1,22804 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3253, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003074085459575776, + "grad_norm": 1.457303762435913, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.9016, + "step": 1 + }, + { + "epoch": 0.0006148170919151552, + "grad_norm": 1.5361775159835815, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.9014, + "step": 2 + }, + { + "epoch": 0.0009222256378727329, + "grad_norm": 1.6036649942398071, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.9528, + "step": 3 + }, + { + "epoch": 0.0012296341838303104, + "grad_norm": 1.462001919746399, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.9287, + "step": 4 + }, + { + "epoch": 0.001537042729787888, + "grad_norm": 1.5293784141540527, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.9371, + "step": 5 + }, + { + "epoch": 0.0018444512757454657, + "grad_norm": 1.5982283353805542, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.9675, + "step": 6 + }, + { + "epoch": 0.002151859821703043, + "grad_norm": 1.5282213687896729, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.9326, + "step": 7 + }, + { + "epoch": 0.002459268367660621, + "grad_norm": 1.5651417970657349, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.9473, + "step": 8 + }, + { + "epoch": 0.0027666769136181985, + "grad_norm": 1.522950291633606, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.9261, + "step": 9 + }, + { + "epoch": 0.003074085459575776, + "grad_norm": 1.4554579257965088, + "learning_rate": 5.000000000000001e-07, + "loss": 0.937, + "step": 10 + }, + { + "epoch": 0.003381494005533354, + "grad_norm": 1.471447229385376, + "learning_rate": 5.5e-07, + "loss": 0.92, + "step": 11 + }, + { + "epoch": 0.0036889025514909315, + "grad_norm": 1.497208833694458, + "learning_rate": 6.000000000000001e-07, + "loss": 0.8931, + "step": 12 + }, + { + "epoch": 0.003996311097448509, + "grad_norm": 1.4178965091705322, + "learning_rate": 6.5e-07, + "loss": 0.9451, + "step": 13 + }, + { + "epoch": 0.004303719643406086, + "grad_norm": 1.3218611478805542, + "learning_rate": 7.000000000000001e-07, + "loss": 0.9254, + "step": 14 + }, + { + "epoch": 0.0046111281893636644, + "grad_norm": 1.31557035446167, + "learning_rate": 7.5e-07, + "loss": 0.9118, + "step": 15 + }, + { + "epoch": 0.004918536735321242, + "grad_norm": 1.1695120334625244, + "learning_rate": 8.000000000000001e-07, + "loss": 0.9213, + "step": 16 + }, + { + "epoch": 0.00522594528127882, + "grad_norm": 1.1750404834747314, + "learning_rate": 8.500000000000001e-07, + "loss": 0.927, + "step": 17 + }, + { + "epoch": 0.005533353827236397, + "grad_norm": 1.1545753479003906, + "learning_rate": 9.000000000000001e-07, + "loss": 0.8831, + "step": 18 + }, + { + "epoch": 0.005840762373193975, + "grad_norm": 1.1367976665496826, + "learning_rate": 9.500000000000001e-07, + "loss": 0.9318, + "step": 19 + }, + { + "epoch": 0.006148170919151552, + "grad_norm": 1.0088235139846802, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9004, + "step": 20 + }, + { + "epoch": 0.00645557946510913, + "grad_norm": 0.9942068457603455, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.8627, + "step": 21 + }, + { + "epoch": 0.006762988011066708, + "grad_norm": 0.9267295598983765, + "learning_rate": 1.1e-06, + "loss": 0.8556, + "step": 22 + }, + { + "epoch": 0.007070396557024286, + "grad_norm": 0.8020273447036743, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.863, + "step": 23 + }, + { + "epoch": 0.007377805102981863, + "grad_norm": 0.8134849071502686, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9018, + "step": 24 + }, + { + "epoch": 0.00768521364893944, + "grad_norm": 0.7958990931510925, + "learning_rate": 1.25e-06, + "loss": 0.8596, + "step": 25 + }, + { + "epoch": 0.007992622194897018, + "grad_norm": 0.7303382754325867, + "learning_rate": 1.3e-06, + "loss": 0.8831, + "step": 26 + }, + { + "epoch": 0.008300030740854596, + "grad_norm": 0.7153196334838867, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.8819, + "step": 27 + }, + { + "epoch": 0.008607439286812173, + "grad_norm": 0.6898184418678284, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.8653, + "step": 28 + }, + { + "epoch": 0.00891484783276975, + "grad_norm": 0.6401644349098206, + "learning_rate": 1.45e-06, + "loss": 0.8827, + "step": 29 + }, + { + "epoch": 0.009222256378727329, + "grad_norm": 0.6200055480003357, + "learning_rate": 1.5e-06, + "loss": 0.8864, + "step": 30 + }, + { + "epoch": 0.009529664924684907, + "grad_norm": 0.6081007719039917, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.8771, + "step": 31 + }, + { + "epoch": 0.009837073470642483, + "grad_norm": 0.5443252921104431, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.8601, + "step": 32 + }, + { + "epoch": 0.010144482016600061, + "grad_norm": 0.5112125873565674, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.8309, + "step": 33 + }, + { + "epoch": 0.01045189056255764, + "grad_norm": 0.479396253824234, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.8549, + "step": 34 + }, + { + "epoch": 0.010759299108515216, + "grad_norm": 0.4708899259567261, + "learning_rate": 1.75e-06, + "loss": 0.8214, + "step": 35 + }, + { + "epoch": 0.011066707654472794, + "grad_norm": 0.4567429721355438, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.8491, + "step": 36 + }, + { + "epoch": 0.011374116200430372, + "grad_norm": 0.419939786195755, + "learning_rate": 1.85e-06, + "loss": 0.8116, + "step": 37 + }, + { + "epoch": 0.01168152474638795, + "grad_norm": 0.4159710705280304, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.86, + "step": 38 + }, + { + "epoch": 0.011988933292345526, + "grad_norm": 0.3831232786178589, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.8207, + "step": 39 + }, + { + "epoch": 0.012296341838303105, + "grad_norm": 0.37178653478622437, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8123, + "step": 40 + }, + { + "epoch": 0.012603750384260683, + "grad_norm": 0.382523775100708, + "learning_rate": 2.05e-06, + "loss": 0.8274, + "step": 41 + }, + { + "epoch": 0.01291115893021826, + "grad_norm": 0.3845403492450714, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.851, + "step": 42 + }, + { + "epoch": 0.013218567476175837, + "grad_norm": 0.3395361304283142, + "learning_rate": 2.15e-06, + "loss": 0.8055, + "step": 43 + }, + { + "epoch": 0.013525976022133415, + "grad_norm": 0.3474522829055786, + "learning_rate": 2.2e-06, + "loss": 0.8205, + "step": 44 + }, + { + "epoch": 0.013833384568090993, + "grad_norm": 0.36530056595802307, + "learning_rate": 2.25e-06, + "loss": 0.809, + "step": 45 + }, + { + "epoch": 0.014140793114048571, + "grad_norm": 0.343164324760437, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.7845, + "step": 46 + }, + { + "epoch": 0.014448201660006148, + "grad_norm": 0.3243808150291443, + "learning_rate": 2.35e-06, + "loss": 0.8088, + "step": 47 + }, + { + "epoch": 0.014755610205963726, + "grad_norm": 0.30894219875335693, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.7901, + "step": 48 + }, + { + "epoch": 0.015063018751921304, + "grad_norm": 0.3177136480808258, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.7891, + "step": 49 + }, + { + "epoch": 0.01537042729787888, + "grad_norm": 0.3006742298603058, + "learning_rate": 2.5e-06, + "loss": 0.786, + "step": 50 + }, + { + "epoch": 0.01567783584383646, + "grad_norm": 0.282314658164978, + "learning_rate": 2.55e-06, + "loss": 0.7874, + "step": 51 + }, + { + "epoch": 0.015985244389794036, + "grad_norm": 0.280830055475235, + "learning_rate": 2.6e-06, + "loss": 0.7807, + "step": 52 + }, + { + "epoch": 0.016292652935751613, + "grad_norm": 0.27337926626205444, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.7897, + "step": 53 + }, + { + "epoch": 0.016600061481709193, + "grad_norm": 0.2719617784023285, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.783, + "step": 54 + }, + { + "epoch": 0.01690747002766677, + "grad_norm": 0.24970686435699463, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.7489, + "step": 55 + }, + { + "epoch": 0.017214878573624345, + "grad_norm": 0.26591891050338745, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7671, + "step": 56 + }, + { + "epoch": 0.017522287119581925, + "grad_norm": 0.24470220506191254, + "learning_rate": 2.85e-06, + "loss": 0.787, + "step": 57 + }, + { + "epoch": 0.0178296956655395, + "grad_norm": 0.24995489418506622, + "learning_rate": 2.9e-06, + "loss": 0.7832, + "step": 58 + }, + { + "epoch": 0.018137104211497078, + "grad_norm": 0.24040956795215607, + "learning_rate": 2.95e-06, + "loss": 0.7882, + "step": 59 + }, + { + "epoch": 0.018444512757454658, + "grad_norm": 0.238366961479187, + "learning_rate": 3e-06, + "loss": 0.7476, + "step": 60 + }, + { + "epoch": 0.018751921303412234, + "grad_norm": 0.24802784621715546, + "learning_rate": 3.05e-06, + "loss": 0.7767, + "step": 61 + }, + { + "epoch": 0.019059329849369814, + "grad_norm": 0.23141387104988098, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.7588, + "step": 62 + }, + { + "epoch": 0.01936673839532739, + "grad_norm": 0.23131798207759857, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.7449, + "step": 63 + }, + { + "epoch": 0.019674146941284967, + "grad_norm": 0.2310822755098343, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.7719, + "step": 64 + }, + { + "epoch": 0.019981555487242546, + "grad_norm": 0.23191697895526886, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7735, + "step": 65 + }, + { + "epoch": 0.020288964033200123, + "grad_norm": 0.24366876482963562, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.7997, + "step": 66 + }, + { + "epoch": 0.0205963725791577, + "grad_norm": 0.22773416340351105, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.7599, + "step": 67 + }, + { + "epoch": 0.02090378112511528, + "grad_norm": 0.22407405078411102, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.7638, + "step": 68 + }, + { + "epoch": 0.021211189671072855, + "grad_norm": 0.21921014785766602, + "learning_rate": 3.45e-06, + "loss": 0.7518, + "step": 69 + }, + { + "epoch": 0.02151859821703043, + "grad_norm": 0.2200075387954712, + "learning_rate": 3.5e-06, + "loss": 0.7869, + "step": 70 + }, + { + "epoch": 0.02182600676298801, + "grad_norm": 0.2149764746427536, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.7673, + "step": 71 + }, + { + "epoch": 0.022133415308945588, + "grad_norm": 0.22350521385669708, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.773, + "step": 72 + }, + { + "epoch": 0.022440823854903168, + "grad_norm": 0.21712292730808258, + "learning_rate": 3.65e-06, + "loss": 0.7978, + "step": 73 + }, + { + "epoch": 0.022748232400860744, + "grad_norm": 0.2297796756029129, + "learning_rate": 3.7e-06, + "loss": 0.7378, + "step": 74 + }, + { + "epoch": 0.02305564094681832, + "grad_norm": 0.20947284996509552, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7828, + "step": 75 + }, + { + "epoch": 0.0233630494927759, + "grad_norm": 0.22107581794261932, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7733, + "step": 76 + }, + { + "epoch": 0.023670458038733477, + "grad_norm": 0.20762985944747925, + "learning_rate": 3.85e-06, + "loss": 0.7696, + "step": 77 + }, + { + "epoch": 0.023977866584691053, + "grad_norm": 0.2052047997713089, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7671, + "step": 78 + }, + { + "epoch": 0.024285275130648633, + "grad_norm": 0.19900698959827423, + "learning_rate": 3.95e-06, + "loss": 0.7669, + "step": 79 + }, + { + "epoch": 0.02459268367660621, + "grad_norm": 0.18975047767162323, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7449, + "step": 80 + }, + { + "epoch": 0.02490009222256379, + "grad_norm": 0.2032550573348999, + "learning_rate": 4.05e-06, + "loss": 0.741, + "step": 81 + }, + { + "epoch": 0.025207500768521365, + "grad_norm": 0.18790468573570251, + "learning_rate": 4.1e-06, + "loss": 0.7491, + "step": 82 + }, + { + "epoch": 0.02551490931447894, + "grad_norm": 0.19185906648635864, + "learning_rate": 4.15e-06, + "loss": 0.7558, + "step": 83 + }, + { + "epoch": 0.02582231786043652, + "grad_norm": 0.20287489891052246, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.745, + "step": 84 + }, + { + "epoch": 0.026129726406394098, + "grad_norm": 0.20952974259853363, + "learning_rate": 4.25e-06, + "loss": 0.7515, + "step": 85 + }, + { + "epoch": 0.026437134952351674, + "grad_norm": 0.2009042650461197, + "learning_rate": 4.3e-06, + "loss": 0.7489, + "step": 86 + }, + { + "epoch": 0.026744543498309254, + "grad_norm": 0.19879287481307983, + "learning_rate": 4.350000000000001e-06, + "loss": 0.7556, + "step": 87 + }, + { + "epoch": 0.02705195204426683, + "grad_norm": 0.19895334541797638, + "learning_rate": 4.4e-06, + "loss": 0.7408, + "step": 88 + }, + { + "epoch": 0.027359360590224407, + "grad_norm": 0.19749175012111664, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7612, + "step": 89 + }, + { + "epoch": 0.027666769136181987, + "grad_norm": 0.20138433575630188, + "learning_rate": 4.5e-06, + "loss": 0.7486, + "step": 90 + }, + { + "epoch": 0.027974177682139563, + "grad_norm": 0.19165848195552826, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7656, + "step": 91 + }, + { + "epoch": 0.028281586228097143, + "grad_norm": 0.18589848279953003, + "learning_rate": 4.600000000000001e-06, + "loss": 0.7486, + "step": 92 + }, + { + "epoch": 0.02858899477405472, + "grad_norm": 0.17896020412445068, + "learning_rate": 4.65e-06, + "loss": 0.6951, + "step": 93 + }, + { + "epoch": 0.028896403320012296, + "grad_norm": 0.18091824650764465, + "learning_rate": 4.7e-06, + "loss": 0.7441, + "step": 94 + }, + { + "epoch": 0.029203811865969875, + "grad_norm": 0.2503315210342407, + "learning_rate": 4.75e-06, + "loss": 0.7527, + "step": 95 + }, + { + "epoch": 0.02951122041192745, + "grad_norm": 0.1880139261484146, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7398, + "step": 96 + }, + { + "epoch": 0.029818628957885028, + "grad_norm": 0.19291161000728607, + "learning_rate": 4.85e-06, + "loss": 0.7341, + "step": 97 + }, + { + "epoch": 0.030126037503842608, + "grad_norm": 0.19553613662719727, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7259, + "step": 98 + }, + { + "epoch": 0.030433446049800184, + "grad_norm": 0.18554477393627167, + "learning_rate": 4.95e-06, + "loss": 0.7439, + "step": 99 + }, + { + "epoch": 0.03074085459575776, + "grad_norm": 0.18373316526412964, + "learning_rate": 5e-06, + "loss": 0.7457, + "step": 100 + }, + { + "epoch": 0.03104826314171534, + "grad_norm": 0.20008496940135956, + "learning_rate": 4.999999867765304e-06, + "loss": 0.7333, + "step": 101 + }, + { + "epoch": 0.03135567168767292, + "grad_norm": 0.18038420379161835, + "learning_rate": 4.9999994710612285e-06, + "loss": 0.7493, + "step": 102 + }, + { + "epoch": 0.03166308023363049, + "grad_norm": 0.1933283507823944, + "learning_rate": 4.999998809887817e-06, + "loss": 0.7313, + "step": 103 + }, + { + "epoch": 0.03197048877958807, + "grad_norm": 0.197057843208313, + "learning_rate": 4.999997884245138e-06, + "loss": 0.7682, + "step": 104 + }, + { + "epoch": 0.03227789732554565, + "grad_norm": 0.1884010136127472, + "learning_rate": 4.9999966941332904e-06, + "loss": 0.7764, + "step": 105 + }, + { + "epoch": 0.032585305871503226, + "grad_norm": 0.1793089359998703, + "learning_rate": 4.999995239552399e-06, + "loss": 0.7254, + "step": 106 + }, + { + "epoch": 0.032892714417460805, + "grad_norm": 0.1775074303150177, + "learning_rate": 4.999993520502619e-06, + "loss": 0.7547, + "step": 107 + }, + { + "epoch": 0.033200122963418385, + "grad_norm": 0.19893310964107513, + "learning_rate": 4.999991536984131e-06, + "loss": 0.749, + "step": 108 + }, + { + "epoch": 0.03350753150937596, + "grad_norm": 0.1918499767780304, + "learning_rate": 4.999989288997147e-06, + "loss": 0.7404, + "step": 109 + }, + { + "epoch": 0.03381494005533354, + "grad_norm": 0.1843608170747757, + "learning_rate": 4.999986776541902e-06, + "loss": 0.7339, + "step": 110 + }, + { + "epoch": 0.03412234860129112, + "grad_norm": 0.18037264049053192, + "learning_rate": 4.999983999618663e-06, + "loss": 0.7097, + "step": 111 + }, + { + "epoch": 0.03442975714724869, + "grad_norm": 0.18377307057380676, + "learning_rate": 4.999980958227725e-06, + "loss": 0.7198, + "step": 112 + }, + { + "epoch": 0.03473716569320627, + "grad_norm": 0.1848795861005783, + "learning_rate": 4.999977652369407e-06, + "loss": 0.7494, + "step": 113 + }, + { + "epoch": 0.03504457423916385, + "grad_norm": 0.1871318370103836, + "learning_rate": 4.999974082044062e-06, + "loss": 0.7525, + "step": 114 + }, + { + "epoch": 0.03535198278512142, + "grad_norm": 0.19900010526180267, + "learning_rate": 4.999970247252065e-06, + "loss": 0.7215, + "step": 115 + }, + { + "epoch": 0.035659391331079, + "grad_norm": 0.1839558333158493, + "learning_rate": 4.9999661479938234e-06, + "loss": 0.7393, + "step": 116 + }, + { + "epoch": 0.03596679987703658, + "grad_norm": 0.18815268576145172, + "learning_rate": 4.999961784269769e-06, + "loss": 0.7431, + "step": 117 + }, + { + "epoch": 0.036274208422994156, + "grad_norm": 0.18640805780887604, + "learning_rate": 4.999957156080365e-06, + "loss": 0.7411, + "step": 118 + }, + { + "epoch": 0.036581616968951736, + "grad_norm": 0.1872325986623764, + "learning_rate": 4.999952263426101e-06, + "loss": 0.7129, + "step": 119 + }, + { + "epoch": 0.036889025514909315, + "grad_norm": 0.18295225501060486, + "learning_rate": 4.999947106307495e-06, + "loss": 0.7323, + "step": 120 + }, + { + "epoch": 0.037196434060866895, + "grad_norm": 0.20684652030467987, + "learning_rate": 4.99994168472509e-06, + "loss": 0.7144, + "step": 121 + }, + { + "epoch": 0.03750384260682447, + "grad_norm": 0.1919460892677307, + "learning_rate": 4.9999359986794616e-06, + "loss": 0.7455, + "step": 122 + }, + { + "epoch": 0.03781125115278205, + "grad_norm": 0.18495211005210876, + "learning_rate": 4.9999300481712115e-06, + "loss": 0.7198, + "step": 123 + }, + { + "epoch": 0.03811865969873963, + "grad_norm": 0.18639351427555084, + "learning_rate": 4.9999238332009684e-06, + "loss": 0.7401, + "step": 124 + }, + { + "epoch": 0.0384260682446972, + "grad_norm": 0.1902371495962143, + "learning_rate": 4.9999173537693904e-06, + "loss": 0.7137, + "step": 125 + }, + { + "epoch": 0.03873347679065478, + "grad_norm": 0.18160541355609894, + "learning_rate": 4.999910609877162e-06, + "loss": 0.7369, + "step": 126 + }, + { + "epoch": 0.03904088533661236, + "grad_norm": 0.20006270706653595, + "learning_rate": 4.999903601524998e-06, + "loss": 0.7371, + "step": 127 + }, + { + "epoch": 0.03934829388256993, + "grad_norm": 0.19111914932727814, + "learning_rate": 4.999896328713638e-06, + "loss": 0.7386, + "step": 128 + }, + { + "epoch": 0.03965570242852751, + "grad_norm": 0.1869526505470276, + "learning_rate": 4.999888791443854e-06, + "loss": 0.7429, + "step": 129 + }, + { + "epoch": 0.03996311097448509, + "grad_norm": 0.18547074496746063, + "learning_rate": 4.99988098971644e-06, + "loss": 0.7449, + "step": 130 + }, + { + "epoch": 0.040270519520442666, + "grad_norm": 0.18702180683612823, + "learning_rate": 4.999872923532223e-06, + "loss": 0.7363, + "step": 131 + }, + { + "epoch": 0.040577928066400246, + "grad_norm": 0.18783526122570038, + "learning_rate": 4.9998645928920575e-06, + "loss": 0.7102, + "step": 132 + }, + { + "epoch": 0.040885336612357825, + "grad_norm": 0.1872801035642624, + "learning_rate": 4.999855997796823e-06, + "loss": 0.7333, + "step": 133 + }, + { + "epoch": 0.0411927451583154, + "grad_norm": 0.18505430221557617, + "learning_rate": 4.999847138247429e-06, + "loss": 0.74, + "step": 134 + }, + { + "epoch": 0.04150015370427298, + "grad_norm": 0.18984299898147583, + "learning_rate": 4.999838014244814e-06, + "loss": 0.7472, + "step": 135 + }, + { + "epoch": 0.04180756225023056, + "grad_norm": 0.1857382208108902, + "learning_rate": 4.999828625789942e-06, + "loss": 0.7267, + "step": 136 + }, + { + "epoch": 0.04211497079618813, + "grad_norm": 0.18363380432128906, + "learning_rate": 4.999818972883807e-06, + "loss": 0.7326, + "step": 137 + }, + { + "epoch": 0.04242237934214571, + "grad_norm": 0.19250288605690002, + "learning_rate": 4.999809055527428e-06, + "loss": 0.7287, + "step": 138 + }, + { + "epoch": 0.04272978788810329, + "grad_norm": 0.19416747987270355, + "learning_rate": 4.999798873721857e-06, + "loss": 0.7227, + "step": 139 + }, + { + "epoch": 0.04303719643406086, + "grad_norm": 0.18027730286121368, + "learning_rate": 4.99978842746817e-06, + "loss": 0.7391, + "step": 140 + }, + { + "epoch": 0.04334460498001844, + "grad_norm": 0.18488667905330658, + "learning_rate": 4.999777716767472e-06, + "loss": 0.699, + "step": 141 + }, + { + "epoch": 0.04365201352597602, + "grad_norm": 0.18667159974575043, + "learning_rate": 4.999766741620896e-06, + "loss": 0.7443, + "step": 142 + }, + { + "epoch": 0.0439594220719336, + "grad_norm": 0.18831080198287964, + "learning_rate": 4.999755502029603e-06, + "loss": 0.7324, + "step": 143 + }, + { + "epoch": 0.044266830617891176, + "grad_norm": 0.1966133713722229, + "learning_rate": 4.999743997994782e-06, + "loss": 0.7191, + "step": 144 + }, + { + "epoch": 0.044574239163848756, + "grad_norm": 0.194343701004982, + "learning_rate": 4.9997322295176495e-06, + "loss": 0.7404, + "step": 145 + }, + { + "epoch": 0.044881647709806335, + "grad_norm": 0.1867935061454773, + "learning_rate": 4.999720196599453e-06, + "loss": 0.7005, + "step": 146 + }, + { + "epoch": 0.04518905625576391, + "grad_norm": 0.1914404183626175, + "learning_rate": 4.999707899241462e-06, + "loss": 0.7275, + "step": 147 + }, + { + "epoch": 0.04549646480172149, + "grad_norm": 0.18376590311527252, + "learning_rate": 4.999695337444979e-06, + "loss": 0.7319, + "step": 148 + }, + { + "epoch": 0.04580387334767907, + "grad_norm": 0.18899570405483246, + "learning_rate": 4.9996825112113325e-06, + "loss": 0.744, + "step": 149 + }, + { + "epoch": 0.04611128189363664, + "grad_norm": 0.18382610380649567, + "learning_rate": 4.999669420541879e-06, + "loss": 0.7105, + "step": 150 + }, + { + "epoch": 0.04641869043959422, + "grad_norm": 0.18699327111244202, + "learning_rate": 4.999656065438006e-06, + "loss": 0.7504, + "step": 151 + }, + { + "epoch": 0.0467260989855518, + "grad_norm": 0.18894201517105103, + "learning_rate": 4.999642445901123e-06, + "loss": 0.71, + "step": 152 + }, + { + "epoch": 0.04703350753150937, + "grad_norm": 0.19141949713230133, + "learning_rate": 4.999628561932672e-06, + "loss": 0.7369, + "step": 153 + }, + { + "epoch": 0.04734091607746695, + "grad_norm": 0.19198447465896606, + "learning_rate": 4.999614413534123e-06, + "loss": 0.738, + "step": 154 + }, + { + "epoch": 0.04764832462342453, + "grad_norm": 0.18844984471797943, + "learning_rate": 4.99960000070697e-06, + "loss": 0.7392, + "step": 155 + }, + { + "epoch": 0.047955733169382106, + "grad_norm": 0.1971272975206375, + "learning_rate": 4.9995853234527406e-06, + "loss": 0.6737, + "step": 156 + }, + { + "epoch": 0.048263141715339686, + "grad_norm": 0.1985895335674286, + "learning_rate": 4.999570381772985e-06, + "loss": 0.7204, + "step": 157 + }, + { + "epoch": 0.048570550261297266, + "grad_norm": 0.18716715276241302, + "learning_rate": 4.999555175669285e-06, + "loss": 0.7243, + "step": 158 + }, + { + "epoch": 0.04887795880725484, + "grad_norm": 0.1926867663860321, + "learning_rate": 4.999539705143251e-06, + "loss": 0.7242, + "step": 159 + }, + { + "epoch": 0.04918536735321242, + "grad_norm": 0.21050061285495758, + "learning_rate": 4.9995239701965155e-06, + "loss": 0.7404, + "step": 160 + }, + { + "epoch": 0.04949277589917, + "grad_norm": 0.19631873071193695, + "learning_rate": 4.999507970830746e-06, + "loss": 0.713, + "step": 161 + }, + { + "epoch": 0.04980018444512758, + "grad_norm": 0.19615758955478668, + "learning_rate": 4.999491707047634e-06, + "loss": 0.7178, + "step": 162 + }, + { + "epoch": 0.05010759299108515, + "grad_norm": 0.189536914229393, + "learning_rate": 4.999475178848901e-06, + "loss": 0.7114, + "step": 163 + }, + { + "epoch": 0.05041500153704273, + "grad_norm": 0.19022732973098755, + "learning_rate": 4.999458386236296e-06, + "loss": 0.7292, + "step": 164 + }, + { + "epoch": 0.05072241008300031, + "grad_norm": 0.1974242925643921, + "learning_rate": 4.9994413292115925e-06, + "loss": 0.7421, + "step": 165 + }, + { + "epoch": 0.05102981862895788, + "grad_norm": 0.1984829157590866, + "learning_rate": 4.999424007776597e-06, + "loss": 0.6965, + "step": 166 + }, + { + "epoch": 0.05133722717491546, + "grad_norm": 0.1951403170824051, + "learning_rate": 4.9994064219331426e-06, + "loss": 0.7274, + "step": 167 + }, + { + "epoch": 0.05164463572087304, + "grad_norm": 0.19128255546092987, + "learning_rate": 4.999388571683088e-06, + "loss": 0.7314, + "step": 168 + }, + { + "epoch": 0.051952044266830616, + "grad_norm": 0.19389702379703522, + "learning_rate": 4.999370457028323e-06, + "loss": 0.7004, + "step": 169 + }, + { + "epoch": 0.052259452812788196, + "grad_norm": 0.18810921907424927, + "learning_rate": 4.999352077970762e-06, + "loss": 0.7245, + "step": 170 + }, + { + "epoch": 0.052566861358745776, + "grad_norm": 0.19161537289619446, + "learning_rate": 4.999333434512351e-06, + "loss": 0.6885, + "step": 171 + }, + { + "epoch": 0.05287426990470335, + "grad_norm": 0.1903044730424881, + "learning_rate": 4.9993145266550615e-06, + "loss": 0.7043, + "step": 172 + }, + { + "epoch": 0.05318167845066093, + "grad_norm": 0.19858123362064362, + "learning_rate": 4.999295354400894e-06, + "loss": 0.7202, + "step": 173 + }, + { + "epoch": 0.05348908699661851, + "grad_norm": 0.20039589703083038, + "learning_rate": 4.999275917751877e-06, + "loss": 0.7239, + "step": 174 + }, + { + "epoch": 0.05379649554257608, + "grad_norm": 0.1931225210428238, + "learning_rate": 4.999256216710066e-06, + "loss": 0.7301, + "step": 175 + }, + { + "epoch": 0.05410390408853366, + "grad_norm": 0.1993560642004013, + "learning_rate": 4.9992362512775446e-06, + "loss": 0.7127, + "step": 176 + }, + { + "epoch": 0.05441131263449124, + "grad_norm": 0.22190482914447784, + "learning_rate": 4.999216021456427e-06, + "loss": 0.7245, + "step": 177 + }, + { + "epoch": 0.054718721180448814, + "grad_norm": 0.19294826686382294, + "learning_rate": 4.999195527248851e-06, + "loss": 0.724, + "step": 178 + }, + { + "epoch": 0.05502612972640639, + "grad_norm": 0.19454610347747803, + "learning_rate": 4.999174768656986e-06, + "loss": 0.7152, + "step": 179 + }, + { + "epoch": 0.05533353827236397, + "grad_norm": 0.1957622617483139, + "learning_rate": 4.999153745683028e-06, + "loss": 0.7011, + "step": 180 + }, + { + "epoch": 0.055640946818321546, + "grad_norm": 0.1923701912164688, + "learning_rate": 4.9991324583292e-06, + "loss": 0.6967, + "step": 181 + }, + { + "epoch": 0.055948355364279126, + "grad_norm": 0.2048516422510147, + "learning_rate": 4.999110906597754e-06, + "loss": 0.7007, + "step": 182 + }, + { + "epoch": 0.056255763910236706, + "grad_norm": 0.18447531759738922, + "learning_rate": 4.999089090490972e-06, + "loss": 0.6946, + "step": 183 + }, + { + "epoch": 0.056563172456194286, + "grad_norm": 0.20768792927265167, + "learning_rate": 4.9990670100111585e-06, + "loss": 0.7429, + "step": 184 + }, + { + "epoch": 0.05687058100215186, + "grad_norm": 0.21157075464725494, + "learning_rate": 4.999044665160651e-06, + "loss": 0.7173, + "step": 185 + }, + { + "epoch": 0.05717798954810944, + "grad_norm": 0.19840696454048157, + "learning_rate": 4.999022055941816e-06, + "loss": 0.7084, + "step": 186 + }, + { + "epoch": 0.05748539809406702, + "grad_norm": 0.20940692722797394, + "learning_rate": 4.99899918235704e-06, + "loss": 0.7277, + "step": 187 + }, + { + "epoch": 0.05779280664002459, + "grad_norm": 0.19930700957775116, + "learning_rate": 4.9989760444087474e-06, + "loss": 0.6947, + "step": 188 + }, + { + "epoch": 0.05810021518598217, + "grad_norm": 0.19949018955230713, + "learning_rate": 4.998952642099383e-06, + "loss": 0.7169, + "step": 189 + }, + { + "epoch": 0.05840762373193975, + "grad_norm": 0.21595625579357147, + "learning_rate": 4.998928975431423e-06, + "loss": 0.7129, + "step": 190 + }, + { + "epoch": 0.058715032277897324, + "grad_norm": 0.1975087970495224, + "learning_rate": 4.998905044407373e-06, + "loss": 0.6972, + "step": 191 + }, + { + "epoch": 0.0590224408238549, + "grad_norm": 0.19874757528305054, + "learning_rate": 4.998880849029761e-06, + "loss": 0.7068, + "step": 192 + }, + { + "epoch": 0.05932984936981248, + "grad_norm": 0.18920336663722992, + "learning_rate": 4.99885638930115e-06, + "loss": 0.7025, + "step": 193 + }, + { + "epoch": 0.059637257915770056, + "grad_norm": 0.19457679986953735, + "learning_rate": 4.998831665224126e-06, + "loss": 0.7034, + "step": 194 + }, + { + "epoch": 0.059944666461727636, + "grad_norm": 0.20264878869056702, + "learning_rate": 4.998806676801305e-06, + "loss": 0.709, + "step": 195 + }, + { + "epoch": 0.060252075007685216, + "grad_norm": 0.21136468648910522, + "learning_rate": 4.9987814240353304e-06, + "loss": 0.6971, + "step": 196 + }, + { + "epoch": 0.06055948355364279, + "grad_norm": 0.20135602355003357, + "learning_rate": 4.998755906928873e-06, + "loss": 0.7282, + "step": 197 + }, + { + "epoch": 0.06086689209960037, + "grad_norm": 0.19964660704135895, + "learning_rate": 4.9987301254846324e-06, + "loss": 0.7374, + "step": 198 + }, + { + "epoch": 0.06117430064555795, + "grad_norm": 0.20011897385120392, + "learning_rate": 4.998704079705336e-06, + "loss": 0.7141, + "step": 199 + }, + { + "epoch": 0.06148170919151552, + "grad_norm": 0.194240540266037, + "learning_rate": 4.998677769593741e-06, + "loss": 0.7005, + "step": 200 + }, + { + "epoch": 0.0617891177374731, + "grad_norm": 0.21476790308952332, + "learning_rate": 4.998651195152627e-06, + "loss": 0.7301, + "step": 201 + }, + { + "epoch": 0.06209652628343068, + "grad_norm": 0.20083045959472656, + "learning_rate": 4.998624356384808e-06, + "loss": 0.73, + "step": 202 + }, + { + "epoch": 0.062403934829388254, + "grad_norm": 0.19576038420200348, + "learning_rate": 4.9985972532931225e-06, + "loss": 0.7138, + "step": 203 + }, + { + "epoch": 0.06271134337534584, + "grad_norm": 0.19079318642616272, + "learning_rate": 4.998569885880438e-06, + "loss": 0.7051, + "step": 204 + }, + { + "epoch": 0.0630187519213034, + "grad_norm": 0.18989640474319458, + "learning_rate": 4.9985422541496485e-06, + "loss": 0.7409, + "step": 205 + }, + { + "epoch": 0.06332616046726099, + "grad_norm": 0.19702014327049255, + "learning_rate": 4.998514358103679e-06, + "loss": 0.7155, + "step": 206 + }, + { + "epoch": 0.06363356901321857, + "grad_norm": 0.19765838980674744, + "learning_rate": 4.998486197745478e-06, + "loss": 0.7233, + "step": 207 + }, + { + "epoch": 0.06394097755917615, + "grad_norm": 0.19480004906654358, + "learning_rate": 4.998457773078027e-06, + "loss": 0.684, + "step": 208 + }, + { + "epoch": 0.06424838610513373, + "grad_norm": 0.19725190103054047, + "learning_rate": 4.998429084104331e-06, + "loss": 0.7122, + "step": 209 + }, + { + "epoch": 0.0645557946510913, + "grad_norm": 0.2002919465303421, + "learning_rate": 4.998400130827427e-06, + "loss": 0.7049, + "step": 210 + }, + { + "epoch": 0.06486320319704887, + "grad_norm": 0.19753068685531616, + "learning_rate": 4.998370913250376e-06, + "loss": 0.7046, + "step": 211 + }, + { + "epoch": 0.06517061174300645, + "grad_norm": 0.1991903930902481, + "learning_rate": 4.99834143137627e-06, + "loss": 0.7116, + "step": 212 + }, + { + "epoch": 0.06547802028896403, + "grad_norm": 0.19603382050991058, + "learning_rate": 4.998311685208228e-06, + "loss": 0.7098, + "step": 213 + }, + { + "epoch": 0.06578542883492161, + "grad_norm": 0.20281796157360077, + "learning_rate": 4.998281674749396e-06, + "loss": 0.7214, + "step": 214 + }, + { + "epoch": 0.06609283738087919, + "grad_norm": 0.20258022844791412, + "learning_rate": 4.9982514000029485e-06, + "loss": 0.7183, + "step": 215 + }, + { + "epoch": 0.06640024592683677, + "grad_norm": 0.21211379766464233, + "learning_rate": 4.998220860972089e-06, + "loss": 0.7275, + "step": 216 + }, + { + "epoch": 0.06670765447279435, + "grad_norm": 0.19065549969673157, + "learning_rate": 4.9981900576600476e-06, + "loss": 0.7176, + "step": 217 + }, + { + "epoch": 0.06701506301875192, + "grad_norm": 0.20063215494155884, + "learning_rate": 4.998158990070083e-06, + "loss": 0.7282, + "step": 218 + }, + { + "epoch": 0.0673224715647095, + "grad_norm": 0.20626597106456757, + "learning_rate": 4.998127658205482e-06, + "loss": 0.7278, + "step": 219 + }, + { + "epoch": 0.06762988011066708, + "grad_norm": 0.204964742064476, + "learning_rate": 4.998096062069559e-06, + "loss": 0.6967, + "step": 220 + }, + { + "epoch": 0.06793728865662466, + "grad_norm": 0.2136119306087494, + "learning_rate": 4.9980642016656575e-06, + "loss": 0.7258, + "step": 221 + }, + { + "epoch": 0.06824469720258224, + "grad_norm": 0.20763279497623444, + "learning_rate": 4.998032076997146e-06, + "loss": 0.7016, + "step": 222 + }, + { + "epoch": 0.06855210574853982, + "grad_norm": 0.20279313623905182, + "learning_rate": 4.997999688067424e-06, + "loss": 0.7317, + "step": 223 + }, + { + "epoch": 0.06885951429449738, + "grad_norm": 0.20200158655643463, + "learning_rate": 4.997967034879918e-06, + "loss": 0.7002, + "step": 224 + }, + { + "epoch": 0.06916692284045496, + "grad_norm": 0.20631179213523865, + "learning_rate": 4.997934117438082e-06, + "loss": 0.7018, + "step": 225 + }, + { + "epoch": 0.06947433138641254, + "grad_norm": 0.2043829709291458, + "learning_rate": 4.997900935745398e-06, + "loss": 0.6965, + "step": 226 + }, + { + "epoch": 0.06978173993237012, + "grad_norm": 0.202076256275177, + "learning_rate": 4.997867489805377e-06, + "loss": 0.7155, + "step": 227 + }, + { + "epoch": 0.0700891484783277, + "grad_norm": 0.212909534573555, + "learning_rate": 4.997833779621556e-06, + "loss": 0.7108, + "step": 228 + }, + { + "epoch": 0.07039655702428528, + "grad_norm": 0.2053881585597992, + "learning_rate": 4.9977998051975025e-06, + "loss": 0.7278, + "step": 229 + }, + { + "epoch": 0.07070396557024285, + "grad_norm": 0.20602242648601532, + "learning_rate": 4.997765566536809e-06, + "loss": 0.7276, + "step": 230 + }, + { + "epoch": 0.07101137411620043, + "grad_norm": 0.21406063437461853, + "learning_rate": 4.9977310636431e-06, + "loss": 0.7221, + "step": 231 + }, + { + "epoch": 0.071318782662158, + "grad_norm": 0.2111855000257492, + "learning_rate": 4.997696296520023e-06, + "loss": 0.7094, + "step": 232 + }, + { + "epoch": 0.07162619120811559, + "grad_norm": 0.19595924019813538, + "learning_rate": 4.9976612651712565e-06, + "loss": 0.7095, + "step": 233 + }, + { + "epoch": 0.07193359975407317, + "grad_norm": 0.2113201916217804, + "learning_rate": 4.997625969600507e-06, + "loss": 0.7166, + "step": 234 + }, + { + "epoch": 0.07224100830003075, + "grad_norm": 0.20300433039665222, + "learning_rate": 4.997590409811508e-06, + "loss": 0.6901, + "step": 235 + }, + { + "epoch": 0.07254841684598831, + "grad_norm": 0.20471669733524323, + "learning_rate": 4.997554585808021e-06, + "loss": 0.7128, + "step": 236 + }, + { + "epoch": 0.07285582539194589, + "grad_norm": 0.20826661586761475, + "learning_rate": 4.997518497593836e-06, + "loss": 0.6996, + "step": 237 + }, + { + "epoch": 0.07316323393790347, + "grad_norm": 0.19737187027931213, + "learning_rate": 4.997482145172772e-06, + "loss": 0.7167, + "step": 238 + }, + { + "epoch": 0.07347064248386105, + "grad_norm": 0.21026720106601715, + "learning_rate": 4.997445528548672e-06, + "loss": 0.704, + "step": 239 + }, + { + "epoch": 0.07377805102981863, + "grad_norm": 0.20181673765182495, + "learning_rate": 4.997408647725412e-06, + "loss": 0.6895, + "step": 240 + }, + { + "epoch": 0.07408545957577621, + "grad_norm": 0.20916111767292023, + "learning_rate": 4.997371502706892e-06, + "loss": 0.682, + "step": 241 + }, + { + "epoch": 0.07439286812173379, + "grad_norm": 0.20303376019001007, + "learning_rate": 4.997334093497043e-06, + "loss": 0.71, + "step": 242 + }, + { + "epoch": 0.07470027666769136, + "grad_norm": 0.20897617936134338, + "learning_rate": 4.997296420099821e-06, + "loss": 0.7139, + "step": 243 + }, + { + "epoch": 0.07500768521364894, + "grad_norm": 0.2142588496208191, + "learning_rate": 4.997258482519211e-06, + "loss": 0.712, + "step": 244 + }, + { + "epoch": 0.07531509375960652, + "grad_norm": 0.20516176521778107, + "learning_rate": 4.997220280759229e-06, + "loss": 0.7221, + "step": 245 + }, + { + "epoch": 0.0756225023055641, + "grad_norm": 0.20913976430892944, + "learning_rate": 4.997181814823913e-06, + "loss": 0.7286, + "step": 246 + }, + { + "epoch": 0.07592991085152168, + "grad_norm": 0.20226675271987915, + "learning_rate": 4.997143084717334e-06, + "loss": 0.6664, + "step": 247 + }, + { + "epoch": 0.07623731939747926, + "grad_norm": 0.20316459238529205, + "learning_rate": 4.997104090443589e-06, + "loss": 0.7129, + "step": 248 + }, + { + "epoch": 0.07654472794343682, + "grad_norm": 0.20379343628883362, + "learning_rate": 4.997064832006803e-06, + "loss": 0.6643, + "step": 249 + }, + { + "epoch": 0.0768521364893944, + "grad_norm": 0.2102796584367752, + "learning_rate": 4.997025309411129e-06, + "loss": 0.7253, + "step": 250 + }, + { + "epoch": 0.07715954503535198, + "grad_norm": 0.21215298771858215, + "learning_rate": 4.996985522660748e-06, + "loss": 0.72, + "step": 251 + }, + { + "epoch": 0.07746695358130956, + "grad_norm": 0.20055678486824036, + "learning_rate": 4.996945471759869e-06, + "loss": 0.6899, + "step": 252 + }, + { + "epoch": 0.07777436212726714, + "grad_norm": 0.20982928574085236, + "learning_rate": 4.996905156712729e-06, + "loss": 0.694, + "step": 253 + }, + { + "epoch": 0.07808177067322472, + "grad_norm": 0.20185396075248718, + "learning_rate": 4.996864577523593e-06, + "loss": 0.7229, + "step": 254 + }, + { + "epoch": 0.07838917921918229, + "grad_norm": 0.2081146389245987, + "learning_rate": 4.996823734196753e-06, + "loss": 0.7147, + "step": 255 + }, + { + "epoch": 0.07869658776513987, + "grad_norm": 0.19822584092617035, + "learning_rate": 4.99678262673653e-06, + "loss": 0.7302, + "step": 256 + }, + { + "epoch": 0.07900399631109745, + "grad_norm": 0.20899280905723572, + "learning_rate": 4.996741255147274e-06, + "loss": 0.7269, + "step": 257 + }, + { + "epoch": 0.07931140485705503, + "grad_norm": 0.20852166414260864, + "learning_rate": 4.996699619433359e-06, + "loss": 0.713, + "step": 258 + }, + { + "epoch": 0.0796188134030126, + "grad_norm": 0.20775192975997925, + "learning_rate": 4.996657719599192e-06, + "loss": 0.717, + "step": 259 + }, + { + "epoch": 0.07992622194897019, + "grad_norm": 0.20397156476974487, + "learning_rate": 4.996615555649205e-06, + "loss": 0.678, + "step": 260 + }, + { + "epoch": 0.08023363049492777, + "grad_norm": 0.20643042027950287, + "learning_rate": 4.996573127587858e-06, + "loss": 0.7192, + "step": 261 + }, + { + "epoch": 0.08054103904088533, + "grad_norm": 0.20645011961460114, + "learning_rate": 4.996530435419639e-06, + "loss": 0.7175, + "step": 262 + }, + { + "epoch": 0.08084844758684291, + "grad_norm": 0.21231526136398315, + "learning_rate": 4.996487479149064e-06, + "loss": 0.7099, + "step": 263 + }, + { + "epoch": 0.08115585613280049, + "grad_norm": 0.20372924208641052, + "learning_rate": 4.9964442587806785e-06, + "loss": 0.6565, + "step": 264 + }, + { + "epoch": 0.08146326467875807, + "grad_norm": 0.21318647265434265, + "learning_rate": 4.996400774319054e-06, + "loss": 0.7026, + "step": 265 + }, + { + "epoch": 0.08177067322471565, + "grad_norm": 0.21542344987392426, + "learning_rate": 4.996357025768791e-06, + "loss": 0.6838, + "step": 266 + }, + { + "epoch": 0.08207808177067323, + "grad_norm": 0.2149612307548523, + "learning_rate": 4.996313013134517e-06, + "loss": 0.6981, + "step": 267 + }, + { + "epoch": 0.0823854903166308, + "grad_norm": 0.20526087284088135, + "learning_rate": 4.996268736420888e-06, + "loss": 0.7004, + "step": 268 + }, + { + "epoch": 0.08269289886258838, + "grad_norm": 0.19796745479106903, + "learning_rate": 4.99622419563259e-06, + "loss": 0.6943, + "step": 269 + }, + { + "epoch": 0.08300030740854596, + "grad_norm": 0.206628680229187, + "learning_rate": 4.996179390774331e-06, + "loss": 0.7022, + "step": 270 + }, + { + "epoch": 0.08330771595450354, + "grad_norm": 0.2070830762386322, + "learning_rate": 4.996134321850854e-06, + "loss": 0.6939, + "step": 271 + }, + { + "epoch": 0.08361512450046112, + "grad_norm": 0.20927581191062927, + "learning_rate": 4.996088988866925e-06, + "loss": 0.6994, + "step": 272 + }, + { + "epoch": 0.0839225330464187, + "grad_norm": 0.2084745466709137, + "learning_rate": 4.996043391827341e-06, + "loss": 0.6871, + "step": 273 + }, + { + "epoch": 0.08422994159237626, + "grad_norm": 0.20814822614192963, + "learning_rate": 4.995997530736924e-06, + "loss": 0.7406, + "step": 274 + }, + { + "epoch": 0.08453735013833384, + "grad_norm": 0.21422924101352692, + "learning_rate": 4.995951405600527e-06, + "loss": 0.6906, + "step": 275 + }, + { + "epoch": 0.08484475868429142, + "grad_norm": 0.21904635429382324, + "learning_rate": 4.995905016423029e-06, + "loss": 0.7114, + "step": 276 + }, + { + "epoch": 0.085152167230249, + "grad_norm": 0.21378853917121887, + "learning_rate": 4.995858363209337e-06, + "loss": 0.7025, + "step": 277 + }, + { + "epoch": 0.08545957577620658, + "grad_norm": 0.21804995834827423, + "learning_rate": 4.995811445964387e-06, + "loss": 0.7012, + "step": 278 + }, + { + "epoch": 0.08576698432216416, + "grad_norm": 0.20437245070934296, + "learning_rate": 4.995764264693141e-06, + "loss": 0.6746, + "step": 279 + }, + { + "epoch": 0.08607439286812173, + "grad_norm": 0.21647097170352936, + "learning_rate": 4.995716819400593e-06, + "loss": 0.696, + "step": 280 + }, + { + "epoch": 0.0863818014140793, + "grad_norm": 0.21909961104393005, + "learning_rate": 4.9956691100917585e-06, + "loss": 0.7144, + "step": 281 + }, + { + "epoch": 0.08668920996003689, + "grad_norm": 0.2308073788881302, + "learning_rate": 4.995621136771688e-06, + "loss": 0.7396, + "step": 282 + }, + { + "epoch": 0.08699661850599447, + "grad_norm": 0.21327826380729675, + "learning_rate": 4.995572899445453e-06, + "loss": 0.6941, + "step": 283 + }, + { + "epoch": 0.08730402705195205, + "grad_norm": 0.21016865968704224, + "learning_rate": 4.99552439811816e-06, + "loss": 0.7018, + "step": 284 + }, + { + "epoch": 0.08761143559790963, + "grad_norm": 0.21825113892555237, + "learning_rate": 4.9954756327949374e-06, + "loss": 0.7259, + "step": 285 + }, + { + "epoch": 0.0879188441438672, + "grad_norm": 0.2197452187538147, + "learning_rate": 4.995426603480944e-06, + "loss": 0.6803, + "step": 286 + }, + { + "epoch": 0.08822625268982477, + "grad_norm": 0.22126571834087372, + "learning_rate": 4.995377310181368e-06, + "loss": 0.7042, + "step": 287 + }, + { + "epoch": 0.08853366123578235, + "grad_norm": 0.22027850151062012, + "learning_rate": 4.995327752901424e-06, + "loss": 0.7126, + "step": 288 + }, + { + "epoch": 0.08884106978173993, + "grad_norm": 0.20716974139213562, + "learning_rate": 4.995277931646352e-06, + "loss": 0.7149, + "step": 289 + }, + { + "epoch": 0.08914847832769751, + "grad_norm": 0.21351520717144012, + "learning_rate": 4.995227846421425e-06, + "loss": 0.6661, + "step": 290 + }, + { + "epoch": 0.08945588687365509, + "grad_norm": 0.2097673863172531, + "learning_rate": 4.99517749723194e-06, + "loss": 0.6993, + "step": 291 + }, + { + "epoch": 0.08976329541961267, + "grad_norm": 0.21358217298984528, + "learning_rate": 4.995126884083225e-06, + "loss": 0.6864, + "step": 292 + }, + { + "epoch": 0.09007070396557024, + "grad_norm": 0.21718092262744904, + "learning_rate": 4.995076006980632e-06, + "loss": 0.7206, + "step": 293 + }, + { + "epoch": 0.09037811251152782, + "grad_norm": 0.23119401931762695, + "learning_rate": 4.995024865929546e-06, + "loss": 0.7071, + "step": 294 + }, + { + "epoch": 0.0906855210574854, + "grad_norm": 0.21738840639591217, + "learning_rate": 4.9949734609353746e-06, + "loss": 0.6704, + "step": 295 + }, + { + "epoch": 0.09099292960344298, + "grad_norm": 0.22249609231948853, + "learning_rate": 4.994921792003556e-06, + "loss": 0.7231, + "step": 296 + }, + { + "epoch": 0.09130033814940056, + "grad_norm": 0.21974460780620575, + "learning_rate": 4.994869859139558e-06, + "loss": 0.6475, + "step": 297 + }, + { + "epoch": 0.09160774669535814, + "grad_norm": 0.21733658015727997, + "learning_rate": 4.994817662348873e-06, + "loss": 0.7076, + "step": 298 + }, + { + "epoch": 0.0919151552413157, + "grad_norm": 0.21720989048480988, + "learning_rate": 4.994765201637023e-06, + "loss": 0.693, + "step": 299 + }, + { + "epoch": 0.09222256378727328, + "grad_norm": 0.21586193144321442, + "learning_rate": 4.994712477009558e-06, + "loss": 0.6926, + "step": 300 + }, + { + "epoch": 0.09252997233323086, + "grad_norm": 0.21305078268051147, + "learning_rate": 4.994659488472056e-06, + "loss": 0.7245, + "step": 301 + }, + { + "epoch": 0.09283738087918844, + "grad_norm": 0.20975805819034576, + "learning_rate": 4.99460623603012e-06, + "loss": 0.695, + "step": 302 + }, + { + "epoch": 0.09314478942514602, + "grad_norm": 0.2096678912639618, + "learning_rate": 4.994552719689387e-06, + "loss": 0.6581, + "step": 303 + }, + { + "epoch": 0.0934521979711036, + "grad_norm": 0.2132655829191208, + "learning_rate": 4.994498939455516e-06, + "loss": 0.7004, + "step": 304 + }, + { + "epoch": 0.09375960651706118, + "grad_norm": 0.22455942630767822, + "learning_rate": 4.994444895334198e-06, + "loss": 0.6654, + "step": 305 + }, + { + "epoch": 0.09406701506301875, + "grad_norm": 0.21497289836406708, + "learning_rate": 4.994390587331148e-06, + "loss": 0.6995, + "step": 306 + }, + { + "epoch": 0.09437442360897633, + "grad_norm": 0.21949337422847748, + "learning_rate": 4.994336015452114e-06, + "loss": 0.709, + "step": 307 + }, + { + "epoch": 0.0946818321549339, + "grad_norm": 0.2070390284061432, + "learning_rate": 4.9942811797028665e-06, + "loss": 0.667, + "step": 308 + }, + { + "epoch": 0.09498924070089149, + "grad_norm": 0.2129763811826706, + "learning_rate": 4.994226080089207e-06, + "loss": 0.7, + "step": 309 + }, + { + "epoch": 0.09529664924684907, + "grad_norm": 0.22160769999027252, + "learning_rate": 4.994170716616965e-06, + "loss": 0.6873, + "step": 310 + }, + { + "epoch": 0.09560405779280665, + "grad_norm": 0.21192388236522675, + "learning_rate": 4.994115089291998e-06, + "loss": 0.7016, + "step": 311 + }, + { + "epoch": 0.09591146633876421, + "grad_norm": 0.22130171954631805, + "learning_rate": 4.994059198120189e-06, + "loss": 0.6935, + "step": 312 + }, + { + "epoch": 0.09621887488472179, + "grad_norm": 0.22356827557086945, + "learning_rate": 4.9940030431074505e-06, + "loss": 0.6944, + "step": 313 + }, + { + "epoch": 0.09652628343067937, + "grad_norm": 0.22244198620319366, + "learning_rate": 4.993946624259724e-06, + "loss": 0.7002, + "step": 314 + }, + { + "epoch": 0.09683369197663695, + "grad_norm": 0.21720461547374725, + "learning_rate": 4.993889941582979e-06, + "loss": 0.7146, + "step": 315 + }, + { + "epoch": 0.09714110052259453, + "grad_norm": 0.21723859012126923, + "learning_rate": 4.99383299508321e-06, + "loss": 0.7019, + "step": 316 + }, + { + "epoch": 0.09744850906855211, + "grad_norm": 0.2164224237203598, + "learning_rate": 4.993775784766442e-06, + "loss": 0.7073, + "step": 317 + }, + { + "epoch": 0.09775591761450968, + "grad_norm": 0.2120506316423416, + "learning_rate": 4.993718310638726e-06, + "loss": 0.6756, + "step": 318 + }, + { + "epoch": 0.09806332616046726, + "grad_norm": 0.22142186760902405, + "learning_rate": 4.993660572706145e-06, + "loss": 0.6822, + "step": 319 + }, + { + "epoch": 0.09837073470642484, + "grad_norm": 0.2179284691810608, + "learning_rate": 4.993602570974803e-06, + "loss": 0.6941, + "step": 320 + }, + { + "epoch": 0.09867814325238242, + "grad_norm": 0.21641089022159576, + "learning_rate": 4.993544305450839e-06, + "loss": 0.6825, + "step": 321 + }, + { + "epoch": 0.09898555179834, + "grad_norm": 0.22381040453910828, + "learning_rate": 4.993485776140416e-06, + "loss": 0.6868, + "step": 322 + }, + { + "epoch": 0.09929296034429758, + "grad_norm": 0.2227148562669754, + "learning_rate": 4.993426983049724e-06, + "loss": 0.6852, + "step": 323 + }, + { + "epoch": 0.09960036889025516, + "grad_norm": 0.22471049427986145, + "learning_rate": 4.993367926184985e-06, + "loss": 0.6817, + "step": 324 + }, + { + "epoch": 0.09990777743621272, + "grad_norm": 0.2242165207862854, + "learning_rate": 4.993308605552445e-06, + "loss": 0.709, + "step": 325 + }, + { + "epoch": 0.1002151859821703, + "grad_norm": 0.2172999382019043, + "learning_rate": 4.99324902115838e-06, + "loss": 0.704, + "step": 326 + }, + { + "epoch": 0.10052259452812788, + "grad_norm": 0.2242281138896942, + "learning_rate": 4.993189173009092e-06, + "loss": 0.6823, + "step": 327 + }, + { + "epoch": 0.10083000307408546, + "grad_norm": 0.2219083160161972, + "learning_rate": 4.993129061110915e-06, + "loss": 0.7211, + "step": 328 + }, + { + "epoch": 0.10113741162004304, + "grad_norm": 0.2144426703453064, + "learning_rate": 4.993068685470206e-06, + "loss": 0.6725, + "step": 329 + }, + { + "epoch": 0.10144482016600062, + "grad_norm": 0.22659961879253387, + "learning_rate": 4.9930080460933516e-06, + "loss": 0.7065, + "step": 330 + }, + { + "epoch": 0.10175222871195819, + "grad_norm": 0.21369965374469757, + "learning_rate": 4.992947142986768e-06, + "loss": 0.6927, + "step": 331 + }, + { + "epoch": 0.10205963725791577, + "grad_norm": 0.2247871309518814, + "learning_rate": 4.9928859761568985e-06, + "loss": 0.6971, + "step": 332 + }, + { + "epoch": 0.10236704580387335, + "grad_norm": 0.22574640810489655, + "learning_rate": 4.992824545610212e-06, + "loss": 0.704, + "step": 333 + }, + { + "epoch": 0.10267445434983093, + "grad_norm": 0.2177547663450241, + "learning_rate": 4.9927628513532076e-06, + "loss": 0.6876, + "step": 334 + }, + { + "epoch": 0.1029818628957885, + "grad_norm": 0.22055736184120178, + "learning_rate": 4.992700893392413e-06, + "loss": 0.6967, + "step": 335 + }, + { + "epoch": 0.10328927144174609, + "grad_norm": 0.21607652306556702, + "learning_rate": 4.9926386717343814e-06, + "loss": 0.6877, + "step": 336 + }, + { + "epoch": 0.10359667998770365, + "grad_norm": 0.23030757904052734, + "learning_rate": 4.992576186385695e-06, + "loss": 0.6831, + "step": 337 + }, + { + "epoch": 0.10390408853366123, + "grad_norm": 0.21239306032657623, + "learning_rate": 4.992513437352964e-06, + "loss": 0.6922, + "step": 338 + }, + { + "epoch": 0.10421149707961881, + "grad_norm": 0.2158830463886261, + "learning_rate": 4.992450424642828e-06, + "loss": 0.6879, + "step": 339 + }, + { + "epoch": 0.10451890562557639, + "grad_norm": 0.21971288323402405, + "learning_rate": 4.992387148261951e-06, + "loss": 0.6882, + "step": 340 + }, + { + "epoch": 0.10482631417153397, + "grad_norm": 0.2228928506374359, + "learning_rate": 4.992323608217028e-06, + "loss": 0.7048, + "step": 341 + }, + { + "epoch": 0.10513372271749155, + "grad_norm": 0.2227204442024231, + "learning_rate": 4.992259804514781e-06, + "loss": 0.6837, + "step": 342 + }, + { + "epoch": 0.10544113126344912, + "grad_norm": 0.22480329871177673, + "learning_rate": 4.992195737161959e-06, + "loss": 0.7086, + "step": 343 + }, + { + "epoch": 0.1057485398094067, + "grad_norm": 0.2125869244337082, + "learning_rate": 4.992131406165339e-06, + "loss": 0.6749, + "step": 344 + }, + { + "epoch": 0.10605594835536428, + "grad_norm": 0.22838641703128815, + "learning_rate": 4.9920668115317275e-06, + "loss": 0.6857, + "step": 345 + }, + { + "epoch": 0.10636335690132186, + "grad_norm": 0.22428850829601288, + "learning_rate": 4.992001953267958e-06, + "loss": 0.6579, + "step": 346 + }, + { + "epoch": 0.10667076544727944, + "grad_norm": 0.22640272974967957, + "learning_rate": 4.99193683138089e-06, + "loss": 0.6983, + "step": 347 + }, + { + "epoch": 0.10697817399323702, + "grad_norm": 0.22248347103595734, + "learning_rate": 4.991871445877414e-06, + "loss": 0.6813, + "step": 348 + }, + { + "epoch": 0.1072855825391946, + "grad_norm": 0.22218987345695496, + "learning_rate": 4.991805796764448e-06, + "loss": 0.6988, + "step": 349 + }, + { + "epoch": 0.10759299108515216, + "grad_norm": 0.2276199609041214, + "learning_rate": 4.991739884048934e-06, + "loss": 0.6785, + "step": 350 + }, + { + "epoch": 0.10790039963110974, + "grad_norm": 0.22451390326023102, + "learning_rate": 4.9916737077378475e-06, + "loss": 0.6961, + "step": 351 + }, + { + "epoch": 0.10820780817706732, + "grad_norm": 0.22541603446006775, + "learning_rate": 4.991607267838187e-06, + "loss": 0.6674, + "step": 352 + }, + { + "epoch": 0.1085152167230249, + "grad_norm": 0.223527729511261, + "learning_rate": 4.991540564356983e-06, + "loss": 0.6852, + "step": 353 + }, + { + "epoch": 0.10882262526898248, + "grad_norm": 0.2247486710548401, + "learning_rate": 4.991473597301291e-06, + "loss": 0.7068, + "step": 354 + }, + { + "epoch": 0.10913003381494006, + "grad_norm": 0.23649539053440094, + "learning_rate": 4.9914063666781945e-06, + "loss": 0.6953, + "step": 355 + }, + { + "epoch": 0.10943744236089763, + "grad_norm": 0.2298431098461151, + "learning_rate": 4.9913388724948065e-06, + "loss": 0.7011, + "step": 356 + }, + { + "epoch": 0.10974485090685521, + "grad_norm": 0.22219671308994293, + "learning_rate": 4.9912711147582664e-06, + "loss": 0.683, + "step": 357 + }, + { + "epoch": 0.11005225945281279, + "grad_norm": 0.21246138215065002, + "learning_rate": 4.9912030934757435e-06, + "loss": 0.6877, + "step": 358 + }, + { + "epoch": 0.11035966799877037, + "grad_norm": 0.22073781490325928, + "learning_rate": 4.991134808654433e-06, + "loss": 0.6782, + "step": 359 + }, + { + "epoch": 0.11066707654472795, + "grad_norm": 0.228875070810318, + "learning_rate": 4.991066260301558e-06, + "loss": 0.6681, + "step": 360 + }, + { + "epoch": 0.11097448509068553, + "grad_norm": 0.23340076208114624, + "learning_rate": 4.9909974484243705e-06, + "loss": 0.7081, + "step": 361 + }, + { + "epoch": 0.11128189363664309, + "grad_norm": 0.22545503079891205, + "learning_rate": 4.99092837303015e-06, + "loss": 0.7235, + "step": 362 + }, + { + "epoch": 0.11158930218260067, + "grad_norm": 0.22075724601745605, + "learning_rate": 4.990859034126204e-06, + "loss": 0.6979, + "step": 363 + }, + { + "epoch": 0.11189671072855825, + "grad_norm": 0.2190071940422058, + "learning_rate": 4.990789431719867e-06, + "loss": 0.6754, + "step": 364 + }, + { + "epoch": 0.11220411927451583, + "grad_norm": 0.22624178230762482, + "learning_rate": 4.990719565818504e-06, + "loss": 0.6778, + "step": 365 + }, + { + "epoch": 0.11251152782047341, + "grad_norm": 0.23250389099121094, + "learning_rate": 4.990649436429503e-06, + "loss": 0.7116, + "step": 366 + }, + { + "epoch": 0.11281893636643099, + "grad_norm": 0.2303735464811325, + "learning_rate": 4.990579043560285e-06, + "loss": 0.7049, + "step": 367 + }, + { + "epoch": 0.11312634491238857, + "grad_norm": 0.22498401999473572, + "learning_rate": 4.990508387218296e-06, + "loss": 0.6822, + "step": 368 + }, + { + "epoch": 0.11343375345834614, + "grad_norm": 0.22504596412181854, + "learning_rate": 4.99043746741101e-06, + "loss": 0.6811, + "step": 369 + }, + { + "epoch": 0.11374116200430372, + "grad_norm": 0.23759888112545013, + "learning_rate": 4.990366284145931e-06, + "loss": 0.6853, + "step": 370 + }, + { + "epoch": 0.1140485705502613, + "grad_norm": 0.22899191081523895, + "learning_rate": 4.9902948374305885e-06, + "loss": 0.668, + "step": 371 + }, + { + "epoch": 0.11435597909621888, + "grad_norm": 0.23125910758972168, + "learning_rate": 4.9902231272725394e-06, + "loss": 0.7113, + "step": 372 + }, + { + "epoch": 0.11466338764217646, + "grad_norm": 0.21944087743759155, + "learning_rate": 4.9901511536793715e-06, + "loss": 0.6851, + "step": 373 + }, + { + "epoch": 0.11497079618813404, + "grad_norm": 0.23552356660366058, + "learning_rate": 4.990078916658699e-06, + "loss": 0.698, + "step": 374 + }, + { + "epoch": 0.1152782047340916, + "grad_norm": 0.23017965257167816, + "learning_rate": 4.990006416218162e-06, + "loss": 0.6954, + "step": 375 + }, + { + "epoch": 0.11558561328004918, + "grad_norm": 0.2381400316953659, + "learning_rate": 4.98993365236543e-06, + "loss": 0.6752, + "step": 376 + }, + { + "epoch": 0.11589302182600676, + "grad_norm": 0.2245696783065796, + "learning_rate": 4.9898606251082034e-06, + "loss": 0.6861, + "step": 377 + }, + { + "epoch": 0.11620043037196434, + "grad_norm": 0.22969992458820343, + "learning_rate": 4.989787334454206e-06, + "loss": 0.6679, + "step": 378 + }, + { + "epoch": 0.11650783891792192, + "grad_norm": 0.23566529154777527, + "learning_rate": 4.98971378041119e-06, + "loss": 0.6937, + "step": 379 + }, + { + "epoch": 0.1168152474638795, + "grad_norm": 0.24493800103664398, + "learning_rate": 4.989639962986937e-06, + "loss": 0.6998, + "step": 380 + }, + { + "epoch": 0.11712265600983707, + "grad_norm": 0.24092422425746918, + "learning_rate": 4.989565882189257e-06, + "loss": 0.6927, + "step": 381 + }, + { + "epoch": 0.11743006455579465, + "grad_norm": 0.2252228856086731, + "learning_rate": 4.989491538025985e-06, + "loss": 0.6979, + "step": 382 + }, + { + "epoch": 0.11773747310175223, + "grad_norm": 0.24125419557094574, + "learning_rate": 4.989416930504988e-06, + "loss": 0.6908, + "step": 383 + }, + { + "epoch": 0.1180448816477098, + "grad_norm": 0.23161208629608154, + "learning_rate": 4.989342059634157e-06, + "loss": 0.6513, + "step": 384 + }, + { + "epoch": 0.11835229019366739, + "grad_norm": 0.23513101041316986, + "learning_rate": 4.9892669254214125e-06, + "loss": 0.6812, + "step": 385 + }, + { + "epoch": 0.11865969873962497, + "grad_norm": 0.23237212002277374, + "learning_rate": 4.989191527874704e-06, + "loss": 0.6669, + "step": 386 + }, + { + "epoch": 0.11896710728558253, + "grad_norm": 0.2374318391084671, + "learning_rate": 4.989115867002006e-06, + "loss": 0.7001, + "step": 387 + }, + { + "epoch": 0.11927451583154011, + "grad_norm": 0.24537892639636993, + "learning_rate": 4.989039942811322e-06, + "loss": 0.6979, + "step": 388 + }, + { + "epoch": 0.11958192437749769, + "grad_norm": 0.22981786727905273, + "learning_rate": 4.988963755310687e-06, + "loss": 0.66, + "step": 389 + }, + { + "epoch": 0.11988933292345527, + "grad_norm": 0.22295087575912476, + "learning_rate": 4.988887304508158e-06, + "loss": 0.6797, + "step": 390 + }, + { + "epoch": 0.12019674146941285, + "grad_norm": 0.23019637167453766, + "learning_rate": 4.988810590411824e-06, + "loss": 0.6927, + "step": 391 + }, + { + "epoch": 0.12050415001537043, + "grad_norm": 0.2393874078989029, + "learning_rate": 4.9887336130297995e-06, + "loss": 0.7163, + "step": 392 + }, + { + "epoch": 0.12081155856132801, + "grad_norm": 0.24103809893131256, + "learning_rate": 4.988656372370228e-06, + "loss": 0.7065, + "step": 393 + }, + { + "epoch": 0.12111896710728558, + "grad_norm": 0.22911983728408813, + "learning_rate": 4.9885788684412804e-06, + "loss": 0.6968, + "step": 394 + }, + { + "epoch": 0.12142637565324316, + "grad_norm": 0.227855384349823, + "learning_rate": 4.988501101251156e-06, + "loss": 0.6804, + "step": 395 + }, + { + "epoch": 0.12173378419920074, + "grad_norm": 0.22878803312778473, + "learning_rate": 4.988423070808081e-06, + "loss": 0.6827, + "step": 396 + }, + { + "epoch": 0.12204119274515832, + "grad_norm": 0.24360834062099457, + "learning_rate": 4.988344777120311e-06, + "loss": 0.6827, + "step": 397 + }, + { + "epoch": 0.1223486012911159, + "grad_norm": 0.22544945776462555, + "learning_rate": 4.988266220196129e-06, + "loss": 0.6798, + "step": 398 + }, + { + "epoch": 0.12265600983707348, + "grad_norm": 0.23328754305839539, + "learning_rate": 4.988187400043843e-06, + "loss": 0.6987, + "step": 399 + }, + { + "epoch": 0.12296341838303104, + "grad_norm": 0.2248503714799881, + "learning_rate": 4.988108316671793e-06, + "loss": 0.6636, + "step": 400 + }, + { + "epoch": 0.12327082692898862, + "grad_norm": 0.23064275085926056, + "learning_rate": 4.988028970088346e-06, + "loss": 0.6997, + "step": 401 + }, + { + "epoch": 0.1235782354749462, + "grad_norm": 0.2412574291229248, + "learning_rate": 4.987949360301892e-06, + "loss": 0.7122, + "step": 402 + }, + { + "epoch": 0.12388564402090378, + "grad_norm": 0.23945453763008118, + "learning_rate": 4.987869487320858e-06, + "loss": 0.682, + "step": 403 + }, + { + "epoch": 0.12419305256686136, + "grad_norm": 0.24832607805728912, + "learning_rate": 4.98778935115369e-06, + "loss": 0.6907, + "step": 404 + }, + { + "epoch": 0.12450046111281894, + "grad_norm": 0.23571133613586426, + "learning_rate": 4.987708951808866e-06, + "loss": 0.6656, + "step": 405 + }, + { + "epoch": 0.12480786965877651, + "grad_norm": 0.23227806389331818, + "learning_rate": 4.987628289294893e-06, + "loss": 0.6934, + "step": 406 + }, + { + "epoch": 0.1251152782047341, + "grad_norm": 0.2360718697309494, + "learning_rate": 4.987547363620301e-06, + "loss": 0.715, + "step": 407 + }, + { + "epoch": 0.12542268675069168, + "grad_norm": 0.22533562779426575, + "learning_rate": 4.987466174793653e-06, + "loss": 0.6762, + "step": 408 + }, + { + "epoch": 0.12573009529664925, + "grad_norm": 0.22882840037345886, + "learning_rate": 4.987384722823538e-06, + "loss": 0.655, + "step": 409 + }, + { + "epoch": 0.1260375038426068, + "grad_norm": 0.2441919445991516, + "learning_rate": 4.987303007718573e-06, + "loss": 0.678, + "step": 410 + }, + { + "epoch": 0.1263449123885644, + "grad_norm": 0.2353876829147339, + "learning_rate": 4.9872210294874e-06, + "loss": 0.6852, + "step": 411 + }, + { + "epoch": 0.12665232093452197, + "grad_norm": 0.2358366996049881, + "learning_rate": 4.987138788138693e-06, + "loss": 0.693, + "step": 412 + }, + { + "epoch": 0.12695972948047957, + "grad_norm": 0.23434901237487793, + "learning_rate": 4.987056283681152e-06, + "loss": 0.6838, + "step": 413 + }, + { + "epoch": 0.12726713802643713, + "grad_norm": 0.23970575630664825, + "learning_rate": 4.986973516123506e-06, + "loss": 0.6788, + "step": 414 + }, + { + "epoch": 0.12757454657239473, + "grad_norm": 0.2517559230327606, + "learning_rate": 4.9868904854745084e-06, + "loss": 0.6648, + "step": 415 + }, + { + "epoch": 0.1278819551183523, + "grad_norm": 0.2357812523841858, + "learning_rate": 4.986807191742945e-06, + "loss": 0.6979, + "step": 416 + }, + { + "epoch": 0.12818936366430986, + "grad_norm": 0.23852698504924774, + "learning_rate": 4.986723634937626e-06, + "loss": 0.6906, + "step": 417 + }, + { + "epoch": 0.12849677221026745, + "grad_norm": 0.2432917058467865, + "learning_rate": 4.986639815067391e-06, + "loss": 0.6723, + "step": 418 + }, + { + "epoch": 0.12880418075622502, + "grad_norm": 0.2515276074409485, + "learning_rate": 4.986555732141107e-06, + "loss": 0.6883, + "step": 419 + }, + { + "epoch": 0.1291115893021826, + "grad_norm": 0.2502534091472626, + "learning_rate": 4.986471386167671e-06, + "loss": 0.7101, + "step": 420 + }, + { + "epoch": 0.12941899784814018, + "grad_norm": 0.2407144159078598, + "learning_rate": 4.9863867771560015e-06, + "loss": 0.6712, + "step": 421 + }, + { + "epoch": 0.12972640639409774, + "grad_norm": 0.22499090433120728, + "learning_rate": 4.986301905115053e-06, + "loss": 0.6737, + "step": 422 + }, + { + "epoch": 0.13003381494005534, + "grad_norm": 0.24243836104869843, + "learning_rate": 4.986216770053802e-06, + "loss": 0.6607, + "step": 423 + }, + { + "epoch": 0.1303412234860129, + "grad_norm": 0.24196189641952515, + "learning_rate": 4.9861313719812555e-06, + "loss": 0.693, + "step": 424 + }, + { + "epoch": 0.1306486320319705, + "grad_norm": 0.24590466916561127, + "learning_rate": 4.986045710906447e-06, + "loss": 0.6858, + "step": 425 + }, + { + "epoch": 0.13095604057792806, + "grad_norm": 0.22531507909297943, + "learning_rate": 4.985959786838438e-06, + "loss": 0.6534, + "step": 426 + }, + { + "epoch": 0.13126344912388566, + "grad_norm": 0.26181644201278687, + "learning_rate": 4.98587359978632e-06, + "loss": 0.713, + "step": 427 + }, + { + "epoch": 0.13157085766984322, + "grad_norm": 0.23991088569164276, + "learning_rate": 4.985787149759209e-06, + "loss": 0.6789, + "step": 428 + }, + { + "epoch": 0.1318782662158008, + "grad_norm": 0.23678945004940033, + "learning_rate": 4.9857004367662504e-06, + "loss": 0.6904, + "step": 429 + }, + { + "epoch": 0.13218567476175838, + "grad_norm": 0.23603391647338867, + "learning_rate": 4.985613460816618e-06, + "loss": 0.6724, + "step": 430 + }, + { + "epoch": 0.13249308330771595, + "grad_norm": 0.24264146387577057, + "learning_rate": 4.985526221919512e-06, + "loss": 0.6745, + "step": 431 + }, + { + "epoch": 0.13280049185367354, + "grad_norm": 0.23759552836418152, + "learning_rate": 4.9854387200841625e-06, + "loss": 0.6632, + "step": 432 + }, + { + "epoch": 0.1331079003996311, + "grad_norm": 0.24095793068408966, + "learning_rate": 4.985350955319825e-06, + "loss": 0.6932, + "step": 433 + }, + { + "epoch": 0.1334153089455887, + "grad_norm": 0.2342597395181656, + "learning_rate": 4.985262927635785e-06, + "loss": 0.6988, + "step": 434 + }, + { + "epoch": 0.13372271749154627, + "grad_norm": 0.24006913602352142, + "learning_rate": 4.985174637041353e-06, + "loss": 0.704, + "step": 435 + }, + { + "epoch": 0.13403012603750383, + "grad_norm": 0.2241889238357544, + "learning_rate": 4.985086083545872e-06, + "loss": 0.6453, + "step": 436 + }, + { + "epoch": 0.13433753458346143, + "grad_norm": 0.23041070997714996, + "learning_rate": 4.984997267158705e-06, + "loss": 0.7131, + "step": 437 + }, + { + "epoch": 0.134644943129419, + "grad_norm": 0.24229440093040466, + "learning_rate": 4.984908187889252e-06, + "loss": 0.7012, + "step": 438 + }, + { + "epoch": 0.1349523516753766, + "grad_norm": 0.23099300265312195, + "learning_rate": 4.984818845746936e-06, + "loss": 0.7029, + "step": 439 + }, + { + "epoch": 0.13525976022133415, + "grad_norm": 0.22943897545337677, + "learning_rate": 4.984729240741207e-06, + "loss": 0.6689, + "step": 440 + }, + { + "epoch": 0.13556716876729172, + "grad_norm": 0.2353782206773758, + "learning_rate": 4.984639372881544e-06, + "loss": 0.6561, + "step": 441 + }, + { + "epoch": 0.1358745773132493, + "grad_norm": 0.24052095413208008, + "learning_rate": 4.984549242177455e-06, + "loss": 0.6754, + "step": 442 + }, + { + "epoch": 0.13618198585920688, + "grad_norm": 0.2544795870780945, + "learning_rate": 4.984458848638475e-06, + "loss": 0.6673, + "step": 443 + }, + { + "epoch": 0.13648939440516447, + "grad_norm": 0.23707976937294006, + "learning_rate": 4.984368192274165e-06, + "loss": 0.6493, + "step": 444 + }, + { + "epoch": 0.13679680295112204, + "grad_norm": 0.23822879791259766, + "learning_rate": 4.9842772730941154e-06, + "loss": 0.6681, + "step": 445 + }, + { + "epoch": 0.13710421149707963, + "grad_norm": 0.23402857780456543, + "learning_rate": 4.984186091107947e-06, + "loss": 0.6836, + "step": 446 + }, + { + "epoch": 0.1374116200430372, + "grad_norm": 0.24922534823417664, + "learning_rate": 4.984094646325302e-06, + "loss": 0.694, + "step": 447 + }, + { + "epoch": 0.13771902858899476, + "grad_norm": 0.23032186925411224, + "learning_rate": 4.984002938755856e-06, + "loss": 0.6874, + "step": 448 + }, + { + "epoch": 0.13802643713495236, + "grad_norm": 0.2353002429008484, + "learning_rate": 4.9839109684093115e-06, + "loss": 0.6776, + "step": 449 + }, + { + "epoch": 0.13833384568090992, + "grad_norm": 0.23028720915317535, + "learning_rate": 4.9838187352953955e-06, + "loss": 0.6684, + "step": 450 + }, + { + "epoch": 0.13864125422686752, + "grad_norm": 0.24302490055561066, + "learning_rate": 4.983726239423867e-06, + "loss": 0.6916, + "step": 451 + }, + { + "epoch": 0.13894866277282508, + "grad_norm": 0.2301994115114212, + "learning_rate": 4.9836334808045105e-06, + "loss": 0.6649, + "step": 452 + }, + { + "epoch": 0.13925607131878268, + "grad_norm": 0.23958425223827362, + "learning_rate": 4.983540459447138e-06, + "loss": 0.6854, + "step": 453 + }, + { + "epoch": 0.13956347986474024, + "grad_norm": 0.23680134117603302, + "learning_rate": 4.983447175361591e-06, + "loss": 0.6856, + "step": 454 + }, + { + "epoch": 0.1398708884106978, + "grad_norm": 0.23594430088996887, + "learning_rate": 4.983353628557738e-06, + "loss": 0.6628, + "step": 455 + }, + { + "epoch": 0.1401782969566554, + "grad_norm": 0.23183932900428772, + "learning_rate": 4.983259819045474e-06, + "loss": 0.6861, + "step": 456 + }, + { + "epoch": 0.14048570550261297, + "grad_norm": 0.24667420983314514, + "learning_rate": 4.983165746834723e-06, + "loss": 0.6979, + "step": 457 + }, + { + "epoch": 0.14079311404857056, + "grad_norm": 0.25168952345848083, + "learning_rate": 4.983071411935437e-06, + "loss": 0.6718, + "step": 458 + }, + { + "epoch": 0.14110052259452813, + "grad_norm": 0.24363493919372559, + "learning_rate": 4.9829768143575965e-06, + "loss": 0.6693, + "step": 459 + }, + { + "epoch": 0.1414079311404857, + "grad_norm": 0.22828063368797302, + "learning_rate": 4.9828819541112075e-06, + "loss": 0.6776, + "step": 460 + }, + { + "epoch": 0.1417153396864433, + "grad_norm": 0.2334529161453247, + "learning_rate": 4.982786831206305e-06, + "loss": 0.6859, + "step": 461 + }, + { + "epoch": 0.14202274823240085, + "grad_norm": 0.23192743957042694, + "learning_rate": 4.982691445652953e-06, + "loss": 0.6524, + "step": 462 + }, + { + "epoch": 0.14233015677835845, + "grad_norm": 0.24218718707561493, + "learning_rate": 4.98259579746124e-06, + "loss": 0.6695, + "step": 463 + }, + { + "epoch": 0.142637565324316, + "grad_norm": 0.2314915508031845, + "learning_rate": 4.982499886641287e-06, + "loss": 0.6616, + "step": 464 + }, + { + "epoch": 0.1429449738702736, + "grad_norm": 0.2543243169784546, + "learning_rate": 4.982403713203239e-06, + "loss": 0.6818, + "step": 465 + }, + { + "epoch": 0.14325238241623117, + "grad_norm": 0.24794739484786987, + "learning_rate": 4.982307277157269e-06, + "loss": 0.6795, + "step": 466 + }, + { + "epoch": 0.14355979096218874, + "grad_norm": 0.23679906129837036, + "learning_rate": 4.98221057851358e-06, + "loss": 0.6969, + "step": 467 + }, + { + "epoch": 0.14386719950814633, + "grad_norm": 0.26235082745552063, + "learning_rate": 4.982113617282402e-06, + "loss": 0.6757, + "step": 468 + }, + { + "epoch": 0.1441746080541039, + "grad_norm": 0.2423919290304184, + "learning_rate": 4.982016393473991e-06, + "loss": 0.6809, + "step": 469 + }, + { + "epoch": 0.1444820166000615, + "grad_norm": 0.2342311292886734, + "learning_rate": 4.9819189070986325e-06, + "loss": 0.6819, + "step": 470 + }, + { + "epoch": 0.14478942514601906, + "grad_norm": 0.2513377368450165, + "learning_rate": 4.9818211581666395e-06, + "loss": 0.6945, + "step": 471 + }, + { + "epoch": 0.14509683369197662, + "grad_norm": 0.2616936266422272, + "learning_rate": 4.981723146688352e-06, + "loss": 0.6987, + "step": 472 + }, + { + "epoch": 0.14540424223793422, + "grad_norm": 0.22780486941337585, + "learning_rate": 4.98162487267414e-06, + "loss": 0.675, + "step": 473 + }, + { + "epoch": 0.14571165078389178, + "grad_norm": 0.23293662071228027, + "learning_rate": 4.981526336134399e-06, + "loss": 0.6569, + "step": 474 + }, + { + "epoch": 0.14601905932984938, + "grad_norm": 0.23476390540599823, + "learning_rate": 4.981427537079551e-06, + "loss": 0.6495, + "step": 475 + }, + { + "epoch": 0.14632646787580694, + "grad_norm": 0.24571935832500458, + "learning_rate": 4.981328475520051e-06, + "loss": 0.6858, + "step": 476 + }, + { + "epoch": 0.14663387642176454, + "grad_norm": 0.2484048455953598, + "learning_rate": 4.981229151466377e-06, + "loss": 0.6486, + "step": 477 + }, + { + "epoch": 0.1469412849677221, + "grad_norm": 0.24468356370925903, + "learning_rate": 4.981129564929036e-06, + "loss": 0.6825, + "step": 478 + }, + { + "epoch": 0.14724869351367967, + "grad_norm": 0.2419050931930542, + "learning_rate": 4.981029715918562e-06, + "loss": 0.6888, + "step": 479 + }, + { + "epoch": 0.14755610205963726, + "grad_norm": 0.25062939524650574, + "learning_rate": 4.98092960444552e-06, + "loss": 0.7068, + "step": 480 + }, + { + "epoch": 0.14786351060559483, + "grad_norm": 0.23986443877220154, + "learning_rate": 4.980829230520499e-06, + "loss": 0.6979, + "step": 481 + }, + { + "epoch": 0.14817091915155242, + "grad_norm": 0.24445512890815735, + "learning_rate": 4.980728594154118e-06, + "loss": 0.6673, + "step": 482 + }, + { + "epoch": 0.14847832769751, + "grad_norm": 0.24610568583011627, + "learning_rate": 4.980627695357024e-06, + "loss": 0.6783, + "step": 483 + }, + { + "epoch": 0.14878573624346758, + "grad_norm": 0.24511119723320007, + "learning_rate": 4.980526534139889e-06, + "loss": 0.7035, + "step": 484 + }, + { + "epoch": 0.14909314478942515, + "grad_norm": 0.23889902234077454, + "learning_rate": 4.980425110513416e-06, + "loss": 0.6526, + "step": 485 + }, + { + "epoch": 0.1494005533353827, + "grad_norm": 0.2264011949300766, + "learning_rate": 4.980323424488334e-06, + "loss": 0.6887, + "step": 486 + }, + { + "epoch": 0.1497079618813403, + "grad_norm": 0.2439829558134079, + "learning_rate": 4.980221476075399e-06, + "loss": 0.6818, + "step": 487 + }, + { + "epoch": 0.15001537042729787, + "grad_norm": 0.24035774171352386, + "learning_rate": 4.9801192652853975e-06, + "loss": 0.6885, + "step": 488 + }, + { + "epoch": 0.15032277897325547, + "grad_norm": 0.2489815056324005, + "learning_rate": 4.980016792129142e-06, + "loss": 0.6722, + "step": 489 + }, + { + "epoch": 0.15063018751921303, + "grad_norm": 0.22862565517425537, + "learning_rate": 4.979914056617472e-06, + "loss": 0.6583, + "step": 490 + }, + { + "epoch": 0.1509375960651706, + "grad_norm": 0.24348539113998413, + "learning_rate": 4.979811058761256e-06, + "loss": 0.6788, + "step": 491 + }, + { + "epoch": 0.1512450046111282, + "grad_norm": 0.246994748711586, + "learning_rate": 4.97970779857139e-06, + "loss": 0.6995, + "step": 492 + }, + { + "epoch": 0.15155241315708576, + "grad_norm": 0.24090728163719177, + "learning_rate": 4.979604276058797e-06, + "loss": 0.6859, + "step": 493 + }, + { + "epoch": 0.15185982170304335, + "grad_norm": 0.24561458826065063, + "learning_rate": 4.9795004912344305e-06, + "loss": 0.6718, + "step": 494 + }, + { + "epoch": 0.15216723024900092, + "grad_norm": 0.2518598437309265, + "learning_rate": 4.979396444109268e-06, + "loss": 0.6856, + "step": 495 + }, + { + "epoch": 0.1524746387949585, + "grad_norm": 0.24160663783550262, + "learning_rate": 4.979292134694316e-06, + "loss": 0.6766, + "step": 496 + }, + { + "epoch": 0.15278204734091608, + "grad_norm": 0.2495517134666443, + "learning_rate": 4.97918756300061e-06, + "loss": 0.7053, + "step": 497 + }, + { + "epoch": 0.15308945588687364, + "grad_norm": 0.2491316795349121, + "learning_rate": 4.979082729039213e-06, + "loss": 0.6915, + "step": 498 + }, + { + "epoch": 0.15339686443283124, + "grad_norm": 0.24625681340694427, + "learning_rate": 4.978977632821214e-06, + "loss": 0.7065, + "step": 499 + }, + { + "epoch": 0.1537042729787888, + "grad_norm": 0.24978867173194885, + "learning_rate": 4.978872274357731e-06, + "loss": 0.682, + "step": 500 + }, + { + "epoch": 0.1540116815247464, + "grad_norm": 0.2635263204574585, + "learning_rate": 4.97876665365991e-06, + "loss": 0.6911, + "step": 501 + }, + { + "epoch": 0.15431909007070396, + "grad_norm": 0.2497268170118332, + "learning_rate": 4.9786607707389245e-06, + "loss": 0.6754, + "step": 502 + }, + { + "epoch": 0.15462649861666156, + "grad_norm": 0.24663786590099335, + "learning_rate": 4.978554625605975e-06, + "loss": 0.688, + "step": 503 + }, + { + "epoch": 0.15493390716261912, + "grad_norm": 0.23803699016571045, + "learning_rate": 4.978448218272291e-06, + "loss": 0.6473, + "step": 504 + }, + { + "epoch": 0.1552413157085767, + "grad_norm": 0.2515263855457306, + "learning_rate": 4.978341548749129e-06, + "loss": 0.6864, + "step": 505 + }, + { + "epoch": 0.15554872425453428, + "grad_norm": 0.24402639269828796, + "learning_rate": 4.978234617047773e-06, + "loss": 0.6875, + "step": 506 + }, + { + "epoch": 0.15585613280049185, + "grad_norm": 0.26242175698280334, + "learning_rate": 4.978127423179535e-06, + "loss": 0.6794, + "step": 507 + }, + { + "epoch": 0.15616354134644944, + "grad_norm": 0.24294021725654602, + "learning_rate": 4.978019967155756e-06, + "loss": 0.6589, + "step": 508 + }, + { + "epoch": 0.156470949892407, + "grad_norm": 0.26833099126815796, + "learning_rate": 4.977912248987802e-06, + "loss": 0.6688, + "step": 509 + }, + { + "epoch": 0.15677835843836457, + "grad_norm": 0.25340187549591064, + "learning_rate": 4.977804268687069e-06, + "loss": 0.6685, + "step": 510 + }, + { + "epoch": 0.15708576698432217, + "grad_norm": 0.23830445110797882, + "learning_rate": 4.977696026264979e-06, + "loss": 0.6844, + "step": 511 + }, + { + "epoch": 0.15739317553027973, + "grad_norm": 0.23974956572055817, + "learning_rate": 4.977587521732985e-06, + "loss": 0.6828, + "step": 512 + }, + { + "epoch": 0.15770058407623733, + "grad_norm": 0.24546289443969727, + "learning_rate": 4.9774787551025625e-06, + "loss": 0.6773, + "step": 513 + }, + { + "epoch": 0.1580079926221949, + "grad_norm": 0.2641289532184601, + "learning_rate": 4.977369726385221e-06, + "loss": 0.6659, + "step": 514 + }, + { + "epoch": 0.1583154011681525, + "grad_norm": 0.24615997076034546, + "learning_rate": 4.977260435592491e-06, + "loss": 0.6692, + "step": 515 + }, + { + "epoch": 0.15862280971411005, + "grad_norm": 0.25502142310142517, + "learning_rate": 4.977150882735937e-06, + "loss": 0.6999, + "step": 516 + }, + { + "epoch": 0.15893021826006762, + "grad_norm": 0.2404036819934845, + "learning_rate": 4.977041067827146e-06, + "loss": 0.6635, + "step": 517 + }, + { + "epoch": 0.1592376268060252, + "grad_norm": 0.25526389479637146, + "learning_rate": 4.976930990877737e-06, + "loss": 0.6675, + "step": 518 + }, + { + "epoch": 0.15954503535198278, + "grad_norm": 0.264413058757782, + "learning_rate": 4.976820651899353e-06, + "loss": 0.7027, + "step": 519 + }, + { + "epoch": 0.15985244389794037, + "grad_norm": 0.268205463886261, + "learning_rate": 4.976710050903668e-06, + "loss": 0.7043, + "step": 520 + }, + { + "epoch": 0.16015985244389794, + "grad_norm": 0.26408302783966064, + "learning_rate": 4.976599187902382e-06, + "loss": 0.6678, + "step": 521 + }, + { + "epoch": 0.16046726098985553, + "grad_norm": 0.2375425547361374, + "learning_rate": 4.976488062907222e-06, + "loss": 0.6705, + "step": 522 + }, + { + "epoch": 0.1607746695358131, + "grad_norm": 0.2544267773628235, + "learning_rate": 4.9763766759299444e-06, + "loss": 0.6876, + "step": 523 + }, + { + "epoch": 0.16108207808177066, + "grad_norm": 0.24648383259773254, + "learning_rate": 4.976265026982333e-06, + "loss": 0.6931, + "step": 524 + }, + { + "epoch": 0.16138948662772826, + "grad_norm": 0.2509390711784363, + "learning_rate": 4.976153116076199e-06, + "loss": 0.6712, + "step": 525 + }, + { + "epoch": 0.16169689517368582, + "grad_norm": 0.25217822194099426, + "learning_rate": 4.976040943223379e-06, + "loss": 0.6775, + "step": 526 + }, + { + "epoch": 0.16200430371964342, + "grad_norm": 0.23648643493652344, + "learning_rate": 4.9759285084357414e-06, + "loss": 0.6705, + "step": 527 + }, + { + "epoch": 0.16231171226560098, + "grad_norm": 0.24051621556282043, + "learning_rate": 4.975815811725181e-06, + "loss": 0.6579, + "step": 528 + }, + { + "epoch": 0.16261912081155855, + "grad_norm": 0.24535511434078217, + "learning_rate": 4.975702853103618e-06, + "loss": 0.6503, + "step": 529 + }, + { + "epoch": 0.16292652935751614, + "grad_norm": 0.25291678309440613, + "learning_rate": 4.975589632583003e-06, + "loss": 0.691, + "step": 530 + }, + { + "epoch": 0.1632339379034737, + "grad_norm": 0.25369974970817566, + "learning_rate": 4.975476150175313e-06, + "loss": 0.6894, + "step": 531 + }, + { + "epoch": 0.1635413464494313, + "grad_norm": 0.25263500213623047, + "learning_rate": 4.975362405892554e-06, + "loss": 0.6898, + "step": 532 + }, + { + "epoch": 0.16384875499538887, + "grad_norm": 0.2474166452884674, + "learning_rate": 4.975248399746758e-06, + "loss": 0.6717, + "step": 533 + }, + { + "epoch": 0.16415616354134646, + "grad_norm": 0.25353074073791504, + "learning_rate": 4.975134131749986e-06, + "loss": 0.6427, + "step": 534 + }, + { + "epoch": 0.16446357208730403, + "grad_norm": 0.23797006905078888, + "learning_rate": 4.975019601914325e-06, + "loss": 0.6504, + "step": 535 + }, + { + "epoch": 0.1647709806332616, + "grad_norm": 0.23866446316242218, + "learning_rate": 4.974904810251891e-06, + "loss": 0.6627, + "step": 536 + }, + { + "epoch": 0.1650783891792192, + "grad_norm": 0.25052937865257263, + "learning_rate": 4.974789756774828e-06, + "loss": 0.6725, + "step": 537 + }, + { + "epoch": 0.16538579772517675, + "grad_norm": 0.2573775351047516, + "learning_rate": 4.974674441495308e-06, + "loss": 0.6871, + "step": 538 + }, + { + "epoch": 0.16569320627113435, + "grad_norm": 0.25020214915275574, + "learning_rate": 4.9745588644255295e-06, + "loss": 0.6765, + "step": 539 + }, + { + "epoch": 0.1660006148170919, + "grad_norm": 0.251759797334671, + "learning_rate": 4.974443025577718e-06, + "loss": 0.6721, + "step": 540 + }, + { + "epoch": 0.1663080233630495, + "grad_norm": 0.25863924622535706, + "learning_rate": 4.974326924964129e-06, + "loss": 0.6731, + "step": 541 + }, + { + "epoch": 0.16661543190900707, + "grad_norm": 0.2594468593597412, + "learning_rate": 4.9742105625970445e-06, + "loss": 0.6821, + "step": 542 + }, + { + "epoch": 0.16692284045496464, + "grad_norm": 0.24865466356277466, + "learning_rate": 4.974093938488774e-06, + "loss": 0.6833, + "step": 543 + }, + { + "epoch": 0.16723024900092223, + "grad_norm": 0.24839884042739868, + "learning_rate": 4.973977052651654e-06, + "loss": 0.6755, + "step": 544 + }, + { + "epoch": 0.1675376575468798, + "grad_norm": 0.24850760400295258, + "learning_rate": 4.973859905098052e-06, + "loss": 0.6864, + "step": 545 + }, + { + "epoch": 0.1678450660928374, + "grad_norm": 0.23691034317016602, + "learning_rate": 4.973742495840358e-06, + "loss": 0.6835, + "step": 546 + }, + { + "epoch": 0.16815247463879496, + "grad_norm": 0.2553350031375885, + "learning_rate": 4.973624824890994e-06, + "loss": 0.6715, + "step": 547 + }, + { + "epoch": 0.16845988318475252, + "grad_norm": 0.23956908285617828, + "learning_rate": 4.973506892262407e-06, + "loss": 0.6512, + "step": 548 + }, + { + "epoch": 0.16876729173071012, + "grad_norm": 0.25226065516471863, + "learning_rate": 4.973388697967075e-06, + "loss": 0.6823, + "step": 549 + }, + { + "epoch": 0.16907470027666768, + "grad_norm": 0.2571025490760803, + "learning_rate": 4.9732702420175e-06, + "loss": 0.6801, + "step": 550 + }, + { + "epoch": 0.16938210882262528, + "grad_norm": 0.25051867961883545, + "learning_rate": 4.973151524426213e-06, + "loss": 0.6817, + "step": 551 + }, + { + "epoch": 0.16968951736858284, + "grad_norm": 0.2533714771270752, + "learning_rate": 4.973032545205773e-06, + "loss": 0.6721, + "step": 552 + }, + { + "epoch": 0.16999692591454044, + "grad_norm": 0.23266085982322693, + "learning_rate": 4.972913304368767e-06, + "loss": 0.6532, + "step": 553 + }, + { + "epoch": 0.170304334460498, + "grad_norm": 0.24950990080833435, + "learning_rate": 4.97279380192781e-06, + "loss": 0.6945, + "step": 554 + }, + { + "epoch": 0.17061174300645557, + "grad_norm": 0.2611696422100067, + "learning_rate": 4.972674037895542e-06, + "loss": 0.6715, + "step": 555 + }, + { + "epoch": 0.17091915155241316, + "grad_norm": 0.24125483632087708, + "learning_rate": 4.972554012284633e-06, + "loss": 0.6763, + "step": 556 + }, + { + "epoch": 0.17122656009837073, + "grad_norm": 0.24504122138023376, + "learning_rate": 4.972433725107781e-06, + "loss": 0.6738, + "step": 557 + }, + { + "epoch": 0.17153396864432832, + "grad_norm": 0.24712328612804413, + "learning_rate": 4.972313176377711e-06, + "loss": 0.6797, + "step": 558 + }, + { + "epoch": 0.1718413771902859, + "grad_norm": 0.25098344683647156, + "learning_rate": 4.972192366107174e-06, + "loss": 0.6716, + "step": 559 + }, + { + "epoch": 0.17214878573624345, + "grad_norm": 0.25446850061416626, + "learning_rate": 4.972071294308952e-06, + "loss": 0.6968, + "step": 560 + }, + { + "epoch": 0.17245619428220105, + "grad_norm": 0.2526160478591919, + "learning_rate": 4.971949960995853e-06, + "loss": 0.6776, + "step": 561 + }, + { + "epoch": 0.1727636028281586, + "grad_norm": 0.25980380177497864, + "learning_rate": 4.971828366180712e-06, + "loss": 0.6848, + "step": 562 + }, + { + "epoch": 0.1730710113741162, + "grad_norm": 0.24714243412017822, + "learning_rate": 4.971706509876392e-06, + "loss": 0.682, + "step": 563 + }, + { + "epoch": 0.17337841992007377, + "grad_norm": 0.24370869994163513, + "learning_rate": 4.971584392095784e-06, + "loss": 0.6872, + "step": 564 + }, + { + "epoch": 0.17368582846603137, + "grad_norm": 0.26818326115608215, + "learning_rate": 4.971462012851806e-06, + "loss": 0.7034, + "step": 565 + }, + { + "epoch": 0.17399323701198893, + "grad_norm": 0.2617219090461731, + "learning_rate": 4.971339372157405e-06, + "loss": 0.6736, + "step": 566 + }, + { + "epoch": 0.1743006455579465, + "grad_norm": 0.24568036198616028, + "learning_rate": 4.971216470025556e-06, + "loss": 0.7033, + "step": 567 + }, + { + "epoch": 0.1746080541039041, + "grad_norm": 0.24460071325302124, + "learning_rate": 4.971093306469258e-06, + "loss": 0.7016, + "step": 568 + }, + { + "epoch": 0.17491546264986166, + "grad_norm": 0.2538304030895233, + "learning_rate": 4.970969881501542e-06, + "loss": 0.6813, + "step": 569 + }, + { + "epoch": 0.17522287119581925, + "grad_norm": 0.2527407705783844, + "learning_rate": 4.970846195135464e-06, + "loss": 0.6701, + "step": 570 + }, + { + "epoch": 0.17553027974177682, + "grad_norm": 0.24947218596935272, + "learning_rate": 4.970722247384109e-06, + "loss": 0.6646, + "step": 571 + }, + { + "epoch": 0.1758376882877344, + "grad_norm": 0.24635274708271027, + "learning_rate": 4.970598038260589e-06, + "loss": 0.6495, + "step": 572 + }, + { + "epoch": 0.17614509683369198, + "grad_norm": 0.26777052879333496, + "learning_rate": 4.970473567778044e-06, + "loss": 0.6711, + "step": 573 + }, + { + "epoch": 0.17645250537964954, + "grad_norm": 0.25102052092552185, + "learning_rate": 4.970348835949641e-06, + "loss": 0.6893, + "step": 574 + }, + { + "epoch": 0.17675991392560714, + "grad_norm": 0.25012901425361633, + "learning_rate": 4.970223842788576e-06, + "loss": 0.6559, + "step": 575 + }, + { + "epoch": 0.1770673224715647, + "grad_norm": 0.2576897442340851, + "learning_rate": 4.97009858830807e-06, + "loss": 0.6817, + "step": 576 + }, + { + "epoch": 0.1773747310175223, + "grad_norm": 0.24928008019924164, + "learning_rate": 4.9699730725213755e-06, + "loss": 0.6826, + "step": 577 + }, + { + "epoch": 0.17768213956347986, + "grad_norm": 0.2549649477005005, + "learning_rate": 4.969847295441769e-06, + "loss": 0.655, + "step": 578 + }, + { + "epoch": 0.17798954810943743, + "grad_norm": 0.2546837329864502, + "learning_rate": 4.969721257082557e-06, + "loss": 0.6917, + "step": 579 + }, + { + "epoch": 0.17829695665539502, + "grad_norm": 0.2493862509727478, + "learning_rate": 4.969594957457073e-06, + "loss": 0.6582, + "step": 580 + }, + { + "epoch": 0.1786043652013526, + "grad_norm": 0.2532242238521576, + "learning_rate": 4.969468396578676e-06, + "loss": 0.6721, + "step": 581 + }, + { + "epoch": 0.17891177374731018, + "grad_norm": 0.26338669657707214, + "learning_rate": 4.9693415744607565e-06, + "loss": 0.664, + "step": 582 + }, + { + "epoch": 0.17921918229326775, + "grad_norm": 0.2551650404930115, + "learning_rate": 4.96921449111673e-06, + "loss": 0.6917, + "step": 583 + }, + { + "epoch": 0.17952659083922534, + "grad_norm": 0.2550080120563507, + "learning_rate": 4.969087146560042e-06, + "loss": 0.6623, + "step": 584 + }, + { + "epoch": 0.1798339993851829, + "grad_norm": 0.2521880269050598, + "learning_rate": 4.9689595408041625e-06, + "loss": 0.6633, + "step": 585 + }, + { + "epoch": 0.18014140793114047, + "grad_norm": 0.2523528039455414, + "learning_rate": 4.968831673862589e-06, + "loss": 0.6551, + "step": 586 + }, + { + "epoch": 0.18044881647709807, + "grad_norm": 0.2602214217185974, + "learning_rate": 4.968703545748852e-06, + "loss": 0.6694, + "step": 587 + }, + { + "epoch": 0.18075622502305563, + "grad_norm": 0.24902822077274323, + "learning_rate": 4.968575156476503e-06, + "loss": 0.7014, + "step": 588 + }, + { + "epoch": 0.18106363356901323, + "grad_norm": 0.2605271637439728, + "learning_rate": 4.968446506059125e-06, + "loss": 0.6913, + "step": 589 + }, + { + "epoch": 0.1813710421149708, + "grad_norm": 0.2538703382015228, + "learning_rate": 4.968317594510328e-06, + "loss": 0.6692, + "step": 590 + }, + { + "epoch": 0.1816784506609284, + "grad_norm": 0.26326510310173035, + "learning_rate": 4.9681884218437485e-06, + "loss": 0.6722, + "step": 591 + }, + { + "epoch": 0.18198585920688595, + "grad_norm": 0.2702607810497284, + "learning_rate": 4.968058988073051e-06, + "loss": 0.7094, + "step": 592 + }, + { + "epoch": 0.18229326775284352, + "grad_norm": 0.25501400232315063, + "learning_rate": 4.96792929321193e-06, + "loss": 0.6551, + "step": 593 + }, + { + "epoch": 0.1826006762988011, + "grad_norm": 0.2505462169647217, + "learning_rate": 4.967799337274104e-06, + "loss": 0.6573, + "step": 594 + }, + { + "epoch": 0.18290808484475868, + "grad_norm": 0.26958340406417847, + "learning_rate": 4.967669120273321e-06, + "loss": 0.6806, + "step": 595 + }, + { + "epoch": 0.18321549339071627, + "grad_norm": 0.26196449995040894, + "learning_rate": 4.967538642223356e-06, + "loss": 0.6912, + "step": 596 + }, + { + "epoch": 0.18352290193667384, + "grad_norm": 0.26596367359161377, + "learning_rate": 4.967407903138013e-06, + "loss": 0.6965, + "step": 597 + }, + { + "epoch": 0.1838303104826314, + "grad_norm": 0.26291850209236145, + "learning_rate": 4.967276903031122e-06, + "loss": 0.6789, + "step": 598 + }, + { + "epoch": 0.184137719028589, + "grad_norm": 0.26795732975006104, + "learning_rate": 4.967145641916541e-06, + "loss": 0.665, + "step": 599 + }, + { + "epoch": 0.18444512757454656, + "grad_norm": 0.24405546486377716, + "learning_rate": 4.967014119808156e-06, + "loss": 0.6847, + "step": 600 + }, + { + "epoch": 0.18475253612050416, + "grad_norm": 0.25520753860473633, + "learning_rate": 4.966882336719881e-06, + "loss": 0.6888, + "step": 601 + }, + { + "epoch": 0.18505994466646172, + "grad_norm": 0.26401686668395996, + "learning_rate": 4.966750292665655e-06, + "loss": 0.6666, + "step": 602 + }, + { + "epoch": 0.18536735321241932, + "grad_norm": 0.26253995299339294, + "learning_rate": 4.96661798765945e-06, + "loss": 0.6889, + "step": 603 + }, + { + "epoch": 0.18567476175837688, + "grad_norm": 0.24981486797332764, + "learning_rate": 4.966485421715259e-06, + "loss": 0.643, + "step": 604 + }, + { + "epoch": 0.18598217030433445, + "grad_norm": 0.26210200786590576, + "learning_rate": 4.966352594847109e-06, + "loss": 0.6656, + "step": 605 + }, + { + "epoch": 0.18628957885029204, + "grad_norm": 0.250558078289032, + "learning_rate": 4.966219507069049e-06, + "loss": 0.6398, + "step": 606 + }, + { + "epoch": 0.1865969873962496, + "grad_norm": 0.2680855989456177, + "learning_rate": 4.966086158395158e-06, + "loss": 0.6811, + "step": 607 + }, + { + "epoch": 0.1869043959422072, + "grad_norm": 0.2839406728744507, + "learning_rate": 4.965952548839544e-06, + "loss": 0.6897, + "step": 608 + }, + { + "epoch": 0.18721180448816477, + "grad_norm": 0.25288131833076477, + "learning_rate": 4.9658186784163406e-06, + "loss": 0.681, + "step": 609 + }, + { + "epoch": 0.18751921303412236, + "grad_norm": 0.2541447579860687, + "learning_rate": 4.96568454713971e-06, + "loss": 0.665, + "step": 610 + }, + { + "epoch": 0.18782662158007993, + "grad_norm": 0.25278621912002563, + "learning_rate": 4.9655501550238415e-06, + "loss": 0.6559, + "step": 611 + }, + { + "epoch": 0.1881340301260375, + "grad_norm": 0.2542255222797394, + "learning_rate": 4.965415502082952e-06, + "loss": 0.667, + "step": 612 + }, + { + "epoch": 0.1884414386719951, + "grad_norm": 0.2626453638076782, + "learning_rate": 4.965280588331286e-06, + "loss": 0.6835, + "step": 613 + }, + { + "epoch": 0.18874884721795265, + "grad_norm": 0.2495110183954239, + "learning_rate": 4.965145413783115e-06, + "loss": 0.6409, + "step": 614 + }, + { + "epoch": 0.18905625576391025, + "grad_norm": 0.2669087052345276, + "learning_rate": 4.965009978452742e-06, + "loss": 0.7015, + "step": 615 + }, + { + "epoch": 0.1893636643098678, + "grad_norm": 0.25848931074142456, + "learning_rate": 4.96487428235449e-06, + "loss": 0.6914, + "step": 616 + }, + { + "epoch": 0.18967107285582538, + "grad_norm": 0.2576999366283417, + "learning_rate": 4.964738325502717e-06, + "loss": 0.6765, + "step": 617 + }, + { + "epoch": 0.18997848140178297, + "grad_norm": 0.25248250365257263, + "learning_rate": 4.964602107911805e-06, + "loss": 0.669, + "step": 618 + }, + { + "epoch": 0.19028588994774054, + "grad_norm": 0.26001980900764465, + "learning_rate": 4.964465629596163e-06, + "loss": 0.6838, + "step": 619 + }, + { + "epoch": 0.19059329849369813, + "grad_norm": 0.2606329917907715, + "learning_rate": 4.964328890570231e-06, + "loss": 0.6687, + "step": 620 + }, + { + "epoch": 0.1909007070396557, + "grad_norm": 0.2599039077758789, + "learning_rate": 4.964191890848471e-06, + "loss": 0.6854, + "step": 621 + }, + { + "epoch": 0.1912081155856133, + "grad_norm": 0.26552683115005493, + "learning_rate": 4.964054630445379e-06, + "loss": 0.6606, + "step": 622 + }, + { + "epoch": 0.19151552413157086, + "grad_norm": 0.2543853521347046, + "learning_rate": 4.963917109375474e-06, + "loss": 0.6755, + "step": 623 + }, + { + "epoch": 0.19182293267752842, + "grad_norm": 0.24646969139575958, + "learning_rate": 4.963779327653304e-06, + "loss": 0.6624, + "step": 624 + }, + { + "epoch": 0.19213034122348602, + "grad_norm": 0.2541002631187439, + "learning_rate": 4.963641285293444e-06, + "loss": 0.6697, + "step": 625 + }, + { + "epoch": 0.19243774976944358, + "grad_norm": 0.2585326135158539, + "learning_rate": 4.963502982310499e-06, + "loss": 0.6661, + "step": 626 + }, + { + "epoch": 0.19274515831540118, + "grad_norm": 0.25656262040138245, + "learning_rate": 4.9633644187191e-06, + "loss": 0.6995, + "step": 627 + }, + { + "epoch": 0.19305256686135874, + "grad_norm": 0.25890713930130005, + "learning_rate": 4.963225594533902e-06, + "loss": 0.6468, + "step": 628 + }, + { + "epoch": 0.19335997540731634, + "grad_norm": 0.2684546113014221, + "learning_rate": 4.9630865097695956e-06, + "loss": 0.6625, + "step": 629 + }, + { + "epoch": 0.1936673839532739, + "grad_norm": 0.2578732371330261, + "learning_rate": 4.96294716444089e-06, + "loss": 0.6885, + "step": 630 + }, + { + "epoch": 0.19397479249923147, + "grad_norm": 0.24567724764347076, + "learning_rate": 4.962807558562528e-06, + "loss": 0.6429, + "step": 631 + }, + { + "epoch": 0.19428220104518906, + "grad_norm": 0.2578364610671997, + "learning_rate": 4.962667692149279e-06, + "loss": 0.6865, + "step": 632 + }, + { + "epoch": 0.19458960959114663, + "grad_norm": 0.25177448987960815, + "learning_rate": 4.962527565215938e-06, + "loss": 0.6749, + "step": 633 + }, + { + "epoch": 0.19489701813710422, + "grad_norm": 0.25360623002052307, + "learning_rate": 4.962387177777329e-06, + "loss": 0.668, + "step": 634 + }, + { + "epoch": 0.1952044266830618, + "grad_norm": 0.26047319173812866, + "learning_rate": 4.962246529848303e-06, + "loss": 0.7091, + "step": 635 + }, + { + "epoch": 0.19551183522901935, + "grad_norm": 0.24287913739681244, + "learning_rate": 4.962105621443739e-06, + "loss": 0.6645, + "step": 636 + }, + { + "epoch": 0.19581924377497695, + "grad_norm": 0.2576439380645752, + "learning_rate": 4.961964452578544e-06, + "loss": 0.685, + "step": 637 + }, + { + "epoch": 0.1961266523209345, + "grad_norm": 0.2572690546512604, + "learning_rate": 4.961823023267652e-06, + "loss": 0.6709, + "step": 638 + }, + { + "epoch": 0.1964340608668921, + "grad_norm": 0.26042258739471436, + "learning_rate": 4.961681333526023e-06, + "loss": 0.6584, + "step": 639 + }, + { + "epoch": 0.19674146941284967, + "grad_norm": 0.2490772306919098, + "learning_rate": 4.9615393833686475e-06, + "loss": 0.6904, + "step": 640 + }, + { + "epoch": 0.19704887795880727, + "grad_norm": 0.2595136761665344, + "learning_rate": 4.961397172810541e-06, + "loss": 0.6657, + "step": 641 + }, + { + "epoch": 0.19735628650476483, + "grad_norm": 0.2609310746192932, + "learning_rate": 4.961254701866749e-06, + "loss": 0.677, + "step": 642 + }, + { + "epoch": 0.1976636950507224, + "grad_norm": 0.2632048428058624, + "learning_rate": 4.9611119705523415e-06, + "loss": 0.6602, + "step": 643 + }, + { + "epoch": 0.19797110359668, + "grad_norm": 0.25774917006492615, + "learning_rate": 4.96096897888242e-06, + "loss": 0.6764, + "step": 644 + }, + { + "epoch": 0.19827851214263756, + "grad_norm": 0.25693070888519287, + "learning_rate": 4.960825726872109e-06, + "loss": 0.6667, + "step": 645 + }, + { + "epoch": 0.19858592068859515, + "grad_norm": 0.25493523478507996, + "learning_rate": 4.960682214536564e-06, + "loss": 0.6718, + "step": 646 + }, + { + "epoch": 0.19889332923455272, + "grad_norm": 0.25811508297920227, + "learning_rate": 4.960538441890966e-06, + "loss": 0.6662, + "step": 647 + }, + { + "epoch": 0.1992007377805103, + "grad_norm": 0.25233983993530273, + "learning_rate": 4.960394408950525e-06, + "loss": 0.6842, + "step": 648 + }, + { + "epoch": 0.19950814632646788, + "grad_norm": 0.2517624795436859, + "learning_rate": 4.960250115730478e-06, + "loss": 0.6625, + "step": 649 + }, + { + "epoch": 0.19981555487242544, + "grad_norm": 0.26288411021232605, + "learning_rate": 4.96010556224609e-06, + "loss": 0.6742, + "step": 650 + }, + { + "epoch": 0.20012296341838304, + "grad_norm": 0.2557123601436615, + "learning_rate": 4.959960748512651e-06, + "loss": 0.6664, + "step": 651 + }, + { + "epoch": 0.2004303719643406, + "grad_norm": 0.2678082287311554, + "learning_rate": 4.959815674545483e-06, + "loss": 0.6608, + "step": 652 + }, + { + "epoch": 0.2007377805102982, + "grad_norm": 0.25069501996040344, + "learning_rate": 4.959670340359932e-06, + "loss": 0.6752, + "step": 653 + }, + { + "epoch": 0.20104518905625576, + "grad_norm": 0.2751714587211609, + "learning_rate": 4.9595247459713714e-06, + "loss": 0.6609, + "step": 654 + }, + { + "epoch": 0.20135259760221333, + "grad_norm": 0.2570600211620331, + "learning_rate": 4.9593788913952045e-06, + "loss": 0.6411, + "step": 655 + }, + { + "epoch": 0.20166000614817092, + "grad_norm": 0.25339362025260925, + "learning_rate": 4.9592327766468615e-06, + "loss": 0.6569, + "step": 656 + }, + { + "epoch": 0.2019674146941285, + "grad_norm": 0.2566807270050049, + "learning_rate": 4.959086401741798e-06, + "loss": 0.6614, + "step": 657 + }, + { + "epoch": 0.20227482324008608, + "grad_norm": 0.2667348086833954, + "learning_rate": 4.958939766695501e-06, + "loss": 0.6815, + "step": 658 + }, + { + "epoch": 0.20258223178604365, + "grad_norm": 0.27658990025520325, + "learning_rate": 4.95879287152348e-06, + "loss": 0.6759, + "step": 659 + }, + { + "epoch": 0.20288964033200124, + "grad_norm": 0.26108837127685547, + "learning_rate": 4.958645716241276e-06, + "loss": 0.6688, + "step": 660 + }, + { + "epoch": 0.2031970488779588, + "grad_norm": 0.25907057523727417, + "learning_rate": 4.958498300864456e-06, + "loss": 0.6853, + "step": 661 + }, + { + "epoch": 0.20350445742391637, + "grad_norm": 0.2742748260498047, + "learning_rate": 4.958350625408615e-06, + "loss": 0.6799, + "step": 662 + }, + { + "epoch": 0.20381186596987397, + "grad_norm": 0.27830713987350464, + "learning_rate": 4.958202689889375e-06, + "loss": 0.6747, + "step": 663 + }, + { + "epoch": 0.20411927451583153, + "grad_norm": 0.2651318907737732, + "learning_rate": 4.958054494322387e-06, + "loss": 0.6916, + "step": 664 + }, + { + "epoch": 0.20442668306178913, + "grad_norm": 0.24833299219608307, + "learning_rate": 4.957906038723326e-06, + "loss": 0.6486, + "step": 665 + }, + { + "epoch": 0.2047340916077467, + "grad_norm": 0.25406503677368164, + "learning_rate": 4.957757323107898e-06, + "loss": 0.6615, + "step": 666 + }, + { + "epoch": 0.20504150015370426, + "grad_norm": 0.27297908067703247, + "learning_rate": 4.957608347491836e-06, + "loss": 0.6847, + "step": 667 + }, + { + "epoch": 0.20534890869966185, + "grad_norm": 0.2735472321510315, + "learning_rate": 4.957459111890899e-06, + "loss": 0.6436, + "step": 668 + }, + { + "epoch": 0.20565631724561942, + "grad_norm": 0.25384414196014404, + "learning_rate": 4.957309616320873e-06, + "loss": 0.656, + "step": 669 + }, + { + "epoch": 0.205963725791577, + "grad_norm": 0.25569432973861694, + "learning_rate": 4.957159860797576e-06, + "loss": 0.678, + "step": 670 + }, + { + "epoch": 0.20627113433753458, + "grad_norm": 0.2721424698829651, + "learning_rate": 4.957009845336847e-06, + "loss": 0.676, + "step": 671 + }, + { + "epoch": 0.20657854288349217, + "grad_norm": 0.27477967739105225, + "learning_rate": 4.956859569954559e-06, + "loss": 0.6781, + "step": 672 + }, + { + "epoch": 0.20688595142944974, + "grad_norm": 0.2637997269630432, + "learning_rate": 4.9567090346666055e-06, + "loss": 0.6764, + "step": 673 + }, + { + "epoch": 0.2071933599754073, + "grad_norm": 0.2489425390958786, + "learning_rate": 4.956558239488914e-06, + "loss": 0.6878, + "step": 674 + }, + { + "epoch": 0.2075007685213649, + "grad_norm": 0.2719458341598511, + "learning_rate": 4.956407184437437e-06, + "loss": 0.639, + "step": 675 + }, + { + "epoch": 0.20780817706732246, + "grad_norm": 0.26121658086776733, + "learning_rate": 4.956255869528152e-06, + "loss": 0.6515, + "step": 676 + }, + { + "epoch": 0.20811558561328006, + "grad_norm": 0.27100127935409546, + "learning_rate": 4.9561042947770684e-06, + "loss": 0.6875, + "step": 677 + }, + { + "epoch": 0.20842299415923762, + "grad_norm": 0.25916171073913574, + "learning_rate": 4.95595246020022e-06, + "loss": 0.6596, + "step": 678 + }, + { + "epoch": 0.20873040270519522, + "grad_norm": 0.26584476232528687, + "learning_rate": 4.9558003658136704e-06, + "loss": 0.6904, + "step": 679 + }, + { + "epoch": 0.20903781125115278, + "grad_norm": 0.2794335186481476, + "learning_rate": 4.955648011633507e-06, + "loss": 0.6774, + "step": 680 + }, + { + "epoch": 0.20934521979711035, + "grad_norm": 0.2651100754737854, + "learning_rate": 4.955495397675849e-06, + "loss": 0.6695, + "step": 681 + }, + { + "epoch": 0.20965262834306794, + "grad_norm": 0.26175498962402344, + "learning_rate": 4.95534252395684e-06, + "loss": 0.6637, + "step": 682 + }, + { + "epoch": 0.2099600368890255, + "grad_norm": 0.25834447145462036, + "learning_rate": 4.9551893904926516e-06, + "loss": 0.63, + "step": 683 + }, + { + "epoch": 0.2102674454349831, + "grad_norm": 0.2578924894332886, + "learning_rate": 4.955035997299485e-06, + "loss": 0.6746, + "step": 684 + }, + { + "epoch": 0.21057485398094067, + "grad_norm": 0.25839561223983765, + "learning_rate": 4.954882344393566e-06, + "loss": 0.6679, + "step": 685 + }, + { + "epoch": 0.21088226252689823, + "grad_norm": 0.2551698684692383, + "learning_rate": 4.954728431791151e-06, + "loss": 0.6516, + "step": 686 + }, + { + "epoch": 0.21118967107285583, + "grad_norm": 0.2622458040714264, + "learning_rate": 4.95457425950852e-06, + "loss": 0.6736, + "step": 687 + }, + { + "epoch": 0.2114970796188134, + "grad_norm": 0.2612099051475525, + "learning_rate": 4.954419827561984e-06, + "loss": 0.6919, + "step": 688 + }, + { + "epoch": 0.211804488164771, + "grad_norm": 0.26117244362831116, + "learning_rate": 4.954265135967879e-06, + "loss": 0.6664, + "step": 689 + }, + { + "epoch": 0.21211189671072855, + "grad_norm": 0.26962918043136597, + "learning_rate": 4.95411018474257e-06, + "loss": 0.6532, + "step": 690 + }, + { + "epoch": 0.21241930525668615, + "grad_norm": 0.2620513141155243, + "learning_rate": 4.953954973902449e-06, + "loss": 0.631, + "step": 691 + }, + { + "epoch": 0.2127267138026437, + "grad_norm": 0.26602956652641296, + "learning_rate": 4.953799503463934e-06, + "loss": 0.6766, + "step": 692 + }, + { + "epoch": 0.21303412234860128, + "grad_norm": 0.26681429147720337, + "learning_rate": 4.953643773443475e-06, + "loss": 0.6752, + "step": 693 + }, + { + "epoch": 0.21334153089455887, + "grad_norm": 0.2824001908302307, + "learning_rate": 4.953487783857543e-06, + "loss": 0.6735, + "step": 694 + }, + { + "epoch": 0.21364893944051644, + "grad_norm": 0.27332139015197754, + "learning_rate": 4.953331534722642e-06, + "loss": 0.655, + "step": 695 + }, + { + "epoch": 0.21395634798647403, + "grad_norm": 0.26553598046302795, + "learning_rate": 4.9531750260553e-06, + "loss": 0.6561, + "step": 696 + }, + { + "epoch": 0.2142637565324316, + "grad_norm": 0.26391762495040894, + "learning_rate": 4.953018257872075e-06, + "loss": 0.6784, + "step": 697 + }, + { + "epoch": 0.2145711650783892, + "grad_norm": 0.2535015940666199, + "learning_rate": 4.95286123018955e-06, + "loss": 0.673, + "step": 698 + }, + { + "epoch": 0.21487857362434676, + "grad_norm": 0.2696651518344879, + "learning_rate": 4.9527039430243366e-06, + "loss": 0.6705, + "step": 699 + }, + { + "epoch": 0.21518598217030432, + "grad_norm": 0.2647089958190918, + "learning_rate": 4.952546396393074e-06, + "loss": 0.6824, + "step": 700 + }, + { + "epoch": 0.21549339071626192, + "grad_norm": 0.25741392374038696, + "learning_rate": 4.9523885903124295e-06, + "loss": 0.6846, + "step": 701 + }, + { + "epoch": 0.21580079926221948, + "grad_norm": 0.2744412124156952, + "learning_rate": 4.952230524799097e-06, + "loss": 0.6647, + "step": 702 + }, + { + "epoch": 0.21610820780817708, + "grad_norm": 0.2595648169517517, + "learning_rate": 4.952072199869796e-06, + "loss": 0.679, + "step": 703 + }, + { + "epoch": 0.21641561635413464, + "grad_norm": 0.272344172000885, + "learning_rate": 4.951913615541277e-06, + "loss": 0.6805, + "step": 704 + }, + { + "epoch": 0.2167230249000922, + "grad_norm": 0.2539536952972412, + "learning_rate": 4.951754771830317e-06, + "loss": 0.6307, + "step": 705 + }, + { + "epoch": 0.2170304334460498, + "grad_norm": 0.27336791157722473, + "learning_rate": 4.951595668753717e-06, + "loss": 0.6587, + "step": 706 + }, + { + "epoch": 0.21733784199200737, + "grad_norm": 0.2749011218547821, + "learning_rate": 4.951436306328311e-06, + "loss": 0.6838, + "step": 707 + }, + { + "epoch": 0.21764525053796496, + "grad_norm": 0.2670399844646454, + "learning_rate": 4.951276684570956e-06, + "loss": 0.634, + "step": 708 + }, + { + "epoch": 0.21795265908392253, + "grad_norm": 0.26999178528785706, + "learning_rate": 4.951116803498539e-06, + "loss": 0.6419, + "step": 709 + }, + { + "epoch": 0.21826006762988012, + "grad_norm": 0.2819100320339203, + "learning_rate": 4.950956663127973e-06, + "loss": 0.6701, + "step": 710 + }, + { + "epoch": 0.2185674761758377, + "grad_norm": 0.2786787748336792, + "learning_rate": 4.950796263476198e-06, + "loss": 0.6607, + "step": 711 + }, + { + "epoch": 0.21887488472179525, + "grad_norm": 0.2577665150165558, + "learning_rate": 4.950635604560184e-06, + "loss": 0.6646, + "step": 712 + }, + { + "epoch": 0.21918229326775285, + "grad_norm": 0.26123517751693726, + "learning_rate": 4.950474686396926e-06, + "loss": 0.6677, + "step": 713 + }, + { + "epoch": 0.21948970181371041, + "grad_norm": 0.2644079625606537, + "learning_rate": 4.950313509003446e-06, + "loss": 0.6571, + "step": 714 + }, + { + "epoch": 0.219797110359668, + "grad_norm": 0.26732295751571655, + "learning_rate": 4.950152072396796e-06, + "loss": 0.6674, + "step": 715 + }, + { + "epoch": 0.22010451890562557, + "grad_norm": 0.26275402307510376, + "learning_rate": 4.9499903765940546e-06, + "loss": 0.69, + "step": 716 + }, + { + "epoch": 0.22041192745158317, + "grad_norm": 0.2704673707485199, + "learning_rate": 4.949828421612325e-06, + "loss": 0.663, + "step": 717 + }, + { + "epoch": 0.22071933599754073, + "grad_norm": 0.26878997683525085, + "learning_rate": 4.949666207468742e-06, + "loss": 0.6594, + "step": 718 + }, + { + "epoch": 0.2210267445434983, + "grad_norm": 0.26202625036239624, + "learning_rate": 4.9495037341804654e-06, + "loss": 0.6596, + "step": 719 + }, + { + "epoch": 0.2213341530894559, + "grad_norm": 0.2751867175102234, + "learning_rate": 4.949341001764683e-06, + "loss": 0.6662, + "step": 720 + }, + { + "epoch": 0.22164156163541346, + "grad_norm": 0.2594238221645355, + "learning_rate": 4.949178010238609e-06, + "loss": 0.6793, + "step": 721 + }, + { + "epoch": 0.22194897018137105, + "grad_norm": 0.2624204456806183, + "learning_rate": 4.949014759619487e-06, + "loss": 0.6811, + "step": 722 + }, + { + "epoch": 0.22225637872732862, + "grad_norm": 0.26247018575668335, + "learning_rate": 4.948851249924585e-06, + "loss": 0.67, + "step": 723 + }, + { + "epoch": 0.22256378727328618, + "grad_norm": 0.2603485584259033, + "learning_rate": 4.9486874811712035e-06, + "loss": 0.6608, + "step": 724 + }, + { + "epoch": 0.22287119581924378, + "grad_norm": 0.2616192400455475, + "learning_rate": 4.948523453376665e-06, + "loss": 0.6474, + "step": 725 + }, + { + "epoch": 0.22317860436520134, + "grad_norm": 0.2620954215526581, + "learning_rate": 4.948359166558322e-06, + "loss": 0.6624, + "step": 726 + }, + { + "epoch": 0.22348601291115894, + "grad_norm": 0.27407070994377136, + "learning_rate": 4.948194620733554e-06, + "loss": 0.6723, + "step": 727 + }, + { + "epoch": 0.2237934214571165, + "grad_norm": 0.2662461996078491, + "learning_rate": 4.948029815919768e-06, + "loss": 0.6697, + "step": 728 + }, + { + "epoch": 0.2241008300030741, + "grad_norm": 0.2622913420200348, + "learning_rate": 4.947864752134399e-06, + "loss": 0.6531, + "step": 729 + }, + { + "epoch": 0.22440823854903166, + "grad_norm": 0.2619720697402954, + "learning_rate": 4.9476994293949075e-06, + "loss": 0.6715, + "step": 730 + }, + { + "epoch": 0.22471564709498923, + "grad_norm": 0.2599133253097534, + "learning_rate": 4.947533847718784e-06, + "loss": 0.6456, + "step": 731 + }, + { + "epoch": 0.22502305564094682, + "grad_norm": 0.28690043091773987, + "learning_rate": 4.947368007123544e-06, + "loss": 0.6749, + "step": 732 + }, + { + "epoch": 0.2253304641869044, + "grad_norm": 0.26802098751068115, + "learning_rate": 4.947201907626732e-06, + "loss": 0.6479, + "step": 733 + }, + { + "epoch": 0.22563787273286198, + "grad_norm": 0.2694631516933441, + "learning_rate": 4.9470355492459186e-06, + "loss": 0.6853, + "step": 734 + }, + { + "epoch": 0.22594528127881955, + "grad_norm": 0.26029765605926514, + "learning_rate": 4.946868931998703e-06, + "loss": 0.6594, + "step": 735 + }, + { + "epoch": 0.22625268982477714, + "grad_norm": 0.28009122610092163, + "learning_rate": 4.94670205590271e-06, + "loss": 0.6983, + "step": 736 + }, + { + "epoch": 0.2265600983707347, + "grad_norm": 0.27814438939094543, + "learning_rate": 4.946534920975595e-06, + "loss": 0.6813, + "step": 737 + }, + { + "epoch": 0.22686750691669227, + "grad_norm": 0.26479458808898926, + "learning_rate": 4.946367527235038e-06, + "loss": 0.6481, + "step": 738 + }, + { + "epoch": 0.22717491546264987, + "grad_norm": 0.27032554149627686, + "learning_rate": 4.946199874698748e-06, + "loss": 0.6466, + "step": 739 + }, + { + "epoch": 0.22748232400860743, + "grad_norm": 0.27509844303131104, + "learning_rate": 4.9460319633844595e-06, + "loss": 0.6531, + "step": 740 + }, + { + "epoch": 0.22778973255456503, + "grad_norm": 0.2816414535045624, + "learning_rate": 4.945863793309935e-06, + "loss": 0.6739, + "step": 741 + }, + { + "epoch": 0.2280971411005226, + "grad_norm": 0.27062103152275085, + "learning_rate": 4.945695364492966e-06, + "loss": 0.6611, + "step": 742 + }, + { + "epoch": 0.22840454964648016, + "grad_norm": 0.2724711000919342, + "learning_rate": 4.94552667695137e-06, + "loss": 0.6595, + "step": 743 + }, + { + "epoch": 0.22871195819243775, + "grad_norm": 0.2618529796600342, + "learning_rate": 4.945357730702992e-06, + "loss": 0.6623, + "step": 744 + }, + { + "epoch": 0.22901936673839532, + "grad_norm": 0.2670546770095825, + "learning_rate": 4.945188525765704e-06, + "loss": 0.6686, + "step": 745 + }, + { + "epoch": 0.2293267752843529, + "grad_norm": 0.2832658588886261, + "learning_rate": 4.945019062157406e-06, + "loss": 0.6227, + "step": 746 + }, + { + "epoch": 0.22963418383031048, + "grad_norm": 0.2746593952178955, + "learning_rate": 4.944849339896026e-06, + "loss": 0.6788, + "step": 747 + }, + { + "epoch": 0.22994159237626807, + "grad_norm": 0.2730538547039032, + "learning_rate": 4.944679358999517e-06, + "loss": 0.6927, + "step": 748 + }, + { + "epoch": 0.23024900092222564, + "grad_norm": 0.26855310797691345, + "learning_rate": 4.944509119485863e-06, + "loss": 0.6699, + "step": 749 + }, + { + "epoch": 0.2305564094681832, + "grad_norm": 0.27263903617858887, + "learning_rate": 4.94433862137307e-06, + "loss": 0.6797, + "step": 750 + }, + { + "epoch": 0.2308638180141408, + "grad_norm": 0.27998316287994385, + "learning_rate": 4.944167864679178e-06, + "loss": 0.655, + "step": 751 + }, + { + "epoch": 0.23117122656009836, + "grad_norm": 0.2675449252128601, + "learning_rate": 4.94399684942225e-06, + "loss": 0.6779, + "step": 752 + }, + { + "epoch": 0.23147863510605596, + "grad_norm": 0.26816803216934204, + "learning_rate": 4.9438255756203755e-06, + "loss": 0.6865, + "step": 753 + }, + { + "epoch": 0.23178604365201352, + "grad_norm": 0.2696927487850189, + "learning_rate": 4.943654043291676e-06, + "loss": 0.6775, + "step": 754 + }, + { + "epoch": 0.23209345219797112, + "grad_norm": 0.25898706912994385, + "learning_rate": 4.943482252454295e-06, + "loss": 0.6853, + "step": 755 + }, + { + "epoch": 0.23240086074392868, + "grad_norm": 0.25592583417892456, + "learning_rate": 4.943310203126407e-06, + "loss": 0.6517, + "step": 756 + }, + { + "epoch": 0.23270826928988625, + "grad_norm": 0.2721053659915924, + "learning_rate": 4.943137895326214e-06, + "loss": 0.6574, + "step": 757 + }, + { + "epoch": 0.23301567783584384, + "grad_norm": 0.2750680446624756, + "learning_rate": 4.942965329071942e-06, + "loss": 0.6279, + "step": 758 + }, + { + "epoch": 0.2333230863818014, + "grad_norm": 0.2667582929134369, + "learning_rate": 4.942792504381846e-06, + "loss": 0.665, + "step": 759 + }, + { + "epoch": 0.233630494927759, + "grad_norm": 0.26429447531700134, + "learning_rate": 4.942619421274211e-06, + "loss": 0.6456, + "step": 760 + }, + { + "epoch": 0.23393790347371657, + "grad_norm": 0.2873212397098541, + "learning_rate": 4.942446079767346e-06, + "loss": 0.6585, + "step": 761 + }, + { + "epoch": 0.23424531201967413, + "grad_norm": 0.26627352833747864, + "learning_rate": 4.942272479879588e-06, + "loss": 0.6674, + "step": 762 + }, + { + "epoch": 0.23455272056563173, + "grad_norm": 0.2861795425415039, + "learning_rate": 4.942098621629302e-06, + "loss": 0.6701, + "step": 763 + }, + { + "epoch": 0.2348601291115893, + "grad_norm": 0.264367938041687, + "learning_rate": 4.94192450503488e-06, + "loss": 0.6698, + "step": 764 + }, + { + "epoch": 0.2351675376575469, + "grad_norm": 0.28627654910087585, + "learning_rate": 4.941750130114742e-06, + "loss": 0.6789, + "step": 765 + }, + { + "epoch": 0.23547494620350445, + "grad_norm": 0.25917142629623413, + "learning_rate": 4.941575496887334e-06, + "loss": 0.6638, + "step": 766 + }, + { + "epoch": 0.23578235474946205, + "grad_norm": 0.282570481300354, + "learning_rate": 4.94140060537113e-06, + "loss": 0.6626, + "step": 767 + }, + { + "epoch": 0.2360897632954196, + "grad_norm": 0.2688579559326172, + "learning_rate": 4.941225455584633e-06, + "loss": 0.6612, + "step": 768 + }, + { + "epoch": 0.23639717184137718, + "grad_norm": 0.2687447667121887, + "learning_rate": 4.94105004754637e-06, + "loss": 0.6899, + "step": 769 + }, + { + "epoch": 0.23670458038733477, + "grad_norm": 0.26782941818237305, + "learning_rate": 4.9408743812748975e-06, + "loss": 0.6708, + "step": 770 + }, + { + "epoch": 0.23701198893329234, + "grad_norm": 0.2636446952819824, + "learning_rate": 4.940698456788798e-06, + "loss": 0.6818, + "step": 771 + }, + { + "epoch": 0.23731939747924993, + "grad_norm": 0.2775349020957947, + "learning_rate": 4.9405222741066835e-06, + "loss": 0.6757, + "step": 772 + }, + { + "epoch": 0.2376268060252075, + "grad_norm": 0.2643603980541229, + "learning_rate": 4.940345833247192e-06, + "loss": 0.6908, + "step": 773 + }, + { + "epoch": 0.23793421457116506, + "grad_norm": 0.2809801697731018, + "learning_rate": 4.940169134228987e-06, + "loss": 0.6672, + "step": 774 + }, + { + "epoch": 0.23824162311712266, + "grad_norm": 0.28609442710876465, + "learning_rate": 4.939992177070763e-06, + "loss": 0.6699, + "step": 775 + }, + { + "epoch": 0.23854903166308022, + "grad_norm": 0.2694535255432129, + "learning_rate": 4.93981496179124e-06, + "loss": 0.6641, + "step": 776 + }, + { + "epoch": 0.23885644020903782, + "grad_norm": 0.2742762565612793, + "learning_rate": 4.939637488409162e-06, + "loss": 0.6536, + "step": 777 + }, + { + "epoch": 0.23916384875499538, + "grad_norm": 0.27596384286880493, + "learning_rate": 4.9394597569433076e-06, + "loss": 0.6785, + "step": 778 + }, + { + "epoch": 0.23947125730095298, + "grad_norm": 0.2614978849887848, + "learning_rate": 4.939281767412477e-06, + "loss": 0.6709, + "step": 779 + }, + { + "epoch": 0.23977866584691054, + "grad_norm": 0.2663417160511017, + "learning_rate": 4.939103519835499e-06, + "loss": 0.6458, + "step": 780 + }, + { + "epoch": 0.2400860743928681, + "grad_norm": 0.2701205909252167, + "learning_rate": 4.93892501423123e-06, + "loss": 0.6494, + "step": 781 + }, + { + "epoch": 0.2403934829388257, + "grad_norm": 0.26919201016426086, + "learning_rate": 4.938746250618555e-06, + "loss": 0.6949, + "step": 782 + }, + { + "epoch": 0.24070089148478327, + "grad_norm": 0.280649334192276, + "learning_rate": 4.938567229016383e-06, + "loss": 0.6777, + "step": 783 + }, + { + "epoch": 0.24100830003074086, + "grad_norm": 0.2781096398830414, + "learning_rate": 4.938387949443654e-06, + "loss": 0.6779, + "step": 784 + }, + { + "epoch": 0.24131570857669843, + "grad_norm": 0.2755800187587738, + "learning_rate": 4.938208411919333e-06, + "loss": 0.66, + "step": 785 + }, + { + "epoch": 0.24162311712265602, + "grad_norm": 0.2689821720123291, + "learning_rate": 4.938028616462411e-06, + "loss": 0.6584, + "step": 786 + }, + { + "epoch": 0.2419305256686136, + "grad_norm": 0.26898521184921265, + "learning_rate": 4.937848563091911e-06, + "loss": 0.6718, + "step": 787 + }, + { + "epoch": 0.24223793421457115, + "grad_norm": 0.2733604311943054, + "learning_rate": 4.937668251826881e-06, + "loss": 0.6414, + "step": 788 + }, + { + "epoch": 0.24254534276052875, + "grad_norm": 0.26993444561958313, + "learning_rate": 4.937487682686391e-06, + "loss": 0.6923, + "step": 789 + }, + { + "epoch": 0.24285275130648631, + "grad_norm": 0.27963343262672424, + "learning_rate": 4.9373068556895485e-06, + "loss": 0.6815, + "step": 790 + }, + { + "epoch": 0.2431601598524439, + "grad_norm": 0.31615933775901794, + "learning_rate": 4.9371257708554785e-06, + "loss": 0.6641, + "step": 791 + }, + { + "epoch": 0.24346756839840147, + "grad_norm": 0.2661019265651703, + "learning_rate": 4.93694442820334e-06, + "loss": 0.6522, + "step": 792 + }, + { + "epoch": 0.24377497694435904, + "grad_norm": 0.2866568863391876, + "learning_rate": 4.9367628277523176e-06, + "loss": 0.6832, + "step": 793 + }, + { + "epoch": 0.24408238549031663, + "grad_norm": 0.2947637736797333, + "learning_rate": 4.936580969521619e-06, + "loss": 0.6741, + "step": 794 + }, + { + "epoch": 0.2443897940362742, + "grad_norm": 0.2759728729724884, + "learning_rate": 4.9363988535304856e-06, + "loss": 0.6452, + "step": 795 + }, + { + "epoch": 0.2446972025822318, + "grad_norm": 0.2817714810371399, + "learning_rate": 4.9362164797981825e-06, + "loss": 0.6407, + "step": 796 + }, + { + "epoch": 0.24500461112818936, + "grad_norm": 0.27466192841529846, + "learning_rate": 4.936033848344001e-06, + "loss": 0.6388, + "step": 797 + }, + { + "epoch": 0.24531201967414695, + "grad_norm": 0.2746501564979553, + "learning_rate": 4.935850959187263e-06, + "loss": 0.6542, + "step": 798 + }, + { + "epoch": 0.24561942822010452, + "grad_norm": 0.26667994260787964, + "learning_rate": 4.9356678123473154e-06, + "loss": 0.636, + "step": 799 + }, + { + "epoch": 0.24592683676606208, + "grad_norm": 0.28103873133659363, + "learning_rate": 4.935484407843533e-06, + "loss": 0.6685, + "step": 800 + }, + { + "epoch": 0.24623424531201968, + "grad_norm": 0.2831793427467346, + "learning_rate": 4.935300745695317e-06, + "loss": 0.6606, + "step": 801 + }, + { + "epoch": 0.24654165385797724, + "grad_norm": 0.2865827679634094, + "learning_rate": 4.935116825922099e-06, + "loss": 0.6695, + "step": 802 + }, + { + "epoch": 0.24684906240393484, + "grad_norm": 0.28268373012542725, + "learning_rate": 4.934932648543332e-06, + "loss": 0.6672, + "step": 803 + }, + { + "epoch": 0.2471564709498924, + "grad_norm": 0.2684009373188019, + "learning_rate": 4.934748213578501e-06, + "loss": 0.6643, + "step": 804 + }, + { + "epoch": 0.24746387949585, + "grad_norm": 0.2727474570274353, + "learning_rate": 4.934563521047119e-06, + "loss": 0.7024, + "step": 805 + }, + { + "epoch": 0.24777128804180756, + "grad_norm": 0.2793123424053192, + "learning_rate": 4.934378570968721e-06, + "loss": 0.6749, + "step": 806 + }, + { + "epoch": 0.24807869658776513, + "grad_norm": 0.28544074296951294, + "learning_rate": 4.934193363362875e-06, + "loss": 0.6753, + "step": 807 + }, + { + "epoch": 0.24838610513372272, + "grad_norm": 0.27918943762779236, + "learning_rate": 4.934007898249173e-06, + "loss": 0.6645, + "step": 808 + }, + { + "epoch": 0.2486935136796803, + "grad_norm": 0.268384724855423, + "learning_rate": 4.933822175647233e-06, + "loss": 0.6861, + "step": 809 + }, + { + "epoch": 0.24900092222563788, + "grad_norm": 0.27463585138320923, + "learning_rate": 4.933636195576705e-06, + "loss": 0.6375, + "step": 810 + }, + { + "epoch": 0.24930833077159545, + "grad_norm": 0.2645835280418396, + "learning_rate": 4.933449958057262e-06, + "loss": 0.665, + "step": 811 + }, + { + "epoch": 0.24961573931755301, + "grad_norm": 0.28224438428878784, + "learning_rate": 4.9332634631086065e-06, + "loss": 0.6891, + "step": 812 + }, + { + "epoch": 0.2499231478635106, + "grad_norm": 0.26846227049827576, + "learning_rate": 4.933076710750466e-06, + "loss": 0.6654, + "step": 813 + }, + { + "epoch": 0.2502305564094682, + "grad_norm": 0.27603524923324585, + "learning_rate": 4.9328897010025975e-06, + "loss": 0.6618, + "step": 814 + }, + { + "epoch": 0.25053796495542574, + "grad_norm": 0.26666751503944397, + "learning_rate": 4.9327024338847836e-06, + "loss": 0.6532, + "step": 815 + }, + { + "epoch": 0.25084537350138336, + "grad_norm": 0.2866850197315216, + "learning_rate": 4.932514909416837e-06, + "loss": 0.6452, + "step": 816 + }, + { + "epoch": 0.25115278204734093, + "grad_norm": 0.28107506036758423, + "learning_rate": 4.932327127618593e-06, + "loss": 0.6553, + "step": 817 + }, + { + "epoch": 0.2514601905932985, + "grad_norm": 0.2634201943874359, + "learning_rate": 4.932139088509917e-06, + "loss": 0.6709, + "step": 818 + }, + { + "epoch": 0.25176759913925606, + "grad_norm": 0.26501449942588806, + "learning_rate": 4.9319507921107036e-06, + "loss": 0.6405, + "step": 819 + }, + { + "epoch": 0.2520750076852136, + "grad_norm": 0.2839086651802063, + "learning_rate": 4.931762238440869e-06, + "loss": 0.6768, + "step": 820 + }, + { + "epoch": 0.25238241623117125, + "grad_norm": 0.2748083472251892, + "learning_rate": 4.931573427520362e-06, + "loss": 0.674, + "step": 821 + }, + { + "epoch": 0.2526898247771288, + "grad_norm": 0.2639026343822479, + "learning_rate": 4.931384359369156e-06, + "loss": 0.6769, + "step": 822 + }, + { + "epoch": 0.2529972333230864, + "grad_norm": 0.2638327181339264, + "learning_rate": 4.931195034007251e-06, + "loss": 0.6756, + "step": 823 + }, + { + "epoch": 0.25330464186904394, + "grad_norm": 0.2688182592391968, + "learning_rate": 4.931005451454678e-06, + "loss": 0.6704, + "step": 824 + }, + { + "epoch": 0.2536120504150015, + "grad_norm": 0.25220006704330444, + "learning_rate": 4.9308156117314895e-06, + "loss": 0.6162, + "step": 825 + }, + { + "epoch": 0.25391945896095913, + "grad_norm": 0.2677249610424042, + "learning_rate": 4.930625514857769e-06, + "loss": 0.6476, + "step": 826 + }, + { + "epoch": 0.2542268675069167, + "grad_norm": 0.265381395816803, + "learning_rate": 4.930435160853629e-06, + "loss": 0.6669, + "step": 827 + }, + { + "epoch": 0.25453427605287426, + "grad_norm": 0.27576640248298645, + "learning_rate": 4.930244549739202e-06, + "loss": 0.6335, + "step": 828 + }, + { + "epoch": 0.25484168459883183, + "grad_norm": 0.2772267460823059, + "learning_rate": 4.930053681534655e-06, + "loss": 0.6516, + "step": 829 + }, + { + "epoch": 0.25514909314478945, + "grad_norm": 0.27747246623039246, + "learning_rate": 4.929862556260181e-06, + "loss": 0.6486, + "step": 830 + }, + { + "epoch": 0.255456501690747, + "grad_norm": 0.2856988310813904, + "learning_rate": 4.929671173935997e-06, + "loss": 0.6698, + "step": 831 + }, + { + "epoch": 0.2557639102367046, + "grad_norm": 0.278134286403656, + "learning_rate": 4.9294795345823475e-06, + "loss": 0.6385, + "step": 832 + }, + { + "epoch": 0.25607131878266215, + "grad_norm": 0.282972514629364, + "learning_rate": 4.929287638219508e-06, + "loss": 0.6665, + "step": 833 + }, + { + "epoch": 0.2563787273286197, + "grad_norm": 0.2686905264854431, + "learning_rate": 4.929095484867778e-06, + "loss": 0.6376, + "step": 834 + }, + { + "epoch": 0.25668613587457734, + "grad_norm": 0.27265262603759766, + "learning_rate": 4.928903074547485e-06, + "loss": 0.6622, + "step": 835 + }, + { + "epoch": 0.2569935444205349, + "grad_norm": 0.26456764340400696, + "learning_rate": 4.928710407278982e-06, + "loss": 0.6601, + "step": 836 + }, + { + "epoch": 0.25730095296649247, + "grad_norm": 0.26518625020980835, + "learning_rate": 4.928517483082655e-06, + "loss": 0.6399, + "step": 837 + }, + { + "epoch": 0.25760836151245003, + "grad_norm": 0.2598682940006256, + "learning_rate": 4.928324301978908e-06, + "loss": 0.6487, + "step": 838 + }, + { + "epoch": 0.2579157700584076, + "grad_norm": 0.26371216773986816, + "learning_rate": 4.928130863988181e-06, + "loss": 0.646, + "step": 839 + }, + { + "epoch": 0.2582231786043652, + "grad_norm": 0.2707434594631195, + "learning_rate": 4.927937169130935e-06, + "loss": 0.6524, + "step": 840 + }, + { + "epoch": 0.2585305871503228, + "grad_norm": 0.2712406516075134, + "learning_rate": 4.927743217427662e-06, + "loss": 0.6637, + "step": 841 + }, + { + "epoch": 0.25883799569628035, + "grad_norm": 0.2861570417881012, + "learning_rate": 4.927549008898879e-06, + "loss": 0.6614, + "step": 842 + }, + { + "epoch": 0.2591454042422379, + "grad_norm": 0.27062320709228516, + "learning_rate": 4.927354543565131e-06, + "loss": 0.6641, + "step": 843 + }, + { + "epoch": 0.2594528127881955, + "grad_norm": 0.28554990887641907, + "learning_rate": 4.927159821446989e-06, + "loss": 0.6728, + "step": 844 + }, + { + "epoch": 0.2597602213341531, + "grad_norm": 0.2839095890522003, + "learning_rate": 4.926964842565055e-06, + "loss": 0.6653, + "step": 845 + }, + { + "epoch": 0.2600676298801107, + "grad_norm": 0.2673824727535248, + "learning_rate": 4.9267696069399525e-06, + "loss": 0.6636, + "step": 846 + }, + { + "epoch": 0.26037503842606824, + "grad_norm": 0.28149592876434326, + "learning_rate": 4.926574114592337e-06, + "loss": 0.6472, + "step": 847 + }, + { + "epoch": 0.2606824469720258, + "grad_norm": 0.26293984055519104, + "learning_rate": 4.926378365542889e-06, + "loss": 0.6553, + "step": 848 + }, + { + "epoch": 0.2609898555179834, + "grad_norm": 0.2730710506439209, + "learning_rate": 4.926182359812315e-06, + "loss": 0.6737, + "step": 849 + }, + { + "epoch": 0.261297264063941, + "grad_norm": 0.2707740068435669, + "learning_rate": 4.925986097421351e-06, + "loss": 0.6261, + "step": 850 + }, + { + "epoch": 0.26160467260989856, + "grad_norm": 0.27371007204055786, + "learning_rate": 4.925789578390759e-06, + "loss": 0.6543, + "step": 851 + }, + { + "epoch": 0.2619120811558561, + "grad_norm": 0.2727755308151245, + "learning_rate": 4.925592802741328e-06, + "loss": 0.6838, + "step": 852 + }, + { + "epoch": 0.2622194897018137, + "grad_norm": 0.27672725915908813, + "learning_rate": 4.925395770493875e-06, + "loss": 0.6567, + "step": 853 + }, + { + "epoch": 0.2625268982477713, + "grad_norm": 0.2812221944332123, + "learning_rate": 4.925198481669243e-06, + "loss": 0.6598, + "step": 854 + }, + { + "epoch": 0.2628343067937289, + "grad_norm": 0.27474042773246765, + "learning_rate": 4.925000936288303e-06, + "loss": 0.6657, + "step": 855 + }, + { + "epoch": 0.26314171533968644, + "grad_norm": 0.2859150767326355, + "learning_rate": 4.924803134371954e-06, + "loss": 0.6657, + "step": 856 + }, + { + "epoch": 0.263449123885644, + "grad_norm": 0.2753065526485443, + "learning_rate": 4.924605075941119e-06, + "loss": 0.6543, + "step": 857 + }, + { + "epoch": 0.2637565324316016, + "grad_norm": 0.2638867497444153, + "learning_rate": 4.924406761016751e-06, + "loss": 0.6749, + "step": 858 + }, + { + "epoch": 0.2640639409775592, + "grad_norm": 0.28440067172050476, + "learning_rate": 4.92420818961983e-06, + "loss": 0.6451, + "step": 859 + }, + { + "epoch": 0.26437134952351676, + "grad_norm": 0.26432809233665466, + "learning_rate": 4.924009361771362e-06, + "loss": 0.6609, + "step": 860 + }, + { + "epoch": 0.26467875806947433, + "grad_norm": 0.2738279700279236, + "learning_rate": 4.92381027749238e-06, + "loss": 0.6526, + "step": 861 + }, + { + "epoch": 0.2649861666154319, + "grad_norm": 0.28444328904151917, + "learning_rate": 4.923610936803945e-06, + "loss": 0.6375, + "step": 862 + }, + { + "epoch": 0.26529357516138946, + "grad_norm": 0.2695639133453369, + "learning_rate": 4.923411339727145e-06, + "loss": 0.6493, + "step": 863 + }, + { + "epoch": 0.2656009837073471, + "grad_norm": 0.29440760612487793, + "learning_rate": 4.923211486283095e-06, + "loss": 0.6602, + "step": 864 + }, + { + "epoch": 0.26590839225330465, + "grad_norm": 0.2817835807800293, + "learning_rate": 4.923011376492937e-06, + "loss": 0.6333, + "step": 865 + }, + { + "epoch": 0.2662158007992622, + "grad_norm": 0.26602670550346375, + "learning_rate": 4.922811010377839e-06, + "loss": 0.6565, + "step": 866 + }, + { + "epoch": 0.2665232093452198, + "grad_norm": 0.2698235511779785, + "learning_rate": 4.922610387958999e-06, + "loss": 0.6674, + "step": 867 + }, + { + "epoch": 0.2668306178911774, + "grad_norm": 0.27588701248168945, + "learning_rate": 4.92240950925764e-06, + "loss": 0.6703, + "step": 868 + }, + { + "epoch": 0.26713802643713497, + "grad_norm": 0.2847118675708771, + "learning_rate": 4.922208374295011e-06, + "loss": 0.6805, + "step": 869 + }, + { + "epoch": 0.26744543498309253, + "grad_norm": 0.2722637951374054, + "learning_rate": 4.922006983092392e-06, + "loss": 0.6852, + "step": 870 + }, + { + "epoch": 0.2677528435290501, + "grad_norm": 0.2654719054698944, + "learning_rate": 4.921805335671086e-06, + "loss": 0.6483, + "step": 871 + }, + { + "epoch": 0.26806025207500767, + "grad_norm": 0.27018728852272034, + "learning_rate": 4.921603432052426e-06, + "loss": 0.659, + "step": 872 + }, + { + "epoch": 0.2683676606209653, + "grad_norm": 0.29125022888183594, + "learning_rate": 4.9214012722577695e-06, + "loss": 0.674, + "step": 873 + }, + { + "epoch": 0.26867506916692285, + "grad_norm": 0.2821239233016968, + "learning_rate": 4.921198856308504e-06, + "loss": 0.6873, + "step": 874 + }, + { + "epoch": 0.2689824777128804, + "grad_norm": 0.2825275957584381, + "learning_rate": 4.9209961842260414e-06, + "loss": 0.6704, + "step": 875 + }, + { + "epoch": 0.269289886258838, + "grad_norm": 0.2743333578109741, + "learning_rate": 4.920793256031823e-06, + "loss": 0.657, + "step": 876 + }, + { + "epoch": 0.26959729480479555, + "grad_norm": 0.283241331577301, + "learning_rate": 4.920590071747315e-06, + "loss": 0.6795, + "step": 877 + }, + { + "epoch": 0.2699047033507532, + "grad_norm": 0.2759643793106079, + "learning_rate": 4.920386631394013e-06, + "loss": 0.6478, + "step": 878 + }, + { + "epoch": 0.27021211189671074, + "grad_norm": 0.28253209590911865, + "learning_rate": 4.9201829349934376e-06, + "loss": 0.6767, + "step": 879 + }, + { + "epoch": 0.2705195204426683, + "grad_norm": 0.28054240345954895, + "learning_rate": 4.919978982567138e-06, + "loss": 0.6373, + "step": 880 + }, + { + "epoch": 0.27082692898862587, + "grad_norm": 0.2865906357765198, + "learning_rate": 4.919774774136689e-06, + "loss": 0.6505, + "step": 881 + }, + { + "epoch": 0.27113433753458344, + "grad_norm": 0.2798917889595032, + "learning_rate": 4.919570309723695e-06, + "loss": 0.6218, + "step": 882 + }, + { + "epoch": 0.27144174608054106, + "grad_norm": 0.28011131286621094, + "learning_rate": 4.919365589349783e-06, + "loss": 0.6894, + "step": 883 + }, + { + "epoch": 0.2717491546264986, + "grad_norm": 0.2739619314670563, + "learning_rate": 4.9191606130366134e-06, + "loss": 0.6512, + "step": 884 + }, + { + "epoch": 0.2720565631724562, + "grad_norm": 0.2722596228122711, + "learning_rate": 4.918955380805867e-06, + "loss": 0.6622, + "step": 885 + }, + { + "epoch": 0.27236397171841376, + "grad_norm": 0.27943506836891174, + "learning_rate": 4.918749892679258e-06, + "loss": 0.6557, + "step": 886 + }, + { + "epoch": 0.2726713802643714, + "grad_norm": 0.26833900809288025, + "learning_rate": 4.918544148678522e-06, + "loss": 0.6713, + "step": 887 + }, + { + "epoch": 0.27297878881032894, + "grad_norm": 0.2778347134590149, + "learning_rate": 4.918338148825424e-06, + "loss": 0.6592, + "step": 888 + }, + { + "epoch": 0.2732861973562865, + "grad_norm": 0.28899693489074707, + "learning_rate": 4.9181318931417585e-06, + "loss": 0.66, + "step": 889 + }, + { + "epoch": 0.2735936059022441, + "grad_norm": 0.27082186937332153, + "learning_rate": 4.9179253816493435e-06, + "loss": 0.6316, + "step": 890 + }, + { + "epoch": 0.27390101444820164, + "grad_norm": 0.279742032289505, + "learning_rate": 4.917718614370026e-06, + "loss": 0.6726, + "step": 891 + }, + { + "epoch": 0.27420842299415926, + "grad_norm": 0.2831493318080902, + "learning_rate": 4.9175115913256775e-06, + "loss": 0.6716, + "step": 892 + }, + { + "epoch": 0.27451583154011683, + "grad_norm": 0.2837923467159271, + "learning_rate": 4.917304312538201e-06, + "loss": 0.6605, + "step": 893 + }, + { + "epoch": 0.2748232400860744, + "grad_norm": 0.27232810854911804, + "learning_rate": 4.917096778029523e-06, + "loss": 0.6715, + "step": 894 + }, + { + "epoch": 0.27513064863203196, + "grad_norm": 0.28728893399238586, + "learning_rate": 4.916888987821598e-06, + "loss": 0.6554, + "step": 895 + }, + { + "epoch": 0.2754380571779895, + "grad_norm": 0.2738216519355774, + "learning_rate": 4.916680941936408e-06, + "loss": 0.6484, + "step": 896 + }, + { + "epoch": 0.27574546572394715, + "grad_norm": 0.27484771609306335, + "learning_rate": 4.916472640395961e-06, + "loss": 0.6563, + "step": 897 + }, + { + "epoch": 0.2760528742699047, + "grad_norm": 0.2635241448879242, + "learning_rate": 4.916264083222293e-06, + "loss": 0.6448, + "step": 898 + }, + { + "epoch": 0.2763602828158623, + "grad_norm": 0.278667688369751, + "learning_rate": 4.916055270437468e-06, + "loss": 0.6698, + "step": 899 + }, + { + "epoch": 0.27666769136181985, + "grad_norm": 0.2718406617641449, + "learning_rate": 4.915846202063574e-06, + "loss": 0.6417, + "step": 900 + }, + { + "epoch": 0.2769750999077774, + "grad_norm": 0.27329710125923157, + "learning_rate": 4.915636878122729e-06, + "loss": 0.6708, + "step": 901 + }, + { + "epoch": 0.27728250845373503, + "grad_norm": 0.2778084874153137, + "learning_rate": 4.9154272986370765e-06, + "loss": 0.663, + "step": 902 + }, + { + "epoch": 0.2775899169996926, + "grad_norm": 0.2850241959095001, + "learning_rate": 4.9152174636287884e-06, + "loss": 0.6644, + "step": 903 + }, + { + "epoch": 0.27789732554565016, + "grad_norm": 0.28188356757164, + "learning_rate": 4.915007373120061e-06, + "loss": 0.6553, + "step": 904 + }, + { + "epoch": 0.27820473409160773, + "grad_norm": 0.2768317759037018, + "learning_rate": 4.91479702713312e-06, + "loss": 0.6549, + "step": 905 + }, + { + "epoch": 0.27851214263756535, + "grad_norm": 0.2942928075790405, + "learning_rate": 4.914586425690219e-06, + "loss": 0.6528, + "step": 906 + }, + { + "epoch": 0.2788195511835229, + "grad_norm": 0.28510066866874695, + "learning_rate": 4.914375568813634e-06, + "loss": 0.6704, + "step": 907 + }, + { + "epoch": 0.2791269597294805, + "grad_norm": 0.2745829224586487, + "learning_rate": 4.914164456525674e-06, + "loss": 0.6799, + "step": 908 + }, + { + "epoch": 0.27943436827543805, + "grad_norm": 0.2722851634025574, + "learning_rate": 4.91395308884867e-06, + "loss": 0.6647, + "step": 909 + }, + { + "epoch": 0.2797417768213956, + "grad_norm": 0.27554479241371155, + "learning_rate": 4.913741465804984e-06, + "loss": 0.6359, + "step": 910 + }, + { + "epoch": 0.28004918536735324, + "grad_norm": 0.2794571816921234, + "learning_rate": 4.913529587417001e-06, + "loss": 0.6703, + "step": 911 + }, + { + "epoch": 0.2803565939133108, + "grad_norm": 0.2773742377758026, + "learning_rate": 4.913317453707137e-06, + "loss": 0.6658, + "step": 912 + }, + { + "epoch": 0.28066400245926837, + "grad_norm": 0.2678312063217163, + "learning_rate": 4.913105064697832e-06, + "loss": 0.6501, + "step": 913 + }, + { + "epoch": 0.28097141100522594, + "grad_norm": 0.266136109828949, + "learning_rate": 4.912892420411554e-06, + "loss": 0.6768, + "step": 914 + }, + { + "epoch": 0.2812788195511835, + "grad_norm": 0.2856849431991577, + "learning_rate": 4.9126795208708e-06, + "loss": 0.6644, + "step": 915 + }, + { + "epoch": 0.2815862280971411, + "grad_norm": 0.2794843316078186, + "learning_rate": 4.91246636609809e-06, + "loss": 0.667, + "step": 916 + }, + { + "epoch": 0.2818936366430987, + "grad_norm": 0.27441468834877014, + "learning_rate": 4.912252956115974e-06, + "loss": 0.6603, + "step": 917 + }, + { + "epoch": 0.28220104518905625, + "grad_norm": 0.27316445112228394, + "learning_rate": 4.912039290947028e-06, + "loss": 0.6518, + "step": 918 + }, + { + "epoch": 0.2825084537350138, + "grad_norm": 0.2775651812553406, + "learning_rate": 4.911825370613855e-06, + "loss": 0.6364, + "step": 919 + }, + { + "epoch": 0.2828158622809714, + "grad_norm": 0.27441123127937317, + "learning_rate": 4.911611195139087e-06, + "loss": 0.6594, + "step": 920 + }, + { + "epoch": 0.283123270826929, + "grad_norm": 0.2887556850910187, + "learning_rate": 4.911396764545379e-06, + "loss": 0.6521, + "step": 921 + }, + { + "epoch": 0.2834306793728866, + "grad_norm": 0.2651980519294739, + "learning_rate": 4.9111820788554144e-06, + "loss": 0.6684, + "step": 922 + }, + { + "epoch": 0.28373808791884414, + "grad_norm": 0.2838245630264282, + "learning_rate": 4.910967138091907e-06, + "loss": 0.6673, + "step": 923 + }, + { + "epoch": 0.2840454964648017, + "grad_norm": 0.2757062017917633, + "learning_rate": 4.910751942277593e-06, + "loss": 0.6554, + "step": 924 + }, + { + "epoch": 0.28435290501075927, + "grad_norm": 0.26697805523872375, + "learning_rate": 4.910536491435239e-06, + "loss": 0.6609, + "step": 925 + }, + { + "epoch": 0.2846603135567169, + "grad_norm": 0.27919891476631165, + "learning_rate": 4.910320785587636e-06, + "loss": 0.6841, + "step": 926 + }, + { + "epoch": 0.28496772210267446, + "grad_norm": 0.2746002674102783, + "learning_rate": 4.910104824757602e-06, + "loss": 0.6606, + "step": 927 + }, + { + "epoch": 0.285275130648632, + "grad_norm": 0.2729363441467285, + "learning_rate": 4.909888608967985e-06, + "loss": 0.6637, + "step": 928 + }, + { + "epoch": 0.2855825391945896, + "grad_norm": 0.26898568868637085, + "learning_rate": 4.909672138241657e-06, + "loss": 0.6303, + "step": 929 + }, + { + "epoch": 0.2858899477405472, + "grad_norm": 0.2774103581905365, + "learning_rate": 4.9094554126015185e-06, + "loss": 0.6739, + "step": 930 + }, + { + "epoch": 0.2861973562865048, + "grad_norm": 0.27155500650405884, + "learning_rate": 4.909238432070495e-06, + "loss": 0.6542, + "step": 931 + }, + { + "epoch": 0.28650476483246234, + "grad_norm": 0.2806902229785919, + "learning_rate": 4.909021196671543e-06, + "loss": 0.6564, + "step": 932 + }, + { + "epoch": 0.2868121733784199, + "grad_norm": 0.2798900306224823, + "learning_rate": 4.908803706427641e-06, + "loss": 0.6571, + "step": 933 + }, + { + "epoch": 0.2871195819243775, + "grad_norm": 0.28008270263671875, + "learning_rate": 4.908585961361797e-06, + "loss": 0.6719, + "step": 934 + }, + { + "epoch": 0.2874269904703351, + "grad_norm": 0.27390480041503906, + "learning_rate": 4.908367961497047e-06, + "loss": 0.6497, + "step": 935 + }, + { + "epoch": 0.28773439901629266, + "grad_norm": 0.2820371091365814, + "learning_rate": 4.908149706856451e-06, + "loss": 0.6475, + "step": 936 + }, + { + "epoch": 0.28804180756225023, + "grad_norm": 0.2790486216545105, + "learning_rate": 4.9079311974631005e-06, + "loss": 0.626, + "step": 937 + }, + { + "epoch": 0.2883492161082078, + "grad_norm": 0.2740943133831024, + "learning_rate": 4.9077124333401084e-06, + "loss": 0.6648, + "step": 938 + }, + { + "epoch": 0.28865662465416536, + "grad_norm": 0.28001904487609863, + "learning_rate": 4.907493414510618e-06, + "loss": 0.6718, + "step": 939 + }, + { + "epoch": 0.288964033200123, + "grad_norm": 0.27583110332489014, + "learning_rate": 4.907274140997799e-06, + "loss": 0.6616, + "step": 940 + }, + { + "epoch": 0.28927144174608055, + "grad_norm": 0.28662559390068054, + "learning_rate": 4.907054612824848e-06, + "loss": 0.6735, + "step": 941 + }, + { + "epoch": 0.2895788502920381, + "grad_norm": 0.2576884925365448, + "learning_rate": 4.906834830014988e-06, + "loss": 0.6522, + "step": 942 + }, + { + "epoch": 0.2898862588379957, + "grad_norm": 0.2676111161708832, + "learning_rate": 4.906614792591471e-06, + "loss": 0.6432, + "step": 943 + }, + { + "epoch": 0.29019366738395325, + "grad_norm": 0.28525060415267944, + "learning_rate": 4.9063945005775715e-06, + "loss": 0.6764, + "step": 944 + }, + { + "epoch": 0.29050107592991087, + "grad_norm": 0.28737494349479675, + "learning_rate": 4.906173953996596e-06, + "loss": 0.6521, + "step": 945 + }, + { + "epoch": 0.29080848447586843, + "grad_norm": 0.2873564064502716, + "learning_rate": 4.905953152871873e-06, + "loss": 0.6531, + "step": 946 + }, + { + "epoch": 0.291115893021826, + "grad_norm": 0.275357186794281, + "learning_rate": 4.905732097226763e-06, + "loss": 0.6531, + "step": 947 + }, + { + "epoch": 0.29142330156778357, + "grad_norm": 0.2822877764701843, + "learning_rate": 4.905510787084651e-06, + "loss": 0.6487, + "step": 948 + }, + { + "epoch": 0.2917307101137412, + "grad_norm": 0.2911280691623688, + "learning_rate": 4.9052892224689474e-06, + "loss": 0.6625, + "step": 949 + }, + { + "epoch": 0.29203811865969875, + "grad_norm": 0.27757546305656433, + "learning_rate": 4.905067403403092e-06, + "loss": 0.6621, + "step": 950 + }, + { + "epoch": 0.2923455272056563, + "grad_norm": 0.28784435987472534, + "learning_rate": 4.904845329910551e-06, + "loss": 0.6711, + "step": 951 + }, + { + "epoch": 0.2926529357516139, + "grad_norm": 0.2947133481502533, + "learning_rate": 4.904623002014814e-06, + "loss": 0.6685, + "step": 952 + }, + { + "epoch": 0.29296034429757145, + "grad_norm": 0.3002372980117798, + "learning_rate": 4.9044004197394055e-06, + "loss": 0.6813, + "step": 953 + }, + { + "epoch": 0.2932677528435291, + "grad_norm": 0.2803349792957306, + "learning_rate": 4.904177583107868e-06, + "loss": 0.6632, + "step": 954 + }, + { + "epoch": 0.29357516138948664, + "grad_norm": 0.27639782428741455, + "learning_rate": 4.903954492143777e-06, + "loss": 0.6387, + "step": 955 + }, + { + "epoch": 0.2938825699354442, + "grad_norm": 0.2816582918167114, + "learning_rate": 4.9037311468707315e-06, + "loss": 0.6515, + "step": 956 + }, + { + "epoch": 0.29418997848140177, + "grad_norm": 0.27421632409095764, + "learning_rate": 4.90350754731236e-06, + "loss": 0.655, + "step": 957 + }, + { + "epoch": 0.29449738702735934, + "grad_norm": 0.28104084730148315, + "learning_rate": 4.903283693492314e-06, + "loss": 0.6603, + "step": 958 + }, + { + "epoch": 0.29480479557331696, + "grad_norm": 0.27812764048576355, + "learning_rate": 4.903059585434278e-06, + "loss": 0.6712, + "step": 959 + }, + { + "epoch": 0.2951122041192745, + "grad_norm": 0.29678329825401306, + "learning_rate": 4.902835223161958e-06, + "loss": 0.6688, + "step": 960 + }, + { + "epoch": 0.2954196126652321, + "grad_norm": 0.276205450296402, + "learning_rate": 4.902610606699089e-06, + "loss": 0.6685, + "step": 961 + }, + { + "epoch": 0.29572702121118966, + "grad_norm": 0.28820374608039856, + "learning_rate": 4.902385736069433e-06, + "loss": 0.6714, + "step": 962 + }, + { + "epoch": 0.2960344297571472, + "grad_norm": 0.279329776763916, + "learning_rate": 4.902160611296778e-06, + "loss": 0.6343, + "step": 963 + }, + { + "epoch": 0.29634183830310484, + "grad_norm": 0.29236891865730286, + "learning_rate": 4.90193523240494e-06, + "loss": 0.6556, + "step": 964 + }, + { + "epoch": 0.2966492468490624, + "grad_norm": 0.274167001247406, + "learning_rate": 4.90170959941776e-06, + "loss": 0.6851, + "step": 965 + }, + { + "epoch": 0.29695665539502, + "grad_norm": 0.2821110188961029, + "learning_rate": 4.901483712359109e-06, + "loss": 0.6592, + "step": 966 + }, + { + "epoch": 0.29726406394097754, + "grad_norm": 0.2815389931201935, + "learning_rate": 4.901257571252882e-06, + "loss": 0.6532, + "step": 967 + }, + { + "epoch": 0.29757147248693516, + "grad_norm": 0.28063249588012695, + "learning_rate": 4.9010311761230025e-06, + "loss": 0.6701, + "step": 968 + }, + { + "epoch": 0.29787888103289273, + "grad_norm": 0.28257957100868225, + "learning_rate": 4.90080452699342e-06, + "loss": 0.6604, + "step": 969 + }, + { + "epoch": 0.2981862895788503, + "grad_norm": 0.28300178050994873, + "learning_rate": 4.900577623888112e-06, + "loss": 0.6519, + "step": 970 + }, + { + "epoch": 0.29849369812480786, + "grad_norm": 0.2848157286643982, + "learning_rate": 4.900350466831081e-06, + "loss": 0.6715, + "step": 971 + }, + { + "epoch": 0.2988011066707654, + "grad_norm": 0.2704631984233856, + "learning_rate": 4.900123055846357e-06, + "loss": 0.6757, + "step": 972 + }, + { + "epoch": 0.29910851521672305, + "grad_norm": 0.2832757532596588, + "learning_rate": 4.899895390957999e-06, + "loss": 0.6502, + "step": 973 + }, + { + "epoch": 0.2994159237626806, + "grad_norm": 0.2866239845752716, + "learning_rate": 4.89966747219009e-06, + "loss": 0.6661, + "step": 974 + }, + { + "epoch": 0.2997233323086382, + "grad_norm": 0.2807655334472656, + "learning_rate": 4.899439299566742e-06, + "loss": 0.657, + "step": 975 + }, + { + "epoch": 0.30003074085459575, + "grad_norm": 0.27409598231315613, + "learning_rate": 4.899210873112092e-06, + "loss": 0.6353, + "step": 976 + }, + { + "epoch": 0.3003381494005533, + "grad_norm": 0.2765432894229889, + "learning_rate": 4.898982192850303e-06, + "loss": 0.6683, + "step": 977 + }, + { + "epoch": 0.30064555794651093, + "grad_norm": 0.2918175160884857, + "learning_rate": 4.89875325880557e-06, + "loss": 0.6405, + "step": 978 + }, + { + "epoch": 0.3009529664924685, + "grad_norm": 0.29269182682037354, + "learning_rate": 4.8985240710021095e-06, + "loss": 0.6643, + "step": 979 + }, + { + "epoch": 0.30126037503842606, + "grad_norm": 0.2813666760921478, + "learning_rate": 4.898294629464167e-06, + "loss": 0.6537, + "step": 980 + }, + { + "epoch": 0.30156778358438363, + "grad_norm": 0.2793412208557129, + "learning_rate": 4.898064934216016e-06, + "loss": 0.6257, + "step": 981 + }, + { + "epoch": 0.3018751921303412, + "grad_norm": 0.27533039450645447, + "learning_rate": 4.897834985281954e-06, + "loss": 0.666, + "step": 982 + }, + { + "epoch": 0.3021826006762988, + "grad_norm": 0.28941893577575684, + "learning_rate": 4.897604782686306e-06, + "loss": 0.6819, + "step": 983 + }, + { + "epoch": 0.3024900092222564, + "grad_norm": 0.2679516077041626, + "learning_rate": 4.897374326453425e-06, + "loss": 0.6602, + "step": 984 + }, + { + "epoch": 0.30279741776821395, + "grad_norm": 0.2729361355304718, + "learning_rate": 4.897143616607693e-06, + "loss": 0.6672, + "step": 985 + }, + { + "epoch": 0.3031048263141715, + "grad_norm": 0.2769686281681061, + "learning_rate": 4.896912653173513e-06, + "loss": 0.6767, + "step": 986 + }, + { + "epoch": 0.30341223486012914, + "grad_norm": 0.27531909942626953, + "learning_rate": 4.89668143617532e-06, + "loss": 0.6854, + "step": 987 + }, + { + "epoch": 0.3037196434060867, + "grad_norm": 0.26944589614868164, + "learning_rate": 4.896449965637573e-06, + "loss": 0.6594, + "step": 988 + }, + { + "epoch": 0.30402705195204427, + "grad_norm": 0.30762001872062683, + "learning_rate": 4.896218241584759e-06, + "loss": 0.6512, + "step": 989 + }, + { + "epoch": 0.30433446049800184, + "grad_norm": 0.2762143015861511, + "learning_rate": 4.8959862640413915e-06, + "loss": 0.6732, + "step": 990 + }, + { + "epoch": 0.3046418690439594, + "grad_norm": 0.28259825706481934, + "learning_rate": 4.895754033032011e-06, + "loss": 0.6602, + "step": 991 + }, + { + "epoch": 0.304949277589917, + "grad_norm": 0.28329309821128845, + "learning_rate": 4.8955215485811855e-06, + "loss": 0.6683, + "step": 992 + }, + { + "epoch": 0.3052566861358746, + "grad_norm": 0.2743130028247833, + "learning_rate": 4.895288810713508e-06, + "loss": 0.6635, + "step": 993 + }, + { + "epoch": 0.30556409468183215, + "grad_norm": 0.27627381682395935, + "learning_rate": 4.895055819453599e-06, + "loss": 0.6687, + "step": 994 + }, + { + "epoch": 0.3058715032277897, + "grad_norm": 0.2859172523021698, + "learning_rate": 4.894822574826107e-06, + "loss": 0.6721, + "step": 995 + }, + { + "epoch": 0.3061789117737473, + "grad_norm": 0.2868649959564209, + "learning_rate": 4.894589076855707e-06, + "loss": 0.6547, + "step": 996 + }, + { + "epoch": 0.3064863203197049, + "grad_norm": 0.2768910825252533, + "learning_rate": 4.8943553255670984e-06, + "loss": 0.6445, + "step": 997 + }, + { + "epoch": 0.3067937288656625, + "grad_norm": 0.2893315255641937, + "learning_rate": 4.89412132098501e-06, + "loss": 0.6849, + "step": 998 + }, + { + "epoch": 0.30710113741162004, + "grad_norm": 0.2802261412143707, + "learning_rate": 4.893887063134197e-06, + "loss": 0.6529, + "step": 999 + }, + { + "epoch": 0.3074085459575776, + "grad_norm": 0.27559301257133484, + "learning_rate": 4.893652552039442e-06, + "loss": 0.6401, + "step": 1000 + }, + { + "epoch": 0.30771595450353517, + "grad_norm": 0.2903192639350891, + "learning_rate": 4.8934177877255505e-06, + "loss": 0.6526, + "step": 1001 + }, + { + "epoch": 0.3080233630494928, + "grad_norm": 0.2764742374420166, + "learning_rate": 4.8931827702173605e-06, + "loss": 0.6579, + "step": 1002 + }, + { + "epoch": 0.30833077159545036, + "grad_norm": 0.278499960899353, + "learning_rate": 4.892947499539732e-06, + "loss": 0.6732, + "step": 1003 + }, + { + "epoch": 0.3086381801414079, + "grad_norm": 0.2841573655605316, + "learning_rate": 4.892711975717556e-06, + "loss": 0.6654, + "step": 1004 + }, + { + "epoch": 0.3089455886873655, + "grad_norm": 0.2738995850086212, + "learning_rate": 4.892476198775745e-06, + "loss": 0.6686, + "step": 1005 + }, + { + "epoch": 0.3092529972333231, + "grad_norm": 0.2731524109840393, + "learning_rate": 4.892240168739244e-06, + "loss": 0.6586, + "step": 1006 + }, + { + "epoch": 0.3095604057792807, + "grad_norm": 0.2740257978439331, + "learning_rate": 4.892003885633021e-06, + "loss": 0.6533, + "step": 1007 + }, + { + "epoch": 0.30986781432523824, + "grad_norm": 0.29074904322624207, + "learning_rate": 4.891767349482071e-06, + "loss": 0.6672, + "step": 1008 + }, + { + "epoch": 0.3101752228711958, + "grad_norm": 0.28397491574287415, + "learning_rate": 4.891530560311419e-06, + "loss": 0.6343, + "step": 1009 + }, + { + "epoch": 0.3104826314171534, + "grad_norm": 0.30214086174964905, + "learning_rate": 4.891293518146112e-06, + "loss": 0.6585, + "step": 1010 + }, + { + "epoch": 0.310790039963111, + "grad_norm": 0.2877248525619507, + "learning_rate": 4.891056223011227e-06, + "loss": 0.6316, + "step": 1011 + }, + { + "epoch": 0.31109744850906856, + "grad_norm": 0.27332234382629395, + "learning_rate": 4.890818674931868e-06, + "loss": 0.6455, + "step": 1012 + }, + { + "epoch": 0.31140485705502613, + "grad_norm": 0.283634215593338, + "learning_rate": 4.8905808739331624e-06, + "loss": 0.6646, + "step": 1013 + }, + { + "epoch": 0.3117122656009837, + "grad_norm": 0.27943384647369385, + "learning_rate": 4.890342820040269e-06, + "loss": 0.6578, + "step": 1014 + }, + { + "epoch": 0.31201967414694126, + "grad_norm": 0.27337193489074707, + "learning_rate": 4.8901045132783695e-06, + "loss": 0.6596, + "step": 1015 + }, + { + "epoch": 0.3123270826928989, + "grad_norm": 0.2924189567565918, + "learning_rate": 4.889865953672675e-06, + "loss": 0.6146, + "step": 1016 + }, + { + "epoch": 0.31263449123885645, + "grad_norm": 0.28520676493644714, + "learning_rate": 4.889627141248421e-06, + "loss": 0.6674, + "step": 1017 + }, + { + "epoch": 0.312941899784814, + "grad_norm": 0.2826230525970459, + "learning_rate": 4.889388076030871e-06, + "loss": 0.6573, + "step": 1018 + }, + { + "epoch": 0.3132493083307716, + "grad_norm": 0.28045687079429626, + "learning_rate": 4.889148758045316e-06, + "loss": 0.6603, + "step": 1019 + }, + { + "epoch": 0.31355671687672915, + "grad_norm": 0.2975037395954132, + "learning_rate": 4.888909187317072e-06, + "loss": 0.6645, + "step": 1020 + }, + { + "epoch": 0.31386412542268677, + "grad_norm": 0.27164244651794434, + "learning_rate": 4.888669363871484e-06, + "loss": 0.6448, + "step": 1021 + }, + { + "epoch": 0.31417153396864433, + "grad_norm": 0.2755047380924225, + "learning_rate": 4.88842928773392e-06, + "loss": 0.6541, + "step": 1022 + }, + { + "epoch": 0.3144789425146019, + "grad_norm": 0.28549090027809143, + "learning_rate": 4.88818895892978e-06, + "loss": 0.6737, + "step": 1023 + }, + { + "epoch": 0.31478635106055947, + "grad_norm": 0.28082481026649475, + "learning_rate": 4.887948377484486e-06, + "loss": 0.6757, + "step": 1024 + }, + { + "epoch": 0.3150937596065171, + "grad_norm": 0.2947121560573578, + "learning_rate": 4.887707543423489e-06, + "loss": 0.6596, + "step": 1025 + }, + { + "epoch": 0.31540116815247465, + "grad_norm": 0.27371206879615784, + "learning_rate": 4.887466456772266e-06, + "loss": 0.6535, + "step": 1026 + }, + { + "epoch": 0.3157085766984322, + "grad_norm": 0.2797688841819763, + "learning_rate": 4.887225117556322e-06, + "loss": 0.6453, + "step": 1027 + }, + { + "epoch": 0.3160159852443898, + "grad_norm": 0.2954927384853363, + "learning_rate": 4.886983525801187e-06, + "loss": 0.633, + "step": 1028 + }, + { + "epoch": 0.31632339379034735, + "grad_norm": 0.28235945105552673, + "learning_rate": 4.886741681532418e-06, + "loss": 0.6471, + "step": 1029 + }, + { + "epoch": 0.316630802336305, + "grad_norm": 0.29598402976989746, + "learning_rate": 4.8864995847756e-06, + "loss": 0.6537, + "step": 1030 + }, + { + "epoch": 0.31693821088226254, + "grad_norm": 0.2893388271331787, + "learning_rate": 4.886257235556343e-06, + "loss": 0.6669, + "step": 1031 + }, + { + "epoch": 0.3172456194282201, + "grad_norm": 0.2917623817920685, + "learning_rate": 4.886014633900286e-06, + "loss": 0.666, + "step": 1032 + }, + { + "epoch": 0.31755302797417767, + "grad_norm": 0.27546045184135437, + "learning_rate": 4.885771779833093e-06, + "loss": 0.6597, + "step": 1033 + }, + { + "epoch": 0.31786043652013524, + "grad_norm": 0.3002658486366272, + "learning_rate": 4.885528673380453e-06, + "loss": 0.644, + "step": 1034 + }, + { + "epoch": 0.31816784506609286, + "grad_norm": 0.2960573136806488, + "learning_rate": 4.885285314568086e-06, + "loss": 0.6582, + "step": 1035 + }, + { + "epoch": 0.3184752536120504, + "grad_norm": 0.27503740787506104, + "learning_rate": 4.885041703421735e-06, + "loss": 0.6695, + "step": 1036 + }, + { + "epoch": 0.318782662158008, + "grad_norm": 0.2778552770614624, + "learning_rate": 4.8847978399671716e-06, + "loss": 0.6559, + "step": 1037 + }, + { + "epoch": 0.31909007070396556, + "grad_norm": 0.3295230269432068, + "learning_rate": 4.884553724230194e-06, + "loss": 0.6607, + "step": 1038 + }, + { + "epoch": 0.3193974792499231, + "grad_norm": 0.297981858253479, + "learning_rate": 4.884309356236626e-06, + "loss": 0.662, + "step": 1039 + }, + { + "epoch": 0.31970488779588074, + "grad_norm": 0.28815123438835144, + "learning_rate": 4.884064736012319e-06, + "loss": 0.6401, + "step": 1040 + }, + { + "epoch": 0.3200122963418383, + "grad_norm": 0.28518587350845337, + "learning_rate": 4.88381986358315e-06, + "loss": 0.6666, + "step": 1041 + }, + { + "epoch": 0.3203197048877959, + "grad_norm": 0.29665154218673706, + "learning_rate": 4.883574738975025e-06, + "loss": 0.6602, + "step": 1042 + }, + { + "epoch": 0.32062711343375344, + "grad_norm": 0.2962576448917389, + "learning_rate": 4.883329362213874e-06, + "loss": 0.6523, + "step": 1043 + }, + { + "epoch": 0.32093452197971106, + "grad_norm": 0.2798975706100464, + "learning_rate": 4.883083733325656e-06, + "loss": 0.6433, + "step": 1044 + }, + { + "epoch": 0.32124193052566863, + "grad_norm": 0.31634512543678284, + "learning_rate": 4.882837852336355e-06, + "loss": 0.6508, + "step": 1045 + }, + { + "epoch": 0.3215493390716262, + "grad_norm": 0.2803241014480591, + "learning_rate": 4.882591719271982e-06, + "loss": 0.6667, + "step": 1046 + }, + { + "epoch": 0.32185674761758376, + "grad_norm": 0.28122246265411377, + "learning_rate": 4.882345334158573e-06, + "loss": 0.645, + "step": 1047 + }, + { + "epoch": 0.3221641561635413, + "grad_norm": 0.30069246888160706, + "learning_rate": 4.8820986970221966e-06, + "loss": 0.6728, + "step": 1048 + }, + { + "epoch": 0.32247156470949895, + "grad_norm": 0.28234201669692993, + "learning_rate": 4.881851807888941e-06, + "loss": 0.6584, + "step": 1049 + }, + { + "epoch": 0.3227789732554565, + "grad_norm": 0.27929025888442993, + "learning_rate": 4.881604666784926e-06, + "loss": 0.6286, + "step": 1050 + }, + { + "epoch": 0.3230863818014141, + "grad_norm": 0.2793880105018616, + "learning_rate": 4.8813572737362934e-06, + "loss": 0.6679, + "step": 1051 + }, + { + "epoch": 0.32339379034737165, + "grad_norm": 0.2879345715045929, + "learning_rate": 4.881109628769216e-06, + "loss": 0.6734, + "step": 1052 + }, + { + "epoch": 0.3237011988933292, + "grad_norm": 0.2773875296115875, + "learning_rate": 4.8808617319098925e-06, + "loss": 0.6514, + "step": 1053 + }, + { + "epoch": 0.32400860743928683, + "grad_norm": 0.2780797779560089, + "learning_rate": 4.880613583184546e-06, + "loss": 0.6259, + "step": 1054 + }, + { + "epoch": 0.3243160159852444, + "grad_norm": 0.2765081822872162, + "learning_rate": 4.880365182619428e-06, + "loss": 0.664, + "step": 1055 + }, + { + "epoch": 0.32462342453120197, + "grad_norm": 0.28214532136917114, + "learning_rate": 4.880116530240815e-06, + "loss": 0.6432, + "step": 1056 + }, + { + "epoch": 0.32493083307715953, + "grad_norm": 0.2676883339881897, + "learning_rate": 4.879867626075015e-06, + "loss": 0.658, + "step": 1057 + }, + { + "epoch": 0.3252382416231171, + "grad_norm": 0.28015467524528503, + "learning_rate": 4.879618470148354e-06, + "loss": 0.6734, + "step": 1058 + }, + { + "epoch": 0.3255456501690747, + "grad_norm": 0.27959010004997253, + "learning_rate": 4.879369062487194e-06, + "loss": 0.6529, + "step": 1059 + }, + { + "epoch": 0.3258530587150323, + "grad_norm": 0.29613688588142395, + "learning_rate": 4.879119403117917e-06, + "loss": 0.6627, + "step": 1060 + }, + { + "epoch": 0.32616046726098985, + "grad_norm": 0.2888384163379669, + "learning_rate": 4.878869492066934e-06, + "loss": 0.66, + "step": 1061 + }, + { + "epoch": 0.3264678758069474, + "grad_norm": 0.28243085741996765, + "learning_rate": 4.878619329360684e-06, + "loss": 0.6667, + "step": 1062 + }, + { + "epoch": 0.32677528435290504, + "grad_norm": 0.2777983546257019, + "learning_rate": 4.87836891502563e-06, + "loss": 0.6242, + "step": 1063 + }, + { + "epoch": 0.3270826928988626, + "grad_norm": 0.28589585423469543, + "learning_rate": 4.878118249088262e-06, + "loss": 0.632, + "step": 1064 + }, + { + "epoch": 0.32739010144482017, + "grad_norm": 0.280431866645813, + "learning_rate": 4.877867331575099e-06, + "loss": 0.6428, + "step": 1065 + }, + { + "epoch": 0.32769750999077774, + "grad_norm": 0.28930264711380005, + "learning_rate": 4.877616162512684e-06, + "loss": 0.664, + "step": 1066 + }, + { + "epoch": 0.3280049185367353, + "grad_norm": 0.28977516293525696, + "learning_rate": 4.8773647419275875e-06, + "loss": 0.6607, + "step": 1067 + }, + { + "epoch": 0.3283123270826929, + "grad_norm": 0.292520135641098, + "learning_rate": 4.877113069846407e-06, + "loss": 0.681, + "step": 1068 + }, + { + "epoch": 0.3286197356286505, + "grad_norm": 0.292205810546875, + "learning_rate": 4.876861146295767e-06, + "loss": 0.6613, + "step": 1069 + }, + { + "epoch": 0.32892714417460805, + "grad_norm": 0.268900603055954, + "learning_rate": 4.8766089713023176e-06, + "loss": 0.6401, + "step": 1070 + }, + { + "epoch": 0.3292345527205656, + "grad_norm": 0.2794860303401947, + "learning_rate": 4.876356544892735e-06, + "loss": 0.6771, + "step": 1071 + }, + { + "epoch": 0.3295419612665232, + "grad_norm": 0.26819857954978943, + "learning_rate": 4.8761038670937246e-06, + "loss": 0.6421, + "step": 1072 + }, + { + "epoch": 0.3298493698124808, + "grad_norm": 0.30564814805984497, + "learning_rate": 4.875850937932014e-06, + "loss": 0.6419, + "step": 1073 + }, + { + "epoch": 0.3301567783584384, + "grad_norm": 0.27707114815711975, + "learning_rate": 4.8755977574343625e-06, + "loss": 0.6553, + "step": 1074 + }, + { + "epoch": 0.33046418690439594, + "grad_norm": 0.28109315037727356, + "learning_rate": 4.875344325627551e-06, + "loss": 0.6657, + "step": 1075 + }, + { + "epoch": 0.3307715954503535, + "grad_norm": 0.2874222695827484, + "learning_rate": 4.875090642538393e-06, + "loss": 0.64, + "step": 1076 + }, + { + "epoch": 0.33107900399631107, + "grad_norm": 0.3160046339035034, + "learning_rate": 4.874836708193721e-06, + "loss": 0.6425, + "step": 1077 + }, + { + "epoch": 0.3313864125422687, + "grad_norm": 0.29330354928970337, + "learning_rate": 4.8745825226204015e-06, + "loss": 0.658, + "step": 1078 + }, + { + "epoch": 0.33169382108822626, + "grad_norm": 0.2973116934299469, + "learning_rate": 4.874328085845322e-06, + "loss": 0.627, + "step": 1079 + }, + { + "epoch": 0.3320012296341838, + "grad_norm": 0.2896367311477661, + "learning_rate": 4.8740733978954e-06, + "loss": 0.6631, + "step": 1080 + }, + { + "epoch": 0.3323086381801414, + "grad_norm": 0.3064299523830414, + "learning_rate": 4.873818458797578e-06, + "loss": 0.682, + "step": 1081 + }, + { + "epoch": 0.332616046726099, + "grad_norm": 0.2802063226699829, + "learning_rate": 4.873563268578825e-06, + "loss": 0.6758, + "step": 1082 + }, + { + "epoch": 0.3329234552720566, + "grad_norm": 0.29365620017051697, + "learning_rate": 4.873307827266138e-06, + "loss": 0.658, + "step": 1083 + }, + { + "epoch": 0.33323086381801414, + "grad_norm": 0.2806922495365143, + "learning_rate": 4.873052134886539e-06, + "loss": 0.6713, + "step": 1084 + }, + { + "epoch": 0.3335382723639717, + "grad_norm": 0.2819922864437103, + "learning_rate": 4.8727961914670764e-06, + "loss": 0.656, + "step": 1085 + }, + { + "epoch": 0.3338456809099293, + "grad_norm": 0.28363433480262756, + "learning_rate": 4.872539997034827e-06, + "loss": 0.6562, + "step": 1086 + }, + { + "epoch": 0.3341530894558869, + "grad_norm": 0.28179118037223816, + "learning_rate": 4.872283551616893e-06, + "loss": 0.6569, + "step": 1087 + }, + { + "epoch": 0.33446049800184446, + "grad_norm": 0.2784107029438019, + "learning_rate": 4.872026855240403e-06, + "loss": 0.675, + "step": 1088 + }, + { + "epoch": 0.33476790654780203, + "grad_norm": 0.29847031831741333, + "learning_rate": 4.871769907932512e-06, + "loss": 0.6502, + "step": 1089 + }, + { + "epoch": 0.3350753150937596, + "grad_norm": 0.2799206078052521, + "learning_rate": 4.871512709720402e-06, + "loss": 0.6359, + "step": 1090 + }, + { + "epoch": 0.33538272363971716, + "grad_norm": 0.2813762426376343, + "learning_rate": 4.871255260631281e-06, + "loss": 0.6728, + "step": 1091 + }, + { + "epoch": 0.3356901321856748, + "grad_norm": 0.3038489818572998, + "learning_rate": 4.870997560692385e-06, + "loss": 0.638, + "step": 1092 + }, + { + "epoch": 0.33599754073163235, + "grad_norm": 0.2957247495651245, + "learning_rate": 4.8707396099309746e-06, + "loss": 0.6512, + "step": 1093 + }, + { + "epoch": 0.3363049492775899, + "grad_norm": 0.285360723733902, + "learning_rate": 4.870481408374338e-06, + "loss": 0.636, + "step": 1094 + }, + { + "epoch": 0.3366123578235475, + "grad_norm": 0.2844794690608978, + "learning_rate": 4.8702229560497905e-06, + "loss": 0.6689, + "step": 1095 + }, + { + "epoch": 0.33691976636950505, + "grad_norm": 0.2810419499874115, + "learning_rate": 4.869964252984673e-06, + "loss": 0.6355, + "step": 1096 + }, + { + "epoch": 0.33722717491546267, + "grad_norm": 0.2817031145095825, + "learning_rate": 4.869705299206352e-06, + "loss": 0.6443, + "step": 1097 + }, + { + "epoch": 0.33753458346142023, + "grad_norm": 0.28217530250549316, + "learning_rate": 4.869446094742222e-06, + "loss": 0.6568, + "step": 1098 + }, + { + "epoch": 0.3378419920073778, + "grad_norm": 0.2924758195877075, + "learning_rate": 4.869186639619705e-06, + "loss": 0.6792, + "step": 1099 + }, + { + "epoch": 0.33814940055333537, + "grad_norm": 0.2887811064720154, + "learning_rate": 4.8689269338662465e-06, + "loss": 0.6562, + "step": 1100 + }, + { + "epoch": 0.338456809099293, + "grad_norm": 0.2827998995780945, + "learning_rate": 4.8686669775093205e-06, + "loss": 0.6594, + "step": 1101 + }, + { + "epoch": 0.33876421764525055, + "grad_norm": 0.280564546585083, + "learning_rate": 4.868406770576428e-06, + "loss": 0.6239, + "step": 1102 + }, + { + "epoch": 0.3390716261912081, + "grad_norm": 0.28526246547698975, + "learning_rate": 4.868146313095096e-06, + "loss": 0.6475, + "step": 1103 + }, + { + "epoch": 0.3393790347371657, + "grad_norm": 0.29037556052207947, + "learning_rate": 4.867885605092877e-06, + "loss": 0.6354, + "step": 1104 + }, + { + "epoch": 0.33968644328312325, + "grad_norm": 0.2768741846084595, + "learning_rate": 4.86762464659735e-06, + "loss": 0.6602, + "step": 1105 + }, + { + "epoch": 0.3399938518290809, + "grad_norm": 0.28261837363243103, + "learning_rate": 4.867363437636122e-06, + "loss": 0.641, + "step": 1106 + }, + { + "epoch": 0.34030126037503844, + "grad_norm": 0.29442232847213745, + "learning_rate": 4.867101978236826e-06, + "loss": 0.6622, + "step": 1107 + }, + { + "epoch": 0.340608668920996, + "grad_norm": 0.28028449416160583, + "learning_rate": 4.866840268427121e-06, + "loss": 0.6729, + "step": 1108 + }, + { + "epoch": 0.34091607746695357, + "grad_norm": 0.2927473485469818, + "learning_rate": 4.8665783082346914e-06, + "loss": 0.6544, + "step": 1109 + }, + { + "epoch": 0.34122348601291114, + "grad_norm": 0.2763660252094269, + "learning_rate": 4.866316097687253e-06, + "loss": 0.6624, + "step": 1110 + }, + { + "epoch": 0.34153089455886876, + "grad_norm": 0.2794618010520935, + "learning_rate": 4.86605363681254e-06, + "loss": 0.6534, + "step": 1111 + }, + { + "epoch": 0.3418383031048263, + "grad_norm": 0.2877190411090851, + "learning_rate": 4.86579092563832e-06, + "loss": 0.6529, + "step": 1112 + }, + { + "epoch": 0.3421457116507839, + "grad_norm": 0.29772084951400757, + "learning_rate": 4.865527964192384e-06, + "loss": 0.6668, + "step": 1113 + }, + { + "epoch": 0.34245312019674146, + "grad_norm": 0.28618401288986206, + "learning_rate": 4.86526475250255e-06, + "loss": 0.644, + "step": 1114 + }, + { + "epoch": 0.342760528742699, + "grad_norm": 0.28646034002304077, + "learning_rate": 4.865001290596664e-06, + "loss": 0.644, + "step": 1115 + }, + { + "epoch": 0.34306793728865664, + "grad_norm": 0.2772636115550995, + "learning_rate": 4.8647375785025945e-06, + "loss": 0.6294, + "step": 1116 + }, + { + "epoch": 0.3433753458346142, + "grad_norm": 0.28783392906188965, + "learning_rate": 4.8644736162482406e-06, + "loss": 0.6406, + "step": 1117 + }, + { + "epoch": 0.3436827543805718, + "grad_norm": 0.2826582193374634, + "learning_rate": 4.864209403861528e-06, + "loss": 0.6508, + "step": 1118 + }, + { + "epoch": 0.34399016292652934, + "grad_norm": 0.2899433970451355, + "learning_rate": 4.863944941370403e-06, + "loss": 0.6766, + "step": 1119 + }, + { + "epoch": 0.3442975714724869, + "grad_norm": 0.2816702425479889, + "learning_rate": 4.863680228802845e-06, + "loss": 0.6535, + "step": 1120 + }, + { + "epoch": 0.34460498001844453, + "grad_norm": 0.2852102518081665, + "learning_rate": 4.863415266186858e-06, + "loss": 0.6374, + "step": 1121 + }, + { + "epoch": 0.3449123885644021, + "grad_norm": 0.2898584306240082, + "learning_rate": 4.863150053550471e-06, + "loss": 0.6494, + "step": 1122 + }, + { + "epoch": 0.34521979711035966, + "grad_norm": 0.2763925790786743, + "learning_rate": 4.862884590921739e-06, + "loss": 0.6697, + "step": 1123 + }, + { + "epoch": 0.3455272056563172, + "grad_norm": 0.29952865839004517, + "learning_rate": 4.862618878328746e-06, + "loss": 0.6542, + "step": 1124 + }, + { + "epoch": 0.34583461420227485, + "grad_norm": 0.27197974920272827, + "learning_rate": 4.862352915799602e-06, + "loss": 0.6532, + "step": 1125 + }, + { + "epoch": 0.3461420227482324, + "grad_norm": 0.2819601893424988, + "learning_rate": 4.8620867033624405e-06, + "loss": 0.6448, + "step": 1126 + }, + { + "epoch": 0.34644943129419, + "grad_norm": 0.2917955815792084, + "learning_rate": 4.8618202410454245e-06, + "loss": 0.6584, + "step": 1127 + }, + { + "epoch": 0.34675683984014755, + "grad_norm": 0.28686511516571045, + "learning_rate": 4.861553528876743e-06, + "loss": 0.6472, + "step": 1128 + }, + { + "epoch": 0.3470642483861051, + "grad_norm": 0.29280564188957214, + "learning_rate": 4.861286566884611e-06, + "loss": 0.6799, + "step": 1129 + }, + { + "epoch": 0.34737165693206273, + "grad_norm": 0.2900910973548889, + "learning_rate": 4.861019355097267e-06, + "loss": 0.6762, + "step": 1130 + }, + { + "epoch": 0.3476790654780203, + "grad_norm": 0.26742270588874817, + "learning_rate": 4.860751893542983e-06, + "loss": 0.6644, + "step": 1131 + }, + { + "epoch": 0.34798647402397787, + "grad_norm": 0.28952351212501526, + "learning_rate": 4.86048418225005e-06, + "loss": 0.6353, + "step": 1132 + }, + { + "epoch": 0.34829388256993543, + "grad_norm": 0.2784757912158966, + "learning_rate": 4.860216221246791e-06, + "loss": 0.6334, + "step": 1133 + }, + { + "epoch": 0.348601291115893, + "grad_norm": 0.29550811648368835, + "learning_rate": 4.859948010561551e-06, + "loss": 0.6524, + "step": 1134 + }, + { + "epoch": 0.3489086996618506, + "grad_norm": 0.28642791509628296, + "learning_rate": 4.859679550222703e-06, + "loss": 0.6522, + "step": 1135 + }, + { + "epoch": 0.3492161082078082, + "grad_norm": 0.3003406226634979, + "learning_rate": 4.859410840258648e-06, + "loss": 0.6341, + "step": 1136 + }, + { + "epoch": 0.34952351675376575, + "grad_norm": 0.2866849899291992, + "learning_rate": 4.859141880697813e-06, + "loss": 0.6376, + "step": 1137 + }, + { + "epoch": 0.3498309252997233, + "grad_norm": 0.301612913608551, + "learning_rate": 4.8588726715686495e-06, + "loss": 0.6342, + "step": 1138 + }, + { + "epoch": 0.3501383338456809, + "grad_norm": 0.2917943000793457, + "learning_rate": 4.858603212899637e-06, + "loss": 0.6499, + "step": 1139 + }, + { + "epoch": 0.3504457423916385, + "grad_norm": 0.2799971401691437, + "learning_rate": 4.85833350471928e-06, + "loss": 0.6589, + "step": 1140 + }, + { + "epoch": 0.35075315093759607, + "grad_norm": 0.2874235212802887, + "learning_rate": 4.858063547056112e-06, + "loss": 0.6462, + "step": 1141 + }, + { + "epoch": 0.35106055948355364, + "grad_norm": 0.28999242186546326, + "learning_rate": 4.8577933399386895e-06, + "loss": 0.6521, + "step": 1142 + }, + { + "epoch": 0.3513679680295112, + "grad_norm": 0.2903909385204315, + "learning_rate": 4.857522883395598e-06, + "loss": 0.6544, + "step": 1143 + }, + { + "epoch": 0.3516753765754688, + "grad_norm": 0.2758408784866333, + "learning_rate": 4.8572521774554485e-06, + "loss": 0.6483, + "step": 1144 + }, + { + "epoch": 0.3519827851214264, + "grad_norm": 0.29137951135635376, + "learning_rate": 4.856981222146877e-06, + "loss": 0.672, + "step": 1145 + }, + { + "epoch": 0.35229019366738396, + "grad_norm": 0.30551037192344666, + "learning_rate": 4.856710017498551e-06, + "loss": 0.66, + "step": 1146 + }, + { + "epoch": 0.3525976022133415, + "grad_norm": 0.2800922393798828, + "learning_rate": 4.856438563539155e-06, + "loss": 0.6537, + "step": 1147 + }, + { + "epoch": 0.3529050107592991, + "grad_norm": 0.29091954231262207, + "learning_rate": 4.856166860297411e-06, + "loss": 0.6523, + "step": 1148 + }, + { + "epoch": 0.3532124193052567, + "grad_norm": 0.30311766266822815, + "learning_rate": 4.855894907802059e-06, + "loss": 0.6424, + "step": 1149 + }, + { + "epoch": 0.3535198278512143, + "grad_norm": 0.29623645544052124, + "learning_rate": 4.855622706081867e-06, + "loss": 0.6565, + "step": 1150 + }, + { + "epoch": 0.35382723639717184, + "grad_norm": 0.2909236550331116, + "learning_rate": 4.855350255165634e-06, + "loss": 0.6553, + "step": 1151 + }, + { + "epoch": 0.3541346449431294, + "grad_norm": 0.28737714886665344, + "learning_rate": 4.85507755508218e-06, + "loss": 0.6483, + "step": 1152 + }, + { + "epoch": 0.354442053489087, + "grad_norm": 0.2926008701324463, + "learning_rate": 4.8548046058603536e-06, + "loss": 0.648, + "step": 1153 + }, + { + "epoch": 0.3547494620350446, + "grad_norm": 0.3053188621997833, + "learning_rate": 4.85453140752903e-06, + "loss": 0.6641, + "step": 1154 + }, + { + "epoch": 0.35505687058100216, + "grad_norm": 0.2944413721561432, + "learning_rate": 4.854257960117108e-06, + "loss": 0.6797, + "step": 1155 + }, + { + "epoch": 0.3553642791269597, + "grad_norm": 0.3149133324623108, + "learning_rate": 4.853984263653519e-06, + "loss": 0.6325, + "step": 1156 + }, + { + "epoch": 0.3556716876729173, + "grad_norm": 0.29788222908973694, + "learning_rate": 4.853710318167213e-06, + "loss": 0.6693, + "step": 1157 + }, + { + "epoch": 0.35597909621887486, + "grad_norm": 0.3314996361732483, + "learning_rate": 4.853436123687172e-06, + "loss": 0.6772, + "step": 1158 + }, + { + "epoch": 0.3562865047648325, + "grad_norm": 0.28273630142211914, + "learning_rate": 4.853161680242402e-06, + "loss": 0.6466, + "step": 1159 + }, + { + "epoch": 0.35659391331079004, + "grad_norm": 0.2757919132709503, + "learning_rate": 4.852886987861935e-06, + "loss": 0.6207, + "step": 1160 + }, + { + "epoch": 0.3569013218567476, + "grad_norm": 0.2972654104232788, + "learning_rate": 4.852612046574832e-06, + "loss": 0.6611, + "step": 1161 + }, + { + "epoch": 0.3572087304027052, + "grad_norm": 0.2969825863838196, + "learning_rate": 4.852336856410176e-06, + "loss": 0.6544, + "step": 1162 + }, + { + "epoch": 0.3575161389486628, + "grad_norm": 0.28127217292785645, + "learning_rate": 4.85206141739708e-06, + "loss": 0.6567, + "step": 1163 + }, + { + "epoch": 0.35782354749462036, + "grad_norm": 0.2802630662918091, + "learning_rate": 4.851785729564683e-06, + "loss": 0.6114, + "step": 1164 + }, + { + "epoch": 0.35813095604057793, + "grad_norm": 0.2864419221878052, + "learning_rate": 4.851509792942149e-06, + "loss": 0.6783, + "step": 1165 + }, + { + "epoch": 0.3584383645865355, + "grad_norm": 0.2825075089931488, + "learning_rate": 4.851233607558666e-06, + "loss": 0.6342, + "step": 1166 + }, + { + "epoch": 0.35874577313249306, + "grad_norm": 0.2917403280735016, + "learning_rate": 4.850957173443455e-06, + "loss": 0.6345, + "step": 1167 + }, + { + "epoch": 0.3590531816784507, + "grad_norm": 0.2781220078468323, + "learning_rate": 4.850680490625757e-06, + "loss": 0.6255, + "step": 1168 + }, + { + "epoch": 0.35936059022440825, + "grad_norm": 0.29117605090141296, + "learning_rate": 4.850403559134842e-06, + "loss": 0.6625, + "step": 1169 + }, + { + "epoch": 0.3596679987703658, + "grad_norm": 0.2950398921966553, + "learning_rate": 4.850126379000006e-06, + "loss": 0.665, + "step": 1170 + }, + { + "epoch": 0.3599754073163234, + "grad_norm": 0.2927609980106354, + "learning_rate": 4.849848950250572e-06, + "loss": 0.6243, + "step": 1171 + }, + { + "epoch": 0.36028281586228095, + "grad_norm": 0.2933370769023895, + "learning_rate": 4.849571272915888e-06, + "loss": 0.6739, + "step": 1172 + }, + { + "epoch": 0.36059022440823857, + "grad_norm": 0.2940187454223633, + "learning_rate": 4.849293347025329e-06, + "loss": 0.6501, + "step": 1173 + }, + { + "epoch": 0.36089763295419613, + "grad_norm": 0.2717529535293579, + "learning_rate": 4.849015172608296e-06, + "loss": 0.6578, + "step": 1174 + }, + { + "epoch": 0.3612050415001537, + "grad_norm": 0.2939319610595703, + "learning_rate": 4.848736749694216e-06, + "loss": 0.6502, + "step": 1175 + }, + { + "epoch": 0.36151245004611127, + "grad_norm": 0.2774084806442261, + "learning_rate": 4.848458078312543e-06, + "loss": 0.6349, + "step": 1176 + }, + { + "epoch": 0.36181985859206883, + "grad_norm": 0.29062098264694214, + "learning_rate": 4.848179158492758e-06, + "loss": 0.6461, + "step": 1177 + }, + { + "epoch": 0.36212726713802645, + "grad_norm": 0.2775641679763794, + "learning_rate": 4.847899990264366e-06, + "loss": 0.6549, + "step": 1178 + }, + { + "epoch": 0.362434675683984, + "grad_norm": 0.293854683637619, + "learning_rate": 4.8476205736569e-06, + "loss": 0.6618, + "step": 1179 + }, + { + "epoch": 0.3627420842299416, + "grad_norm": 0.2859247624874115, + "learning_rate": 4.847340908699919e-06, + "loss": 0.6815, + "step": 1180 + }, + { + "epoch": 0.36304949277589915, + "grad_norm": 0.28764423727989197, + "learning_rate": 4.847060995423009e-06, + "loss": 0.6337, + "step": 1181 + }, + { + "epoch": 0.3633569013218568, + "grad_norm": 0.2956876754760742, + "learning_rate": 4.8467808338557796e-06, + "loss": 0.6633, + "step": 1182 + }, + { + "epoch": 0.36366430986781434, + "grad_norm": 0.3008383810520172, + "learning_rate": 4.846500424027869e-06, + "loss": 0.6645, + "step": 1183 + }, + { + "epoch": 0.3639717184137719, + "grad_norm": 0.2965761125087738, + "learning_rate": 4.846219765968943e-06, + "loss": 0.6399, + "step": 1184 + }, + { + "epoch": 0.36427912695972947, + "grad_norm": 0.2863546311855316, + "learning_rate": 4.845938859708689e-06, + "loss": 0.6603, + "step": 1185 + }, + { + "epoch": 0.36458653550568704, + "grad_norm": 0.2940642237663269, + "learning_rate": 4.845657705276824e-06, + "loss": 0.6582, + "step": 1186 + }, + { + "epoch": 0.36489394405164466, + "grad_norm": 0.28825128078460693, + "learning_rate": 4.8453763027030924e-06, + "loss": 0.6515, + "step": 1187 + }, + { + "epoch": 0.3652013525976022, + "grad_norm": 0.3103175461292267, + "learning_rate": 4.845094652017263e-06, + "loss": 0.6803, + "step": 1188 + }, + { + "epoch": 0.3655087611435598, + "grad_norm": 0.2962205111980438, + "learning_rate": 4.844812753249128e-06, + "loss": 0.6622, + "step": 1189 + }, + { + "epoch": 0.36581616968951736, + "grad_norm": 0.29945769906044006, + "learning_rate": 4.8445306064285125e-06, + "loss": 0.635, + "step": 1190 + }, + { + "epoch": 0.3661235782354749, + "grad_norm": 0.28851333260536194, + "learning_rate": 4.844248211585263e-06, + "loss": 0.6485, + "step": 1191 + }, + { + "epoch": 0.36643098678143254, + "grad_norm": 0.30816417932510376, + "learning_rate": 4.8439655687492525e-06, + "loss": 0.6513, + "step": 1192 + }, + { + "epoch": 0.3667383953273901, + "grad_norm": 0.3089192807674408, + "learning_rate": 4.843682677950381e-06, + "loss": 0.6381, + "step": 1193 + }, + { + "epoch": 0.3670458038733477, + "grad_norm": 0.29314282536506653, + "learning_rate": 4.843399539218576e-06, + "loss": 0.6427, + "step": 1194 + }, + { + "epoch": 0.36735321241930524, + "grad_norm": 0.28689178824424744, + "learning_rate": 4.843116152583791e-06, + "loss": 0.6565, + "step": 1195 + }, + { + "epoch": 0.3676606209652628, + "grad_norm": 0.2941093444824219, + "learning_rate": 4.842832518076002e-06, + "loss": 0.6454, + "step": 1196 + }, + { + "epoch": 0.36796802951122043, + "grad_norm": 0.2931382656097412, + "learning_rate": 4.842548635725216e-06, + "loss": 0.6723, + "step": 1197 + }, + { + "epoch": 0.368275438057178, + "grad_norm": 0.29798975586891174, + "learning_rate": 4.842264505561464e-06, + "loss": 0.6286, + "step": 1198 + }, + { + "epoch": 0.36858284660313556, + "grad_norm": 0.2882380783557892, + "learning_rate": 4.841980127614804e-06, + "loss": 0.6754, + "step": 1199 + }, + { + "epoch": 0.3688902551490931, + "grad_norm": 0.295621395111084, + "learning_rate": 4.8416955019153185e-06, + "loss": 0.6522, + "step": 1200 + }, + { + "epoch": 0.36919766369505075, + "grad_norm": 0.28226956725120544, + "learning_rate": 4.841410628493118e-06, + "loss": 0.65, + "step": 1201 + }, + { + "epoch": 0.3695050722410083, + "grad_norm": 0.2999158501625061, + "learning_rate": 4.841125507378338e-06, + "loss": 0.6747, + "step": 1202 + }, + { + "epoch": 0.3698124807869659, + "grad_norm": 0.30476444959640503, + "learning_rate": 4.840840138601143e-06, + "loss": 0.6561, + "step": 1203 + }, + { + "epoch": 0.37011988933292345, + "grad_norm": 0.28192731738090515, + "learning_rate": 4.840554522191719e-06, + "loss": 0.6344, + "step": 1204 + }, + { + "epoch": 0.370427297878881, + "grad_norm": 0.2815313935279846, + "learning_rate": 4.840268658180281e-06, + "loss": 0.6376, + "step": 1205 + }, + { + "epoch": 0.37073470642483863, + "grad_norm": 0.29476284980773926, + "learning_rate": 4.839982546597072e-06, + "loss": 0.6285, + "step": 1206 + }, + { + "epoch": 0.3710421149707962, + "grad_norm": 0.28575247526168823, + "learning_rate": 4.839696187472357e-06, + "loss": 0.6572, + "step": 1207 + }, + { + "epoch": 0.37134952351675377, + "grad_norm": 0.29342466592788696, + "learning_rate": 4.839409580836431e-06, + "loss": 0.6409, + "step": 1208 + }, + { + "epoch": 0.37165693206271133, + "grad_norm": 0.29101651906967163, + "learning_rate": 4.839122726719611e-06, + "loss": 0.6449, + "step": 1209 + }, + { + "epoch": 0.3719643406086689, + "grad_norm": 0.26941409707069397, + "learning_rate": 4.838835625152244e-06, + "loss": 0.6376, + "step": 1210 + }, + { + "epoch": 0.3722717491546265, + "grad_norm": 0.2749631404876709, + "learning_rate": 4.838548276164703e-06, + "loss": 0.6556, + "step": 1211 + }, + { + "epoch": 0.3725791577005841, + "grad_norm": 0.27997729182243347, + "learning_rate": 4.838260679787385e-06, + "loss": 0.643, + "step": 1212 + }, + { + "epoch": 0.37288656624654165, + "grad_norm": 0.28735148906707764, + "learning_rate": 4.837972836050714e-06, + "loss": 0.6467, + "step": 1213 + }, + { + "epoch": 0.3731939747924992, + "grad_norm": 0.2802216410636902, + "learning_rate": 4.83768474498514e-06, + "loss": 0.6467, + "step": 1214 + }, + { + "epoch": 0.3735013833384568, + "grad_norm": 0.2816302180290222, + "learning_rate": 4.83739640662114e-06, + "loss": 0.6531, + "step": 1215 + }, + { + "epoch": 0.3738087918844144, + "grad_norm": 0.2900158166885376, + "learning_rate": 4.8371078209892166e-06, + "loss": 0.6583, + "step": 1216 + }, + { + "epoch": 0.37411620043037197, + "grad_norm": 0.2846461534500122, + "learning_rate": 4.836818988119899e-06, + "loss": 0.6516, + "step": 1217 + }, + { + "epoch": 0.37442360897632954, + "grad_norm": 0.27678653597831726, + "learning_rate": 4.836529908043742e-06, + "loss": 0.6542, + "step": 1218 + }, + { + "epoch": 0.3747310175222871, + "grad_norm": 0.2751465141773224, + "learning_rate": 4.836240580791327e-06, + "loss": 0.665, + "step": 1219 + }, + { + "epoch": 0.3750384260682447, + "grad_norm": 0.2741398215293884, + "learning_rate": 4.83595100639326e-06, + "loss": 0.6482, + "step": 1220 + }, + { + "epoch": 0.3753458346142023, + "grad_norm": 0.27430182695388794, + "learning_rate": 4.835661184880176e-06, + "loss": 0.6376, + "step": 1221 + }, + { + "epoch": 0.37565324316015986, + "grad_norm": 0.29915979504585266, + "learning_rate": 4.835371116282733e-06, + "loss": 0.6399, + "step": 1222 + }, + { + "epoch": 0.3759606517061174, + "grad_norm": 0.27466264367103577, + "learning_rate": 4.835080800631618e-06, + "loss": 0.6672, + "step": 1223 + }, + { + "epoch": 0.376268060252075, + "grad_norm": 0.3107733726501465, + "learning_rate": 4.834790237957543e-06, + "loss": 0.6172, + "step": 1224 + }, + { + "epoch": 0.3765754687980326, + "grad_norm": 0.281197726726532, + "learning_rate": 4.834499428291245e-06, + "loss": 0.6315, + "step": 1225 + }, + { + "epoch": 0.3768828773439902, + "grad_norm": 0.291128009557724, + "learning_rate": 4.834208371663488e-06, + "loss": 0.6581, + "step": 1226 + }, + { + "epoch": 0.37719028588994774, + "grad_norm": 0.28337034583091736, + "learning_rate": 4.833917068105063e-06, + "loss": 0.6381, + "step": 1227 + }, + { + "epoch": 0.3774976944359053, + "grad_norm": 0.293664813041687, + "learning_rate": 4.833625517646786e-06, + "loss": 0.669, + "step": 1228 + }, + { + "epoch": 0.3778051029818629, + "grad_norm": 0.2929069697856903, + "learning_rate": 4.8333337203195e-06, + "loss": 0.6362, + "step": 1229 + }, + { + "epoch": 0.3781125115278205, + "grad_norm": 0.2923349142074585, + "learning_rate": 4.833041676154073e-06, + "loss": 0.6656, + "step": 1230 + }, + { + "epoch": 0.37841992007377806, + "grad_norm": 0.2853974401950836, + "learning_rate": 4.8327493851814e-06, + "loss": 0.6546, + "step": 1231 + }, + { + "epoch": 0.3787273286197356, + "grad_norm": 0.2901977598667145, + "learning_rate": 4.832456847432401e-06, + "loss": 0.6365, + "step": 1232 + }, + { + "epoch": 0.3790347371656932, + "grad_norm": 0.2888997197151184, + "learning_rate": 4.832164062938024e-06, + "loss": 0.6359, + "step": 1233 + }, + { + "epoch": 0.37934214571165076, + "grad_norm": 0.2854214310646057, + "learning_rate": 4.831871031729242e-06, + "loss": 0.6442, + "step": 1234 + }, + { + "epoch": 0.3796495542576084, + "grad_norm": 0.28577283024787903, + "learning_rate": 4.831577753837052e-06, + "loss": 0.6537, + "step": 1235 + }, + { + "epoch": 0.37995696280356595, + "grad_norm": 0.29881423711776733, + "learning_rate": 4.831284229292482e-06, + "loss": 0.6453, + "step": 1236 + }, + { + "epoch": 0.3802643713495235, + "grad_norm": 0.2894684374332428, + "learning_rate": 4.830990458126583e-06, + "loss": 0.6481, + "step": 1237 + }, + { + "epoch": 0.3805717798954811, + "grad_norm": 0.2969074547290802, + "learning_rate": 4.83069644037043e-06, + "loss": 0.6344, + "step": 1238 + }, + { + "epoch": 0.3808791884414387, + "grad_norm": 0.2987270653247833, + "learning_rate": 4.830402176055129e-06, + "loss": 0.6277, + "step": 1239 + }, + { + "epoch": 0.38118659698739626, + "grad_norm": 0.28988513350486755, + "learning_rate": 4.830107665211808e-06, + "loss": 0.6411, + "step": 1240 + }, + { + "epoch": 0.38149400553335383, + "grad_norm": 0.29726454615592957, + "learning_rate": 4.829812907871624e-06, + "loss": 0.6308, + "step": 1241 + }, + { + "epoch": 0.3818014140793114, + "grad_norm": 0.2749503552913666, + "learning_rate": 4.829517904065758e-06, + "loss": 0.6341, + "step": 1242 + }, + { + "epoch": 0.38210882262526896, + "grad_norm": 0.28571709990501404, + "learning_rate": 4.829222653825417e-06, + "loss": 0.6701, + "step": 1243 + }, + { + "epoch": 0.3824162311712266, + "grad_norm": 0.28376710414886475, + "learning_rate": 4.8289271571818366e-06, + "loss": 0.6477, + "step": 1244 + }, + { + "epoch": 0.38272363971718415, + "grad_norm": 0.28571224212646484, + "learning_rate": 4.828631414166275e-06, + "loss": 0.6488, + "step": 1245 + }, + { + "epoch": 0.3830310482631417, + "grad_norm": 0.2944090664386749, + "learning_rate": 4.82833542481002e-06, + "loss": 0.6624, + "step": 1246 + }, + { + "epoch": 0.3833384568090993, + "grad_norm": 0.28510746359825134, + "learning_rate": 4.828039189144381e-06, + "loss": 0.661, + "step": 1247 + }, + { + "epoch": 0.38364586535505685, + "grad_norm": 0.2825288772583008, + "learning_rate": 4.827742707200699e-06, + "loss": 0.6527, + "step": 1248 + }, + { + "epoch": 0.38395327390101447, + "grad_norm": 0.29219722747802734, + "learning_rate": 4.827445979010336e-06, + "loss": 0.6435, + "step": 1249 + }, + { + "epoch": 0.38426068244697204, + "grad_norm": 0.2841136157512665, + "learning_rate": 4.8271490046046835e-06, + "loss": 0.6515, + "step": 1250 + }, + { + "epoch": 0.3845680909929296, + "grad_norm": 0.29854416847229004, + "learning_rate": 4.8268517840151576e-06, + "loss": 0.6417, + "step": 1251 + }, + { + "epoch": 0.38487549953888717, + "grad_norm": 0.2892513871192932, + "learning_rate": 4.8265543172732e-06, + "loss": 0.6446, + "step": 1252 + }, + { + "epoch": 0.38518290808484473, + "grad_norm": 0.2976434528827667, + "learning_rate": 4.826256604410279e-06, + "loss": 0.6459, + "step": 1253 + }, + { + "epoch": 0.38549031663080235, + "grad_norm": 0.30052557587623596, + "learning_rate": 4.82595864545789e-06, + "loss": 0.636, + "step": 1254 + }, + { + "epoch": 0.3857977251767599, + "grad_norm": 0.27374929189682007, + "learning_rate": 4.825660440447553e-06, + "loss": 0.633, + "step": 1255 + }, + { + "epoch": 0.3861051337227175, + "grad_norm": 0.2786353826522827, + "learning_rate": 4.825361989410813e-06, + "loss": 0.6563, + "step": 1256 + }, + { + "epoch": 0.38641254226867505, + "grad_norm": 0.3012694716453552, + "learning_rate": 4.825063292379244e-06, + "loss": 0.6524, + "step": 1257 + }, + { + "epoch": 0.3867199508146327, + "grad_norm": 0.28786700963974, + "learning_rate": 4.824764349384445e-06, + "loss": 0.6586, + "step": 1258 + }, + { + "epoch": 0.38702735936059024, + "grad_norm": 0.2752508819103241, + "learning_rate": 4.824465160458039e-06, + "loss": 0.6286, + "step": 1259 + }, + { + "epoch": 0.3873347679065478, + "grad_norm": 0.28267568349838257, + "learning_rate": 4.824165725631678e-06, + "loss": 0.6597, + "step": 1260 + }, + { + "epoch": 0.38764217645250537, + "grad_norm": 0.2800127863883972, + "learning_rate": 4.823866044937037e-06, + "loss": 0.6368, + "step": 1261 + }, + { + "epoch": 0.38794958499846294, + "grad_norm": 0.28366294503211975, + "learning_rate": 4.8235661184058186e-06, + "loss": 0.6204, + "step": 1262 + }, + { + "epoch": 0.38825699354442056, + "grad_norm": 0.29918238520622253, + "learning_rate": 4.823265946069753e-06, + "loss": 0.6459, + "step": 1263 + }, + { + "epoch": 0.3885644020903781, + "grad_norm": 0.2910010814666748, + "learning_rate": 4.822965527960593e-06, + "loss": 0.6466, + "step": 1264 + }, + { + "epoch": 0.3888718106363357, + "grad_norm": 0.29574763774871826, + "learning_rate": 4.822664864110121e-06, + "loss": 0.6522, + "step": 1265 + }, + { + "epoch": 0.38917921918229326, + "grad_norm": 0.2960658073425293, + "learning_rate": 4.822363954550142e-06, + "loss": 0.6662, + "step": 1266 + }, + { + "epoch": 0.3894866277282508, + "grad_norm": 0.2983403205871582, + "learning_rate": 4.822062799312489e-06, + "loss": 0.6421, + "step": 1267 + }, + { + "epoch": 0.38979403627420844, + "grad_norm": 0.2772797644138336, + "learning_rate": 4.821761398429021e-06, + "loss": 0.6357, + "step": 1268 + }, + { + "epoch": 0.390101444820166, + "grad_norm": 0.28706374764442444, + "learning_rate": 4.821459751931622e-06, + "loss": 0.6231, + "step": 1269 + }, + { + "epoch": 0.3904088533661236, + "grad_norm": 0.2870384752750397, + "learning_rate": 4.821157859852201e-06, + "loss": 0.6388, + "step": 1270 + }, + { + "epoch": 0.39071626191208114, + "grad_norm": 0.287937194108963, + "learning_rate": 4.8208557222226985e-06, + "loss": 0.6228, + "step": 1271 + }, + { + "epoch": 0.3910236704580387, + "grad_norm": 0.2779027223587036, + "learning_rate": 4.820553339075073e-06, + "loss": 0.6285, + "step": 1272 + }, + { + "epoch": 0.39133107900399633, + "grad_norm": 0.28143054246902466, + "learning_rate": 4.820250710441315e-06, + "loss": 0.658, + "step": 1273 + }, + { + "epoch": 0.3916384875499539, + "grad_norm": 0.28802284598350525, + "learning_rate": 4.819947836353438e-06, + "loss": 0.6374, + "step": 1274 + }, + { + "epoch": 0.39194589609591146, + "grad_norm": 0.2876126170158386, + "learning_rate": 4.8196447168434834e-06, + "loss": 0.6443, + "step": 1275 + }, + { + "epoch": 0.392253304641869, + "grad_norm": 0.29363813996315, + "learning_rate": 4.819341351943517e-06, + "loss": 0.6659, + "step": 1276 + }, + { + "epoch": 0.39256071318782665, + "grad_norm": 0.2872544825077057, + "learning_rate": 4.81903774168563e-06, + "loss": 0.6595, + "step": 1277 + }, + { + "epoch": 0.3928681217337842, + "grad_norm": 0.2973545491695404, + "learning_rate": 4.8187338861019426e-06, + "loss": 0.6497, + "step": 1278 + }, + { + "epoch": 0.3931755302797418, + "grad_norm": 0.29788586497306824, + "learning_rate": 4.818429785224598e-06, + "loss": 0.6524, + "step": 1279 + }, + { + "epoch": 0.39348293882569935, + "grad_norm": 0.29561275243759155, + "learning_rate": 4.818125439085766e-06, + "loss": 0.6345, + "step": 1280 + }, + { + "epoch": 0.3937903473716569, + "grad_norm": 0.280953049659729, + "learning_rate": 4.817820847717643e-06, + "loss": 0.6417, + "step": 1281 + }, + { + "epoch": 0.39409775591761453, + "grad_norm": 0.29340311884880066, + "learning_rate": 4.817516011152451e-06, + "loss": 0.6306, + "step": 1282 + }, + { + "epoch": 0.3944051644635721, + "grad_norm": 0.29695218801498413, + "learning_rate": 4.817210929422439e-06, + "loss": 0.6288, + "step": 1283 + }, + { + "epoch": 0.39471257300952967, + "grad_norm": 0.28304794430732727, + "learning_rate": 4.81690560255988e-06, + "loss": 0.6517, + "step": 1284 + }, + { + "epoch": 0.39501998155548723, + "grad_norm": 0.3004229962825775, + "learning_rate": 4.816600030597073e-06, + "loss": 0.6414, + "step": 1285 + }, + { + "epoch": 0.3953273901014448, + "grad_norm": 0.2944739758968353, + "learning_rate": 4.816294213566345e-06, + "loss": 0.6615, + "step": 1286 + }, + { + "epoch": 0.3956347986474024, + "grad_norm": 0.29077383875846863, + "learning_rate": 4.815988151500048e-06, + "loss": 0.665, + "step": 1287 + }, + { + "epoch": 0.39594220719336, + "grad_norm": 0.3191203773021698, + "learning_rate": 4.815681844430557e-06, + "loss": 0.6534, + "step": 1288 + }, + { + "epoch": 0.39624961573931755, + "grad_norm": 0.29577741026878357, + "learning_rate": 4.815375292390279e-06, + "loss": 0.6509, + "step": 1289 + }, + { + "epoch": 0.3965570242852751, + "grad_norm": 0.2899928390979767, + "learning_rate": 4.815068495411642e-06, + "loss": 0.6414, + "step": 1290 + }, + { + "epoch": 0.3968644328312327, + "grad_norm": 0.2970438301563263, + "learning_rate": 4.814761453527101e-06, + "loss": 0.6629, + "step": 1291 + }, + { + "epoch": 0.3971718413771903, + "grad_norm": 0.3125231862068176, + "learning_rate": 4.8144541667691375e-06, + "loss": 0.6386, + "step": 1292 + }, + { + "epoch": 0.39747924992314787, + "grad_norm": 0.30687856674194336, + "learning_rate": 4.814146635170259e-06, + "loss": 0.6587, + "step": 1293 + }, + { + "epoch": 0.39778665846910544, + "grad_norm": 0.2877073884010315, + "learning_rate": 4.813838858762998e-06, + "loss": 0.6735, + "step": 1294 + }, + { + "epoch": 0.398094067015063, + "grad_norm": 0.2884162366390228, + "learning_rate": 4.813530837579914e-06, + "loss": 0.6774, + "step": 1295 + }, + { + "epoch": 0.3984014755610206, + "grad_norm": 0.31494462490081787, + "learning_rate": 4.8132225716535915e-06, + "loss": 0.6421, + "step": 1296 + }, + { + "epoch": 0.3987088841069782, + "grad_norm": 0.3030208945274353, + "learning_rate": 4.812914061016641e-06, + "loss": 0.6444, + "step": 1297 + }, + { + "epoch": 0.39901629265293576, + "grad_norm": 0.28316885232925415, + "learning_rate": 4.812605305701701e-06, + "loss": 0.6551, + "step": 1298 + }, + { + "epoch": 0.3993237011988933, + "grad_norm": 0.3290197551250458, + "learning_rate": 4.812296305741432e-06, + "loss": 0.6546, + "step": 1299 + }, + { + "epoch": 0.3996311097448509, + "grad_norm": 0.2984607517719269, + "learning_rate": 4.8119870611685225e-06, + "loss": 0.6274, + "step": 1300 + }, + { + "epoch": 0.3999385182908085, + "grad_norm": 0.27872130274772644, + "learning_rate": 4.8116775720156875e-06, + "loss": 0.6413, + "step": 1301 + }, + { + "epoch": 0.4002459268367661, + "grad_norm": 0.2952040433883667, + "learning_rate": 4.811367838315667e-06, + "loss": 0.6366, + "step": 1302 + }, + { + "epoch": 0.40055333538272364, + "grad_norm": 0.30513325333595276, + "learning_rate": 4.811057860101227e-06, + "loss": 0.6512, + "step": 1303 + }, + { + "epoch": 0.4008607439286812, + "grad_norm": 0.2853948771953583, + "learning_rate": 4.81074763740516e-06, + "loss": 0.6371, + "step": 1304 + }, + { + "epoch": 0.4011681524746388, + "grad_norm": 0.2751544713973999, + "learning_rate": 4.810437170260283e-06, + "loss": 0.6257, + "step": 1305 + }, + { + "epoch": 0.4014755610205964, + "grad_norm": 0.2961689531803131, + "learning_rate": 4.810126458699439e-06, + "loss": 0.6526, + "step": 1306 + }, + { + "epoch": 0.40178296956655396, + "grad_norm": 0.30491209030151367, + "learning_rate": 4.809815502755499e-06, + "loss": 0.6597, + "step": 1307 + }, + { + "epoch": 0.4020903781125115, + "grad_norm": 0.2933606505393982, + "learning_rate": 4.809504302461358e-06, + "loss": 0.6469, + "step": 1308 + }, + { + "epoch": 0.4023977866584691, + "grad_norm": 0.2970842719078064, + "learning_rate": 4.809192857849936e-06, + "loss": 0.6492, + "step": 1309 + }, + { + "epoch": 0.40270519520442666, + "grad_norm": 0.3252752423286438, + "learning_rate": 4.80888116895418e-06, + "loss": 0.6451, + "step": 1310 + }, + { + "epoch": 0.4030126037503843, + "grad_norm": 0.30382615327835083, + "learning_rate": 4.808569235807066e-06, + "loss": 0.6493, + "step": 1311 + }, + { + "epoch": 0.40332001229634185, + "grad_norm": 0.2959381341934204, + "learning_rate": 4.808257058441589e-06, + "loss": 0.6493, + "step": 1312 + }, + { + "epoch": 0.4036274208422994, + "grad_norm": 0.2897432744503021, + "learning_rate": 4.8079446368907745e-06, + "loss": 0.6448, + "step": 1313 + }, + { + "epoch": 0.403934829388257, + "grad_norm": 0.31283366680145264, + "learning_rate": 4.807631971187674e-06, + "loss": 0.6391, + "step": 1314 + }, + { + "epoch": 0.4042422379342146, + "grad_norm": 0.29319167137145996, + "learning_rate": 4.807319061365363e-06, + "loss": 0.6415, + "step": 1315 + }, + { + "epoch": 0.40454964648017216, + "grad_norm": 0.29910117387771606, + "learning_rate": 4.807005907456943e-06, + "loss": 0.6266, + "step": 1316 + }, + { + "epoch": 0.40485705502612973, + "grad_norm": 0.2988852262496948, + "learning_rate": 4.806692509495543e-06, + "loss": 0.6371, + "step": 1317 + }, + { + "epoch": 0.4051644635720873, + "grad_norm": 0.2972976565361023, + "learning_rate": 4.806378867514317e-06, + "loss": 0.6305, + "step": 1318 + }, + { + "epoch": 0.40547187211804486, + "grad_norm": 0.2830500900745392, + "learning_rate": 4.8060649815464425e-06, + "loss": 0.6669, + "step": 1319 + }, + { + "epoch": 0.4057792806640025, + "grad_norm": 0.2883017957210541, + "learning_rate": 4.805750851625125e-06, + "loss": 0.6364, + "step": 1320 + }, + { + "epoch": 0.40608668920996005, + "grad_norm": 0.3054847717285156, + "learning_rate": 4.805436477783598e-06, + "loss": 0.645, + "step": 1321 + }, + { + "epoch": 0.4063940977559176, + "grad_norm": 0.2969795763492584, + "learning_rate": 4.805121860055116e-06, + "loss": 0.6422, + "step": 1322 + }, + { + "epoch": 0.4067015063018752, + "grad_norm": 0.28812962770462036, + "learning_rate": 4.804806998472962e-06, + "loss": 0.6661, + "step": 1323 + }, + { + "epoch": 0.40700891484783275, + "grad_norm": 0.30108073353767395, + "learning_rate": 4.804491893070446e-06, + "loss": 0.6473, + "step": 1324 + }, + { + "epoch": 0.40731632339379037, + "grad_norm": 0.30156052112579346, + "learning_rate": 4.804176543880901e-06, + "loss": 0.6673, + "step": 1325 + }, + { + "epoch": 0.40762373193974794, + "grad_norm": 0.2892749607563019, + "learning_rate": 4.803860950937687e-06, + "loss": 0.6566, + "step": 1326 + }, + { + "epoch": 0.4079311404857055, + "grad_norm": 0.2765393555164337, + "learning_rate": 4.803545114274192e-06, + "loss": 0.6276, + "step": 1327 + }, + { + "epoch": 0.40823854903166307, + "grad_norm": 0.29268428683280945, + "learning_rate": 4.803229033923824e-06, + "loss": 0.6401, + "step": 1328 + }, + { + "epoch": 0.40854595757762063, + "grad_norm": 0.2892889976501465, + "learning_rate": 4.802912709920023e-06, + "loss": 0.6629, + "step": 1329 + }, + { + "epoch": 0.40885336612357825, + "grad_norm": 0.2830463647842407, + "learning_rate": 4.802596142296252e-06, + "loss": 0.6432, + "step": 1330 + }, + { + "epoch": 0.4091607746695358, + "grad_norm": 0.2858258783817291, + "learning_rate": 4.8022793310859995e-06, + "loss": 0.6492, + "step": 1331 + }, + { + "epoch": 0.4094681832154934, + "grad_norm": 0.28231415152549744, + "learning_rate": 4.8019622763227804e-06, + "loss": 0.6455, + "step": 1332 + }, + { + "epoch": 0.40977559176145095, + "grad_norm": 0.2801310122013092, + "learning_rate": 4.801644978040134e-06, + "loss": 0.6159, + "step": 1333 + }, + { + "epoch": 0.4100830003074085, + "grad_norm": 0.2734101116657257, + "learning_rate": 4.801327436271629e-06, + "loss": 0.6387, + "step": 1334 + }, + { + "epoch": 0.41039040885336614, + "grad_norm": 0.2847292423248291, + "learning_rate": 4.801009651050856e-06, + "loss": 0.6416, + "step": 1335 + }, + { + "epoch": 0.4106978173993237, + "grad_norm": 0.297359824180603, + "learning_rate": 4.800691622411432e-06, + "loss": 0.6332, + "step": 1336 + }, + { + "epoch": 0.41100522594528127, + "grad_norm": 0.29589030146598816, + "learning_rate": 4.800373350387002e-06, + "loss": 0.6671, + "step": 1337 + }, + { + "epoch": 0.41131263449123884, + "grad_norm": 0.28293389081954956, + "learning_rate": 4.800054835011236e-06, + "loss": 0.644, + "step": 1338 + }, + { + "epoch": 0.41162004303719646, + "grad_norm": 0.27555689215660095, + "learning_rate": 4.799736076317825e-06, + "loss": 0.6462, + "step": 1339 + }, + { + "epoch": 0.411927451583154, + "grad_norm": 0.29116660356521606, + "learning_rate": 4.799417074340495e-06, + "loss": 0.6563, + "step": 1340 + }, + { + "epoch": 0.4122348601291116, + "grad_norm": 0.2966228723526001, + "learning_rate": 4.799097829112988e-06, + "loss": 0.6559, + "step": 1341 + }, + { + "epoch": 0.41254226867506916, + "grad_norm": 0.2990160584449768, + "learning_rate": 4.79877834066908e-06, + "loss": 0.6541, + "step": 1342 + }, + { + "epoch": 0.4128496772210267, + "grad_norm": 0.2890549898147583, + "learning_rate": 4.798458609042567e-06, + "loss": 0.6507, + "step": 1343 + }, + { + "epoch": 0.41315708576698434, + "grad_norm": 0.29993942379951477, + "learning_rate": 4.798138634267273e-06, + "loss": 0.6419, + "step": 1344 + }, + { + "epoch": 0.4134644943129419, + "grad_norm": 0.28753456473350525, + "learning_rate": 4.7978184163770465e-06, + "loss": 0.6209, + "step": 1345 + }, + { + "epoch": 0.4137719028588995, + "grad_norm": 0.2902330458164215, + "learning_rate": 4.797497955405764e-06, + "loss": 0.6414, + "step": 1346 + }, + { + "epoch": 0.41407931140485704, + "grad_norm": 0.28759071230888367, + "learning_rate": 4.7971772513873255e-06, + "loss": 0.6555, + "step": 1347 + }, + { + "epoch": 0.4143867199508146, + "grad_norm": 0.2864113450050354, + "learning_rate": 4.796856304355658e-06, + "loss": 0.6275, + "step": 1348 + }, + { + "epoch": 0.41469412849677223, + "grad_norm": 0.291998028755188, + "learning_rate": 4.7965351143447135e-06, + "loss": 0.6405, + "step": 1349 + }, + { + "epoch": 0.4150015370427298, + "grad_norm": 0.28509432077407837, + "learning_rate": 4.79621368138847e-06, + "loss": 0.6358, + "step": 1350 + }, + { + "epoch": 0.41530894558868736, + "grad_norm": 0.2929030954837799, + "learning_rate": 4.795892005520933e-06, + "loss": 0.6448, + "step": 1351 + }, + { + "epoch": 0.4156163541346449, + "grad_norm": 0.28987154364585876, + "learning_rate": 4.795570086776128e-06, + "loss": 0.6659, + "step": 1352 + }, + { + "epoch": 0.4159237626806025, + "grad_norm": 0.29789918661117554, + "learning_rate": 4.795247925188114e-06, + "loss": 0.6284, + "step": 1353 + }, + { + "epoch": 0.4162311712265601, + "grad_norm": 0.2786828279495239, + "learning_rate": 4.7949255207909685e-06, + "loss": 0.661, + "step": 1354 + }, + { + "epoch": 0.4165385797725177, + "grad_norm": 0.30405327677726746, + "learning_rate": 4.7946028736187994e-06, + "loss": 0.6666, + "step": 1355 + }, + { + "epoch": 0.41684598831847525, + "grad_norm": 0.30058905482292175, + "learning_rate": 4.794279983705739e-06, + "loss": 0.6475, + "step": 1356 + }, + { + "epoch": 0.4171533968644328, + "grad_norm": 0.2944830060005188, + "learning_rate": 4.7939568510859455e-06, + "loss": 0.6553, + "step": 1357 + }, + { + "epoch": 0.41746080541039043, + "grad_norm": 0.2861654460430145, + "learning_rate": 4.793633475793602e-06, + "loss": 0.6451, + "step": 1358 + }, + { + "epoch": 0.417768213956348, + "grad_norm": 0.28486523032188416, + "learning_rate": 4.7933098578629165e-06, + "loss": 0.6336, + "step": 1359 + }, + { + "epoch": 0.41807562250230557, + "grad_norm": 0.30304887890815735, + "learning_rate": 4.792985997328126e-06, + "loss": 0.628, + "step": 1360 + }, + { + "epoch": 0.41838303104826313, + "grad_norm": 0.2826055586338043, + "learning_rate": 4.792661894223489e-06, + "loss": 0.6406, + "step": 1361 + }, + { + "epoch": 0.4186904395942207, + "grad_norm": 0.2965010702610016, + "learning_rate": 4.792337548583292e-06, + "loss": 0.6772, + "step": 1362 + }, + { + "epoch": 0.4189978481401783, + "grad_norm": 0.2906387448310852, + "learning_rate": 4.7920129604418476e-06, + "loss": 0.6324, + "step": 1363 + }, + { + "epoch": 0.4193052566861359, + "grad_norm": 0.2874164581298828, + "learning_rate": 4.791688129833493e-06, + "loss": 0.6599, + "step": 1364 + }, + { + "epoch": 0.41961266523209345, + "grad_norm": 0.2864219546318054, + "learning_rate": 4.791363056792591e-06, + "loss": 0.6441, + "step": 1365 + }, + { + "epoch": 0.419920073778051, + "grad_norm": 0.3047635853290558, + "learning_rate": 4.79103774135353e-06, + "loss": 0.6681, + "step": 1366 + }, + { + "epoch": 0.4202274823240086, + "grad_norm": 0.2857508659362793, + "learning_rate": 4.790712183550726e-06, + "loss": 0.6498, + "step": 1367 + }, + { + "epoch": 0.4205348908699662, + "grad_norm": 0.2881411910057068, + "learning_rate": 4.790386383418617e-06, + "loss": 0.6646, + "step": 1368 + }, + { + "epoch": 0.42084229941592377, + "grad_norm": 0.30643975734710693, + "learning_rate": 4.79006034099167e-06, + "loss": 0.6601, + "step": 1369 + }, + { + "epoch": 0.42114970796188134, + "grad_norm": 0.31087690591812134, + "learning_rate": 4.789734056304376e-06, + "loss": 0.6313, + "step": 1370 + }, + { + "epoch": 0.4214571165078389, + "grad_norm": 0.2854000926017761, + "learning_rate": 4.789407529391251e-06, + "loss": 0.631, + "step": 1371 + }, + { + "epoch": 0.42176452505379647, + "grad_norm": 0.3018312454223633, + "learning_rate": 4.789080760286839e-06, + "loss": 0.6422, + "step": 1372 + }, + { + "epoch": 0.4220719335997541, + "grad_norm": 0.2817084789276123, + "learning_rate": 4.788753749025707e-06, + "loss": 0.6387, + "step": 1373 + }, + { + "epoch": 0.42237934214571166, + "grad_norm": 0.28203386068344116, + "learning_rate": 4.78842649564245e-06, + "loss": 0.632, + "step": 1374 + }, + { + "epoch": 0.4226867506916692, + "grad_norm": 0.3305239975452423, + "learning_rate": 4.7880990001716865e-06, + "loss": 0.6481, + "step": 1375 + }, + { + "epoch": 0.4229941592376268, + "grad_norm": 0.3063565194606781, + "learning_rate": 4.787771262648062e-06, + "loss": 0.6374, + "step": 1376 + }, + { + "epoch": 0.4233015677835844, + "grad_norm": 0.2873280644416809, + "learning_rate": 4.7874432831062456e-06, + "loss": 0.6277, + "step": 1377 + }, + { + "epoch": 0.423608976329542, + "grad_norm": 0.2987636923789978, + "learning_rate": 4.787115061580936e-06, + "loss": 0.6526, + "step": 1378 + }, + { + "epoch": 0.42391638487549954, + "grad_norm": 0.2991296350955963, + "learning_rate": 4.7867865981068536e-06, + "loss": 0.6423, + "step": 1379 + }, + { + "epoch": 0.4242237934214571, + "grad_norm": 0.2835756242275238, + "learning_rate": 4.7864578927187455e-06, + "loss": 0.6378, + "step": 1380 + }, + { + "epoch": 0.4245312019674147, + "grad_norm": 0.30141958594322205, + "learning_rate": 4.786128945451386e-06, + "loss": 0.6374, + "step": 1381 + }, + { + "epoch": 0.4248386105133723, + "grad_norm": 0.31011196970939636, + "learning_rate": 4.785799756339572e-06, + "loss": 0.6566, + "step": 1382 + }, + { + "epoch": 0.42514601905932986, + "grad_norm": 0.3008863925933838, + "learning_rate": 4.785470325418129e-06, + "loss": 0.6438, + "step": 1383 + }, + { + "epoch": 0.4254534276052874, + "grad_norm": 0.2868259847164154, + "learning_rate": 4.785140652721906e-06, + "loss": 0.6577, + "step": 1384 + }, + { + "epoch": 0.425760836151245, + "grad_norm": 0.30679020285606384, + "learning_rate": 4.784810738285779e-06, + "loss": 0.6393, + "step": 1385 + }, + { + "epoch": 0.42606824469720256, + "grad_norm": 0.3091297745704651, + "learning_rate": 4.784480582144648e-06, + "loss": 0.6789, + "step": 1386 + }, + { + "epoch": 0.4263756532431602, + "grad_norm": 0.2946123778820038, + "learning_rate": 4.78415018433344e-06, + "loss": 0.6375, + "step": 1387 + }, + { + "epoch": 0.42668306178911775, + "grad_norm": 0.29859328269958496, + "learning_rate": 4.783819544887108e-06, + "loss": 0.6631, + "step": 1388 + }, + { + "epoch": 0.4269904703350753, + "grad_norm": 0.2838020622730255, + "learning_rate": 4.783488663840629e-06, + "loss": 0.6469, + "step": 1389 + }, + { + "epoch": 0.4272978788810329, + "grad_norm": 0.30996763706207275, + "learning_rate": 4.783157541229004e-06, + "loss": 0.629, + "step": 1390 + }, + { + "epoch": 0.42760528742699044, + "grad_norm": 0.28751084208488464, + "learning_rate": 4.782826177087265e-06, + "loss": 0.6671, + "step": 1391 + }, + { + "epoch": 0.42791269597294807, + "grad_norm": 0.3103798031806946, + "learning_rate": 4.782494571450465e-06, + "loss": 0.6515, + "step": 1392 + }, + { + "epoch": 0.42822010451890563, + "grad_norm": 0.28273436427116394, + "learning_rate": 4.7821627243536824e-06, + "loss": 0.6433, + "step": 1393 + }, + { + "epoch": 0.4285275130648632, + "grad_norm": 0.2845565974712372, + "learning_rate": 4.781830635832025e-06, + "loss": 0.6533, + "step": 1394 + }, + { + "epoch": 0.42883492161082076, + "grad_norm": 0.29229190945625305, + "learning_rate": 4.781498305920622e-06, + "loss": 0.6524, + "step": 1395 + }, + { + "epoch": 0.4291423301567784, + "grad_norm": 0.2931855022907257, + "learning_rate": 4.781165734654632e-06, + "loss": 0.6604, + "step": 1396 + }, + { + "epoch": 0.42944973870273595, + "grad_norm": 0.2965172529220581, + "learning_rate": 4.780832922069233e-06, + "loss": 0.6435, + "step": 1397 + }, + { + "epoch": 0.4297571472486935, + "grad_norm": 0.29172855615615845, + "learning_rate": 4.780499868199636e-06, + "loss": 0.6231, + "step": 1398 + }, + { + "epoch": 0.4300645557946511, + "grad_norm": 0.28969332575798035, + "learning_rate": 4.780166573081072e-06, + "loss": 0.6306, + "step": 1399 + }, + { + "epoch": 0.43037196434060865, + "grad_norm": 0.28254467248916626, + "learning_rate": 4.779833036748801e-06, + "loss": 0.6358, + "step": 1400 + }, + { + "epoch": 0.43067937288656627, + "grad_norm": 0.2924171984195709, + "learning_rate": 4.7794992592381066e-06, + "loss": 0.6458, + "step": 1401 + }, + { + "epoch": 0.43098678143252384, + "grad_norm": 0.2915787100791931, + "learning_rate": 4.779165240584298e-06, + "loss": 0.69, + "step": 1402 + }, + { + "epoch": 0.4312941899784814, + "grad_norm": 0.28551405668258667, + "learning_rate": 4.7788309808227104e-06, + "loss": 0.642, + "step": 1403 + }, + { + "epoch": 0.43160159852443897, + "grad_norm": 0.29959219694137573, + "learning_rate": 4.778496479988705e-06, + "loss": 0.6392, + "step": 1404 + }, + { + "epoch": 0.43190900707039653, + "grad_norm": 0.2942274510860443, + "learning_rate": 4.778161738117666e-06, + "loss": 0.6283, + "step": 1405 + }, + { + "epoch": 0.43221641561635415, + "grad_norm": 0.29136040806770325, + "learning_rate": 4.777826755245008e-06, + "loss": 0.6316, + "step": 1406 + }, + { + "epoch": 0.4325238241623117, + "grad_norm": 0.2860836982727051, + "learning_rate": 4.777491531406165e-06, + "loss": 0.6396, + "step": 1407 + }, + { + "epoch": 0.4328312327082693, + "grad_norm": 0.2952307164669037, + "learning_rate": 4.777156066636602e-06, + "loss": 0.6445, + "step": 1408 + }, + { + "epoch": 0.43313864125422685, + "grad_norm": 0.29671967029571533, + "learning_rate": 4.776820360971806e-06, + "loss": 0.6252, + "step": 1409 + }, + { + "epoch": 0.4334460498001844, + "grad_norm": 0.3051259219646454, + "learning_rate": 4.776484414447291e-06, + "loss": 0.6632, + "step": 1410 + }, + { + "epoch": 0.43375345834614204, + "grad_norm": 0.28810831904411316, + "learning_rate": 4.776148227098595e-06, + "loss": 0.6159, + "step": 1411 + }, + { + "epoch": 0.4340608668920996, + "grad_norm": 0.2889414429664612, + "learning_rate": 4.775811798961283e-06, + "loss": 0.6434, + "step": 1412 + }, + { + "epoch": 0.43436827543805717, + "grad_norm": 0.28940972685813904, + "learning_rate": 4.775475130070946e-06, + "loss": 0.6356, + "step": 1413 + }, + { + "epoch": 0.43467568398401474, + "grad_norm": 0.29118797183036804, + "learning_rate": 4.775138220463199e-06, + "loss": 0.6708, + "step": 1414 + }, + { + "epoch": 0.43498309252997236, + "grad_norm": 0.28793805837631226, + "learning_rate": 4.774801070173681e-06, + "loss": 0.6412, + "step": 1415 + }, + { + "epoch": 0.4352905010759299, + "grad_norm": 0.3022698760032654, + "learning_rate": 4.774463679238061e-06, + "loss": 0.6517, + "step": 1416 + }, + { + "epoch": 0.4355979096218875, + "grad_norm": 0.2879706621170044, + "learning_rate": 4.774126047692029e-06, + "loss": 0.623, + "step": 1417 + }, + { + "epoch": 0.43590531816784506, + "grad_norm": 0.28082728385925293, + "learning_rate": 4.773788175571304e-06, + "loss": 0.6418, + "step": 1418 + }, + { + "epoch": 0.4362127267138026, + "grad_norm": 0.29914408922195435, + "learning_rate": 4.773450062911627e-06, + "loss": 0.6373, + "step": 1419 + }, + { + "epoch": 0.43652013525976024, + "grad_norm": 0.306641161441803, + "learning_rate": 4.773111709748767e-06, + "loss": 0.6549, + "step": 1420 + }, + { + "epoch": 0.4368275438057178, + "grad_norm": 0.2959968149662018, + "learning_rate": 4.772773116118518e-06, + "loss": 0.6567, + "step": 1421 + }, + { + "epoch": 0.4371349523516754, + "grad_norm": 0.29691922664642334, + "learning_rate": 4.7724342820566986e-06, + "loss": 0.6406, + "step": 1422 + }, + { + "epoch": 0.43744236089763294, + "grad_norm": 0.29534029960632324, + "learning_rate": 4.772095207599153e-06, + "loss": 0.6594, + "step": 1423 + }, + { + "epoch": 0.4377497694435905, + "grad_norm": 0.2903323471546173, + "learning_rate": 4.7717558927817516e-06, + "loss": 0.6516, + "step": 1424 + }, + { + "epoch": 0.43805717798954813, + "grad_norm": 0.2871916890144348, + "learning_rate": 4.771416337640389e-06, + "loss": 0.6554, + "step": 1425 + }, + { + "epoch": 0.4383645865355057, + "grad_norm": 0.2906838655471802, + "learning_rate": 4.771076542210987e-06, + "loss": 0.6619, + "step": 1426 + }, + { + "epoch": 0.43867199508146326, + "grad_norm": 0.3029407262802124, + "learning_rate": 4.770736506529492e-06, + "loss": 0.6931, + "step": 1427 + }, + { + "epoch": 0.43897940362742083, + "grad_norm": 0.2975960075855255, + "learning_rate": 4.770396230631874e-06, + "loss": 0.6775, + "step": 1428 + }, + { + "epoch": 0.4392868121733784, + "grad_norm": 0.28220292925834656, + "learning_rate": 4.770055714554132e-06, + "loss": 0.6399, + "step": 1429 + }, + { + "epoch": 0.439594220719336, + "grad_norm": 0.2836722135543823, + "learning_rate": 4.7697149583322865e-06, + "loss": 0.6636, + "step": 1430 + }, + { + "epoch": 0.4399016292652936, + "grad_norm": 0.30066776275634766, + "learning_rate": 4.769373962002387e-06, + "loss": 0.6437, + "step": 1431 + }, + { + "epoch": 0.44020903781125115, + "grad_norm": 0.2889501750469208, + "learning_rate": 4.769032725600505e-06, + "loss": 0.625, + "step": 1432 + }, + { + "epoch": 0.4405164463572087, + "grad_norm": 0.30354368686676025, + "learning_rate": 4.768691249162741e-06, + "loss": 0.653, + "step": 1433 + }, + { + "epoch": 0.44082385490316633, + "grad_norm": 0.28996190428733826, + "learning_rate": 4.768349532725218e-06, + "loss": 0.628, + "step": 1434 + }, + { + "epoch": 0.4411312634491239, + "grad_norm": 0.28898096084594727, + "learning_rate": 4.768007576324085e-06, + "loss": 0.6556, + "step": 1435 + }, + { + "epoch": 0.44143867199508147, + "grad_norm": 0.2962207794189453, + "learning_rate": 4.767665379995518e-06, + "loss": 0.6338, + "step": 1436 + }, + { + "epoch": 0.44174608054103903, + "grad_norm": 0.29608067870140076, + "learning_rate": 4.767322943775716e-06, + "loss": 0.6189, + "step": 1437 + }, + { + "epoch": 0.4420534890869966, + "grad_norm": 0.2832201421260834, + "learning_rate": 4.766980267700906e-06, + "loss": 0.6389, + "step": 1438 + }, + { + "epoch": 0.4423608976329542, + "grad_norm": 0.2959904670715332, + "learning_rate": 4.7666373518073375e-06, + "loss": 0.669, + "step": 1439 + }, + { + "epoch": 0.4426683061789118, + "grad_norm": 0.30100777745246887, + "learning_rate": 4.766294196131288e-06, + "loss": 0.6742, + "step": 1440 + }, + { + "epoch": 0.44297571472486935, + "grad_norm": 0.30483341217041016, + "learning_rate": 4.765950800709058e-06, + "loss": 0.6397, + "step": 1441 + }, + { + "epoch": 0.4432831232708269, + "grad_norm": 0.2897294759750366, + "learning_rate": 4.7656071655769745e-06, + "loss": 0.6588, + "step": 1442 + }, + { + "epoch": 0.4435905318167845, + "grad_norm": 0.2855444848537445, + "learning_rate": 4.765263290771392e-06, + "loss": 0.6567, + "step": 1443 + }, + { + "epoch": 0.4438979403627421, + "grad_norm": 0.3053029477596283, + "learning_rate": 4.764919176328685e-06, + "loss": 0.666, + "step": 1444 + }, + { + "epoch": 0.44420534890869967, + "grad_norm": 0.27984005212783813, + "learning_rate": 4.764574822285259e-06, + "loss": 0.6716, + "step": 1445 + }, + { + "epoch": 0.44451275745465724, + "grad_norm": 0.3000514805316925, + "learning_rate": 4.7642302286775425e-06, + "loss": 0.6337, + "step": 1446 + }, + { + "epoch": 0.4448201660006148, + "grad_norm": 0.29364800453186035, + "learning_rate": 4.7638853955419875e-06, + "loss": 0.6383, + "step": 1447 + }, + { + "epoch": 0.44512757454657237, + "grad_norm": 0.28497782349586487, + "learning_rate": 4.763540322915075e-06, + "loss": 0.6289, + "step": 1448 + }, + { + "epoch": 0.44543498309253, + "grad_norm": 0.30061399936676025, + "learning_rate": 4.763195010833307e-06, + "loss": 0.6361, + "step": 1449 + }, + { + "epoch": 0.44574239163848756, + "grad_norm": 0.297253280878067, + "learning_rate": 4.762849459333216e-06, + "loss": 0.6545, + "step": 1450 + }, + { + "epoch": 0.4460498001844451, + "grad_norm": 0.2866867482662201, + "learning_rate": 4.762503668451356e-06, + "loss": 0.6309, + "step": 1451 + }, + { + "epoch": 0.4463572087304027, + "grad_norm": 0.29333987832069397, + "learning_rate": 4.762157638224308e-06, + "loss": 0.6567, + "step": 1452 + }, + { + "epoch": 0.4466646172763603, + "grad_norm": 0.2911797761917114, + "learning_rate": 4.7618113686886755e-06, + "loss": 0.6577, + "step": 1453 + }, + { + "epoch": 0.4469720258223179, + "grad_norm": 0.30487287044525146, + "learning_rate": 4.761464859881093e-06, + "loss": 0.6336, + "step": 1454 + }, + { + "epoch": 0.44727943436827544, + "grad_norm": 0.2947708070278168, + "learning_rate": 4.761118111838215e-06, + "loss": 0.6311, + "step": 1455 + }, + { + "epoch": 0.447586842914233, + "grad_norm": 0.2884784936904907, + "learning_rate": 4.760771124596723e-06, + "loss": 0.6409, + "step": 1456 + }, + { + "epoch": 0.4478942514601906, + "grad_norm": 0.29756152629852295, + "learning_rate": 4.760423898193324e-06, + "loss": 0.6447, + "step": 1457 + }, + { + "epoch": 0.4482016600061482, + "grad_norm": 0.29785579442977905, + "learning_rate": 4.760076432664751e-06, + "loss": 0.645, + "step": 1458 + }, + { + "epoch": 0.44850906855210576, + "grad_norm": 0.29637962579727173, + "learning_rate": 4.759728728047761e-06, + "loss": 0.6337, + "step": 1459 + }, + { + "epoch": 0.4488164770980633, + "grad_norm": 0.2938099801540375, + "learning_rate": 4.759380784379137e-06, + "loss": 0.6671, + "step": 1460 + }, + { + "epoch": 0.4491238856440209, + "grad_norm": 0.2918235957622528, + "learning_rate": 4.759032601695688e-06, + "loss": 0.6491, + "step": 1461 + }, + { + "epoch": 0.44943129418997846, + "grad_norm": 0.28989458084106445, + "learning_rate": 4.758684180034247e-06, + "loss": 0.6316, + "step": 1462 + }, + { + "epoch": 0.4497387027359361, + "grad_norm": 0.2799231708049774, + "learning_rate": 4.758335519431671e-06, + "loss": 0.657, + "step": 1463 + }, + { + "epoch": 0.45004611128189365, + "grad_norm": 0.28531450033187866, + "learning_rate": 4.757986619924847e-06, + "loss": 0.6614, + "step": 1464 + }, + { + "epoch": 0.4503535198278512, + "grad_norm": 0.29477742314338684, + "learning_rate": 4.757637481550683e-06, + "loss": 0.6563, + "step": 1465 + }, + { + "epoch": 0.4506609283738088, + "grad_norm": 0.2945522367954254, + "learning_rate": 4.757288104346114e-06, + "loss": 0.6617, + "step": 1466 + }, + { + "epoch": 0.45096833691976634, + "grad_norm": 0.29456108808517456, + "learning_rate": 4.7569384883480975e-06, + "loss": 0.6331, + "step": 1467 + }, + { + "epoch": 0.45127574546572397, + "grad_norm": 0.283151239156723, + "learning_rate": 4.756588633593622e-06, + "loss": 0.6279, + "step": 1468 + }, + { + "epoch": 0.45158315401168153, + "grad_norm": 0.2909386157989502, + "learning_rate": 4.756238540119695e-06, + "loss": 0.6251, + "step": 1469 + }, + { + "epoch": 0.4518905625576391, + "grad_norm": 0.2915608286857605, + "learning_rate": 4.755888207963354e-06, + "loss": 0.6069, + "step": 1470 + }, + { + "epoch": 0.45219797110359666, + "grad_norm": 0.28931739926338196, + "learning_rate": 4.755537637161659e-06, + "loss": 0.6607, + "step": 1471 + }, + { + "epoch": 0.4525053796495543, + "grad_norm": 0.30026066303253174, + "learning_rate": 4.755186827751696e-06, + "loss": 0.6502, + "step": 1472 + }, + { + "epoch": 0.45281278819551185, + "grad_norm": 0.302143394947052, + "learning_rate": 4.754835779770577e-06, + "loss": 0.6354, + "step": 1473 + }, + { + "epoch": 0.4531201967414694, + "grad_norm": 0.30134716629981995, + "learning_rate": 4.7544844932554375e-06, + "loss": 0.6556, + "step": 1474 + }, + { + "epoch": 0.453427605287427, + "grad_norm": 0.287076473236084, + "learning_rate": 4.754132968243441e-06, + "loss": 0.6534, + "step": 1475 + }, + { + "epoch": 0.45373501383338455, + "grad_norm": 0.275790810585022, + "learning_rate": 4.753781204771773e-06, + "loss": 0.6583, + "step": 1476 + }, + { + "epoch": 0.45404242237934217, + "grad_norm": 0.2907719910144806, + "learning_rate": 4.753429202877647e-06, + "loss": 0.6387, + "step": 1477 + }, + { + "epoch": 0.45434983092529974, + "grad_norm": 0.2935529947280884, + "learning_rate": 4.7530769625982995e-06, + "loss": 0.6528, + "step": 1478 + }, + { + "epoch": 0.4546572394712573, + "grad_norm": 0.27522626519203186, + "learning_rate": 4.752724483970993e-06, + "loss": 0.6312, + "step": 1479 + }, + { + "epoch": 0.45496464801721487, + "grad_norm": 0.27994540333747864, + "learning_rate": 4.752371767033016e-06, + "loss": 0.6776, + "step": 1480 + }, + { + "epoch": 0.45527205656317243, + "grad_norm": 0.3046683669090271, + "learning_rate": 4.752018811821682e-06, + "loss": 0.6635, + "step": 1481 + }, + { + "epoch": 0.45557946510913006, + "grad_norm": 0.29309189319610596, + "learning_rate": 4.751665618374329e-06, + "loss": 0.6345, + "step": 1482 + }, + { + "epoch": 0.4558868736550876, + "grad_norm": 0.2883107364177704, + "learning_rate": 4.75131218672832e-06, + "loss": 0.6469, + "step": 1483 + }, + { + "epoch": 0.4561942822010452, + "grad_norm": 0.3067610561847687, + "learning_rate": 4.750958516921045e-06, + "loss": 0.685, + "step": 1484 + }, + { + "epoch": 0.45650169074700275, + "grad_norm": 0.2890136241912842, + "learning_rate": 4.750604608989916e-06, + "loss": 0.6535, + "step": 1485 + }, + { + "epoch": 0.4568090992929603, + "grad_norm": 0.3001953065395355, + "learning_rate": 4.750250462972374e-06, + "loss": 0.663, + "step": 1486 + }, + { + "epoch": 0.45711650783891794, + "grad_norm": 0.28826040029525757, + "learning_rate": 4.749896078905883e-06, + "loss": 0.6365, + "step": 1487 + }, + { + "epoch": 0.4574239163848755, + "grad_norm": 0.29574403166770935, + "learning_rate": 4.749541456827931e-06, + "loss": 0.643, + "step": 1488 + }, + { + "epoch": 0.4577313249308331, + "grad_norm": 0.30554935336112976, + "learning_rate": 4.749186596776034e-06, + "loss": 0.6569, + "step": 1489 + }, + { + "epoch": 0.45803873347679064, + "grad_norm": 0.3008880317211151, + "learning_rate": 4.748831498787731e-06, + "loss": 0.6532, + "step": 1490 + }, + { + "epoch": 0.45834614202274826, + "grad_norm": 0.30550616979599, + "learning_rate": 4.748476162900589e-06, + "loss": 0.6389, + "step": 1491 + }, + { + "epoch": 0.4586535505687058, + "grad_norm": 0.2926572561264038, + "learning_rate": 4.748120589152196e-06, + "loss": 0.6391, + "step": 1492 + }, + { + "epoch": 0.4589609591146634, + "grad_norm": 0.2909615635871887, + "learning_rate": 4.7477647775801686e-06, + "loss": 0.6446, + "step": 1493 + }, + { + "epoch": 0.45926836766062096, + "grad_norm": 0.30203884840011597, + "learning_rate": 4.747408728222147e-06, + "loss": 0.6409, + "step": 1494 + }, + { + "epoch": 0.4595757762065785, + "grad_norm": 0.28833842277526855, + "learning_rate": 4.747052441115796e-06, + "loss": 0.6358, + "step": 1495 + }, + { + "epoch": 0.45988318475253614, + "grad_norm": 0.29309627413749695, + "learning_rate": 4.746695916298808e-06, + "loss": 0.6542, + "step": 1496 + }, + { + "epoch": 0.4601905932984937, + "grad_norm": 0.30000215768814087, + "learning_rate": 4.746339153808897e-06, + "loss": 0.6364, + "step": 1497 + }, + { + "epoch": 0.4604980018444513, + "grad_norm": 0.2909153997898102, + "learning_rate": 4.745982153683805e-06, + "loss": 0.6269, + "step": 1498 + }, + { + "epoch": 0.46080541039040884, + "grad_norm": 0.3017018437385559, + "learning_rate": 4.7456249159613e-06, + "loss": 0.6435, + "step": 1499 + }, + { + "epoch": 0.4611128189363664, + "grad_norm": 0.31200769543647766, + "learning_rate": 4.745267440679171e-06, + "loss": 0.623, + "step": 1500 + }, + { + "epoch": 0.46142022748232403, + "grad_norm": 0.28843122720718384, + "learning_rate": 4.744909727875235e-06, + "loss": 0.6296, + "step": 1501 + }, + { + "epoch": 0.4617276360282816, + "grad_norm": 0.2891080379486084, + "learning_rate": 4.744551777587335e-06, + "loss": 0.6405, + "step": 1502 + }, + { + "epoch": 0.46203504457423916, + "grad_norm": 0.3228342533111572, + "learning_rate": 4.744193589853336e-06, + "loss": 0.6324, + "step": 1503 + }, + { + "epoch": 0.46234245312019673, + "grad_norm": 0.27922263741493225, + "learning_rate": 4.743835164711131e-06, + "loss": 0.6373, + "step": 1504 + }, + { + "epoch": 0.4626498616661543, + "grad_norm": 0.3168621361255646, + "learning_rate": 4.743476502198636e-06, + "loss": 0.6643, + "step": 1505 + }, + { + "epoch": 0.4629572702121119, + "grad_norm": 0.31680944561958313, + "learning_rate": 4.743117602353794e-06, + "loss": 0.613, + "step": 1506 + }, + { + "epoch": 0.4632646787580695, + "grad_norm": 0.2896377742290497, + "learning_rate": 4.742758465214572e-06, + "loss": 0.6516, + "step": 1507 + }, + { + "epoch": 0.46357208730402705, + "grad_norm": 0.3046092391014099, + "learning_rate": 4.742399090818962e-06, + "loss": 0.6406, + "step": 1508 + }, + { + "epoch": 0.4638794958499846, + "grad_norm": 0.3204990029335022, + "learning_rate": 4.742039479204981e-06, + "loss": 0.6397, + "step": 1509 + }, + { + "epoch": 0.46418690439594223, + "grad_norm": 0.2928156852722168, + "learning_rate": 4.741679630410674e-06, + "loss": 0.6588, + "step": 1510 + }, + { + "epoch": 0.4644943129418998, + "grad_norm": 0.2949657142162323, + "learning_rate": 4.741319544474105e-06, + "loss": 0.6616, + "step": 1511 + }, + { + "epoch": 0.46480172148785737, + "grad_norm": 0.3214471638202667, + "learning_rate": 4.7409592214333685e-06, + "loss": 0.6276, + "step": 1512 + }, + { + "epoch": 0.46510913003381493, + "grad_norm": 0.3054629862308502, + "learning_rate": 4.740598661326583e-06, + "loss": 0.6644, + "step": 1513 + }, + { + "epoch": 0.4654165385797725, + "grad_norm": 0.287351131439209, + "learning_rate": 4.74023786419189e-06, + "loss": 0.6516, + "step": 1514 + }, + { + "epoch": 0.4657239471257301, + "grad_norm": 0.30987346172332764, + "learning_rate": 4.739876830067458e-06, + "loss": 0.6398, + "step": 1515 + }, + { + "epoch": 0.4660313556716877, + "grad_norm": 0.2868606448173523, + "learning_rate": 4.73951555899148e-06, + "loss": 0.614, + "step": 1516 + }, + { + "epoch": 0.46633876421764525, + "grad_norm": 0.2902650833129883, + "learning_rate": 4.739154051002175e-06, + "loss": 0.6449, + "step": 1517 + }, + { + "epoch": 0.4666461727636028, + "grad_norm": 0.308560311794281, + "learning_rate": 4.7387923061377836e-06, + "loss": 0.6537, + "step": 1518 + }, + { + "epoch": 0.4669535813095604, + "grad_norm": 0.2907334566116333, + "learning_rate": 4.738430324436576e-06, + "loss": 0.6554, + "step": 1519 + }, + { + "epoch": 0.467260989855518, + "grad_norm": 0.29763495922088623, + "learning_rate": 4.738068105936845e-06, + "loss": 0.6299, + "step": 1520 + }, + { + "epoch": 0.46756839840147557, + "grad_norm": 0.3021467924118042, + "learning_rate": 4.737705650676909e-06, + "loss": 0.6427, + "step": 1521 + }, + { + "epoch": 0.46787580694743314, + "grad_norm": 0.2907562851905823, + "learning_rate": 4.7373429586951105e-06, + "loss": 0.6424, + "step": 1522 + }, + { + "epoch": 0.4681832154933907, + "grad_norm": 0.3046243190765381, + "learning_rate": 4.736980030029819e-06, + "loss": 0.6579, + "step": 1523 + }, + { + "epoch": 0.46849062403934827, + "grad_norm": 0.2929055392742157, + "learning_rate": 4.736616864719427e-06, + "loss": 0.663, + "step": 1524 + }, + { + "epoch": 0.4687980325853059, + "grad_norm": 0.2915979027748108, + "learning_rate": 4.736253462802354e-06, + "loss": 0.6311, + "step": 1525 + }, + { + "epoch": 0.46910544113126346, + "grad_norm": 0.28508633375167847, + "learning_rate": 4.735889824317043e-06, + "loss": 0.6445, + "step": 1526 + }, + { + "epoch": 0.469412849677221, + "grad_norm": 0.2854878604412079, + "learning_rate": 4.735525949301962e-06, + "loss": 0.6525, + "step": 1527 + }, + { + "epoch": 0.4697202582231786, + "grad_norm": 0.2854996919631958, + "learning_rate": 4.735161837795604e-06, + "loss": 0.6304, + "step": 1528 + }, + { + "epoch": 0.47002766676913615, + "grad_norm": 0.29767417907714844, + "learning_rate": 4.7347974898364896e-06, + "loss": 0.6574, + "step": 1529 + }, + { + "epoch": 0.4703350753150938, + "grad_norm": 0.2925272583961487, + "learning_rate": 4.73443290546316e-06, + "loss": 0.6436, + "step": 1530 + }, + { + "epoch": 0.47064248386105134, + "grad_norm": 0.2814065217971802, + "learning_rate": 4.734068084714186e-06, + "loss": 0.6333, + "step": 1531 + }, + { + "epoch": 0.4709498924070089, + "grad_norm": 0.29923903942108154, + "learning_rate": 4.73370302762816e-06, + "loss": 0.6367, + "step": 1532 + }, + { + "epoch": 0.4712573009529665, + "grad_norm": 0.293012797832489, + "learning_rate": 4.733337734243701e-06, + "loss": 0.6422, + "step": 1533 + }, + { + "epoch": 0.4715647094989241, + "grad_norm": 0.30055898427963257, + "learning_rate": 4.732972204599451e-06, + "loss": 0.6664, + "step": 1534 + }, + { + "epoch": 0.47187211804488166, + "grad_norm": 0.2920820713043213, + "learning_rate": 4.7326064387340795e-06, + "loss": 0.6561, + "step": 1535 + }, + { + "epoch": 0.4721795265908392, + "grad_norm": 0.29254621267318726, + "learning_rate": 4.732240436686282e-06, + "loss": 0.6462, + "step": 1536 + }, + { + "epoch": 0.4724869351367968, + "grad_norm": 0.30884718894958496, + "learning_rate": 4.731874198494773e-06, + "loss": 0.6471, + "step": 1537 + }, + { + "epoch": 0.47279434368275436, + "grad_norm": 0.2999608516693115, + "learning_rate": 4.7315077241983006e-06, + "loss": 0.6426, + "step": 1538 + }, + { + "epoch": 0.473101752228712, + "grad_norm": 0.2891947627067566, + "learning_rate": 4.731141013835631e-06, + "loss": 0.6363, + "step": 1539 + }, + { + "epoch": 0.47340916077466955, + "grad_norm": 0.29156970977783203, + "learning_rate": 4.730774067445557e-06, + "loss": 0.6459, + "step": 1540 + }, + { + "epoch": 0.4737165693206271, + "grad_norm": 0.28957098722457886, + "learning_rate": 4.730406885066897e-06, + "loss": 0.6536, + "step": 1541 + }, + { + "epoch": 0.4740239778665847, + "grad_norm": 0.3034295439720154, + "learning_rate": 4.730039466738496e-06, + "loss": 0.6335, + "step": 1542 + }, + { + "epoch": 0.47433138641254224, + "grad_norm": 0.31189337372779846, + "learning_rate": 4.729671812499222e-06, + "loss": 0.6529, + "step": 1543 + }, + { + "epoch": 0.47463879495849987, + "grad_norm": 0.2973169982433319, + "learning_rate": 4.7293039223879676e-06, + "loss": 0.663, + "step": 1544 + }, + { + "epoch": 0.47494620350445743, + "grad_norm": 0.31514599919319153, + "learning_rate": 4.728935796443651e-06, + "loss": 0.6416, + "step": 1545 + }, + { + "epoch": 0.475253612050415, + "grad_norm": 0.31050825119018555, + "learning_rate": 4.728567434705216e-06, + "loss": 0.6476, + "step": 1546 + }, + { + "epoch": 0.47556102059637256, + "grad_norm": 0.296159952878952, + "learning_rate": 4.72819883721163e-06, + "loss": 0.6533, + "step": 1547 + }, + { + "epoch": 0.47586842914233013, + "grad_norm": 0.285687118768692, + "learning_rate": 4.727830004001887e-06, + "loss": 0.6289, + "step": 1548 + }, + { + "epoch": 0.47617583768828775, + "grad_norm": 0.27613309025764465, + "learning_rate": 4.7274609351150045e-06, + "loss": 0.6419, + "step": 1549 + }, + { + "epoch": 0.4764832462342453, + "grad_norm": 0.3030616343021393, + "learning_rate": 4.727091630590026e-06, + "loss": 0.6369, + "step": 1550 + }, + { + "epoch": 0.4767906547802029, + "grad_norm": 0.2965136468410492, + "learning_rate": 4.726722090466019e-06, + "loss": 0.688, + "step": 1551 + }, + { + "epoch": 0.47709806332616045, + "grad_norm": 0.29543739557266235, + "learning_rate": 4.7263523147820755e-06, + "loss": 0.6465, + "step": 1552 + }, + { + "epoch": 0.47740547187211807, + "grad_norm": 0.2901233434677124, + "learning_rate": 4.725982303577315e-06, + "loss": 0.642, + "step": 1553 + }, + { + "epoch": 0.47771288041807564, + "grad_norm": 0.29610395431518555, + "learning_rate": 4.7256120568908786e-06, + "loss": 0.6604, + "step": 1554 + }, + { + "epoch": 0.4780202889640332, + "grad_norm": 0.2889134883880615, + "learning_rate": 4.725241574761935e-06, + "loss": 0.659, + "step": 1555 + }, + { + "epoch": 0.47832769750999077, + "grad_norm": 0.2860468327999115, + "learning_rate": 4.724870857229676e-06, + "loss": 0.6455, + "step": 1556 + }, + { + "epoch": 0.47863510605594833, + "grad_norm": 0.2929280996322632, + "learning_rate": 4.724499904333319e-06, + "loss": 0.6525, + "step": 1557 + }, + { + "epoch": 0.47894251460190596, + "grad_norm": 0.27316340804100037, + "learning_rate": 4.724128716112106e-06, + "loss": 0.5949, + "step": 1558 + }, + { + "epoch": 0.4792499231478635, + "grad_norm": 0.29215648770332336, + "learning_rate": 4.723757292605305e-06, + "loss": 0.6491, + "step": 1559 + }, + { + "epoch": 0.4795573316938211, + "grad_norm": 0.28030458092689514, + "learning_rate": 4.7233856338522086e-06, + "loss": 0.6536, + "step": 1560 + }, + { + "epoch": 0.47986474023977865, + "grad_norm": 0.29562002420425415, + "learning_rate": 4.723013739892132e-06, + "loss": 0.6408, + "step": 1561 + }, + { + "epoch": 0.4801721487857362, + "grad_norm": 0.2907724380493164, + "learning_rate": 4.722641610764418e-06, + "loss": 0.6464, + "step": 1562 + }, + { + "epoch": 0.48047955733169384, + "grad_norm": 0.29368242621421814, + "learning_rate": 4.7222692465084325e-06, + "loss": 0.6329, + "step": 1563 + }, + { + "epoch": 0.4807869658776514, + "grad_norm": 0.28430888056755066, + "learning_rate": 4.721896647163568e-06, + "loss": 0.6192, + "step": 1564 + }, + { + "epoch": 0.481094374423609, + "grad_norm": 0.2872581481933594, + "learning_rate": 4.721523812769241e-06, + "loss": 0.6239, + "step": 1565 + }, + { + "epoch": 0.48140178296956654, + "grad_norm": 0.2975291311740875, + "learning_rate": 4.721150743364892e-06, + "loss": 0.6567, + "step": 1566 + }, + { + "epoch": 0.4817091915155241, + "grad_norm": 0.2997916638851166, + "learning_rate": 4.720777438989988e-06, + "loss": 0.643, + "step": 1567 + }, + { + "epoch": 0.4820166000614817, + "grad_norm": 0.2913118600845337, + "learning_rate": 4.720403899684019e-06, + "loss": 0.6438, + "step": 1568 + }, + { + "epoch": 0.4823240086074393, + "grad_norm": 0.311947762966156, + "learning_rate": 4.720030125486503e-06, + "loss": 0.6558, + "step": 1569 + }, + { + "epoch": 0.48263141715339686, + "grad_norm": 0.2992820143699646, + "learning_rate": 4.719656116436978e-06, + "loss": 0.6513, + "step": 1570 + }, + { + "epoch": 0.4829388256993544, + "grad_norm": 0.29688578844070435, + "learning_rate": 4.719281872575011e-06, + "loss": 0.6385, + "step": 1571 + }, + { + "epoch": 0.48324623424531205, + "grad_norm": 0.283503919839859, + "learning_rate": 4.718907393940193e-06, + "loss": 0.627, + "step": 1572 + }, + { + "epoch": 0.4835536427912696, + "grad_norm": 0.3121541142463684, + "learning_rate": 4.718532680572138e-06, + "loss": 0.6418, + "step": 1573 + }, + { + "epoch": 0.4838610513372272, + "grad_norm": 0.314458429813385, + "learning_rate": 4.718157732510487e-06, + "loss": 0.656, + "step": 1574 + }, + { + "epoch": 0.48416845988318474, + "grad_norm": 0.3058575391769409, + "learning_rate": 4.717782549794904e-06, + "loss": 0.6287, + "step": 1575 + }, + { + "epoch": 0.4844758684291423, + "grad_norm": 0.30992016196250916, + "learning_rate": 4.71740713246508e-06, + "loss": 0.6563, + "step": 1576 + }, + { + "epoch": 0.48478327697509993, + "grad_norm": 0.3015616834163666, + "learning_rate": 4.717031480560729e-06, + "loss": 0.6358, + "step": 1577 + }, + { + "epoch": 0.4850906855210575, + "grad_norm": 0.2994561493396759, + "learning_rate": 4.716655594121589e-06, + "loss": 0.6375, + "step": 1578 + }, + { + "epoch": 0.48539809406701506, + "grad_norm": 0.3010402023792267, + "learning_rate": 4.716279473187426e-06, + "loss": 0.632, + "step": 1579 + }, + { + "epoch": 0.48570550261297263, + "grad_norm": 0.29999062418937683, + "learning_rate": 4.715903117798029e-06, + "loss": 0.6384, + "step": 1580 + }, + { + "epoch": 0.4860129111589302, + "grad_norm": 0.29340818524360657, + "learning_rate": 4.715526527993211e-06, + "loss": 0.6457, + "step": 1581 + }, + { + "epoch": 0.4863203197048878, + "grad_norm": 0.2889280319213867, + "learning_rate": 4.715149703812811e-06, + "loss": 0.6363, + "step": 1582 + }, + { + "epoch": 0.4866277282508454, + "grad_norm": 0.3016546070575714, + "learning_rate": 4.714772645296691e-06, + "loss": 0.6387, + "step": 1583 + }, + { + "epoch": 0.48693513679680295, + "grad_norm": 0.28305673599243164, + "learning_rate": 4.714395352484741e-06, + "loss": 0.6576, + "step": 1584 + }, + { + "epoch": 0.4872425453427605, + "grad_norm": 0.2918756306171417, + "learning_rate": 4.714017825416873e-06, + "loss": 0.6538, + "step": 1585 + }, + { + "epoch": 0.4875499538887181, + "grad_norm": 0.29923731088638306, + "learning_rate": 4.7136400641330245e-06, + "loss": 0.6576, + "step": 1586 + }, + { + "epoch": 0.4878573624346757, + "grad_norm": 0.2859906256198883, + "learning_rate": 4.71326206867316e-06, + "loss": 0.6169, + "step": 1587 + }, + { + "epoch": 0.48816477098063327, + "grad_norm": 0.2951626479625702, + "learning_rate": 4.712883839077264e-06, + "loss": 0.6634, + "step": 1588 + }, + { + "epoch": 0.48847217952659083, + "grad_norm": 0.2935561239719391, + "learning_rate": 4.712505375385351e-06, + "loss": 0.6533, + "step": 1589 + }, + { + "epoch": 0.4887795880725484, + "grad_norm": 0.2939285337924957, + "learning_rate": 4.712126677637456e-06, + "loss": 0.6373, + "step": 1590 + }, + { + "epoch": 0.489086996618506, + "grad_norm": 0.2894609272480011, + "learning_rate": 4.7117477458736425e-06, + "loss": 0.6312, + "step": 1591 + }, + { + "epoch": 0.4893944051644636, + "grad_norm": 0.29936760663986206, + "learning_rate": 4.711368580133995e-06, + "loss": 0.6432, + "step": 1592 + }, + { + "epoch": 0.48970181371042115, + "grad_norm": 0.29750484228134155, + "learning_rate": 4.710989180458625e-06, + "loss": 0.6515, + "step": 1593 + }, + { + "epoch": 0.4900092222563787, + "grad_norm": 0.30923351645469666, + "learning_rate": 4.7106095468876685e-06, + "loss": 0.6393, + "step": 1594 + }, + { + "epoch": 0.4903166308023363, + "grad_norm": 0.30309706926345825, + "learning_rate": 4.7102296794612865e-06, + "loss": 0.6376, + "step": 1595 + }, + { + "epoch": 0.4906240393482939, + "grad_norm": 0.308635413646698, + "learning_rate": 4.709849578219664e-06, + "loss": 0.6437, + "step": 1596 + }, + { + "epoch": 0.49093144789425147, + "grad_norm": 0.29729804396629333, + "learning_rate": 4.709469243203011e-06, + "loss": 0.645, + "step": 1597 + }, + { + "epoch": 0.49123885644020904, + "grad_norm": 0.28293007612228394, + "learning_rate": 4.7090886744515615e-06, + "loss": 0.6488, + "step": 1598 + }, + { + "epoch": 0.4915462649861666, + "grad_norm": 0.2996366620063782, + "learning_rate": 4.708707872005577e-06, + "loss": 0.6569, + "step": 1599 + }, + { + "epoch": 0.49185367353212417, + "grad_norm": 0.2993902862071991, + "learning_rate": 4.7083268359053405e-06, + "loss": 0.6294, + "step": 1600 + }, + { + "epoch": 0.4921610820780818, + "grad_norm": 0.3039522171020508, + "learning_rate": 4.707945566191161e-06, + "loss": 0.6542, + "step": 1601 + }, + { + "epoch": 0.49246849062403936, + "grad_norm": 0.29706791043281555, + "learning_rate": 4.7075640629033716e-06, + "loss": 0.6501, + "step": 1602 + }, + { + "epoch": 0.4927758991699969, + "grad_norm": 0.29550302028656006, + "learning_rate": 4.707182326082332e-06, + "loss": 0.6242, + "step": 1603 + }, + { + "epoch": 0.4930833077159545, + "grad_norm": 0.2890315651893616, + "learning_rate": 4.706800355768424e-06, + "loss": 0.6445, + "step": 1604 + }, + { + "epoch": 0.49339071626191205, + "grad_norm": 0.2965807616710663, + "learning_rate": 4.706418152002056e-06, + "loss": 0.6613, + "step": 1605 + }, + { + "epoch": 0.4936981248078697, + "grad_norm": 0.2880922257900238, + "learning_rate": 4.706035714823662e-06, + "loss": 0.6278, + "step": 1606 + }, + { + "epoch": 0.49400553335382724, + "grad_norm": 0.2843095362186432, + "learning_rate": 4.705653044273695e-06, + "loss": 0.624, + "step": 1607 + }, + { + "epoch": 0.4943129418997848, + "grad_norm": 0.2863067090511322, + "learning_rate": 4.705270140392641e-06, + "loss": 0.6467, + "step": 1608 + }, + { + "epoch": 0.4946203504457424, + "grad_norm": 0.29626643657684326, + "learning_rate": 4.7048870032210045e-06, + "loss": 0.6422, + "step": 1609 + }, + { + "epoch": 0.4949277589917, + "grad_norm": 0.29024648666381836, + "learning_rate": 4.704503632799318e-06, + "loss": 0.6281, + "step": 1610 + }, + { + "epoch": 0.49523516753765756, + "grad_norm": 0.29172930121421814, + "learning_rate": 4.704120029168136e-06, + "loss": 0.6502, + "step": 1611 + }, + { + "epoch": 0.4955425760836151, + "grad_norm": 0.28816771507263184, + "learning_rate": 4.7037361923680404e-06, + "loss": 0.6115, + "step": 1612 + }, + { + "epoch": 0.4958499846295727, + "grad_norm": 0.31117433309555054, + "learning_rate": 4.703352122439635e-06, + "loss": 0.64, + "step": 1613 + }, + { + "epoch": 0.49615739317553026, + "grad_norm": 0.29790377616882324, + "learning_rate": 4.70296781942355e-06, + "loss": 0.652, + "step": 1614 + }, + { + "epoch": 0.4964648017214879, + "grad_norm": 0.290182501077652, + "learning_rate": 4.7025832833604414e-06, + "loss": 0.6224, + "step": 1615 + }, + { + "epoch": 0.49677221026744545, + "grad_norm": 0.29069972038269043, + "learning_rate": 4.702198514290988e-06, + "loss": 0.6443, + "step": 1616 + }, + { + "epoch": 0.497079618813403, + "grad_norm": 0.3012597858905792, + "learning_rate": 4.7018135122558915e-06, + "loss": 0.6394, + "step": 1617 + }, + { + "epoch": 0.4973870273593606, + "grad_norm": 0.30400386452674866, + "learning_rate": 4.701428277295882e-06, + "loss": 0.6467, + "step": 1618 + }, + { + "epoch": 0.49769443590531814, + "grad_norm": 0.2897169291973114, + "learning_rate": 4.7010428094517135e-06, + "loss": 0.6108, + "step": 1619 + }, + { + "epoch": 0.49800184445127577, + "grad_norm": 0.30824363231658936, + "learning_rate": 4.700657108764163e-06, + "loss": 0.6481, + "step": 1620 + }, + { + "epoch": 0.49830925299723333, + "grad_norm": 0.2922405004501343, + "learning_rate": 4.700271175274032e-06, + "loss": 0.6495, + "step": 1621 + }, + { + "epoch": 0.4986166615431909, + "grad_norm": 0.3004756271839142, + "learning_rate": 4.699885009022148e-06, + "loss": 0.6394, + "step": 1622 + }, + { + "epoch": 0.49892407008914846, + "grad_norm": 0.29616275429725647, + "learning_rate": 4.6994986100493646e-06, + "loss": 0.6248, + "step": 1623 + }, + { + "epoch": 0.49923147863510603, + "grad_norm": 0.29526853561401367, + "learning_rate": 4.699111978396555e-06, + "loss": 0.6392, + "step": 1624 + }, + { + "epoch": 0.49953888718106365, + "grad_norm": 0.2940318286418915, + "learning_rate": 4.698725114104622e-06, + "loss": 0.6348, + "step": 1625 + }, + { + "epoch": 0.4998462957270212, + "grad_norm": 0.31017637252807617, + "learning_rate": 4.698338017214491e-06, + "loss": 0.6274, + "step": 1626 + }, + { + "epoch": 0.5001537042729788, + "grad_norm": 0.3258587419986725, + "learning_rate": 4.697950687767111e-06, + "loss": 0.6512, + "step": 1627 + }, + { + "epoch": 0.5004611128189363, + "grad_norm": 0.29995211958885193, + "learning_rate": 4.697563125803458e-06, + "loss": 0.6548, + "step": 1628 + }, + { + "epoch": 0.5007685213648939, + "grad_norm": 0.31565529108047485, + "learning_rate": 4.697175331364531e-06, + "loss": 0.6492, + "step": 1629 + }, + { + "epoch": 0.5010759299108515, + "grad_norm": 0.3331223130226135, + "learning_rate": 4.696787304491354e-06, + "loss": 0.6329, + "step": 1630 + }, + { + "epoch": 0.501383338456809, + "grad_norm": 0.28796327114105225, + "learning_rate": 4.696399045224974e-06, + "loss": 0.6552, + "step": 1631 + }, + { + "epoch": 0.5016907470027667, + "grad_norm": 0.29295915365219116, + "learning_rate": 4.696010553606466e-06, + "loss": 0.6235, + "step": 1632 + }, + { + "epoch": 0.5019981555487243, + "grad_norm": 0.32168880105018616, + "learning_rate": 4.695621829676927e-06, + "loss": 0.6443, + "step": 1633 + }, + { + "epoch": 0.5023055640946819, + "grad_norm": 0.30393096804618835, + "learning_rate": 4.69523287347748e-06, + "loss": 0.6632, + "step": 1634 + }, + { + "epoch": 0.5026129726406394, + "grad_norm": 0.294190913438797, + "learning_rate": 4.69484368504927e-06, + "loss": 0.6676, + "step": 1635 + }, + { + "epoch": 0.502920381186597, + "grad_norm": 0.2821173369884491, + "learning_rate": 4.694454264433469e-06, + "loss": 0.667, + "step": 1636 + }, + { + "epoch": 0.5032277897325546, + "grad_norm": 0.3420603275299072, + "learning_rate": 4.694064611671274e-06, + "loss": 0.6462, + "step": 1637 + }, + { + "epoch": 0.5035351982785121, + "grad_norm": 0.2899837791919708, + "learning_rate": 4.693674726803904e-06, + "loss": 0.6389, + "step": 1638 + }, + { + "epoch": 0.5038426068244697, + "grad_norm": 0.28935351967811584, + "learning_rate": 4.693284609872605e-06, + "loss": 0.656, + "step": 1639 + }, + { + "epoch": 0.5041500153704273, + "grad_norm": 0.3264833688735962, + "learning_rate": 4.6928942609186465e-06, + "loss": 0.641, + "step": 1640 + }, + { + "epoch": 0.5044574239163849, + "grad_norm": 0.3110412657260895, + "learning_rate": 4.692503679983324e-06, + "loss": 0.6467, + "step": 1641 + }, + { + "epoch": 0.5047648324623425, + "grad_norm": 0.3076092302799225, + "learning_rate": 4.692112867107952e-06, + "loss": 0.6294, + "step": 1642 + }, + { + "epoch": 0.5050722410083001, + "grad_norm": 0.2945149838924408, + "learning_rate": 4.6917218223338784e-06, + "loss": 0.6487, + "step": 1643 + }, + { + "epoch": 0.5053796495542576, + "grad_norm": 0.2994938790798187, + "learning_rate": 4.691330545702469e-06, + "loss": 0.6571, + "step": 1644 + }, + { + "epoch": 0.5056870581002152, + "grad_norm": 0.3348276615142822, + "learning_rate": 4.690939037255115e-06, + "loss": 0.6374, + "step": 1645 + }, + { + "epoch": 0.5059944666461728, + "grad_norm": 0.29244545102119446, + "learning_rate": 4.690547297033235e-06, + "loss": 0.6517, + "step": 1646 + }, + { + "epoch": 0.5063018751921303, + "grad_norm": 0.2997102439403534, + "learning_rate": 4.690155325078269e-06, + "loss": 0.649, + "step": 1647 + }, + { + "epoch": 0.5066092837380879, + "grad_norm": 0.3170575797557831, + "learning_rate": 4.689763121431684e-06, + "loss": 0.6579, + "step": 1648 + }, + { + "epoch": 0.5069166922840455, + "grad_norm": 0.33217161893844604, + "learning_rate": 4.68937068613497e-06, + "loss": 0.6314, + "step": 1649 + }, + { + "epoch": 0.507224100830003, + "grad_norm": 0.3040415942668915, + "learning_rate": 4.688978019229641e-06, + "loss": 0.6463, + "step": 1650 + }, + { + "epoch": 0.5075315093759607, + "grad_norm": 0.33137884736061096, + "learning_rate": 4.688585120757238e-06, + "loss": 0.6376, + "step": 1651 + }, + { + "epoch": 0.5078389179219183, + "grad_norm": 0.30779799818992615, + "learning_rate": 4.688191990759322e-06, + "loss": 0.6177, + "step": 1652 + }, + { + "epoch": 0.5081463264678758, + "grad_norm": 0.30758827924728394, + "learning_rate": 4.687798629277485e-06, + "loss": 0.6155, + "step": 1653 + }, + { + "epoch": 0.5084537350138334, + "grad_norm": 0.3154120445251465, + "learning_rate": 4.687405036353337e-06, + "loss": 0.6649, + "step": 1654 + }, + { + "epoch": 0.508761143559791, + "grad_norm": 0.33186909556388855, + "learning_rate": 4.687011212028516e-06, + "loss": 0.6185, + "step": 1655 + }, + { + "epoch": 0.5090685521057485, + "grad_norm": 0.29728108644485474, + "learning_rate": 4.686617156344685e-06, + "loss": 0.634, + "step": 1656 + }, + { + "epoch": 0.5093759606517061, + "grad_norm": 0.3050207495689392, + "learning_rate": 4.686222869343529e-06, + "loss": 0.6452, + "step": 1657 + }, + { + "epoch": 0.5096833691976637, + "grad_norm": 0.3051831126213074, + "learning_rate": 4.685828351066759e-06, + "loss": 0.6536, + "step": 1658 + }, + { + "epoch": 0.5099907777436212, + "grad_norm": 0.3139176666736603, + "learning_rate": 4.6854336015561105e-06, + "loss": 0.6356, + "step": 1659 + }, + { + "epoch": 0.5102981862895789, + "grad_norm": 0.2929617166519165, + "learning_rate": 4.6850386208533424e-06, + "loss": 0.6055, + "step": 1660 + }, + { + "epoch": 0.5106055948355365, + "grad_norm": 0.27839627861976624, + "learning_rate": 4.68464340900024e-06, + "loss": 0.6277, + "step": 1661 + }, + { + "epoch": 0.510913003381494, + "grad_norm": 0.29698723554611206, + "learning_rate": 4.684247966038611e-06, + "loss": 0.6357, + "step": 1662 + }, + { + "epoch": 0.5112204119274516, + "grad_norm": 0.294109046459198, + "learning_rate": 4.683852292010289e-06, + "loss": 0.6456, + "step": 1663 + }, + { + "epoch": 0.5115278204734092, + "grad_norm": 0.3006846010684967, + "learning_rate": 4.683456386957131e-06, + "loss": 0.6458, + "step": 1664 + }, + { + "epoch": 0.5118352290193667, + "grad_norm": 0.2928318977355957, + "learning_rate": 4.6830602509210184e-06, + "loss": 0.6103, + "step": 1665 + }, + { + "epoch": 0.5121426375653243, + "grad_norm": 0.2863270342350006, + "learning_rate": 4.6826638839438594e-06, + "loss": 0.6453, + "step": 1666 + }, + { + "epoch": 0.5124500461112819, + "grad_norm": 0.29372072219848633, + "learning_rate": 4.682267286067583e-06, + "loss": 0.6517, + "step": 1667 + }, + { + "epoch": 0.5127574546572394, + "grad_norm": 0.3029353618621826, + "learning_rate": 4.681870457334145e-06, + "loss": 0.6589, + "step": 1668 + }, + { + "epoch": 0.513064863203197, + "grad_norm": 0.2965511083602905, + "learning_rate": 4.681473397785526e-06, + "loss": 0.622, + "step": 1669 + }, + { + "epoch": 0.5133722717491547, + "grad_norm": 0.2846650779247284, + "learning_rate": 4.6810761074637275e-06, + "loss": 0.6195, + "step": 1670 + }, + { + "epoch": 0.5136796802951122, + "grad_norm": 0.32080894708633423, + "learning_rate": 4.68067858641078e-06, + "loss": 0.6289, + "step": 1671 + }, + { + "epoch": 0.5139870888410698, + "grad_norm": 0.2967267334461212, + "learning_rate": 4.6802808346687355e-06, + "loss": 0.634, + "step": 1672 + }, + { + "epoch": 0.5142944973870274, + "grad_norm": 0.29958221316337585, + "learning_rate": 4.679882852279672e-06, + "loss": 0.6237, + "step": 1673 + }, + { + "epoch": 0.5146019059329849, + "grad_norm": 0.2862364947795868, + "learning_rate": 4.67948463928569e-06, + "loss": 0.6547, + "step": 1674 + }, + { + "epoch": 0.5149093144789425, + "grad_norm": 0.30968037247657776, + "learning_rate": 4.679086195728918e-06, + "loss": 0.6371, + "step": 1675 + }, + { + "epoch": 0.5152167230249001, + "grad_norm": 0.29742076992988586, + "learning_rate": 4.678687521651504e-06, + "loss": 0.6346, + "step": 1676 + }, + { + "epoch": 0.5155241315708576, + "grad_norm": 0.29233691096305847, + "learning_rate": 4.6782886170956235e-06, + "loss": 0.6311, + "step": 1677 + }, + { + "epoch": 0.5158315401168152, + "grad_norm": 0.3018572926521301, + "learning_rate": 4.6778894821034756e-06, + "loss": 0.6492, + "step": 1678 + }, + { + "epoch": 0.5161389486627729, + "grad_norm": 0.30581530928611755, + "learning_rate": 4.677490116717285e-06, + "loss": 0.6192, + "step": 1679 + }, + { + "epoch": 0.5164463572087304, + "grad_norm": 0.300190269947052, + "learning_rate": 4.677090520979298e-06, + "loss": 0.6522, + "step": 1680 + }, + { + "epoch": 0.516753765754688, + "grad_norm": 0.2934325933456421, + "learning_rate": 4.676690694931788e-06, + "loss": 0.6457, + "step": 1681 + }, + { + "epoch": 0.5170611743006456, + "grad_norm": 0.2907101511955261, + "learning_rate": 4.676290638617051e-06, + "loss": 0.6292, + "step": 1682 + }, + { + "epoch": 0.5173685828466031, + "grad_norm": 0.3125830292701721, + "learning_rate": 4.6758903520774105e-06, + "loss": 0.6232, + "step": 1683 + }, + { + "epoch": 0.5176759913925607, + "grad_norm": 0.28631365299224854, + "learning_rate": 4.675489835355208e-06, + "loss": 0.6574, + "step": 1684 + }, + { + "epoch": 0.5179833999385183, + "grad_norm": 0.2968284785747528, + "learning_rate": 4.675089088492815e-06, + "loss": 0.6083, + "step": 1685 + }, + { + "epoch": 0.5182908084844758, + "grad_norm": 0.2977120876312256, + "learning_rate": 4.674688111532627e-06, + "loss": 0.6275, + "step": 1686 + }, + { + "epoch": 0.5185982170304334, + "grad_norm": 0.3009866178035736, + "learning_rate": 4.6742869045170605e-06, + "loss": 0.6477, + "step": 1687 + }, + { + "epoch": 0.518905625576391, + "grad_norm": 0.3011506199836731, + "learning_rate": 4.6738854674885595e-06, + "loss": 0.6304, + "step": 1688 + }, + { + "epoch": 0.5192130341223486, + "grad_norm": 0.30018067359924316, + "learning_rate": 4.67348380048959e-06, + "loss": 0.6426, + "step": 1689 + }, + { + "epoch": 0.5195204426683062, + "grad_norm": 0.2850914001464844, + "learning_rate": 4.6730819035626445e-06, + "loss": 0.6449, + "step": 1690 + }, + { + "epoch": 0.5198278512142638, + "grad_norm": 0.2975022792816162, + "learning_rate": 4.672679776750239e-06, + "loss": 0.6238, + "step": 1691 + }, + { + "epoch": 0.5201352597602213, + "grad_norm": 0.3046853840351105, + "learning_rate": 4.672277420094912e-06, + "loss": 0.6405, + "step": 1692 + }, + { + "epoch": 0.5204426683061789, + "grad_norm": 0.292764812707901, + "learning_rate": 4.671874833639229e-06, + "loss": 0.6637, + "step": 1693 + }, + { + "epoch": 0.5207500768521365, + "grad_norm": 0.28226742148399353, + "learning_rate": 4.6714720174257794e-06, + "loss": 0.6307, + "step": 1694 + }, + { + "epoch": 0.521057485398094, + "grad_norm": 0.27410653233528137, + "learning_rate": 4.671068971497175e-06, + "loss": 0.6177, + "step": 1695 + }, + { + "epoch": 0.5213648939440516, + "grad_norm": 0.3028128147125244, + "learning_rate": 4.670665695896053e-06, + "loss": 0.6479, + "step": 1696 + }, + { + "epoch": 0.5216723024900092, + "grad_norm": 0.29009151458740234, + "learning_rate": 4.670262190665077e-06, + "loss": 0.6233, + "step": 1697 + }, + { + "epoch": 0.5219797110359669, + "grad_norm": 0.30761879682540894, + "learning_rate": 4.66985845584693e-06, + "loss": 0.6659, + "step": 1698 + }, + { + "epoch": 0.5222871195819244, + "grad_norm": 0.2853202521800995, + "learning_rate": 4.669454491484324e-06, + "loss": 0.6176, + "step": 1699 + }, + { + "epoch": 0.522594528127882, + "grad_norm": 0.303395539522171, + "learning_rate": 4.6690502976199946e-06, + "loss": 0.6343, + "step": 1700 + }, + { + "epoch": 0.5229019366738396, + "grad_norm": 0.2955869138240814, + "learning_rate": 4.668645874296698e-06, + "loss": 0.6422, + "step": 1701 + }, + { + "epoch": 0.5232093452197971, + "grad_norm": 0.28882211446762085, + "learning_rate": 4.668241221557218e-06, + "loss": 0.6382, + "step": 1702 + }, + { + "epoch": 0.5235167537657547, + "grad_norm": 0.3092248737812042, + "learning_rate": 4.667836339444363e-06, + "loss": 0.6403, + "step": 1703 + }, + { + "epoch": 0.5238241623117122, + "grad_norm": 0.2997128665447235, + "learning_rate": 4.667431228000964e-06, + "loss": 0.638, + "step": 1704 + }, + { + "epoch": 0.5241315708576698, + "grad_norm": 0.29779043793678284, + "learning_rate": 4.667025887269876e-06, + "loss": 0.6405, + "step": 1705 + }, + { + "epoch": 0.5244389794036274, + "grad_norm": 0.2977481484413147, + "learning_rate": 4.666620317293979e-06, + "loss": 0.6594, + "step": 1706 + }, + { + "epoch": 0.524746387949585, + "grad_norm": 0.2940770387649536, + "learning_rate": 4.66621451811618e-06, + "loss": 0.6511, + "step": 1707 + }, + { + "epoch": 0.5250537964955426, + "grad_norm": 0.2917109727859497, + "learning_rate": 4.665808489779405e-06, + "loss": 0.6503, + "step": 1708 + }, + { + "epoch": 0.5253612050415002, + "grad_norm": 0.3117634356021881, + "learning_rate": 4.6654022323266065e-06, + "loss": 0.6405, + "step": 1709 + }, + { + "epoch": 0.5256686135874578, + "grad_norm": 0.2986466586589813, + "learning_rate": 4.664995745800764e-06, + "loss": 0.6103, + "step": 1710 + }, + { + "epoch": 0.5259760221334153, + "grad_norm": 0.3003517985343933, + "learning_rate": 4.664589030244877e-06, + "loss": 0.6176, + "step": 1711 + }, + { + "epoch": 0.5262834306793729, + "grad_norm": 0.2960118353366852, + "learning_rate": 4.664182085701971e-06, + "loss": 0.6489, + "step": 1712 + }, + { + "epoch": 0.5265908392253305, + "grad_norm": 0.29012438654899597, + "learning_rate": 4.663774912215097e-06, + "loss": 0.6094, + "step": 1713 + }, + { + "epoch": 0.526898247771288, + "grad_norm": 0.31231966614723206, + "learning_rate": 4.6633675098273275e-06, + "loss": 0.6412, + "step": 1714 + }, + { + "epoch": 0.5272056563172456, + "grad_norm": 0.29159703850746155, + "learning_rate": 4.662959878581761e-06, + "loss": 0.6589, + "step": 1715 + }, + { + "epoch": 0.5275130648632032, + "grad_norm": 0.2865101397037506, + "learning_rate": 4.662552018521521e-06, + "loss": 0.617, + "step": 1716 + }, + { + "epoch": 0.5278204734091608, + "grad_norm": 0.30639326572418213, + "learning_rate": 4.662143929689753e-06, + "loss": 0.6637, + "step": 1717 + }, + { + "epoch": 0.5281278819551184, + "grad_norm": 0.28661462664604187, + "learning_rate": 4.661735612129627e-06, + "loss": 0.6777, + "step": 1718 + }, + { + "epoch": 0.528435290501076, + "grad_norm": 0.29852256178855896, + "learning_rate": 4.661327065884341e-06, + "loss": 0.6482, + "step": 1719 + }, + { + "epoch": 0.5287426990470335, + "grad_norm": 0.30542561411857605, + "learning_rate": 4.660918290997112e-06, + "loss": 0.6473, + "step": 1720 + }, + { + "epoch": 0.5290501075929911, + "grad_norm": 0.29083889722824097, + "learning_rate": 4.660509287511183e-06, + "loss": 0.6307, + "step": 1721 + }, + { + "epoch": 0.5293575161389487, + "grad_norm": 0.2883787751197815, + "learning_rate": 4.6601000554698224e-06, + "loss": 0.6405, + "step": 1722 + }, + { + "epoch": 0.5296649246849062, + "grad_norm": 0.30269092321395874, + "learning_rate": 4.659690594916322e-06, + "loss": 0.6508, + "step": 1723 + }, + { + "epoch": 0.5299723332308638, + "grad_norm": 0.2942925691604614, + "learning_rate": 4.659280905893997e-06, + "loss": 0.6455, + "step": 1724 + }, + { + "epoch": 0.5302797417768214, + "grad_norm": 0.2843689024448395, + "learning_rate": 4.658870988446189e-06, + "loss": 0.6418, + "step": 1725 + }, + { + "epoch": 0.5305871503227789, + "grad_norm": 0.2960506081581116, + "learning_rate": 4.65846084261626e-06, + "loss": 0.6392, + "step": 1726 + }, + { + "epoch": 0.5308945588687366, + "grad_norm": 0.30612507462501526, + "learning_rate": 4.6580504684476e-06, + "loss": 0.643, + "step": 1727 + }, + { + "epoch": 0.5312019674146942, + "grad_norm": 0.2923547923564911, + "learning_rate": 4.657639865983622e-06, + "loss": 0.6351, + "step": 1728 + }, + { + "epoch": 0.5315093759606517, + "grad_norm": 0.29477161169052124, + "learning_rate": 4.657229035267761e-06, + "loss": 0.631, + "step": 1729 + }, + { + "epoch": 0.5318167845066093, + "grad_norm": 0.29007285833358765, + "learning_rate": 4.656817976343479e-06, + "loss": 0.6181, + "step": 1730 + }, + { + "epoch": 0.5321241930525669, + "grad_norm": 0.2881002128124237, + "learning_rate": 4.656406689254261e-06, + "loss": 0.6241, + "step": 1731 + }, + { + "epoch": 0.5324316015985244, + "grad_norm": 0.2945367693901062, + "learning_rate": 4.655995174043616e-06, + "loss": 0.6491, + "step": 1732 + }, + { + "epoch": 0.532739010144482, + "grad_norm": 0.28666654229164124, + "learning_rate": 4.655583430755077e-06, + "loss": 0.6324, + "step": 1733 + }, + { + "epoch": 0.5330464186904396, + "grad_norm": 0.2970888018608093, + "learning_rate": 4.655171459432202e-06, + "loss": 0.6607, + "step": 1734 + }, + { + "epoch": 0.5333538272363971, + "grad_norm": 0.29708027839660645, + "learning_rate": 4.654759260118571e-06, + "loss": 0.6064, + "step": 1735 + }, + { + "epoch": 0.5336612357823548, + "grad_norm": 0.3019571006298065, + "learning_rate": 4.654346832857791e-06, + "loss": 0.6423, + "step": 1736 + }, + { + "epoch": 0.5339686443283124, + "grad_norm": 0.3006290793418884, + "learning_rate": 4.653934177693492e-06, + "loss": 0.6459, + "step": 1737 + }, + { + "epoch": 0.5342760528742699, + "grad_norm": 0.2936036288738251, + "learning_rate": 4.653521294669328e-06, + "loss": 0.6505, + "step": 1738 + }, + { + "epoch": 0.5345834614202275, + "grad_norm": 0.3096372187137604, + "learning_rate": 4.653108183828975e-06, + "loss": 0.6167, + "step": 1739 + }, + { + "epoch": 0.5348908699661851, + "grad_norm": 0.3029332756996155, + "learning_rate": 4.652694845216138e-06, + "loss": 0.6511, + "step": 1740 + }, + { + "epoch": 0.5351982785121426, + "grad_norm": 0.2882334887981415, + "learning_rate": 4.65228127887454e-06, + "loss": 0.6315, + "step": 1741 + }, + { + "epoch": 0.5355056870581002, + "grad_norm": 0.3023930788040161, + "learning_rate": 4.651867484847933e-06, + "loss": 0.6508, + "step": 1742 + }, + { + "epoch": 0.5358130956040578, + "grad_norm": 0.30938342213630676, + "learning_rate": 4.6514534631800925e-06, + "loss": 0.6538, + "step": 1743 + }, + { + "epoch": 0.5361205041500153, + "grad_norm": 0.2953920066356659, + "learning_rate": 4.651039213914814e-06, + "loss": 0.6335, + "step": 1744 + }, + { + "epoch": 0.5364279126959729, + "grad_norm": 0.2931448519229889, + "learning_rate": 4.6506247370959225e-06, + "loss": 0.6329, + "step": 1745 + }, + { + "epoch": 0.5367353212419306, + "grad_norm": 0.2873618006706238, + "learning_rate": 4.6502100327672636e-06, + "loss": 0.6178, + "step": 1746 + }, + { + "epoch": 0.5370427297878881, + "grad_norm": 0.28269365429878235, + "learning_rate": 4.6497951009727084e-06, + "loss": 0.6231, + "step": 1747 + }, + { + "epoch": 0.5373501383338457, + "grad_norm": 0.31302785873413086, + "learning_rate": 4.6493799417561504e-06, + "loss": 0.6422, + "step": 1748 + }, + { + "epoch": 0.5376575468798033, + "grad_norm": 0.2981608808040619, + "learning_rate": 4.648964555161509e-06, + "loss": 0.6502, + "step": 1749 + }, + { + "epoch": 0.5379649554257608, + "grad_norm": 0.29036223888397217, + "learning_rate": 4.648548941232727e-06, + "loss": 0.6532, + "step": 1750 + }, + { + "epoch": 0.5382723639717184, + "grad_norm": 0.2962914705276489, + "learning_rate": 4.648133100013773e-06, + "loss": 0.6199, + "step": 1751 + }, + { + "epoch": 0.538579772517676, + "grad_norm": 0.30304983258247375, + "learning_rate": 4.647717031548635e-06, + "loss": 0.6182, + "step": 1752 + }, + { + "epoch": 0.5388871810636335, + "grad_norm": 0.3131561279296875, + "learning_rate": 4.64730073588133e-06, + "loss": 0.6415, + "step": 1753 + }, + { + "epoch": 0.5391945896095911, + "grad_norm": 0.29090794920921326, + "learning_rate": 4.646884213055896e-06, + "loss": 0.6299, + "step": 1754 + }, + { + "epoch": 0.5395019981555488, + "grad_norm": 0.30570393800735474, + "learning_rate": 4.646467463116397e-06, + "loss": 0.6212, + "step": 1755 + }, + { + "epoch": 0.5398094067015063, + "grad_norm": 0.30659863352775574, + "learning_rate": 4.646050486106919e-06, + "loss": 0.6466, + "step": 1756 + }, + { + "epoch": 0.5401168152474639, + "grad_norm": 0.29204073548316956, + "learning_rate": 4.645633282071573e-06, + "loss": 0.6325, + "step": 1757 + }, + { + "epoch": 0.5404242237934215, + "grad_norm": 0.3070466220378876, + "learning_rate": 4.645215851054496e-06, + "loss": 0.6433, + "step": 1758 + }, + { + "epoch": 0.540731632339379, + "grad_norm": 0.2836306095123291, + "learning_rate": 4.644798193099843e-06, + "loss": 0.6499, + "step": 1759 + }, + { + "epoch": 0.5410390408853366, + "grad_norm": 0.30039921402931213, + "learning_rate": 4.644380308251801e-06, + "loss": 0.6368, + "step": 1760 + }, + { + "epoch": 0.5413464494312942, + "grad_norm": 0.2959757447242737, + "learning_rate": 4.643962196554576e-06, + "loss": 0.6216, + "step": 1761 + }, + { + "epoch": 0.5416538579772517, + "grad_norm": 0.29898300766944885, + "learning_rate": 4.6435438580523985e-06, + "loss": 0.6444, + "step": 1762 + }, + { + "epoch": 0.5419612665232093, + "grad_norm": 0.31235426664352417, + "learning_rate": 4.643125292789524e-06, + "loss": 0.642, + "step": 1763 + }, + { + "epoch": 0.5422686750691669, + "grad_norm": 0.30948346853256226, + "learning_rate": 4.642706500810233e-06, + "loss": 0.6149, + "step": 1764 + }, + { + "epoch": 0.5425760836151245, + "grad_norm": 0.298782616853714, + "learning_rate": 4.642287482158825e-06, + "loss": 0.6429, + "step": 1765 + }, + { + "epoch": 0.5428834921610821, + "grad_norm": 0.3064468801021576, + "learning_rate": 4.64186823687963e-06, + "loss": 0.6243, + "step": 1766 + }, + { + "epoch": 0.5431909007070397, + "grad_norm": 0.2971173822879791, + "learning_rate": 4.641448765016998e-06, + "loss": 0.6159, + "step": 1767 + }, + { + "epoch": 0.5434983092529972, + "grad_norm": 0.31629690527915955, + "learning_rate": 4.6410290666153045e-06, + "loss": 0.6332, + "step": 1768 + }, + { + "epoch": 0.5438057177989548, + "grad_norm": 0.2917172610759735, + "learning_rate": 4.640609141718948e-06, + "loss": 0.6288, + "step": 1769 + }, + { + "epoch": 0.5441131263449124, + "grad_norm": 0.29941290616989136, + "learning_rate": 4.640188990372352e-06, + "loss": 0.6443, + "step": 1770 + }, + { + "epoch": 0.54442053489087, + "grad_norm": 0.29681703448295593, + "learning_rate": 4.639768612619962e-06, + "loss": 0.6461, + "step": 1771 + }, + { + "epoch": 0.5447279434368275, + "grad_norm": 0.3291272521018982, + "learning_rate": 4.639348008506251e-06, + "loss": 0.6224, + "step": 1772 + }, + { + "epoch": 0.5450353519827851, + "grad_norm": 0.30756986141204834, + "learning_rate": 4.63892717807571e-06, + "loss": 0.6653, + "step": 1773 + }, + { + "epoch": 0.5453427605287428, + "grad_norm": 0.31963005661964417, + "learning_rate": 4.6385061213728624e-06, + "loss": 0.6237, + "step": 1774 + }, + { + "epoch": 0.5456501690747003, + "grad_norm": 0.29193171858787537, + "learning_rate": 4.6380848384422486e-06, + "loss": 0.6507, + "step": 1775 + }, + { + "epoch": 0.5459575776206579, + "grad_norm": 0.2873878479003906, + "learning_rate": 4.637663329328434e-06, + "loss": 0.6174, + "step": 1776 + }, + { + "epoch": 0.5462649861666155, + "grad_norm": 0.3021123707294464, + "learning_rate": 4.637241594076012e-06, + "loss": 0.624, + "step": 1777 + }, + { + "epoch": 0.546572394712573, + "grad_norm": 0.30224865674972534, + "learning_rate": 4.636819632729595e-06, + "loss": 0.6384, + "step": 1778 + }, + { + "epoch": 0.5468798032585306, + "grad_norm": 0.3009949326515198, + "learning_rate": 4.63639744533382e-06, + "loss": 0.6345, + "step": 1779 + }, + { + "epoch": 0.5471872118044881, + "grad_norm": 0.29023247957229614, + "learning_rate": 4.635975031933352e-06, + "loss": 0.6342, + "step": 1780 + }, + { + "epoch": 0.5474946203504457, + "grad_norm": 0.2997609078884125, + "learning_rate": 4.6355523925728766e-06, + "loss": 0.6379, + "step": 1781 + }, + { + "epoch": 0.5478020288964033, + "grad_norm": 0.30991536378860474, + "learning_rate": 4.635129527297103e-06, + "loss": 0.6349, + "step": 1782 + }, + { + "epoch": 0.5481094374423608, + "grad_norm": 0.293270468711853, + "learning_rate": 4.634706436150766e-06, + "loss": 0.6496, + "step": 1783 + }, + { + "epoch": 0.5484168459883185, + "grad_norm": 0.293817400932312, + "learning_rate": 4.6342831191786216e-06, + "loss": 0.6492, + "step": 1784 + }, + { + "epoch": 0.5487242545342761, + "grad_norm": 0.3100987374782562, + "learning_rate": 4.633859576425454e-06, + "loss": 0.6394, + "step": 1785 + }, + { + "epoch": 0.5490316630802337, + "grad_norm": 0.30403023958206177, + "learning_rate": 4.633435807936068e-06, + "loss": 0.6512, + "step": 1786 + }, + { + "epoch": 0.5493390716261912, + "grad_norm": 0.29605036973953247, + "learning_rate": 4.633011813755292e-06, + "loss": 0.6359, + "step": 1787 + }, + { + "epoch": 0.5496464801721488, + "grad_norm": 0.30904653668403625, + "learning_rate": 4.632587593927982e-06, + "loss": 0.6431, + "step": 1788 + }, + { + "epoch": 0.5499538887181064, + "grad_norm": 0.3177379071712494, + "learning_rate": 4.6321631484990126e-06, + "loss": 0.6293, + "step": 1789 + }, + { + "epoch": 0.5502612972640639, + "grad_norm": 0.30802205204963684, + "learning_rate": 4.631738477513286e-06, + "loss": 0.614, + "step": 1790 + }, + { + "epoch": 0.5505687058100215, + "grad_norm": 0.31541338562965393, + "learning_rate": 4.631313581015727e-06, + "loss": 0.6601, + "step": 1791 + }, + { + "epoch": 0.550876114355979, + "grad_norm": 0.3587876856327057, + "learning_rate": 4.630888459051284e-06, + "loss": 0.6254, + "step": 1792 + }, + { + "epoch": 0.5511835229019367, + "grad_norm": 0.285706490278244, + "learning_rate": 4.6304631116649315e-06, + "loss": 0.6245, + "step": 1793 + }, + { + "epoch": 0.5514909314478943, + "grad_norm": 0.2981203496456146, + "learning_rate": 4.630037538901664e-06, + "loss": 0.6539, + "step": 1794 + }, + { + "epoch": 0.5517983399938519, + "grad_norm": 0.32486411929130554, + "learning_rate": 4.629611740806502e-06, + "loss": 0.6326, + "step": 1795 + }, + { + "epoch": 0.5521057485398094, + "grad_norm": 0.30077770352363586, + "learning_rate": 4.629185717424492e-06, + "loss": 0.6437, + "step": 1796 + }, + { + "epoch": 0.552413157085767, + "grad_norm": 0.30557769536972046, + "learning_rate": 4.6287594688007e-06, + "loss": 0.632, + "step": 1797 + }, + { + "epoch": 0.5527205656317246, + "grad_norm": 0.32038646936416626, + "learning_rate": 4.628332994980217e-06, + "loss": 0.6492, + "step": 1798 + }, + { + "epoch": 0.5530279741776821, + "grad_norm": 0.3171752989292145, + "learning_rate": 4.627906296008161e-06, + "loss": 0.6289, + "step": 1799 + }, + { + "epoch": 0.5533353827236397, + "grad_norm": 0.2960434854030609, + "learning_rate": 4.627479371929672e-06, + "loss": 0.6348, + "step": 1800 + }, + { + "epoch": 0.5536427912695973, + "grad_norm": 0.2894599735736847, + "learning_rate": 4.62705222278991e-06, + "loss": 0.6262, + "step": 1801 + }, + { + "epoch": 0.5539501998155548, + "grad_norm": 0.30991706252098083, + "learning_rate": 4.6266248486340645e-06, + "loss": 0.6574, + "step": 1802 + }, + { + "epoch": 0.5542576083615125, + "grad_norm": 0.31100067496299744, + "learning_rate": 4.626197249507347e-06, + "loss": 0.6304, + "step": 1803 + }, + { + "epoch": 0.5545650169074701, + "grad_norm": 0.32505422830581665, + "learning_rate": 4.62576942545499e-06, + "loss": 0.6408, + "step": 1804 + }, + { + "epoch": 0.5548724254534276, + "grad_norm": 0.30701813101768494, + "learning_rate": 4.625341376522254e-06, + "loss": 0.6611, + "step": 1805 + }, + { + "epoch": 0.5551798339993852, + "grad_norm": 0.30991774797439575, + "learning_rate": 4.624913102754421e-06, + "loss": 0.6187, + "step": 1806 + }, + { + "epoch": 0.5554872425453428, + "grad_norm": 0.3147648572921753, + "learning_rate": 4.624484604196796e-06, + "loss": 0.6362, + "step": 1807 + }, + { + "epoch": 0.5557946510913003, + "grad_norm": 0.30528295040130615, + "learning_rate": 4.62405588089471e-06, + "loss": 0.6372, + "step": 1808 + }, + { + "epoch": 0.5561020596372579, + "grad_norm": 0.3172713816165924, + "learning_rate": 4.623626932893517e-06, + "loss": 0.6709, + "step": 1809 + }, + { + "epoch": 0.5564094681832155, + "grad_norm": 0.3263775706291199, + "learning_rate": 4.6231977602385935e-06, + "loss": 0.6227, + "step": 1810 + }, + { + "epoch": 0.556716876729173, + "grad_norm": 0.3009786009788513, + "learning_rate": 4.622768362975341e-06, + "loss": 0.6696, + "step": 1811 + }, + { + "epoch": 0.5570242852751307, + "grad_norm": 0.2948378622531891, + "learning_rate": 4.622338741149184e-06, + "loss": 0.6172, + "step": 1812 + }, + { + "epoch": 0.5573316938210883, + "grad_norm": 0.30789434909820557, + "learning_rate": 4.6219088948055716e-06, + "loss": 0.5976, + "step": 1813 + }, + { + "epoch": 0.5576391023670458, + "grad_norm": 0.3011975586414337, + "learning_rate": 4.621478823989977e-06, + "loss": 0.6554, + "step": 1814 + }, + { + "epoch": 0.5579465109130034, + "grad_norm": 0.29166504740715027, + "learning_rate": 4.6210485287478955e-06, + "loss": 0.6492, + "step": 1815 + }, + { + "epoch": 0.558253919458961, + "grad_norm": 0.2951098680496216, + "learning_rate": 4.620618009124848e-06, + "loss": 0.6357, + "step": 1816 + }, + { + "epoch": 0.5585613280049185, + "grad_norm": 0.3178653120994568, + "learning_rate": 4.620187265166377e-06, + "loss": 0.6328, + "step": 1817 + }, + { + "epoch": 0.5588687365508761, + "grad_norm": 0.29690927267074585, + "learning_rate": 4.619756296918051e-06, + "loss": 0.632, + "step": 1818 + }, + { + "epoch": 0.5591761450968337, + "grad_norm": 0.3144308924674988, + "learning_rate": 4.619325104425461e-06, + "loss": 0.6337, + "step": 1819 + }, + { + "epoch": 0.5594835536427912, + "grad_norm": 0.31371212005615234, + "learning_rate": 4.618893687734221e-06, + "loss": 0.6186, + "step": 1820 + }, + { + "epoch": 0.5597909621887488, + "grad_norm": 0.31025779247283936, + "learning_rate": 4.61846204688997e-06, + "loss": 0.646, + "step": 1821 + }, + { + "epoch": 0.5600983707347065, + "grad_norm": 0.2930643558502197, + "learning_rate": 4.618030181938371e-06, + "loss": 0.6284, + "step": 1822 + }, + { + "epoch": 0.560405779280664, + "grad_norm": 0.3042875826358795, + "learning_rate": 4.61759809292511e-06, + "loss": 0.6467, + "step": 1823 + }, + { + "epoch": 0.5607131878266216, + "grad_norm": 0.31503742933273315, + "learning_rate": 4.617165779895896e-06, + "loss": 0.6397, + "step": 1824 + }, + { + "epoch": 0.5610205963725792, + "grad_norm": 0.3075665533542633, + "learning_rate": 4.616733242896462e-06, + "loss": 0.6517, + "step": 1825 + }, + { + "epoch": 0.5613280049185367, + "grad_norm": 0.291671484708786, + "learning_rate": 4.616300481972566e-06, + "loss": 0.6318, + "step": 1826 + }, + { + "epoch": 0.5616354134644943, + "grad_norm": 0.30458584427833557, + "learning_rate": 4.615867497169989e-06, + "loss": 0.6584, + "step": 1827 + }, + { + "epoch": 0.5619428220104519, + "grad_norm": 0.29958638548851013, + "learning_rate": 4.615434288534535e-06, + "loss": 0.6231, + "step": 1828 + }, + { + "epoch": 0.5622502305564094, + "grad_norm": 0.2990656793117523, + "learning_rate": 4.615000856112032e-06, + "loss": 0.6329, + "step": 1829 + }, + { + "epoch": 0.562557639102367, + "grad_norm": 0.3069312870502472, + "learning_rate": 4.614567199948333e-06, + "loss": 0.6565, + "step": 1830 + }, + { + "epoch": 0.5628650476483246, + "grad_norm": 0.2972862124443054, + "learning_rate": 4.614133320089312e-06, + "loss": 0.6521, + "step": 1831 + }, + { + "epoch": 0.5631724561942822, + "grad_norm": 0.2982487678527832, + "learning_rate": 4.613699216580869e-06, + "loss": 0.6424, + "step": 1832 + }, + { + "epoch": 0.5634798647402398, + "grad_norm": 0.2952280640602112, + "learning_rate": 4.613264889468927e-06, + "loss": 0.6283, + "step": 1833 + }, + { + "epoch": 0.5637872732861974, + "grad_norm": 0.30089280009269714, + "learning_rate": 4.612830338799431e-06, + "loss": 0.6241, + "step": 1834 + }, + { + "epoch": 0.5640946818321549, + "grad_norm": 0.29035207629203796, + "learning_rate": 4.612395564618354e-06, + "loss": 0.6558, + "step": 1835 + }, + { + "epoch": 0.5644020903781125, + "grad_norm": 0.3009234070777893, + "learning_rate": 4.611960566971686e-06, + "loss": 0.6401, + "step": 1836 + }, + { + "epoch": 0.5647094989240701, + "grad_norm": 0.293933242559433, + "learning_rate": 4.611525345905448e-06, + "loss": 0.6515, + "step": 1837 + }, + { + "epoch": 0.5650169074700276, + "grad_norm": 0.2873729467391968, + "learning_rate": 4.6110899014656795e-06, + "loss": 0.619, + "step": 1838 + }, + { + "epoch": 0.5653243160159852, + "grad_norm": 0.3064028024673462, + "learning_rate": 4.610654233698445e-06, + "loss": 0.6419, + "step": 1839 + }, + { + "epoch": 0.5656317245619428, + "grad_norm": 0.30360427498817444, + "learning_rate": 4.610218342649833e-06, + "loss": 0.6495, + "step": 1840 + }, + { + "epoch": 0.5659391331079004, + "grad_norm": 0.3064430058002472, + "learning_rate": 4.609782228365955e-06, + "loss": 0.6372, + "step": 1841 + }, + { + "epoch": 0.566246541653858, + "grad_norm": 0.2934454679489136, + "learning_rate": 4.6093458908929475e-06, + "loss": 0.6531, + "step": 1842 + }, + { + "epoch": 0.5665539501998156, + "grad_norm": 0.30291587114334106, + "learning_rate": 4.608909330276969e-06, + "loss": 0.6253, + "step": 1843 + }, + { + "epoch": 0.5668613587457731, + "grad_norm": 0.3005976676940918, + "learning_rate": 4.608472546564203e-06, + "loss": 0.6332, + "step": 1844 + }, + { + "epoch": 0.5671687672917307, + "grad_norm": 0.2956908047199249, + "learning_rate": 4.608035539800856e-06, + "loss": 0.6387, + "step": 1845 + }, + { + "epoch": 0.5674761758376883, + "grad_norm": 0.2940070629119873, + "learning_rate": 4.607598310033157e-06, + "loss": 0.6461, + "step": 1846 + }, + { + "epoch": 0.5677835843836458, + "grad_norm": 0.28469932079315186, + "learning_rate": 4.607160857307359e-06, + "loss": 0.6384, + "step": 1847 + }, + { + "epoch": 0.5680909929296034, + "grad_norm": 0.30298274755477905, + "learning_rate": 4.60672318166974e-06, + "loss": 0.6295, + "step": 1848 + }, + { + "epoch": 0.568398401475561, + "grad_norm": 0.3029312193393707, + "learning_rate": 4.6062852831666016e-06, + "loss": 0.6279, + "step": 1849 + }, + { + "epoch": 0.5687058100215185, + "grad_norm": 0.2976098358631134, + "learning_rate": 4.605847161844267e-06, + "loss": 0.622, + "step": 1850 + }, + { + "epoch": 0.5690132185674762, + "grad_norm": 0.30003297328948975, + "learning_rate": 4.605408817749084e-06, + "loss": 0.6497, + "step": 1851 + }, + { + "epoch": 0.5693206271134338, + "grad_norm": 0.30226752161979675, + "learning_rate": 4.604970250927424e-06, + "loss": 0.6327, + "step": 1852 + }, + { + "epoch": 0.5696280356593914, + "grad_norm": 0.3103128671646118, + "learning_rate": 4.604531461425683e-06, + "loss": 0.642, + "step": 1853 + }, + { + "epoch": 0.5699354442053489, + "grad_norm": 0.29161834716796875, + "learning_rate": 4.604092449290279e-06, + "loss": 0.642, + "step": 1854 + }, + { + "epoch": 0.5702428527513065, + "grad_norm": 0.3005453944206238, + "learning_rate": 4.603653214567654e-06, + "loss": 0.6572, + "step": 1855 + }, + { + "epoch": 0.570550261297264, + "grad_norm": 0.30533504486083984, + "learning_rate": 4.603213757304274e-06, + "loss": 0.6479, + "step": 1856 + }, + { + "epoch": 0.5708576698432216, + "grad_norm": 0.2918700575828552, + "learning_rate": 4.602774077546627e-06, + "loss": 0.6459, + "step": 1857 + }, + { + "epoch": 0.5711650783891792, + "grad_norm": 0.2961483299732208, + "learning_rate": 4.602334175341226e-06, + "loss": 0.6404, + "step": 1858 + }, + { + "epoch": 0.5714724869351367, + "grad_norm": 0.29218021035194397, + "learning_rate": 4.601894050734609e-06, + "loss": 0.6391, + "step": 1859 + }, + { + "epoch": 0.5717798954810944, + "grad_norm": 0.2973973751068115, + "learning_rate": 4.601453703773335e-06, + "loss": 0.6443, + "step": 1860 + }, + { + "epoch": 0.572087304027052, + "grad_norm": 0.28230422735214233, + "learning_rate": 4.601013134503986e-06, + "loss": 0.6257, + "step": 1861 + }, + { + "epoch": 0.5723947125730096, + "grad_norm": 0.30661872029304504, + "learning_rate": 4.600572342973171e-06, + "loss": 0.6666, + "step": 1862 + }, + { + "epoch": 0.5727021211189671, + "grad_norm": 0.3151032030582428, + "learning_rate": 4.600131329227519e-06, + "loss": 0.6164, + "step": 1863 + }, + { + "epoch": 0.5730095296649247, + "grad_norm": 0.29291635751724243, + "learning_rate": 4.599690093313683e-06, + "loss": 0.6266, + "step": 1864 + }, + { + "epoch": 0.5733169382108823, + "grad_norm": 0.31712669134140015, + "learning_rate": 4.599248635278343e-06, + "loss": 0.6499, + "step": 1865 + }, + { + "epoch": 0.5736243467568398, + "grad_norm": 0.3039402961730957, + "learning_rate": 4.598806955168197e-06, + "loss": 0.6377, + "step": 1866 + }, + { + "epoch": 0.5739317553027974, + "grad_norm": 0.2988415062427521, + "learning_rate": 4.598365053029971e-06, + "loss": 0.637, + "step": 1867 + }, + { + "epoch": 0.574239163848755, + "grad_norm": 0.3034099340438843, + "learning_rate": 4.597922928910412e-06, + "loss": 0.6473, + "step": 1868 + }, + { + "epoch": 0.5745465723947125, + "grad_norm": 0.3014233410358429, + "learning_rate": 4.5974805828562915e-06, + "loss": 0.6416, + "step": 1869 + }, + { + "epoch": 0.5748539809406702, + "grad_norm": 0.31777387857437134, + "learning_rate": 4.597038014914405e-06, + "loss": 0.6363, + "step": 1870 + }, + { + "epoch": 0.5751613894866278, + "grad_norm": 0.3020791709423065, + "learning_rate": 4.596595225131571e-06, + "loss": 0.6487, + "step": 1871 + }, + { + "epoch": 0.5754687980325853, + "grad_norm": 0.29350733757019043, + "learning_rate": 4.59615221355463e-06, + "loss": 0.6133, + "step": 1872 + }, + { + "epoch": 0.5757762065785429, + "grad_norm": 0.3213062584400177, + "learning_rate": 4.595708980230448e-06, + "loss": 0.6547, + "step": 1873 + }, + { + "epoch": 0.5760836151245005, + "grad_norm": 0.31323912739753723, + "learning_rate": 4.595265525205913e-06, + "loss": 0.6417, + "step": 1874 + }, + { + "epoch": 0.576391023670458, + "grad_norm": 0.3043946623802185, + "learning_rate": 4.594821848527937e-06, + "loss": 0.638, + "step": 1875 + }, + { + "epoch": 0.5766984322164156, + "grad_norm": 0.3033462166786194, + "learning_rate": 4.594377950243457e-06, + "loss": 0.6635, + "step": 1876 + }, + { + "epoch": 0.5770058407623732, + "grad_norm": 0.32417917251586914, + "learning_rate": 4.5939338303994305e-06, + "loss": 0.6442, + "step": 1877 + }, + { + "epoch": 0.5773132493083307, + "grad_norm": 0.3047316074371338, + "learning_rate": 4.593489489042842e-06, + "loss": 0.6269, + "step": 1878 + }, + { + "epoch": 0.5776206578542884, + "grad_norm": 0.2929757833480835, + "learning_rate": 4.593044926220694e-06, + "loss": 0.6504, + "step": 1879 + }, + { + "epoch": 0.577928066400246, + "grad_norm": 0.345733106136322, + "learning_rate": 4.592600141980019e-06, + "loss": 0.6254, + "step": 1880 + }, + { + "epoch": 0.5782354749462035, + "grad_norm": 0.30610647797584534, + "learning_rate": 4.592155136367867e-06, + "loss": 0.6373, + "step": 1881 + }, + { + "epoch": 0.5785428834921611, + "grad_norm": 0.2902810573577881, + "learning_rate": 4.5917099094313175e-06, + "loss": 0.6485, + "step": 1882 + }, + { + "epoch": 0.5788502920381187, + "grad_norm": 0.3397778570652008, + "learning_rate": 4.591264461217467e-06, + "loss": 0.6252, + "step": 1883 + }, + { + "epoch": 0.5791577005840762, + "grad_norm": 0.3079828917980194, + "learning_rate": 4.59081879177344e-06, + "loss": 0.6279, + "step": 1884 + }, + { + "epoch": 0.5794651091300338, + "grad_norm": 0.30894163250923157, + "learning_rate": 4.5903729011463825e-06, + "loss": 0.6354, + "step": 1885 + }, + { + "epoch": 0.5797725176759914, + "grad_norm": 0.2941904067993164, + "learning_rate": 4.589926789383464e-06, + "loss": 0.6487, + "step": 1886 + }, + { + "epoch": 0.5800799262219489, + "grad_norm": 0.2903376519680023, + "learning_rate": 4.5894804565318775e-06, + "loss": 0.6565, + "step": 1887 + }, + { + "epoch": 0.5803873347679065, + "grad_norm": 0.3600059449672699, + "learning_rate": 4.589033902638841e-06, + "loss": 0.6517, + "step": 1888 + }, + { + "epoch": 0.5806947433138642, + "grad_norm": 0.30211836099624634, + "learning_rate": 4.588587127751593e-06, + "loss": 0.6316, + "step": 1889 + }, + { + "epoch": 0.5810021518598217, + "grad_norm": 0.32158973813056946, + "learning_rate": 4.588140131917397e-06, + "loss": 0.6418, + "step": 1890 + }, + { + "epoch": 0.5813095604057793, + "grad_norm": 0.34952977299690247, + "learning_rate": 4.587692915183539e-06, + "loss": 0.6188, + "step": 1891 + }, + { + "epoch": 0.5816169689517369, + "grad_norm": 0.29489532113075256, + "learning_rate": 4.5872454775973314e-06, + "loss": 0.6482, + "step": 1892 + }, + { + "epoch": 0.5819243774976944, + "grad_norm": 0.3097151219844818, + "learning_rate": 4.586797819206106e-06, + "loss": 0.6317, + "step": 1893 + }, + { + "epoch": 0.582231786043652, + "grad_norm": 0.32615798711776733, + "learning_rate": 4.5863499400572195e-06, + "loss": 0.6292, + "step": 1894 + }, + { + "epoch": 0.5825391945896096, + "grad_norm": 0.3030427098274231, + "learning_rate": 4.585901840198052e-06, + "loss": 0.6381, + "step": 1895 + }, + { + "epoch": 0.5828466031355671, + "grad_norm": 0.2936180830001831, + "learning_rate": 4.585453519676008e-06, + "loss": 0.6363, + "step": 1896 + }, + { + "epoch": 0.5831540116815247, + "grad_norm": 0.2994568645954132, + "learning_rate": 4.585004978538512e-06, + "loss": 0.6243, + "step": 1897 + }, + { + "epoch": 0.5834614202274824, + "grad_norm": 0.30517956614494324, + "learning_rate": 4.584556216833017e-06, + "loss": 0.6273, + "step": 1898 + }, + { + "epoch": 0.5837688287734399, + "grad_norm": 0.31152835488319397, + "learning_rate": 4.584107234606996e-06, + "loss": 0.6284, + "step": 1899 + }, + { + "epoch": 0.5840762373193975, + "grad_norm": 0.2999560832977295, + "learning_rate": 4.583658031907944e-06, + "loss": 0.6355, + "step": 1900 + }, + { + "epoch": 0.5843836458653551, + "grad_norm": 0.2904406487941742, + "learning_rate": 4.583208608783382e-06, + "loss": 0.6494, + "step": 1901 + }, + { + "epoch": 0.5846910544113126, + "grad_norm": 0.29921045899391174, + "learning_rate": 4.582758965280854e-06, + "loss": 0.6335, + "step": 1902 + }, + { + "epoch": 0.5849984629572702, + "grad_norm": 0.293984979391098, + "learning_rate": 4.582309101447926e-06, + "loss": 0.628, + "step": 1903 + }, + { + "epoch": 0.5853058715032278, + "grad_norm": 0.29990270733833313, + "learning_rate": 4.581859017332189e-06, + "loss": 0.6281, + "step": 1904 + }, + { + "epoch": 0.5856132800491853, + "grad_norm": 0.30195721983909607, + "learning_rate": 4.5814087129812565e-06, + "loss": 0.6406, + "step": 1905 + }, + { + "epoch": 0.5859206885951429, + "grad_norm": 0.29355448484420776, + "learning_rate": 4.580958188442765e-06, + "loss": 0.6408, + "step": 1906 + }, + { + "epoch": 0.5862280971411005, + "grad_norm": 0.29423362016677856, + "learning_rate": 4.580507443764372e-06, + "loss": 0.6368, + "step": 1907 + }, + { + "epoch": 0.5865355056870581, + "grad_norm": 0.2937408983707428, + "learning_rate": 4.580056478993765e-06, + "loss": 0.6436, + "step": 1908 + }, + { + "epoch": 0.5868429142330157, + "grad_norm": 0.2980180084705353, + "learning_rate": 4.579605294178647e-06, + "loss": 0.6448, + "step": 1909 + }, + { + "epoch": 0.5871503227789733, + "grad_norm": 0.2903384268283844, + "learning_rate": 4.57915388936675e-06, + "loss": 0.6351, + "step": 1910 + }, + { + "epoch": 0.5874577313249308, + "grad_norm": 0.3066135346889496, + "learning_rate": 4.5787022646058255e-06, + "loss": 0.6394, + "step": 1911 + }, + { + "epoch": 0.5877651398708884, + "grad_norm": 0.2960333526134491, + "learning_rate": 4.578250419943652e-06, + "loss": 0.6028, + "step": 1912 + }, + { + "epoch": 0.588072548416846, + "grad_norm": 0.2984008193016052, + "learning_rate": 4.577798355428027e-06, + "loss": 0.6204, + "step": 1913 + }, + { + "epoch": 0.5883799569628035, + "grad_norm": 0.28751564025878906, + "learning_rate": 4.577346071106774e-06, + "loss": 0.6345, + "step": 1914 + }, + { + "epoch": 0.5886873655087611, + "grad_norm": 0.2880474925041199, + "learning_rate": 4.57689356702774e-06, + "loss": 0.6478, + "step": 1915 + }, + { + "epoch": 0.5889947740547187, + "grad_norm": 0.2881876230239868, + "learning_rate": 4.576440843238793e-06, + "loss": 0.6705, + "step": 1916 + }, + { + "epoch": 0.5893021826006763, + "grad_norm": 0.28930819034576416, + "learning_rate": 4.575987899787827e-06, + "loss": 0.6479, + "step": 1917 + }, + { + "epoch": 0.5896095911466339, + "grad_norm": 0.2885318100452423, + "learning_rate": 4.575534736722757e-06, + "loss": 0.6165, + "step": 1918 + }, + { + "epoch": 0.5899169996925915, + "grad_norm": 0.30029428005218506, + "learning_rate": 4.575081354091523e-06, + "loss": 0.6488, + "step": 1919 + }, + { + "epoch": 0.590224408238549, + "grad_norm": 0.3073025643825531, + "learning_rate": 4.574627751942087e-06, + "loss": 0.6498, + "step": 1920 + }, + { + "epoch": 0.5905318167845066, + "grad_norm": 0.29624143242836, + "learning_rate": 4.574173930322433e-06, + "loss": 0.6384, + "step": 1921 + }, + { + "epoch": 0.5908392253304642, + "grad_norm": 0.30202367901802063, + "learning_rate": 4.5737198892805724e-06, + "loss": 0.6303, + "step": 1922 + }, + { + "epoch": 0.5911466338764217, + "grad_norm": 0.2872784435749054, + "learning_rate": 4.573265628864535e-06, + "loss": 0.6227, + "step": 1923 + }, + { + "epoch": 0.5914540424223793, + "grad_norm": 0.2937835156917572, + "learning_rate": 4.572811149122376e-06, + "loss": 0.6189, + "step": 1924 + }, + { + "epoch": 0.5917614509683369, + "grad_norm": 0.2978774607181549, + "learning_rate": 4.572356450102176e-06, + "loss": 0.621, + "step": 1925 + }, + { + "epoch": 0.5920688595142944, + "grad_norm": 0.29992032051086426, + "learning_rate": 4.571901531852035e-06, + "loss": 0.6295, + "step": 1926 + }, + { + "epoch": 0.5923762680602521, + "grad_norm": 0.2930610477924347, + "learning_rate": 4.571446394420078e-06, + "loss": 0.6419, + "step": 1927 + }, + { + "epoch": 0.5926836766062097, + "grad_norm": 0.2877572178840637, + "learning_rate": 4.570991037854452e-06, + "loss": 0.6304, + "step": 1928 + }, + { + "epoch": 0.5929910851521673, + "grad_norm": 0.32795271277427673, + "learning_rate": 4.57053546220333e-06, + "loss": 0.6225, + "step": 1929 + }, + { + "epoch": 0.5932984936981248, + "grad_norm": 0.30169937014579773, + "learning_rate": 4.570079667514905e-06, + "loss": 0.6489, + "step": 1930 + }, + { + "epoch": 0.5936059022440824, + "grad_norm": 0.3028746247291565, + "learning_rate": 4.569623653837395e-06, + "loss": 0.6258, + "step": 1931 + }, + { + "epoch": 0.59391331079004, + "grad_norm": 0.29739677906036377, + "learning_rate": 4.5691674212190416e-06, + "loss": 0.6364, + "step": 1932 + }, + { + "epoch": 0.5942207193359975, + "grad_norm": 0.3029134273529053, + "learning_rate": 4.568710969708106e-06, + "loss": 0.6327, + "step": 1933 + }, + { + "epoch": 0.5945281278819551, + "grad_norm": 0.29422813653945923, + "learning_rate": 4.568254299352878e-06, + "loss": 0.6521, + "step": 1934 + }, + { + "epoch": 0.5948355364279126, + "grad_norm": 0.30457937717437744, + "learning_rate": 4.567797410201666e-06, + "loss": 0.668, + "step": 1935 + }, + { + "epoch": 0.5951429449738703, + "grad_norm": 0.2986065745353699, + "learning_rate": 4.567340302302804e-06, + "loss": 0.6478, + "step": 1936 + }, + { + "epoch": 0.5954503535198279, + "grad_norm": 0.301922082901001, + "learning_rate": 4.566882975704648e-06, + "loss": 0.6278, + "step": 1937 + }, + { + "epoch": 0.5957577620657855, + "grad_norm": 0.2915398180484772, + "learning_rate": 4.566425430455578e-06, + "loss": 0.636, + "step": 1938 + }, + { + "epoch": 0.596065170611743, + "grad_norm": 0.29171520471572876, + "learning_rate": 4.5659676666039965e-06, + "loss": 0.6242, + "step": 1939 + }, + { + "epoch": 0.5963725791577006, + "grad_norm": 0.29409390687942505, + "learning_rate": 4.565509684198329e-06, + "loss": 0.6398, + "step": 1940 + }, + { + "epoch": 0.5966799877036582, + "grad_norm": 0.29990631341934204, + "learning_rate": 4.565051483287025e-06, + "loss": 0.671, + "step": 1941 + }, + { + "epoch": 0.5969873962496157, + "grad_norm": 0.2852870523929596, + "learning_rate": 4.564593063918556e-06, + "loss": 0.6529, + "step": 1942 + }, + { + "epoch": 0.5972948047955733, + "grad_norm": 0.29266083240509033, + "learning_rate": 4.5641344261414174e-06, + "loss": 0.6316, + "step": 1943 + }, + { + "epoch": 0.5976022133415309, + "grad_norm": 0.2949276268482208, + "learning_rate": 4.563675570004128e-06, + "loss": 0.6349, + "step": 1944 + }, + { + "epoch": 0.5979096218874884, + "grad_norm": 0.29619404673576355, + "learning_rate": 4.5632164955552276e-06, + "loss": 0.6301, + "step": 1945 + }, + { + "epoch": 0.5982170304334461, + "grad_norm": 0.3085547685623169, + "learning_rate": 4.5627572028432824e-06, + "loss": 0.5991, + "step": 1946 + }, + { + "epoch": 0.5985244389794037, + "grad_norm": 0.30367356538772583, + "learning_rate": 4.562297691916879e-06, + "loss": 0.651, + "step": 1947 + }, + { + "epoch": 0.5988318475253612, + "grad_norm": 0.3003896176815033, + "learning_rate": 4.561837962824629e-06, + "loss": 0.6441, + "step": 1948 + }, + { + "epoch": 0.5991392560713188, + "grad_norm": 0.30614855885505676, + "learning_rate": 4.561378015615164e-06, + "loss": 0.6277, + "step": 1949 + }, + { + "epoch": 0.5994466646172764, + "grad_norm": 0.29111605882644653, + "learning_rate": 4.560917850337143e-06, + "loss": 0.6461, + "step": 1950 + }, + { + "epoch": 0.5997540731632339, + "grad_norm": 0.3156605064868927, + "learning_rate": 4.560457467039245e-06, + "loss": 0.6302, + "step": 1951 + }, + { + "epoch": 0.6000614817091915, + "grad_norm": 0.3045242428779602, + "learning_rate": 4.5599968657701735e-06, + "loss": 0.6379, + "step": 1952 + }, + { + "epoch": 0.6003688902551491, + "grad_norm": 0.2990531921386719, + "learning_rate": 4.559536046578653e-06, + "loss": 0.6434, + "step": 1953 + }, + { + "epoch": 0.6006762988011066, + "grad_norm": 0.30450358986854553, + "learning_rate": 4.559075009513434e-06, + "loss": 0.6302, + "step": 1954 + }, + { + "epoch": 0.6009837073470643, + "grad_norm": 0.2938859164714813, + "learning_rate": 4.558613754623288e-06, + "loss": 0.6674, + "step": 1955 + }, + { + "epoch": 0.6012911158930219, + "grad_norm": 0.3059353232383728, + "learning_rate": 4.55815228195701e-06, + "loss": 0.6252, + "step": 1956 + }, + { + "epoch": 0.6015985244389794, + "grad_norm": 0.2940181791782379, + "learning_rate": 4.557690591563418e-06, + "loss": 0.6071, + "step": 1957 + }, + { + "epoch": 0.601905932984937, + "grad_norm": 0.30427172780036926, + "learning_rate": 4.557228683491354e-06, + "loss": 0.6323, + "step": 1958 + }, + { + "epoch": 0.6022133415308946, + "grad_norm": 0.30258533358573914, + "learning_rate": 4.556766557789681e-06, + "loss": 0.6399, + "step": 1959 + }, + { + "epoch": 0.6025207500768521, + "grad_norm": 0.30086514353752136, + "learning_rate": 4.556304214507288e-06, + "loss": 0.6605, + "step": 1960 + }, + { + "epoch": 0.6028281586228097, + "grad_norm": 0.29490116238594055, + "learning_rate": 4.555841653693083e-06, + "loss": 0.6564, + "step": 1961 + }, + { + "epoch": 0.6031355671687673, + "grad_norm": 0.2940918803215027, + "learning_rate": 4.555378875396001e-06, + "loss": 0.6491, + "step": 1962 + }, + { + "epoch": 0.6034429757147248, + "grad_norm": 0.29902487993240356, + "learning_rate": 4.554915879664998e-06, + "loss": 0.6222, + "step": 1963 + }, + { + "epoch": 0.6037503842606824, + "grad_norm": 0.29304325580596924, + "learning_rate": 4.554452666549053e-06, + "loss": 0.6247, + "step": 1964 + }, + { + "epoch": 0.6040577928066401, + "grad_norm": 0.296871155500412, + "learning_rate": 4.553989236097168e-06, + "loss": 0.6257, + "step": 1965 + }, + { + "epoch": 0.6043652013525976, + "grad_norm": 0.29497724771499634, + "learning_rate": 4.553525588358368e-06, + "loss": 0.6126, + "step": 1966 + }, + { + "epoch": 0.6046726098985552, + "grad_norm": 0.30268314480781555, + "learning_rate": 4.553061723381703e-06, + "loss": 0.6576, + "step": 1967 + }, + { + "epoch": 0.6049800184445128, + "grad_norm": 0.28924131393432617, + "learning_rate": 4.552597641216242e-06, + "loss": 0.65, + "step": 1968 + }, + { + "epoch": 0.6052874269904703, + "grad_norm": 0.3073563277721405, + "learning_rate": 4.552133341911081e-06, + "loss": 0.6326, + "step": 1969 + }, + { + "epoch": 0.6055948355364279, + "grad_norm": 0.30229267477989197, + "learning_rate": 4.551668825515336e-06, + "loss": 0.6508, + "step": 1970 + }, + { + "epoch": 0.6059022440823855, + "grad_norm": 0.28924062848091125, + "learning_rate": 4.551204092078149e-06, + "loss": 0.6519, + "step": 1971 + }, + { + "epoch": 0.606209652628343, + "grad_norm": 0.3002241253852844, + "learning_rate": 4.55073914164868e-06, + "loss": 0.6346, + "step": 1972 + }, + { + "epoch": 0.6065170611743006, + "grad_norm": 0.2989045977592468, + "learning_rate": 4.5502739742761185e-06, + "loss": 0.6266, + "step": 1973 + }, + { + "epoch": 0.6068244697202583, + "grad_norm": 0.327038437128067, + "learning_rate": 4.54980859000967e-06, + "loss": 0.6073, + "step": 1974 + }, + { + "epoch": 0.6071318782662158, + "grad_norm": 0.29759031534194946, + "learning_rate": 4.549342988898569e-06, + "loss": 0.6482, + "step": 1975 + }, + { + "epoch": 0.6074392868121734, + "grad_norm": 0.30709776282310486, + "learning_rate": 4.54887717099207e-06, + "loss": 0.6211, + "step": 1976 + }, + { + "epoch": 0.607746695358131, + "grad_norm": 0.3206390142440796, + "learning_rate": 4.548411136339449e-06, + "loss": 0.6146, + "step": 1977 + }, + { + "epoch": 0.6080541039040885, + "grad_norm": 0.2984306812286377, + "learning_rate": 4.5479448849900094e-06, + "loss": 0.643, + "step": 1978 + }, + { + "epoch": 0.6083615124500461, + "grad_norm": 0.2947533130645752, + "learning_rate": 4.547478416993074e-06, + "loss": 0.6251, + "step": 1979 + }, + { + "epoch": 0.6086689209960037, + "grad_norm": 0.3039008677005768, + "learning_rate": 4.547011732397989e-06, + "loss": 0.6339, + "step": 1980 + }, + { + "epoch": 0.6089763295419612, + "grad_norm": 0.31064751744270325, + "learning_rate": 4.5465448312541235e-06, + "loss": 0.6289, + "step": 1981 + }, + { + "epoch": 0.6092837380879188, + "grad_norm": 0.3012250065803528, + "learning_rate": 4.546077713610871e-06, + "loss": 0.6405, + "step": 1982 + }, + { + "epoch": 0.6095911466338764, + "grad_norm": 0.3065010905265808, + "learning_rate": 4.545610379517646e-06, + "loss": 0.6473, + "step": 1983 + }, + { + "epoch": 0.609898555179834, + "grad_norm": 0.3025766611099243, + "learning_rate": 4.545142829023887e-06, + "loss": 0.6334, + "step": 1984 + }, + { + "epoch": 0.6102059637257916, + "grad_norm": 0.30567023158073425, + "learning_rate": 4.544675062179056e-06, + "loss": 0.6244, + "step": 1985 + }, + { + "epoch": 0.6105133722717492, + "grad_norm": 0.2939755320549011, + "learning_rate": 4.544207079032636e-06, + "loss": 0.6496, + "step": 1986 + }, + { + "epoch": 0.6108207808177067, + "grad_norm": 0.30813825130462646, + "learning_rate": 4.543738879634132e-06, + "loss": 0.608, + "step": 1987 + }, + { + "epoch": 0.6111281893636643, + "grad_norm": 0.29349762201309204, + "learning_rate": 4.543270464033078e-06, + "loss": 0.6488, + "step": 1988 + }, + { + "epoch": 0.6114355979096219, + "grad_norm": 0.3045792877674103, + "learning_rate": 4.5428018322790235e-06, + "loss": 0.6409, + "step": 1989 + }, + { + "epoch": 0.6117430064555794, + "grad_norm": 0.2957614064216614, + "learning_rate": 4.5423329844215445e-06, + "loss": 0.6374, + "step": 1990 + }, + { + "epoch": 0.612050415001537, + "grad_norm": 0.30088427662849426, + "learning_rate": 4.541863920510241e-06, + "loss": 0.6308, + "step": 1991 + }, + { + "epoch": 0.6123578235474946, + "grad_norm": 0.2939455807209015, + "learning_rate": 4.541394640594733e-06, + "loss": 0.6113, + "step": 1992 + }, + { + "epoch": 0.6126652320934522, + "grad_norm": 0.3101837635040283, + "learning_rate": 4.5409251447246635e-06, + "loss": 0.612, + "step": 1993 + }, + { + "epoch": 0.6129726406394098, + "grad_norm": 0.31027552485466003, + "learning_rate": 4.540455432949701e-06, + "loss": 0.6384, + "step": 1994 + }, + { + "epoch": 0.6132800491853674, + "grad_norm": 0.31794866919517517, + "learning_rate": 4.539985505319535e-06, + "loss": 0.6616, + "step": 1995 + }, + { + "epoch": 0.613587457731325, + "grad_norm": 0.29876232147216797, + "learning_rate": 4.539515361883877e-06, + "loss": 0.6238, + "step": 1996 + }, + { + "epoch": 0.6138948662772825, + "grad_norm": 0.3061017096042633, + "learning_rate": 4.5390450026924635e-06, + "loss": 0.6223, + "step": 1997 + }, + { + "epoch": 0.6142022748232401, + "grad_norm": 0.29263585805892944, + "learning_rate": 4.538574427795054e-06, + "loss": 0.6518, + "step": 1998 + }, + { + "epoch": 0.6145096833691976, + "grad_norm": 0.2982010841369629, + "learning_rate": 4.538103637241427e-06, + "loss": 0.6522, + "step": 1999 + }, + { + "epoch": 0.6148170919151552, + "grad_norm": 0.29502585530281067, + "learning_rate": 4.537632631081387e-06, + "loss": 0.6681, + "step": 2000 + }, + { + "epoch": 0.6151245004611128, + "grad_norm": 0.2943054735660553, + "learning_rate": 4.537161409364761e-06, + "loss": 0.6396, + "step": 2001 + }, + { + "epoch": 0.6154319090070703, + "grad_norm": 0.3108402192592621, + "learning_rate": 4.536689972141399e-06, + "loss": 0.6439, + "step": 2002 + }, + { + "epoch": 0.615739317553028, + "grad_norm": 0.2987877130508423, + "learning_rate": 4.536218319461174e-06, + "loss": 0.6227, + "step": 2003 + }, + { + "epoch": 0.6160467260989856, + "grad_norm": 0.30403485894203186, + "learning_rate": 4.535746451373978e-06, + "loss": 0.6451, + "step": 2004 + }, + { + "epoch": 0.6163541346449432, + "grad_norm": 0.2994406223297119, + "learning_rate": 4.5352743679297325e-06, + "loss": 0.631, + "step": 2005 + }, + { + "epoch": 0.6166615431909007, + "grad_norm": 0.29738208651542664, + "learning_rate": 4.534802069178376e-06, + "loss": 0.6336, + "step": 2006 + }, + { + "epoch": 0.6169689517368583, + "grad_norm": 0.30438101291656494, + "learning_rate": 4.534329555169872e-06, + "loss": 0.637, + "step": 2007 + }, + { + "epoch": 0.6172763602828159, + "grad_norm": 0.2975326478481293, + "learning_rate": 4.533856825954208e-06, + "loss": 0.6381, + "step": 2008 + }, + { + "epoch": 0.6175837688287734, + "grad_norm": 0.30169349908828735, + "learning_rate": 4.5333838815813915e-06, + "loss": 0.6443, + "step": 2009 + }, + { + "epoch": 0.617891177374731, + "grad_norm": 0.30227935314178467, + "learning_rate": 4.532910722101454e-06, + "loss": 0.6463, + "step": 2010 + }, + { + "epoch": 0.6181985859206885, + "grad_norm": 0.29732760787010193, + "learning_rate": 4.532437347564453e-06, + "loss": 0.6358, + "step": 2011 + }, + { + "epoch": 0.6185059944666462, + "grad_norm": 0.29276931285858154, + "learning_rate": 4.531963758020461e-06, + "loss": 0.6233, + "step": 2012 + }, + { + "epoch": 0.6188134030126038, + "grad_norm": 0.28951773047447205, + "learning_rate": 4.531489953519582e-06, + "loss": 0.6325, + "step": 2013 + }, + { + "epoch": 0.6191208115585614, + "grad_norm": 0.2931373119354248, + "learning_rate": 4.531015934111938e-06, + "loss": 0.634, + "step": 2014 + }, + { + "epoch": 0.6194282201045189, + "grad_norm": 0.2877959907054901, + "learning_rate": 4.5305416998476714e-06, + "loss": 0.6308, + "step": 2015 + }, + { + "epoch": 0.6197356286504765, + "grad_norm": 0.3018031418323517, + "learning_rate": 4.530067250776954e-06, + "loss": 0.6338, + "step": 2016 + }, + { + "epoch": 0.620043037196434, + "grad_norm": 0.3161218464374542, + "learning_rate": 4.529592586949976e-06, + "loss": 0.615, + "step": 2017 + }, + { + "epoch": 0.6203504457423916, + "grad_norm": 0.2975153923034668, + "learning_rate": 4.529117708416949e-06, + "loss": 0.6588, + "step": 2018 + }, + { + "epoch": 0.6206578542883492, + "grad_norm": 0.31941545009613037, + "learning_rate": 4.528642615228111e-06, + "loss": 0.6254, + "step": 2019 + }, + { + "epoch": 0.6209652628343068, + "grad_norm": 0.28932276368141174, + "learning_rate": 4.528167307433721e-06, + "loss": 0.6214, + "step": 2020 + }, + { + "epoch": 0.6212726713802643, + "grad_norm": 0.3038439452648163, + "learning_rate": 4.5276917850840604e-06, + "loss": 0.6188, + "step": 2021 + }, + { + "epoch": 0.621580079926222, + "grad_norm": 0.305925577878952, + "learning_rate": 4.527216048229434e-06, + "loss": 0.6259, + "step": 2022 + }, + { + "epoch": 0.6218874884721796, + "grad_norm": 0.31204187870025635, + "learning_rate": 4.526740096920169e-06, + "loss": 0.6406, + "step": 2023 + }, + { + "epoch": 0.6221948970181371, + "grad_norm": 0.3027236759662628, + "learning_rate": 4.526263931206613e-06, + "loss": 0.6721, + "step": 2024 + }, + { + "epoch": 0.6225023055640947, + "grad_norm": 0.29359006881713867, + "learning_rate": 4.525787551139143e-06, + "loss": 0.6505, + "step": 2025 + }, + { + "epoch": 0.6228097141100523, + "grad_norm": 0.2894918620586395, + "learning_rate": 4.5253109567681505e-06, + "loss": 0.6442, + "step": 2026 + }, + { + "epoch": 0.6231171226560098, + "grad_norm": 0.3018547594547272, + "learning_rate": 4.524834148144054e-06, + "loss": 0.6486, + "step": 2027 + }, + { + "epoch": 0.6234245312019674, + "grad_norm": 0.31827789545059204, + "learning_rate": 4.524357125317295e-06, + "loss": 0.6336, + "step": 2028 + }, + { + "epoch": 0.623731939747925, + "grad_norm": 0.2970574200153351, + "learning_rate": 4.523879888338336e-06, + "loss": 0.6277, + "step": 2029 + }, + { + "epoch": 0.6240393482938825, + "grad_norm": 0.2979516386985779, + "learning_rate": 4.523402437257662e-06, + "loss": 0.6254, + "step": 2030 + }, + { + "epoch": 0.6243467568398402, + "grad_norm": 0.3012678623199463, + "learning_rate": 4.522924772125784e-06, + "loss": 0.6258, + "step": 2031 + }, + { + "epoch": 0.6246541653857978, + "grad_norm": 0.3056723475456238, + "learning_rate": 4.522446892993231e-06, + "loss": 0.6203, + "step": 2032 + }, + { + "epoch": 0.6249615739317553, + "grad_norm": 0.3053358495235443, + "learning_rate": 4.521968799910556e-06, + "loss": 0.6108, + "step": 2033 + }, + { + "epoch": 0.6252689824777129, + "grad_norm": 0.29397931694984436, + "learning_rate": 4.5214904929283384e-06, + "loss": 0.6385, + "step": 2034 + }, + { + "epoch": 0.6255763910236705, + "grad_norm": 0.27784737944602966, + "learning_rate": 4.521011972097175e-06, + "loss": 0.6475, + "step": 2035 + }, + { + "epoch": 0.625883799569628, + "grad_norm": 0.29497358202934265, + "learning_rate": 4.520533237467688e-06, + "loss": 0.6204, + "step": 2036 + }, + { + "epoch": 0.6261912081155856, + "grad_norm": 0.3069138824939728, + "learning_rate": 4.520054289090521e-06, + "loss": 0.6616, + "step": 2037 + }, + { + "epoch": 0.6264986166615432, + "grad_norm": 0.30118271708488464, + "learning_rate": 4.519575127016343e-06, + "loss": 0.6564, + "step": 2038 + }, + { + "epoch": 0.6268060252075007, + "grad_norm": 0.30004119873046875, + "learning_rate": 4.51909575129584e-06, + "loss": 0.649, + "step": 2039 + }, + { + "epoch": 0.6271134337534583, + "grad_norm": 0.30604395270347595, + "learning_rate": 4.518616161979727e-06, + "loss": 0.6112, + "step": 2040 + }, + { + "epoch": 0.627420842299416, + "grad_norm": 0.301862895488739, + "learning_rate": 4.518136359118738e-06, + "loss": 0.6339, + "step": 2041 + }, + { + "epoch": 0.6277282508453735, + "grad_norm": 0.30131906270980835, + "learning_rate": 4.51765634276363e-06, + "loss": 0.6326, + "step": 2042 + }, + { + "epoch": 0.6280356593913311, + "grad_norm": 0.2995993196964264, + "learning_rate": 4.517176112965183e-06, + "loss": 0.6428, + "step": 2043 + }, + { + "epoch": 0.6283430679372887, + "grad_norm": 0.3058539628982544, + "learning_rate": 4.516695669774199e-06, + "loss": 0.6399, + "step": 2044 + }, + { + "epoch": 0.6286504764832462, + "grad_norm": 0.290720671415329, + "learning_rate": 4.516215013241504e-06, + "loss": 0.6113, + "step": 2045 + }, + { + "epoch": 0.6289578850292038, + "grad_norm": 0.3091951012611389, + "learning_rate": 4.5157341434179435e-06, + "loss": 0.6241, + "step": 2046 + }, + { + "epoch": 0.6292652935751614, + "grad_norm": 0.30531731247901917, + "learning_rate": 4.5152530603543895e-06, + "loss": 0.6336, + "step": 2047 + }, + { + "epoch": 0.6295727021211189, + "grad_norm": 0.29141026735305786, + "learning_rate": 4.5147717641017355e-06, + "loss": 0.6062, + "step": 2048 + }, + { + "epoch": 0.6298801106670765, + "grad_norm": 0.2977672219276428, + "learning_rate": 4.514290254710893e-06, + "loss": 0.6433, + "step": 2049 + }, + { + "epoch": 0.6301875192130342, + "grad_norm": 0.2898685038089752, + "learning_rate": 4.513808532232804e-06, + "loss": 0.6271, + "step": 2050 + }, + { + "epoch": 0.6304949277589917, + "grad_norm": 0.30528637766838074, + "learning_rate": 4.513326596718428e-06, + "loss": 0.6162, + "step": 2051 + }, + { + "epoch": 0.6308023363049493, + "grad_norm": 0.2929508686065674, + "learning_rate": 4.512844448218745e-06, + "loss": 0.5988, + "step": 2052 + }, + { + "epoch": 0.6311097448509069, + "grad_norm": 0.28880175948143005, + "learning_rate": 4.512362086784764e-06, + "loss": 0.6539, + "step": 2053 + }, + { + "epoch": 0.6314171533968644, + "grad_norm": 0.3129745125770569, + "learning_rate": 4.51187951246751e-06, + "loss": 0.6238, + "step": 2054 + }, + { + "epoch": 0.631724561942822, + "grad_norm": 0.2977875769138336, + "learning_rate": 4.511396725318036e-06, + "loss": 0.6065, + "step": 2055 + }, + { + "epoch": 0.6320319704887796, + "grad_norm": 0.2935378849506378, + "learning_rate": 4.5109137253874125e-06, + "loss": 0.6499, + "step": 2056 + }, + { + "epoch": 0.6323393790347371, + "grad_norm": 0.29041075706481934, + "learning_rate": 4.510430512726737e-06, + "loss": 0.633, + "step": 2057 + }, + { + "epoch": 0.6326467875806947, + "grad_norm": 0.29289036989212036, + "learning_rate": 4.509947087387126e-06, + "loss": 0.6305, + "step": 2058 + }, + { + "epoch": 0.6329541961266523, + "grad_norm": 0.29133304953575134, + "learning_rate": 4.509463449419722e-06, + "loss": 0.6251, + "step": 2059 + }, + { + "epoch": 0.63326160467261, + "grad_norm": 0.30553799867630005, + "learning_rate": 4.508979598875686e-06, + "loss": 0.6204, + "step": 2060 + }, + { + "epoch": 0.6335690132185675, + "grad_norm": 0.29963457584381104, + "learning_rate": 4.508495535806204e-06, + "loss": 0.6399, + "step": 2061 + }, + { + "epoch": 0.6338764217645251, + "grad_norm": 0.30186474323272705, + "learning_rate": 4.508011260262485e-06, + "loss": 0.6222, + "step": 2062 + }, + { + "epoch": 0.6341838303104826, + "grad_norm": 0.2956302762031555, + "learning_rate": 4.507526772295757e-06, + "loss": 0.6236, + "step": 2063 + }, + { + "epoch": 0.6344912388564402, + "grad_norm": 0.30776363611221313, + "learning_rate": 4.507042071957275e-06, + "loss": 0.6034, + "step": 2064 + }, + { + "epoch": 0.6347986474023978, + "grad_norm": 0.3145751357078552, + "learning_rate": 4.506557159298314e-06, + "loss": 0.6259, + "step": 2065 + }, + { + "epoch": 0.6351060559483553, + "grad_norm": 0.2942216694355011, + "learning_rate": 4.506072034370172e-06, + "loss": 0.6285, + "step": 2066 + }, + { + "epoch": 0.6354134644943129, + "grad_norm": 0.29947352409362793, + "learning_rate": 4.505586697224168e-06, + "loss": 0.6323, + "step": 2067 + }, + { + "epoch": 0.6357208730402705, + "grad_norm": 0.30774953961372375, + "learning_rate": 4.505101147911647e-06, + "loss": 0.6454, + "step": 2068 + }, + { + "epoch": 0.6360282815862282, + "grad_norm": 0.2981733977794647, + "learning_rate": 4.5046153864839715e-06, + "loss": 0.6425, + "step": 2069 + }, + { + "epoch": 0.6363356901321857, + "grad_norm": 0.2987770736217499, + "learning_rate": 4.5041294129925305e-06, + "loss": 0.6369, + "step": 2070 + }, + { + "epoch": 0.6366430986781433, + "grad_norm": 0.294778436422348, + "learning_rate": 4.503643227488734e-06, + "loss": 0.6334, + "step": 2071 + }, + { + "epoch": 0.6369505072241008, + "grad_norm": 0.3095795810222626, + "learning_rate": 4.503156830024015e-06, + "loss": 0.633, + "step": 2072 + }, + { + "epoch": 0.6372579157700584, + "grad_norm": 0.3212897777557373, + "learning_rate": 4.502670220649827e-06, + "loss": 0.6454, + "step": 2073 + }, + { + "epoch": 0.637565324316016, + "grad_norm": 0.3004525601863861, + "learning_rate": 4.5021833994176485e-06, + "loss": 0.6285, + "step": 2074 + }, + { + "epoch": 0.6378727328619735, + "grad_norm": 0.295023649930954, + "learning_rate": 4.501696366378979e-06, + "loss": 0.6259, + "step": 2075 + }, + { + "epoch": 0.6381801414079311, + "grad_norm": 0.30629655718803406, + "learning_rate": 4.50120912158534e-06, + "loss": 0.6286, + "step": 2076 + }, + { + "epoch": 0.6384875499538887, + "grad_norm": 0.30279457569122314, + "learning_rate": 4.500721665088277e-06, + "loss": 0.6498, + "step": 2077 + }, + { + "epoch": 0.6387949584998462, + "grad_norm": 0.3083172142505646, + "learning_rate": 4.500233996939356e-06, + "loss": 0.635, + "step": 2078 + }, + { + "epoch": 0.6391023670458039, + "grad_norm": 0.30751657485961914, + "learning_rate": 4.499746117190167e-06, + "loss": 0.6387, + "step": 2079 + }, + { + "epoch": 0.6394097755917615, + "grad_norm": 0.31951865553855896, + "learning_rate": 4.499258025892321e-06, + "loss": 0.6201, + "step": 2080 + }, + { + "epoch": 0.639717184137719, + "grad_norm": 0.31065744161605835, + "learning_rate": 4.498769723097453e-06, + "loss": 0.6492, + "step": 2081 + }, + { + "epoch": 0.6400245926836766, + "grad_norm": 0.3076138496398926, + "learning_rate": 4.49828120885722e-06, + "loss": 0.6144, + "step": 2082 + }, + { + "epoch": 0.6403320012296342, + "grad_norm": 0.299002468585968, + "learning_rate": 4.4977924832232985e-06, + "loss": 0.6504, + "step": 2083 + }, + { + "epoch": 0.6406394097755918, + "grad_norm": 0.30808907747268677, + "learning_rate": 4.497303546247391e-06, + "loss": 0.6195, + "step": 2084 + }, + { + "epoch": 0.6409468183215493, + "grad_norm": 0.3158666491508484, + "learning_rate": 4.496814397981222e-06, + "loss": 0.6138, + "step": 2085 + }, + { + "epoch": 0.6412542268675069, + "grad_norm": 0.3093908727169037, + "learning_rate": 4.496325038476536e-06, + "loss": 0.6374, + "step": 2086 + }, + { + "epoch": 0.6415616354134644, + "grad_norm": 0.31253480911254883, + "learning_rate": 4.4958354677851015e-06, + "loss": 0.6288, + "step": 2087 + }, + { + "epoch": 0.6418690439594221, + "grad_norm": 0.301441490650177, + "learning_rate": 4.495345685958709e-06, + "loss": 0.6348, + "step": 2088 + }, + { + "epoch": 0.6421764525053797, + "grad_norm": 0.299230694770813, + "learning_rate": 4.494855693049171e-06, + "loss": 0.6333, + "step": 2089 + }, + { + "epoch": 0.6424838610513373, + "grad_norm": 0.31671711802482605, + "learning_rate": 4.4943654891083255e-06, + "loss": 0.6385, + "step": 2090 + }, + { + "epoch": 0.6427912695972948, + "grad_norm": 0.3091694712638855, + "learning_rate": 4.493875074188027e-06, + "loss": 0.6441, + "step": 2091 + }, + { + "epoch": 0.6430986781432524, + "grad_norm": 0.2906147837638855, + "learning_rate": 4.493384448340157e-06, + "loss": 0.6341, + "step": 2092 + }, + { + "epoch": 0.64340608668921, + "grad_norm": 0.29521268606185913, + "learning_rate": 4.492893611616617e-06, + "loss": 0.6462, + "step": 2093 + }, + { + "epoch": 0.6437134952351675, + "grad_norm": 0.31674298644065857, + "learning_rate": 4.4924025640693304e-06, + "loss": 0.6444, + "step": 2094 + }, + { + "epoch": 0.6440209037811251, + "grad_norm": 0.3206893503665924, + "learning_rate": 4.491911305750246e-06, + "loss": 0.6475, + "step": 2095 + }, + { + "epoch": 0.6443283123270827, + "grad_norm": 0.3145778477191925, + "learning_rate": 4.491419836711333e-06, + "loss": 0.6631, + "step": 2096 + }, + { + "epoch": 0.6446357208730402, + "grad_norm": 0.30818891525268555, + "learning_rate": 4.490928157004582e-06, + "loss": 0.6621, + "step": 2097 + }, + { + "epoch": 0.6449431294189979, + "grad_norm": 0.3380703926086426, + "learning_rate": 4.490436266682006e-06, + "loss": 0.6401, + "step": 2098 + }, + { + "epoch": 0.6452505379649555, + "grad_norm": 0.30974888801574707, + "learning_rate": 4.489944165795641e-06, + "loss": 0.6502, + "step": 2099 + }, + { + "epoch": 0.645557946510913, + "grad_norm": 0.32630982995033264, + "learning_rate": 4.489451854397547e-06, + "loss": 0.6313, + "step": 2100 + }, + { + "epoch": 0.6458653550568706, + "grad_norm": 0.30520084500312805, + "learning_rate": 4.488959332539803e-06, + "loss": 0.6165, + "step": 2101 + }, + { + "epoch": 0.6461727636028282, + "grad_norm": 0.30227187275886536, + "learning_rate": 4.488466600274513e-06, + "loss": 0.6597, + "step": 2102 + }, + { + "epoch": 0.6464801721487857, + "grad_norm": 0.3385670483112335, + "learning_rate": 4.4879736576538e-06, + "loss": 0.639, + "step": 2103 + }, + { + "epoch": 0.6467875806947433, + "grad_norm": 0.3081066310405731, + "learning_rate": 4.487480504729813e-06, + "loss": 0.6067, + "step": 2104 + }, + { + "epoch": 0.6470949892407009, + "grad_norm": 0.3002223074436188, + "learning_rate": 4.486987141554722e-06, + "loss": 0.6596, + "step": 2105 + }, + { + "epoch": 0.6474023977866584, + "grad_norm": 0.2977466881275177, + "learning_rate": 4.486493568180717e-06, + "loss": 0.648, + "step": 2106 + }, + { + "epoch": 0.6477098063326161, + "grad_norm": 0.3307541012763977, + "learning_rate": 4.485999784660013e-06, + "loss": 0.649, + "step": 2107 + }, + { + "epoch": 0.6480172148785737, + "grad_norm": 0.33533158898353577, + "learning_rate": 4.485505791044846e-06, + "loss": 0.6286, + "step": 2108 + }, + { + "epoch": 0.6483246234245312, + "grad_norm": 0.3004852533340454, + "learning_rate": 4.485011587387475e-06, + "loss": 0.6303, + "step": 2109 + }, + { + "epoch": 0.6486320319704888, + "grad_norm": 0.315408319234848, + "learning_rate": 4.48451717374018e-06, + "loss": 0.6322, + "step": 2110 + }, + { + "epoch": 0.6489394405164464, + "grad_norm": 0.30032652616500854, + "learning_rate": 4.484022550155265e-06, + "loss": 0.6357, + "step": 2111 + }, + { + "epoch": 0.6492468490624039, + "grad_norm": 0.2896817624568939, + "learning_rate": 4.483527716685054e-06, + "loss": 0.6504, + "step": 2112 + }, + { + "epoch": 0.6495542576083615, + "grad_norm": 0.29470717906951904, + "learning_rate": 4.483032673381894e-06, + "loss": 0.6397, + "step": 2113 + }, + { + "epoch": 0.6498616661543191, + "grad_norm": 0.30125710368156433, + "learning_rate": 4.482537420298155e-06, + "loss": 0.6267, + "step": 2114 + }, + { + "epoch": 0.6501690747002766, + "grad_norm": 0.30751654505729675, + "learning_rate": 4.482041957486229e-06, + "loss": 0.645, + "step": 2115 + }, + { + "epoch": 0.6504764832462342, + "grad_norm": 0.30155810713768005, + "learning_rate": 4.4815462849985315e-06, + "loss": 0.6427, + "step": 2116 + }, + { + "epoch": 0.6507838917921919, + "grad_norm": 0.28954917192459106, + "learning_rate": 4.481050402887495e-06, + "loss": 0.6459, + "step": 2117 + }, + { + "epoch": 0.6510913003381494, + "grad_norm": 0.28848540782928467, + "learning_rate": 4.480554311205581e-06, + "loss": 0.6397, + "step": 2118 + }, + { + "epoch": 0.651398708884107, + "grad_norm": 0.3061654567718506, + "learning_rate": 4.480058010005267e-06, + "loss": 0.6454, + "step": 2119 + }, + { + "epoch": 0.6517061174300646, + "grad_norm": 0.3146761953830719, + "learning_rate": 4.479561499339059e-06, + "loss": 0.6472, + "step": 2120 + }, + { + "epoch": 0.6520135259760221, + "grad_norm": 0.3006506562232971, + "learning_rate": 4.479064779259479e-06, + "loss": 0.6576, + "step": 2121 + }, + { + "epoch": 0.6523209345219797, + "grad_norm": 0.2848888635635376, + "learning_rate": 4.4785678498190744e-06, + "loss": 0.5782, + "step": 2122 + }, + { + "epoch": 0.6526283430679373, + "grad_norm": 0.2956538200378418, + "learning_rate": 4.478070711070416e-06, + "loss": 0.6218, + "step": 2123 + }, + { + "epoch": 0.6529357516138948, + "grad_norm": 0.3003077805042267, + "learning_rate": 4.4775733630660925e-06, + "loss": 0.6375, + "step": 2124 + }, + { + "epoch": 0.6532431601598524, + "grad_norm": 0.29768458008766174, + "learning_rate": 4.477075805858718e-06, + "loss": 0.6215, + "step": 2125 + }, + { + "epoch": 0.6535505687058101, + "grad_norm": 0.2937188148498535, + "learning_rate": 4.4765780395009295e-06, + "loss": 0.6381, + "step": 2126 + }, + { + "epoch": 0.6538579772517676, + "grad_norm": 0.2926722466945648, + "learning_rate": 4.476080064045383e-06, + "loss": 0.6191, + "step": 2127 + }, + { + "epoch": 0.6541653857977252, + "grad_norm": 0.3183633089065552, + "learning_rate": 4.475581879544759e-06, + "loss": 0.6134, + "step": 2128 + }, + { + "epoch": 0.6544727943436828, + "grad_norm": 0.3030667006969452, + "learning_rate": 4.475083486051758e-06, + "loss": 0.65, + "step": 2129 + }, + { + "epoch": 0.6547802028896403, + "grad_norm": 0.3169498145580292, + "learning_rate": 4.4745848836191055e-06, + "loss": 0.6375, + "step": 2130 + }, + { + "epoch": 0.6550876114355979, + "grad_norm": 0.2987414300441742, + "learning_rate": 4.474086072299547e-06, + "loss": 0.6333, + "step": 2131 + }, + { + "epoch": 0.6553950199815555, + "grad_norm": 0.3019852638244629, + "learning_rate": 4.4735870521458505e-06, + "loss": 0.6336, + "step": 2132 + }, + { + "epoch": 0.655702428527513, + "grad_norm": 0.2994304299354553, + "learning_rate": 4.473087823210807e-06, + "loss": 0.6203, + "step": 2133 + }, + { + "epoch": 0.6560098370734706, + "grad_norm": 0.31021857261657715, + "learning_rate": 4.472588385547227e-06, + "loss": 0.6552, + "step": 2134 + }, + { + "epoch": 0.6563172456194282, + "grad_norm": 0.3039351999759674, + "learning_rate": 4.472088739207946e-06, + "loss": 0.6244, + "step": 2135 + }, + { + "epoch": 0.6566246541653858, + "grad_norm": 0.295057475566864, + "learning_rate": 4.471588884245821e-06, + "loss": 0.6284, + "step": 2136 + }, + { + "epoch": 0.6569320627113434, + "grad_norm": 0.30496788024902344, + "learning_rate": 4.47108882071373e-06, + "loss": 0.6238, + "step": 2137 + }, + { + "epoch": 0.657239471257301, + "grad_norm": 0.29849207401275635, + "learning_rate": 4.470588548664573e-06, + "loss": 0.6524, + "step": 2138 + }, + { + "epoch": 0.6575468798032585, + "grad_norm": 0.29760465025901794, + "learning_rate": 4.470088068151273e-06, + "loss": 0.627, + "step": 2139 + }, + { + "epoch": 0.6578542883492161, + "grad_norm": 0.28455719351768494, + "learning_rate": 4.4695873792267766e-06, + "loss": 0.6323, + "step": 2140 + }, + { + "epoch": 0.6581616968951737, + "grad_norm": 0.31202569603919983, + "learning_rate": 4.4690864819440475e-06, + "loss": 0.6391, + "step": 2141 + }, + { + "epoch": 0.6584691054411312, + "grad_norm": 0.30599984526634216, + "learning_rate": 4.468585376356076e-06, + "loss": 0.6249, + "step": 2142 + }, + { + "epoch": 0.6587765139870888, + "grad_norm": 0.30883049964904785, + "learning_rate": 4.468084062515873e-06, + "loss": 0.6303, + "step": 2143 + }, + { + "epoch": 0.6590839225330464, + "grad_norm": 0.3002879023551941, + "learning_rate": 4.467582540476471e-06, + "loss": 0.6095, + "step": 2144 + }, + { + "epoch": 0.659391331079004, + "grad_norm": 0.301749050617218, + "learning_rate": 4.467080810290927e-06, + "loss": 0.6169, + "step": 2145 + }, + { + "epoch": 0.6596987396249616, + "grad_norm": 0.31657764315605164, + "learning_rate": 4.4665788720123135e-06, + "loss": 0.6391, + "step": 2146 + }, + { + "epoch": 0.6600061481709192, + "grad_norm": 0.29839351773262024, + "learning_rate": 4.466076725693733e-06, + "loss": 0.6267, + "step": 2147 + }, + { + "epoch": 0.6603135567168767, + "grad_norm": 0.29515132308006287, + "learning_rate": 4.465574371388306e-06, + "loss": 0.6144, + "step": 2148 + }, + { + "epoch": 0.6606209652628343, + "grad_norm": 0.30227693915367126, + "learning_rate": 4.465071809149174e-06, + "loss": 0.6271, + "step": 2149 + }, + { + "epoch": 0.6609283738087919, + "grad_norm": 0.2910769581794739, + "learning_rate": 4.464569039029503e-06, + "loss": 0.6378, + "step": 2150 + }, + { + "epoch": 0.6612357823547494, + "grad_norm": 0.3078521192073822, + "learning_rate": 4.464066061082479e-06, + "loss": 0.6318, + "step": 2151 + }, + { + "epoch": 0.661543190900707, + "grad_norm": 0.30606192350387573, + "learning_rate": 4.463562875361312e-06, + "loss": 0.6095, + "step": 2152 + }, + { + "epoch": 0.6618505994466646, + "grad_norm": 0.29543188214302063, + "learning_rate": 4.463059481919233e-06, + "loss": 0.6327, + "step": 2153 + }, + { + "epoch": 0.6621580079926221, + "grad_norm": 0.31078365445137024, + "learning_rate": 4.462555880809494e-06, + "loss": 0.6492, + "step": 2154 + }, + { + "epoch": 0.6624654165385798, + "grad_norm": 0.298523873090744, + "learning_rate": 4.46205207208537e-06, + "loss": 0.6254, + "step": 2155 + }, + { + "epoch": 0.6627728250845374, + "grad_norm": 0.2931356132030487, + "learning_rate": 4.461548055800157e-06, + "loss": 0.6224, + "step": 2156 + }, + { + "epoch": 0.663080233630495, + "grad_norm": 0.30253371596336365, + "learning_rate": 4.461043832007176e-06, + "loss": 0.6631, + "step": 2157 + }, + { + "epoch": 0.6633876421764525, + "grad_norm": 0.2934292256832123, + "learning_rate": 4.460539400759765e-06, + "loss": 0.6161, + "step": 2158 + }, + { + "epoch": 0.6636950507224101, + "grad_norm": 0.3056827485561371, + "learning_rate": 4.46003476211129e-06, + "loss": 0.6289, + "step": 2159 + }, + { + "epoch": 0.6640024592683677, + "grad_norm": 0.287657231092453, + "learning_rate": 4.459529916115133e-06, + "loss": 0.6328, + "step": 2160 + }, + { + "epoch": 0.6643098678143252, + "grad_norm": 0.32069674134254456, + "learning_rate": 4.4590248628247015e-06, + "loss": 0.6577, + "step": 2161 + }, + { + "epoch": 0.6646172763602828, + "grad_norm": 0.2972988486289978, + "learning_rate": 4.458519602293423e-06, + "loss": 0.6156, + "step": 2162 + }, + { + "epoch": 0.6649246849062403, + "grad_norm": 0.3087727427482605, + "learning_rate": 4.4580141345747484e-06, + "loss": 0.6308, + "step": 2163 + }, + { + "epoch": 0.665232093452198, + "grad_norm": 0.2917284071445465, + "learning_rate": 4.45750845972215e-06, + "loss": 0.642, + "step": 2164 + }, + { + "epoch": 0.6655395019981556, + "grad_norm": 0.2837419807910919, + "learning_rate": 4.457002577789122e-06, + "loss": 0.6302, + "step": 2165 + }, + { + "epoch": 0.6658469105441132, + "grad_norm": 0.31428661942481995, + "learning_rate": 4.456496488829182e-06, + "loss": 0.6236, + "step": 2166 + }, + { + "epoch": 0.6661543190900707, + "grad_norm": 0.3015732169151306, + "learning_rate": 4.4559901928958654e-06, + "loss": 0.6252, + "step": 2167 + }, + { + "epoch": 0.6664617276360283, + "grad_norm": 0.29515141248703003, + "learning_rate": 4.4554836900427335e-06, + "loss": 0.6459, + "step": 2168 + }, + { + "epoch": 0.6667691361819859, + "grad_norm": 0.30147844552993774, + "learning_rate": 4.454976980323368e-06, + "loss": 0.6462, + "step": 2169 + }, + { + "epoch": 0.6670765447279434, + "grad_norm": 0.29585134983062744, + "learning_rate": 4.4544700637913725e-06, + "loss": 0.6372, + "step": 2170 + }, + { + "epoch": 0.667383953273901, + "grad_norm": 0.29216834902763367, + "learning_rate": 4.4539629405003736e-06, + "loss": 0.6352, + "step": 2171 + }, + { + "epoch": 0.6676913618198586, + "grad_norm": 0.3023844361305237, + "learning_rate": 4.453455610504016e-06, + "loss": 0.6232, + "step": 2172 + }, + { + "epoch": 0.6679987703658161, + "grad_norm": 0.3098876476287842, + "learning_rate": 4.452948073855973e-06, + "loss": 0.6069, + "step": 2173 + }, + { + "epoch": 0.6683061789117738, + "grad_norm": 0.3012644052505493, + "learning_rate": 4.452440330609932e-06, + "loss": 0.623, + "step": 2174 + }, + { + "epoch": 0.6686135874577314, + "grad_norm": 0.31987428665161133, + "learning_rate": 4.451932380819607e-06, + "loss": 0.6465, + "step": 2175 + }, + { + "epoch": 0.6689209960036889, + "grad_norm": 0.35358113050460815, + "learning_rate": 4.451424224538734e-06, + "loss": 0.6213, + "step": 2176 + }, + { + "epoch": 0.6692284045496465, + "grad_norm": 0.29438650608062744, + "learning_rate": 4.450915861821069e-06, + "loss": 0.6114, + "step": 2177 + }, + { + "epoch": 0.6695358130956041, + "grad_norm": 0.3246724605560303, + "learning_rate": 4.450407292720391e-06, + "loss": 0.6104, + "step": 2178 + }, + { + "epoch": 0.6698432216415616, + "grad_norm": 0.30447283387184143, + "learning_rate": 4.449898517290499e-06, + "loss": 0.6255, + "step": 2179 + }, + { + "epoch": 0.6701506301875192, + "grad_norm": 0.2876913845539093, + "learning_rate": 4.4493895355852176e-06, + "loss": 0.6007, + "step": 2180 + }, + { + "epoch": 0.6704580387334768, + "grad_norm": 0.2999209761619568, + "learning_rate": 4.44888034765839e-06, + "loss": 0.647, + "step": 2181 + }, + { + "epoch": 0.6707654472794343, + "grad_norm": 0.3210514783859253, + "learning_rate": 4.4483709535638805e-06, + "loss": 0.6365, + "step": 2182 + }, + { + "epoch": 0.671072855825392, + "grad_norm": 0.311082661151886, + "learning_rate": 4.447861353355578e-06, + "loss": 0.6167, + "step": 2183 + }, + { + "epoch": 0.6713802643713496, + "grad_norm": 0.29417961835861206, + "learning_rate": 4.4473515470873914e-06, + "loss": 0.6241, + "step": 2184 + }, + { + "epoch": 0.6716876729173071, + "grad_norm": 0.296328604221344, + "learning_rate": 4.446841534813252e-06, + "loss": 0.6205, + "step": 2185 + }, + { + "epoch": 0.6719950814632647, + "grad_norm": 0.2974420189857483, + "learning_rate": 4.446331316587114e-06, + "loss": 0.5907, + "step": 2186 + }, + { + "epoch": 0.6723024900092223, + "grad_norm": 0.3141501247882843, + "learning_rate": 4.445820892462951e-06, + "loss": 0.6032, + "step": 2187 + }, + { + "epoch": 0.6726098985551798, + "grad_norm": 0.3098449110984802, + "learning_rate": 4.445310262494761e-06, + "loss": 0.6346, + "step": 2188 + }, + { + "epoch": 0.6729173071011374, + "grad_norm": 0.32845935225486755, + "learning_rate": 4.44479942673656e-06, + "loss": 0.6334, + "step": 2189 + }, + { + "epoch": 0.673224715647095, + "grad_norm": 0.3042459487915039, + "learning_rate": 4.44428838524239e-06, + "loss": 0.6209, + "step": 2190 + }, + { + "epoch": 0.6735321241930525, + "grad_norm": 0.32484668493270874, + "learning_rate": 4.443777138066313e-06, + "loss": 0.6231, + "step": 2191 + }, + { + "epoch": 0.6738395327390101, + "grad_norm": 0.2893773019313812, + "learning_rate": 4.443265685262413e-06, + "loss": 0.6333, + "step": 2192 + }, + { + "epoch": 0.6741469412849678, + "grad_norm": 0.32232290506362915, + "learning_rate": 4.442754026884793e-06, + "loss": 0.632, + "step": 2193 + }, + { + "epoch": 0.6744543498309253, + "grad_norm": 0.31659576296806335, + "learning_rate": 4.442242162987583e-06, + "loss": 0.6149, + "step": 2194 + }, + { + "epoch": 0.6747617583768829, + "grad_norm": 0.29838958382606506, + "learning_rate": 4.44173009362493e-06, + "loss": 0.6319, + "step": 2195 + }, + { + "epoch": 0.6750691669228405, + "grad_norm": 0.3054872453212738, + "learning_rate": 4.4412178188510055e-06, + "loss": 0.6252, + "step": 2196 + }, + { + "epoch": 0.675376575468798, + "grad_norm": 0.31894826889038086, + "learning_rate": 4.440705338720002e-06, + "loss": 0.6284, + "step": 2197 + }, + { + "epoch": 0.6756839840147556, + "grad_norm": 0.3020031750202179, + "learning_rate": 4.440192653286134e-06, + "loss": 0.6329, + "step": 2198 + }, + { + "epoch": 0.6759913925607132, + "grad_norm": 0.2982437014579773, + "learning_rate": 4.4396797626036364e-06, + "loss": 0.6509, + "step": 2199 + }, + { + "epoch": 0.6762988011066707, + "grad_norm": 0.3138371706008911, + "learning_rate": 4.439166666726768e-06, + "loss": 0.6324, + "step": 2200 + }, + { + "epoch": 0.6766062096526283, + "grad_norm": 0.3070920407772064, + "learning_rate": 4.4386533657098065e-06, + "loss": 0.602, + "step": 2201 + }, + { + "epoch": 0.676913618198586, + "grad_norm": 0.286702424287796, + "learning_rate": 4.438139859607054e-06, + "loss": 0.6217, + "step": 2202 + }, + { + "epoch": 0.6772210267445435, + "grad_norm": 0.29270443320274353, + "learning_rate": 4.437626148472832e-06, + "loss": 0.6105, + "step": 2203 + }, + { + "epoch": 0.6775284352905011, + "grad_norm": 0.3053779602050781, + "learning_rate": 4.4371122323614854e-06, + "loss": 0.6285, + "step": 2204 + }, + { + "epoch": 0.6778358438364587, + "grad_norm": 0.3123090863227844, + "learning_rate": 4.436598111327381e-06, + "loss": 0.6328, + "step": 2205 + }, + { + "epoch": 0.6781432523824162, + "grad_norm": 0.30964764952659607, + "learning_rate": 4.436083785424907e-06, + "loss": 0.6182, + "step": 2206 + }, + { + "epoch": 0.6784506609283738, + "grad_norm": 0.2976219654083252, + "learning_rate": 4.43556925470847e-06, + "loss": 0.6203, + "step": 2207 + }, + { + "epoch": 0.6787580694743314, + "grad_norm": 0.30432114005088806, + "learning_rate": 4.435054519232505e-06, + "loss": 0.6287, + "step": 2208 + }, + { + "epoch": 0.6790654780202889, + "grad_norm": 0.30061936378479004, + "learning_rate": 4.434539579051461e-06, + "loss": 0.6117, + "step": 2209 + }, + { + "epoch": 0.6793728865662465, + "grad_norm": 0.30404916405677795, + "learning_rate": 4.434024434219813e-06, + "loss": 0.6462, + "step": 2210 + }, + { + "epoch": 0.6796802951122041, + "grad_norm": 0.30231472849845886, + "learning_rate": 4.43350908479206e-06, + "loss": 0.6506, + "step": 2211 + }, + { + "epoch": 0.6799877036581617, + "grad_norm": 0.2908565402030945, + "learning_rate": 4.432993530822717e-06, + "loss": 0.6429, + "step": 2212 + }, + { + "epoch": 0.6802951122041193, + "grad_norm": 0.3062841594219208, + "learning_rate": 4.432477772366323e-06, + "loss": 0.6172, + "step": 2213 + }, + { + "epoch": 0.6806025207500769, + "grad_norm": 0.30602848529815674, + "learning_rate": 4.431961809477439e-06, + "loss": 0.6321, + "step": 2214 + }, + { + "epoch": 0.6809099292960344, + "grad_norm": 0.30355969071388245, + "learning_rate": 4.431445642210651e-06, + "loss": 0.6368, + "step": 2215 + }, + { + "epoch": 0.681217337841992, + "grad_norm": 0.3039747476577759, + "learning_rate": 4.430929270620559e-06, + "loss": 0.6351, + "step": 2216 + }, + { + "epoch": 0.6815247463879496, + "grad_norm": 0.30608007311820984, + "learning_rate": 4.43041269476179e-06, + "loss": 0.6366, + "step": 2217 + }, + { + "epoch": 0.6818321549339071, + "grad_norm": 0.3049275279045105, + "learning_rate": 4.429895914688992e-06, + "loss": 0.6315, + "step": 2218 + }, + { + "epoch": 0.6821395634798647, + "grad_norm": 0.30615904927253723, + "learning_rate": 4.429378930456834e-06, + "loss": 0.6342, + "step": 2219 + }, + { + "epoch": 0.6824469720258223, + "grad_norm": 0.3073166012763977, + "learning_rate": 4.428861742120006e-06, + "loss": 0.6258, + "step": 2220 + }, + { + "epoch": 0.68275438057178, + "grad_norm": 0.2974025309085846, + "learning_rate": 4.428344349733221e-06, + "loss": 0.6033, + "step": 2221 + }, + { + "epoch": 0.6830617891177375, + "grad_norm": 0.28466713428497314, + "learning_rate": 4.427826753351212e-06, + "loss": 0.6309, + "step": 2222 + }, + { + "epoch": 0.6833691976636951, + "grad_norm": 0.29885002970695496, + "learning_rate": 4.4273089530287345e-06, + "loss": 0.6187, + "step": 2223 + }, + { + "epoch": 0.6836766062096526, + "grad_norm": 0.2982214093208313, + "learning_rate": 4.426790948820566e-06, + "loss": 0.6373, + "step": 2224 + }, + { + "epoch": 0.6839840147556102, + "grad_norm": 0.301616907119751, + "learning_rate": 4.426272740781504e-06, + "loss": 0.6339, + "step": 2225 + }, + { + "epoch": 0.6842914233015678, + "grad_norm": 0.30134618282318115, + "learning_rate": 4.425754328966369e-06, + "loss": 0.6573, + "step": 2226 + }, + { + "epoch": 0.6845988318475253, + "grad_norm": 0.29481416940689087, + "learning_rate": 4.425235713430004e-06, + "loss": 0.6393, + "step": 2227 + }, + { + "epoch": 0.6849062403934829, + "grad_norm": 0.30418622493743896, + "learning_rate": 4.42471689422727e-06, + "loss": 0.6049, + "step": 2228 + }, + { + "epoch": 0.6852136489394405, + "grad_norm": 0.30411219596862793, + "learning_rate": 4.424197871413053e-06, + "loss": 0.6412, + "step": 2229 + }, + { + "epoch": 0.685521057485398, + "grad_norm": 0.3045949637889862, + "learning_rate": 4.4236786450422585e-06, + "loss": 0.659, + "step": 2230 + }, + { + "epoch": 0.6858284660313557, + "grad_norm": 0.3289847671985626, + "learning_rate": 4.423159215169815e-06, + "loss": 0.6405, + "step": 2231 + }, + { + "epoch": 0.6861358745773133, + "grad_norm": 0.30936646461486816, + "learning_rate": 4.422639581850672e-06, + "loss": 0.6503, + "step": 2232 + }, + { + "epoch": 0.6864432831232709, + "grad_norm": 0.2978769540786743, + "learning_rate": 4.4221197451398e-06, + "loss": 0.6407, + "step": 2233 + }, + { + "epoch": 0.6867506916692284, + "grad_norm": 0.3109564185142517, + "learning_rate": 4.4215997050921905e-06, + "loss": 0.6183, + "step": 2234 + }, + { + "epoch": 0.687058100215186, + "grad_norm": 0.31553366780281067, + "learning_rate": 4.421079461762858e-06, + "loss": 0.6282, + "step": 2235 + }, + { + "epoch": 0.6873655087611436, + "grad_norm": 0.310350626707077, + "learning_rate": 4.420559015206839e-06, + "loss": 0.6188, + "step": 2236 + }, + { + "epoch": 0.6876729173071011, + "grad_norm": 0.3262034058570862, + "learning_rate": 4.420038365479189e-06, + "loss": 0.619, + "step": 2237 + }, + { + "epoch": 0.6879803258530587, + "grad_norm": 0.3091893792152405, + "learning_rate": 4.4195175126349865e-06, + "loss": 0.6172, + "step": 2238 + }, + { + "epoch": 0.6882877343990162, + "grad_norm": 0.310176819562912, + "learning_rate": 4.418996456729331e-06, + "loss": 0.6255, + "step": 2239 + }, + { + "epoch": 0.6885951429449738, + "grad_norm": 0.32693010568618774, + "learning_rate": 4.418475197817346e-06, + "loss": 0.6348, + "step": 2240 + }, + { + "epoch": 0.6889025514909315, + "grad_norm": 0.31642863154411316, + "learning_rate": 4.417953735954171e-06, + "loss": 0.6156, + "step": 2241 + }, + { + "epoch": 0.6892099600368891, + "grad_norm": 0.3028152287006378, + "learning_rate": 4.417432071194973e-06, + "loss": 0.6306, + "step": 2242 + }, + { + "epoch": 0.6895173685828466, + "grad_norm": 0.3051857650279999, + "learning_rate": 4.416910203594936e-06, + "loss": 0.618, + "step": 2243 + }, + { + "epoch": 0.6898247771288042, + "grad_norm": 0.32833096385002136, + "learning_rate": 4.4163881332092695e-06, + "loss": 0.6298, + "step": 2244 + }, + { + "epoch": 0.6901321856747618, + "grad_norm": 0.3007728159427643, + "learning_rate": 4.415865860093199e-06, + "loss": 0.6389, + "step": 2245 + }, + { + "epoch": 0.6904395942207193, + "grad_norm": 0.2908185124397278, + "learning_rate": 4.415343384301977e-06, + "loss": 0.6253, + "step": 2246 + }, + { + "epoch": 0.6907470027666769, + "grad_norm": 0.28626781702041626, + "learning_rate": 4.414820705890874e-06, + "loss": 0.5858, + "step": 2247 + }, + { + "epoch": 0.6910544113126345, + "grad_norm": 0.32487648725509644, + "learning_rate": 4.4142978249151835e-06, + "loss": 0.6422, + "step": 2248 + }, + { + "epoch": 0.691361819858592, + "grad_norm": 0.3022691309452057, + "learning_rate": 4.4137747414302205e-06, + "loss": 0.6477, + "step": 2249 + }, + { + "epoch": 0.6916692284045497, + "grad_norm": 0.28923526406288147, + "learning_rate": 4.4132514554913184e-06, + "loss": 0.6028, + "step": 2250 + }, + { + "epoch": 0.6919766369505073, + "grad_norm": 0.3042193055152893, + "learning_rate": 4.412727967153837e-06, + "loss": 0.6476, + "step": 2251 + }, + { + "epoch": 0.6922840454964648, + "grad_norm": 0.30875927209854126, + "learning_rate": 4.412204276473154e-06, + "loss": 0.631, + "step": 2252 + }, + { + "epoch": 0.6925914540424224, + "grad_norm": 0.29395127296447754, + "learning_rate": 4.411680383504669e-06, + "loss": 0.619, + "step": 2253 + }, + { + "epoch": 0.69289886258838, + "grad_norm": 0.3117886483669281, + "learning_rate": 4.411156288303804e-06, + "loss": 0.6252, + "step": 2254 + }, + { + "epoch": 0.6932062711343375, + "grad_norm": 0.2902756631374359, + "learning_rate": 4.410631990926002e-06, + "loss": 0.6361, + "step": 2255 + }, + { + "epoch": 0.6935136796802951, + "grad_norm": 0.30556005239486694, + "learning_rate": 4.410107491426726e-06, + "loss": 0.6469, + "step": 2256 + }, + { + "epoch": 0.6938210882262527, + "grad_norm": 0.3043801486492157, + "learning_rate": 4.409582789861464e-06, + "loss": 0.6522, + "step": 2257 + }, + { + "epoch": 0.6941284967722102, + "grad_norm": 0.29736432433128357, + "learning_rate": 4.409057886285722e-06, + "loss": 0.628, + "step": 2258 + }, + { + "epoch": 0.6944359053181678, + "grad_norm": 0.30432215332984924, + "learning_rate": 4.408532780755026e-06, + "loss": 0.6224, + "step": 2259 + }, + { + "epoch": 0.6947433138641255, + "grad_norm": 0.3050527572631836, + "learning_rate": 4.408007473324929e-06, + "loss": 0.6262, + "step": 2260 + }, + { + "epoch": 0.695050722410083, + "grad_norm": 0.3033735156059265, + "learning_rate": 4.407481964051001e-06, + "loss": 0.6332, + "step": 2261 + }, + { + "epoch": 0.6953581309560406, + "grad_norm": 0.29335054755210876, + "learning_rate": 4.406956252988834e-06, + "loss": 0.6199, + "step": 2262 + }, + { + "epoch": 0.6956655395019982, + "grad_norm": 0.3077775239944458, + "learning_rate": 4.406430340194042e-06, + "loss": 0.6305, + "step": 2263 + }, + { + "epoch": 0.6959729480479557, + "grad_norm": 0.29911714792251587, + "learning_rate": 4.405904225722261e-06, + "loss": 0.625, + "step": 2264 + }, + { + "epoch": 0.6962803565939133, + "grad_norm": 0.3108196258544922, + "learning_rate": 4.405377909629145e-06, + "loss": 0.6137, + "step": 2265 + }, + { + "epoch": 0.6965877651398709, + "grad_norm": 0.29731661081314087, + "learning_rate": 4.404851391970375e-06, + "loss": 0.6216, + "step": 2266 + }, + { + "epoch": 0.6968951736858284, + "grad_norm": 0.31382039189338684, + "learning_rate": 4.404324672801647e-06, + "loss": 0.5996, + "step": 2267 + }, + { + "epoch": 0.697202582231786, + "grad_norm": 0.3134569227695465, + "learning_rate": 4.403797752178685e-06, + "loss": 0.6453, + "step": 2268 + }, + { + "epoch": 0.6975099907777437, + "grad_norm": 0.3103843927383423, + "learning_rate": 4.403270630157228e-06, + "loss": 0.6582, + "step": 2269 + }, + { + "epoch": 0.6978173993237012, + "grad_norm": 0.3061358630657196, + "learning_rate": 4.40274330679304e-06, + "loss": 0.6236, + "step": 2270 + }, + { + "epoch": 0.6981248078696588, + "grad_norm": 0.31670641899108887, + "learning_rate": 4.402215782141904e-06, + "loss": 0.6557, + "step": 2271 + }, + { + "epoch": 0.6984322164156164, + "grad_norm": 0.300375372171402, + "learning_rate": 4.401688056259629e-06, + "loss": 0.6369, + "step": 2272 + }, + { + "epoch": 0.6987396249615739, + "grad_norm": 0.3140697777271271, + "learning_rate": 4.401160129202038e-06, + "loss": 0.6211, + "step": 2273 + }, + { + "epoch": 0.6990470335075315, + "grad_norm": 0.31066766381263733, + "learning_rate": 4.400632001024982e-06, + "loss": 0.6521, + "step": 2274 + }, + { + "epoch": 0.6993544420534891, + "grad_norm": 0.33040499687194824, + "learning_rate": 4.4001036717843295e-06, + "loss": 0.6286, + "step": 2275 + }, + { + "epoch": 0.6996618505994466, + "grad_norm": 0.3188686966896057, + "learning_rate": 4.39957514153597e-06, + "loss": 0.6519, + "step": 2276 + }, + { + "epoch": 0.6999692591454042, + "grad_norm": 0.3246189057826996, + "learning_rate": 4.399046410335818e-06, + "loss": 0.6289, + "step": 2277 + }, + { + "epoch": 0.7002766676913618, + "grad_norm": 0.2942262887954712, + "learning_rate": 4.398517478239806e-06, + "loss": 0.6396, + "step": 2278 + }, + { + "epoch": 0.7005840762373194, + "grad_norm": 0.30195149779319763, + "learning_rate": 4.397988345303887e-06, + "loss": 0.6333, + "step": 2279 + }, + { + "epoch": 0.700891484783277, + "grad_norm": 0.3045705258846283, + "learning_rate": 4.397459011584039e-06, + "loss": 0.6154, + "step": 2280 + }, + { + "epoch": 0.7011988933292346, + "grad_norm": 0.3070095181465149, + "learning_rate": 4.3969294771362566e-06, + "loss": 0.6316, + "step": 2281 + }, + { + "epoch": 0.7015063018751921, + "grad_norm": 0.3247891366481781, + "learning_rate": 4.39639974201656e-06, + "loss": 0.6266, + "step": 2282 + }, + { + "epoch": 0.7018137104211497, + "grad_norm": 0.30934110283851624, + "learning_rate": 4.395869806280989e-06, + "loss": 0.6388, + "step": 2283 + }, + { + "epoch": 0.7021211189671073, + "grad_norm": 0.31538987159729004, + "learning_rate": 4.395339669985602e-06, + "loss": 0.6285, + "step": 2284 + }, + { + "epoch": 0.7024285275130648, + "grad_norm": 0.30565935373306274, + "learning_rate": 4.394809333186483e-06, + "loss": 0.6638, + "step": 2285 + }, + { + "epoch": 0.7027359360590224, + "grad_norm": 0.32694822549819946, + "learning_rate": 4.394278795939735e-06, + "loss": 0.6379, + "step": 2286 + }, + { + "epoch": 0.70304334460498, + "grad_norm": 0.3252955973148346, + "learning_rate": 4.3937480583014815e-06, + "loss": 0.6258, + "step": 2287 + }, + { + "epoch": 0.7033507531509376, + "grad_norm": 0.3027402460575104, + "learning_rate": 4.393217120327868e-06, + "loss": 0.6213, + "step": 2288 + }, + { + "epoch": 0.7036581616968952, + "grad_norm": 0.3013116419315338, + "learning_rate": 4.392685982075061e-06, + "loss": 0.6399, + "step": 2289 + }, + { + "epoch": 0.7039655702428528, + "grad_norm": 0.31085869669914246, + "learning_rate": 4.39215464359925e-06, + "loss": 0.6282, + "step": 2290 + }, + { + "epoch": 0.7042729787888103, + "grad_norm": 0.3201334476470947, + "learning_rate": 4.391623104956644e-06, + "loss": 0.6144, + "step": 2291 + }, + { + "epoch": 0.7045803873347679, + "grad_norm": 0.2974223494529724, + "learning_rate": 4.391091366203471e-06, + "loss": 0.6169, + "step": 2292 + }, + { + "epoch": 0.7048877958807255, + "grad_norm": 0.3017517626285553, + "learning_rate": 4.390559427395984e-06, + "loss": 0.6215, + "step": 2293 + }, + { + "epoch": 0.705195204426683, + "grad_norm": 0.29883119463920593, + "learning_rate": 4.390027288590456e-06, + "loss": 0.6389, + "step": 2294 + }, + { + "epoch": 0.7055026129726406, + "grad_norm": 0.31952181458473206, + "learning_rate": 4.38949494984318e-06, + "loss": 0.6543, + "step": 2295 + }, + { + "epoch": 0.7058100215185982, + "grad_norm": 0.3148316740989685, + "learning_rate": 4.3889624112104715e-06, + "loss": 0.6321, + "step": 2296 + }, + { + "epoch": 0.7061174300645557, + "grad_norm": 0.29396432638168335, + "learning_rate": 4.388429672748666e-06, + "loss": 0.6269, + "step": 2297 + }, + { + "epoch": 0.7064248386105134, + "grad_norm": 0.29572829604148865, + "learning_rate": 4.38789673451412e-06, + "loss": 0.626, + "step": 2298 + }, + { + "epoch": 0.706732247156471, + "grad_norm": 0.3040049076080322, + "learning_rate": 4.3873635965632144e-06, + "loss": 0.6466, + "step": 2299 + }, + { + "epoch": 0.7070396557024285, + "grad_norm": 0.3051271140575409, + "learning_rate": 4.386830258952346e-06, + "loss": 0.6365, + "step": 2300 + }, + { + "epoch": 0.7073470642483861, + "grad_norm": 0.2902343273162842, + "learning_rate": 4.386296721737937e-06, + "loss": 0.6199, + "step": 2301 + }, + { + "epoch": 0.7076544727943437, + "grad_norm": 0.3021601140499115, + "learning_rate": 4.385762984976428e-06, + "loss": 0.6384, + "step": 2302 + }, + { + "epoch": 0.7079618813403012, + "grad_norm": 0.2954258918762207, + "learning_rate": 4.385229048724283e-06, + "loss": 0.6283, + "step": 2303 + }, + { + "epoch": 0.7082692898862588, + "grad_norm": 0.29449549317359924, + "learning_rate": 4.384694913037985e-06, + "loss": 0.6256, + "step": 2304 + }, + { + "epoch": 0.7085766984322164, + "grad_norm": 0.2973698079586029, + "learning_rate": 4.3841605779740395e-06, + "loss": 0.6259, + "step": 2305 + }, + { + "epoch": 0.708884106978174, + "grad_norm": 0.3153115510940552, + "learning_rate": 4.383626043588972e-06, + "loss": 0.6713, + "step": 2306 + }, + { + "epoch": 0.7091915155241316, + "grad_norm": 0.3046559691429138, + "learning_rate": 4.38309130993933e-06, + "loss": 0.6141, + "step": 2307 + }, + { + "epoch": 0.7094989240700892, + "grad_norm": 0.3107219934463501, + "learning_rate": 4.382556377081683e-06, + "loss": 0.6357, + "step": 2308 + }, + { + "epoch": 0.7098063326160468, + "grad_norm": 0.29752087593078613, + "learning_rate": 4.382021245072618e-06, + "loss": 0.6301, + "step": 2309 + }, + { + "epoch": 0.7101137411620043, + "grad_norm": 0.29124629497528076, + "learning_rate": 4.381485913968747e-06, + "loss": 0.6312, + "step": 2310 + }, + { + "epoch": 0.7104211497079619, + "grad_norm": 0.30363526940345764, + "learning_rate": 4.380950383826702e-06, + "loss": 0.6523, + "step": 2311 + }, + { + "epoch": 0.7107285582539195, + "grad_norm": 0.3029852509498596, + "learning_rate": 4.3804146547031335e-06, + "loss": 0.623, + "step": 2312 + }, + { + "epoch": 0.711035966799877, + "grad_norm": 0.3085462152957916, + "learning_rate": 4.379878726654717e-06, + "loss": 0.629, + "step": 2313 + }, + { + "epoch": 0.7113433753458346, + "grad_norm": 0.29750269651412964, + "learning_rate": 4.379342599738146e-06, + "loss": 0.6249, + "step": 2314 + }, + { + "epoch": 0.7116507838917921, + "grad_norm": 0.3007909655570984, + "learning_rate": 4.378806274010136e-06, + "loss": 0.6089, + "step": 2315 + }, + { + "epoch": 0.7119581924377497, + "grad_norm": 0.30618202686309814, + "learning_rate": 4.378269749527425e-06, + "loss": 0.6122, + "step": 2316 + }, + { + "epoch": 0.7122656009837074, + "grad_norm": 0.30295053124427795, + "learning_rate": 4.377733026346769e-06, + "loss": 0.6116, + "step": 2317 + }, + { + "epoch": 0.712573009529665, + "grad_norm": 0.2990586757659912, + "learning_rate": 4.377196104524948e-06, + "loss": 0.6019, + "step": 2318 + }, + { + "epoch": 0.7128804180756225, + "grad_norm": 0.30710235238075256, + "learning_rate": 4.376658984118763e-06, + "loss": 0.6667, + "step": 2319 + }, + { + "epoch": 0.7131878266215801, + "grad_norm": 0.30798718333244324, + "learning_rate": 4.3761216651850314e-06, + "loss": 0.6491, + "step": 2320 + }, + { + "epoch": 0.7134952351675377, + "grad_norm": 0.30121591687202454, + "learning_rate": 4.375584147780597e-06, + "loss": 0.632, + "step": 2321 + }, + { + "epoch": 0.7138026437134952, + "grad_norm": 0.28627774119377136, + "learning_rate": 4.3750464319623235e-06, + "loss": 0.6136, + "step": 2322 + }, + { + "epoch": 0.7141100522594528, + "grad_norm": 0.2961050868034363, + "learning_rate": 4.3745085177870915e-06, + "loss": 0.6305, + "step": 2323 + }, + { + "epoch": 0.7144174608054104, + "grad_norm": 0.30248257517814636, + "learning_rate": 4.373970405311809e-06, + "loss": 0.6268, + "step": 2324 + }, + { + "epoch": 0.7147248693513679, + "grad_norm": 0.3004158139228821, + "learning_rate": 4.373432094593401e-06, + "loss": 0.635, + "step": 2325 + }, + { + "epoch": 0.7150322778973256, + "grad_norm": 0.3215925097465515, + "learning_rate": 4.372893585688813e-06, + "loss": 0.656, + "step": 2326 + }, + { + "epoch": 0.7153396864432832, + "grad_norm": 0.3164115846157074, + "learning_rate": 4.372354878655013e-06, + "loss": 0.6266, + "step": 2327 + }, + { + "epoch": 0.7156470949892407, + "grad_norm": 0.304446816444397, + "learning_rate": 4.37181597354899e-06, + "loss": 0.6313, + "step": 2328 + }, + { + "epoch": 0.7159545035351983, + "grad_norm": 0.30307045578956604, + "learning_rate": 4.3712768704277535e-06, + "loss": 0.6404, + "step": 2329 + }, + { + "epoch": 0.7162619120811559, + "grad_norm": 0.30101802945137024, + "learning_rate": 4.3707375693483336e-06, + "loss": 0.6015, + "step": 2330 + }, + { + "epoch": 0.7165693206271134, + "grad_norm": 0.3050714433193207, + "learning_rate": 4.370198070367783e-06, + "loss": 0.6424, + "step": 2331 + }, + { + "epoch": 0.716876729173071, + "grad_norm": 0.28375548124313354, + "learning_rate": 4.369658373543172e-06, + "loss": 0.6369, + "step": 2332 + }, + { + "epoch": 0.7171841377190286, + "grad_norm": 0.317184716463089, + "learning_rate": 4.369118478931595e-06, + "loss": 0.6268, + "step": 2333 + }, + { + "epoch": 0.7174915462649861, + "grad_norm": 0.33115315437316895, + "learning_rate": 4.368578386590168e-06, + "loss": 0.6371, + "step": 2334 + }, + { + "epoch": 0.7177989548109437, + "grad_norm": 0.31678658723831177, + "learning_rate": 4.368038096576023e-06, + "loss": 0.6295, + "step": 2335 + }, + { + "epoch": 0.7181063633569014, + "grad_norm": 0.31439754366874695, + "learning_rate": 4.367497608946318e-06, + "loss": 0.6325, + "step": 2336 + }, + { + "epoch": 0.7184137719028589, + "grad_norm": 0.28829750418663025, + "learning_rate": 4.3669569237582296e-06, + "loss": 0.6076, + "step": 2337 + }, + { + "epoch": 0.7187211804488165, + "grad_norm": 0.31290125846862793, + "learning_rate": 4.3664160410689556e-06, + "loss": 0.6419, + "step": 2338 + }, + { + "epoch": 0.7190285889947741, + "grad_norm": 0.2996119260787964, + "learning_rate": 4.3658749609357145e-06, + "loss": 0.6133, + "step": 2339 + }, + { + "epoch": 0.7193359975407316, + "grad_norm": 0.30370762944221497, + "learning_rate": 4.3653336834157465e-06, + "loss": 0.6331, + "step": 2340 + }, + { + "epoch": 0.7196434060866892, + "grad_norm": 0.311774879693985, + "learning_rate": 4.364792208566312e-06, + "loss": 0.6249, + "step": 2341 + }, + { + "epoch": 0.7199508146326468, + "grad_norm": 0.3073479235172272, + "learning_rate": 4.3642505364446915e-06, + "loss": 0.6442, + "step": 2342 + }, + { + "epoch": 0.7202582231786043, + "grad_norm": 0.30494141578674316, + "learning_rate": 4.3637086671081894e-06, + "loss": 0.6157, + "step": 2343 + }, + { + "epoch": 0.7205656317245619, + "grad_norm": 0.2920476198196411, + "learning_rate": 4.363166600614126e-06, + "loss": 0.6069, + "step": 2344 + }, + { + "epoch": 0.7208730402705196, + "grad_norm": 0.2933422029018402, + "learning_rate": 4.362624337019848e-06, + "loss": 0.6056, + "step": 2345 + }, + { + "epoch": 0.7211804488164771, + "grad_norm": 0.32173943519592285, + "learning_rate": 4.362081876382719e-06, + "loss": 0.6423, + "step": 2346 + }, + { + "epoch": 0.7214878573624347, + "grad_norm": 0.3099469244480133, + "learning_rate": 4.361539218760123e-06, + "loss": 0.6249, + "step": 2347 + }, + { + "epoch": 0.7217952659083923, + "grad_norm": 0.2973398268222809, + "learning_rate": 4.360996364209471e-06, + "loss": 0.6522, + "step": 2348 + }, + { + "epoch": 0.7221026744543498, + "grad_norm": 0.2906361222267151, + "learning_rate": 4.360453312788185e-06, + "loss": 0.6341, + "step": 2349 + }, + { + "epoch": 0.7224100830003074, + "grad_norm": 0.29723864793777466, + "learning_rate": 4.359910064553718e-06, + "loss": 0.6251, + "step": 2350 + }, + { + "epoch": 0.722717491546265, + "grad_norm": 0.3047962486743927, + "learning_rate": 4.3593666195635355e-06, + "loss": 0.617, + "step": 2351 + }, + { + "epoch": 0.7230249000922225, + "grad_norm": 0.29311221837997437, + "learning_rate": 4.358822977875129e-06, + "loss": 0.6312, + "step": 2352 + }, + { + "epoch": 0.7233323086381801, + "grad_norm": 0.2905018925666809, + "learning_rate": 4.3582791395460085e-06, + "loss": 0.6103, + "step": 2353 + }, + { + "epoch": 0.7236397171841377, + "grad_norm": 0.2964112460613251, + "learning_rate": 4.357735104633706e-06, + "loss": 0.639, + "step": 2354 + }, + { + "epoch": 0.7239471257300953, + "grad_norm": 0.30444905161857605, + "learning_rate": 4.357190873195774e-06, + "loss": 0.6365, + "step": 2355 + }, + { + "epoch": 0.7242545342760529, + "grad_norm": 0.29664650559425354, + "learning_rate": 4.356646445289784e-06, + "loss": 0.6057, + "step": 2356 + }, + { + "epoch": 0.7245619428220105, + "grad_norm": 0.2961133122444153, + "learning_rate": 4.356101820973332e-06, + "loss": 0.6412, + "step": 2357 + }, + { + "epoch": 0.724869351367968, + "grad_norm": 0.3044832944869995, + "learning_rate": 4.355557000304031e-06, + "loss": 0.6382, + "step": 2358 + }, + { + "epoch": 0.7251767599139256, + "grad_norm": 0.30839037895202637, + "learning_rate": 4.355011983339516e-06, + "loss": 0.634, + "step": 2359 + }, + { + "epoch": 0.7254841684598832, + "grad_norm": 0.30081722140312195, + "learning_rate": 4.354466770137445e-06, + "loss": 0.6291, + "step": 2360 + }, + { + "epoch": 0.7257915770058407, + "grad_norm": 0.31568461656570435, + "learning_rate": 4.353921360755492e-06, + "loss": 0.6139, + "step": 2361 + }, + { + "epoch": 0.7260989855517983, + "grad_norm": 0.3050088584423065, + "learning_rate": 4.353375755251358e-06, + "loss": 0.6172, + "step": 2362 + }, + { + "epoch": 0.7264063940977559, + "grad_norm": 0.30875450372695923, + "learning_rate": 4.352829953682759e-06, + "loss": 0.6467, + "step": 2363 + }, + { + "epoch": 0.7267138026437135, + "grad_norm": 0.3128209710121155, + "learning_rate": 4.352283956107435e-06, + "loss": 0.6293, + "step": 2364 + }, + { + "epoch": 0.7270212111896711, + "grad_norm": 0.3009471595287323, + "learning_rate": 4.3517377625831455e-06, + "loss": 0.6228, + "step": 2365 + }, + { + "epoch": 0.7273286197356287, + "grad_norm": 0.29608386754989624, + "learning_rate": 4.351191373167672e-06, + "loss": 0.6414, + "step": 2366 + }, + { + "epoch": 0.7276360282815862, + "grad_norm": 0.29667893052101135, + "learning_rate": 4.350644787918815e-06, + "loss": 0.6224, + "step": 2367 + }, + { + "epoch": 0.7279434368275438, + "grad_norm": 0.29360437393188477, + "learning_rate": 4.350098006894396e-06, + "loss": 0.6114, + "step": 2368 + }, + { + "epoch": 0.7282508453735014, + "grad_norm": 0.29947909712791443, + "learning_rate": 4.3495510301522595e-06, + "loss": 0.6017, + "step": 2369 + }, + { + "epoch": 0.7285582539194589, + "grad_norm": 0.2971738278865814, + "learning_rate": 4.349003857750267e-06, + "loss": 0.6217, + "step": 2370 + }, + { + "epoch": 0.7288656624654165, + "grad_norm": 0.3016505539417267, + "learning_rate": 4.348456489746303e-06, + "loss": 0.6301, + "step": 2371 + }, + { + "epoch": 0.7291730710113741, + "grad_norm": 0.29933926463127136, + "learning_rate": 4.347908926198274e-06, + "loss": 0.6021, + "step": 2372 + }, + { + "epoch": 0.7294804795573316, + "grad_norm": 0.30990907549858093, + "learning_rate": 4.347361167164104e-06, + "loss": 0.6307, + "step": 2373 + }, + { + "epoch": 0.7297878881032893, + "grad_norm": 0.2813735008239746, + "learning_rate": 4.346813212701739e-06, + "loss": 0.6024, + "step": 2374 + }, + { + "epoch": 0.7300952966492469, + "grad_norm": 0.30995872616767883, + "learning_rate": 4.346265062869147e-06, + "loss": 0.64, + "step": 2375 + }, + { + "epoch": 0.7304027051952044, + "grad_norm": 0.302640825510025, + "learning_rate": 4.345716717724315e-06, + "loss": 0.6501, + "step": 2376 + }, + { + "epoch": 0.730710113741162, + "grad_norm": 0.2969285249710083, + "learning_rate": 4.345168177325251e-06, + "loss": 0.6201, + "step": 2377 + }, + { + "epoch": 0.7310175222871196, + "grad_norm": 0.2934682369232178, + "learning_rate": 4.344619441729984e-06, + "loss": 0.6043, + "step": 2378 + }, + { + "epoch": 0.7313249308330771, + "grad_norm": 0.307385116815567, + "learning_rate": 4.3440705109965635e-06, + "loss": 0.6192, + "step": 2379 + }, + { + "epoch": 0.7316323393790347, + "grad_norm": 0.29822251200675964, + "learning_rate": 4.34352138518306e-06, + "loss": 0.6197, + "step": 2380 + }, + { + "epoch": 0.7319397479249923, + "grad_norm": 0.30719172954559326, + "learning_rate": 4.342972064347564e-06, + "loss": 0.6187, + "step": 2381 + }, + { + "epoch": 0.7322471564709498, + "grad_norm": 0.3037765622138977, + "learning_rate": 4.342422548548186e-06, + "loss": 0.6172, + "step": 2382 + }, + { + "epoch": 0.7325545650169075, + "grad_norm": 0.29930052161216736, + "learning_rate": 4.34187283784306e-06, + "loss": 0.6537, + "step": 2383 + }, + { + "epoch": 0.7328619735628651, + "grad_norm": 0.3045055568218231, + "learning_rate": 4.341322932290337e-06, + "loss": 0.6394, + "step": 2384 + }, + { + "epoch": 0.7331693821088227, + "grad_norm": 0.30624327063560486, + "learning_rate": 4.340772831948191e-06, + "loss": 0.6182, + "step": 2385 + }, + { + "epoch": 0.7334767906547802, + "grad_norm": 0.3049914836883545, + "learning_rate": 4.340222536874816e-06, + "loss": 0.6412, + "step": 2386 + }, + { + "epoch": 0.7337841992007378, + "grad_norm": 0.2970142066478729, + "learning_rate": 4.3396720471284255e-06, + "loss": 0.6374, + "step": 2387 + }, + { + "epoch": 0.7340916077466954, + "grad_norm": 0.3164125978946686, + "learning_rate": 4.339121362767256e-06, + "loss": 0.64, + "step": 2388 + }, + { + "epoch": 0.7343990162926529, + "grad_norm": 0.29766884446144104, + "learning_rate": 4.338570483849562e-06, + "loss": 0.6362, + "step": 2389 + }, + { + "epoch": 0.7347064248386105, + "grad_norm": 0.28777968883514404, + "learning_rate": 4.3380194104336204e-06, + "loss": 0.634, + "step": 2390 + }, + { + "epoch": 0.735013833384568, + "grad_norm": 0.3099069595336914, + "learning_rate": 4.3374681425777275e-06, + "loss": 0.5998, + "step": 2391 + }, + { + "epoch": 0.7353212419305256, + "grad_norm": 0.29564371705055237, + "learning_rate": 4.336916680340202e-06, + "loss": 0.6427, + "step": 2392 + }, + { + "epoch": 0.7356286504764833, + "grad_norm": 0.2965984046459198, + "learning_rate": 4.33636502377938e-06, + "loss": 0.6271, + "step": 2393 + }, + { + "epoch": 0.7359360590224409, + "grad_norm": 0.3099779784679413, + "learning_rate": 4.335813172953621e-06, + "loss": 0.615, + "step": 2394 + }, + { + "epoch": 0.7362434675683984, + "grad_norm": 0.2961738705635071, + "learning_rate": 4.335261127921304e-06, + "loss": 0.622, + "step": 2395 + }, + { + "epoch": 0.736550876114356, + "grad_norm": 0.29632386565208435, + "learning_rate": 4.334708888740829e-06, + "loss": 0.6045, + "step": 2396 + }, + { + "epoch": 0.7368582846603136, + "grad_norm": 0.31231534481048584, + "learning_rate": 4.334156455470616e-06, + "loss": 0.6082, + "step": 2397 + }, + { + "epoch": 0.7371656932062711, + "grad_norm": 0.29632842540740967, + "learning_rate": 4.333603828169104e-06, + "loss": 0.6252, + "step": 2398 + }, + { + "epoch": 0.7374731017522287, + "grad_norm": 0.30079004168510437, + "learning_rate": 4.333051006894757e-06, + "loss": 0.614, + "step": 2399 + }, + { + "epoch": 0.7377805102981863, + "grad_norm": 0.3032827377319336, + "learning_rate": 4.332497991706054e-06, + "loss": 0.6371, + "step": 2400 + }, + { + "epoch": 0.7380879188441438, + "grad_norm": 0.31305134296417236, + "learning_rate": 4.331944782661499e-06, + "loss": 0.638, + "step": 2401 + }, + { + "epoch": 0.7383953273901015, + "grad_norm": 0.2989344000816345, + "learning_rate": 4.331391379819615e-06, + "loss": 0.622, + "step": 2402 + }, + { + "epoch": 0.7387027359360591, + "grad_norm": 0.3300643861293793, + "learning_rate": 4.330837783238944e-06, + "loss": 0.6503, + "step": 2403 + }, + { + "epoch": 0.7390101444820166, + "grad_norm": 0.3089626431465149, + "learning_rate": 4.3302839929780505e-06, + "loss": 0.6276, + "step": 2404 + }, + { + "epoch": 0.7393175530279742, + "grad_norm": 0.3069544732570648, + "learning_rate": 4.329730009095518e-06, + "loss": 0.6434, + "step": 2405 + }, + { + "epoch": 0.7396249615739318, + "grad_norm": 0.3012744188308716, + "learning_rate": 4.329175831649952e-06, + "loss": 0.6103, + "step": 2406 + }, + { + "epoch": 0.7399323701198893, + "grad_norm": 0.2992340624332428, + "learning_rate": 4.328621460699978e-06, + "loss": 0.669, + "step": 2407 + }, + { + "epoch": 0.7402397786658469, + "grad_norm": 0.31130585074424744, + "learning_rate": 4.32806689630424e-06, + "loss": 0.6311, + "step": 2408 + }, + { + "epoch": 0.7405471872118045, + "grad_norm": 0.2986952066421509, + "learning_rate": 4.327512138521406e-06, + "loss": 0.6245, + "step": 2409 + }, + { + "epoch": 0.740854595757762, + "grad_norm": 0.29707324504852295, + "learning_rate": 4.326957187410161e-06, + "loss": 0.6504, + "step": 2410 + }, + { + "epoch": 0.7411620043037196, + "grad_norm": 0.30634474754333496, + "learning_rate": 4.326402043029213e-06, + "loss": 0.6271, + "step": 2411 + }, + { + "epoch": 0.7414694128496773, + "grad_norm": 0.30534693598747253, + "learning_rate": 4.32584670543729e-06, + "loss": 0.6383, + "step": 2412 + }, + { + "epoch": 0.7417768213956348, + "grad_norm": 0.3028537333011627, + "learning_rate": 4.325291174693138e-06, + "loss": 0.622, + "step": 2413 + }, + { + "epoch": 0.7420842299415924, + "grad_norm": 0.3008033037185669, + "learning_rate": 4.324735450855528e-06, + "loss": 0.6127, + "step": 2414 + }, + { + "epoch": 0.74239163848755, + "grad_norm": 0.30875301361083984, + "learning_rate": 4.324179533983246e-06, + "loss": 0.6338, + "step": 2415 + }, + { + "epoch": 0.7426990470335075, + "grad_norm": 0.31111931800842285, + "learning_rate": 4.323623424135104e-06, + "loss": 0.6299, + "step": 2416 + }, + { + "epoch": 0.7430064555794651, + "grad_norm": 0.3100573718547821, + "learning_rate": 4.323067121369929e-06, + "loss": 0.6252, + "step": 2417 + }, + { + "epoch": 0.7433138641254227, + "grad_norm": 0.2990139424800873, + "learning_rate": 4.322510625746572e-06, + "loss": 0.6481, + "step": 2418 + }, + { + "epoch": 0.7436212726713802, + "grad_norm": 0.2919752597808838, + "learning_rate": 4.321953937323904e-06, + "loss": 0.6075, + "step": 2419 + }, + { + "epoch": 0.7439286812173378, + "grad_norm": 0.3053419291973114, + "learning_rate": 4.321397056160815e-06, + "loss": 0.611, + "step": 2420 + }, + { + "epoch": 0.7442360897632955, + "grad_norm": 0.3024568259716034, + "learning_rate": 4.3208399823162175e-06, + "loss": 0.6094, + "step": 2421 + }, + { + "epoch": 0.744543498309253, + "grad_norm": 0.30447298288345337, + "learning_rate": 4.320282715849042e-06, + "loss": 0.6552, + "step": 2422 + }, + { + "epoch": 0.7448509068552106, + "grad_norm": 0.31399697065353394, + "learning_rate": 4.31972525681824e-06, + "loss": 0.6194, + "step": 2423 + }, + { + "epoch": 0.7451583154011682, + "grad_norm": 0.3164733052253723, + "learning_rate": 4.3191676052827855e-06, + "loss": 0.6237, + "step": 2424 + }, + { + "epoch": 0.7454657239471257, + "grad_norm": 0.3002210855484009, + "learning_rate": 4.31860976130167e-06, + "loss": 0.6104, + "step": 2425 + }, + { + "epoch": 0.7457731324930833, + "grad_norm": 0.3003298342227936, + "learning_rate": 4.318051724933906e-06, + "loss": 0.6546, + "step": 2426 + }, + { + "epoch": 0.7460805410390409, + "grad_norm": 0.30995237827301025, + "learning_rate": 4.317493496238529e-06, + "loss": 0.6377, + "step": 2427 + }, + { + "epoch": 0.7463879495849984, + "grad_norm": 0.30517780780792236, + "learning_rate": 4.31693507527459e-06, + "loss": 0.6284, + "step": 2428 + }, + { + "epoch": 0.746695358130956, + "grad_norm": 0.3016275465488434, + "learning_rate": 4.316376462101165e-06, + "loss": 0.6171, + "step": 2429 + }, + { + "epoch": 0.7470027666769136, + "grad_norm": 0.3008437156677246, + "learning_rate": 4.315817656777348e-06, + "loss": 0.6439, + "step": 2430 + }, + { + "epoch": 0.7473101752228712, + "grad_norm": 0.29322195053100586, + "learning_rate": 4.315258659362254e-06, + "loss": 0.5793, + "step": 2431 + }, + { + "epoch": 0.7476175837688288, + "grad_norm": 0.3116666078567505, + "learning_rate": 4.314699469915018e-06, + "loss": 0.6269, + "step": 2432 + }, + { + "epoch": 0.7479249923147864, + "grad_norm": 0.3027917444705963, + "learning_rate": 4.314140088494795e-06, + "loss": 0.6037, + "step": 2433 + }, + { + "epoch": 0.7482324008607439, + "grad_norm": 0.3279725909233093, + "learning_rate": 4.31358051516076e-06, + "loss": 0.6169, + "step": 2434 + }, + { + "epoch": 0.7485398094067015, + "grad_norm": 0.30106624960899353, + "learning_rate": 4.313020749972112e-06, + "loss": 0.6276, + "step": 2435 + }, + { + "epoch": 0.7488472179526591, + "grad_norm": 0.3057149648666382, + "learning_rate": 4.312460792988064e-06, + "loss": 0.6324, + "step": 2436 + }, + { + "epoch": 0.7491546264986166, + "grad_norm": 0.2916492521762848, + "learning_rate": 4.311900644267853e-06, + "loss": 0.6382, + "step": 2437 + }, + { + "epoch": 0.7494620350445742, + "grad_norm": 0.3044597804546356, + "learning_rate": 4.311340303870739e-06, + "loss": 0.6282, + "step": 2438 + }, + { + "epoch": 0.7497694435905318, + "grad_norm": 0.2920282781124115, + "learning_rate": 4.310779771855995e-06, + "loss": 0.6222, + "step": 2439 + }, + { + "epoch": 0.7500768521364894, + "grad_norm": 0.2844346761703491, + "learning_rate": 4.310219048282921e-06, + "loss": 0.6271, + "step": 2440 + }, + { + "epoch": 0.750384260682447, + "grad_norm": 0.30267179012298584, + "learning_rate": 4.309658133210834e-06, + "loss": 0.6297, + "step": 2441 + }, + { + "epoch": 0.7506916692284046, + "grad_norm": 0.2998878061771393, + "learning_rate": 4.3090970266990715e-06, + "loss": 0.6145, + "step": 2442 + }, + { + "epoch": 0.7509990777743621, + "grad_norm": 0.3298640251159668, + "learning_rate": 4.308535728806993e-06, + "loss": 0.6116, + "step": 2443 + }, + { + "epoch": 0.7513064863203197, + "grad_norm": 0.29277703166007996, + "learning_rate": 4.307974239593975e-06, + "loss": 0.6074, + "step": 2444 + }, + { + "epoch": 0.7516138948662773, + "grad_norm": 0.31157049536705017, + "learning_rate": 4.307412559119418e-06, + "loss": 0.6344, + "step": 2445 + }, + { + "epoch": 0.7519213034122348, + "grad_norm": 0.3166014850139618, + "learning_rate": 4.3068506874427405e-06, + "loss": 0.6438, + "step": 2446 + }, + { + "epoch": 0.7522287119581924, + "grad_norm": 0.2980503439903259, + "learning_rate": 4.306288624623381e-06, + "loss": 0.6522, + "step": 2447 + }, + { + "epoch": 0.75253612050415, + "grad_norm": 0.2943282425403595, + "learning_rate": 4.3057263707207995e-06, + "loss": 0.6116, + "step": 2448 + }, + { + "epoch": 0.7528435290501075, + "grad_norm": 0.29266157746315, + "learning_rate": 4.305163925794475e-06, + "loss": 0.6294, + "step": 2449 + }, + { + "epoch": 0.7531509375960652, + "grad_norm": 0.2946892976760864, + "learning_rate": 4.304601289903907e-06, + "loss": 0.6192, + "step": 2450 + }, + { + "epoch": 0.7534583461420228, + "grad_norm": 0.29160642623901367, + "learning_rate": 4.304038463108616e-06, + "loss": 0.6277, + "step": 2451 + }, + { + "epoch": 0.7537657546879803, + "grad_norm": 0.2973109483718872, + "learning_rate": 4.3034754454681425e-06, + "loss": 0.6275, + "step": 2452 + }, + { + "epoch": 0.7540731632339379, + "grad_norm": 0.30840739607810974, + "learning_rate": 4.302912237042048e-06, + "loss": 0.6038, + "step": 2453 + }, + { + "epoch": 0.7543805717798955, + "grad_norm": 0.30278027057647705, + "learning_rate": 4.30234883788991e-06, + "loss": 0.636, + "step": 2454 + }, + { + "epoch": 0.754687980325853, + "grad_norm": 0.3025553226470947, + "learning_rate": 4.301785248071331e-06, + "loss": 0.6406, + "step": 2455 + }, + { + "epoch": 0.7549953888718106, + "grad_norm": 0.3072817921638489, + "learning_rate": 4.3012214676459326e-06, + "loss": 0.6322, + "step": 2456 + }, + { + "epoch": 0.7553027974177682, + "grad_norm": 0.3026777505874634, + "learning_rate": 4.300657496673355e-06, + "loss": 0.6278, + "step": 2457 + }, + { + "epoch": 0.7556102059637257, + "grad_norm": 0.3045984208583832, + "learning_rate": 4.300093335213259e-06, + "loss": 0.6381, + "step": 2458 + }, + { + "epoch": 0.7559176145096834, + "grad_norm": 0.30145400762557983, + "learning_rate": 4.299528983325327e-06, + "loss": 0.6437, + "step": 2459 + }, + { + "epoch": 0.756225023055641, + "grad_norm": 0.3057374954223633, + "learning_rate": 4.298964441069259e-06, + "loss": 0.6403, + "step": 2460 + }, + { + "epoch": 0.7565324316015986, + "grad_norm": 0.29623696208000183, + "learning_rate": 4.298399708504779e-06, + "loss": 0.6085, + "step": 2461 + }, + { + "epoch": 0.7568398401475561, + "grad_norm": 0.3011881411075592, + "learning_rate": 4.297834785691626e-06, + "loss": 0.6292, + "step": 2462 + }, + { + "epoch": 0.7571472486935137, + "grad_norm": 0.31721457839012146, + "learning_rate": 4.297269672689565e-06, + "loss": 0.6363, + "step": 2463 + }, + { + "epoch": 0.7574546572394713, + "grad_norm": 0.3141317069530487, + "learning_rate": 4.296704369558375e-06, + "loss": 0.6193, + "step": 2464 + }, + { + "epoch": 0.7577620657854288, + "grad_norm": 0.29915767908096313, + "learning_rate": 4.296138876357861e-06, + "loss": 0.5986, + "step": 2465 + }, + { + "epoch": 0.7580694743313864, + "grad_norm": 0.3048783242702484, + "learning_rate": 4.2955731931478425e-06, + "loss": 0.6308, + "step": 2466 + }, + { + "epoch": 0.758376882877344, + "grad_norm": 0.297248899936676, + "learning_rate": 4.295007319988163e-06, + "loss": 0.624, + "step": 2467 + }, + { + "epoch": 0.7586842914233015, + "grad_norm": 0.29690074920654297, + "learning_rate": 4.294441256938686e-06, + "loss": 0.6317, + "step": 2468 + }, + { + "epoch": 0.7589916999692592, + "grad_norm": 0.30204084515571594, + "learning_rate": 4.293875004059292e-06, + "loss": 0.6331, + "step": 2469 + }, + { + "epoch": 0.7592991085152168, + "grad_norm": 0.3002704977989197, + "learning_rate": 4.293308561409886e-06, + "loss": 0.6336, + "step": 2470 + }, + { + "epoch": 0.7596065170611743, + "grad_norm": 0.29486894607543945, + "learning_rate": 4.2927419290503895e-06, + "loss": 0.6335, + "step": 2471 + }, + { + "epoch": 0.7599139256071319, + "grad_norm": 0.3063690662384033, + "learning_rate": 4.292175107040745e-06, + "loss": 0.6447, + "step": 2472 + }, + { + "epoch": 0.7602213341530895, + "grad_norm": 0.3007512092590332, + "learning_rate": 4.291608095440915e-06, + "loss": 0.6205, + "step": 2473 + }, + { + "epoch": 0.760528742699047, + "grad_norm": 0.29112547636032104, + "learning_rate": 4.2910408943108835e-06, + "loss": 0.6051, + "step": 2474 + }, + { + "epoch": 0.7608361512450046, + "grad_norm": 0.3002714216709137, + "learning_rate": 4.290473503710653e-06, + "loss": 0.6447, + "step": 2475 + }, + { + "epoch": 0.7611435597909622, + "grad_norm": 0.29345548152923584, + "learning_rate": 4.289905923700245e-06, + "loss": 0.6285, + "step": 2476 + }, + { + "epoch": 0.7614509683369197, + "grad_norm": 0.2942519783973694, + "learning_rate": 4.289338154339706e-06, + "loss": 0.6317, + "step": 2477 + }, + { + "epoch": 0.7617583768828774, + "grad_norm": 0.3049924075603485, + "learning_rate": 4.288770195689096e-06, + "loss": 0.6485, + "step": 2478 + }, + { + "epoch": 0.762065785428835, + "grad_norm": 0.2921378016471863, + "learning_rate": 4.288202047808498e-06, + "loss": 0.6279, + "step": 2479 + }, + { + "epoch": 0.7623731939747925, + "grad_norm": 0.3123618960380554, + "learning_rate": 4.287633710758017e-06, + "loss": 0.6238, + "step": 2480 + }, + { + "epoch": 0.7626806025207501, + "grad_norm": 0.2977310121059418, + "learning_rate": 4.287065184597776e-06, + "loss": 0.632, + "step": 2481 + }, + { + "epoch": 0.7629880110667077, + "grad_norm": 0.301406592130661, + "learning_rate": 4.286496469387917e-06, + "loss": 0.6123, + "step": 2482 + }, + { + "epoch": 0.7632954196126652, + "grad_norm": 0.28823596239089966, + "learning_rate": 4.285927565188602e-06, + "loss": 0.5969, + "step": 2483 + }, + { + "epoch": 0.7636028281586228, + "grad_norm": 0.29607269167900085, + "learning_rate": 4.285358472060016e-06, + "loss": 0.6159, + "step": 2484 + }, + { + "epoch": 0.7639102367045804, + "grad_norm": 0.3106372058391571, + "learning_rate": 4.284789190062363e-06, + "loss": 0.6297, + "step": 2485 + }, + { + "epoch": 0.7642176452505379, + "grad_norm": 0.2826676368713379, + "learning_rate": 4.284219719255863e-06, + "loss": 0.6297, + "step": 2486 + }, + { + "epoch": 0.7645250537964955, + "grad_norm": 0.3014405369758606, + "learning_rate": 4.283650059700761e-06, + "loss": 0.6247, + "step": 2487 + }, + { + "epoch": 0.7648324623424532, + "grad_norm": 0.3036655783653259, + "learning_rate": 4.283080211457321e-06, + "loss": 0.6442, + "step": 2488 + }, + { + "epoch": 0.7651398708884107, + "grad_norm": 0.3107497990131378, + "learning_rate": 4.282510174585823e-06, + "loss": 0.625, + "step": 2489 + }, + { + "epoch": 0.7654472794343683, + "grad_norm": 0.2904626727104187, + "learning_rate": 4.281939949146573e-06, + "loss": 0.6145, + "step": 2490 + }, + { + "epoch": 0.7657546879803259, + "grad_norm": 0.30235421657562256, + "learning_rate": 4.281369535199892e-06, + "loss": 0.6408, + "step": 2491 + }, + { + "epoch": 0.7660620965262834, + "grad_norm": 0.3001440763473511, + "learning_rate": 4.280798932806123e-06, + "loss": 0.6413, + "step": 2492 + }, + { + "epoch": 0.766369505072241, + "grad_norm": 0.29928141832351685, + "learning_rate": 4.280228142025629e-06, + "loss": 0.6188, + "step": 2493 + }, + { + "epoch": 0.7666769136181986, + "grad_norm": 0.30460506677627563, + "learning_rate": 4.279657162918794e-06, + "loss": 0.6405, + "step": 2494 + }, + { + "epoch": 0.7669843221641561, + "grad_norm": 0.31585919857025146, + "learning_rate": 4.279085995546017e-06, + "loss": 0.6131, + "step": 2495 + }, + { + "epoch": 0.7672917307101137, + "grad_norm": 0.3080865740776062, + "learning_rate": 4.278514639967724e-06, + "loss": 0.6346, + "step": 2496 + }, + { + "epoch": 0.7675991392560714, + "grad_norm": 0.30653369426727295, + "learning_rate": 4.277943096244356e-06, + "loss": 0.6355, + "step": 2497 + }, + { + "epoch": 0.7679065478020289, + "grad_norm": 0.2990642488002777, + "learning_rate": 4.277371364436376e-06, + "loss": 0.6303, + "step": 2498 + }, + { + "epoch": 0.7682139563479865, + "grad_norm": 0.2991875112056732, + "learning_rate": 4.276799444604266e-06, + "loss": 0.6349, + "step": 2499 + }, + { + "epoch": 0.7685213648939441, + "grad_norm": 0.2983812987804413, + "learning_rate": 4.2762273368085276e-06, + "loss": 0.6214, + "step": 2500 + }, + { + "epoch": 0.7688287734399016, + "grad_norm": 0.30244308710098267, + "learning_rate": 4.2756550411096826e-06, + "loss": 0.6224, + "step": 2501 + }, + { + "epoch": 0.7691361819858592, + "grad_norm": 0.3083246946334839, + "learning_rate": 4.2750825575682745e-06, + "loss": 0.6252, + "step": 2502 + }, + { + "epoch": 0.7694435905318168, + "grad_norm": 0.29712435603141785, + "learning_rate": 4.274509886244863e-06, + "loss": 0.6263, + "step": 2503 + }, + { + "epoch": 0.7697509990777743, + "grad_norm": 0.2947876453399658, + "learning_rate": 4.27393702720003e-06, + "loss": 0.6359, + "step": 2504 + }, + { + "epoch": 0.7700584076237319, + "grad_norm": 0.2900857925415039, + "learning_rate": 4.273363980494379e-06, + "loss": 0.6178, + "step": 2505 + }, + { + "epoch": 0.7703658161696895, + "grad_norm": 0.2881874442100525, + "learning_rate": 4.272790746188529e-06, + "loss": 0.6182, + "step": 2506 + }, + { + "epoch": 0.7706732247156471, + "grad_norm": 0.29537343978881836, + "learning_rate": 4.272217324343123e-06, + "loss": 0.6178, + "step": 2507 + }, + { + "epoch": 0.7709806332616047, + "grad_norm": 0.2884519100189209, + "learning_rate": 4.27164371501882e-06, + "loss": 0.6309, + "step": 2508 + }, + { + "epoch": 0.7712880418075623, + "grad_norm": 0.3048785924911499, + "learning_rate": 4.271069918276304e-06, + "loss": 0.6255, + "step": 2509 + }, + { + "epoch": 0.7715954503535198, + "grad_norm": 0.3029156029224396, + "learning_rate": 4.270495934176272e-06, + "loss": 0.6541, + "step": 2510 + }, + { + "epoch": 0.7719028588994774, + "grad_norm": 0.3001176416873932, + "learning_rate": 4.269921762779447e-06, + "loss": 0.6126, + "step": 2511 + }, + { + "epoch": 0.772210267445435, + "grad_norm": 0.2964211702346802, + "learning_rate": 4.2693474041465676e-06, + "loss": 0.6434, + "step": 2512 + }, + { + "epoch": 0.7725176759913925, + "grad_norm": 0.28966790437698364, + "learning_rate": 4.268772858338395e-06, + "loss": 0.6137, + "step": 2513 + }, + { + "epoch": 0.7728250845373501, + "grad_norm": 0.30313220620155334, + "learning_rate": 4.268198125415709e-06, + "loss": 0.6391, + "step": 2514 + }, + { + "epoch": 0.7731324930833077, + "grad_norm": 0.3071134388446808, + "learning_rate": 4.26762320543931e-06, + "loss": 0.6385, + "step": 2515 + }, + { + "epoch": 0.7734399016292653, + "grad_norm": 0.3366715610027313, + "learning_rate": 4.267048098470015e-06, + "loss": 0.6162, + "step": 2516 + }, + { + "epoch": 0.7737473101752229, + "grad_norm": 0.3106440305709839, + "learning_rate": 4.266472804568666e-06, + "loss": 0.6086, + "step": 2517 + }, + { + "epoch": 0.7740547187211805, + "grad_norm": 0.31793564558029175, + "learning_rate": 4.265897323796121e-06, + "loss": 0.6555, + "step": 2518 + }, + { + "epoch": 0.774362127267138, + "grad_norm": 0.3083655536174774, + "learning_rate": 4.265321656213259e-06, + "loss": 0.6303, + "step": 2519 + }, + { + "epoch": 0.7746695358130956, + "grad_norm": 0.3000026047229767, + "learning_rate": 4.264745801880977e-06, + "loss": 0.6314, + "step": 2520 + }, + { + "epoch": 0.7749769443590532, + "grad_norm": 0.3096294105052948, + "learning_rate": 4.264169760860196e-06, + "loss": 0.6171, + "step": 2521 + }, + { + "epoch": 0.7752843529050107, + "grad_norm": 0.3231581449508667, + "learning_rate": 4.2635935332118526e-06, + "loss": 0.6449, + "step": 2522 + }, + { + "epoch": 0.7755917614509683, + "grad_norm": 0.3002406060695648, + "learning_rate": 4.2630171189969046e-06, + "loss": 0.6296, + "step": 2523 + }, + { + "epoch": 0.7758991699969259, + "grad_norm": 0.2927456200122833, + "learning_rate": 4.262440518276331e-06, + "loss": 0.62, + "step": 2524 + }, + { + "epoch": 0.7762065785428834, + "grad_norm": 0.3076331615447998, + "learning_rate": 4.261863731111127e-06, + "loss": 0.625, + "step": 2525 + }, + { + "epoch": 0.7765139870888411, + "grad_norm": 0.3296608626842499, + "learning_rate": 4.261286757562311e-06, + "loss": 0.6073, + "step": 2526 + }, + { + "epoch": 0.7768213956347987, + "grad_norm": 0.31735920906066895, + "learning_rate": 4.2607095976909185e-06, + "loss": 0.6277, + "step": 2527 + }, + { + "epoch": 0.7771288041807562, + "grad_norm": 0.3382786810398102, + "learning_rate": 4.260132251558007e-06, + "loss": 0.6283, + "step": 2528 + }, + { + "epoch": 0.7774362127267138, + "grad_norm": 0.3382722735404968, + "learning_rate": 4.259554719224652e-06, + "loss": 0.6195, + "step": 2529 + }, + { + "epoch": 0.7777436212726714, + "grad_norm": 0.2980000972747803, + "learning_rate": 4.258977000751951e-06, + "loss": 0.6493, + "step": 2530 + }, + { + "epoch": 0.778051029818629, + "grad_norm": 0.3005400598049164, + "learning_rate": 4.258399096201017e-06, + "loss": 0.6305, + "step": 2531 + }, + { + "epoch": 0.7783584383645865, + "grad_norm": 0.3153238594532013, + "learning_rate": 4.257821005632987e-06, + "loss": 0.628, + "step": 2532 + }, + { + "epoch": 0.7786658469105441, + "grad_norm": 0.3177393972873688, + "learning_rate": 4.257242729109016e-06, + "loss": 0.6252, + "step": 2533 + }, + { + "epoch": 0.7789732554565016, + "grad_norm": 0.3164929151535034, + "learning_rate": 4.256664266690277e-06, + "loss": 0.6312, + "step": 2534 + }, + { + "epoch": 0.7792806640024593, + "grad_norm": 0.3075389266014099, + "learning_rate": 4.256085618437966e-06, + "loss": 0.6329, + "step": 2535 + }, + { + "epoch": 0.7795880725484169, + "grad_norm": 0.312110036611557, + "learning_rate": 4.2555067844132955e-06, + "loss": 0.6405, + "step": 2536 + }, + { + "epoch": 0.7798954810943745, + "grad_norm": 0.3229072391986847, + "learning_rate": 4.254927764677499e-06, + "loss": 0.6369, + "step": 2537 + }, + { + "epoch": 0.780202889640332, + "grad_norm": 0.29848405718803406, + "learning_rate": 4.254348559291832e-06, + "loss": 0.6245, + "step": 2538 + }, + { + "epoch": 0.7805102981862896, + "grad_norm": 0.29484057426452637, + "learning_rate": 4.253769168317564e-06, + "loss": 0.613, + "step": 2539 + }, + { + "epoch": 0.7808177067322472, + "grad_norm": 0.3101823329925537, + "learning_rate": 4.253189591815991e-06, + "loss": 0.612, + "step": 2540 + }, + { + "epoch": 0.7811251152782047, + "grad_norm": 0.3092913031578064, + "learning_rate": 4.252609829848423e-06, + "loss": 0.6546, + "step": 2541 + }, + { + "epoch": 0.7814325238241623, + "grad_norm": 0.29712167382240295, + "learning_rate": 4.252029882476191e-06, + "loss": 0.6106, + "step": 2542 + }, + { + "epoch": 0.7817399323701198, + "grad_norm": 0.3047908842563629, + "learning_rate": 4.251449749760648e-06, + "loss": 0.6069, + "step": 2543 + }, + { + "epoch": 0.7820473409160774, + "grad_norm": 0.31976741552352905, + "learning_rate": 4.250869431763164e-06, + "loss": 0.6358, + "step": 2544 + }, + { + "epoch": 0.7823547494620351, + "grad_norm": 0.316275417804718, + "learning_rate": 4.2502889285451295e-06, + "loss": 0.6339, + "step": 2545 + }, + { + "epoch": 0.7826621580079927, + "grad_norm": 0.29747214913368225, + "learning_rate": 4.249708240167956e-06, + "loss": 0.6539, + "step": 2546 + }, + { + "epoch": 0.7829695665539502, + "grad_norm": 0.3253827393054962, + "learning_rate": 4.249127366693071e-06, + "loss": 0.6464, + "step": 2547 + }, + { + "epoch": 0.7832769750999078, + "grad_norm": 0.3038269281387329, + "learning_rate": 4.248546308181926e-06, + "loss": 0.6296, + "step": 2548 + }, + { + "epoch": 0.7835843836458654, + "grad_norm": 0.315327525138855, + "learning_rate": 4.247965064695988e-06, + "loss": 0.6275, + "step": 2549 + }, + { + "epoch": 0.7838917921918229, + "grad_norm": 0.3113914430141449, + "learning_rate": 4.247383636296747e-06, + "loss": 0.6381, + "step": 2550 + }, + { + "epoch": 0.7841992007377805, + "grad_norm": 0.29868197441101074, + "learning_rate": 4.246802023045709e-06, + "loss": 0.6097, + "step": 2551 + }, + { + "epoch": 0.784506609283738, + "grad_norm": 0.2946641743183136, + "learning_rate": 4.246220225004405e-06, + "loss": 0.6395, + "step": 2552 + }, + { + "epoch": 0.7848140178296956, + "grad_norm": 0.315939337015152, + "learning_rate": 4.2456382422343775e-06, + "loss": 0.6102, + "step": 2553 + }, + { + "epoch": 0.7851214263756533, + "grad_norm": 0.30631911754608154, + "learning_rate": 4.245056074797197e-06, + "loss": 0.6091, + "step": 2554 + }, + { + "epoch": 0.7854288349216109, + "grad_norm": 0.29854002594947815, + "learning_rate": 4.244473722754449e-06, + "loss": 0.5908, + "step": 2555 + }, + { + "epoch": 0.7857362434675684, + "grad_norm": 0.3007999360561371, + "learning_rate": 4.243891186167737e-06, + "loss": 0.6262, + "step": 2556 + }, + { + "epoch": 0.786043652013526, + "grad_norm": 0.3145750164985657, + "learning_rate": 4.243308465098689e-06, + "loss": 0.6286, + "step": 2557 + }, + { + "epoch": 0.7863510605594836, + "grad_norm": 0.3130640685558319, + "learning_rate": 4.242725559608948e-06, + "loss": 0.6403, + "step": 2558 + }, + { + "epoch": 0.7866584691054411, + "grad_norm": 0.3111433684825897, + "learning_rate": 4.242142469760178e-06, + "loss": 0.6212, + "step": 2559 + }, + { + "epoch": 0.7869658776513987, + "grad_norm": 0.30323687195777893, + "learning_rate": 4.241559195614065e-06, + "loss": 0.6199, + "step": 2560 + }, + { + "epoch": 0.7872732861973563, + "grad_norm": 0.32196271419525146, + "learning_rate": 4.24097573723231e-06, + "loss": 0.6212, + "step": 2561 + }, + { + "epoch": 0.7875806947433138, + "grad_norm": 0.2959735691547394, + "learning_rate": 4.240392094676637e-06, + "loss": 0.6314, + "step": 2562 + }, + { + "epoch": 0.7878881032892714, + "grad_norm": 0.3035019338130951, + "learning_rate": 4.239808268008787e-06, + "loss": 0.6071, + "step": 2563 + }, + { + "epoch": 0.7881955118352291, + "grad_norm": 0.3111043870449066, + "learning_rate": 4.239224257290523e-06, + "loss": 0.6229, + "step": 2564 + }, + { + "epoch": 0.7885029203811866, + "grad_norm": 0.30196595191955566, + "learning_rate": 4.238640062583625e-06, + "loss": 0.6235, + "step": 2565 + }, + { + "epoch": 0.7888103289271442, + "grad_norm": 0.310310035943985, + "learning_rate": 4.238055683949896e-06, + "loss": 0.6271, + "step": 2566 + }, + { + "epoch": 0.7891177374731018, + "grad_norm": 0.3293777108192444, + "learning_rate": 4.237471121451153e-06, + "loss": 0.6281, + "step": 2567 + }, + { + "epoch": 0.7894251460190593, + "grad_norm": 0.3111010789871216, + "learning_rate": 4.236886375149238e-06, + "loss": 0.6434, + "step": 2568 + }, + { + "epoch": 0.7897325545650169, + "grad_norm": 0.30308717489242554, + "learning_rate": 4.236301445106008e-06, + "loss": 0.6246, + "step": 2569 + }, + { + "epoch": 0.7900399631109745, + "grad_norm": 0.3082054853439331, + "learning_rate": 4.235716331383343e-06, + "loss": 0.632, + "step": 2570 + }, + { + "epoch": 0.790347371656932, + "grad_norm": 0.301236629486084, + "learning_rate": 4.23513103404314e-06, + "loss": 0.6101, + "step": 2571 + }, + { + "epoch": 0.7906547802028896, + "grad_norm": 0.29852575063705444, + "learning_rate": 4.234545553147318e-06, + "loss": 0.6495, + "step": 2572 + }, + { + "epoch": 0.7909621887488473, + "grad_norm": 0.30029964447021484, + "learning_rate": 4.233959888757811e-06, + "loss": 0.6375, + "step": 2573 + }, + { + "epoch": 0.7912695972948048, + "grad_norm": 0.2866511642932892, + "learning_rate": 4.233374040936577e-06, + "loss": 0.6158, + "step": 2574 + }, + { + "epoch": 0.7915770058407624, + "grad_norm": 0.3045900762081146, + "learning_rate": 4.232788009745591e-06, + "loss": 0.6291, + "step": 2575 + }, + { + "epoch": 0.79188441438672, + "grad_norm": 0.2887575626373291, + "learning_rate": 4.232201795246847e-06, + "loss": 0.6369, + "step": 2576 + }, + { + "epoch": 0.7921918229326775, + "grad_norm": 0.3066348135471344, + "learning_rate": 4.231615397502361e-06, + "loss": 0.6148, + "step": 2577 + }, + { + "epoch": 0.7924992314786351, + "grad_norm": 0.30953332781791687, + "learning_rate": 4.231028816574166e-06, + "loss": 0.6196, + "step": 2578 + }, + { + "epoch": 0.7928066400245927, + "grad_norm": 0.3096400797367096, + "learning_rate": 4.230442052524314e-06, + "loss": 0.6383, + "step": 2579 + }, + { + "epoch": 0.7931140485705502, + "grad_norm": 0.30168232321739197, + "learning_rate": 4.229855105414879e-06, + "loss": 0.6178, + "step": 2580 + }, + { + "epoch": 0.7934214571165078, + "grad_norm": 0.3116748631000519, + "learning_rate": 4.229267975307953e-06, + "loss": 0.6028, + "step": 2581 + }, + { + "epoch": 0.7937288656624654, + "grad_norm": 0.29654526710510254, + "learning_rate": 4.228680662265646e-06, + "loss": 0.6212, + "step": 2582 + }, + { + "epoch": 0.794036274208423, + "grad_norm": 0.3189421594142914, + "learning_rate": 4.228093166350088e-06, + "loss": 0.6254, + "step": 2583 + }, + { + "epoch": 0.7943436827543806, + "grad_norm": 0.3036354184150696, + "learning_rate": 4.227505487623431e-06, + "loss": 0.6473, + "step": 2584 + }, + { + "epoch": 0.7946510913003382, + "grad_norm": 0.3184194564819336, + "learning_rate": 4.226917626147843e-06, + "loss": 0.6288, + "step": 2585 + }, + { + "epoch": 0.7949584998462957, + "grad_norm": 0.31039395928382874, + "learning_rate": 4.226329581985512e-06, + "loss": 0.6402, + "step": 2586 + }, + { + "epoch": 0.7952659083922533, + "grad_norm": 0.28654035925865173, + "learning_rate": 4.225741355198647e-06, + "loss": 0.6326, + "step": 2587 + }, + { + "epoch": 0.7955733169382109, + "grad_norm": 0.30573615431785583, + "learning_rate": 4.225152945849475e-06, + "loss": 0.6294, + "step": 2588 + }, + { + "epoch": 0.7958807254841684, + "grad_norm": 0.31071093678474426, + "learning_rate": 4.224564354000241e-06, + "loss": 0.6399, + "step": 2589 + }, + { + "epoch": 0.796188134030126, + "grad_norm": 0.2852582037448883, + "learning_rate": 4.223975579713213e-06, + "loss": 0.6189, + "step": 2590 + }, + { + "epoch": 0.7964955425760836, + "grad_norm": 0.29512330889701843, + "learning_rate": 4.223386623050675e-06, + "loss": 0.6116, + "step": 2591 + }, + { + "epoch": 0.7968029511220412, + "grad_norm": 0.3036036193370819, + "learning_rate": 4.2227974840749305e-06, + "loss": 0.6228, + "step": 2592 + }, + { + "epoch": 0.7971103596679988, + "grad_norm": 0.300504207611084, + "learning_rate": 4.222208162848305e-06, + "loss": 0.6008, + "step": 2593 + }, + { + "epoch": 0.7974177682139564, + "grad_norm": 0.30004993081092834, + "learning_rate": 4.221618659433141e-06, + "loss": 0.6122, + "step": 2594 + }, + { + "epoch": 0.797725176759914, + "grad_norm": 0.30380678176879883, + "learning_rate": 4.221028973891799e-06, + "loss": 0.6067, + "step": 2595 + }, + { + "epoch": 0.7980325853058715, + "grad_norm": 0.2968532145023346, + "learning_rate": 4.220439106286664e-06, + "loss": 0.624, + "step": 2596 + }, + { + "epoch": 0.7983399938518291, + "grad_norm": 0.2984146773815155, + "learning_rate": 4.2198490566801335e-06, + "loss": 0.615, + "step": 2597 + }, + { + "epoch": 0.7986474023977866, + "grad_norm": 0.32451874017715454, + "learning_rate": 4.219258825134629e-06, + "loss": 0.6469, + "step": 2598 + }, + { + "epoch": 0.7989548109437442, + "grad_norm": 0.30999088287353516, + "learning_rate": 4.218668411712589e-06, + "loss": 0.6408, + "step": 2599 + }, + { + "epoch": 0.7992622194897018, + "grad_norm": 0.3293072283267975, + "learning_rate": 4.218077816476473e-06, + "loss": 0.6205, + "step": 2600 + }, + { + "epoch": 0.7995696280356593, + "grad_norm": 0.30580657720565796, + "learning_rate": 4.217487039488758e-06, + "loss": 0.6376, + "step": 2601 + }, + { + "epoch": 0.799877036581617, + "grad_norm": 0.30849817395210266, + "learning_rate": 4.216896080811942e-06, + "loss": 0.6211, + "step": 2602 + }, + { + "epoch": 0.8001844451275746, + "grad_norm": 0.3035764694213867, + "learning_rate": 4.21630494050854e-06, + "loss": 0.6379, + "step": 2603 + }, + { + "epoch": 0.8004918536735321, + "grad_norm": 0.30357903242111206, + "learning_rate": 4.215713618641087e-06, + "loss": 0.5936, + "step": 2604 + }, + { + "epoch": 0.8007992622194897, + "grad_norm": 0.3110692799091339, + "learning_rate": 4.2151221152721385e-06, + "loss": 0.6432, + "step": 2605 + }, + { + "epoch": 0.8011066707654473, + "grad_norm": 0.2813165485858917, + "learning_rate": 4.2145304304642695e-06, + "loss": 0.6314, + "step": 2606 + }, + { + "epoch": 0.8014140793114048, + "grad_norm": 0.3039419651031494, + "learning_rate": 4.213938564280072e-06, + "loss": 0.6126, + "step": 2607 + }, + { + "epoch": 0.8017214878573624, + "grad_norm": 0.30776897072792053, + "learning_rate": 4.213346516782155e-06, + "loss": 0.6183, + "step": 2608 + }, + { + "epoch": 0.80202889640332, + "grad_norm": 0.31240999698638916, + "learning_rate": 4.212754288033155e-06, + "loss": 0.6175, + "step": 2609 + }, + { + "epoch": 0.8023363049492775, + "grad_norm": 0.32089120149612427, + "learning_rate": 4.212161878095721e-06, + "loss": 0.6328, + "step": 2610 + }, + { + "epoch": 0.8026437134952352, + "grad_norm": 0.3115231990814209, + "learning_rate": 4.211569287032521e-06, + "loss": 0.6439, + "step": 2611 + }, + { + "epoch": 0.8029511220411928, + "grad_norm": 0.30546456575393677, + "learning_rate": 4.210976514906246e-06, + "loss": 0.6447, + "step": 2612 + }, + { + "epoch": 0.8032585305871504, + "grad_norm": 0.302080363035202, + "learning_rate": 4.210383561779602e-06, + "loss": 0.6408, + "step": 2613 + }, + { + "epoch": 0.8035659391331079, + "grad_norm": 0.3028517961502075, + "learning_rate": 4.209790427715317e-06, + "loss": 0.6552, + "step": 2614 + }, + { + "epoch": 0.8038733476790655, + "grad_norm": 0.29901477694511414, + "learning_rate": 4.209197112776138e-06, + "loss": 0.6252, + "step": 2615 + }, + { + "epoch": 0.804180756225023, + "grad_norm": 0.29798436164855957, + "learning_rate": 4.2086036170248305e-06, + "loss": 0.6415, + "step": 2616 + }, + { + "epoch": 0.8044881647709806, + "grad_norm": 0.3101266324520111, + "learning_rate": 4.208009940524177e-06, + "loss": 0.6434, + "step": 2617 + }, + { + "epoch": 0.8047955733169382, + "grad_norm": 0.3014441132545471, + "learning_rate": 4.207416083336984e-06, + "loss": 0.6128, + "step": 2618 + }, + { + "epoch": 0.8051029818628958, + "grad_norm": 0.2980823814868927, + "learning_rate": 4.206822045526073e-06, + "loss": 0.6502, + "step": 2619 + }, + { + "epoch": 0.8054103904088533, + "grad_norm": 0.29623568058013916, + "learning_rate": 4.206227827154285e-06, + "loss": 0.6215, + "step": 2620 + }, + { + "epoch": 0.805717798954811, + "grad_norm": 0.30340760946273804, + "learning_rate": 4.205633428284484e-06, + "loss": 0.6478, + "step": 2621 + }, + { + "epoch": 0.8060252075007686, + "grad_norm": 0.3119695484638214, + "learning_rate": 4.205038848979547e-06, + "loss": 0.6261, + "step": 2622 + }, + { + "epoch": 0.8063326160467261, + "grad_norm": 0.31161633133888245, + "learning_rate": 4.204444089302374e-06, + "loss": 0.6356, + "step": 2623 + }, + { + "epoch": 0.8066400245926837, + "grad_norm": 0.31465819478034973, + "learning_rate": 4.203849149315885e-06, + "loss": 0.6127, + "step": 2624 + }, + { + "epoch": 0.8069474331386413, + "grad_norm": 0.3042815029621124, + "learning_rate": 4.203254029083015e-06, + "loss": 0.6344, + "step": 2625 + }, + { + "epoch": 0.8072548416845988, + "grad_norm": 0.3036349415779114, + "learning_rate": 4.2026587286667215e-06, + "loss": 0.6133, + "step": 2626 + }, + { + "epoch": 0.8075622502305564, + "grad_norm": 0.30203697085380554, + "learning_rate": 4.20206324812998e-06, + "loss": 0.6304, + "step": 2627 + }, + { + "epoch": 0.807869658776514, + "grad_norm": 0.2967226803302765, + "learning_rate": 4.201467587535785e-06, + "loss": 0.6163, + "step": 2628 + }, + { + "epoch": 0.8081770673224715, + "grad_norm": 0.3061487376689911, + "learning_rate": 4.20087174694715e-06, + "loss": 0.638, + "step": 2629 + }, + { + "epoch": 0.8084844758684292, + "grad_norm": 0.31245511770248413, + "learning_rate": 4.2002757264271076e-06, + "loss": 0.623, + "step": 2630 + }, + { + "epoch": 0.8087918844143868, + "grad_norm": 0.31216153502464294, + "learning_rate": 4.1996795260387105e-06, + "loss": 0.6295, + "step": 2631 + }, + { + "epoch": 0.8090992929603443, + "grad_norm": 0.3013874292373657, + "learning_rate": 4.1990831458450275e-06, + "loss": 0.6186, + "step": 2632 + }, + { + "epoch": 0.8094067015063019, + "grad_norm": 0.3282599151134491, + "learning_rate": 4.19848658590915e-06, + "loss": 0.6132, + "step": 2633 + }, + { + "epoch": 0.8097141100522595, + "grad_norm": 0.306295245885849, + "learning_rate": 4.197889846294186e-06, + "loss": 0.6009, + "step": 2634 + }, + { + "epoch": 0.810021518598217, + "grad_norm": 0.30181753635406494, + "learning_rate": 4.197292927063263e-06, + "loss": 0.5922, + "step": 2635 + }, + { + "epoch": 0.8103289271441746, + "grad_norm": 0.30467653274536133, + "learning_rate": 4.196695828279529e-06, + "loss": 0.6291, + "step": 2636 + }, + { + "epoch": 0.8106363356901322, + "grad_norm": 0.3159371614456177, + "learning_rate": 4.196098550006148e-06, + "loss": 0.6276, + "step": 2637 + }, + { + "epoch": 0.8109437442360897, + "grad_norm": 0.30807381868362427, + "learning_rate": 4.195501092306306e-06, + "loss": 0.6022, + "step": 2638 + }, + { + "epoch": 0.8112511527820473, + "grad_norm": 0.3095446527004242, + "learning_rate": 4.194903455243206e-06, + "loss": 0.6285, + "step": 2639 + }, + { + "epoch": 0.811558561328005, + "grad_norm": 0.30841273069381714, + "learning_rate": 4.194305638880071e-06, + "loss": 0.6194, + "step": 2640 + }, + { + "epoch": 0.8118659698739625, + "grad_norm": 0.294185996055603, + "learning_rate": 4.193707643280143e-06, + "loss": 0.6296, + "step": 2641 + }, + { + "epoch": 0.8121733784199201, + "grad_norm": 0.29483821988105774, + "learning_rate": 4.1931094685066805e-06, + "loss": 0.6363, + "step": 2642 + }, + { + "epoch": 0.8124807869658777, + "grad_norm": 0.2865297198295593, + "learning_rate": 4.192511114622967e-06, + "loss": 0.6143, + "step": 2643 + }, + { + "epoch": 0.8127881955118352, + "grad_norm": 0.3086078464984894, + "learning_rate": 4.191912581692297e-06, + "loss": 0.6021, + "step": 2644 + }, + { + "epoch": 0.8130956040577928, + "grad_norm": 0.3185979723930359, + "learning_rate": 4.191313869777991e-06, + "loss": 0.625, + "step": 2645 + }, + { + "epoch": 0.8134030126037504, + "grad_norm": 0.3041331171989441, + "learning_rate": 4.190714978943384e-06, + "loss": 0.6292, + "step": 2646 + }, + { + "epoch": 0.8137104211497079, + "grad_norm": 0.32535797357559204, + "learning_rate": 4.1901159092518304e-06, + "loss": 0.6456, + "step": 2647 + }, + { + "epoch": 0.8140178296956655, + "grad_norm": 0.3362574875354767, + "learning_rate": 4.189516660766707e-06, + "loss": 0.6359, + "step": 2648 + }, + { + "epoch": 0.8143252382416231, + "grad_norm": 0.3000102639198303, + "learning_rate": 4.188917233551405e-06, + "loss": 0.62, + "step": 2649 + }, + { + "epoch": 0.8146326467875807, + "grad_norm": 0.3041217625141144, + "learning_rate": 4.188317627669336e-06, + "loss": 0.6272, + "step": 2650 + }, + { + "epoch": 0.8149400553335383, + "grad_norm": 0.30004453659057617, + "learning_rate": 4.1877178431839325e-06, + "loss": 0.6272, + "step": 2651 + }, + { + "epoch": 0.8152474638794959, + "grad_norm": 0.31063568592071533, + "learning_rate": 4.187117880158643e-06, + "loss": 0.6012, + "step": 2652 + }, + { + "epoch": 0.8155548724254534, + "grad_norm": 0.3147011697292328, + "learning_rate": 4.186517738656939e-06, + "loss": 0.6536, + "step": 2653 + }, + { + "epoch": 0.815862280971411, + "grad_norm": 0.30957475304603577, + "learning_rate": 4.185917418742304e-06, + "loss": 0.6403, + "step": 2654 + }, + { + "epoch": 0.8161696895173686, + "grad_norm": 0.32400521636009216, + "learning_rate": 4.1853169204782475e-06, + "loss": 0.6334, + "step": 2655 + }, + { + "epoch": 0.8164770980633261, + "grad_norm": 0.29500722885131836, + "learning_rate": 4.184716243928294e-06, + "loss": 0.6395, + "step": 2656 + }, + { + "epoch": 0.8167845066092837, + "grad_norm": 0.3190675675868988, + "learning_rate": 4.184115389155987e-06, + "loss": 0.6204, + "step": 2657 + }, + { + "epoch": 0.8170919151552413, + "grad_norm": 0.31017303466796875, + "learning_rate": 4.183514356224891e-06, + "loss": 0.6156, + "step": 2658 + }, + { + "epoch": 0.8173993237011989, + "grad_norm": 0.31462860107421875, + "learning_rate": 4.182913145198587e-06, + "loss": 0.5933, + "step": 2659 + }, + { + "epoch": 0.8177067322471565, + "grad_norm": 0.3060140907764435, + "learning_rate": 4.1823117561406765e-06, + "loss": 0.6309, + "step": 2660 + }, + { + "epoch": 0.8180141407931141, + "grad_norm": 0.347188800573349, + "learning_rate": 4.181710189114777e-06, + "loss": 0.621, + "step": 2661 + }, + { + "epoch": 0.8183215493390716, + "grad_norm": 0.3060716986656189, + "learning_rate": 4.18110844418453e-06, + "loss": 0.6145, + "step": 2662 + }, + { + "epoch": 0.8186289578850292, + "grad_norm": 0.3066619634628296, + "learning_rate": 4.180506521413591e-06, + "loss": 0.6386, + "step": 2663 + }, + { + "epoch": 0.8189363664309868, + "grad_norm": 0.3259820342063904, + "learning_rate": 4.179904420865636e-06, + "loss": 0.614, + "step": 2664 + }, + { + "epoch": 0.8192437749769443, + "grad_norm": 0.30398988723754883, + "learning_rate": 4.17930214260436e-06, + "loss": 0.6219, + "step": 2665 + }, + { + "epoch": 0.8195511835229019, + "grad_norm": 0.30527451634407043, + "learning_rate": 4.178699686693476e-06, + "loss": 0.5997, + "step": 2666 + }, + { + "epoch": 0.8198585920688595, + "grad_norm": 0.3108130991458893, + "learning_rate": 4.178097053196719e-06, + "loss": 0.6243, + "step": 2667 + }, + { + "epoch": 0.820166000614817, + "grad_norm": 0.30927127599716187, + "learning_rate": 4.177494242177837e-06, + "loss": 0.6105, + "step": 2668 + }, + { + "epoch": 0.8204734091607747, + "grad_norm": 0.30970773100852966, + "learning_rate": 4.176891253700603e-06, + "loss": 0.6236, + "step": 2669 + }, + { + "epoch": 0.8207808177067323, + "grad_norm": 0.2977321445941925, + "learning_rate": 4.176288087828804e-06, + "loss": 0.6346, + "step": 2670 + }, + { + "epoch": 0.8210882262526898, + "grad_norm": 0.3113629221916199, + "learning_rate": 4.175684744626247e-06, + "loss": 0.6366, + "step": 2671 + }, + { + "epoch": 0.8213956347986474, + "grad_norm": 0.3136957585811615, + "learning_rate": 4.175081224156759e-06, + "loss": 0.6389, + "step": 2672 + }, + { + "epoch": 0.821703043344605, + "grad_norm": 0.29976144433021545, + "learning_rate": 4.1744775264841865e-06, + "loss": 0.6219, + "step": 2673 + }, + { + "epoch": 0.8220104518905625, + "grad_norm": 0.31472358107566833, + "learning_rate": 4.173873651672392e-06, + "loss": 0.6025, + "step": 2674 + }, + { + "epoch": 0.8223178604365201, + "grad_norm": 0.3071633279323578, + "learning_rate": 4.173269599785258e-06, + "loss": 0.6367, + "step": 2675 + }, + { + "epoch": 0.8226252689824777, + "grad_norm": 0.302686870098114, + "learning_rate": 4.172665370886685e-06, + "loss": 0.6331, + "step": 2676 + }, + { + "epoch": 0.8229326775284352, + "grad_norm": 0.3157960772514343, + "learning_rate": 4.172060965040595e-06, + "loss": 0.6284, + "step": 2677 + }, + { + "epoch": 0.8232400860743929, + "grad_norm": 0.3082062900066376, + "learning_rate": 4.1714563823109265e-06, + "loss": 0.6167, + "step": 2678 + }, + { + "epoch": 0.8235474946203505, + "grad_norm": 0.30192843079566956, + "learning_rate": 4.170851622761635e-06, + "loss": 0.6244, + "step": 2679 + }, + { + "epoch": 0.823854903166308, + "grad_norm": 0.2897443473339081, + "learning_rate": 4.170246686456698e-06, + "loss": 0.623, + "step": 2680 + }, + { + "epoch": 0.8241623117122656, + "grad_norm": 0.3140231668949127, + "learning_rate": 4.169641573460111e-06, + "loss": 0.6122, + "step": 2681 + }, + { + "epoch": 0.8244697202582232, + "grad_norm": 0.2981278598308563, + "learning_rate": 4.169036283835886e-06, + "loss": 0.6137, + "step": 2682 + }, + { + "epoch": 0.8247771288041807, + "grad_norm": 0.30770403146743774, + "learning_rate": 4.168430817648056e-06, + "loss": 0.593, + "step": 2683 + }, + { + "epoch": 0.8250845373501383, + "grad_norm": 0.3164565861225128, + "learning_rate": 4.167825174960673e-06, + "loss": 0.6187, + "step": 2684 + }, + { + "epoch": 0.8253919458960959, + "grad_norm": 0.3019592761993408, + "learning_rate": 4.167219355837804e-06, + "loss": 0.6151, + "step": 2685 + }, + { + "epoch": 0.8256993544420534, + "grad_norm": 0.29768213629722595, + "learning_rate": 4.166613360343539e-06, + "loss": 0.6138, + "step": 2686 + }, + { + "epoch": 0.826006762988011, + "grad_norm": 0.2961913049221039, + "learning_rate": 4.166007188541985e-06, + "loss": 0.6084, + "step": 2687 + }, + { + "epoch": 0.8263141715339687, + "grad_norm": 0.3030428886413574, + "learning_rate": 4.165400840497267e-06, + "loss": 0.6314, + "step": 2688 + }, + { + "epoch": 0.8266215800799263, + "grad_norm": 0.29897409677505493, + "learning_rate": 4.1647943162735294e-06, + "loss": 0.6282, + "step": 2689 + }, + { + "epoch": 0.8269289886258838, + "grad_norm": 0.31468483805656433, + "learning_rate": 4.164187615934936e-06, + "loss": 0.6205, + "step": 2690 + }, + { + "epoch": 0.8272363971718414, + "grad_norm": 0.30147117376327515, + "learning_rate": 4.163580739545666e-06, + "loss": 0.6283, + "step": 2691 + }, + { + "epoch": 0.827543805717799, + "grad_norm": 0.3083139657974243, + "learning_rate": 4.162973687169921e-06, + "loss": 0.6144, + "step": 2692 + }, + { + "epoch": 0.8278512142637565, + "grad_norm": 0.30401790142059326, + "learning_rate": 4.16236645887192e-06, + "loss": 0.6418, + "step": 2693 + }, + { + "epoch": 0.8281586228097141, + "grad_norm": 0.28821030259132385, + "learning_rate": 4.161759054715899e-06, + "loss": 0.6245, + "step": 2694 + }, + { + "epoch": 0.8284660313556717, + "grad_norm": 0.3020317256450653, + "learning_rate": 4.161151474766115e-06, + "loss": 0.6124, + "step": 2695 + }, + { + "epoch": 0.8287734399016292, + "grad_norm": 0.30427253246307373, + "learning_rate": 4.160543719086842e-06, + "loss": 0.6423, + "step": 2696 + }, + { + "epoch": 0.8290808484475869, + "grad_norm": 0.29938164353370667, + "learning_rate": 4.1599357877423745e-06, + "loss": 0.6295, + "step": 2697 + }, + { + "epoch": 0.8293882569935445, + "grad_norm": 0.3055977523326874, + "learning_rate": 4.159327680797023e-06, + "loss": 0.6453, + "step": 2698 + }, + { + "epoch": 0.829695665539502, + "grad_norm": 0.3081665337085724, + "learning_rate": 4.1587193983151165e-06, + "loss": 0.629, + "step": 2699 + }, + { + "epoch": 0.8300030740854596, + "grad_norm": 0.3101694583892822, + "learning_rate": 4.158110940361007e-06, + "loss": 0.6188, + "step": 2700 + }, + { + "epoch": 0.8303104826314172, + "grad_norm": 0.30632349848747253, + "learning_rate": 4.1575023069990585e-06, + "loss": 0.6651, + "step": 2701 + }, + { + "epoch": 0.8306178911773747, + "grad_norm": 0.3048382103443146, + "learning_rate": 4.156893498293659e-06, + "loss": 0.6287, + "step": 2702 + }, + { + "epoch": 0.8309252997233323, + "grad_norm": 0.2912927567958832, + "learning_rate": 4.156284514309213e-06, + "loss": 0.6232, + "step": 2703 + }, + { + "epoch": 0.8312327082692899, + "grad_norm": 0.2979373037815094, + "learning_rate": 4.155675355110143e-06, + "loss": 0.6084, + "step": 2704 + }, + { + "epoch": 0.8315401168152474, + "grad_norm": 0.3070951998233795, + "learning_rate": 4.155066020760889e-06, + "loss": 0.6121, + "step": 2705 + }, + { + "epoch": 0.831847525361205, + "grad_norm": 0.31716129183769226, + "learning_rate": 4.154456511325915e-06, + "loss": 0.6218, + "step": 2706 + }, + { + "epoch": 0.8321549339071627, + "grad_norm": 0.2973363697528839, + "learning_rate": 4.153846826869696e-06, + "loss": 0.5956, + "step": 2707 + }, + { + "epoch": 0.8324623424531202, + "grad_norm": 0.3029475808143616, + "learning_rate": 4.1532369674567314e-06, + "loss": 0.6334, + "step": 2708 + }, + { + "epoch": 0.8327697509990778, + "grad_norm": 0.30333325266838074, + "learning_rate": 4.152626933151536e-06, + "loss": 0.6536, + "step": 2709 + }, + { + "epoch": 0.8330771595450354, + "grad_norm": 0.30468985438346863, + "learning_rate": 4.1520167240186435e-06, + "loss": 0.634, + "step": 2710 + }, + { + "epoch": 0.8333845680909929, + "grad_norm": 0.3006296455860138, + "learning_rate": 4.1514063401226075e-06, + "loss": 0.6252, + "step": 2711 + }, + { + "epoch": 0.8336919766369505, + "grad_norm": 0.3172493875026703, + "learning_rate": 4.150795781527999e-06, + "loss": 0.6193, + "step": 2712 + }, + { + "epoch": 0.8339993851829081, + "grad_norm": 0.3026213049888611, + "learning_rate": 4.150185048299406e-06, + "loss": 0.6551, + "step": 2713 + }, + { + "epoch": 0.8343067937288656, + "grad_norm": 0.3101213872432709, + "learning_rate": 4.149574140501439e-06, + "loss": 0.6393, + "step": 2714 + }, + { + "epoch": 0.8346142022748232, + "grad_norm": 0.2982063591480255, + "learning_rate": 4.1489630581987224e-06, + "loss": 0.6424, + "step": 2715 + }, + { + "epoch": 0.8349216108207809, + "grad_norm": 0.31529107689857483, + "learning_rate": 4.148351801455904e-06, + "loss": 0.6245, + "step": 2716 + }, + { + "epoch": 0.8352290193667384, + "grad_norm": 0.3097776472568512, + "learning_rate": 4.147740370337645e-06, + "loss": 0.6275, + "step": 2717 + }, + { + "epoch": 0.835536427912696, + "grad_norm": 0.2996264100074768, + "learning_rate": 4.147128764908627e-06, + "loss": 0.5998, + "step": 2718 + }, + { + "epoch": 0.8358438364586536, + "grad_norm": 0.30986517667770386, + "learning_rate": 4.146516985233552e-06, + "loss": 0.6322, + "step": 2719 + }, + { + "epoch": 0.8361512450046111, + "grad_norm": 0.30868449807167053, + "learning_rate": 4.145905031377138e-06, + "loss": 0.5998, + "step": 2720 + }, + { + "epoch": 0.8364586535505687, + "grad_norm": 0.2943165600299835, + "learning_rate": 4.145292903404122e-06, + "loss": 0.6454, + "step": 2721 + }, + { + "epoch": 0.8367660620965263, + "grad_norm": 0.33602645993232727, + "learning_rate": 4.14468060137926e-06, + "loss": 0.6284, + "step": 2722 + }, + { + "epoch": 0.8370734706424838, + "grad_norm": 0.295949250459671, + "learning_rate": 4.144068125367326e-06, + "loss": 0.6281, + "step": 2723 + }, + { + "epoch": 0.8373808791884414, + "grad_norm": 0.30702975392341614, + "learning_rate": 4.143455475433114e-06, + "loss": 0.6221, + "step": 2724 + }, + { + "epoch": 0.837688287734399, + "grad_norm": 0.316520094871521, + "learning_rate": 4.142842651641432e-06, + "loss": 0.6498, + "step": 2725 + }, + { + "epoch": 0.8379956962803566, + "grad_norm": 0.29540038108825684, + "learning_rate": 4.1422296540571105e-06, + "loss": 0.6234, + "step": 2726 + }, + { + "epoch": 0.8383031048263142, + "grad_norm": 0.314132422208786, + "learning_rate": 4.141616482744997e-06, + "loss": 0.617, + "step": 2727 + }, + { + "epoch": 0.8386105133722718, + "grad_norm": 0.29376617074012756, + "learning_rate": 4.1410031377699585e-06, + "loss": 0.6247, + "step": 2728 + }, + { + "epoch": 0.8389179219182293, + "grad_norm": 0.3041173815727234, + "learning_rate": 4.140389619196878e-06, + "loss": 0.6264, + "step": 2729 + }, + { + "epoch": 0.8392253304641869, + "grad_norm": 0.29771947860717773, + "learning_rate": 4.139775927090659e-06, + "loss": 0.6313, + "step": 2730 + }, + { + "epoch": 0.8395327390101445, + "grad_norm": 0.2977447211742401, + "learning_rate": 4.139162061516223e-06, + "loss": 0.6165, + "step": 2731 + }, + { + "epoch": 0.839840147556102, + "grad_norm": 0.2993142306804657, + "learning_rate": 4.138548022538509e-06, + "loss": 0.6224, + "step": 2732 + }, + { + "epoch": 0.8401475561020596, + "grad_norm": 0.3048458695411682, + "learning_rate": 4.137933810222474e-06, + "loss": 0.6456, + "step": 2733 + }, + { + "epoch": 0.8404549646480172, + "grad_norm": 0.3077619969844818, + "learning_rate": 4.137319424633096e-06, + "loss": 0.615, + "step": 2734 + }, + { + "epoch": 0.8407623731939748, + "grad_norm": 0.3079674243927002, + "learning_rate": 4.136704865835367e-06, + "loss": 0.6252, + "step": 2735 + }, + { + "epoch": 0.8410697817399324, + "grad_norm": 0.298648476600647, + "learning_rate": 4.136090133894302e-06, + "loss": 0.6248, + "step": 2736 + }, + { + "epoch": 0.84137719028589, + "grad_norm": 0.302963525056839, + "learning_rate": 4.135475228874931e-06, + "loss": 0.6153, + "step": 2737 + }, + { + "epoch": 0.8416845988318475, + "grad_norm": 0.3254270851612091, + "learning_rate": 4.1348601508423045e-06, + "loss": 0.6346, + "step": 2738 + }, + { + "epoch": 0.8419920073778051, + "grad_norm": 0.3107457756996155, + "learning_rate": 4.134244899861489e-06, + "loss": 0.6234, + "step": 2739 + }, + { + "epoch": 0.8422994159237627, + "grad_norm": 0.31091392040252686, + "learning_rate": 4.13362947599757e-06, + "loss": 0.6141, + "step": 2740 + }, + { + "epoch": 0.8426068244697202, + "grad_norm": 0.2978545129299164, + "learning_rate": 4.133013879315654e-06, + "loss": 0.6139, + "step": 2741 + }, + { + "epoch": 0.8429142330156778, + "grad_norm": 0.30100733041763306, + "learning_rate": 4.132398109880862e-06, + "loss": 0.6184, + "step": 2742 + }, + { + "epoch": 0.8432216415616354, + "grad_norm": 0.3111432194709778, + "learning_rate": 4.131782167758334e-06, + "loss": 0.6148, + "step": 2743 + }, + { + "epoch": 0.8435290501075929, + "grad_norm": 0.30315878987312317, + "learning_rate": 4.131166053013232e-06, + "loss": 0.6183, + "step": 2744 + }, + { + "epoch": 0.8438364586535506, + "grad_norm": 0.3014165163040161, + "learning_rate": 4.1305497657107315e-06, + "loss": 0.6263, + "step": 2745 + }, + { + "epoch": 0.8441438671995082, + "grad_norm": 0.32071375846862793, + "learning_rate": 4.129933305916029e-06, + "loss": 0.6441, + "step": 2746 + }, + { + "epoch": 0.8444512757454657, + "grad_norm": 0.3150396943092346, + "learning_rate": 4.129316673694337e-06, + "loss": 0.6322, + "step": 2747 + }, + { + "epoch": 0.8447586842914233, + "grad_norm": 0.30317357182502747, + "learning_rate": 4.1286998691108895e-06, + "loss": 0.6275, + "step": 2748 + }, + { + "epoch": 0.8450660928373809, + "grad_norm": 0.2998681962490082, + "learning_rate": 4.128082892230935e-06, + "loss": 0.6379, + "step": 2749 + }, + { + "epoch": 0.8453735013833384, + "grad_norm": 0.3000768721103668, + "learning_rate": 4.1274657431197415e-06, + "loss": 0.6275, + "step": 2750 + }, + { + "epoch": 0.845680909929296, + "grad_norm": 0.30269813537597656, + "learning_rate": 4.126848421842599e-06, + "loss": 0.6483, + "step": 2751 + }, + { + "epoch": 0.8459883184752536, + "grad_norm": 0.31398195028305054, + "learning_rate": 4.126230928464811e-06, + "loss": 0.6288, + "step": 2752 + }, + { + "epoch": 0.8462957270212111, + "grad_norm": 0.3084731698036194, + "learning_rate": 4.1256132630517e-06, + "loss": 0.6038, + "step": 2753 + }, + { + "epoch": 0.8466031355671688, + "grad_norm": 0.3073367178440094, + "learning_rate": 4.124995425668607e-06, + "loss": 0.6186, + "step": 2754 + }, + { + "epoch": 0.8469105441131264, + "grad_norm": 0.3212699592113495, + "learning_rate": 4.124377416380894e-06, + "loss": 0.627, + "step": 2755 + }, + { + "epoch": 0.847217952659084, + "grad_norm": 0.30904802680015564, + "learning_rate": 4.123759235253937e-06, + "loss": 0.6207, + "step": 2756 + }, + { + "epoch": 0.8475253612050415, + "grad_norm": 0.30337780714035034, + "learning_rate": 4.1231408823531325e-06, + "loss": 0.6217, + "step": 2757 + }, + { + "epoch": 0.8478327697509991, + "grad_norm": 0.31017500162124634, + "learning_rate": 4.122522357743894e-06, + "loss": 0.6238, + "step": 2758 + }, + { + "epoch": 0.8481401782969566, + "grad_norm": 0.2964027225971222, + "learning_rate": 4.121903661491655e-06, + "loss": 0.6395, + "step": 2759 + }, + { + "epoch": 0.8484475868429142, + "grad_norm": 0.33274590969085693, + "learning_rate": 4.121284793661865e-06, + "loss": 0.6119, + "step": 2760 + }, + { + "epoch": 0.8487549953888718, + "grad_norm": 0.29782387614250183, + "learning_rate": 4.120665754319993e-06, + "loss": 0.6274, + "step": 2761 + }, + { + "epoch": 0.8490624039348293, + "grad_norm": 0.318963885307312, + "learning_rate": 4.120046543531526e-06, + "loss": 0.6381, + "step": 2762 + }, + { + "epoch": 0.8493698124807869, + "grad_norm": 0.3125503957271576, + "learning_rate": 4.119427161361969e-06, + "loss": 0.6079, + "step": 2763 + }, + { + "epoch": 0.8496772210267446, + "grad_norm": 0.30190786719322205, + "learning_rate": 4.118807607876845e-06, + "loss": 0.6424, + "step": 2764 + }, + { + "epoch": 0.8499846295727022, + "grad_norm": 0.30317866802215576, + "learning_rate": 4.118187883141694e-06, + "loss": 0.6339, + "step": 2765 + }, + { + "epoch": 0.8502920381186597, + "grad_norm": 0.3024280369281769, + "learning_rate": 4.117567987222077e-06, + "loss": 0.632, + "step": 2766 + }, + { + "epoch": 0.8505994466646173, + "grad_norm": 0.3111323416233063, + "learning_rate": 4.11694792018357e-06, + "loss": 0.6292, + "step": 2767 + }, + { + "epoch": 0.8509068552105749, + "grad_norm": 0.30080926418304443, + "learning_rate": 4.116327682091769e-06, + "loss": 0.6186, + "step": 2768 + }, + { + "epoch": 0.8512142637565324, + "grad_norm": 0.2983926832675934, + "learning_rate": 4.1157072730122894e-06, + "loss": 0.6319, + "step": 2769 + }, + { + "epoch": 0.85152167230249, + "grad_norm": 0.3042178750038147, + "learning_rate": 4.11508669301076e-06, + "loss": 0.607, + "step": 2770 + }, + { + "epoch": 0.8518290808484476, + "grad_norm": 0.30089709162712097, + "learning_rate": 4.114465942152832e-06, + "loss": 0.6278, + "step": 2771 + }, + { + "epoch": 0.8521364893944051, + "grad_norm": 0.29848629236221313, + "learning_rate": 4.113845020504173e-06, + "loss": 0.629, + "step": 2772 + }, + { + "epoch": 0.8524438979403628, + "grad_norm": 0.31816670298576355, + "learning_rate": 4.11322392813047e-06, + "loss": 0.6172, + "step": 2773 + }, + { + "epoch": 0.8527513064863204, + "grad_norm": 0.2932562828063965, + "learning_rate": 4.112602665097424e-06, + "loss": 0.6101, + "step": 2774 + }, + { + "epoch": 0.8530587150322779, + "grad_norm": 0.3043822944164276, + "learning_rate": 4.11198123147076e-06, + "loss": 0.5987, + "step": 2775 + }, + { + "epoch": 0.8533661235782355, + "grad_norm": 0.29165562987327576, + "learning_rate": 4.111359627316217e-06, + "loss": 0.6047, + "step": 2776 + }, + { + "epoch": 0.8536735321241931, + "grad_norm": 0.29623866081237793, + "learning_rate": 4.110737852699553e-06, + "loss": 0.6155, + "step": 2777 + }, + { + "epoch": 0.8539809406701506, + "grad_norm": 0.3056478202342987, + "learning_rate": 4.110115907686544e-06, + "loss": 0.6319, + "step": 2778 + }, + { + "epoch": 0.8542883492161082, + "grad_norm": 0.292704313993454, + "learning_rate": 4.1094937923429845e-06, + "loss": 0.6221, + "step": 2779 + }, + { + "epoch": 0.8545957577620658, + "grad_norm": 0.30040523409843445, + "learning_rate": 4.108871506734687e-06, + "loss": 0.6193, + "step": 2780 + }, + { + "epoch": 0.8549031663080233, + "grad_norm": 0.30487632751464844, + "learning_rate": 4.1082490509274805e-06, + "loss": 0.6234, + "step": 2781 + }, + { + "epoch": 0.8552105748539809, + "grad_norm": 0.30026358366012573, + "learning_rate": 4.107626424987216e-06, + "loss": 0.6077, + "step": 2782 + }, + { + "epoch": 0.8555179833999386, + "grad_norm": 0.29730212688446045, + "learning_rate": 4.107003628979755e-06, + "loss": 0.6395, + "step": 2783 + }, + { + "epoch": 0.8558253919458961, + "grad_norm": 0.28994497656822205, + "learning_rate": 4.106380662970986e-06, + "loss": 0.6102, + "step": 2784 + }, + { + "epoch": 0.8561328004918537, + "grad_norm": 0.2969021499156952, + "learning_rate": 4.105757527026809e-06, + "loss": 0.6333, + "step": 2785 + }, + { + "epoch": 0.8564402090378113, + "grad_norm": 0.3040597140789032, + "learning_rate": 4.105134221213145e-06, + "loss": 0.6372, + "step": 2786 + }, + { + "epoch": 0.8567476175837688, + "grad_norm": 0.30555081367492676, + "learning_rate": 4.104510745595932e-06, + "loss": 0.6147, + "step": 2787 + }, + { + "epoch": 0.8570550261297264, + "grad_norm": 0.30972787737846375, + "learning_rate": 4.1038871002411266e-06, + "loss": 0.6335, + "step": 2788 + }, + { + "epoch": 0.857362434675684, + "grad_norm": 0.3042164146900177, + "learning_rate": 4.1032632852147015e-06, + "loss": 0.6279, + "step": 2789 + }, + { + "epoch": 0.8576698432216415, + "grad_norm": 0.30560386180877686, + "learning_rate": 4.1026393005826496e-06, + "loss": 0.634, + "step": 2790 + }, + { + "epoch": 0.8579772517675991, + "grad_norm": 0.3078862726688385, + "learning_rate": 4.10201514641098e-06, + "loss": 0.6198, + "step": 2791 + }, + { + "epoch": 0.8582846603135568, + "grad_norm": 0.2954578399658203, + "learning_rate": 4.101390822765721e-06, + "loss": 0.6193, + "step": 2792 + }, + { + "epoch": 0.8585920688595143, + "grad_norm": 0.31326594948768616, + "learning_rate": 4.1007663297129204e-06, + "loss": 0.621, + "step": 2793 + }, + { + "epoch": 0.8588994774054719, + "grad_norm": 0.30064865946769714, + "learning_rate": 4.100141667318639e-06, + "loss": 0.6302, + "step": 2794 + }, + { + "epoch": 0.8592068859514295, + "grad_norm": 0.29542163014411926, + "learning_rate": 4.099516835648959e-06, + "loss": 0.6306, + "step": 2795 + }, + { + "epoch": 0.859514294497387, + "grad_norm": 0.2929269075393677, + "learning_rate": 4.098891834769981e-06, + "loss": 0.6318, + "step": 2796 + }, + { + "epoch": 0.8598217030433446, + "grad_norm": 0.31106629967689514, + "learning_rate": 4.098266664747822e-06, + "loss": 0.613, + "step": 2797 + }, + { + "epoch": 0.8601291115893022, + "grad_norm": 0.31151920557022095, + "learning_rate": 4.0976413256486174e-06, + "loss": 0.6204, + "step": 2798 + }, + { + "epoch": 0.8604365201352597, + "grad_norm": 0.30714306235313416, + "learning_rate": 4.09701581753852e-06, + "loss": 0.6022, + "step": 2799 + }, + { + "epoch": 0.8607439286812173, + "grad_norm": 0.30716460943222046, + "learning_rate": 4.096390140483701e-06, + "loss": 0.6147, + "step": 2800 + }, + { + "epoch": 0.8610513372271749, + "grad_norm": 0.3128371238708496, + "learning_rate": 4.0957642945503504e-06, + "loss": 0.6257, + "step": 2801 + }, + { + "epoch": 0.8613587457731325, + "grad_norm": 0.3136928677558899, + "learning_rate": 4.095138279804673e-06, + "loss": 0.6024, + "step": 2802 + }, + { + "epoch": 0.8616661543190901, + "grad_norm": 0.30455663800239563, + "learning_rate": 4.094512096312896e-06, + "loss": 0.5993, + "step": 2803 + }, + { + "epoch": 0.8619735628650477, + "grad_norm": 0.3031393885612488, + "learning_rate": 4.0938857441412595e-06, + "loss": 0.6033, + "step": 2804 + }, + { + "epoch": 0.8622809714110052, + "grad_norm": 0.3326217532157898, + "learning_rate": 4.093259223356025e-06, + "loss": 0.5999, + "step": 2805 + }, + { + "epoch": 0.8625883799569628, + "grad_norm": 0.29695069789886475, + "learning_rate": 4.092632534023472e-06, + "loss": 0.63, + "step": 2806 + }, + { + "epoch": 0.8628957885029204, + "grad_norm": 0.30854862928390503, + "learning_rate": 4.0920056762098946e-06, + "loss": 0.5962, + "step": 2807 + }, + { + "epoch": 0.8632031970488779, + "grad_norm": 0.3151753544807434, + "learning_rate": 4.091378649981608e-06, + "loss": 0.6245, + "step": 2808 + }, + { + "epoch": 0.8635106055948355, + "grad_norm": 0.2965186536312103, + "learning_rate": 4.090751455404942e-06, + "loss": 0.6, + "step": 2809 + }, + { + "epoch": 0.8638180141407931, + "grad_norm": 0.30801185965538025, + "learning_rate": 4.090124092546248e-06, + "loss": 0.613, + "step": 2810 + }, + { + "epoch": 0.8641254226867507, + "grad_norm": 0.32277122139930725, + "learning_rate": 4.089496561471893e-06, + "loss": 0.6349, + "step": 2811 + }, + { + "epoch": 0.8644328312327083, + "grad_norm": 0.31703993678092957, + "learning_rate": 4.088868862248262e-06, + "loss": 0.6339, + "step": 2812 + }, + { + "epoch": 0.8647402397786659, + "grad_norm": 0.29285216331481934, + "learning_rate": 4.088240994941757e-06, + "loss": 0.6318, + "step": 2813 + }, + { + "epoch": 0.8650476483246234, + "grad_norm": 0.28814011812210083, + "learning_rate": 4.0876129596188e-06, + "loss": 0.6216, + "step": 2814 + }, + { + "epoch": 0.865355056870581, + "grad_norm": 0.31513717770576477, + "learning_rate": 4.0869847563458285e-06, + "loss": 0.6048, + "step": 2815 + }, + { + "epoch": 0.8656624654165386, + "grad_norm": 0.31848230957984924, + "learning_rate": 4.086356385189298e-06, + "loss": 0.6181, + "step": 2816 + }, + { + "epoch": 0.8659698739624961, + "grad_norm": 0.3108014762401581, + "learning_rate": 4.085727846215686e-06, + "loss": 0.6278, + "step": 2817 + }, + { + "epoch": 0.8662772825084537, + "grad_norm": 0.30635374784469604, + "learning_rate": 4.0850991394914795e-06, + "loss": 0.632, + "step": 2818 + }, + { + "epoch": 0.8665846910544113, + "grad_norm": 0.31315502524375916, + "learning_rate": 4.084470265083191e-06, + "loss": 0.653, + "step": 2819 + }, + { + "epoch": 0.8668920996003688, + "grad_norm": 0.33418750762939453, + "learning_rate": 4.083841223057347e-06, + "loss": 0.6344, + "step": 2820 + }, + { + "epoch": 0.8671995081463265, + "grad_norm": 0.32311394810676575, + "learning_rate": 4.083212013480493e-06, + "loss": 0.6201, + "step": 2821 + }, + { + "epoch": 0.8675069166922841, + "grad_norm": 0.2989126741886139, + "learning_rate": 4.0825826364191905e-06, + "loss": 0.6093, + "step": 2822 + }, + { + "epoch": 0.8678143252382416, + "grad_norm": 0.31540510058403015, + "learning_rate": 4.0819530919400205e-06, + "loss": 0.5906, + "step": 2823 + }, + { + "epoch": 0.8681217337841992, + "grad_norm": 0.31160998344421387, + "learning_rate": 4.0813233801095805e-06, + "loss": 0.6161, + "step": 2824 + }, + { + "epoch": 0.8684291423301568, + "grad_norm": 0.2978249490261078, + "learning_rate": 4.080693500994488e-06, + "loss": 0.6251, + "step": 2825 + }, + { + "epoch": 0.8687365508761143, + "grad_norm": 0.3580038547515869, + "learning_rate": 4.080063454661375e-06, + "loss": 0.6277, + "step": 2826 + }, + { + "epoch": 0.8690439594220719, + "grad_norm": 0.30050545930862427, + "learning_rate": 4.079433241176892e-06, + "loss": 0.6282, + "step": 2827 + }, + { + "epoch": 0.8693513679680295, + "grad_norm": 0.30352938175201416, + "learning_rate": 4.07880286060771e-06, + "loss": 0.6141, + "step": 2828 + }, + { + "epoch": 0.869658776513987, + "grad_norm": 0.32049357891082764, + "learning_rate": 4.0781723130205135e-06, + "loss": 0.6124, + "step": 2829 + }, + { + "epoch": 0.8699661850599447, + "grad_norm": 0.3322206437587738, + "learning_rate": 4.077541598482009e-06, + "loss": 0.6336, + "step": 2830 + }, + { + "epoch": 0.8702735936059023, + "grad_norm": 0.3006199598312378, + "learning_rate": 4.076910717058916e-06, + "loss": 0.6109, + "step": 2831 + }, + { + "epoch": 0.8705810021518599, + "grad_norm": 0.3002825081348419, + "learning_rate": 4.076279668817975e-06, + "loss": 0.6186, + "step": 2832 + }, + { + "epoch": 0.8708884106978174, + "grad_norm": 0.2954705059528351, + "learning_rate": 4.075648453825944e-06, + "loss": 0.6152, + "step": 2833 + }, + { + "epoch": 0.871195819243775, + "grad_norm": 0.3244445025920868, + "learning_rate": 4.075017072149596e-06, + "loss": 0.6498, + "step": 2834 + }, + { + "epoch": 0.8715032277897325, + "grad_norm": 0.3170804977416992, + "learning_rate": 4.0743855238557265e-06, + "loss": 0.5965, + "step": 2835 + }, + { + "epoch": 0.8718106363356901, + "grad_norm": 0.33296188712120056, + "learning_rate": 4.073753809011141e-06, + "loss": 0.6111, + "step": 2836 + }, + { + "epoch": 0.8721180448816477, + "grad_norm": 0.30246657133102417, + "learning_rate": 4.073121927682672e-06, + "loss": 0.6226, + "step": 2837 + }, + { + "epoch": 0.8724254534276052, + "grad_norm": 0.31511858105659485, + "learning_rate": 4.072489879937161e-06, + "loss": 0.6096, + "step": 2838 + }, + { + "epoch": 0.8727328619735628, + "grad_norm": 0.3104989230632782, + "learning_rate": 4.071857665841474e-06, + "loss": 0.6269, + "step": 2839 + }, + { + "epoch": 0.8730402705195205, + "grad_norm": 0.3039863407611847, + "learning_rate": 4.071225285462489e-06, + "loss": 0.6308, + "step": 2840 + }, + { + "epoch": 0.873347679065478, + "grad_norm": 0.3145555257797241, + "learning_rate": 4.070592738867105e-06, + "loss": 0.6364, + "step": 2841 + }, + { + "epoch": 0.8736550876114356, + "grad_norm": 0.3237541913986206, + "learning_rate": 4.069960026122238e-06, + "loss": 0.628, + "step": 2842 + }, + { + "epoch": 0.8739624961573932, + "grad_norm": 0.3224969804286957, + "learning_rate": 4.069327147294822e-06, + "loss": 0.608, + "step": 2843 + }, + { + "epoch": 0.8742699047033508, + "grad_norm": 0.303448885679245, + "learning_rate": 4.068694102451805e-06, + "loss": 0.6097, + "step": 2844 + }, + { + "epoch": 0.8745773132493083, + "grad_norm": 0.3311273455619812, + "learning_rate": 4.0680608916601594e-06, + "loss": 0.6112, + "step": 2845 + }, + { + "epoch": 0.8748847217952659, + "grad_norm": 0.3050517439842224, + "learning_rate": 4.067427514986868e-06, + "loss": 0.6418, + "step": 2846 + }, + { + "epoch": 0.8751921303412235, + "grad_norm": 0.29391759634017944, + "learning_rate": 4.066793972498936e-06, + "loss": 0.6229, + "step": 2847 + }, + { + "epoch": 0.875499538887181, + "grad_norm": 0.30355820059776306, + "learning_rate": 4.066160264263383e-06, + "loss": 0.6355, + "step": 2848 + }, + { + "epoch": 0.8758069474331387, + "grad_norm": 0.3192652463912964, + "learning_rate": 4.0655263903472485e-06, + "loss": 0.6277, + "step": 2849 + }, + { + "epoch": 0.8761143559790963, + "grad_norm": 0.303589403629303, + "learning_rate": 4.064892350817589e-06, + "loss": 0.6215, + "step": 2850 + }, + { + "epoch": 0.8764217645250538, + "grad_norm": 0.3093075156211853, + "learning_rate": 4.064258145741477e-06, + "loss": 0.6278, + "step": 2851 + }, + { + "epoch": 0.8767291730710114, + "grad_norm": 0.3209528923034668, + "learning_rate": 4.063623775186004e-06, + "loss": 0.6359, + "step": 2852 + }, + { + "epoch": 0.877036581616969, + "grad_norm": 0.31218090653419495, + "learning_rate": 4.062989239218279e-06, + "loss": 0.6183, + "step": 2853 + }, + { + "epoch": 0.8773439901629265, + "grad_norm": 0.29197123646736145, + "learning_rate": 4.062354537905428e-06, + "loss": 0.6429, + "step": 2854 + }, + { + "epoch": 0.8776513987088841, + "grad_norm": 0.33045512437820435, + "learning_rate": 4.061719671314596e-06, + "loss": 0.6151, + "step": 2855 + }, + { + "epoch": 0.8779588072548417, + "grad_norm": 0.3134247064590454, + "learning_rate": 4.061084639512941e-06, + "loss": 0.6117, + "step": 2856 + }, + { + "epoch": 0.8782662158007992, + "grad_norm": 0.2946312129497528, + "learning_rate": 4.060449442567644e-06, + "loss": 0.6168, + "step": 2857 + }, + { + "epoch": 0.8785736243467568, + "grad_norm": 0.3034891188144684, + "learning_rate": 4.059814080545901e-06, + "loss": 0.5989, + "step": 2858 + }, + { + "epoch": 0.8788810328927145, + "grad_norm": 0.2975204288959503, + "learning_rate": 4.059178553514923e-06, + "loss": 0.6316, + "step": 2859 + }, + { + "epoch": 0.879188441438672, + "grad_norm": 0.3013540208339691, + "learning_rate": 4.058542861541945e-06, + "loss": 0.6062, + "step": 2860 + }, + { + "epoch": 0.8794958499846296, + "grad_norm": 0.3060966432094574, + "learning_rate": 4.057907004694212e-06, + "loss": 0.6314, + "step": 2861 + }, + { + "epoch": 0.8798032585305872, + "grad_norm": 0.3206006586551666, + "learning_rate": 4.057270983038991e-06, + "loss": 0.6173, + "step": 2862 + }, + { + "epoch": 0.8801106670765447, + "grad_norm": 0.2965049147605896, + "learning_rate": 4.056634796643566e-06, + "loss": 0.6193, + "step": 2863 + }, + { + "epoch": 0.8804180756225023, + "grad_norm": 0.30479854345321655, + "learning_rate": 4.055998445575238e-06, + "loss": 0.6203, + "step": 2864 + }, + { + "epoch": 0.8807254841684599, + "grad_norm": 0.310628741979599, + "learning_rate": 4.055361929901324e-06, + "loss": 0.6175, + "step": 2865 + }, + { + "epoch": 0.8810328927144174, + "grad_norm": 0.331474632024765, + "learning_rate": 4.054725249689159e-06, + "loss": 0.6045, + "step": 2866 + }, + { + "epoch": 0.881340301260375, + "grad_norm": 0.3087190091609955, + "learning_rate": 4.054088405006098e-06, + "loss": 0.6027, + "step": 2867 + }, + { + "epoch": 0.8816477098063327, + "grad_norm": 0.30168071389198303, + "learning_rate": 4.0534513959195095e-06, + "loss": 0.5991, + "step": 2868 + }, + { + "epoch": 0.8819551183522902, + "grad_norm": 0.30081111192703247, + "learning_rate": 4.052814222496783e-06, + "loss": 0.6446, + "step": 2869 + }, + { + "epoch": 0.8822625268982478, + "grad_norm": 0.3099258244037628, + "learning_rate": 4.052176884805321e-06, + "loss": 0.6307, + "step": 2870 + }, + { + "epoch": 0.8825699354442054, + "grad_norm": 0.3125542104244232, + "learning_rate": 4.051539382912548e-06, + "loss": 0.6161, + "step": 2871 + }, + { + "epoch": 0.8828773439901629, + "grad_norm": 0.2983538508415222, + "learning_rate": 4.050901716885905e-06, + "loss": 0.6013, + "step": 2872 + }, + { + "epoch": 0.8831847525361205, + "grad_norm": 0.30163681507110596, + "learning_rate": 4.050263886792847e-06, + "loss": 0.6299, + "step": 2873 + }, + { + "epoch": 0.8834921610820781, + "grad_norm": 0.29254770278930664, + "learning_rate": 4.04962589270085e-06, + "loss": 0.6218, + "step": 2874 + }, + { + "epoch": 0.8837995696280356, + "grad_norm": 0.3168123662471771, + "learning_rate": 4.048987734677405e-06, + "loss": 0.6133, + "step": 2875 + }, + { + "epoch": 0.8841069781739932, + "grad_norm": 0.2989143431186676, + "learning_rate": 4.048349412790022e-06, + "loss": 0.6318, + "step": 2876 + }, + { + "epoch": 0.8844143867199508, + "grad_norm": 0.32681167125701904, + "learning_rate": 4.047710927106227e-06, + "loss": 0.6191, + "step": 2877 + }, + { + "epoch": 0.8847217952659084, + "grad_norm": 0.3019354045391083, + "learning_rate": 4.047072277693565e-06, + "loss": 0.6256, + "step": 2878 + }, + { + "epoch": 0.885029203811866, + "grad_norm": 0.3056715428829193, + "learning_rate": 4.0464334646195955e-06, + "loss": 0.6146, + "step": 2879 + }, + { + "epoch": 0.8853366123578236, + "grad_norm": 0.31775158643722534, + "learning_rate": 4.045794487951899e-06, + "loss": 0.6403, + "step": 2880 + }, + { + "epoch": 0.8856440209037811, + "grad_norm": 0.3080347776412964, + "learning_rate": 4.0451553477580705e-06, + "loss": 0.62, + "step": 2881 + }, + { + "epoch": 0.8859514294497387, + "grad_norm": 0.30480700731277466, + "learning_rate": 4.044516044105724e-06, + "loss": 0.642, + "step": 2882 + }, + { + "epoch": 0.8862588379956963, + "grad_norm": 0.303638219833374, + "learning_rate": 4.043876577062489e-06, + "loss": 0.6136, + "step": 2883 + }, + { + "epoch": 0.8865662465416538, + "grad_norm": 0.30553096532821655, + "learning_rate": 4.043236946696013e-06, + "loss": 0.6103, + "step": 2884 + }, + { + "epoch": 0.8868736550876114, + "grad_norm": 0.2891029715538025, + "learning_rate": 4.042597153073963e-06, + "loss": 0.6005, + "step": 2885 + }, + { + "epoch": 0.887181063633569, + "grad_norm": 0.3045364320278168, + "learning_rate": 4.041957196264019e-06, + "loss": 0.6478, + "step": 2886 + }, + { + "epoch": 0.8874884721795266, + "grad_norm": 0.3133210837841034, + "learning_rate": 4.041317076333882e-06, + "loss": 0.6251, + "step": 2887 + }, + { + "epoch": 0.8877958807254842, + "grad_norm": 0.3113643229007721, + "learning_rate": 4.040676793351268e-06, + "loss": 0.6257, + "step": 2888 + }, + { + "epoch": 0.8881032892714418, + "grad_norm": 0.31472867727279663, + "learning_rate": 4.040036347383913e-06, + "loss": 0.6002, + "step": 2889 + }, + { + "epoch": 0.8884106978173993, + "grad_norm": 0.31547310948371887, + "learning_rate": 4.039395738499565e-06, + "loss": 0.634, + "step": 2890 + }, + { + "epoch": 0.8887181063633569, + "grad_norm": 0.31316009163856506, + "learning_rate": 4.038754966765996e-06, + "loss": 0.6151, + "step": 2891 + }, + { + "epoch": 0.8890255149093145, + "grad_norm": 0.3140314817428589, + "learning_rate": 4.03811403225099e-06, + "loss": 0.6458, + "step": 2892 + }, + { + "epoch": 0.889332923455272, + "grad_norm": 0.3115178346633911, + "learning_rate": 4.0374729350223495e-06, + "loss": 0.6164, + "step": 2893 + }, + { + "epoch": 0.8896403320012296, + "grad_norm": 0.3134821355342865, + "learning_rate": 4.036831675147896e-06, + "loss": 0.6046, + "step": 2894 + }, + { + "epoch": 0.8899477405471872, + "grad_norm": 0.2930891811847687, + "learning_rate": 4.036190252695467e-06, + "loss": 0.6195, + "step": 2895 + }, + { + "epoch": 0.8902551490931447, + "grad_norm": 0.3012644648551941, + "learning_rate": 4.035548667732917e-06, + "loss": 0.6203, + "step": 2896 + }, + { + "epoch": 0.8905625576391024, + "grad_norm": 0.2979808449745178, + "learning_rate": 4.034906920328117e-06, + "loss": 0.615, + "step": 2897 + }, + { + "epoch": 0.89086996618506, + "grad_norm": 0.30913642048835754, + "learning_rate": 4.034265010548956e-06, + "loss": 0.6176, + "step": 2898 + }, + { + "epoch": 0.8911773747310175, + "grad_norm": 0.3246704638004303, + "learning_rate": 4.033622938463341e-06, + "loss": 0.6082, + "step": 2899 + }, + { + "epoch": 0.8914847832769751, + "grad_norm": 0.2998434603214264, + "learning_rate": 4.032980704139195e-06, + "loss": 0.6114, + "step": 2900 + }, + { + "epoch": 0.8917921918229327, + "grad_norm": 0.3043539226055145, + "learning_rate": 4.03233830764446e-06, + "loss": 0.6378, + "step": 2901 + }, + { + "epoch": 0.8920996003688902, + "grad_norm": 0.3381515443325043, + "learning_rate": 4.03169574904709e-06, + "loss": 0.6062, + "step": 2902 + }, + { + "epoch": 0.8924070089148478, + "grad_norm": 0.31770703196525574, + "learning_rate": 4.031053028415064e-06, + "loss": 0.6346, + "step": 2903 + }, + { + "epoch": 0.8927144174608054, + "grad_norm": 0.3143541216850281, + "learning_rate": 4.0304101458163715e-06, + "loss": 0.6201, + "step": 2904 + }, + { + "epoch": 0.8930218260067629, + "grad_norm": 0.32600805163383484, + "learning_rate": 4.029767101319021e-06, + "loss": 0.6266, + "step": 2905 + }, + { + "epoch": 0.8933292345527206, + "grad_norm": 0.30677229166030884, + "learning_rate": 4.029123894991042e-06, + "loss": 0.6244, + "step": 2906 + }, + { + "epoch": 0.8936366430986782, + "grad_norm": 0.3048979938030243, + "learning_rate": 4.028480526900474e-06, + "loss": 0.6148, + "step": 2907 + }, + { + "epoch": 0.8939440516446358, + "grad_norm": 0.3074537515640259, + "learning_rate": 4.027836997115381e-06, + "loss": 0.6227, + "step": 2908 + }, + { + "epoch": 0.8942514601905933, + "grad_norm": 0.3100559115409851, + "learning_rate": 4.027193305703838e-06, + "loss": 0.6112, + "step": 2909 + }, + { + "epoch": 0.8945588687365509, + "grad_norm": 0.3030841052532196, + "learning_rate": 4.02654945273394e-06, + "loss": 0.6291, + "step": 2910 + }, + { + "epoch": 0.8948662772825084, + "grad_norm": 0.30353760719299316, + "learning_rate": 4.0259054382737995e-06, + "loss": 0.6445, + "step": 2911 + }, + { + "epoch": 0.895173685828466, + "grad_norm": 0.3316253125667572, + "learning_rate": 4.025261262391546e-06, + "loss": 0.6197, + "step": 2912 + }, + { + "epoch": 0.8954810943744236, + "grad_norm": 0.3966232240200043, + "learning_rate": 4.024616925155324e-06, + "loss": 0.6163, + "step": 2913 + }, + { + "epoch": 0.8957885029203811, + "grad_norm": 0.29467934370040894, + "learning_rate": 4.0239724266332965e-06, + "loss": 0.6426, + "step": 2914 + }, + { + "epoch": 0.8960959114663387, + "grad_norm": 0.3231373429298401, + "learning_rate": 4.023327766893644e-06, + "loss": 0.6224, + "step": 2915 + }, + { + "epoch": 0.8964033200122964, + "grad_norm": 0.35864198207855225, + "learning_rate": 4.022682946004564e-06, + "loss": 0.6135, + "step": 2916 + }, + { + "epoch": 0.896710728558254, + "grad_norm": 0.36816465854644775, + "learning_rate": 4.022037964034271e-06, + "loss": 0.5945, + "step": 2917 + }, + { + "epoch": 0.8970181371042115, + "grad_norm": 0.2959573268890381, + "learning_rate": 4.021392821050994e-06, + "loss": 0.6297, + "step": 2918 + }, + { + "epoch": 0.8973255456501691, + "grad_norm": 0.31264251470565796, + "learning_rate": 4.020747517122985e-06, + "loss": 0.6143, + "step": 2919 + }, + { + "epoch": 0.8976329541961267, + "grad_norm": 0.3852449357509613, + "learning_rate": 4.020102052318506e-06, + "loss": 0.6242, + "step": 2920 + }, + { + "epoch": 0.8979403627420842, + "grad_norm": 0.3372320234775543, + "learning_rate": 4.01945642670584e-06, + "loss": 0.6477, + "step": 2921 + }, + { + "epoch": 0.8982477712880418, + "grad_norm": 0.3143776059150696, + "learning_rate": 4.0188106403532864e-06, + "loss": 0.6063, + "step": 2922 + }, + { + "epoch": 0.8985551798339994, + "grad_norm": 0.3171533942222595, + "learning_rate": 4.018164693329162e-06, + "loss": 0.6233, + "step": 2923 + }, + { + "epoch": 0.8988625883799569, + "grad_norm": 0.37644970417022705, + "learning_rate": 4.0175185857018e-06, + "loss": 0.6287, + "step": 2924 + }, + { + "epoch": 0.8991699969259146, + "grad_norm": 0.3700834810733795, + "learning_rate": 4.01687231753955e-06, + "loss": 0.6318, + "step": 2925 + }, + { + "epoch": 0.8994774054718722, + "grad_norm": 0.29402872920036316, + "learning_rate": 4.0162258889107795e-06, + "loss": 0.6336, + "step": 2926 + }, + { + "epoch": 0.8997848140178297, + "grad_norm": 0.33640986680984497, + "learning_rate": 4.0155792998838735e-06, + "loss": 0.63, + "step": 2927 + }, + { + "epoch": 0.9000922225637873, + "grad_norm": 0.3633931279182434, + "learning_rate": 4.014932550527233e-06, + "loss": 0.6056, + "step": 2928 + }, + { + "epoch": 0.9003996311097449, + "grad_norm": 0.326484739780426, + "learning_rate": 4.014285640909275e-06, + "loss": 0.6442, + "step": 2929 + }, + { + "epoch": 0.9007070396557024, + "grad_norm": 0.3055676519870758, + "learning_rate": 4.013638571098436e-06, + "loss": 0.6225, + "step": 2930 + }, + { + "epoch": 0.90101444820166, + "grad_norm": 0.29801812767982483, + "learning_rate": 4.0129913411631675e-06, + "loss": 0.6058, + "step": 2931 + }, + { + "epoch": 0.9013218567476176, + "grad_norm": 0.3035387694835663, + "learning_rate": 4.012343951171938e-06, + "loss": 0.6187, + "step": 2932 + }, + { + "epoch": 0.9016292652935751, + "grad_norm": 0.3137499988079071, + "learning_rate": 4.011696401193234e-06, + "loss": 0.6056, + "step": 2933 + }, + { + "epoch": 0.9019366738395327, + "grad_norm": 0.300121933221817, + "learning_rate": 4.0110486912955585e-06, + "loss": 0.6071, + "step": 2934 + }, + { + "epoch": 0.9022440823854904, + "grad_norm": 0.32410770654678345, + "learning_rate": 4.010400821547431e-06, + "loss": 0.6177, + "step": 2935 + }, + { + "epoch": 0.9025514909314479, + "grad_norm": 0.3174334466457367, + "learning_rate": 4.009752792017388e-06, + "loss": 0.6317, + "step": 2936 + }, + { + "epoch": 0.9028588994774055, + "grad_norm": 0.30705124139785767, + "learning_rate": 4.009104602773983e-06, + "loss": 0.6298, + "step": 2937 + }, + { + "epoch": 0.9031663080233631, + "grad_norm": 0.3325636684894562, + "learning_rate": 4.0084562538857875e-06, + "loss": 0.6438, + "step": 2938 + }, + { + "epoch": 0.9034737165693206, + "grad_norm": 0.32471615076065063, + "learning_rate": 4.0078077454213875e-06, + "loss": 0.634, + "step": 2939 + }, + { + "epoch": 0.9037811251152782, + "grad_norm": 0.3061945140361786, + "learning_rate": 4.007159077449388e-06, + "loss": 0.629, + "step": 2940 + }, + { + "epoch": 0.9040885336612358, + "grad_norm": 0.3048650622367859, + "learning_rate": 4.0065102500384115e-06, + "loss": 0.6384, + "step": 2941 + }, + { + "epoch": 0.9043959422071933, + "grad_norm": 0.3092881739139557, + "learning_rate": 4.005861263257094e-06, + "loss": 0.6619, + "step": 2942 + }, + { + "epoch": 0.9047033507531509, + "grad_norm": 0.31361186504364014, + "learning_rate": 4.00521211717409e-06, + "loss": 0.6055, + "step": 2943 + }, + { + "epoch": 0.9050107592991086, + "grad_norm": 0.3371451795101166, + "learning_rate": 4.004562811858073e-06, + "loss": 0.5995, + "step": 2944 + }, + { + "epoch": 0.9053181678450661, + "grad_norm": 0.2922208309173584, + "learning_rate": 4.00391334737773e-06, + "loss": 0.6166, + "step": 2945 + }, + { + "epoch": 0.9056255763910237, + "grad_norm": 0.30419448018074036, + "learning_rate": 4.003263723801768e-06, + "loss": 0.6168, + "step": 2946 + }, + { + "epoch": 0.9059329849369813, + "grad_norm": 0.2996313273906708, + "learning_rate": 4.002613941198907e-06, + "loss": 0.6205, + "step": 2947 + }, + { + "epoch": 0.9062403934829388, + "grad_norm": 0.35129258036613464, + "learning_rate": 4.001963999637889e-06, + "loss": 0.6071, + "step": 2948 + }, + { + "epoch": 0.9065478020288964, + "grad_norm": 0.3073549270629883, + "learning_rate": 4.001313899187467e-06, + "loss": 0.6506, + "step": 2949 + }, + { + "epoch": 0.906855210574854, + "grad_norm": 0.3107692003250122, + "learning_rate": 4.0006636399164155e-06, + "loss": 0.6288, + "step": 2950 + }, + { + "epoch": 0.9071626191208115, + "grad_norm": 0.30691710114479065, + "learning_rate": 4.000013221893523e-06, + "loss": 0.6298, + "step": 2951 + }, + { + "epoch": 0.9074700276667691, + "grad_norm": 0.3118327260017395, + "learning_rate": 3.999362645187596e-06, + "loss": 0.6177, + "step": 2952 + }, + { + "epoch": 0.9077774362127267, + "grad_norm": 0.3160204589366913, + "learning_rate": 3.9987119098674586e-06, + "loss": 0.6155, + "step": 2953 + }, + { + "epoch": 0.9080848447586843, + "grad_norm": 0.30090126395225525, + "learning_rate": 3.998061016001948e-06, + "loss": 0.5946, + "step": 2954 + }, + { + "epoch": 0.9083922533046419, + "grad_norm": 0.2928600609302521, + "learning_rate": 3.997409963659924e-06, + "loss": 0.6239, + "step": 2955 + }, + { + "epoch": 0.9086996618505995, + "grad_norm": 0.3092261254787445, + "learning_rate": 3.996758752910258e-06, + "loss": 0.6481, + "step": 2956 + }, + { + "epoch": 0.909007070396557, + "grad_norm": 0.31681451201438904, + "learning_rate": 3.9961073838218404e-06, + "loss": 0.6111, + "step": 2957 + }, + { + "epoch": 0.9093144789425146, + "grad_norm": 0.3159211277961731, + "learning_rate": 3.995455856463578e-06, + "loss": 0.6452, + "step": 2958 + }, + { + "epoch": 0.9096218874884722, + "grad_norm": 0.3034047484397888, + "learning_rate": 3.994804170904395e-06, + "loss": 0.6141, + "step": 2959 + }, + { + "epoch": 0.9099292960344297, + "grad_norm": 0.31449154019355774, + "learning_rate": 3.994152327213232e-06, + "loss": 0.5925, + "step": 2960 + }, + { + "epoch": 0.9102367045803873, + "grad_norm": 0.29788365960121155, + "learning_rate": 3.993500325459045e-06, + "loss": 0.6081, + "step": 2961 + }, + { + "epoch": 0.9105441131263449, + "grad_norm": 0.3170234262943268, + "learning_rate": 3.992848165710808e-06, + "loss": 0.6009, + "step": 2962 + }, + { + "epoch": 0.9108515216723025, + "grad_norm": 0.3090430200099945, + "learning_rate": 3.992195848037512e-06, + "loss": 0.6231, + "step": 2963 + }, + { + "epoch": 0.9111589302182601, + "grad_norm": 0.30584657192230225, + "learning_rate": 3.991543372508164e-06, + "loss": 0.6146, + "step": 2964 + }, + { + "epoch": 0.9114663387642177, + "grad_norm": 0.2986617982387543, + "learning_rate": 3.990890739191788e-06, + "loss": 0.6098, + "step": 2965 + }, + { + "epoch": 0.9117737473101752, + "grad_norm": 0.3277614116668701, + "learning_rate": 3.990237948157426e-06, + "loss": 0.5895, + "step": 2966 + }, + { + "epoch": 0.9120811558561328, + "grad_norm": 0.3078819513320923, + "learning_rate": 3.989584999474132e-06, + "loss": 0.6323, + "step": 2967 + }, + { + "epoch": 0.9123885644020904, + "grad_norm": 0.31868481636047363, + "learning_rate": 3.988931893210984e-06, + "loss": 0.6257, + "step": 2968 + }, + { + "epoch": 0.9126959729480479, + "grad_norm": 0.3124963641166687, + "learning_rate": 3.988278629437068e-06, + "loss": 0.6281, + "step": 2969 + }, + { + "epoch": 0.9130033814940055, + "grad_norm": 0.30472514033317566, + "learning_rate": 3.9876252082214954e-06, + "loss": 0.6294, + "step": 2970 + }, + { + "epoch": 0.9133107900399631, + "grad_norm": 0.3112039864063263, + "learning_rate": 3.986971629633388e-06, + "loss": 0.6419, + "step": 2971 + }, + { + "epoch": 0.9136181985859206, + "grad_norm": 0.3083873987197876, + "learning_rate": 3.986317893741887e-06, + "loss": 0.6274, + "step": 2972 + }, + { + "epoch": 0.9139256071318783, + "grad_norm": 0.2955109179019928, + "learning_rate": 3.985664000616149e-06, + "loss": 0.6131, + "step": 2973 + }, + { + "epoch": 0.9142330156778359, + "grad_norm": 0.2973729968070984, + "learning_rate": 3.98500995032535e-06, + "loss": 0.6124, + "step": 2974 + }, + { + "epoch": 0.9145404242237934, + "grad_norm": 0.3001807630062103, + "learning_rate": 3.984355742938679e-06, + "loss": 0.6147, + "step": 2975 + }, + { + "epoch": 0.914847832769751, + "grad_norm": 0.3091183304786682, + "learning_rate": 3.983701378525342e-06, + "loss": 0.6077, + "step": 2976 + }, + { + "epoch": 0.9151552413157086, + "grad_norm": 0.3070439398288727, + "learning_rate": 3.983046857154565e-06, + "loss": 0.6325, + "step": 2977 + }, + { + "epoch": 0.9154626498616661, + "grad_norm": 0.3007453382015228, + "learning_rate": 3.982392178895587e-06, + "loss": 0.624, + "step": 2978 + }, + { + "epoch": 0.9157700584076237, + "grad_norm": 0.3008919060230255, + "learning_rate": 3.981737343817665e-06, + "loss": 0.6194, + "step": 2979 + }, + { + "epoch": 0.9160774669535813, + "grad_norm": 0.311378538608551, + "learning_rate": 3.981082351990073e-06, + "loss": 0.6098, + "step": 2980 + }, + { + "epoch": 0.9163848754995388, + "grad_norm": 0.30156633257865906, + "learning_rate": 3.9804272034821e-06, + "loss": 0.5981, + "step": 2981 + }, + { + "epoch": 0.9166922840454965, + "grad_norm": 0.29901832342147827, + "learning_rate": 3.979771898363054e-06, + "loss": 0.6209, + "step": 2982 + }, + { + "epoch": 0.9169996925914541, + "grad_norm": 0.3083949089050293, + "learning_rate": 3.9791164367022586e-06, + "loss": 0.6376, + "step": 2983 + }, + { + "epoch": 0.9173071011374117, + "grad_norm": 0.30445027351379395, + "learning_rate": 3.978460818569052e-06, + "loss": 0.6177, + "step": 2984 + }, + { + "epoch": 0.9176145096833692, + "grad_norm": 0.30728819966316223, + "learning_rate": 3.977805044032792e-06, + "loss": 0.5913, + "step": 2985 + }, + { + "epoch": 0.9179219182293268, + "grad_norm": 0.2974046468734741, + "learning_rate": 3.977149113162851e-06, + "loss": 0.6447, + "step": 2986 + }, + { + "epoch": 0.9182293267752843, + "grad_norm": 0.3163948357105255, + "learning_rate": 3.9764930260286186e-06, + "loss": 0.6486, + "step": 2987 + }, + { + "epoch": 0.9185367353212419, + "grad_norm": 0.3056921362876892, + "learning_rate": 3.9758367826995e-06, + "loss": 0.6197, + "step": 2988 + }, + { + "epoch": 0.9188441438671995, + "grad_norm": 0.3026762008666992, + "learning_rate": 3.975180383244918e-06, + "loss": 0.6031, + "step": 2989 + }, + { + "epoch": 0.919151552413157, + "grad_norm": 0.2938539385795593, + "learning_rate": 3.974523827734312e-06, + "loss": 0.6264, + "step": 2990 + }, + { + "epoch": 0.9194589609591146, + "grad_norm": 0.3070076107978821, + "learning_rate": 3.973867116237138e-06, + "loss": 0.5923, + "step": 2991 + }, + { + "epoch": 0.9197663695050723, + "grad_norm": 0.30741557478904724, + "learning_rate": 3.973210248822867e-06, + "loss": 0.6301, + "step": 2992 + }, + { + "epoch": 0.9200737780510299, + "grad_norm": 0.29768338799476624, + "learning_rate": 3.972553225560989e-06, + "loss": 0.5951, + "step": 2993 + }, + { + "epoch": 0.9203811865969874, + "grad_norm": 0.31364908814430237, + "learning_rate": 3.971896046521007e-06, + "loss": 0.6245, + "step": 2994 + }, + { + "epoch": 0.920688595142945, + "grad_norm": 0.2973991334438324, + "learning_rate": 3.971238711772444e-06, + "loss": 0.6243, + "step": 2995 + }, + { + "epoch": 0.9209960036889026, + "grad_norm": 0.30048540234565735, + "learning_rate": 3.970581221384837e-06, + "loss": 0.6235, + "step": 2996 + }, + { + "epoch": 0.9213034122348601, + "grad_norm": 0.30283868312835693, + "learning_rate": 3.969923575427742e-06, + "loss": 0.6113, + "step": 2997 + }, + { + "epoch": 0.9216108207808177, + "grad_norm": 0.29072505235671997, + "learning_rate": 3.969265773970728e-06, + "loss": 0.6258, + "step": 2998 + }, + { + "epoch": 0.9219182293267753, + "grad_norm": 0.30422037839889526, + "learning_rate": 3.968607817083383e-06, + "loss": 0.5907, + "step": 2999 + }, + { + "epoch": 0.9222256378727328, + "grad_norm": 0.2968962788581848, + "learning_rate": 3.967949704835311e-06, + "loss": 0.625, + "step": 3000 + }, + { + "epoch": 0.9225330464186905, + "grad_norm": 0.3079689145088196, + "learning_rate": 3.967291437296133e-06, + "loss": 0.6493, + "step": 3001 + }, + { + "epoch": 0.9228404549646481, + "grad_norm": 0.31143277883529663, + "learning_rate": 3.966633014535484e-06, + "loss": 0.6109, + "step": 3002 + }, + { + "epoch": 0.9231478635106056, + "grad_norm": 0.29072636365890503, + "learning_rate": 3.965974436623018e-06, + "loss": 0.6498, + "step": 3003 + }, + { + "epoch": 0.9234552720565632, + "grad_norm": 0.30744338035583496, + "learning_rate": 3.965315703628404e-06, + "loss": 0.6228, + "step": 3004 + }, + { + "epoch": 0.9237626806025208, + "grad_norm": 0.29519784450531006, + "learning_rate": 3.964656815621329e-06, + "loss": 0.6203, + "step": 3005 + }, + { + "epoch": 0.9240700891484783, + "grad_norm": 0.3089035153388977, + "learning_rate": 3.963997772671494e-06, + "loss": 0.637, + "step": 3006 + }, + { + "epoch": 0.9243774976944359, + "grad_norm": 0.3018156588077545, + "learning_rate": 3.963338574848619e-06, + "loss": 0.6301, + "step": 3007 + }, + { + "epoch": 0.9246849062403935, + "grad_norm": 0.29864224791526794, + "learning_rate": 3.962679222222437e-06, + "loss": 0.6019, + "step": 3008 + }, + { + "epoch": 0.924992314786351, + "grad_norm": 0.30080071091651917, + "learning_rate": 3.962019714862702e-06, + "loss": 0.6197, + "step": 3009 + }, + { + "epoch": 0.9252997233323086, + "grad_norm": 0.31929099559783936, + "learning_rate": 3.9613600528391795e-06, + "loss": 0.6146, + "step": 3010 + }, + { + "epoch": 0.9256071318782663, + "grad_norm": 0.300263375043869, + "learning_rate": 3.9607002362216554e-06, + "loss": 0.593, + "step": 3011 + }, + { + "epoch": 0.9259145404242238, + "grad_norm": 0.31488093733787537, + "learning_rate": 3.9600402650799295e-06, + "loss": 0.6185, + "step": 3012 + }, + { + "epoch": 0.9262219489701814, + "grad_norm": 0.30141910910606384, + "learning_rate": 3.959380139483818e-06, + "loss": 0.6196, + "step": 3013 + }, + { + "epoch": 0.926529357516139, + "grad_norm": 0.3132549226284027, + "learning_rate": 3.958719859503155e-06, + "loss": 0.6117, + "step": 3014 + }, + { + "epoch": 0.9268367660620965, + "grad_norm": 0.2903481721878052, + "learning_rate": 3.958059425207791e-06, + "loss": 0.6005, + "step": 3015 + }, + { + "epoch": 0.9271441746080541, + "grad_norm": 0.30704841017723083, + "learning_rate": 3.957398836667589e-06, + "loss": 0.6341, + "step": 3016 + }, + { + "epoch": 0.9274515831540117, + "grad_norm": 0.29083356261253357, + "learning_rate": 3.956738093952433e-06, + "loss": 0.5932, + "step": 3017 + }, + { + "epoch": 0.9277589916999692, + "grad_norm": 0.30894598364830017, + "learning_rate": 3.956077197132222e-06, + "loss": 0.6254, + "step": 3018 + }, + { + "epoch": 0.9280664002459268, + "grad_norm": 0.334259957075119, + "learning_rate": 3.955416146276871e-06, + "loss": 0.6163, + "step": 3019 + }, + { + "epoch": 0.9283738087918845, + "grad_norm": 0.30138206481933594, + "learning_rate": 3.954754941456308e-06, + "loss": 0.6199, + "step": 3020 + }, + { + "epoch": 0.928681217337842, + "grad_norm": 0.31409260630607605, + "learning_rate": 3.9540935827404844e-06, + "loss": 0.6335, + "step": 3021 + }, + { + "epoch": 0.9289886258837996, + "grad_norm": 0.30272454023361206, + "learning_rate": 3.953432070199361e-06, + "loss": 0.6028, + "step": 3022 + }, + { + "epoch": 0.9292960344297572, + "grad_norm": 0.2886223793029785, + "learning_rate": 3.952770403902919e-06, + "loss": 0.6069, + "step": 3023 + }, + { + "epoch": 0.9296034429757147, + "grad_norm": 0.3018653988838196, + "learning_rate": 3.952108583921154e-06, + "loss": 0.6267, + "step": 3024 + }, + { + "epoch": 0.9299108515216723, + "grad_norm": 0.322063684463501, + "learning_rate": 3.95144661032408e-06, + "loss": 0.6242, + "step": 3025 + }, + { + "epoch": 0.9302182600676299, + "grad_norm": 0.3052050769329071, + "learning_rate": 3.950784483181724e-06, + "loss": 0.6347, + "step": 3026 + }, + { + "epoch": 0.9305256686135874, + "grad_norm": 0.30574357509613037, + "learning_rate": 3.950122202564131e-06, + "loss": 0.6368, + "step": 3027 + }, + { + "epoch": 0.930833077159545, + "grad_norm": 0.3158875107765198, + "learning_rate": 3.949459768541363e-06, + "loss": 0.6074, + "step": 3028 + }, + { + "epoch": 0.9311404857055026, + "grad_norm": 0.29096725583076477, + "learning_rate": 3.948797181183497e-06, + "loss": 0.6249, + "step": 3029 + }, + { + "epoch": 0.9314478942514602, + "grad_norm": 0.3006783723831177, + "learning_rate": 3.948134440560627e-06, + "loss": 0.6286, + "step": 3030 + }, + { + "epoch": 0.9317553027974178, + "grad_norm": 0.30553990602493286, + "learning_rate": 3.947471546742863e-06, + "loss": 0.6078, + "step": 3031 + }, + { + "epoch": 0.9320627113433754, + "grad_norm": 0.31766512989997864, + "learning_rate": 3.946808499800329e-06, + "loss": 0.6379, + "step": 3032 + }, + { + "epoch": 0.9323701198893329, + "grad_norm": 0.3034387528896332, + "learning_rate": 3.9461452998031705e-06, + "loss": 0.6177, + "step": 3033 + }, + { + "epoch": 0.9326775284352905, + "grad_norm": 0.3088289201259613, + "learning_rate": 3.9454819468215425e-06, + "loss": 0.6077, + "step": 3034 + }, + { + "epoch": 0.9329849369812481, + "grad_norm": 0.29894864559173584, + "learning_rate": 3.944818440925623e-06, + "loss": 0.624, + "step": 3035 + }, + { + "epoch": 0.9332923455272056, + "grad_norm": 0.28950050473213196, + "learning_rate": 3.9441547821856e-06, + "loss": 0.6213, + "step": 3036 + }, + { + "epoch": 0.9335997540731632, + "grad_norm": 0.3120788037776947, + "learning_rate": 3.943490970671683e-06, + "loss": 0.6263, + "step": 3037 + }, + { + "epoch": 0.9339071626191208, + "grad_norm": 0.3045397102832794, + "learning_rate": 3.942827006454093e-06, + "loss": 0.6233, + "step": 3038 + }, + { + "epoch": 0.9342145711650784, + "grad_norm": 0.3037109971046448, + "learning_rate": 3.942162889603069e-06, + "loss": 0.6147, + "step": 3039 + }, + { + "epoch": 0.934521979711036, + "grad_norm": 0.2949826419353485, + "learning_rate": 3.94149862018887e-06, + "loss": 0.6358, + "step": 3040 + }, + { + "epoch": 0.9348293882569936, + "grad_norm": 0.3051709830760956, + "learning_rate": 3.940834198281763e-06, + "loss": 0.6273, + "step": 3041 + }, + { + "epoch": 0.9351367968029511, + "grad_norm": 0.31823763251304626, + "learning_rate": 3.940169623952038e-06, + "loss": 0.638, + "step": 3042 + }, + { + "epoch": 0.9354442053489087, + "grad_norm": 0.32294926047325134, + "learning_rate": 3.93950489727e-06, + "loss": 0.6295, + "step": 3043 + }, + { + "epoch": 0.9357516138948663, + "grad_norm": 0.3151704668998718, + "learning_rate": 3.9388400183059665e-06, + "loss": 0.5944, + "step": 3044 + }, + { + "epoch": 0.9360590224408238, + "grad_norm": 0.2958519756793976, + "learning_rate": 3.938174987130274e-06, + "loss": 0.6297, + "step": 3045 + }, + { + "epoch": 0.9363664309867814, + "grad_norm": 0.31529584527015686, + "learning_rate": 3.937509803813276e-06, + "loss": 0.6255, + "step": 3046 + }, + { + "epoch": 0.936673839532739, + "grad_norm": 0.31599727272987366, + "learning_rate": 3.93684446842534e-06, + "loss": 0.6368, + "step": 3047 + }, + { + "epoch": 0.9369812480786965, + "grad_norm": 0.33044853806495667, + "learning_rate": 3.9361789810368514e-06, + "loss": 0.6022, + "step": 3048 + }, + { + "epoch": 0.9372886566246542, + "grad_norm": 0.3072105348110199, + "learning_rate": 3.9355133417182076e-06, + "loss": 0.6272, + "step": 3049 + }, + { + "epoch": 0.9375960651706118, + "grad_norm": 0.3030715882778168, + "learning_rate": 3.934847550539828e-06, + "loss": 0.6228, + "step": 3050 + }, + { + "epoch": 0.9379034737165693, + "grad_norm": 0.31041330099105835, + "learning_rate": 3.934181607572145e-06, + "loss": 0.6104, + "step": 3051 + }, + { + "epoch": 0.9382108822625269, + "grad_norm": 0.30978724360466003, + "learning_rate": 3.933515512885606e-06, + "loss": 0.5868, + "step": 3052 + }, + { + "epoch": 0.9385182908084845, + "grad_norm": 0.3060765564441681, + "learning_rate": 3.932849266550676e-06, + "loss": 0.6037, + "step": 3053 + }, + { + "epoch": 0.938825699354442, + "grad_norm": 0.30832791328430176, + "learning_rate": 3.932182868637836e-06, + "loss": 0.5996, + "step": 3054 + }, + { + "epoch": 0.9391331079003996, + "grad_norm": 0.31210073828697205, + "learning_rate": 3.931516319217582e-06, + "loss": 0.629, + "step": 3055 + }, + { + "epoch": 0.9394405164463572, + "grad_norm": 0.30727681517601013, + "learning_rate": 3.930849618360427e-06, + "loss": 0.6395, + "step": 3056 + }, + { + "epoch": 0.9397479249923147, + "grad_norm": 0.30309441685676575, + "learning_rate": 3.930182766136901e-06, + "loss": 0.603, + "step": 3057 + }, + { + "epoch": 0.9400553335382723, + "grad_norm": 0.3039757311344147, + "learning_rate": 3.929515762617548e-06, + "loss": 0.628, + "step": 3058 + }, + { + "epoch": 0.94036274208423, + "grad_norm": 0.32207629084587097, + "learning_rate": 3.928848607872928e-06, + "loss": 0.6407, + "step": 3059 + }, + { + "epoch": 0.9406701506301876, + "grad_norm": 0.3187604248523712, + "learning_rate": 3.928181301973619e-06, + "loss": 0.6273, + "step": 3060 + }, + { + "epoch": 0.9409775591761451, + "grad_norm": 0.31495851278305054, + "learning_rate": 3.927513844990213e-06, + "loss": 0.6305, + "step": 3061 + }, + { + "epoch": 0.9412849677221027, + "grad_norm": 0.3095179796218872, + "learning_rate": 3.926846236993319e-06, + "loss": 0.6323, + "step": 3062 + }, + { + "epoch": 0.9415923762680602, + "grad_norm": 0.3196287751197815, + "learning_rate": 3.926178478053563e-06, + "loss": 0.6324, + "step": 3063 + }, + { + "epoch": 0.9418997848140178, + "grad_norm": 0.3267959654331207, + "learning_rate": 3.9255105682415824e-06, + "loss": 0.6163, + "step": 3064 + }, + { + "epoch": 0.9422071933599754, + "grad_norm": 0.31493785977363586, + "learning_rate": 3.924842507628038e-06, + "loss": 0.6121, + "step": 3065 + }, + { + "epoch": 0.942514601905933, + "grad_norm": 0.3126414120197296, + "learning_rate": 3.9241742962836e-06, + "loss": 0.6243, + "step": 3066 + }, + { + "epoch": 0.9428220104518905, + "grad_norm": 0.29945918917655945, + "learning_rate": 3.923505934278956e-06, + "loss": 0.6086, + "step": 3067 + }, + { + "epoch": 0.9431294189978482, + "grad_norm": 0.3285350501537323, + "learning_rate": 3.9228374216848135e-06, + "loss": 0.614, + "step": 3068 + }, + { + "epoch": 0.9434368275438058, + "grad_norm": 0.3179163932800293, + "learning_rate": 3.922168758571891e-06, + "loss": 0.623, + "step": 3069 + }, + { + "epoch": 0.9437442360897633, + "grad_norm": 0.31514057517051697, + "learning_rate": 3.921499945010926e-06, + "loss": 0.6188, + "step": 3070 + }, + { + "epoch": 0.9440516446357209, + "grad_norm": 0.30295488238334656, + "learning_rate": 3.920830981072669e-06, + "loss": 0.6091, + "step": 3071 + }, + { + "epoch": 0.9443590531816785, + "grad_norm": 0.31388038396835327, + "learning_rate": 3.92016186682789e-06, + "loss": 0.6016, + "step": 3072 + }, + { + "epoch": 0.944666461727636, + "grad_norm": 0.30659613013267517, + "learning_rate": 3.919492602347372e-06, + "loss": 0.6156, + "step": 3073 + }, + { + "epoch": 0.9449738702735936, + "grad_norm": 0.301763653755188, + "learning_rate": 3.918823187701916e-06, + "loss": 0.5933, + "step": 3074 + }, + { + "epoch": 0.9452812788195512, + "grad_norm": 0.3101961016654968, + "learning_rate": 3.918153622962336e-06, + "loss": 0.6467, + "step": 3075 + }, + { + "epoch": 0.9455886873655087, + "grad_norm": 0.31577301025390625, + "learning_rate": 3.917483908199466e-06, + "loss": 0.6305, + "step": 3076 + }, + { + "epoch": 0.9458960959114663, + "grad_norm": 0.30448248982429504, + "learning_rate": 3.916814043484152e-06, + "loss": 0.6413, + "step": 3077 + }, + { + "epoch": 0.946203504457424, + "grad_norm": 0.3052712380886078, + "learning_rate": 3.916144028887259e-06, + "loss": 0.6324, + "step": 3078 + }, + { + "epoch": 0.9465109130033815, + "grad_norm": 0.30894795060157776, + "learning_rate": 3.915473864479664e-06, + "loss": 0.5946, + "step": 3079 + }, + { + "epoch": 0.9468183215493391, + "grad_norm": 0.31840944290161133, + "learning_rate": 3.9148035503322645e-06, + "loss": 0.6318, + "step": 3080 + }, + { + "epoch": 0.9471257300952967, + "grad_norm": 0.3117295801639557, + "learning_rate": 3.914133086515971e-06, + "loss": 0.6236, + "step": 3081 + }, + { + "epoch": 0.9474331386412542, + "grad_norm": 0.3022928535938263, + "learning_rate": 3.91346247310171e-06, + "loss": 0.6158, + "step": 3082 + }, + { + "epoch": 0.9477405471872118, + "grad_norm": 0.3106420040130615, + "learning_rate": 3.912791710160423e-06, + "loss": 0.6152, + "step": 3083 + }, + { + "epoch": 0.9480479557331694, + "grad_norm": 0.29847419261932373, + "learning_rate": 3.91212079776307e-06, + "loss": 0.6331, + "step": 3084 + }, + { + "epoch": 0.9483553642791269, + "grad_norm": 0.3086632192134857, + "learning_rate": 3.911449735980626e-06, + "loss": 0.625, + "step": 3085 + }, + { + "epoch": 0.9486627728250845, + "grad_norm": 0.3139030933380127, + "learning_rate": 3.91077852488408e-06, + "loss": 0.6412, + "step": 3086 + }, + { + "epoch": 0.9489701813710422, + "grad_norm": 0.2981898784637451, + "learning_rate": 3.910107164544437e-06, + "loss": 0.6165, + "step": 3087 + }, + { + "epoch": 0.9492775899169997, + "grad_norm": 0.32215872406959534, + "learning_rate": 3.909435655032721e-06, + "loss": 0.5987, + "step": 3088 + }, + { + "epoch": 0.9495849984629573, + "grad_norm": 0.29775577783584595, + "learning_rate": 3.908763996419968e-06, + "loss": 0.623, + "step": 3089 + }, + { + "epoch": 0.9498924070089149, + "grad_norm": 0.29532137513160706, + "learning_rate": 3.9080921887772314e-06, + "loss": 0.631, + "step": 3090 + }, + { + "epoch": 0.9501998155548724, + "grad_norm": 0.30143025517463684, + "learning_rate": 3.90742023217558e-06, + "loss": 0.6287, + "step": 3091 + }, + { + "epoch": 0.95050722410083, + "grad_norm": 0.30286574363708496, + "learning_rate": 3.906748126686099e-06, + "loss": 0.632, + "step": 3092 + }, + { + "epoch": 0.9508146326467876, + "grad_norm": 0.3003418743610382, + "learning_rate": 3.9060758723798895e-06, + "loss": 0.6233, + "step": 3093 + }, + { + "epoch": 0.9511220411927451, + "grad_norm": 0.29685941338539124, + "learning_rate": 3.9054034693280666e-06, + "loss": 0.6268, + "step": 3094 + }, + { + "epoch": 0.9514294497387027, + "grad_norm": 0.30067992210388184, + "learning_rate": 3.904730917601763e-06, + "loss": 0.5654, + "step": 3095 + }, + { + "epoch": 0.9517368582846603, + "grad_norm": 0.3081408441066742, + "learning_rate": 3.904058217272126e-06, + "loss": 0.6046, + "step": 3096 + }, + { + "epoch": 0.9520442668306179, + "grad_norm": 0.2925926446914673, + "learning_rate": 3.90338536841032e-06, + "loss": 0.6212, + "step": 3097 + }, + { + "epoch": 0.9523516753765755, + "grad_norm": 0.3070280849933624, + "learning_rate": 3.902712371087522e-06, + "loss": 0.5997, + "step": 3098 + }, + { + "epoch": 0.9526590839225331, + "grad_norm": 0.3154558539390564, + "learning_rate": 3.902039225374929e-06, + "loss": 0.6271, + "step": 3099 + }, + { + "epoch": 0.9529664924684906, + "grad_norm": 0.30187663435935974, + "learning_rate": 3.901365931343752e-06, + "loss": 0.6113, + "step": 3100 + }, + { + "epoch": 0.9532739010144482, + "grad_norm": 0.3020797669887543, + "learning_rate": 3.9006924890652156e-06, + "loss": 0.631, + "step": 3101 + }, + { + "epoch": 0.9535813095604058, + "grad_norm": 0.30515143275260925, + "learning_rate": 3.900018898610562e-06, + "loss": 0.6372, + "step": 3102 + }, + { + "epoch": 0.9538887181063633, + "grad_norm": 0.30071115493774414, + "learning_rate": 3.89934516005105e-06, + "loss": 0.6283, + "step": 3103 + }, + { + "epoch": 0.9541961266523209, + "grad_norm": 0.3121829330921173, + "learning_rate": 3.898671273457952e-06, + "loss": 0.6273, + "step": 3104 + }, + { + "epoch": 0.9545035351982785, + "grad_norm": 0.2942935824394226, + "learning_rate": 3.897997238902557e-06, + "loss": 0.6149, + "step": 3105 + }, + { + "epoch": 0.9548109437442361, + "grad_norm": 0.30109840631484985, + "learning_rate": 3.897323056456169e-06, + "loss": 0.6002, + "step": 3106 + }, + { + "epoch": 0.9551183522901937, + "grad_norm": 0.31744182109832764, + "learning_rate": 3.896648726190109e-06, + "loss": 0.6172, + "step": 3107 + }, + { + "epoch": 0.9554257608361513, + "grad_norm": 0.3172870874404907, + "learning_rate": 3.895974248175714e-06, + "loss": 0.6311, + "step": 3108 + }, + { + "epoch": 0.9557331693821088, + "grad_norm": 0.3003832995891571, + "learning_rate": 3.895299622484333e-06, + "loss": 0.6178, + "step": 3109 + }, + { + "epoch": 0.9560405779280664, + "grad_norm": 0.3343397378921509, + "learning_rate": 3.894624849187334e-06, + "loss": 0.619, + "step": 3110 + }, + { + "epoch": 0.956347986474024, + "grad_norm": 0.29251742362976074, + "learning_rate": 3.893949928356101e-06, + "loss": 0.6374, + "step": 3111 + }, + { + "epoch": 0.9566553950199815, + "grad_norm": 0.3240315020084381, + "learning_rate": 3.893274860062031e-06, + "loss": 0.6099, + "step": 3112 + }, + { + "epoch": 0.9569628035659391, + "grad_norm": 0.3038442134857178, + "learning_rate": 3.892599644376539e-06, + "loss": 0.6296, + "step": 3113 + }, + { + "epoch": 0.9572702121118967, + "grad_norm": 0.3250772953033447, + "learning_rate": 3.891924281371055e-06, + "loss": 0.6381, + "step": 3114 + }, + { + "epoch": 0.9575776206578542, + "grad_norm": 0.3278203308582306, + "learning_rate": 3.891248771117022e-06, + "loss": 0.6241, + "step": 3115 + }, + { + "epoch": 0.9578850292038119, + "grad_norm": 0.31181949377059937, + "learning_rate": 3.890573113685902e-06, + "loss": 0.6139, + "step": 3116 + }, + { + "epoch": 0.9581924377497695, + "grad_norm": 0.3064418435096741, + "learning_rate": 3.889897309149171e-06, + "loss": 0.5991, + "step": 3117 + }, + { + "epoch": 0.958499846295727, + "grad_norm": 0.3094772398471832, + "learning_rate": 3.889221357578321e-06, + "loss": 0.6123, + "step": 3118 + }, + { + "epoch": 0.9588072548416846, + "grad_norm": 0.31118306517601013, + "learning_rate": 3.888545259044861e-06, + "loss": 0.6294, + "step": 3119 + }, + { + "epoch": 0.9591146633876422, + "grad_norm": 0.31489452719688416, + "learning_rate": 3.887869013620311e-06, + "loss": 0.589, + "step": 3120 + }, + { + "epoch": 0.9594220719335997, + "grad_norm": 0.2925233244895935, + "learning_rate": 3.8871926213762114e-06, + "loss": 0.6185, + "step": 3121 + }, + { + "epoch": 0.9597294804795573, + "grad_norm": 0.3111340403556824, + "learning_rate": 3.8865160823841155e-06, + "loss": 0.5989, + "step": 3122 + }, + { + "epoch": 0.9600368890255149, + "grad_norm": 0.32858824729919434, + "learning_rate": 3.885839396715592e-06, + "loss": 0.607, + "step": 3123 + }, + { + "epoch": 0.9603442975714724, + "grad_norm": 0.3165005147457123, + "learning_rate": 3.885162564442229e-06, + "loss": 0.605, + "step": 3124 + }, + { + "epoch": 0.9606517061174301, + "grad_norm": 0.3076055645942688, + "learning_rate": 3.884485585635623e-06, + "loss": 0.6323, + "step": 3125 + }, + { + "epoch": 0.9609591146633877, + "grad_norm": 0.3108116388320923, + "learning_rate": 3.8838084603673935e-06, + "loss": 0.6055, + "step": 3126 + }, + { + "epoch": 0.9612665232093452, + "grad_norm": 0.32404381036758423, + "learning_rate": 3.883131188709171e-06, + "loss": 0.6094, + "step": 3127 + }, + { + "epoch": 0.9615739317553028, + "grad_norm": 0.3090170621871948, + "learning_rate": 3.882453770732602e-06, + "loss": 0.6303, + "step": 3128 + }, + { + "epoch": 0.9618813403012604, + "grad_norm": 0.30147692561149597, + "learning_rate": 3.881776206509348e-06, + "loss": 0.6233, + "step": 3129 + }, + { + "epoch": 0.962188748847218, + "grad_norm": 0.299306184053421, + "learning_rate": 3.88109849611109e-06, + "loss": 0.6326, + "step": 3130 + }, + { + "epoch": 0.9624961573931755, + "grad_norm": 0.32337838411331177, + "learning_rate": 3.880420639609518e-06, + "loss": 0.6148, + "step": 3131 + }, + { + "epoch": 0.9628035659391331, + "grad_norm": 0.32928961515426636, + "learning_rate": 3.879742637076344e-06, + "loss": 0.5976, + "step": 3132 + }, + { + "epoch": 0.9631109744850906, + "grad_norm": 0.3099465072154999, + "learning_rate": 3.87906448858329e-06, + "loss": 0.5966, + "step": 3133 + }, + { + "epoch": 0.9634183830310482, + "grad_norm": 0.3220928907394409, + "learning_rate": 3.8783861942020975e-06, + "loss": 0.6187, + "step": 3134 + }, + { + "epoch": 0.9637257915770059, + "grad_norm": 0.30981767177581787, + "learning_rate": 3.877707754004522e-06, + "loss": 0.6126, + "step": 3135 + }, + { + "epoch": 0.9640332001229635, + "grad_norm": 0.32328781485557556, + "learning_rate": 3.877029168062332e-06, + "loss": 0.6239, + "step": 3136 + }, + { + "epoch": 0.964340608668921, + "grad_norm": 0.3168927729129791, + "learning_rate": 3.876350436447315e-06, + "loss": 0.6093, + "step": 3137 + }, + { + "epoch": 0.9646480172148786, + "grad_norm": 0.2983643710613251, + "learning_rate": 3.875671559231272e-06, + "loss": 0.6242, + "step": 3138 + }, + { + "epoch": 0.9649554257608361, + "grad_norm": 0.31516361236572266, + "learning_rate": 3.874992536486021e-06, + "loss": 0.6106, + "step": 3139 + }, + { + "epoch": 0.9652628343067937, + "grad_norm": 0.29410645365715027, + "learning_rate": 3.874313368283393e-06, + "loss": 0.6225, + "step": 3140 + }, + { + "epoch": 0.9655702428527513, + "grad_norm": 0.31093981862068176, + "learning_rate": 3.873634054695237e-06, + "loss": 0.6165, + "step": 3141 + }, + { + "epoch": 0.9658776513987088, + "grad_norm": 0.30124565958976746, + "learning_rate": 3.872954595793415e-06, + "loss": 0.6042, + "step": 3142 + }, + { + "epoch": 0.9661850599446664, + "grad_norm": 0.3097095191478729, + "learning_rate": 3.8722749916498045e-06, + "loss": 0.6195, + "step": 3143 + }, + { + "epoch": 0.9664924684906241, + "grad_norm": 0.31426912546157837, + "learning_rate": 3.8715952423363015e-06, + "loss": 0.6258, + "step": 3144 + }, + { + "epoch": 0.9667998770365817, + "grad_norm": 0.3012601137161255, + "learning_rate": 3.870915347924814e-06, + "loss": 0.6286, + "step": 3145 + }, + { + "epoch": 0.9671072855825392, + "grad_norm": 0.2977764308452606, + "learning_rate": 3.870235308487267e-06, + "loss": 0.588, + "step": 3146 + }, + { + "epoch": 0.9674146941284968, + "grad_norm": 0.2988179922103882, + "learning_rate": 3.8695551240956e-06, + "loss": 0.6106, + "step": 3147 + }, + { + "epoch": 0.9677221026744544, + "grad_norm": 0.31564080715179443, + "learning_rate": 3.868874794821767e-06, + "loss": 0.6064, + "step": 3148 + }, + { + "epoch": 0.9680295112204119, + "grad_norm": 0.2989579737186432, + "learning_rate": 3.8681943207377405e-06, + "loss": 0.6143, + "step": 3149 + }, + { + "epoch": 0.9683369197663695, + "grad_norm": 0.29288890957832336, + "learning_rate": 3.867513701915506e-06, + "loss": 0.6233, + "step": 3150 + }, + { + "epoch": 0.968644328312327, + "grad_norm": 0.3148396611213684, + "learning_rate": 3.866832938427063e-06, + "loss": 0.6327, + "step": 3151 + }, + { + "epoch": 0.9689517368582846, + "grad_norm": 0.305948406457901, + "learning_rate": 3.86615203034443e-06, + "loss": 0.6061, + "step": 3152 + }, + { + "epoch": 0.9692591454042422, + "grad_norm": 0.30244651436805725, + "learning_rate": 3.865470977739637e-06, + "loss": 0.6147, + "step": 3153 + }, + { + "epoch": 0.9695665539501999, + "grad_norm": 0.3150150775909424, + "learning_rate": 3.864789780684733e-06, + "loss": 0.5856, + "step": 3154 + }, + { + "epoch": 0.9698739624961574, + "grad_norm": 0.3161182999610901, + "learning_rate": 3.864108439251779e-06, + "loss": 0.6163, + "step": 3155 + }, + { + "epoch": 0.970181371042115, + "grad_norm": 0.3161320686340332, + "learning_rate": 3.863426953512851e-06, + "loss": 0.6304, + "step": 3156 + }, + { + "epoch": 0.9704887795880726, + "grad_norm": 0.3189849257469177, + "learning_rate": 3.862745323540046e-06, + "loss": 0.6284, + "step": 3157 + }, + { + "epoch": 0.9707961881340301, + "grad_norm": 0.3076252341270447, + "learning_rate": 3.862063549405468e-06, + "loss": 0.62, + "step": 3158 + }, + { + "epoch": 0.9711035966799877, + "grad_norm": 0.31868767738342285, + "learning_rate": 3.861381631181244e-06, + "loss": 0.6392, + "step": 3159 + }, + { + "epoch": 0.9714110052259453, + "grad_norm": 0.2976219356060028, + "learning_rate": 3.860699568939509e-06, + "loss": 0.6391, + "step": 3160 + }, + { + "epoch": 0.9717184137719028, + "grad_norm": 0.3049347698688507, + "learning_rate": 3.860017362752418e-06, + "loss": 0.6239, + "step": 3161 + }, + { + "epoch": 0.9720258223178604, + "grad_norm": 0.30632489919662476, + "learning_rate": 3.859335012692142e-06, + "loss": 0.6291, + "step": 3162 + }, + { + "epoch": 0.9723332308638181, + "grad_norm": 0.3105093836784363, + "learning_rate": 3.858652518830863e-06, + "loss": 0.6285, + "step": 3163 + }, + { + "epoch": 0.9726406394097756, + "grad_norm": 0.31097519397735596, + "learning_rate": 3.8579698812407815e-06, + "loss": 0.619, + "step": 3164 + }, + { + "epoch": 0.9729480479557332, + "grad_norm": 0.2975943684577942, + "learning_rate": 3.857287099994113e-06, + "loss": 0.6131, + "step": 3165 + }, + { + "epoch": 0.9732554565016908, + "grad_norm": 0.3065780699253082, + "learning_rate": 3.856604175163085e-06, + "loss": 0.6148, + "step": 3166 + }, + { + "epoch": 0.9735628650476483, + "grad_norm": 0.3277686834335327, + "learning_rate": 3.855921106819944e-06, + "loss": 0.6352, + "step": 3167 + }, + { + "epoch": 0.9738702735936059, + "grad_norm": 0.3146565556526184, + "learning_rate": 3.855237895036952e-06, + "loss": 0.6142, + "step": 3168 + }, + { + "epoch": 0.9741776821395635, + "grad_norm": 0.30731070041656494, + "learning_rate": 3.854554539886382e-06, + "loss": 0.6041, + "step": 3169 + }, + { + "epoch": 0.974485090685521, + "grad_norm": 0.28919950127601624, + "learning_rate": 3.853871041440524e-06, + "loss": 0.6159, + "step": 3170 + }, + { + "epoch": 0.9747924992314786, + "grad_norm": 0.2980407476425171, + "learning_rate": 3.853187399771686e-06, + "loss": 0.6366, + "step": 3171 + }, + { + "epoch": 0.9750999077774362, + "grad_norm": 0.2955489754676819, + "learning_rate": 3.852503614952188e-06, + "loss": 0.6249, + "step": 3172 + }, + { + "epoch": 0.9754073163233938, + "grad_norm": 0.32059094309806824, + "learning_rate": 3.851819687054367e-06, + "loss": 0.62, + "step": 3173 + }, + { + "epoch": 0.9757147248693514, + "grad_norm": 0.31086355447769165, + "learning_rate": 3.851135616150572e-06, + "loss": 0.6384, + "step": 3174 + }, + { + "epoch": 0.976022133415309, + "grad_norm": 0.3083001375198364, + "learning_rate": 3.850451402313171e-06, + "loss": 0.6019, + "step": 3175 + }, + { + "epoch": 0.9763295419612665, + "grad_norm": 0.29288142919540405, + "learning_rate": 3.849767045614545e-06, + "loss": 0.6343, + "step": 3176 + }, + { + "epoch": 0.9766369505072241, + "grad_norm": 0.3042510449886322, + "learning_rate": 3.849082546127091e-06, + "loss": 0.6257, + "step": 3177 + }, + { + "epoch": 0.9769443590531817, + "grad_norm": 0.3090575635433197, + "learning_rate": 3.848397903923219e-06, + "loss": 0.6213, + "step": 3178 + }, + { + "epoch": 0.9772517675991392, + "grad_norm": 0.32032713294029236, + "learning_rate": 3.847713119075358e-06, + "loss": 0.6331, + "step": 3179 + }, + { + "epoch": 0.9775591761450968, + "grad_norm": 0.30812394618988037, + "learning_rate": 3.847028191655949e-06, + "loss": 0.6243, + "step": 3180 + }, + { + "epoch": 0.9778665846910544, + "grad_norm": 0.31247591972351074, + "learning_rate": 3.846343121737449e-06, + "loss": 0.656, + "step": 3181 + }, + { + "epoch": 0.978173993237012, + "grad_norm": 0.30322980880737305, + "learning_rate": 3.84565790939233e-06, + "loss": 0.6145, + "step": 3182 + }, + { + "epoch": 0.9784814017829696, + "grad_norm": 0.3117871880531311, + "learning_rate": 3.8449725546930786e-06, + "loss": 0.5869, + "step": 3183 + }, + { + "epoch": 0.9787888103289272, + "grad_norm": 0.300976425409317, + "learning_rate": 3.844287057712197e-06, + "loss": 0.6246, + "step": 3184 + }, + { + "epoch": 0.9790962188748847, + "grad_norm": 0.3130923807621002, + "learning_rate": 3.843601418522203e-06, + "loss": 0.6355, + "step": 3185 + }, + { + "epoch": 0.9794036274208423, + "grad_norm": 0.30826887488365173, + "learning_rate": 3.842915637195628e-06, + "loss": 0.6329, + "step": 3186 + }, + { + "epoch": 0.9797110359667999, + "grad_norm": 0.30372175574302673, + "learning_rate": 3.84222971380502e-06, + "loss": 0.6179, + "step": 3187 + }, + { + "epoch": 0.9800184445127574, + "grad_norm": 0.29121559858322144, + "learning_rate": 3.841543648422942e-06, + "loss": 0.6298, + "step": 3188 + }, + { + "epoch": 0.980325853058715, + "grad_norm": 0.3049377501010895, + "learning_rate": 3.840857441121969e-06, + "loss": 0.6183, + "step": 3189 + }, + { + "epoch": 0.9806332616046726, + "grad_norm": 0.3098234236240387, + "learning_rate": 3.840171091974695e-06, + "loss": 0.6148, + "step": 3190 + }, + { + "epoch": 0.9809406701506301, + "grad_norm": 0.3066501319408417, + "learning_rate": 3.839484601053728e-06, + "loss": 0.6074, + "step": 3191 + }, + { + "epoch": 0.9812480786965878, + "grad_norm": 0.2980804443359375, + "learning_rate": 3.8387979684316885e-06, + "loss": 0.6318, + "step": 3192 + }, + { + "epoch": 0.9815554872425454, + "grad_norm": 0.31224313378334045, + "learning_rate": 3.8381111941812145e-06, + "loss": 0.6268, + "step": 3193 + }, + { + "epoch": 0.9818628957885029, + "grad_norm": 0.30289995670318604, + "learning_rate": 3.837424278374959e-06, + "loss": 0.6281, + "step": 3194 + }, + { + "epoch": 0.9821703043344605, + "grad_norm": 0.3005608022212982, + "learning_rate": 3.836737221085588e-06, + "loss": 0.6062, + "step": 3195 + }, + { + "epoch": 0.9824777128804181, + "grad_norm": 0.30535492300987244, + "learning_rate": 3.836050022385785e-06, + "loss": 0.6255, + "step": 3196 + }, + { + "epoch": 0.9827851214263756, + "grad_norm": 0.2978629767894745, + "learning_rate": 3.835362682348247e-06, + "loss": 0.6084, + "step": 3197 + }, + { + "epoch": 0.9830925299723332, + "grad_norm": 0.29700347781181335, + "learning_rate": 3.834675201045686e-06, + "loss": 0.5896, + "step": 3198 + }, + { + "epoch": 0.9833999385182908, + "grad_norm": 0.30251795053482056, + "learning_rate": 3.833987578550829e-06, + "loss": 0.622, + "step": 3199 + }, + { + "epoch": 0.9837073470642483, + "grad_norm": 0.3262624442577362, + "learning_rate": 3.8332998149364175e-06, + "loss": 0.6372, + "step": 3200 + }, + { + "epoch": 0.984014755610206, + "grad_norm": 0.3081316649913788, + "learning_rate": 3.832611910275209e-06, + "loss": 0.618, + "step": 3201 + }, + { + "epoch": 0.9843221641561636, + "grad_norm": 0.3123590350151062, + "learning_rate": 3.831923864639976e-06, + "loss": 0.6226, + "step": 3202 + }, + { + "epoch": 0.9846295727021211, + "grad_norm": 0.31658753752708435, + "learning_rate": 3.831235678103504e-06, + "loss": 0.6018, + "step": 3203 + }, + { + "epoch": 0.9849369812480787, + "grad_norm": 0.32160836458206177, + "learning_rate": 3.830547350738596e-06, + "loss": 0.6283, + "step": 3204 + }, + { + "epoch": 0.9852443897940363, + "grad_norm": 0.30282074213027954, + "learning_rate": 3.829858882618067e-06, + "loss": 0.6449, + "step": 3205 + }, + { + "epoch": 0.9855517983399938, + "grad_norm": 0.3293642997741699, + "learning_rate": 3.829170273814751e-06, + "loss": 0.5859, + "step": 3206 + }, + { + "epoch": 0.9858592068859514, + "grad_norm": 0.29925718903541565, + "learning_rate": 3.828481524401493e-06, + "loss": 0.6101, + "step": 3207 + }, + { + "epoch": 0.986166615431909, + "grad_norm": 0.30473560094833374, + "learning_rate": 3.827792634451154e-06, + "loss": 0.6119, + "step": 3208 + }, + { + "epoch": 0.9864740239778665, + "grad_norm": 0.3029218316078186, + "learning_rate": 3.827103604036609e-06, + "loss": 0.6466, + "step": 3209 + }, + { + "epoch": 0.9867814325238241, + "grad_norm": 0.3122491240501404, + "learning_rate": 3.826414433230751e-06, + "loss": 0.6108, + "step": 3210 + }, + { + "epoch": 0.9870888410697818, + "grad_norm": 0.30126506090164185, + "learning_rate": 3.825725122106485e-06, + "loss": 0.6106, + "step": 3211 + }, + { + "epoch": 0.9873962496157394, + "grad_norm": 0.29143834114074707, + "learning_rate": 3.825035670736732e-06, + "loss": 0.605, + "step": 3212 + }, + { + "epoch": 0.9877036581616969, + "grad_norm": 0.29969364404678345, + "learning_rate": 3.824346079194428e-06, + "loss": 0.5961, + "step": 3213 + }, + { + "epoch": 0.9880110667076545, + "grad_norm": 0.31834620237350464, + "learning_rate": 3.823656347552521e-06, + "loss": 0.6408, + "step": 3214 + }, + { + "epoch": 0.988318475253612, + "grad_norm": 0.3116997480392456, + "learning_rate": 3.822966475883979e-06, + "loss": 0.6281, + "step": 3215 + }, + { + "epoch": 0.9886258837995696, + "grad_norm": 0.31109219789505005, + "learning_rate": 3.822276464261781e-06, + "loss": 0.6188, + "step": 3216 + }, + { + "epoch": 0.9889332923455272, + "grad_norm": 0.3044778108596802, + "learning_rate": 3.821586312758921e-06, + "loss": 0.6021, + "step": 3217 + }, + { + "epoch": 0.9892407008914847, + "grad_norm": 0.3029796779155731, + "learning_rate": 3.820896021448409e-06, + "loss": 0.6099, + "step": 3218 + }, + { + "epoch": 0.9895481094374423, + "grad_norm": 0.30369582772254944, + "learning_rate": 3.8202055904032694e-06, + "loss": 0.6119, + "step": 3219 + }, + { + "epoch": 0.9898555179834, + "grad_norm": 0.3388898968696594, + "learning_rate": 3.8195150196965414e-06, + "loss": 0.5945, + "step": 3220 + }, + { + "epoch": 0.9901629265293576, + "grad_norm": 0.2996682822704315, + "learning_rate": 3.818824309401279e-06, + "loss": 0.6176, + "step": 3221 + }, + { + "epoch": 0.9904703350753151, + "grad_norm": 0.3192312717437744, + "learning_rate": 3.818133459590552e-06, + "loss": 0.6244, + "step": 3222 + }, + { + "epoch": 0.9907777436212727, + "grad_norm": 0.3216210901737213, + "learning_rate": 3.817442470337441e-06, + "loss": 0.6257, + "step": 3223 + }, + { + "epoch": 0.9910851521672303, + "grad_norm": 0.32142961025238037, + "learning_rate": 3.816751341715047e-06, + "loss": 0.6202, + "step": 3224 + }, + { + "epoch": 0.9913925607131878, + "grad_norm": 0.30123358964920044, + "learning_rate": 3.816060073796481e-06, + "loss": 0.6168, + "step": 3225 + }, + { + "epoch": 0.9916999692591454, + "grad_norm": 0.3368956446647644, + "learning_rate": 3.815368666654874e-06, + "loss": 0.6093, + "step": 3226 + }, + { + "epoch": 0.992007377805103, + "grad_norm": 0.30887940526008606, + "learning_rate": 3.8146771203633637e-06, + "loss": 0.6387, + "step": 3227 + }, + { + "epoch": 0.9923147863510605, + "grad_norm": 0.30802878737449646, + "learning_rate": 3.81398543499511e-06, + "loss": 0.6301, + "step": 3228 + }, + { + "epoch": 0.9926221948970181, + "grad_norm": 0.3133756220340729, + "learning_rate": 3.8132936106232844e-06, + "loss": 0.6029, + "step": 3229 + }, + { + "epoch": 0.9929296034429758, + "grad_norm": 0.29889655113220215, + "learning_rate": 3.8126016473210736e-06, + "loss": 0.5996, + "step": 3230 + }, + { + "epoch": 0.9932370119889333, + "grad_norm": 0.30858662724494934, + "learning_rate": 3.811909545161677e-06, + "loss": 0.6391, + "step": 3231 + }, + { + "epoch": 0.9935444205348909, + "grad_norm": 0.29993724822998047, + "learning_rate": 3.8112173042183144e-06, + "loss": 0.589, + "step": 3232 + }, + { + "epoch": 0.9938518290808485, + "grad_norm": 0.3075692653656006, + "learning_rate": 3.810524924564213e-06, + "loss": 0.6239, + "step": 3233 + }, + { + "epoch": 0.994159237626806, + "grad_norm": 0.30158087611198425, + "learning_rate": 3.8098324062726188e-06, + "loss": 0.6281, + "step": 3234 + }, + { + "epoch": 0.9944666461727636, + "grad_norm": 0.30300167202949524, + "learning_rate": 3.8091397494167925e-06, + "loss": 0.6325, + "step": 3235 + }, + { + "epoch": 0.9947740547187212, + "grad_norm": 0.3152081072330475, + "learning_rate": 3.8084469540700087e-06, + "loss": 0.6082, + "step": 3236 + }, + { + "epoch": 0.9950814632646787, + "grad_norm": 0.3233110010623932, + "learning_rate": 3.8077540203055556e-06, + "loss": 0.6242, + "step": 3237 + }, + { + "epoch": 0.9953888718106363, + "grad_norm": 0.29228100180625916, + "learning_rate": 3.807060948196738e-06, + "loss": 0.6278, + "step": 3238 + }, + { + "epoch": 0.995696280356594, + "grad_norm": 0.30293628573417664, + "learning_rate": 3.806367737816875e-06, + "loss": 0.5845, + "step": 3239 + }, + { + "epoch": 0.9960036889025515, + "grad_norm": 0.3011828660964966, + "learning_rate": 3.8056743892392982e-06, + "loss": 0.5923, + "step": 3240 + }, + { + "epoch": 0.9963110974485091, + "grad_norm": 0.31442439556121826, + "learning_rate": 3.804980902537357e-06, + "loss": 0.6357, + "step": 3241 + }, + { + "epoch": 0.9966185059944667, + "grad_norm": 0.31455183029174805, + "learning_rate": 3.804287277784412e-06, + "loss": 0.6422, + "step": 3242 + }, + { + "epoch": 0.9969259145404242, + "grad_norm": 0.30632883310317993, + "learning_rate": 3.8035935150538423e-06, + "loss": 0.6542, + "step": 3243 + }, + { + "epoch": 0.9972333230863818, + "grad_norm": 0.32481056451797485, + "learning_rate": 3.802899614419038e-06, + "loss": 0.6094, + "step": 3244 + }, + { + "epoch": 0.9975407316323394, + "grad_norm": 0.3031092882156372, + "learning_rate": 3.8022055759534052e-06, + "loss": 0.6325, + "step": 3245 + }, + { + "epoch": 0.9978481401782969, + "grad_norm": 0.29607757925987244, + "learning_rate": 3.8015113997303655e-06, + "loss": 0.6384, + "step": 3246 + }, + { + "epoch": 0.9981555487242545, + "grad_norm": 0.3074757158756256, + "learning_rate": 3.800817085823355e-06, + "loss": 0.6293, + "step": 3247 + }, + { + "epoch": 0.9984629572702121, + "grad_norm": 0.3202285170555115, + "learning_rate": 3.8001226343058217e-06, + "loss": 0.6013, + "step": 3248 + }, + { + "epoch": 0.9987703658161697, + "grad_norm": 0.30392986536026, + "learning_rate": 3.7994280452512312e-06, + "loss": 0.6178, + "step": 3249 + }, + { + "epoch": 0.9990777743621273, + "grad_norm": 0.3055027425289154, + "learning_rate": 3.7987333187330622e-06, + "loss": 0.6133, + "step": 3250 + }, + { + "epoch": 0.9993851829080849, + "grad_norm": 0.3334810137748718, + "learning_rate": 3.7980384548248083e-06, + "loss": 0.6176, + "step": 3251 + }, + { + "epoch": 0.9996925914540424, + "grad_norm": 0.30586081743240356, + "learning_rate": 3.7973434535999776e-06, + "loss": 0.6263, + "step": 3252 + }, + { + "epoch": 1.0, + "grad_norm": 0.3043861985206604, + "learning_rate": 3.7966483151320925e-06, + "loss": 0.6345, + "step": 3253 + } + ], + "logging_steps": 1, + "max_steps": 9759, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 3253, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.9147994053733777e+19, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}