diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28883 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 8241, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007280669821623589, + "grad_norm": 38.75, + "learning_rate": 1.25e-08, + "loss": 1.6701526641845703, + "step": 2 + }, + { + "epoch": 0.0014561339643247178, + "grad_norm": 8.625, + "learning_rate": 3.75e-08, + "loss": 1.5440375804901123, + "step": 4 + }, + { + "epoch": 0.002184200946487077, + "grad_norm": 139.0, + "learning_rate": 6.25e-08, + "loss": 2.2783899307250977, + "step": 6 + }, + { + "epoch": 0.0029122679286494356, + "grad_norm": 5.65625, + "learning_rate": 8.75e-08, + "loss": 1.6183305978775024, + "step": 8 + }, + { + "epoch": 0.0036403349108117948, + "grad_norm": 26.5, + "learning_rate": 1.125e-07, + "loss": 2.030928611755371, + "step": 10 + }, + { + "epoch": 0.004368401892974154, + "grad_norm": 16.75, + "learning_rate": 1.375e-07, + "loss": 2.2140793800354004, + "step": 12 + }, + { + "epoch": 0.005096468875136513, + "grad_norm": 29.0, + "learning_rate": 1.625e-07, + "loss": 1.7079442739486694, + "step": 14 + }, + { + "epoch": 0.005824535857298871, + "grad_norm": 29.375, + "learning_rate": 1.875e-07, + "loss": 1.8502070903778076, + "step": 16 + }, + { + "epoch": 0.006552602839461231, + "grad_norm": 51.5, + "learning_rate": 2.1249999999999998e-07, + "loss": 1.8441216945648193, + "step": 18 + }, + { + "epoch": 0.0072806698216235895, + "grad_norm": 24.75, + "learning_rate": 2.3749999999999998e-07, + "loss": 1.9404842853546143, + "step": 20 + }, + { + "epoch": 0.008008736803785948, + "grad_norm": 12.1875, + "learning_rate": 2.625e-07, + "loss": 1.9104686975479126, + "step": 22 + }, + { + "epoch": 0.008736803785948308, + "grad_norm": 10.5625, + "learning_rate": 2.8749999999999995e-07, + "loss": 1.91398024559021, + "step": 24 + }, + { + "epoch": 0.009464870768110666, + "grad_norm": 13.5, + "learning_rate": 3.1249999999999997e-07, + "loss": 1.7455148696899414, + "step": 26 + }, + { + "epoch": 0.010192937750273025, + "grad_norm": 6.3125, + "learning_rate": 3.375e-07, + "loss": 1.1563613414764404, + "step": 28 + }, + { + "epoch": 0.010921004732435385, + "grad_norm": 5.53125, + "learning_rate": 3.6249999999999997e-07, + "loss": 1.7613410949707031, + "step": 30 + }, + { + "epoch": 0.011649071714597743, + "grad_norm": 8.5, + "learning_rate": 3.875e-07, + "loss": 1.7684240341186523, + "step": 32 + }, + { + "epoch": 0.012377138696760102, + "grad_norm": 43.75, + "learning_rate": 4.1249999999999997e-07, + "loss": 1.6029598712921143, + "step": 34 + }, + { + "epoch": 0.013105205678922462, + "grad_norm": 18.5, + "learning_rate": 4.375e-07, + "loss": 1.4408862590789795, + "step": 36 + }, + { + "epoch": 0.01383327266108482, + "grad_norm": 12.8125, + "learning_rate": 4.625e-07, + "loss": 1.7882254123687744, + "step": 38 + }, + { + "epoch": 0.014561339643247179, + "grad_norm": 4.90625, + "learning_rate": 4.875e-07, + "loss": 1.5668270587921143, + "step": 40 + }, + { + "epoch": 0.015289406625409537, + "grad_norm": 7.21875, + "learning_rate": 5.125e-07, + "loss": 1.5615062713623047, + "step": 42 + }, + { + "epoch": 0.016017473607571896, + "grad_norm": 20.625, + "learning_rate": 5.374999999999999e-07, + "loss": 1.8634085655212402, + "step": 44 + }, + { + "epoch": 0.016745540589734254, + "grad_norm": 11.125, + "learning_rate": 5.625e-07, + "loss": 1.7552001476287842, + "step": 46 + }, + { + "epoch": 0.017473607571896616, + "grad_norm": 5.71875, + "learning_rate": 5.875e-07, + "loss": 1.539961576461792, + "step": 48 + }, + { + "epoch": 0.018201674554058973, + "grad_norm": 14.1875, + "learning_rate": 6.125000000000001e-07, + "loss": 2.1092772483825684, + "step": 50 + }, + { + "epoch": 0.01892974153622133, + "grad_norm": 18.875, + "learning_rate": 6.374999999999999e-07, + "loss": 1.7703300714492798, + "step": 52 + }, + { + "epoch": 0.019657808518383692, + "grad_norm": 19.875, + "learning_rate": 6.624999999999999e-07, + "loss": 1.68864905834198, + "step": 54 + }, + { + "epoch": 0.02038587550054605, + "grad_norm": 13.0625, + "learning_rate": 6.875e-07, + "loss": 1.6324245929718018, + "step": 56 + }, + { + "epoch": 0.021113942482708408, + "grad_norm": 31.625, + "learning_rate": 7.125e-07, + "loss": 1.9167428016662598, + "step": 58 + }, + { + "epoch": 0.02184200946487077, + "grad_norm": 5.65625, + "learning_rate": 7.375e-07, + "loss": 1.6022722721099854, + "step": 60 + }, + { + "epoch": 0.022570076447033127, + "grad_norm": 6.6875, + "learning_rate": 7.624999999999999e-07, + "loss": 1.3783684968948364, + "step": 62 + }, + { + "epoch": 0.023298143429195485, + "grad_norm": 17.75, + "learning_rate": 7.875e-07, + "loss": 1.5951179265975952, + "step": 64 + }, + { + "epoch": 0.024026210411357846, + "grad_norm": 20.125, + "learning_rate": 8.125e-07, + "loss": 1.8818310499191284, + "step": 66 + }, + { + "epoch": 0.024754277393520204, + "grad_norm": 19.75, + "learning_rate": 8.375e-07, + "loss": 1.768096685409546, + "step": 68 + }, + { + "epoch": 0.025482344375682562, + "grad_norm": 15.0, + "learning_rate": 8.625e-07, + "loss": 1.7854245901107788, + "step": 70 + }, + { + "epoch": 0.026210411357844923, + "grad_norm": 13.9375, + "learning_rate": 8.874999999999999e-07, + "loss": 1.4966001510620117, + "step": 72 + }, + { + "epoch": 0.02693847834000728, + "grad_norm": 33.5, + "learning_rate": 9.124999999999999e-07, + "loss": 1.1714494228363037, + "step": 74 + }, + { + "epoch": 0.02766654532216964, + "grad_norm": 30.625, + "learning_rate": 9.374999999999999e-07, + "loss": 1.5881410837173462, + "step": 76 + }, + { + "epoch": 0.028394612304332, + "grad_norm": 10.0, + "learning_rate": 9.624999999999999e-07, + "loss": 1.8096888065338135, + "step": 78 + }, + { + "epoch": 0.029122679286494358, + "grad_norm": 75.0, + "learning_rate": 9.875e-07, + "loss": 1.8397023677825928, + "step": 80 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 5.46875, + "learning_rate": 9.999999703624035e-07, + "loss": 1.5183111429214478, + "step": 82 + }, + { + "epoch": 0.030578813250819074, + "grad_norm": 24.5, + "learning_rate": 9.999997332616586e-07, + "loss": 1.5736167430877686, + "step": 84 + }, + { + "epoch": 0.031306880232981435, + "grad_norm": 12.5, + "learning_rate": 9.999992590603089e-07, + "loss": 1.7563443183898926, + "step": 86 + }, + { + "epoch": 0.03203494721514379, + "grad_norm": 10.625, + "learning_rate": 9.999985477586358e-07, + "loss": 1.5300308465957642, + "step": 88 + }, + { + "epoch": 0.03276301419730615, + "grad_norm": 8.0625, + "learning_rate": 9.99997599357061e-07, + "loss": 1.5263605117797852, + "step": 90 + }, + { + "epoch": 0.03349108117946851, + "grad_norm": 42.0, + "learning_rate": 9.999964138561468e-07, + "loss": 1.7090799808502197, + "step": 92 + }, + { + "epoch": 0.03421914816163087, + "grad_norm": 8.3125, + "learning_rate": 9.999949912565952e-07, + "loss": 1.4451731443405151, + "step": 94 + }, + { + "epoch": 0.03494721514379323, + "grad_norm": 10.6875, + "learning_rate": 9.999933315592501e-07, + "loss": 1.9472981691360474, + "step": 96 + }, + { + "epoch": 0.03567528212595559, + "grad_norm": 11.8125, + "learning_rate": 9.99991434765095e-07, + "loss": 2.2682976722717285, + "step": 98 + }, + { + "epoch": 0.03640334910811795, + "grad_norm": 13.6875, + "learning_rate": 9.999893008752545e-07, + "loss": 1.6405476331710815, + "step": 100 + }, + { + "epoch": 0.037131416090280304, + "grad_norm": 7.25, + "learning_rate": 9.999869298909934e-07, + "loss": 0.9278600215911865, + "step": 102 + }, + { + "epoch": 0.03785948307244266, + "grad_norm": 6.53125, + "learning_rate": 9.999843218137167e-07, + "loss": 1.526383638381958, + "step": 104 + }, + { + "epoch": 0.03858755005460503, + "grad_norm": 16.25, + "learning_rate": 9.999814766449708e-07, + "loss": 1.4768216609954834, + "step": 106 + }, + { + "epoch": 0.039315617036767385, + "grad_norm": 4.90625, + "learning_rate": 9.99978394386442e-07, + "loss": 1.1783959865570068, + "step": 108 + }, + { + "epoch": 0.04004368401892974, + "grad_norm": 14.4375, + "learning_rate": 9.999750750399574e-07, + "loss": 1.9052605628967285, + "step": 110 + }, + { + "epoch": 0.0407717510010921, + "grad_norm": 13.625, + "learning_rate": 9.999715186074842e-07, + "loss": 1.4604225158691406, + "step": 112 + }, + { + "epoch": 0.04149981798325446, + "grad_norm": 14.8125, + "learning_rate": 9.99967725091131e-07, + "loss": 1.7786848545074463, + "step": 114 + }, + { + "epoch": 0.042227884965416816, + "grad_norm": 21.125, + "learning_rate": 9.999636944931464e-07, + "loss": 1.4846135377883911, + "step": 116 + }, + { + "epoch": 0.042955951947579174, + "grad_norm": 27.75, + "learning_rate": 9.99959426815919e-07, + "loss": 1.8612072467803955, + "step": 118 + }, + { + "epoch": 0.04368401892974154, + "grad_norm": 13.9375, + "learning_rate": 9.99954922061979e-07, + "loss": 1.358473539352417, + "step": 120 + }, + { + "epoch": 0.0444120859119039, + "grad_norm": 14.3125, + "learning_rate": 9.999501802339963e-07, + "loss": 1.2156610488891602, + "step": 122 + }, + { + "epoch": 0.045140152894066254, + "grad_norm": 15.6875, + "learning_rate": 9.999452013347818e-07, + "loss": 1.9383506774902344, + "step": 124 + }, + { + "epoch": 0.04586821987622861, + "grad_norm": 5.75, + "learning_rate": 9.999399853672864e-07, + "loss": 1.1553086042404175, + "step": 126 + }, + { + "epoch": 0.04659628685839097, + "grad_norm": 18.75, + "learning_rate": 9.999345323346024e-07, + "loss": 2.207151174545288, + "step": 128 + }, + { + "epoch": 0.04732435384055333, + "grad_norm": 12.9375, + "learning_rate": 9.999288422399619e-07, + "loss": 1.4385790824890137, + "step": 130 + }, + { + "epoch": 0.04805242082271569, + "grad_norm": 8.1875, + "learning_rate": 9.999229150867376e-07, + "loss": 1.6901663541793823, + "step": 132 + }, + { + "epoch": 0.04878048780487805, + "grad_norm": 13.625, + "learning_rate": 9.99916750878443e-07, + "loss": 2.0154829025268555, + "step": 134 + }, + { + "epoch": 0.04950855478704041, + "grad_norm": 12.75, + "learning_rate": 9.999103496187318e-07, + "loss": 1.2983794212341309, + "step": 136 + }, + { + "epoch": 0.050236621769202766, + "grad_norm": 3.375, + "learning_rate": 9.999037113113983e-07, + "loss": 0.7801814079284668, + "step": 138 + }, + { + "epoch": 0.050964688751365124, + "grad_norm": 30.125, + "learning_rate": 9.998968359603776e-07, + "loss": 1.260277509689331, + "step": 140 + }, + { + "epoch": 0.05169275573352748, + "grad_norm": 74.5, + "learning_rate": 9.998897235697447e-07, + "loss": 1.8376469612121582, + "step": 142 + }, + { + "epoch": 0.052420822715689847, + "grad_norm": 3.71875, + "learning_rate": 9.99882374143716e-07, + "loss": 1.6184654235839844, + "step": 144 + }, + { + "epoch": 0.053148889697852204, + "grad_norm": 26.625, + "learning_rate": 9.998747876866475e-07, + "loss": 2.4433298110961914, + "step": 146 + }, + { + "epoch": 0.05387695668001456, + "grad_norm": 9.75, + "learning_rate": 9.99866964203036e-07, + "loss": 1.6185541152954102, + "step": 148 + }, + { + "epoch": 0.05460502366217692, + "grad_norm": 24.5, + "learning_rate": 9.998589036975198e-07, + "loss": 1.8726744651794434, + "step": 150 + }, + { + "epoch": 0.05533309064433928, + "grad_norm": 9.5, + "learning_rate": 9.998506061748756e-07, + "loss": 1.1015827655792236, + "step": 152 + }, + { + "epoch": 0.056061157626501636, + "grad_norm": 12.5, + "learning_rate": 9.998420716400223e-07, + "loss": 1.5632359981536865, + "step": 154 + }, + { + "epoch": 0.056789224608664, + "grad_norm": 30.125, + "learning_rate": 9.998333000980187e-07, + "loss": 1.4904030561447144, + "step": 156 + }, + { + "epoch": 0.05751729159082636, + "grad_norm": 10.875, + "learning_rate": 9.99824291554064e-07, + "loss": 1.4396255016326904, + "step": 158 + }, + { + "epoch": 0.058245358572988716, + "grad_norm": 23.875, + "learning_rate": 9.998150460134982e-07, + "loss": 2.1215415000915527, + "step": 160 + }, + { + "epoch": 0.058973425555151074, + "grad_norm": 12.8125, + "learning_rate": 9.998055634818018e-07, + "loss": 1.838822603225708, + "step": 162 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 69.0, + "learning_rate": 9.997958439645952e-07, + "loss": 1.844721794128418, + "step": 164 + }, + { + "epoch": 0.06042955951947579, + "grad_norm": 16.25, + "learning_rate": 9.9978588746764e-07, + "loss": 2.0135297775268555, + "step": 166 + }, + { + "epoch": 0.06115762650163815, + "grad_norm": 15.25, + "learning_rate": 9.997756939968377e-07, + "loss": 1.5659213066101074, + "step": 168 + }, + { + "epoch": 0.06188569348380051, + "grad_norm": 12.0625, + "learning_rate": 9.997652635582305e-07, + "loss": 1.6769918203353882, + "step": 170 + }, + { + "epoch": 0.06261376046596287, + "grad_norm": 20.375, + "learning_rate": 9.997545961580015e-07, + "loss": 1.50160551071167, + "step": 172 + }, + { + "epoch": 0.06334182744812522, + "grad_norm": 33.25, + "learning_rate": 9.99743691802473e-07, + "loss": 1.9703657627105713, + "step": 174 + }, + { + "epoch": 0.06406989443028759, + "grad_norm": 10.5625, + "learning_rate": 9.997325504981094e-07, + "loss": 1.795241117477417, + "step": 176 + }, + { + "epoch": 0.06479796141244995, + "grad_norm": 111.5, + "learning_rate": 9.997211722515142e-07, + "loss": 1.4168236255645752, + "step": 178 + }, + { + "epoch": 0.0655260283946123, + "grad_norm": 11.5625, + "learning_rate": 9.997095570694321e-07, + "loss": 1.7200202941894531, + "step": 180 + }, + { + "epoch": 0.06625409537677467, + "grad_norm": 5.3125, + "learning_rate": 9.996977049587478e-07, + "loss": 1.500860571861267, + "step": 182 + }, + { + "epoch": 0.06698216235893702, + "grad_norm": 6.9375, + "learning_rate": 9.996856159264871e-07, + "loss": 1.3273615837097168, + "step": 184 + }, + { + "epoch": 0.06771022934109938, + "grad_norm": 13.125, + "learning_rate": 9.996732899798154e-07, + "loss": 1.321894884109497, + "step": 186 + }, + { + "epoch": 0.06843829632326175, + "grad_norm": 12.1875, + "learning_rate": 9.99660727126039e-07, + "loss": 1.6595005989074707, + "step": 188 + }, + { + "epoch": 0.0691663633054241, + "grad_norm": 25.375, + "learning_rate": 9.996479273726045e-07, + "loss": 1.8635836839675903, + "step": 190 + }, + { + "epoch": 0.06989443028758646, + "grad_norm": 44.25, + "learning_rate": 9.996348907270995e-07, + "loss": 1.752155065536499, + "step": 192 + }, + { + "epoch": 0.07062249726974881, + "grad_norm": 42.25, + "learning_rate": 9.996216171972508e-07, + "loss": 1.2666740417480469, + "step": 194 + }, + { + "epoch": 0.07135056425191118, + "grad_norm": 14.0, + "learning_rate": 9.996081067909269e-07, + "loss": 1.573847770690918, + "step": 196 + }, + { + "epoch": 0.07207863123407353, + "grad_norm": 16.875, + "learning_rate": 9.995943595161354e-07, + "loss": 1.5668258666992188, + "step": 198 + }, + { + "epoch": 0.0728066982162359, + "grad_norm": 4.59375, + "learning_rate": 9.995803753810255e-07, + "loss": 1.3051080703735352, + "step": 200 + }, + { + "epoch": 0.07353476519839826, + "grad_norm": 11.1875, + "learning_rate": 9.995661543938864e-07, + "loss": 1.935826063156128, + "step": 202 + }, + { + "epoch": 0.07426283218056061, + "grad_norm": 11.375, + "learning_rate": 9.995516965631476e-07, + "loss": 1.7311904430389404, + "step": 204 + }, + { + "epoch": 0.07499089916272297, + "grad_norm": 11.8125, + "learning_rate": 9.995370018973787e-07, + "loss": 1.7337747812271118, + "step": 206 + }, + { + "epoch": 0.07571896614488532, + "grad_norm": 26.125, + "learning_rate": 9.995220704052902e-07, + "loss": 1.330377221107483, + "step": 208 + }, + { + "epoch": 0.07644703312704769, + "grad_norm": 11.875, + "learning_rate": 9.995069020957327e-07, + "loss": 1.9736666679382324, + "step": 210 + }, + { + "epoch": 0.07717510010921005, + "grad_norm": 9.75, + "learning_rate": 9.994914969776975e-07, + "loss": 1.5441925525665283, + "step": 212 + }, + { + "epoch": 0.0779031670913724, + "grad_norm": 10.6875, + "learning_rate": 9.994758550603154e-07, + "loss": 1.3768044710159302, + "step": 214 + }, + { + "epoch": 0.07863123407353477, + "grad_norm": 10.6875, + "learning_rate": 9.99459976352859e-07, + "loss": 1.7713137865066528, + "step": 216 + }, + { + "epoch": 0.07935930105569712, + "grad_norm": 14.6875, + "learning_rate": 9.9944386086474e-07, + "loss": 1.6875104904174805, + "step": 218 + }, + { + "epoch": 0.08008736803785949, + "grad_norm": 12.25, + "learning_rate": 9.994275086055106e-07, + "loss": 1.7201932668685913, + "step": 220 + }, + { + "epoch": 0.08081543502002184, + "grad_norm": 23.125, + "learning_rate": 9.994109195848642e-07, + "loss": 1.8243221044540405, + "step": 222 + }, + { + "epoch": 0.0815435020021842, + "grad_norm": 2.59375, + "learning_rate": 9.993940938126334e-07, + "loss": 1.333681583404541, + "step": 224 + }, + { + "epoch": 0.08227156898434657, + "grad_norm": 15.375, + "learning_rate": 9.993770312987925e-07, + "loss": 1.3824301958084106, + "step": 226 + }, + { + "epoch": 0.08299963596650892, + "grad_norm": 14.625, + "learning_rate": 9.993597320534545e-07, + "loss": 1.970837950706482, + "step": 228 + }, + { + "epoch": 0.08372770294867128, + "grad_norm": 11.5625, + "learning_rate": 9.99342196086874e-07, + "loss": 1.2264541387557983, + "step": 230 + }, + { + "epoch": 0.08445576993083363, + "grad_norm": 13.4375, + "learning_rate": 9.993244234094456e-07, + "loss": 1.9517059326171875, + "step": 232 + }, + { + "epoch": 0.085183836912996, + "grad_norm": 16.375, + "learning_rate": 9.993064140317035e-07, + "loss": 1.9142043590545654, + "step": 234 + }, + { + "epoch": 0.08591190389515835, + "grad_norm": 28.75, + "learning_rate": 9.992881679643234e-07, + "loss": 1.3287625312805176, + "step": 236 + }, + { + "epoch": 0.08663997087732071, + "grad_norm": 16.75, + "learning_rate": 9.992696852181205e-07, + "loss": 1.1787291765213013, + "step": 238 + }, + { + "epoch": 0.08736803785948308, + "grad_norm": 11.8125, + "learning_rate": 9.992509658040503e-07, + "loss": 1.333935260772705, + "step": 240 + }, + { + "epoch": 0.08809610484164543, + "grad_norm": 8.5, + "learning_rate": 9.99232009733209e-07, + "loss": 1.5971310138702393, + "step": 242 + }, + { + "epoch": 0.0888241718238078, + "grad_norm": 10.75, + "learning_rate": 9.992128170168328e-07, + "loss": 1.5595444440841675, + "step": 244 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 32.0, + "learning_rate": 9.99193387666298e-07, + "loss": 1.9195828437805176, + "step": 246 + }, + { + "epoch": 0.09028030578813251, + "grad_norm": 16.75, + "learning_rate": 9.991737216931217e-07, + "loss": 2.0701756477355957, + "step": 248 + }, + { + "epoch": 0.09100837277029487, + "grad_norm": 5.21875, + "learning_rate": 9.99153819108961e-07, + "loss": 1.0891534090042114, + "step": 250 + }, + { + "epoch": 0.09173643975245722, + "grad_norm": 4.28125, + "learning_rate": 9.99133679925613e-07, + "loss": 1.224845051765442, + "step": 252 + }, + { + "epoch": 0.09246450673461959, + "grad_norm": 18.875, + "learning_rate": 9.991133041550147e-07, + "loss": 1.3348400592803955, + "step": 254 + }, + { + "epoch": 0.09319257371678194, + "grad_norm": 10.75, + "learning_rate": 9.99092691809245e-07, + "loss": 1.8456023931503296, + "step": 256 + }, + { + "epoch": 0.0939206406989443, + "grad_norm": 12.25, + "learning_rate": 9.990718429005212e-07, + "loss": 1.3841841220855713, + "step": 258 + }, + { + "epoch": 0.09464870768110666, + "grad_norm": 11.4375, + "learning_rate": 9.990507574412016e-07, + "loss": 1.731948971748352, + "step": 260 + }, + { + "epoch": 0.09537677466326902, + "grad_norm": 29.5, + "learning_rate": 9.990294354437845e-07, + "loss": 2.075178623199463, + "step": 262 + }, + { + "epoch": 0.09610484164543139, + "grad_norm": 41.5, + "learning_rate": 9.99007876920909e-07, + "loss": 1.4239192008972168, + "step": 264 + }, + { + "epoch": 0.09683290862759374, + "grad_norm": 4.84375, + "learning_rate": 9.989860818853537e-07, + "loss": 1.3436033725738525, + "step": 266 + }, + { + "epoch": 0.0975609756097561, + "grad_norm": 12.375, + "learning_rate": 9.989640503500376e-07, + "loss": 1.332651138305664, + "step": 268 + }, + { + "epoch": 0.09828904259191845, + "grad_norm": 7.25, + "learning_rate": 9.9894178232802e-07, + "loss": 1.4065250158309937, + "step": 270 + }, + { + "epoch": 0.09901710957408082, + "grad_norm": 6.5, + "learning_rate": 9.989192778325004e-07, + "loss": 1.4498741626739502, + "step": 272 + }, + { + "epoch": 0.09974517655624317, + "grad_norm": 4.96875, + "learning_rate": 9.988965368768182e-07, + "loss": 1.3354822397232056, + "step": 274 + }, + { + "epoch": 0.10047324353840553, + "grad_norm": 62.5, + "learning_rate": 9.988735594744532e-07, + "loss": 1.3507013320922852, + "step": 276 + }, + { + "epoch": 0.1012013105205679, + "grad_norm": 21.125, + "learning_rate": 9.988503456390256e-07, + "loss": 1.7165093421936035, + "step": 278 + }, + { + "epoch": 0.10192937750273025, + "grad_norm": 11.75, + "learning_rate": 9.988268953842952e-07, + "loss": 1.728652834892273, + "step": 280 + }, + { + "epoch": 0.10265744448489261, + "grad_norm": 20.75, + "learning_rate": 9.988032087241618e-07, + "loss": 1.5289946794509888, + "step": 282 + }, + { + "epoch": 0.10338551146705496, + "grad_norm": 12.4375, + "learning_rate": 9.987792856726661e-07, + "loss": 1.461730718612671, + "step": 284 + }, + { + "epoch": 0.10411357844921733, + "grad_norm": 13.375, + "learning_rate": 9.987551262439888e-07, + "loss": 1.2372313737869263, + "step": 286 + }, + { + "epoch": 0.10484164543137969, + "grad_norm": 21.0, + "learning_rate": 9.987307304524502e-07, + "loss": 1.56668221950531, + "step": 288 + }, + { + "epoch": 0.10556971241354204, + "grad_norm": 26.75, + "learning_rate": 9.987060983125106e-07, + "loss": 1.99294114112854, + "step": 290 + }, + { + "epoch": 0.10629777939570441, + "grad_norm": 16.375, + "learning_rate": 9.986812298387713e-07, + "loss": 1.6428991556167603, + "step": 292 + }, + { + "epoch": 0.10702584637786676, + "grad_norm": 8.1875, + "learning_rate": 9.986561250459728e-07, + "loss": 1.4644380807876587, + "step": 294 + }, + { + "epoch": 0.10775391336002912, + "grad_norm": 12.375, + "learning_rate": 9.98630783948996e-07, + "loss": 1.4112789630889893, + "step": 296 + }, + { + "epoch": 0.10848198034219148, + "grad_norm": 22.5, + "learning_rate": 9.98605206562862e-07, + "loss": 1.5523277521133423, + "step": 298 + }, + { + "epoch": 0.10921004732435384, + "grad_norm": 7.71875, + "learning_rate": 9.985793929027322e-07, + "loss": 1.5722994804382324, + "step": 300 + }, + { + "epoch": 0.1099381143065162, + "grad_norm": 9.5, + "learning_rate": 9.985533429839071e-07, + "loss": 1.4232302904129028, + "step": 302 + }, + { + "epoch": 0.11066618128867856, + "grad_norm": 31.25, + "learning_rate": 9.98527056821828e-07, + "loss": 1.6651103496551514, + "step": 304 + }, + { + "epoch": 0.11139424827084092, + "grad_norm": 10.9375, + "learning_rate": 9.985005344320763e-07, + "loss": 1.4233425855636597, + "step": 306 + }, + { + "epoch": 0.11212231525300327, + "grad_norm": 10.3125, + "learning_rate": 9.984737758303728e-07, + "loss": 1.5064756870269775, + "step": 308 + }, + { + "epoch": 0.11285038223516564, + "grad_norm": 12.125, + "learning_rate": 9.984467810325792e-07, + "loss": 1.372679352760315, + "step": 310 + }, + { + "epoch": 0.113578449217328, + "grad_norm": 9.375, + "learning_rate": 9.984195500546963e-07, + "loss": 1.5836809873580933, + "step": 312 + }, + { + "epoch": 0.11430651619949035, + "grad_norm": 11.25, + "learning_rate": 9.983920829128655e-07, + "loss": 1.8380441665649414, + "step": 314 + }, + { + "epoch": 0.11503458318165272, + "grad_norm": 28.75, + "learning_rate": 9.983643796233683e-07, + "loss": 1.612189531326294, + "step": 316 + }, + { + "epoch": 0.11576265016381507, + "grad_norm": 10.0, + "learning_rate": 9.983364402026254e-07, + "loss": 1.7775076627731323, + "step": 318 + }, + { + "epoch": 0.11649071714597743, + "grad_norm": 24.0, + "learning_rate": 9.983082646671978e-07, + "loss": 1.6349949836730957, + "step": 320 + }, + { + "epoch": 0.11721878412813978, + "grad_norm": 12.625, + "learning_rate": 9.982798530337872e-07, + "loss": 1.5466070175170898, + "step": 322 + }, + { + "epoch": 0.11794685111030215, + "grad_norm": 10.625, + "learning_rate": 9.982512053192344e-07, + "loss": 1.5946142673492432, + "step": 324 + }, + { + "epoch": 0.11867491809246451, + "grad_norm": 53.75, + "learning_rate": 9.982223215405203e-07, + "loss": 1.292494773864746, + "step": 326 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 5.8125, + "learning_rate": 9.981932017147658e-07, + "loss": 1.5809032917022705, + "step": 328 + }, + { + "epoch": 0.12013105205678923, + "grad_norm": 5.6875, + "learning_rate": 9.981638458592318e-07, + "loss": 0.9446820616722107, + "step": 330 + }, + { + "epoch": 0.12085911903895158, + "grad_norm": 11.25, + "learning_rate": 9.981342539913192e-07, + "loss": 1.282289743423462, + "step": 332 + }, + { + "epoch": 0.12158718602111394, + "grad_norm": 12.5, + "learning_rate": 9.981044261285684e-07, + "loss": 1.641176700592041, + "step": 334 + }, + { + "epoch": 0.1223152530032763, + "grad_norm": 29.25, + "learning_rate": 9.9807436228866e-07, + "loss": 1.9249002933502197, + "step": 336 + }, + { + "epoch": 0.12304331998543866, + "grad_norm": 14.8125, + "learning_rate": 9.980440624894141e-07, + "loss": 1.2323753833770752, + "step": 338 + }, + { + "epoch": 0.12377138696760102, + "grad_norm": 11.5625, + "learning_rate": 9.980135267487917e-07, + "loss": 1.4582998752593994, + "step": 340 + }, + { + "epoch": 0.12449945394976338, + "grad_norm": 2.953125, + "learning_rate": 9.979827550848922e-07, + "loss": 1.1940613985061646, + "step": 342 + }, + { + "epoch": 0.12522752093192574, + "grad_norm": 12.375, + "learning_rate": 9.97951747515956e-07, + "loss": 1.571114420890808, + "step": 344 + }, + { + "epoch": 0.1259555879140881, + "grad_norm": 5.59375, + "learning_rate": 9.979205040603626e-07, + "loss": 1.4128682613372803, + "step": 346 + }, + { + "epoch": 0.12668365489625044, + "grad_norm": 32.75, + "learning_rate": 9.978890247366317e-07, + "loss": 1.355841875076294, + "step": 348 + }, + { + "epoch": 0.12741172187841282, + "grad_norm": 17.375, + "learning_rate": 9.97857309563423e-07, + "loss": 1.7977299690246582, + "step": 350 + }, + { + "epoch": 0.12813978886057517, + "grad_norm": 12.875, + "learning_rate": 9.978253585595352e-07, + "loss": 1.3079497814178467, + "step": 352 + }, + { + "epoch": 0.12886785584273752, + "grad_norm": 24.0, + "learning_rate": 9.97793171743908e-07, + "loss": 1.5428612232208252, + "step": 354 + }, + { + "epoch": 0.1295959228248999, + "grad_norm": 9.1875, + "learning_rate": 9.977607491356197e-07, + "loss": 1.2862461805343628, + "step": 356 + }, + { + "epoch": 0.13032398980706225, + "grad_norm": 10.0, + "learning_rate": 9.97728090753889e-07, + "loss": 1.1886472702026367, + "step": 358 + }, + { + "epoch": 0.1310520567892246, + "grad_norm": 12.5625, + "learning_rate": 9.97695196618074e-07, + "loss": 1.4950411319732666, + "step": 360 + }, + { + "epoch": 0.13178012377138698, + "grad_norm": 16.875, + "learning_rate": 9.976620667476733e-07, + "loss": 1.5768711566925049, + "step": 362 + }, + { + "epoch": 0.13250819075354933, + "grad_norm": 10.5, + "learning_rate": 9.976287011623241e-07, + "loss": 1.8466485738754272, + "step": 364 + }, + { + "epoch": 0.13323625773571168, + "grad_norm": 7.6875, + "learning_rate": 9.975950998818044e-07, + "loss": 0.9884294867515564, + "step": 366 + }, + { + "epoch": 0.13396432471787403, + "grad_norm": 10.9375, + "learning_rate": 9.975612629260311e-07, + "loss": 1.528059959411621, + "step": 368 + }, + { + "epoch": 0.1346923917000364, + "grad_norm": 23.625, + "learning_rate": 9.97527190315061e-07, + "loss": 1.7985742092132568, + "step": 370 + }, + { + "epoch": 0.13542045868219876, + "grad_norm": 8.25, + "learning_rate": 9.974928820690912e-07, + "loss": 1.5363093614578247, + "step": 372 + }, + { + "epoch": 0.13614852566436111, + "grad_norm": 22.375, + "learning_rate": 9.974583382084574e-07, + "loss": 1.1907068490982056, + "step": 374 + }, + { + "epoch": 0.1368765926465235, + "grad_norm": 11.4375, + "learning_rate": 9.974235587536362e-07, + "loss": 1.4294685125350952, + "step": 376 + }, + { + "epoch": 0.13760465962868584, + "grad_norm": 13.625, + "learning_rate": 9.973885437252428e-07, + "loss": 1.548203468322754, + "step": 378 + }, + { + "epoch": 0.1383327266108482, + "grad_norm": 12.8125, + "learning_rate": 9.973532931440323e-07, + "loss": 1.421518087387085, + "step": 380 + }, + { + "epoch": 0.13906079359301055, + "grad_norm": 10.6875, + "learning_rate": 9.973178070308996e-07, + "loss": 1.314737319946289, + "step": 382 + }, + { + "epoch": 0.13978886057517292, + "grad_norm": 11.4375, + "learning_rate": 9.972820854068796e-07, + "loss": 1.5612094402313232, + "step": 384 + }, + { + "epoch": 0.14051692755733527, + "grad_norm": 11.0, + "learning_rate": 9.972461282931456e-07, + "loss": 1.5735772848129272, + "step": 386 + }, + { + "epoch": 0.14124499453949763, + "grad_norm": 13.25, + "learning_rate": 9.97209935711012e-07, + "loss": 1.5486785173416138, + "step": 388 + }, + { + "epoch": 0.14197306152166, + "grad_norm": 19.0, + "learning_rate": 9.971735076819317e-07, + "loss": 1.6336555480957031, + "step": 390 + }, + { + "epoch": 0.14270112850382236, + "grad_norm": 47.75, + "learning_rate": 9.971368442274974e-07, + "loss": 1.876061201095581, + "step": 392 + }, + { + "epoch": 0.1434291954859847, + "grad_norm": 8.1875, + "learning_rate": 9.970999453694415e-07, + "loss": 1.145683765411377, + "step": 394 + }, + { + "epoch": 0.14415726246814706, + "grad_norm": 15.875, + "learning_rate": 9.970628111296357e-07, + "loss": 1.6138184070587158, + "step": 396 + }, + { + "epoch": 0.14488532945030944, + "grad_norm": 4.03125, + "learning_rate": 9.97025441530092e-07, + "loss": 1.297837257385254, + "step": 398 + }, + { + "epoch": 0.1456133964324718, + "grad_norm": 7.625, + "learning_rate": 9.969878365929604e-07, + "loss": 1.9408948421478271, + "step": 400 + }, + { + "epoch": 0.14634146341463414, + "grad_norm": 3.078125, + "learning_rate": 9.96949996340532e-07, + "loss": 1.2934406995773315, + "step": 402 + }, + { + "epoch": 0.14706953039679652, + "grad_norm": 21.0, + "learning_rate": 9.969119207952366e-07, + "loss": 1.6207654476165771, + "step": 404 + }, + { + "epoch": 0.14779759737895887, + "grad_norm": 21.625, + "learning_rate": 9.968736099796434e-07, + "loss": 1.4777135848999023, + "step": 406 + }, + { + "epoch": 0.14852566436112122, + "grad_norm": 12.9375, + "learning_rate": 9.96835063916461e-07, + "loss": 1.5631728172302246, + "step": 408 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 16.75, + "learning_rate": 9.967962826285378e-07, + "loss": 1.4863369464874268, + "step": 410 + }, + { + "epoch": 0.14998179832544595, + "grad_norm": 50.5, + "learning_rate": 9.967572661388619e-07, + "loss": 1.6331464052200317, + "step": 412 + }, + { + "epoch": 0.1507098653076083, + "grad_norm": 22.5, + "learning_rate": 9.967180144705599e-07, + "loss": 1.033508539199829, + "step": 414 + }, + { + "epoch": 0.15143793228977065, + "grad_norm": 4.53125, + "learning_rate": 9.966785276468984e-07, + "loss": 1.4137763977050781, + "step": 416 + }, + { + "epoch": 0.15216599927193303, + "grad_norm": 24.125, + "learning_rate": 9.96638805691283e-07, + "loss": 1.2360162734985352, + "step": 418 + }, + { + "epoch": 0.15289406625409538, + "grad_norm": 13.375, + "learning_rate": 9.965988486272598e-07, + "loss": 1.8099534511566162, + "step": 420 + }, + { + "epoch": 0.15362213323625773, + "grad_norm": 9.5625, + "learning_rate": 9.96558656478513e-07, + "loss": 1.4680379629135132, + "step": 422 + }, + { + "epoch": 0.1543502002184201, + "grad_norm": 15.25, + "learning_rate": 9.965182292688661e-07, + "loss": 1.7172298431396484, + "step": 424 + }, + { + "epoch": 0.15507826720058246, + "grad_norm": 12.875, + "learning_rate": 9.96477567022283e-07, + "loss": 1.8104881048202515, + "step": 426 + }, + { + "epoch": 0.1558063341827448, + "grad_norm": 20.25, + "learning_rate": 9.96436669762866e-07, + "loss": 1.4450172185897827, + "step": 428 + }, + { + "epoch": 0.15653440116490716, + "grad_norm": 4.375, + "learning_rate": 9.963955375148574e-07, + "loss": 1.3675915002822876, + "step": 430 + }, + { + "epoch": 0.15726246814706954, + "grad_norm": 58.5, + "learning_rate": 9.963541703026382e-07, + "loss": 1.0661791563034058, + "step": 432 + }, + { + "epoch": 0.1579905351292319, + "grad_norm": 4.1875, + "learning_rate": 9.963125681507288e-07, + "loss": 1.4803763628005981, + "step": 434 + }, + { + "epoch": 0.15871860211139424, + "grad_norm": 6.59375, + "learning_rate": 9.962707310837888e-07, + "loss": 1.3467367887496948, + "step": 436 + }, + { + "epoch": 0.15944666909355662, + "grad_norm": 13.625, + "learning_rate": 9.962286591266176e-07, + "loss": 1.5567784309387207, + "step": 438 + }, + { + "epoch": 0.16017473607571897, + "grad_norm": 24.375, + "learning_rate": 9.961863523041534e-07, + "loss": 1.643012523651123, + "step": 440 + }, + { + "epoch": 0.16090280305788132, + "grad_norm": 10.3125, + "learning_rate": 9.961438106414733e-07, + "loss": 1.7364649772644043, + "step": 442 + }, + { + "epoch": 0.16163087004004367, + "grad_norm": 652.0, + "learning_rate": 9.961010341637944e-07, + "loss": 1.80694580078125, + "step": 444 + }, + { + "epoch": 0.16235893702220605, + "grad_norm": 13.25, + "learning_rate": 9.96058022896472e-07, + "loss": 1.1390612125396729, + "step": 446 + }, + { + "epoch": 0.1630870040043684, + "grad_norm": 7.875, + "learning_rate": 9.960147768650016e-07, + "loss": 0.9993503093719482, + "step": 448 + }, + { + "epoch": 0.16381507098653075, + "grad_norm": 11.375, + "learning_rate": 9.959712960950171e-07, + "loss": 1.5846750736236572, + "step": 450 + }, + { + "epoch": 0.16454313796869313, + "grad_norm": 18.875, + "learning_rate": 9.95927580612292e-07, + "loss": 1.630218267440796, + "step": 452 + }, + { + "epoch": 0.16527120495085548, + "grad_norm": 7.4375, + "learning_rate": 9.958836304427384e-07, + "loss": 1.3499300479888916, + "step": 454 + }, + { + "epoch": 0.16599927193301783, + "grad_norm": 14.125, + "learning_rate": 9.95839445612408e-07, + "loss": 1.0567526817321777, + "step": 456 + }, + { + "epoch": 0.16672733891518018, + "grad_norm": 14.9375, + "learning_rate": 9.957950261474922e-07, + "loss": 1.5589746236801147, + "step": 458 + }, + { + "epoch": 0.16745540589734256, + "grad_norm": 10.0, + "learning_rate": 9.957503720743195e-07, + "loss": 1.5366376638412476, + "step": 460 + }, + { + "epoch": 0.1681834728795049, + "grad_norm": 17.5, + "learning_rate": 9.957054834193592e-07, + "loss": 1.4441428184509277, + "step": 462 + }, + { + "epoch": 0.16891153986166726, + "grad_norm": 12.3125, + "learning_rate": 9.956603602092193e-07, + "loss": 1.6015567779541016, + "step": 464 + }, + { + "epoch": 0.16963960684382964, + "grad_norm": 19.125, + "learning_rate": 9.956150024706465e-07, + "loss": 1.856958270072937, + "step": 466 + }, + { + "epoch": 0.170367673825992, + "grad_norm": 5.15625, + "learning_rate": 9.955694102305268e-07, + "loss": 1.3967382907867432, + "step": 468 + }, + { + "epoch": 0.17109574080815435, + "grad_norm": 9.875, + "learning_rate": 9.95523583515885e-07, + "loss": 1.4672859907150269, + "step": 470 + }, + { + "epoch": 0.1718238077903167, + "grad_norm": 20.0, + "learning_rate": 9.954775223538847e-07, + "loss": 1.7151458263397217, + "step": 472 + }, + { + "epoch": 0.17255187477247907, + "grad_norm": 5.90625, + "learning_rate": 9.954312267718294e-07, + "loss": 1.3410698175430298, + "step": 474 + }, + { + "epoch": 0.17327994175464143, + "grad_norm": 11.1875, + "learning_rate": 9.953846967971603e-07, + "loss": 1.5901882648468018, + "step": 476 + }, + { + "epoch": 0.17400800873680378, + "grad_norm": 10.8125, + "learning_rate": 9.953379324574583e-07, + "loss": 1.5626811981201172, + "step": 478 + }, + { + "epoch": 0.17473607571896616, + "grad_norm": 14.875, + "learning_rate": 9.95290933780443e-07, + "loss": 1.8437721729278564, + "step": 480 + }, + { + "epoch": 0.1754641427011285, + "grad_norm": 8.8125, + "learning_rate": 9.952437007939733e-07, + "loss": 1.4951350688934326, + "step": 482 + }, + { + "epoch": 0.17619220968329086, + "grad_norm": 27.75, + "learning_rate": 9.951962335260464e-07, + "loss": 1.9525244235992432, + "step": 484 + }, + { + "epoch": 0.17692027666545324, + "grad_norm": 11.875, + "learning_rate": 9.951485320047985e-07, + "loss": 1.324737548828125, + "step": 486 + }, + { + "epoch": 0.1776483436476156, + "grad_norm": 21.0, + "learning_rate": 9.951005962585048e-07, + "loss": 1.5963342189788818, + "step": 488 + }, + { + "epoch": 0.17837641062977794, + "grad_norm": 22.25, + "learning_rate": 9.950524263155795e-07, + "loss": 1.9673795700073242, + "step": 490 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 9.1875, + "learning_rate": 9.950040222045754e-07, + "loss": 1.3773561716079712, + "step": 492 + }, + { + "epoch": 0.17983254459410267, + "grad_norm": 21.75, + "learning_rate": 9.949553839541837e-07, + "loss": 1.2185407876968384, + "step": 494 + }, + { + "epoch": 0.18056061157626502, + "grad_norm": 12.0, + "learning_rate": 9.949065115932354e-07, + "loss": 1.4488515853881836, + "step": 496 + }, + { + "epoch": 0.18128867855842737, + "grad_norm": 18.25, + "learning_rate": 9.948574051506994e-07, + "loss": 1.6164606809616089, + "step": 498 + }, + { + "epoch": 0.18201674554058975, + "grad_norm": 8.0625, + "learning_rate": 9.948080646556838e-07, + "loss": 1.6966197490692139, + "step": 500 + }, + { + "epoch": 0.1827448125227521, + "grad_norm": 19.875, + "learning_rate": 9.947584901374349e-07, + "loss": 1.0857893228530884, + "step": 502 + }, + { + "epoch": 0.18347287950491445, + "grad_norm": 32.5, + "learning_rate": 9.947086816253384e-07, + "loss": 0.7579207420349121, + "step": 504 + }, + { + "epoch": 0.1842009464870768, + "grad_norm": 12.375, + "learning_rate": 9.946586391489185e-07, + "loss": 1.2130523920059204, + "step": 506 + }, + { + "epoch": 0.18492901346923918, + "grad_norm": 9.25, + "learning_rate": 9.946083627378377e-07, + "loss": 1.7941536903381348, + "step": 508 + }, + { + "epoch": 0.18565708045140153, + "grad_norm": 11.6875, + "learning_rate": 9.945578524218973e-07, + "loss": 1.9550271034240723, + "step": 510 + }, + { + "epoch": 0.18638514743356388, + "grad_norm": 31.625, + "learning_rate": 9.945071082310379e-07, + "loss": 1.5446611642837524, + "step": 512 + }, + { + "epoch": 0.18711321441572626, + "grad_norm": 17.0, + "learning_rate": 9.944561301953377e-07, + "loss": 1.372767686843872, + "step": 514 + }, + { + "epoch": 0.1878412813978886, + "grad_norm": 15.9375, + "learning_rate": 9.944049183450147e-07, + "loss": 1.6656838655471802, + "step": 516 + }, + { + "epoch": 0.18856934838005096, + "grad_norm": 15.25, + "learning_rate": 9.94353472710424e-07, + "loss": 1.5876491069793701, + "step": 518 + }, + { + "epoch": 0.1892974153622133, + "grad_norm": 13.9375, + "learning_rate": 9.943017933220606e-07, + "loss": 1.6090747117996216, + "step": 520 + }, + { + "epoch": 0.1900254823443757, + "grad_norm": 9.6875, + "learning_rate": 9.942498802105574e-07, + "loss": 1.4101369380950928, + "step": 522 + }, + { + "epoch": 0.19075354932653804, + "grad_norm": 20.75, + "learning_rate": 9.941977334066862e-07, + "loss": 1.3943030834197998, + "step": 524 + }, + { + "epoch": 0.1914816163087004, + "grad_norm": 11.125, + "learning_rate": 9.941453529413568e-07, + "loss": 1.5068813562393188, + "step": 526 + }, + { + "epoch": 0.19220968329086277, + "grad_norm": 13.6875, + "learning_rate": 9.94092738845618e-07, + "loss": 1.588093876838684, + "step": 528 + }, + { + "epoch": 0.19293775027302512, + "grad_norm": 17.5, + "learning_rate": 9.94039891150657e-07, + "loss": 1.820513367652893, + "step": 530 + }, + { + "epoch": 0.19366581725518747, + "grad_norm": 4.78125, + "learning_rate": 9.93986809887799e-07, + "loss": 1.0934842824935913, + "step": 532 + }, + { + "epoch": 0.19439388423734982, + "grad_norm": 10.625, + "learning_rate": 9.939334950885086e-07, + "loss": 1.4000884294509888, + "step": 534 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 30.25, + "learning_rate": 9.938799467843877e-07, + "loss": 1.3196907043457031, + "step": 536 + }, + { + "epoch": 0.19585001820167455, + "grad_norm": 70.0, + "learning_rate": 9.938261650071774e-07, + "loss": 1.2089288234710693, + "step": 538 + }, + { + "epoch": 0.1965780851838369, + "grad_norm": 15.6875, + "learning_rate": 9.93772149788757e-07, + "loss": 1.6223084926605225, + "step": 540 + }, + { + "epoch": 0.19730615216599928, + "grad_norm": 10.25, + "learning_rate": 9.93717901161144e-07, + "loss": 1.4608285427093506, + "step": 542 + }, + { + "epoch": 0.19803421914816163, + "grad_norm": 9.375, + "learning_rate": 9.936634191564943e-07, + "loss": 1.3243443965911865, + "step": 544 + }, + { + "epoch": 0.19876228613032398, + "grad_norm": 7.5, + "learning_rate": 9.936087038071026e-07, + "loss": 1.0767841339111328, + "step": 546 + }, + { + "epoch": 0.19949035311248633, + "grad_norm": 41.25, + "learning_rate": 9.935537551454012e-07, + "loss": 1.6305677890777588, + "step": 548 + }, + { + "epoch": 0.2002184200946487, + "grad_norm": 6.4375, + "learning_rate": 9.934985732039612e-07, + "loss": 1.1933289766311646, + "step": 550 + }, + { + "epoch": 0.20094648707681106, + "grad_norm": 17.125, + "learning_rate": 9.934431580154913e-07, + "loss": 1.6773250102996826, + "step": 552 + }, + { + "epoch": 0.20167455405897342, + "grad_norm": 11.3125, + "learning_rate": 9.9338750961284e-07, + "loss": 1.7222200632095337, + "step": 554 + }, + { + "epoch": 0.2024026210411358, + "grad_norm": 10.9375, + "learning_rate": 9.933316280289917e-07, + "loss": 1.4817653894424438, + "step": 556 + }, + { + "epoch": 0.20313068802329814, + "grad_norm": 8.1875, + "learning_rate": 9.932755132970713e-07, + "loss": 1.2613826990127563, + "step": 558 + }, + { + "epoch": 0.2038587550054605, + "grad_norm": 13.5625, + "learning_rate": 9.932191654503406e-07, + "loss": 1.6333990097045898, + "step": 560 + }, + { + "epoch": 0.20458682198762287, + "grad_norm": 11.125, + "learning_rate": 9.931625845221997e-07, + "loss": 1.7207322120666504, + "step": 562 + }, + { + "epoch": 0.20531488896978523, + "grad_norm": 10.9375, + "learning_rate": 9.931057705461874e-07, + "loss": 1.1559470891952515, + "step": 564 + }, + { + "epoch": 0.20604295595194758, + "grad_norm": 15.4375, + "learning_rate": 9.9304872355598e-07, + "loss": 1.0401029586791992, + "step": 566 + }, + { + "epoch": 0.20677102293410993, + "grad_norm": 10.4375, + "learning_rate": 9.929914435853926e-07, + "loss": 1.3629839420318604, + "step": 568 + }, + { + "epoch": 0.2074990899162723, + "grad_norm": 13.25, + "learning_rate": 9.929339306683775e-07, + "loss": 1.5948340892791748, + "step": 570 + }, + { + "epoch": 0.20822715689843466, + "grad_norm": 11.875, + "learning_rate": 9.92876184839026e-07, + "loss": 1.457888126373291, + "step": 572 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 8.0625, + "learning_rate": 9.928182061315667e-07, + "loss": 1.3774800300598145, + "step": 574 + }, + { + "epoch": 0.20968329086275939, + "grad_norm": 11.875, + "learning_rate": 9.927599945803666e-07, + "loss": 0.9555914998054504, + "step": 576 + }, + { + "epoch": 0.21041135784492174, + "grad_norm": 9.3125, + "learning_rate": 9.927015502199314e-07, + "loss": 1.056607723236084, + "step": 578 + }, + { + "epoch": 0.2111394248270841, + "grad_norm": 7.78125, + "learning_rate": 9.926428730849032e-07, + "loss": 1.18709135055542, + "step": 580 + }, + { + "epoch": 0.21186749180924644, + "grad_norm": 88.5, + "learning_rate": 9.925839632100637e-07, + "loss": 1.5369737148284912, + "step": 582 + }, + { + "epoch": 0.21259555879140882, + "grad_norm": 5.375, + "learning_rate": 9.92524820630331e-07, + "loss": 0.8187214136123657, + "step": 584 + }, + { + "epoch": 0.21332362577357117, + "grad_norm": 17.5, + "learning_rate": 9.92465445380763e-07, + "loss": 1.6230435371398926, + "step": 586 + }, + { + "epoch": 0.21405169275573352, + "grad_norm": 9.375, + "learning_rate": 9.92405837496554e-07, + "loss": 1.498671293258667, + "step": 588 + }, + { + "epoch": 0.2147797597378959, + "grad_norm": 10.125, + "learning_rate": 9.923459970130364e-07, + "loss": 1.1864230632781982, + "step": 590 + }, + { + "epoch": 0.21550782672005825, + "grad_norm": 31.5, + "learning_rate": 9.922859239656812e-07, + "loss": 1.8998475074768066, + "step": 592 + }, + { + "epoch": 0.2162358937022206, + "grad_norm": 28.75, + "learning_rate": 9.922256183900963e-07, + "loss": 1.526608943939209, + "step": 594 + }, + { + "epoch": 0.21696396068438295, + "grad_norm": 14.625, + "learning_rate": 9.921650803220288e-07, + "loss": 1.5253762006759644, + "step": 596 + }, + { + "epoch": 0.21769202766654533, + "grad_norm": 39.75, + "learning_rate": 9.921043097973617e-07, + "loss": 1.3672726154327393, + "step": 598 + }, + { + "epoch": 0.21842009464870768, + "grad_norm": 7.21875, + "learning_rate": 9.920433068521179e-07, + "loss": 1.3152724504470825, + "step": 600 + }, + { + "epoch": 0.21914816163087003, + "grad_norm": 23.875, + "learning_rate": 9.919820715224563e-07, + "loss": 0.7481328845024109, + "step": 602 + }, + { + "epoch": 0.2198762286130324, + "grad_norm": 59.25, + "learning_rate": 9.919206038446745e-07, + "loss": 1.1317870616912842, + "step": 604 + }, + { + "epoch": 0.22060429559519476, + "grad_norm": 12.375, + "learning_rate": 9.918589038552073e-07, + "loss": 1.5622992515563965, + "step": 606 + }, + { + "epoch": 0.2213323625773571, + "grad_norm": 16.375, + "learning_rate": 9.917969715906281e-07, + "loss": 1.4338456392288208, + "step": 608 + }, + { + "epoch": 0.22206042955951946, + "grad_norm": 5.84375, + "learning_rate": 9.917348070876469e-07, + "loss": 1.155889868736267, + "step": 610 + }, + { + "epoch": 0.22278849654168184, + "grad_norm": 64.0, + "learning_rate": 9.916724103831118e-07, + "loss": 1.0399384498596191, + "step": 612 + }, + { + "epoch": 0.2235165635238442, + "grad_norm": 12.1875, + "learning_rate": 9.916097815140088e-07, + "loss": 1.39493989944458, + "step": 614 + }, + { + "epoch": 0.22424463050600654, + "grad_norm": 13.8125, + "learning_rate": 9.91546920517461e-07, + "loss": 1.457120418548584, + "step": 616 + }, + { + "epoch": 0.22497269748816892, + "grad_norm": 8.9375, + "learning_rate": 9.914838274307296e-07, + "loss": 1.2869017124176025, + "step": 618 + }, + { + "epoch": 0.22570076447033127, + "grad_norm": 14.375, + "learning_rate": 9.914205022912133e-07, + "loss": 0.9633007645606995, + "step": 620 + }, + { + "epoch": 0.22642883145249362, + "grad_norm": 6.125, + "learning_rate": 9.913569451364478e-07, + "loss": 1.4225205183029175, + "step": 622 + }, + { + "epoch": 0.227156898434656, + "grad_norm": 16.375, + "learning_rate": 9.91293156004107e-07, + "loss": 1.2953007221221924, + "step": 624 + }, + { + "epoch": 0.22788496541681835, + "grad_norm": 19.5, + "learning_rate": 9.91229134932002e-07, + "loss": 0.9614678621292114, + "step": 626 + }, + { + "epoch": 0.2286130323989807, + "grad_norm": 12.0625, + "learning_rate": 9.911648819580814e-07, + "loss": 1.253426432609558, + "step": 628 + }, + { + "epoch": 0.22934109938114305, + "grad_norm": 8.6875, + "learning_rate": 9.911003971204311e-07, + "loss": 1.585209608078003, + "step": 630 + }, + { + "epoch": 0.23006916636330543, + "grad_norm": 12.25, + "learning_rate": 9.910356804572745e-07, + "loss": 1.403122901916504, + "step": 632 + }, + { + "epoch": 0.23079723334546778, + "grad_norm": 10.4375, + "learning_rate": 9.90970732006973e-07, + "loss": 1.5030596256256104, + "step": 634 + }, + { + "epoch": 0.23152530032763013, + "grad_norm": 29.5, + "learning_rate": 9.90905551808025e-07, + "loss": 1.3137760162353516, + "step": 636 + }, + { + "epoch": 0.2322533673097925, + "grad_norm": 11.375, + "learning_rate": 9.908401398990655e-07, + "loss": 1.0689858198165894, + "step": 638 + }, + { + "epoch": 0.23298143429195486, + "grad_norm": 24.0, + "learning_rate": 9.90774496318868e-07, + "loss": 1.3031456470489502, + "step": 640 + }, + { + "epoch": 0.23370950127411722, + "grad_norm": 9.9375, + "learning_rate": 9.907086211063428e-07, + "loss": 1.5108141899108887, + "step": 642 + }, + { + "epoch": 0.23443756825627957, + "grad_norm": 32.5, + "learning_rate": 9.906425143005376e-07, + "loss": 1.3012267351150513, + "step": 644 + }, + { + "epoch": 0.23516563523844194, + "grad_norm": 14.3125, + "learning_rate": 9.905761759406372e-07, + "loss": 0.9942807555198669, + "step": 646 + }, + { + "epoch": 0.2358937022206043, + "grad_norm": 22.0, + "learning_rate": 9.905096060659637e-07, + "loss": 1.1023329496383667, + "step": 648 + }, + { + "epoch": 0.23662176920276665, + "grad_norm": 13.875, + "learning_rate": 9.90442804715977e-07, + "loss": 1.1440300941467285, + "step": 650 + }, + { + "epoch": 0.23734983618492903, + "grad_norm": 6.3125, + "learning_rate": 9.90375771930273e-07, + "loss": 1.4478724002838135, + "step": 652 + }, + { + "epoch": 0.23807790316709138, + "grad_norm": 38.5, + "learning_rate": 9.903085077485862e-07, + "loss": 1.7724552154541016, + "step": 654 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 12.5, + "learning_rate": 9.90241012210787e-07, + "loss": 1.0664501190185547, + "step": 656 + }, + { + "epoch": 0.23953403713141608, + "grad_norm": 10.625, + "learning_rate": 9.90173285356884e-07, + "loss": 1.527841567993164, + "step": 658 + }, + { + "epoch": 0.24026210411357846, + "grad_norm": 7.75, + "learning_rate": 9.901053272270224e-07, + "loss": 1.154727816581726, + "step": 660 + }, + { + "epoch": 0.2409901710957408, + "grad_norm": 12.3125, + "learning_rate": 9.900371378614837e-07, + "loss": 1.8276673555374146, + "step": 662 + }, + { + "epoch": 0.24171823807790316, + "grad_norm": 11.1875, + "learning_rate": 9.899687173006886e-07, + "loss": 1.472938060760498, + "step": 664 + }, + { + "epoch": 0.24244630506006554, + "grad_norm": 9.6875, + "learning_rate": 9.899000655851926e-07, + "loss": 1.7192471027374268, + "step": 666 + }, + { + "epoch": 0.2431743720422279, + "grad_norm": 15.9375, + "learning_rate": 9.898311827556892e-07, + "loss": 1.557612657546997, + "step": 668 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 26.25, + "learning_rate": 9.89762068853009e-07, + "loss": 1.5527622699737549, + "step": 670 + }, + { + "epoch": 0.2446305060065526, + "grad_norm": 7.6875, + "learning_rate": 9.896927239181193e-07, + "loss": 1.4383560419082642, + "step": 672 + }, + { + "epoch": 0.24535857298871497, + "grad_norm": 24.75, + "learning_rate": 9.896231479921246e-07, + "loss": 1.7028650045394897, + "step": 674 + }, + { + "epoch": 0.24608663997087732, + "grad_norm": 21.375, + "learning_rate": 9.895533411162661e-07, + "loss": 1.6367833614349365, + "step": 676 + }, + { + "epoch": 0.24681470695303967, + "grad_norm": 9.625, + "learning_rate": 9.89483303331922e-07, + "loss": 1.419440746307373, + "step": 678 + }, + { + "epoch": 0.24754277393520205, + "grad_norm": 14.0, + "learning_rate": 9.894130346806073e-07, + "loss": 1.520108699798584, + "step": 680 + }, + { + "epoch": 0.2482708409173644, + "grad_norm": 10.625, + "learning_rate": 9.893425352039736e-07, + "loss": 1.0187759399414062, + "step": 682 + }, + { + "epoch": 0.24899890789952675, + "grad_norm": 8.3125, + "learning_rate": 9.892718049438103e-07, + "loss": 1.6056427955627441, + "step": 684 + }, + { + "epoch": 0.24972697488168913, + "grad_norm": 17.375, + "learning_rate": 9.89200843942042e-07, + "loss": 1.4781806468963623, + "step": 686 + }, + { + "epoch": 0.2504550418638515, + "grad_norm": 14.4375, + "learning_rate": 9.891296522407316e-07, + "loss": 1.424162745475769, + "step": 688 + }, + { + "epoch": 0.25118310884601386, + "grad_norm": 15.0, + "learning_rate": 9.89058229882078e-07, + "loss": 1.5313165187835693, + "step": 690 + }, + { + "epoch": 0.2519111758281762, + "grad_norm": 21.375, + "learning_rate": 9.889865769084168e-07, + "loss": 1.5670483112335205, + "step": 692 + }, + { + "epoch": 0.25263924281033856, + "grad_norm": 11.4375, + "learning_rate": 9.889146933622203e-07, + "loss": 1.5647892951965332, + "step": 694 + }, + { + "epoch": 0.2533673097925009, + "grad_norm": 14.5625, + "learning_rate": 9.88842579286098e-07, + "loss": 1.621058702468872, + "step": 696 + }, + { + "epoch": 0.25409537677466326, + "grad_norm": 11.0625, + "learning_rate": 9.887702347227954e-07, + "loss": 1.9542927742004395, + "step": 698 + }, + { + "epoch": 0.25482344375682564, + "grad_norm": 12.6875, + "learning_rate": 9.886976597151953e-07, + "loss": 1.5523183345794678, + "step": 700 + }, + { + "epoch": 0.25555151073898796, + "grad_norm": 45.0, + "learning_rate": 9.886248543063158e-07, + "loss": 1.635611653327942, + "step": 702 + }, + { + "epoch": 0.25627957772115034, + "grad_norm": 12.5, + "learning_rate": 9.885518185393134e-07, + "loss": 1.3867954015731812, + "step": 704 + }, + { + "epoch": 0.2570076447033127, + "grad_norm": 12.3125, + "learning_rate": 9.884785524574794e-07, + "loss": 1.4117157459259033, + "step": 706 + }, + { + "epoch": 0.25773571168547504, + "grad_norm": 12.4375, + "learning_rate": 9.88405056104243e-07, + "loss": 1.380401372909546, + "step": 708 + }, + { + "epoch": 0.2584637786676374, + "grad_norm": 14.8125, + "learning_rate": 9.88331329523169e-07, + "loss": 1.2659910917282104, + "step": 710 + }, + { + "epoch": 0.2591918456497998, + "grad_norm": 14.75, + "learning_rate": 9.88257372757959e-07, + "loss": 1.8586393594741821, + "step": 712 + }, + { + "epoch": 0.2599199126319621, + "grad_norm": 8.6875, + "learning_rate": 9.881831858524512e-07, + "loss": 1.5523242950439453, + "step": 714 + }, + { + "epoch": 0.2606479796141245, + "grad_norm": 7.34375, + "learning_rate": 9.881087688506199e-07, + "loss": 1.2005550861358643, + "step": 716 + }, + { + "epoch": 0.2613760465962869, + "grad_norm": 14.125, + "learning_rate": 9.880341217965755e-07, + "loss": 1.2614223957061768, + "step": 718 + }, + { + "epoch": 0.2621041135784492, + "grad_norm": 13.3125, + "learning_rate": 9.87959244734566e-07, + "loss": 1.357144832611084, + "step": 720 + }, + { + "epoch": 0.2628321805606116, + "grad_norm": 8.6875, + "learning_rate": 9.878841377089745e-07, + "loss": 1.3924534320831299, + "step": 722 + }, + { + "epoch": 0.26356024754277396, + "grad_norm": 10.5625, + "learning_rate": 9.878088007643206e-07, + "loss": 1.4029022455215454, + "step": 724 + }, + { + "epoch": 0.2642883145249363, + "grad_norm": 17.375, + "learning_rate": 9.877332339452609e-07, + "loss": 1.4985699653625488, + "step": 726 + }, + { + "epoch": 0.26501638150709866, + "grad_norm": 21.0, + "learning_rate": 9.876574372965873e-07, + "loss": 1.1987252235412598, + "step": 728 + }, + { + "epoch": 0.265744448489261, + "grad_norm": 23.25, + "learning_rate": 9.875814108632288e-07, + "loss": 1.6480119228363037, + "step": 730 + }, + { + "epoch": 0.26647251547142337, + "grad_norm": 9.8125, + "learning_rate": 9.875051546902502e-07, + "loss": 1.545173168182373, + "step": 732 + }, + { + "epoch": 0.26720058245358574, + "grad_norm": 37.75, + "learning_rate": 9.874286688228521e-07, + "loss": 1.6362556219100952, + "step": 734 + }, + { + "epoch": 0.26792864943574807, + "grad_norm": 16.75, + "learning_rate": 9.87351953306372e-07, + "loss": 1.3623369932174683, + "step": 736 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 9.5, + "learning_rate": 9.87275008186283e-07, + "loss": 1.5691869258880615, + "step": 738 + }, + { + "epoch": 0.2693847834000728, + "grad_norm": 13.5, + "learning_rate": 9.871978335081945e-07, + "loss": 1.5286710262298584, + "step": 740 + }, + { + "epoch": 0.27011285038223515, + "grad_norm": 14.1875, + "learning_rate": 9.87120429317852e-07, + "loss": 1.4223175048828125, + "step": 742 + }, + { + "epoch": 0.2708409173643975, + "grad_norm": 58.75, + "learning_rate": 9.870427956611371e-07, + "loss": 1.6060209274291992, + "step": 744 + }, + { + "epoch": 0.2715689843465599, + "grad_norm": 20.875, + "learning_rate": 9.86964932584067e-07, + "loss": 1.8082635402679443, + "step": 746 + }, + { + "epoch": 0.27229705132872223, + "grad_norm": 14.9375, + "learning_rate": 9.868868401327952e-07, + "loss": 1.4018604755401611, + "step": 748 + }, + { + "epoch": 0.2730251183108846, + "grad_norm": 26.5, + "learning_rate": 9.868085183536117e-07, + "loss": 1.286295771598816, + "step": 750 + }, + { + "epoch": 0.273753185293047, + "grad_norm": 13.125, + "learning_rate": 9.86729967292941e-07, + "loss": 1.1643632650375366, + "step": 752 + }, + { + "epoch": 0.2744812522752093, + "grad_norm": 24.75, + "learning_rate": 9.866511869973453e-07, + "loss": 1.7096303701400757, + "step": 754 + }, + { + "epoch": 0.2752093192573717, + "grad_norm": 13.25, + "learning_rate": 9.86572177513521e-07, + "loss": 1.6228950023651123, + "step": 756 + }, + { + "epoch": 0.275937386239534, + "grad_norm": 12.5, + "learning_rate": 9.864929388883015e-07, + "loss": 1.7837457656860352, + "step": 758 + }, + { + "epoch": 0.2766654532216964, + "grad_norm": 9.625, + "learning_rate": 9.864134711686555e-07, + "loss": 1.4664897918701172, + "step": 760 + }, + { + "epoch": 0.27739352020385877, + "grad_norm": 9.6875, + "learning_rate": 9.86333774401688e-07, + "loss": 1.1846156120300293, + "step": 762 + }, + { + "epoch": 0.2781215871860211, + "grad_norm": 17.625, + "learning_rate": 9.862538486346393e-07, + "loss": 1.5938024520874023, + "step": 764 + }, + { + "epoch": 0.27884965416818347, + "grad_norm": 27.125, + "learning_rate": 9.861736939148849e-07, + "loss": 1.8376797437667847, + "step": 766 + }, + { + "epoch": 0.27957772115034585, + "grad_norm": 10.6875, + "learning_rate": 9.860933102899373e-07, + "loss": 1.3181192874908447, + "step": 768 + }, + { + "epoch": 0.28030578813250817, + "grad_norm": 11.5625, + "learning_rate": 9.86012697807444e-07, + "loss": 1.1592066287994385, + "step": 770 + }, + { + "epoch": 0.28103385511467055, + "grad_norm": 17.5, + "learning_rate": 9.859318565151882e-07, + "loss": 1.403892993927002, + "step": 772 + }, + { + "epoch": 0.28176192209683293, + "grad_norm": 13.9375, + "learning_rate": 9.858507864610885e-07, + "loss": 1.3312666416168213, + "step": 774 + }, + { + "epoch": 0.28248998907899525, + "grad_norm": 20.375, + "learning_rate": 9.857694876931996e-07, + "loss": 1.3652257919311523, + "step": 776 + }, + { + "epoch": 0.28321805606115763, + "grad_norm": 9.6875, + "learning_rate": 9.856879602597112e-07, + "loss": 1.5103617906570435, + "step": 778 + }, + { + "epoch": 0.28394612304332, + "grad_norm": 9.375, + "learning_rate": 9.856062042089493e-07, + "loss": 1.3994077444076538, + "step": 780 + }, + { + "epoch": 0.28467419002548233, + "grad_norm": 5.65625, + "learning_rate": 9.855242195893744e-07, + "loss": 1.15517258644104, + "step": 782 + }, + { + "epoch": 0.2854022570076447, + "grad_norm": 14.5625, + "learning_rate": 9.854420064495833e-07, + "loss": 1.4841251373291016, + "step": 784 + }, + { + "epoch": 0.2861303239898071, + "grad_norm": 14.6875, + "learning_rate": 9.853595648383083e-07, + "loss": 1.4009931087493896, + "step": 786 + }, + { + "epoch": 0.2868583909719694, + "grad_norm": 7.9375, + "learning_rate": 9.852768948044163e-07, + "loss": 1.3002843856811523, + "step": 788 + }, + { + "epoch": 0.2875864579541318, + "grad_norm": 12.375, + "learning_rate": 9.851939963969103e-07, + "loss": 1.441417932510376, + "step": 790 + }, + { + "epoch": 0.2883145249362941, + "grad_norm": 13.3125, + "learning_rate": 9.851108696649285e-07, + "loss": 1.4893580675125122, + "step": 792 + }, + { + "epoch": 0.2890425919184565, + "grad_norm": 4.84375, + "learning_rate": 9.850275146577448e-07, + "loss": 1.075164794921875, + "step": 794 + }, + { + "epoch": 0.28977065890061887, + "grad_norm": 10.75, + "learning_rate": 9.849439314247674e-07, + "loss": 1.2105753421783447, + "step": 796 + }, + { + "epoch": 0.2904987258827812, + "grad_norm": 8.3125, + "learning_rate": 9.848601200155406e-07, + "loss": 1.4600268602371216, + "step": 798 + }, + { + "epoch": 0.2912267928649436, + "grad_norm": 12.125, + "learning_rate": 9.847760804797443e-07, + "loss": 1.4392478466033936, + "step": 800 + }, + { + "epoch": 0.29195485984710595, + "grad_norm": 23.75, + "learning_rate": 9.846918128671924e-07, + "loss": 0.9092085957527161, + "step": 802 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 17.75, + "learning_rate": 9.846073172278352e-07, + "loss": 1.4556753635406494, + "step": 804 + }, + { + "epoch": 0.29341099381143065, + "grad_norm": 13.5, + "learning_rate": 9.845225936117572e-07, + "loss": 1.4581878185272217, + "step": 806 + }, + { + "epoch": 0.29413906079359303, + "grad_norm": 9.875, + "learning_rate": 9.844376420691787e-07, + "loss": 1.4578557014465332, + "step": 808 + }, + { + "epoch": 0.29486712777575536, + "grad_norm": 9.3125, + "learning_rate": 9.84352462650455e-07, + "loss": 1.2197456359863281, + "step": 810 + }, + { + "epoch": 0.29559519475791773, + "grad_norm": 12.875, + "learning_rate": 9.842670554060764e-07, + "loss": 1.3540329933166504, + "step": 812 + }, + { + "epoch": 0.2963232617400801, + "grad_norm": 10.8125, + "learning_rate": 9.841814203866678e-07, + "loss": 1.4303345680236816, + "step": 814 + }, + { + "epoch": 0.29705132872224244, + "grad_norm": 9.5, + "learning_rate": 9.8409555764299e-07, + "loss": 1.449954628944397, + "step": 816 + }, + { + "epoch": 0.2977793957044048, + "grad_norm": 14.5, + "learning_rate": 9.84009467225938e-07, + "loss": 0.8323717713356018, + "step": 818 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 12.6875, + "learning_rate": 9.839231491865423e-07, + "loss": 0.9165869951248169, + "step": 820 + }, + { + "epoch": 0.2992355296687295, + "grad_norm": 17.375, + "learning_rate": 9.838366035759677e-07, + "loss": 1.2325783967971802, + "step": 822 + }, + { + "epoch": 0.2999635966508919, + "grad_norm": 6.6875, + "learning_rate": 9.837498304455147e-07, + "loss": 1.2098603248596191, + "step": 824 + }, + { + "epoch": 0.3006916636330542, + "grad_norm": 22.0, + "learning_rate": 9.83662829846618e-07, + "loss": 1.429964303970337, + "step": 826 + }, + { + "epoch": 0.3014197306152166, + "grad_norm": 116.5, + "learning_rate": 9.835756018308477e-07, + "loss": 1.2539451122283936, + "step": 828 + }, + { + "epoch": 0.302147797597379, + "grad_norm": 20.25, + "learning_rate": 9.834881464499077e-07, + "loss": 1.0150898694992065, + "step": 830 + }, + { + "epoch": 0.3028758645795413, + "grad_norm": 20.0, + "learning_rate": 9.83400463755638e-07, + "loss": 1.645345687866211, + "step": 832 + }, + { + "epoch": 0.3036039315617037, + "grad_norm": 12.6875, + "learning_rate": 9.833125538000121e-07, + "loss": 1.5085272789001465, + "step": 834 + }, + { + "epoch": 0.30433199854386606, + "grad_norm": 12.5625, + "learning_rate": 9.832244166351394e-07, + "loss": 1.1630547046661377, + "step": 836 + }, + { + "epoch": 0.3050600655260284, + "grad_norm": 22.0, + "learning_rate": 9.83136052313263e-07, + "loss": 0.8843675255775452, + "step": 838 + }, + { + "epoch": 0.30578813250819076, + "grad_norm": 10.75, + "learning_rate": 9.83047460886761e-07, + "loss": 1.331636905670166, + "step": 840 + }, + { + "epoch": 0.30651619949035314, + "grad_norm": 5.5625, + "learning_rate": 9.829586424081467e-07, + "loss": 1.290670394897461, + "step": 842 + }, + { + "epoch": 0.30724426647251546, + "grad_norm": 9.0625, + "learning_rate": 9.828695969300667e-07, + "loss": 1.3856630325317383, + "step": 844 + }, + { + "epoch": 0.30797233345467784, + "grad_norm": 12.625, + "learning_rate": 9.82780324505303e-07, + "loss": 1.2308765649795532, + "step": 846 + }, + { + "epoch": 0.3087004004368402, + "grad_norm": 20.5, + "learning_rate": 9.826908251867722e-07, + "loss": 1.3808432817459106, + "step": 848 + }, + { + "epoch": 0.30942846741900254, + "grad_norm": 8.0, + "learning_rate": 9.826010990275254e-07, + "loss": 0.8696513772010803, + "step": 850 + }, + { + "epoch": 0.3101565344011649, + "grad_norm": 19.125, + "learning_rate": 9.825111460807473e-07, + "loss": 1.5128706693649292, + "step": 852 + }, + { + "epoch": 0.31088460138332724, + "grad_norm": 7.875, + "learning_rate": 9.824209663997584e-07, + "loss": 1.366605520248413, + "step": 854 + }, + { + "epoch": 0.3116126683654896, + "grad_norm": 22.375, + "learning_rate": 9.823305600380123e-07, + "loss": 1.1499278545379639, + "step": 856 + }, + { + "epoch": 0.312340735347652, + "grad_norm": 15.5625, + "learning_rate": 9.82239927049098e-07, + "loss": 1.327001690864563, + "step": 858 + }, + { + "epoch": 0.3130688023298143, + "grad_norm": 8.25, + "learning_rate": 9.82149067486738e-07, + "loss": 1.190638542175293, + "step": 860 + }, + { + "epoch": 0.3137968693119767, + "grad_norm": 44.0, + "learning_rate": 9.820579814047899e-07, + "loss": 1.7610671520233154, + "step": 862 + }, + { + "epoch": 0.3145249362941391, + "grad_norm": 13.25, + "learning_rate": 9.819666688572449e-07, + "loss": 1.4833555221557617, + "step": 864 + }, + { + "epoch": 0.3152530032763014, + "grad_norm": 15.75, + "learning_rate": 9.818751298982284e-07, + "loss": 1.3377705812454224, + "step": 866 + }, + { + "epoch": 0.3159810702584638, + "grad_norm": 8.375, + "learning_rate": 9.817833645820006e-07, + "loss": 1.26751708984375, + "step": 868 + }, + { + "epoch": 0.31670913724062616, + "grad_norm": 14.5625, + "learning_rate": 9.816913729629557e-07, + "loss": 1.4252474308013916, + "step": 870 + }, + { + "epoch": 0.3174372042227885, + "grad_norm": 23.75, + "learning_rate": 9.815991550956218e-07, + "loss": 1.7691978216171265, + "step": 872 + }, + { + "epoch": 0.31816527120495086, + "grad_norm": 7.375, + "learning_rate": 9.815067110346612e-07, + "loss": 1.575103759765625, + "step": 874 + }, + { + "epoch": 0.31889333818711324, + "grad_norm": 11.9375, + "learning_rate": 9.8141404083487e-07, + "loss": 1.3454792499542236, + "step": 876 + }, + { + "epoch": 0.31962140516927556, + "grad_norm": 17.75, + "learning_rate": 9.81321144551179e-07, + "loss": 1.4380028247833252, + "step": 878 + }, + { + "epoch": 0.32034947215143794, + "grad_norm": 8.375, + "learning_rate": 9.812280222386526e-07, + "loss": 1.2545723915100098, + "step": 880 + }, + { + "epoch": 0.32107753913360026, + "grad_norm": 20.5, + "learning_rate": 9.811346739524893e-07, + "loss": 1.3912880420684814, + "step": 882 + }, + { + "epoch": 0.32180560611576264, + "grad_norm": 5.40625, + "learning_rate": 9.810410997480213e-07, + "loss": 1.3181822299957275, + "step": 884 + }, + { + "epoch": 0.322533673097925, + "grad_norm": 14.25, + "learning_rate": 9.809472996807146e-07, + "loss": 1.3010824918746948, + "step": 886 + }, + { + "epoch": 0.32326174008008735, + "grad_norm": 10.1875, + "learning_rate": 9.808532738061702e-07, + "loss": 0.9234017729759216, + "step": 888 + }, + { + "epoch": 0.3239898070622497, + "grad_norm": 13.5, + "learning_rate": 9.807590221801213e-07, + "loss": 1.585287094116211, + "step": 890 + }, + { + "epoch": 0.3247178740444121, + "grad_norm": 9.5625, + "learning_rate": 9.806645448584362e-07, + "loss": 1.6475350856781006, + "step": 892 + }, + { + "epoch": 0.3254459410265744, + "grad_norm": 18.0, + "learning_rate": 9.805698418971163e-07, + "loss": 1.4935126304626465, + "step": 894 + }, + { + "epoch": 0.3261740080087368, + "grad_norm": 16.5, + "learning_rate": 9.804749133522972e-07, + "loss": 1.743594765663147, + "step": 896 + }, + { + "epoch": 0.3269020749908992, + "grad_norm": 11.75, + "learning_rate": 9.803797592802473e-07, + "loss": 1.041639804840088, + "step": 898 + }, + { + "epoch": 0.3276301419730615, + "grad_norm": 21.5, + "learning_rate": 9.802843797373705e-07, + "loss": 1.254799246788025, + "step": 900 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 14.1875, + "learning_rate": 9.801887747802021e-07, + "loss": 1.5237921476364136, + "step": 902 + }, + { + "epoch": 0.32908627593738626, + "grad_norm": 6.34375, + "learning_rate": 9.800929444654132e-07, + "loss": 1.211348533630371, + "step": 904 + }, + { + "epoch": 0.3298143429195486, + "grad_norm": 22.5, + "learning_rate": 9.799968888498062e-07, + "loss": 1.000844955444336, + "step": 906 + }, + { + "epoch": 0.33054240990171097, + "grad_norm": 14.75, + "learning_rate": 9.799006079903189e-07, + "loss": 1.6406400203704834, + "step": 908 + }, + { + "epoch": 0.33127047688387334, + "grad_norm": 40.0, + "learning_rate": 9.798041019440221e-07, + "loss": 1.715691089630127, + "step": 910 + }, + { + "epoch": 0.33199854386603567, + "grad_norm": 16.0, + "learning_rate": 9.797073707681197e-07, + "loss": 1.3822669982910156, + "step": 912 + }, + { + "epoch": 0.33272661084819805, + "grad_norm": 60.0, + "learning_rate": 9.796104145199493e-07, + "loss": 0.8454775810241699, + "step": 914 + }, + { + "epoch": 0.33345467783036037, + "grad_norm": 16.625, + "learning_rate": 9.79513233256982e-07, + "loss": 1.3302778005599976, + "step": 916 + }, + { + "epoch": 0.33418274481252275, + "grad_norm": 15.6875, + "learning_rate": 9.794158270368222e-07, + "loss": 1.1882603168487549, + "step": 918 + }, + { + "epoch": 0.3349108117946851, + "grad_norm": 3.53125, + "learning_rate": 9.793181959172072e-07, + "loss": 1.3208725452423096, + "step": 920 + }, + { + "epoch": 0.33563887877684745, + "grad_norm": 18.5, + "learning_rate": 9.792203399560087e-07, + "loss": 1.5005824565887451, + "step": 922 + }, + { + "epoch": 0.3363669457590098, + "grad_norm": 16.875, + "learning_rate": 9.791222592112306e-07, + "loss": 1.4183735847473145, + "step": 924 + }, + { + "epoch": 0.3370950127411722, + "grad_norm": 17.875, + "learning_rate": 9.790239537410105e-07, + "loss": 1.5375800132751465, + "step": 926 + }, + { + "epoch": 0.33782307972333453, + "grad_norm": 13.4375, + "learning_rate": 9.78925423603619e-07, + "loss": 1.6783370971679688, + "step": 928 + }, + { + "epoch": 0.3385511467054969, + "grad_norm": 13.875, + "learning_rate": 9.788266688574604e-07, + "loss": 1.009158730506897, + "step": 930 + }, + { + "epoch": 0.3392792136876593, + "grad_norm": 19.25, + "learning_rate": 9.787276895610717e-07, + "loss": 1.6327277421951294, + "step": 932 + }, + { + "epoch": 0.3400072806698216, + "grad_norm": 27.625, + "learning_rate": 9.78628485773123e-07, + "loss": 1.174685001373291, + "step": 934 + }, + { + "epoch": 0.340735347651984, + "grad_norm": 8.3125, + "learning_rate": 9.78529057552417e-07, + "loss": 1.3129394054412842, + "step": 936 + }, + { + "epoch": 0.34146341463414637, + "grad_norm": 17.875, + "learning_rate": 9.784294049578907e-07, + "loss": 1.9013786315917969, + "step": 938 + }, + { + "epoch": 0.3421914816163087, + "grad_norm": 40.25, + "learning_rate": 9.783295280486133e-07, + "loss": 1.3297946453094482, + "step": 940 + }, + { + "epoch": 0.34291954859847107, + "grad_norm": 29.375, + "learning_rate": 9.782294268837866e-07, + "loss": 1.6928989887237549, + "step": 942 + }, + { + "epoch": 0.3436476155806334, + "grad_norm": 12.125, + "learning_rate": 9.78129101522746e-07, + "loss": 1.2127022743225098, + "step": 944 + }, + { + "epoch": 0.34437568256279577, + "grad_norm": 15.6875, + "learning_rate": 9.780285520249595e-07, + "loss": 1.5473664999008179, + "step": 946 + }, + { + "epoch": 0.34510374954495815, + "grad_norm": 9.8125, + "learning_rate": 9.779277784500281e-07, + "loss": 1.5960968732833862, + "step": 948 + }, + { + "epoch": 0.3458318165271205, + "grad_norm": 9.875, + "learning_rate": 9.778267808576856e-07, + "loss": 0.9435205459594727, + "step": 950 + }, + { + "epoch": 0.34655988350928285, + "grad_norm": 9.375, + "learning_rate": 9.777255593077985e-07, + "loss": 1.1264784336090088, + "step": 952 + }, + { + "epoch": 0.34728795049144523, + "grad_norm": 15.5625, + "learning_rate": 9.776241138603655e-07, + "loss": 1.439509391784668, + "step": 954 + }, + { + "epoch": 0.34801601747360755, + "grad_norm": 9.875, + "learning_rate": 9.775224445755196e-07, + "loss": 1.3497865200042725, + "step": 956 + }, + { + "epoch": 0.34874408445576993, + "grad_norm": 13.5, + "learning_rate": 9.774205515135246e-07, + "loss": 1.0644817352294922, + "step": 958 + }, + { + "epoch": 0.3494721514379323, + "grad_norm": 11.125, + "learning_rate": 9.773184347347781e-07, + "loss": 1.2022318840026855, + "step": 960 + }, + { + "epoch": 0.35020021842009463, + "grad_norm": 7.46875, + "learning_rate": 9.772160942998097e-07, + "loss": 1.1206250190734863, + "step": 962 + }, + { + "epoch": 0.350928285402257, + "grad_norm": 12.1875, + "learning_rate": 9.771135302692829e-07, + "loss": 1.4630513191223145, + "step": 964 + }, + { + "epoch": 0.3516563523844194, + "grad_norm": 61.0, + "learning_rate": 9.770107427039915e-07, + "loss": 1.31239652633667, + "step": 966 + }, + { + "epoch": 0.3523844193665817, + "grad_norm": 14.75, + "learning_rate": 9.769077316648638e-07, + "loss": 1.6549444198608398, + "step": 968 + }, + { + "epoch": 0.3531124863487441, + "grad_norm": 11.3125, + "learning_rate": 9.768044972129593e-07, + "loss": 1.3320286273956299, + "step": 970 + }, + { + "epoch": 0.35384055333090647, + "grad_norm": 41.0, + "learning_rate": 9.767010394094711e-07, + "loss": 2.03749418258667, + "step": 972 + }, + { + "epoch": 0.3545686203130688, + "grad_norm": 61.25, + "learning_rate": 9.765973583157235e-07, + "loss": 1.5686509609222412, + "step": 974 + }, + { + "epoch": 0.3552966872952312, + "grad_norm": 20.5, + "learning_rate": 9.764934539931736e-07, + "loss": 1.7346678972244263, + "step": 976 + }, + { + "epoch": 0.3560247542773935, + "grad_norm": 15.3125, + "learning_rate": 9.763893265034109e-07, + "loss": 1.4326144456863403, + "step": 978 + }, + { + "epoch": 0.3567528212595559, + "grad_norm": 18.125, + "learning_rate": 9.762849759081575e-07, + "loss": 1.4782772064208984, + "step": 980 + }, + { + "epoch": 0.35748088824171825, + "grad_norm": 20.75, + "learning_rate": 9.761804022692672e-07, + "loss": 1.8400791883468628, + "step": 982 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 48.0, + "learning_rate": 9.760756056487262e-07, + "loss": 1.4965910911560059, + "step": 984 + }, + { + "epoch": 0.35893702220604295, + "grad_norm": 9.9375, + "learning_rate": 9.759705861086528e-07, + "loss": 0.8845205307006836, + "step": 986 + }, + { + "epoch": 0.35966508918820533, + "grad_norm": 14.1875, + "learning_rate": 9.75865343711298e-07, + "loss": 1.4706084728240967, + "step": 988 + }, + { + "epoch": 0.36039315617036766, + "grad_norm": 10.75, + "learning_rate": 9.757598785190438e-07, + "loss": 1.3372465372085571, + "step": 990 + }, + { + "epoch": 0.36112122315253004, + "grad_norm": 13.875, + "learning_rate": 9.756541905944052e-07, + "loss": 1.659303069114685, + "step": 992 + }, + { + "epoch": 0.3618492901346924, + "grad_norm": 35.5, + "learning_rate": 9.75548280000029e-07, + "loss": 1.3204107284545898, + "step": 994 + }, + { + "epoch": 0.36257735711685474, + "grad_norm": 17.125, + "learning_rate": 9.75442146798694e-07, + "loss": 1.1220965385437012, + "step": 996 + }, + { + "epoch": 0.3633054240990171, + "grad_norm": 19.125, + "learning_rate": 9.753357910533105e-07, + "loss": 1.6682801246643066, + "step": 998 + }, + { + "epoch": 0.3640334910811795, + "grad_norm": 9.625, + "learning_rate": 9.752292128269213e-07, + "loss": 1.3711200952529907, + "step": 1000 + }, + { + "epoch": 0.3647615580633418, + "grad_norm": 6.53125, + "learning_rate": 9.751224121827005e-07, + "loss": 1.1421229839324951, + "step": 1002 + }, + { + "epoch": 0.3654896250455042, + "grad_norm": 7.71875, + "learning_rate": 9.75015389183955e-07, + "loss": 1.1252611875534058, + "step": 1004 + }, + { + "epoch": 0.3662176920276665, + "grad_norm": 14.3125, + "learning_rate": 9.749081438941224e-07, + "loss": 1.3754892349243164, + "step": 1006 + }, + { + "epoch": 0.3669457590098289, + "grad_norm": 17.625, + "learning_rate": 9.74800676376773e-07, + "loss": 1.3377655744552612, + "step": 1008 + }, + { + "epoch": 0.3676738259919913, + "grad_norm": 14.8125, + "learning_rate": 9.746929866956077e-07, + "loss": 1.4268475770950317, + "step": 1010 + }, + { + "epoch": 0.3684018929741536, + "grad_norm": 9.25, + "learning_rate": 9.745850749144604e-07, + "loss": 1.1588679552078247, + "step": 1012 + }, + { + "epoch": 0.369129959956316, + "grad_norm": 9.125, + "learning_rate": 9.74476941097296e-07, + "loss": 1.3914282321929932, + "step": 1014 + }, + { + "epoch": 0.36985802693847836, + "grad_norm": 15.75, + "learning_rate": 9.743685853082105e-07, + "loss": 1.4204139709472656, + "step": 1016 + }, + { + "epoch": 0.3705860939206407, + "grad_norm": 8.9375, + "learning_rate": 9.742600076114322e-07, + "loss": 1.2996437549591064, + "step": 1018 + }, + { + "epoch": 0.37131416090280306, + "grad_norm": 10.75, + "learning_rate": 9.74151208071321e-07, + "loss": 1.3504772186279297, + "step": 1020 + }, + { + "epoch": 0.37204222788496544, + "grad_norm": 16.375, + "learning_rate": 9.740421867523677e-07, + "loss": 1.5931551456451416, + "step": 1022 + }, + { + "epoch": 0.37277029486712776, + "grad_norm": 23.375, + "learning_rate": 9.739329437191952e-07, + "loss": 1.4746217727661133, + "step": 1024 + }, + { + "epoch": 0.37349836184929014, + "grad_norm": 23.25, + "learning_rate": 9.738234790365573e-07, + "loss": 1.129415512084961, + "step": 1026 + }, + { + "epoch": 0.3742264288314525, + "grad_norm": 16.375, + "learning_rate": 9.737137927693396e-07, + "loss": 1.3902003765106201, + "step": 1028 + }, + { + "epoch": 0.37495449581361484, + "grad_norm": 12.625, + "learning_rate": 9.736038849825588e-07, + "loss": 1.668203353881836, + "step": 1030 + }, + { + "epoch": 0.3756825627957772, + "grad_norm": 19.125, + "learning_rate": 9.734937557413628e-07, + "loss": 1.8524250984191895, + "step": 1032 + }, + { + "epoch": 0.37641062977793954, + "grad_norm": 13.75, + "learning_rate": 9.733834051110307e-07, + "loss": 1.5700359344482422, + "step": 1034 + }, + { + "epoch": 0.3771386967601019, + "grad_norm": 6.28125, + "learning_rate": 9.732728331569737e-07, + "loss": 1.7512898445129395, + "step": 1036 + }, + { + "epoch": 0.3778667637422643, + "grad_norm": 7.15625, + "learning_rate": 9.731620399447328e-07, + "loss": 1.2604296207427979, + "step": 1038 + }, + { + "epoch": 0.3785948307244266, + "grad_norm": 3.328125, + "learning_rate": 9.730510255399818e-07, + "loss": 0.8426098823547363, + "step": 1040 + }, + { + "epoch": 0.379322897706589, + "grad_norm": 12.1875, + "learning_rate": 9.729397900085239e-07, + "loss": 1.2458206415176392, + "step": 1042 + }, + { + "epoch": 0.3800509646887514, + "grad_norm": 14.625, + "learning_rate": 9.728283334162942e-07, + "loss": 1.5105273723602295, + "step": 1044 + }, + { + "epoch": 0.3807790316709137, + "grad_norm": 13.0, + "learning_rate": 9.727166558293594e-07, + "loss": 1.0772390365600586, + "step": 1046 + }, + { + "epoch": 0.3815070986530761, + "grad_norm": 13.4375, + "learning_rate": 9.72604757313916e-07, + "loss": 1.7043192386627197, + "step": 1048 + }, + { + "epoch": 0.38223516563523846, + "grad_norm": 7.96875, + "learning_rate": 9.724926379362923e-07, + "loss": 1.4606322050094604, + "step": 1050 + }, + { + "epoch": 0.3829632326174008, + "grad_norm": 45.0, + "learning_rate": 9.723802977629476e-07, + "loss": 1.3900823593139648, + "step": 1052 + }, + { + "epoch": 0.38369129959956316, + "grad_norm": 13.875, + "learning_rate": 9.72267736860471e-07, + "loss": 1.2503985166549683, + "step": 1054 + }, + { + "epoch": 0.38441936658172554, + "grad_norm": 10.625, + "learning_rate": 9.72154955295584e-07, + "loss": 1.3292158842086792, + "step": 1056 + }, + { + "epoch": 0.38514743356388786, + "grad_norm": 22.375, + "learning_rate": 9.720419531351373e-07, + "loss": 1.391345739364624, + "step": 1058 + }, + { + "epoch": 0.38587550054605024, + "grad_norm": 24.375, + "learning_rate": 9.719287304461138e-07, + "loss": 1.708967685699463, + "step": 1060 + }, + { + "epoch": 0.3866035675282126, + "grad_norm": 17.25, + "learning_rate": 9.718152872956262e-07, + "loss": 1.7240874767303467, + "step": 1062 + }, + { + "epoch": 0.38733163451037494, + "grad_norm": 9.75, + "learning_rate": 9.717016237509184e-07, + "loss": 1.5330109596252441, + "step": 1064 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 13.3125, + "learning_rate": 9.715877398793642e-07, + "loss": 1.3935208320617676, + "step": 1066 + }, + { + "epoch": 0.38878776847469965, + "grad_norm": 14.6875, + "learning_rate": 9.714736357484685e-07, + "loss": 1.5584144592285156, + "step": 1068 + }, + { + "epoch": 0.389515835456862, + "grad_norm": 24.25, + "learning_rate": 9.713593114258674e-07, + "loss": 1.4579991102218628, + "step": 1070 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 12.8125, + "learning_rate": 9.71244766979326e-07, + "loss": 1.2849230766296387, + "step": 1072 + }, + { + "epoch": 0.3909719694211867, + "grad_norm": 14.4375, + "learning_rate": 9.711300024767416e-07, + "loss": 1.7298519611358643, + "step": 1074 + }, + { + "epoch": 0.3917000364033491, + "grad_norm": 33.0, + "learning_rate": 9.710150179861404e-07, + "loss": 1.3457467555999756, + "step": 1076 + }, + { + "epoch": 0.3924281033855115, + "grad_norm": 11.8125, + "learning_rate": 9.708998135756798e-07, + "loss": 1.4167814254760742, + "step": 1078 + }, + { + "epoch": 0.3931561703676738, + "grad_norm": 150.0, + "learning_rate": 9.707843893136477e-07, + "loss": 1.559863567352295, + "step": 1080 + }, + { + "epoch": 0.3938842373498362, + "grad_norm": 12.8125, + "learning_rate": 9.706687452684621e-07, + "loss": 1.459658145904541, + "step": 1082 + }, + { + "epoch": 0.39461230433199856, + "grad_norm": 25.875, + "learning_rate": 9.705528815086707e-07, + "loss": 1.4616550207138062, + "step": 1084 + }, + { + "epoch": 0.3953403713141609, + "grad_norm": 16.625, + "learning_rate": 9.704367981029525e-07, + "loss": 1.4848504066467285, + "step": 1086 + }, + { + "epoch": 0.39606843829632327, + "grad_norm": 10.3125, + "learning_rate": 9.70320495120116e-07, + "loss": 0.7380646467208862, + "step": 1088 + }, + { + "epoch": 0.39679650527848565, + "grad_norm": 14.25, + "learning_rate": 9.702039726291e-07, + "loss": 1.5676989555358887, + "step": 1090 + }, + { + "epoch": 0.39752457226064797, + "grad_norm": 36.25, + "learning_rate": 9.700872306989732e-07, + "loss": 1.7854106426239014, + "step": 1092 + }, + { + "epoch": 0.39825263924281035, + "grad_norm": 11.75, + "learning_rate": 9.699702693989348e-07, + "loss": 1.207574486732483, + "step": 1094 + }, + { + "epoch": 0.39898070622497267, + "grad_norm": 4.59375, + "learning_rate": 9.698530887983138e-07, + "loss": 1.1762537956237793, + "step": 1096 + }, + { + "epoch": 0.39970877320713505, + "grad_norm": 23.875, + "learning_rate": 9.697356889665695e-07, + "loss": 1.4341844320297241, + "step": 1098 + }, + { + "epoch": 0.4004368401892974, + "grad_norm": 10.75, + "learning_rate": 9.696180699732906e-07, + "loss": 1.1325860023498535, + "step": 1100 + }, + { + "epoch": 0.40116490717145975, + "grad_norm": 17.0, + "learning_rate": 9.695002318881959e-07, + "loss": 1.3704644441604614, + "step": 1102 + }, + { + "epoch": 0.40189297415362213, + "grad_norm": 5.25, + "learning_rate": 9.693821747811343e-07, + "loss": 1.208422303199768, + "step": 1104 + }, + { + "epoch": 0.4026210411357845, + "grad_norm": 20.625, + "learning_rate": 9.692638987220845e-07, + "loss": 1.4641880989074707, + "step": 1106 + }, + { + "epoch": 0.40334910811794683, + "grad_norm": 13.5625, + "learning_rate": 9.691454037811543e-07, + "loss": 1.7966339588165283, + "step": 1108 + }, + { + "epoch": 0.4040771751001092, + "grad_norm": 15.125, + "learning_rate": 9.690266900285827e-07, + "loss": 1.7560460567474365, + "step": 1110 + }, + { + "epoch": 0.4048052420822716, + "grad_norm": 7.5625, + "learning_rate": 9.689077575347366e-07, + "loss": 1.407801866531372, + "step": 1112 + }, + { + "epoch": 0.4055333090644339, + "grad_norm": 19.5, + "learning_rate": 9.687886063701143e-07, + "loss": 1.3088862895965576, + "step": 1114 + }, + { + "epoch": 0.4062613760465963, + "grad_norm": 15.625, + "learning_rate": 9.686692366053422e-07, + "loss": 1.303182601928711, + "step": 1116 + }, + { + "epoch": 0.40698944302875867, + "grad_norm": 14.5625, + "learning_rate": 9.685496483111773e-07, + "loss": 1.4070264101028442, + "step": 1118 + }, + { + "epoch": 0.407717510010921, + "grad_norm": 14.125, + "learning_rate": 9.684298415585054e-07, + "loss": 1.4971115589141846, + "step": 1120 + }, + { + "epoch": 0.40844557699308337, + "grad_norm": 14.8125, + "learning_rate": 9.683098164183427e-07, + "loss": 1.448760747909546, + "step": 1122 + }, + { + "epoch": 0.40917364397524575, + "grad_norm": 10.8125, + "learning_rate": 9.68189572961834e-07, + "loss": 1.6257266998291016, + "step": 1124 + }, + { + "epoch": 0.40990171095740807, + "grad_norm": 7.59375, + "learning_rate": 9.680691112602542e-07, + "loss": 1.2700321674346924, + "step": 1126 + }, + { + "epoch": 0.41062977793957045, + "grad_norm": 14.3125, + "learning_rate": 9.679484313850067e-07, + "loss": 1.6311366558074951, + "step": 1128 + }, + { + "epoch": 0.4113578449217328, + "grad_norm": 15.6875, + "learning_rate": 9.678275334076252e-07, + "loss": 1.5970368385314941, + "step": 1130 + }, + { + "epoch": 0.41208591190389515, + "grad_norm": 57.25, + "learning_rate": 9.677064173997718e-07, + "loss": 1.3370639085769653, + "step": 1132 + }, + { + "epoch": 0.41281397888605753, + "grad_norm": 13.75, + "learning_rate": 9.675850834332383e-07, + "loss": 1.5839629173278809, + "step": 1134 + }, + { + "epoch": 0.41354204586821985, + "grad_norm": 15.75, + "learning_rate": 9.674635315799459e-07, + "loss": 1.4133682250976562, + "step": 1136 + }, + { + "epoch": 0.41427011285038223, + "grad_norm": 93.5, + "learning_rate": 9.673417619119447e-07, + "loss": 1.677177906036377, + "step": 1138 + }, + { + "epoch": 0.4149981798325446, + "grad_norm": 12.75, + "learning_rate": 9.672197745014136e-07, + "loss": 1.293471097946167, + "step": 1140 + }, + { + "epoch": 0.41572624681470693, + "grad_norm": 11.9375, + "learning_rate": 9.670975694206611e-07, + "loss": 1.620890736579895, + "step": 1142 + }, + { + "epoch": 0.4164543137968693, + "grad_norm": 12.625, + "learning_rate": 9.669751467421246e-07, + "loss": 1.40601646900177, + "step": 1144 + }, + { + "epoch": 0.4171823807790317, + "grad_norm": 20.5, + "learning_rate": 9.6685250653837e-07, + "loss": 1.2822861671447754, + "step": 1146 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 2.71875, + "learning_rate": 9.667296488820927e-07, + "loss": 1.3684256076812744, + "step": 1148 + }, + { + "epoch": 0.4186385147433564, + "grad_norm": 6.4375, + "learning_rate": 9.666065738461167e-07, + "loss": 1.3050000667572021, + "step": 1150 + }, + { + "epoch": 0.41936658172551877, + "grad_norm": 30.625, + "learning_rate": 9.664832815033955e-07, + "loss": 1.6496574878692627, + "step": 1152 + }, + { + "epoch": 0.4200946487076811, + "grad_norm": 13.5, + "learning_rate": 9.6635977192701e-07, + "loss": 1.488876223564148, + "step": 1154 + }, + { + "epoch": 0.4208227156898435, + "grad_norm": 7.3125, + "learning_rate": 9.662360451901713e-07, + "loss": 1.111842155456543, + "step": 1156 + }, + { + "epoch": 0.4215507826720058, + "grad_norm": 15.1875, + "learning_rate": 9.661121013662188e-07, + "loss": 1.199295997619629, + "step": 1158 + }, + { + "epoch": 0.4222788496541682, + "grad_norm": 25.0, + "learning_rate": 9.659879405286202e-07, + "loss": 1.0735878944396973, + "step": 1160 + }, + { + "epoch": 0.42300691663633055, + "grad_norm": 4.90625, + "learning_rate": 9.658635627509719e-07, + "loss": 1.1244606971740723, + "step": 1162 + }, + { + "epoch": 0.4237349836184929, + "grad_norm": 22.75, + "learning_rate": 9.657389681069991e-07, + "loss": 1.80363929271698, + "step": 1164 + }, + { + "epoch": 0.42446305060065526, + "grad_norm": 7.09375, + "learning_rate": 9.656141566705558e-07, + "loss": 1.0574681758880615, + "step": 1166 + }, + { + "epoch": 0.42519111758281763, + "grad_norm": 10.75, + "learning_rate": 9.654891285156242e-07, + "loss": 1.649153470993042, + "step": 1168 + }, + { + "epoch": 0.42591918456497996, + "grad_norm": 15.3125, + "learning_rate": 9.653638837163147e-07, + "loss": 1.125114917755127, + "step": 1170 + }, + { + "epoch": 0.42664725154714234, + "grad_norm": 15.0625, + "learning_rate": 9.652384223468667e-07, + "loss": 0.6538023352622986, + "step": 1172 + }, + { + "epoch": 0.4273753185293047, + "grad_norm": 21.125, + "learning_rate": 9.651127444816472e-07, + "loss": 1.5532705783843994, + "step": 1174 + }, + { + "epoch": 0.42810338551146704, + "grad_norm": 24.75, + "learning_rate": 9.649868501951524e-07, + "loss": 1.7286773920059204, + "step": 1176 + }, + { + "epoch": 0.4288314524936294, + "grad_norm": 4.875, + "learning_rate": 9.648607395620062e-07, + "loss": 1.3070796728134155, + "step": 1178 + }, + { + "epoch": 0.4295595194757918, + "grad_norm": 10.5625, + "learning_rate": 9.647344126569612e-07, + "loss": 1.1811411380767822, + "step": 1180 + }, + { + "epoch": 0.4302875864579541, + "grad_norm": 5.1875, + "learning_rate": 9.646078695548975e-07, + "loss": 1.3946648836135864, + "step": 1182 + }, + { + "epoch": 0.4310156534401165, + "grad_norm": 10.0, + "learning_rate": 9.644811103308243e-07, + "loss": 1.4337646961212158, + "step": 1184 + }, + { + "epoch": 0.4317437204222789, + "grad_norm": 34.25, + "learning_rate": 9.643541350598778e-07, + "loss": 1.3881990909576416, + "step": 1186 + }, + { + "epoch": 0.4324717874044412, + "grad_norm": 14.75, + "learning_rate": 9.64226943817323e-07, + "loss": 0.8097792863845825, + "step": 1188 + }, + { + "epoch": 0.4331998543866036, + "grad_norm": 18.5, + "learning_rate": 9.640995366785528e-07, + "loss": 1.3712999820709229, + "step": 1190 + }, + { + "epoch": 0.4339279213687659, + "grad_norm": 13.625, + "learning_rate": 9.639719137190885e-07, + "loss": 1.5774592161178589, + "step": 1192 + }, + { + "epoch": 0.4346559883509283, + "grad_norm": 15.6875, + "learning_rate": 9.63844075014578e-07, + "loss": 1.4193360805511475, + "step": 1194 + }, + { + "epoch": 0.43538405533309066, + "grad_norm": 13.6875, + "learning_rate": 9.637160206407983e-07, + "loss": 1.8821792602539062, + "step": 1196 + }, + { + "epoch": 0.436112122315253, + "grad_norm": 10.9375, + "learning_rate": 9.63587750673654e-07, + "loss": 1.55940842628479, + "step": 1198 + }, + { + "epoch": 0.43684018929741536, + "grad_norm": 26.625, + "learning_rate": 9.634592651891773e-07, + "loss": 1.5647202730178833, + "step": 1200 + }, + { + "epoch": 0.43756825627957774, + "grad_norm": 17.25, + "learning_rate": 9.633305642635283e-07, + "loss": 1.499056100845337, + "step": 1202 + }, + { + "epoch": 0.43829632326174006, + "grad_norm": 4.75, + "learning_rate": 9.632016479729945e-07, + "loss": 1.3874766826629639, + "step": 1204 + }, + { + "epoch": 0.43902439024390244, + "grad_norm": 16.75, + "learning_rate": 9.630725163939914e-07, + "loss": 1.4649603366851807, + "step": 1206 + }, + { + "epoch": 0.4397524572260648, + "grad_norm": 19.25, + "learning_rate": 9.629431696030622e-07, + "loss": 1.7015445232391357, + "step": 1208 + }, + { + "epoch": 0.44048052420822714, + "grad_norm": 5.125, + "learning_rate": 9.62813607676877e-07, + "loss": 1.1668474674224854, + "step": 1210 + }, + { + "epoch": 0.4412085911903895, + "grad_norm": 18.375, + "learning_rate": 9.62683830692234e-07, + "loss": 1.6450257301330566, + "step": 1212 + }, + { + "epoch": 0.4419366581725519, + "grad_norm": 7.125, + "learning_rate": 9.62553838726059e-07, + "loss": 1.2077171802520752, + "step": 1214 + }, + { + "epoch": 0.4426647251547142, + "grad_norm": 37.0, + "learning_rate": 9.62423631855405e-07, + "loss": 1.4596638679504395, + "step": 1216 + }, + { + "epoch": 0.4433927921368766, + "grad_norm": 21.75, + "learning_rate": 9.622932101574522e-07, + "loss": 1.384730577468872, + "step": 1218 + }, + { + "epoch": 0.4441208591190389, + "grad_norm": 22.625, + "learning_rate": 9.621625737095082e-07, + "loss": 1.7237133979797363, + "step": 1220 + }, + { + "epoch": 0.4448489261012013, + "grad_norm": 9.4375, + "learning_rate": 9.620317225890082e-07, + "loss": 1.2890348434448242, + "step": 1222 + }, + { + "epoch": 0.4455769930833637, + "grad_norm": 8.25, + "learning_rate": 9.619006568735145e-07, + "loss": 1.2845311164855957, + "step": 1224 + }, + { + "epoch": 0.446305060065526, + "grad_norm": 16.375, + "learning_rate": 9.617693766407162e-07, + "loss": 1.695064902305603, + "step": 1226 + }, + { + "epoch": 0.4470331270476884, + "grad_norm": 45.0, + "learning_rate": 9.616378819684303e-07, + "loss": 1.4220951795578003, + "step": 1228 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 9.1875, + "learning_rate": 9.615061729346002e-07, + "loss": 1.422806739807129, + "step": 1230 + }, + { + "epoch": 0.4484892610120131, + "grad_norm": 14.5, + "learning_rate": 9.613742496172972e-07, + "loss": 1.1806005239486694, + "step": 1232 + }, + { + "epoch": 0.44921732799417546, + "grad_norm": 4.75, + "learning_rate": 9.612421120947185e-07, + "loss": 1.4849927425384521, + "step": 1234 + }, + { + "epoch": 0.44994539497633784, + "grad_norm": 8.625, + "learning_rate": 9.611097604451891e-07, + "loss": 1.3780394792556763, + "step": 1236 + }, + { + "epoch": 0.45067346195850017, + "grad_norm": 11.75, + "learning_rate": 9.60977194747161e-07, + "loss": 1.4589874744415283, + "step": 1238 + }, + { + "epoch": 0.45140152894066254, + "grad_norm": 11.9375, + "learning_rate": 9.608444150792122e-07, + "loss": 1.3657958507537842, + "step": 1240 + }, + { + "epoch": 0.4521295959228249, + "grad_norm": 11.75, + "learning_rate": 9.607114215200488e-07, + "loss": 1.4701348543167114, + "step": 1242 + }, + { + "epoch": 0.45285766290498725, + "grad_norm": 17.5, + "learning_rate": 9.605782141485022e-07, + "loss": 1.31443452835083, + "step": 1244 + }, + { + "epoch": 0.4535857298871496, + "grad_norm": 12.25, + "learning_rate": 9.604447930435315e-07, + "loss": 1.252429485321045, + "step": 1246 + }, + { + "epoch": 0.454313796869312, + "grad_norm": 67.0, + "learning_rate": 9.603111582842227e-07, + "loss": 0.8179546594619751, + "step": 1248 + }, + { + "epoch": 0.4550418638514743, + "grad_norm": 12.9375, + "learning_rate": 9.60177309949788e-07, + "loss": 1.4385579824447632, + "step": 1250 + }, + { + "epoch": 0.4557699308336367, + "grad_norm": 10.9375, + "learning_rate": 9.600432481195662e-07, + "loss": 1.4098849296569824, + "step": 1252 + }, + { + "epoch": 0.45649799781579903, + "grad_norm": 14.4375, + "learning_rate": 9.599089728730223e-07, + "loss": 1.855327844619751, + "step": 1254 + }, + { + "epoch": 0.4572260647979614, + "grad_norm": 53.75, + "learning_rate": 9.597744842897489e-07, + "loss": 1.4976952075958252, + "step": 1256 + }, + { + "epoch": 0.4579541317801238, + "grad_norm": 9.625, + "learning_rate": 9.596397824494638e-07, + "loss": 1.3468235731124878, + "step": 1258 + }, + { + "epoch": 0.4586821987622861, + "grad_norm": 10.3125, + "learning_rate": 9.595048674320119e-07, + "loss": 1.3717831373214722, + "step": 1260 + }, + { + "epoch": 0.4594102657444485, + "grad_norm": 8.875, + "learning_rate": 9.593697393173645e-07, + "loss": 1.3464164733886719, + "step": 1262 + }, + { + "epoch": 0.46013833272661087, + "grad_norm": 11.8125, + "learning_rate": 9.592343981856187e-07, + "loss": 1.2338759899139404, + "step": 1264 + }, + { + "epoch": 0.4608663997087732, + "grad_norm": 17.125, + "learning_rate": 9.590988441169989e-07, + "loss": 1.3063161373138428, + "step": 1266 + }, + { + "epoch": 0.46159446669093557, + "grad_norm": 8.0625, + "learning_rate": 9.589630771918545e-07, + "loss": 1.5045244693756104, + "step": 1268 + }, + { + "epoch": 0.46232253367309795, + "grad_norm": 7.65625, + "learning_rate": 9.588270974906616e-07, + "loss": 1.3927631378173828, + "step": 1270 + }, + { + "epoch": 0.46305060065526027, + "grad_norm": 15.0625, + "learning_rate": 9.586909050940223e-07, + "loss": 1.4836037158966064, + "step": 1272 + }, + { + "epoch": 0.46377866763742265, + "grad_norm": 10.6875, + "learning_rate": 9.585545000826657e-07, + "loss": 1.4663872718811035, + "step": 1274 + }, + { + "epoch": 0.464506734619585, + "grad_norm": 9.1875, + "learning_rate": 9.584178825374452e-07, + "loss": 1.4291770458221436, + "step": 1276 + }, + { + "epoch": 0.46523480160174735, + "grad_norm": 10.1875, + "learning_rate": 9.582810525393415e-07, + "loss": 1.2997241020202637, + "step": 1278 + }, + { + "epoch": 0.46596286858390973, + "grad_norm": 9.3125, + "learning_rate": 9.58144010169461e-07, + "loss": 1.579464316368103, + "step": 1280 + }, + { + "epoch": 0.46669093556607205, + "grad_norm": 10.1875, + "learning_rate": 9.580067555090355e-07, + "loss": 1.4255378246307373, + "step": 1282 + }, + { + "epoch": 0.46741900254823443, + "grad_norm": 19.875, + "learning_rate": 9.578692886394228e-07, + "loss": 1.223609209060669, + "step": 1284 + }, + { + "epoch": 0.4681470695303968, + "grad_norm": 15.4375, + "learning_rate": 9.577316096421072e-07, + "loss": 1.1854100227355957, + "step": 1286 + }, + { + "epoch": 0.46887513651255913, + "grad_norm": 15.0625, + "learning_rate": 9.575937185986982e-07, + "loss": 1.77070951461792, + "step": 1288 + }, + { + "epoch": 0.4696032034947215, + "grad_norm": 11.5, + "learning_rate": 9.574556155909304e-07, + "loss": 1.1872940063476562, + "step": 1290 + }, + { + "epoch": 0.4703312704768839, + "grad_norm": 12.625, + "learning_rate": 9.573173007006652e-07, + "loss": 1.2390830516815186, + "step": 1292 + }, + { + "epoch": 0.4710593374590462, + "grad_norm": 11.0625, + "learning_rate": 9.571787740098887e-07, + "loss": 1.352156639099121, + "step": 1294 + }, + { + "epoch": 0.4717874044412086, + "grad_norm": 19.875, + "learning_rate": 9.570400356007128e-07, + "loss": 1.2924940586090088, + "step": 1296 + }, + { + "epoch": 0.47251547142337097, + "grad_norm": 49.25, + "learning_rate": 9.56901085555375e-07, + "loss": 1.2630419731140137, + "step": 1298 + }, + { + "epoch": 0.4732435384055333, + "grad_norm": 12.8125, + "learning_rate": 9.567619239562383e-07, + "loss": 1.2953908443450928, + "step": 1300 + }, + { + "epoch": 0.47397160538769567, + "grad_norm": 4.0625, + "learning_rate": 9.566225508857912e-07, + "loss": 1.0397285223007202, + "step": 1302 + }, + { + "epoch": 0.47469967236985805, + "grad_norm": 11.5, + "learning_rate": 9.56482966426647e-07, + "loss": 0.6560619473457336, + "step": 1304 + }, + { + "epoch": 0.4754277393520204, + "grad_norm": 216.0, + "learning_rate": 9.563431706615445e-07, + "loss": 1.4134376049041748, + "step": 1306 + }, + { + "epoch": 0.47615580633418275, + "grad_norm": 9.5625, + "learning_rate": 9.562031636733488e-07, + "loss": 1.5105488300323486, + "step": 1308 + }, + { + "epoch": 0.47688387331634513, + "grad_norm": 20.5, + "learning_rate": 9.560629455450483e-07, + "loss": 1.1349685192108154, + "step": 1310 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 44.75, + "learning_rate": 9.55922516359758e-07, + "loss": 1.446705937385559, + "step": 1312 + }, + { + "epoch": 0.47834000728066983, + "grad_norm": 14.3125, + "learning_rate": 9.557818762007177e-07, + "loss": 1.5887718200683594, + "step": 1314 + }, + { + "epoch": 0.47906807426283216, + "grad_norm": 11.9375, + "learning_rate": 9.556410251512918e-07, + "loss": 1.2883789539337158, + "step": 1316 + }, + { + "epoch": 0.47979614124499453, + "grad_norm": 4.21875, + "learning_rate": 9.554999632949703e-07, + "loss": 1.256500482559204, + "step": 1318 + }, + { + "epoch": 0.4805242082271569, + "grad_norm": 6.25, + "learning_rate": 9.553586907153677e-07, + "loss": 1.0872838497161865, + "step": 1320 + }, + { + "epoch": 0.48125227520931924, + "grad_norm": 5.71875, + "learning_rate": 9.552172074962237e-07, + "loss": 1.491745948791504, + "step": 1322 + }, + { + "epoch": 0.4819803421914816, + "grad_norm": 25.125, + "learning_rate": 9.550755137214027e-07, + "loss": 1.4779605865478516, + "step": 1324 + }, + { + "epoch": 0.482708409173644, + "grad_norm": 12.4375, + "learning_rate": 9.54933609474894e-07, + "loss": 1.2364628314971924, + "step": 1326 + }, + { + "epoch": 0.4834364761558063, + "grad_norm": 14.5625, + "learning_rate": 9.547914948408113e-07, + "loss": 1.4419231414794922, + "step": 1328 + }, + { + "epoch": 0.4841645431379687, + "grad_norm": 13.6875, + "learning_rate": 9.546491699033938e-07, + "loss": 1.4463355541229248, + "step": 1330 + }, + { + "epoch": 0.4848926101201311, + "grad_norm": 10.375, + "learning_rate": 9.545066347470048e-07, + "loss": 1.423866868019104, + "step": 1332 + }, + { + "epoch": 0.4856206771022934, + "grad_norm": 11.1875, + "learning_rate": 9.543638894561318e-07, + "loss": 1.5960347652435303, + "step": 1334 + }, + { + "epoch": 0.4863487440844558, + "grad_norm": 14.0, + "learning_rate": 9.542209341153882e-07, + "loss": 1.46177077293396, + "step": 1336 + }, + { + "epoch": 0.48707681106661815, + "grad_norm": 17.5, + "learning_rate": 9.540777688095104e-07, + "loss": 1.4224399328231812, + "step": 1338 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 16.875, + "learning_rate": 9.539343936233597e-07, + "loss": 1.2945530414581299, + "step": 1340 + }, + { + "epoch": 0.48853294503094286, + "grad_norm": 12.1875, + "learning_rate": 9.537908086419226e-07, + "loss": 1.1429345607757568, + "step": 1342 + }, + { + "epoch": 0.4892610120131052, + "grad_norm": 14.9375, + "learning_rate": 9.536470139503092e-07, + "loss": 1.5957437753677368, + "step": 1344 + }, + { + "epoch": 0.48998907899526756, + "grad_norm": 20.125, + "learning_rate": 9.535030096337541e-07, + "loss": 1.4378583431243896, + "step": 1346 + }, + { + "epoch": 0.49071714597742994, + "grad_norm": 10.4375, + "learning_rate": 9.533587957776159e-07, + "loss": 1.0828802585601807, + "step": 1348 + }, + { + "epoch": 0.49144521295959226, + "grad_norm": 11.0625, + "learning_rate": 9.53214372467378e-07, + "loss": 1.297530174255371, + "step": 1350 + }, + { + "epoch": 0.49217327994175464, + "grad_norm": 7.375, + "learning_rate": 9.530697397886471e-07, + "loss": 1.285134196281433, + "step": 1352 + }, + { + "epoch": 0.492901346923917, + "grad_norm": 15.25, + "learning_rate": 9.529248978271551e-07, + "loss": 1.5339641571044922, + "step": 1354 + }, + { + "epoch": 0.49362941390607934, + "grad_norm": 63.75, + "learning_rate": 9.527798466687569e-07, + "loss": 1.3771967887878418, + "step": 1356 + }, + { + "epoch": 0.4943574808882417, + "grad_norm": 14.875, + "learning_rate": 9.52634586399432e-07, + "loss": 0.9618538618087769, + "step": 1358 + }, + { + "epoch": 0.4950855478704041, + "grad_norm": 9.25, + "learning_rate": 9.524891171052835e-07, + "loss": 1.5211610794067383, + "step": 1360 + }, + { + "epoch": 0.4958136148525664, + "grad_norm": 12.0, + "learning_rate": 9.523434388725391e-07, + "loss": 1.2418715953826904, + "step": 1362 + }, + { + "epoch": 0.4965416818347288, + "grad_norm": 20.0, + "learning_rate": 9.521975517875494e-07, + "loss": 1.536102056503296, + "step": 1364 + }, + { + "epoch": 0.4972697488168912, + "grad_norm": 12.1875, + "learning_rate": 9.520514559367894e-07, + "loss": 1.3683820962905884, + "step": 1366 + }, + { + "epoch": 0.4979978157990535, + "grad_norm": 8.625, + "learning_rate": 9.519051514068574e-07, + "loss": 1.3063020706176758, + "step": 1368 + }, + { + "epoch": 0.4987258827812159, + "grad_norm": 22.5, + "learning_rate": 9.517586382844765e-07, + "loss": 1.4466381072998047, + "step": 1370 + }, + { + "epoch": 0.49945394976337826, + "grad_norm": 11.3125, + "learning_rate": 9.516119166564918e-07, + "loss": 1.4567177295684814, + "step": 1372 + }, + { + "epoch": 0.5001820167455406, + "grad_norm": 18.75, + "learning_rate": 9.514649866098732e-07, + "loss": 1.8033032417297363, + "step": 1374 + }, + { + "epoch": 0.500910083727703, + "grad_norm": 7.96875, + "learning_rate": 9.513178482317134e-07, + "loss": 1.3142805099487305, + "step": 1376 + }, + { + "epoch": 0.5016381507098653, + "grad_norm": 11.625, + "learning_rate": 9.511705016092297e-07, + "loss": 1.539224624633789, + "step": 1378 + }, + { + "epoch": 0.5023662176920277, + "grad_norm": 20.875, + "learning_rate": 9.510229468297615e-07, + "loss": 1.5599443912506104, + "step": 1380 + }, + { + "epoch": 0.50309428467419, + "grad_norm": 10.0625, + "learning_rate": 9.508751839807721e-07, + "loss": 1.1496529579162598, + "step": 1382 + }, + { + "epoch": 0.5038223516563524, + "grad_norm": 20.625, + "learning_rate": 9.507272131498487e-07, + "loss": 1.3222904205322266, + "step": 1384 + }, + { + "epoch": 0.5045504186385148, + "grad_norm": 11.1875, + "learning_rate": 9.505790344247009e-07, + "loss": 1.2819888591766357, + "step": 1386 + }, + { + "epoch": 0.5052784856206771, + "grad_norm": 3.578125, + "learning_rate": 9.504306478931619e-07, + "loss": 1.5472346544265747, + "step": 1388 + }, + { + "epoch": 0.5060065526028394, + "grad_norm": 16.5, + "learning_rate": 9.502820536431884e-07, + "loss": 1.1562196016311646, + "step": 1390 + }, + { + "epoch": 0.5067346195850018, + "grad_norm": 13.125, + "learning_rate": 9.501332517628597e-07, + "loss": 1.4038900136947632, + "step": 1392 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 10.9375, + "learning_rate": 9.499842423403785e-07, + "loss": 1.1714105606079102, + "step": 1394 + }, + { + "epoch": 0.5081907535493265, + "grad_norm": 16.5, + "learning_rate": 9.498350254640704e-07, + "loss": 1.5616514682769775, + "step": 1396 + }, + { + "epoch": 0.5089188205314888, + "grad_norm": 10.0, + "learning_rate": 9.496856012223842e-07, + "loss": 1.4193261861801147, + "step": 1398 + }, + { + "epoch": 0.5096468875136513, + "grad_norm": 19.375, + "learning_rate": 9.495359697038907e-07, + "loss": 1.3218188285827637, + "step": 1400 + }, + { + "epoch": 0.5103749544958136, + "grad_norm": 18.0, + "learning_rate": 9.49386130997285e-07, + "loss": 1.0799472332000732, + "step": 1402 + }, + { + "epoch": 0.5111030214779759, + "grad_norm": 18.375, + "learning_rate": 9.492360851913839e-07, + "loss": 1.6455869674682617, + "step": 1404 + }, + { + "epoch": 0.5118310884601384, + "grad_norm": 35.5, + "learning_rate": 9.490858323751276e-07, + "loss": 1.2113614082336426, + "step": 1406 + }, + { + "epoch": 0.5125591554423007, + "grad_norm": 40.25, + "learning_rate": 9.489353726375787e-07, + "loss": 0.9057716727256775, + "step": 1408 + }, + { + "epoch": 0.513287222424463, + "grad_norm": 27.25, + "learning_rate": 9.487847060679221e-07, + "loss": 1.7633662223815918, + "step": 1410 + }, + { + "epoch": 0.5140152894066254, + "grad_norm": 24.125, + "learning_rate": 9.486338327554662e-07, + "loss": 2.1992573738098145, + "step": 1412 + }, + { + "epoch": 0.5147433563887878, + "grad_norm": 47.5, + "learning_rate": 9.484827527896413e-07, + "loss": 1.6743590831756592, + "step": 1414 + }, + { + "epoch": 0.5154714233709501, + "grad_norm": 13.0625, + "learning_rate": 9.483314662599999e-07, + "loss": 0.9837207794189453, + "step": 1416 + }, + { + "epoch": 0.5161994903531125, + "grad_norm": 11.0, + "learning_rate": 9.481799732562181e-07, + "loss": 1.519030213356018, + "step": 1418 + }, + { + "epoch": 0.5169275573352748, + "grad_norm": 55.5, + "learning_rate": 9.480282738680932e-07, + "loss": 1.211798906326294, + "step": 1420 + }, + { + "epoch": 0.5176556243174372, + "grad_norm": 6.0625, + "learning_rate": 9.478763681855455e-07, + "loss": 1.4611196517944336, + "step": 1422 + }, + { + "epoch": 0.5183836912995996, + "grad_norm": 14.5625, + "learning_rate": 9.477242562986172e-07, + "loss": 1.2340278625488281, + "step": 1424 + }, + { + "epoch": 0.5191117582817619, + "grad_norm": 12.25, + "learning_rate": 9.475719382974732e-07, + "loss": 1.4552310705184937, + "step": 1426 + }, + { + "epoch": 0.5198398252639242, + "grad_norm": 13.6875, + "learning_rate": 9.474194142723999e-07, + "loss": 1.1726797819137573, + "step": 1428 + }, + { + "epoch": 0.5205678922460867, + "grad_norm": 15.5625, + "learning_rate": 9.472666843138064e-07, + "loss": 1.3448482751846313, + "step": 1430 + }, + { + "epoch": 0.521295959228249, + "grad_norm": 25.125, + "learning_rate": 9.471137485122238e-07, + "loss": 1.5541718006134033, + "step": 1432 + }, + { + "epoch": 0.5220240262104113, + "grad_norm": 23.375, + "learning_rate": 9.469606069583049e-07, + "loss": 1.8315887451171875, + "step": 1434 + }, + { + "epoch": 0.5227520931925738, + "grad_norm": 13.9375, + "learning_rate": 9.46807259742825e-07, + "loss": 1.5471179485321045, + "step": 1436 + }, + { + "epoch": 0.5234801601747361, + "grad_norm": 16.0, + "learning_rate": 9.466537069566803e-07, + "loss": 1.518141269683838, + "step": 1438 + }, + { + "epoch": 0.5242082271568984, + "grad_norm": 32.0, + "learning_rate": 9.4649994869089e-07, + "loss": 1.4696861505508423, + "step": 1440 + }, + { + "epoch": 0.5249362941390608, + "grad_norm": 18.625, + "learning_rate": 9.463459850365945e-07, + "loss": 1.4636242389678955, + "step": 1442 + }, + { + "epoch": 0.5256643611212232, + "grad_norm": 23.0, + "learning_rate": 9.46191816085056e-07, + "loss": 1.5517750978469849, + "step": 1444 + }, + { + "epoch": 0.5263924281033855, + "grad_norm": 29.75, + "learning_rate": 9.460374419276584e-07, + "loss": 1.1619925498962402, + "step": 1446 + }, + { + "epoch": 0.5271204950855479, + "grad_norm": 9.625, + "learning_rate": 9.458828626559075e-07, + "loss": 1.2186235189437866, + "step": 1448 + }, + { + "epoch": 0.5278485620677102, + "grad_norm": 16.0, + "learning_rate": 9.457280783614303e-07, + "loss": 1.398910641670227, + "step": 1450 + }, + { + "epoch": 0.5285766290498726, + "grad_norm": 13.4375, + "learning_rate": 9.455730891359755e-07, + "loss": 1.6344785690307617, + "step": 1452 + }, + { + "epoch": 0.5293046960320349, + "grad_norm": 18.0, + "learning_rate": 9.454178950714131e-07, + "loss": 1.7311197519302368, + "step": 1454 + }, + { + "epoch": 0.5300327630141973, + "grad_norm": 19.0, + "learning_rate": 9.452624962597351e-07, + "loss": 1.4906041622161865, + "step": 1456 + }, + { + "epoch": 0.5307608299963597, + "grad_norm": 20.875, + "learning_rate": 9.451068927930538e-07, + "loss": 1.4144517183303833, + "step": 1458 + }, + { + "epoch": 0.531488896978522, + "grad_norm": 11.9375, + "learning_rate": 9.449510847636042e-07, + "loss": 1.4886242151260376, + "step": 1460 + }, + { + "epoch": 0.5322169639606844, + "grad_norm": 10.3125, + "learning_rate": 9.447950722637415e-07, + "loss": 1.5163488388061523, + "step": 1462 + }, + { + "epoch": 0.5329450309428467, + "grad_norm": 10.375, + "learning_rate": 9.446388553859419e-07, + "loss": 0.5711454749107361, + "step": 1464 + }, + { + "epoch": 0.533673097925009, + "grad_norm": 20.25, + "learning_rate": 9.444824342228039e-07, + "loss": 0.5944864749908447, + "step": 1466 + }, + { + "epoch": 0.5344011649071715, + "grad_norm": 14.0625, + "learning_rate": 9.443258088670462e-07, + "loss": 1.2792809009552002, + "step": 1468 + }, + { + "epoch": 0.5351292318893338, + "grad_norm": 12.6875, + "learning_rate": 9.441689794115088e-07, + "loss": 1.4684710502624512, + "step": 1470 + }, + { + "epoch": 0.5358572988714961, + "grad_norm": 10.9375, + "learning_rate": 9.440119459491529e-07, + "loss": 1.492457389831543, + "step": 1472 + }, + { + "epoch": 0.5365853658536586, + "grad_norm": 11.9375, + "learning_rate": 9.438547085730599e-07, + "loss": 1.448129415512085, + "step": 1474 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 10.4375, + "learning_rate": 9.43697267376433e-07, + "loss": 1.5418062210083008, + "step": 1476 + }, + { + "epoch": 0.5380414998179832, + "grad_norm": 43.0, + "learning_rate": 9.435396224525953e-07, + "loss": 1.6280195713043213, + "step": 1478 + }, + { + "epoch": 0.5387695668001456, + "grad_norm": 7.125, + "learning_rate": 9.433817738949916e-07, + "loss": 1.4109612703323364, + "step": 1480 + }, + { + "epoch": 0.539497633782308, + "grad_norm": 12.75, + "learning_rate": 9.432237217971867e-07, + "loss": 1.451993465423584, + "step": 1482 + }, + { + "epoch": 0.5402257007644703, + "grad_norm": 19.0, + "learning_rate": 9.430654662528663e-07, + "loss": 1.2106096744537354, + "step": 1484 + }, + { + "epoch": 0.5409537677466327, + "grad_norm": 10.5625, + "learning_rate": 9.429070073558368e-07, + "loss": 1.6775755882263184, + "step": 1486 + }, + { + "epoch": 0.541681834728795, + "grad_norm": 9.4375, + "learning_rate": 9.427483452000248e-07, + "loss": 1.6519020795822144, + "step": 1488 + }, + { + "epoch": 0.5424099017109574, + "grad_norm": 14.1875, + "learning_rate": 9.425894798794777e-07, + "loss": 1.5437301397323608, + "step": 1490 + }, + { + "epoch": 0.5431379686931198, + "grad_norm": 49.25, + "learning_rate": 9.424304114883635e-07, + "loss": 2.0106287002563477, + "step": 1492 + }, + { + "epoch": 0.5438660356752821, + "grad_norm": 22.5, + "learning_rate": 9.422711401209697e-07, + "loss": 1.4373929500579834, + "step": 1494 + }, + { + "epoch": 0.5445941026574445, + "grad_norm": 10.5625, + "learning_rate": 9.421116658717052e-07, + "loss": 1.4898631572723389, + "step": 1496 + }, + { + "epoch": 0.5453221696396069, + "grad_norm": 7.8125, + "learning_rate": 9.419519888350984e-07, + "loss": 0.792231023311615, + "step": 1498 + }, + { + "epoch": 0.5460502366217692, + "grad_norm": 11.125, + "learning_rate": 9.417921091057983e-07, + "loss": 1.3271379470825195, + "step": 1500 + }, + { + "epoch": 0.5467783036039315, + "grad_norm": 11.125, + "learning_rate": 9.416320267785738e-07, + "loss": 1.5323938131332397, + "step": 1502 + }, + { + "epoch": 0.547506370586094, + "grad_norm": 50.0, + "learning_rate": 9.41471741948314e-07, + "loss": 1.0945998430252075, + "step": 1504 + }, + { + "epoch": 0.5482344375682563, + "grad_norm": 28.5, + "learning_rate": 9.413112547100284e-07, + "loss": 1.3627337217330933, + "step": 1506 + }, + { + "epoch": 0.5489625045504186, + "grad_norm": 12.4375, + "learning_rate": 9.411505651588456e-07, + "loss": 1.3237662315368652, + "step": 1508 + }, + { + "epoch": 0.549690571532581, + "grad_norm": 14.8125, + "learning_rate": 9.409896733900151e-07, + "loss": 1.6444835662841797, + "step": 1510 + }, + { + "epoch": 0.5504186385147434, + "grad_norm": 6.5625, + "learning_rate": 9.408285794989052e-07, + "loss": 1.4095518589019775, + "step": 1512 + }, + { + "epoch": 0.5511467054969057, + "grad_norm": 9.5625, + "learning_rate": 9.406672835810055e-07, + "loss": 1.5294262170791626, + "step": 1514 + }, + { + "epoch": 0.551874772479068, + "grad_norm": 15.5625, + "learning_rate": 9.405057857319237e-07, + "loss": 1.0562444925308228, + "step": 1516 + }, + { + "epoch": 0.5526028394612305, + "grad_norm": 9.5625, + "learning_rate": 9.403440860473882e-07, + "loss": 1.6421406269073486, + "step": 1518 + }, + { + "epoch": 0.5533309064433928, + "grad_norm": 11.875, + "learning_rate": 9.401821846232467e-07, + "loss": 1.454357624053955, + "step": 1520 + }, + { + "epoch": 0.5540589734255551, + "grad_norm": 20.75, + "learning_rate": 9.400200815554666e-07, + "loss": 1.0757343769073486, + "step": 1522 + }, + { + "epoch": 0.5547870404077175, + "grad_norm": 21.625, + "learning_rate": 9.398577769401352e-07, + "loss": 1.245919942855835, + "step": 1524 + }, + { + "epoch": 0.5555151073898799, + "grad_norm": 33.0, + "learning_rate": 9.396952708734584e-07, + "loss": 1.5568921566009521, + "step": 1526 + }, + { + "epoch": 0.5562431743720422, + "grad_norm": 7.90625, + "learning_rate": 9.395325634517621e-07, + "loss": 1.0385042428970337, + "step": 1528 + }, + { + "epoch": 0.5569712413542046, + "grad_norm": 7.1875, + "learning_rate": 9.393696547714916e-07, + "loss": 0.8519480228424072, + "step": 1530 + }, + { + "epoch": 0.5576993083363669, + "grad_norm": 20.0, + "learning_rate": 9.39206544929211e-07, + "loss": 1.2682623863220215, + "step": 1532 + }, + { + "epoch": 0.5584273753185293, + "grad_norm": 14.9375, + "learning_rate": 9.390432340216044e-07, + "loss": 1.5363035202026367, + "step": 1534 + }, + { + "epoch": 0.5591554423006917, + "grad_norm": 11.0625, + "learning_rate": 9.388797221454743e-07, + "loss": 1.4752392768859863, + "step": 1536 + }, + { + "epoch": 0.559883509282854, + "grad_norm": 27.375, + "learning_rate": 9.387160093977429e-07, + "loss": 1.7215142250061035, + "step": 1538 + }, + { + "epoch": 0.5606115762650163, + "grad_norm": 10.3125, + "learning_rate": 9.385520958754509e-07, + "loss": 1.4824178218841553, + "step": 1540 + }, + { + "epoch": 0.5613396432471788, + "grad_norm": 11.0625, + "learning_rate": 9.383879816757588e-07, + "loss": 1.4371330738067627, + "step": 1542 + }, + { + "epoch": 0.5620677102293411, + "grad_norm": 29.5, + "learning_rate": 9.382236668959453e-07, + "loss": 0.6484320163726807, + "step": 1544 + }, + { + "epoch": 0.5627957772115034, + "grad_norm": 51.0, + "learning_rate": 9.380591516334084e-07, + "loss": 1.4053144454956055, + "step": 1546 + }, + { + "epoch": 0.5635238441936659, + "grad_norm": 12.125, + "learning_rate": 9.378944359856651e-07, + "loss": 1.2351181507110596, + "step": 1548 + }, + { + "epoch": 0.5642519111758282, + "grad_norm": 11.1875, + "learning_rate": 9.377295200503504e-07, + "loss": 1.5107074975967407, + "step": 1550 + }, + { + "epoch": 0.5649799781579905, + "grad_norm": 11.9375, + "learning_rate": 9.375644039252186e-07, + "loss": 1.2700657844543457, + "step": 1552 + }, + { + "epoch": 0.5657080451401529, + "grad_norm": 22.125, + "learning_rate": 9.37399087708143e-07, + "loss": 1.4463136196136475, + "step": 1554 + }, + { + "epoch": 0.5664361121223153, + "grad_norm": 22.25, + "learning_rate": 9.372335714971151e-07, + "loss": 1.6818041801452637, + "step": 1556 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 18.625, + "learning_rate": 9.370678553902445e-07, + "loss": 1.2628449201583862, + "step": 1558 + }, + { + "epoch": 0.56789224608664, + "grad_norm": 49.25, + "learning_rate": 9.369019394857598e-07, + "loss": 1.2309166193008423, + "step": 1560 + }, + { + "epoch": 0.5686203130688023, + "grad_norm": 8.9375, + "learning_rate": 9.367358238820082e-07, + "loss": 1.2420250177383423, + "step": 1562 + }, + { + "epoch": 0.5693483800509647, + "grad_norm": 5.59375, + "learning_rate": 9.365695086774552e-07, + "loss": 1.15067720413208, + "step": 1564 + }, + { + "epoch": 0.5700764470331271, + "grad_norm": 12.375, + "learning_rate": 9.364029939706843e-07, + "loss": 1.2852246761322021, + "step": 1566 + }, + { + "epoch": 0.5708045140152894, + "grad_norm": 6.0, + "learning_rate": 9.362362798603971e-07, + "loss": 1.168177604675293, + "step": 1568 + }, + { + "epoch": 0.5715325809974517, + "grad_norm": 14.3125, + "learning_rate": 9.36069366445414e-07, + "loss": 1.9643328189849854, + "step": 1570 + }, + { + "epoch": 0.5722606479796142, + "grad_norm": 7.21875, + "learning_rate": 9.359022538246732e-07, + "loss": 1.1161983013153076, + "step": 1572 + }, + { + "epoch": 0.5729887149617765, + "grad_norm": 40.0, + "learning_rate": 9.357349420972311e-07, + "loss": 1.4544919729232788, + "step": 1574 + }, + { + "epoch": 0.5737167819439388, + "grad_norm": 3.890625, + "learning_rate": 9.355674313622619e-07, + "loss": 1.3140437602996826, + "step": 1576 + }, + { + "epoch": 0.5744448489261011, + "grad_norm": 22.375, + "learning_rate": 9.353997217190581e-07, + "loss": 0.9034355282783508, + "step": 1578 + }, + { + "epoch": 0.5751729159082636, + "grad_norm": 11.375, + "learning_rate": 9.352318132670296e-07, + "loss": 1.4425523281097412, + "step": 1580 + }, + { + "epoch": 0.5759009828904259, + "grad_norm": 32.5, + "learning_rate": 9.350637061057047e-07, + "loss": 1.1042414903640747, + "step": 1582 + }, + { + "epoch": 0.5766290498725882, + "grad_norm": 32.75, + "learning_rate": 9.348954003347294e-07, + "loss": 1.5225791931152344, + "step": 1584 + }, + { + "epoch": 0.5773571168547507, + "grad_norm": 35.75, + "learning_rate": 9.347268960538668e-07, + "loss": 1.3240678310394287, + "step": 1586 + }, + { + "epoch": 0.578085183836913, + "grad_norm": 22.875, + "learning_rate": 9.345581933629985e-07, + "loss": 1.5799564123153687, + "step": 1588 + }, + { + "epoch": 0.5788132508190753, + "grad_norm": 13.0625, + "learning_rate": 9.34389292362123e-07, + "loss": 1.3312690258026123, + "step": 1590 + }, + { + "epoch": 0.5795413178012377, + "grad_norm": 6.375, + "learning_rate": 9.342201931513571e-07, + "loss": 1.2580642700195312, + "step": 1592 + }, + { + "epoch": 0.5802693847834001, + "grad_norm": 10.875, + "learning_rate": 9.340508958309345e-07, + "loss": 1.2592438459396362, + "step": 1594 + }, + { + "epoch": 0.5809974517655624, + "grad_norm": 14.625, + "learning_rate": 9.338814005012066e-07, + "loss": 1.0369317531585693, + "step": 1596 + }, + { + "epoch": 0.5817255187477248, + "grad_norm": 73.5, + "learning_rate": 9.337117072626419e-07, + "loss": 1.1292498111724854, + "step": 1598 + }, + { + "epoch": 0.5824535857298871, + "grad_norm": 9.3125, + "learning_rate": 9.335418162158264e-07, + "loss": 1.2639811038970947, + "step": 1600 + }, + { + "epoch": 0.5831816527120495, + "grad_norm": 20.625, + "learning_rate": 9.333717274614634e-07, + "loss": 1.7338676452636719, + "step": 1602 + }, + { + "epoch": 0.5839097196942119, + "grad_norm": 16.25, + "learning_rate": 9.332014411003735e-07, + "loss": 1.3670799732208252, + "step": 1604 + }, + { + "epoch": 0.5846377866763742, + "grad_norm": 21.75, + "learning_rate": 9.33030957233494e-07, + "loss": 1.7347283363342285, + "step": 1606 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 6.5625, + "learning_rate": 9.328602759618796e-07, + "loss": 1.3957867622375488, + "step": 1608 + }, + { + "epoch": 0.586093920640699, + "grad_norm": 11.75, + "learning_rate": 9.32689397386702e-07, + "loss": 1.4478832483291626, + "step": 1610 + }, + { + "epoch": 0.5868219876228613, + "grad_norm": 19.125, + "learning_rate": 9.325183216092501e-07, + "loss": 1.4609471559524536, + "step": 1612 + }, + { + "epoch": 0.5875500546050236, + "grad_norm": 25.75, + "learning_rate": 9.323470487309288e-07, + "loss": 1.5368876457214355, + "step": 1614 + }, + { + "epoch": 0.5882781215871861, + "grad_norm": 18.625, + "learning_rate": 9.321755788532608e-07, + "loss": 1.7292656898498535, + "step": 1616 + }, + { + "epoch": 0.5890061885693484, + "grad_norm": 12.6875, + "learning_rate": 9.320039120778851e-07, + "loss": 1.2410686016082764, + "step": 1618 + }, + { + "epoch": 0.5897342555515107, + "grad_norm": 5.28125, + "learning_rate": 9.318320485065576e-07, + "loss": 1.3123751878738403, + "step": 1620 + }, + { + "epoch": 0.5904623225336731, + "grad_norm": 18.0, + "learning_rate": 9.316599882411504e-07, + "loss": 0.969210684299469, + "step": 1622 + }, + { + "epoch": 0.5911903895158355, + "grad_norm": 18.625, + "learning_rate": 9.314877313836532e-07, + "loss": 1.3068957328796387, + "step": 1624 + }, + { + "epoch": 0.5919184564979978, + "grad_norm": 13.3125, + "learning_rate": 9.313152780361711e-07, + "loss": 1.628964900970459, + "step": 1626 + }, + { + "epoch": 0.5926465234801602, + "grad_norm": 19.5, + "learning_rate": 9.31142628300926e-07, + "loss": 1.6994214057922363, + "step": 1628 + }, + { + "epoch": 0.5933745904623225, + "grad_norm": 14.3125, + "learning_rate": 9.309697822802571e-07, + "loss": 1.601043939590454, + "step": 1630 + }, + { + "epoch": 0.5941026574444849, + "grad_norm": 14.125, + "learning_rate": 9.307967400766183e-07, + "loss": 1.2637888193130493, + "step": 1632 + }, + { + "epoch": 0.5948307244266473, + "grad_norm": 18.25, + "learning_rate": 9.306235017925814e-07, + "loss": 1.1639573574066162, + "step": 1634 + }, + { + "epoch": 0.5955587914088096, + "grad_norm": 14.0, + "learning_rate": 9.304500675308336e-07, + "loss": 1.330085277557373, + "step": 1636 + }, + { + "epoch": 0.596286858390972, + "grad_norm": 10.75, + "learning_rate": 9.302764373941778e-07, + "loss": 1.3711938858032227, + "step": 1638 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 13.125, + "learning_rate": 9.301026114855344e-07, + "loss": 1.4097814559936523, + "step": 1640 + }, + { + "epoch": 0.5977429923552967, + "grad_norm": 19.875, + "learning_rate": 9.299285899079386e-07, + "loss": 1.201526403427124, + "step": 1642 + }, + { + "epoch": 0.598471059337459, + "grad_norm": 9.9375, + "learning_rate": 9.297543727645418e-07, + "loss": 1.590818166732788, + "step": 1644 + }, + { + "epoch": 0.5991991263196214, + "grad_norm": 81.5, + "learning_rate": 9.295799601586123e-07, + "loss": 1.235093593597412, + "step": 1646 + }, + { + "epoch": 0.5999271933017838, + "grad_norm": 10.375, + "learning_rate": 9.294053521935328e-07, + "loss": 1.4714232683181763, + "step": 1648 + }, + { + "epoch": 0.6006552602839461, + "grad_norm": 24.125, + "learning_rate": 9.292305489728027e-07, + "loss": 1.5945860147476196, + "step": 1650 + }, + { + "epoch": 0.6013833272661084, + "grad_norm": 18.875, + "learning_rate": 9.29055550600037e-07, + "loss": 1.2780591249465942, + "step": 1652 + }, + { + "epoch": 0.6021113942482709, + "grad_norm": 11.0625, + "learning_rate": 9.288803571789665e-07, + "loss": 1.3222019672393799, + "step": 1654 + }, + { + "epoch": 0.6028394612304332, + "grad_norm": 22.75, + "learning_rate": 9.28704968813437e-07, + "loss": 1.5864791870117188, + "step": 1656 + }, + { + "epoch": 0.6035675282125955, + "grad_norm": 3.625, + "learning_rate": 9.285293856074108e-07, + "loss": 1.348010778427124, + "step": 1658 + }, + { + "epoch": 0.604295595194758, + "grad_norm": 8.4375, + "learning_rate": 9.283536076649647e-07, + "loss": 1.0766938924789429, + "step": 1660 + }, + { + "epoch": 0.6050236621769203, + "grad_norm": 18.625, + "learning_rate": 9.281776350902916e-07, + "loss": 1.5156760215759277, + "step": 1662 + }, + { + "epoch": 0.6057517291590826, + "grad_norm": 19.25, + "learning_rate": 9.280014679877e-07, + "loss": 1.5269720554351807, + "step": 1664 + }, + { + "epoch": 0.606479796141245, + "grad_norm": 50.75, + "learning_rate": 9.278251064616125e-07, + "loss": 1.9046599864959717, + "step": 1666 + }, + { + "epoch": 0.6072078631234074, + "grad_norm": 13.6875, + "learning_rate": 9.276485506165682e-07, + "loss": 1.6765878200531006, + "step": 1668 + }, + { + "epoch": 0.6079359301055697, + "grad_norm": 43.75, + "learning_rate": 9.274718005572208e-07, + "loss": 1.7212928533554077, + "step": 1670 + }, + { + "epoch": 0.6086639970877321, + "grad_norm": 5.65625, + "learning_rate": 9.272948563883394e-07, + "loss": 1.2742149829864502, + "step": 1672 + }, + { + "epoch": 0.6093920640698944, + "grad_norm": 20.625, + "learning_rate": 9.271177182148078e-07, + "loss": 0.7997211217880249, + "step": 1674 + }, + { + "epoch": 0.6101201310520568, + "grad_norm": 9.6875, + "learning_rate": 9.269403861416252e-07, + "loss": 1.6116950511932373, + "step": 1676 + }, + { + "epoch": 0.6108481980342192, + "grad_norm": 14.4375, + "learning_rate": 9.267628602739051e-07, + "loss": 1.3693790435791016, + "step": 1678 + }, + { + "epoch": 0.6115762650163815, + "grad_norm": 11.9375, + "learning_rate": 9.265851407168765e-07, + "loss": 1.3668937683105469, + "step": 1680 + }, + { + "epoch": 0.6123043319985438, + "grad_norm": 12.75, + "learning_rate": 9.264072275758834e-07, + "loss": 1.4822938442230225, + "step": 1682 + }, + { + "epoch": 0.6130323989807063, + "grad_norm": 7.625, + "learning_rate": 9.262291209563837e-07, + "loss": 1.388484239578247, + "step": 1684 + }, + { + "epoch": 0.6137604659628686, + "grad_norm": 78.0, + "learning_rate": 9.260508209639508e-07, + "loss": 1.5740796327590942, + "step": 1686 + }, + { + "epoch": 0.6144885329450309, + "grad_norm": 19.125, + "learning_rate": 9.258723277042717e-07, + "loss": 1.4938759803771973, + "step": 1688 + }, + { + "epoch": 0.6152165999271934, + "grad_norm": 4.84375, + "learning_rate": 9.256936412831493e-07, + "loss": 1.013038992881775, + "step": 1690 + }, + { + "epoch": 0.6159446669093557, + "grad_norm": 16.0, + "learning_rate": 9.255147618065e-07, + "loss": 1.3060342073440552, + "step": 1692 + }, + { + "epoch": 0.616672733891518, + "grad_norm": 12.5, + "learning_rate": 9.253356893803552e-07, + "loss": 1.5061357021331787, + "step": 1694 + }, + { + "epoch": 0.6174008008736804, + "grad_norm": 21.75, + "learning_rate": 9.2515642411086e-07, + "loss": 1.8787455558776855, + "step": 1696 + }, + { + "epoch": 0.6181288678558428, + "grad_norm": 40.5, + "learning_rate": 9.249769661042745e-07, + "loss": 0.941942036151886, + "step": 1698 + }, + { + "epoch": 0.6188569348380051, + "grad_norm": 27.625, + "learning_rate": 9.24797315466973e-07, + "loss": 1.4722591638565063, + "step": 1700 + }, + { + "epoch": 0.6195850018201674, + "grad_norm": 3.90625, + "learning_rate": 9.246174723054434e-07, + "loss": 1.1384129524230957, + "step": 1702 + }, + { + "epoch": 0.6203130688023298, + "grad_norm": 4.625, + "learning_rate": 9.244374367262881e-07, + "loss": 1.016237735748291, + "step": 1704 + }, + { + "epoch": 0.6210411357844922, + "grad_norm": 16.25, + "learning_rate": 9.242572088362235e-07, + "loss": 1.243192434310913, + "step": 1706 + }, + { + "epoch": 0.6217692027666545, + "grad_norm": 14.0, + "learning_rate": 9.240767887420804e-07, + "loss": 1.344420313835144, + "step": 1708 + }, + { + "epoch": 0.6224972697488169, + "grad_norm": 30.75, + "learning_rate": 9.238961765508029e-07, + "loss": 1.0155845880508423, + "step": 1710 + }, + { + "epoch": 0.6232253367309792, + "grad_norm": 14.3125, + "learning_rate": 9.237153723694491e-07, + "loss": 1.335033655166626, + "step": 1712 + }, + { + "epoch": 0.6239534037131416, + "grad_norm": 18.0, + "learning_rate": 9.235343763051911e-07, + "loss": 1.4777297973632812, + "step": 1714 + }, + { + "epoch": 0.624681470695304, + "grad_norm": 28.875, + "learning_rate": 9.233531884653147e-07, + "loss": 0.9719295501708984, + "step": 1716 + }, + { + "epoch": 0.6254095376774663, + "grad_norm": 8.6875, + "learning_rate": 9.231718089572194e-07, + "loss": 1.480360984802246, + "step": 1718 + }, + { + "epoch": 0.6261376046596286, + "grad_norm": 10.1875, + "learning_rate": 9.22990237888418e-07, + "loss": 1.625566005706787, + "step": 1720 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 15.0, + "learning_rate": 9.228084753665377e-07, + "loss": 1.6612961292266846, + "step": 1722 + }, + { + "epoch": 0.6275937386239534, + "grad_norm": 45.75, + "learning_rate": 9.226265214993179e-07, + "loss": 1.4117205142974854, + "step": 1724 + }, + { + "epoch": 0.6283218056061157, + "grad_norm": 10.375, + "learning_rate": 9.224443763946124e-07, + "loss": 0.996017336845398, + "step": 1726 + }, + { + "epoch": 0.6290498725882782, + "grad_norm": 8.0625, + "learning_rate": 9.22262040160388e-07, + "loss": 1.3521080017089844, + "step": 1728 + }, + { + "epoch": 0.6297779395704405, + "grad_norm": 12.125, + "learning_rate": 9.220795129047247e-07, + "loss": 1.5970375537872314, + "step": 1730 + }, + { + "epoch": 0.6305060065526028, + "grad_norm": 26.5, + "learning_rate": 9.218967947358165e-07, + "loss": 1.31834077835083, + "step": 1732 + }, + { + "epoch": 0.6312340735347652, + "grad_norm": 6.59375, + "learning_rate": 9.217138857619691e-07, + "loss": 1.1581727266311646, + "step": 1734 + }, + { + "epoch": 0.6319621405169276, + "grad_norm": 15.3125, + "learning_rate": 9.215307860916029e-07, + "loss": 1.6091313362121582, + "step": 1736 + }, + { + "epoch": 0.6326902074990899, + "grad_norm": 34.0, + "learning_rate": 9.213474958332499e-07, + "loss": 1.6980090141296387, + "step": 1738 + }, + { + "epoch": 0.6334182744812523, + "grad_norm": 12.1875, + "learning_rate": 9.211640150955562e-07, + "loss": 1.5215051174163818, + "step": 1740 + }, + { + "epoch": 0.6341463414634146, + "grad_norm": 7.90625, + "learning_rate": 9.209803439872803e-07, + "loss": 1.4400357007980347, + "step": 1742 + }, + { + "epoch": 0.634874408445577, + "grad_norm": 31.25, + "learning_rate": 9.207964826172933e-07, + "loss": 1.1345932483673096, + "step": 1744 + }, + { + "epoch": 0.6356024754277394, + "grad_norm": 13.625, + "learning_rate": 9.206124310945797e-07, + "loss": 1.1594189405441284, + "step": 1746 + }, + { + "epoch": 0.6363305424099017, + "grad_norm": 41.25, + "learning_rate": 9.204281895282364e-07, + "loss": 1.1367210149765015, + "step": 1748 + }, + { + "epoch": 0.637058609392064, + "grad_norm": 10.9375, + "learning_rate": 9.202437580274729e-07, + "loss": 1.472306489944458, + "step": 1750 + }, + { + "epoch": 0.6377866763742265, + "grad_norm": 9.1875, + "learning_rate": 9.20059136701611e-07, + "loss": 1.2343062162399292, + "step": 1752 + }, + { + "epoch": 0.6385147433563888, + "grad_norm": 13.125, + "learning_rate": 9.198743256600858e-07, + "loss": 1.453023910522461, + "step": 1754 + }, + { + "epoch": 0.6392428103385511, + "grad_norm": 6.28125, + "learning_rate": 9.196893250124443e-07, + "loss": 1.3155677318572998, + "step": 1756 + }, + { + "epoch": 0.6399708773207136, + "grad_norm": 12.0625, + "learning_rate": 9.195041348683457e-07, + "loss": 1.3846875429153442, + "step": 1758 + }, + { + "epoch": 0.6406989443028759, + "grad_norm": 29.625, + "learning_rate": 9.193187553375621e-07, + "loss": 1.4541205167770386, + "step": 1760 + }, + { + "epoch": 0.6414270112850382, + "grad_norm": 12.875, + "learning_rate": 9.191331865299774e-07, + "loss": 1.7376492023468018, + "step": 1762 + }, + { + "epoch": 0.6421550782672005, + "grad_norm": 11.1875, + "learning_rate": 9.189474285555879e-07, + "loss": 1.273536205291748, + "step": 1764 + }, + { + "epoch": 0.642883145249363, + "grad_norm": 10.4375, + "learning_rate": 9.18761481524502e-07, + "loss": 1.3672184944152832, + "step": 1766 + }, + { + "epoch": 0.6436112122315253, + "grad_norm": 16.875, + "learning_rate": 9.185753455469403e-07, + "loss": 1.2902827262878418, + "step": 1768 + }, + { + "epoch": 0.6443392792136876, + "grad_norm": 14.5625, + "learning_rate": 9.183890207332352e-07, + "loss": 1.4905905723571777, + "step": 1770 + }, + { + "epoch": 0.64506734619585, + "grad_norm": 13.625, + "learning_rate": 9.182025071938308e-07, + "loss": 1.500184416770935, + "step": 1772 + }, + { + "epoch": 0.6457954131780124, + "grad_norm": 13.5625, + "learning_rate": 9.180158050392837e-07, + "loss": 1.4009766578674316, + "step": 1774 + }, + { + "epoch": 0.6465234801601747, + "grad_norm": 13.625, + "learning_rate": 9.17828914380262e-07, + "loss": 1.6044259071350098, + "step": 1776 + }, + { + "epoch": 0.6472515471423371, + "grad_norm": 32.25, + "learning_rate": 9.17641835327545e-07, + "loss": 1.35463547706604, + "step": 1778 + }, + { + "epoch": 0.6479796141244994, + "grad_norm": 14.375, + "learning_rate": 9.174545679920247e-07, + "loss": 1.0684609413146973, + "step": 1780 + }, + { + "epoch": 0.6487076811066618, + "grad_norm": 6.0, + "learning_rate": 9.17267112484704e-07, + "loss": 0.9236390590667725, + "step": 1782 + }, + { + "epoch": 0.6494357480888242, + "grad_norm": 30.0, + "learning_rate": 9.170794689166974e-07, + "loss": 1.325068473815918, + "step": 1784 + }, + { + "epoch": 0.6501638150709865, + "grad_norm": 8.25, + "learning_rate": 9.16891637399231e-07, + "loss": 1.1971938610076904, + "step": 1786 + }, + { + "epoch": 0.6508918820531489, + "grad_norm": 11.375, + "learning_rate": 9.167036180436427e-07, + "loss": 1.2860158681869507, + "step": 1788 + }, + { + "epoch": 0.6516199490353113, + "grad_norm": 34.0, + "learning_rate": 9.165154109613806e-07, + "loss": 1.5646440982818604, + "step": 1790 + }, + { + "epoch": 0.6523480160174736, + "grad_norm": 83.5, + "learning_rate": 9.163270162640053e-07, + "loss": 1.5978153944015503, + "step": 1792 + }, + { + "epoch": 0.6530760829996359, + "grad_norm": 18.5, + "learning_rate": 9.161384340631881e-07, + "loss": 1.5223355293273926, + "step": 1794 + }, + { + "epoch": 0.6538041499817984, + "grad_norm": 9.5, + "learning_rate": 9.159496644707114e-07, + "loss": 1.6478946208953857, + "step": 1796 + }, + { + "epoch": 0.6545322169639607, + "grad_norm": 4.84375, + "learning_rate": 9.157607075984685e-07, + "loss": 1.2399314641952515, + "step": 1798 + }, + { + "epoch": 0.655260283946123, + "grad_norm": 5.71875, + "learning_rate": 9.155715635584644e-07, + "loss": 1.3088821172714233, + "step": 1800 + }, + { + "epoch": 0.6559883509282854, + "grad_norm": 12.125, + "learning_rate": 9.153822324628142e-07, + "loss": 1.4322490692138672, + "step": 1802 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 18.5, + "learning_rate": 9.151927144237443e-07, + "loss": 1.7116109132766724, + "step": 1804 + }, + { + "epoch": 0.6574444848926101, + "grad_norm": 6.34375, + "learning_rate": 9.150030095535921e-07, + "loss": 1.2413341999053955, + "step": 1806 + }, + { + "epoch": 0.6581725518747725, + "grad_norm": 100.0, + "learning_rate": 9.148131179648053e-07, + "loss": 1.1262481212615967, + "step": 1808 + }, + { + "epoch": 0.6589006188569348, + "grad_norm": 29.25, + "learning_rate": 9.146230397699428e-07, + "loss": 1.650876760482788, + "step": 1810 + }, + { + "epoch": 0.6596286858390972, + "grad_norm": 13.3125, + "learning_rate": 9.144327750816736e-07, + "loss": 0.7731285095214844, + "step": 1812 + }, + { + "epoch": 0.6603567528212596, + "grad_norm": 13.125, + "learning_rate": 9.142423240127775e-07, + "loss": 1.6047508716583252, + "step": 1814 + }, + { + "epoch": 0.6610848198034219, + "grad_norm": 27.125, + "learning_rate": 9.140516866761447e-07, + "loss": 1.6227905750274658, + "step": 1816 + }, + { + "epoch": 0.6618128867855843, + "grad_norm": 20.25, + "learning_rate": 9.138608631847757e-07, + "loss": 1.78618323802948, + "step": 1818 + }, + { + "epoch": 0.6625409537677467, + "grad_norm": 15.25, + "learning_rate": 9.136698536517817e-07, + "loss": 1.7346434593200684, + "step": 1820 + }, + { + "epoch": 0.663269020749909, + "grad_norm": 24.75, + "learning_rate": 9.134786581903842e-07, + "loss": 1.3621857166290283, + "step": 1822 + }, + { + "epoch": 0.6639970877320713, + "grad_norm": 26.375, + "learning_rate": 9.132872769139139e-07, + "loss": 1.1923943758010864, + "step": 1824 + }, + { + "epoch": 0.6647251547142337, + "grad_norm": 12.125, + "learning_rate": 9.130957099358132e-07, + "loss": 1.7635855674743652, + "step": 1826 + }, + { + "epoch": 0.6654532216963961, + "grad_norm": 10.6875, + "learning_rate": 9.129039573696336e-07, + "loss": 1.4531279802322388, + "step": 1828 + }, + { + "epoch": 0.6661812886785584, + "grad_norm": 8.6875, + "learning_rate": 9.127120193290366e-07, + "loss": 1.8058863878250122, + "step": 1830 + }, + { + "epoch": 0.6669093556607207, + "grad_norm": 11.0, + "learning_rate": 9.125198959277935e-07, + "loss": 1.0941154956817627, + "step": 1832 + }, + { + "epoch": 0.6676374226428832, + "grad_norm": 17.125, + "learning_rate": 9.123275872797869e-07, + "loss": 1.6039718389511108, + "step": 1834 + }, + { + "epoch": 0.6683654896250455, + "grad_norm": 13.5625, + "learning_rate": 9.121350934990071e-07, + "loss": 1.5419042110443115, + "step": 1836 + }, + { + "epoch": 0.6690935566072078, + "grad_norm": 5.9375, + "learning_rate": 9.119424146995555e-07, + "loss": 1.3580117225646973, + "step": 1838 + }, + { + "epoch": 0.6698216235893703, + "grad_norm": 12.1875, + "learning_rate": 9.11749550995643e-07, + "loss": 1.4810190200805664, + "step": 1840 + }, + { + "epoch": 0.6705496905715326, + "grad_norm": 17.625, + "learning_rate": 9.115565025015899e-07, + "loss": 1.6529779434204102, + "step": 1842 + }, + { + "epoch": 0.6712777575536949, + "grad_norm": 18.25, + "learning_rate": 9.113632693318256e-07, + "loss": 1.994378924369812, + "step": 1844 + }, + { + "epoch": 0.6720058245358573, + "grad_norm": 14.5625, + "learning_rate": 9.111698516008901e-07, + "loss": 1.3107621669769287, + "step": 1846 + }, + { + "epoch": 0.6727338915180197, + "grad_norm": 17.625, + "learning_rate": 9.109762494234316e-07, + "loss": 1.0461395978927612, + "step": 1848 + }, + { + "epoch": 0.673461958500182, + "grad_norm": 17.75, + "learning_rate": 9.107824629142082e-07, + "loss": 1.6170899868011475, + "step": 1850 + }, + { + "epoch": 0.6741900254823444, + "grad_norm": 21.625, + "learning_rate": 9.105884921880878e-07, + "loss": 0.8737703561782837, + "step": 1852 + }, + { + "epoch": 0.6749180924645067, + "grad_norm": 27.125, + "learning_rate": 9.10394337360046e-07, + "loss": 1.6610827445983887, + "step": 1854 + }, + { + "epoch": 0.6756461594466691, + "grad_norm": 12.5625, + "learning_rate": 9.101999985451693e-07, + "loss": 1.3163466453552246, + "step": 1856 + }, + { + "epoch": 0.6763742264288315, + "grad_norm": 24.375, + "learning_rate": 9.100054758586519e-07, + "loss": 1.011348009109497, + "step": 1858 + }, + { + "epoch": 0.6771022934109938, + "grad_norm": 11.1875, + "learning_rate": 9.098107694157977e-07, + "loss": 1.4332120418548584, + "step": 1860 + }, + { + "epoch": 0.6778303603931561, + "grad_norm": 7.96875, + "learning_rate": 9.096158793320192e-07, + "loss": 1.604081153869629, + "step": 1862 + }, + { + "epoch": 0.6785584273753186, + "grad_norm": 12.3125, + "learning_rate": 9.094208057228378e-07, + "loss": 1.457472801208496, + "step": 1864 + }, + { + "epoch": 0.6792864943574809, + "grad_norm": 8.875, + "learning_rate": 9.092255487038842e-07, + "loss": 1.5123369693756104, + "step": 1866 + }, + { + "epoch": 0.6800145613396432, + "grad_norm": 10.5, + "learning_rate": 9.090301083908969e-07, + "loss": 1.1716439723968506, + "step": 1868 + }, + { + "epoch": 0.6807426283218057, + "grad_norm": 17.5, + "learning_rate": 9.088344848997236e-07, + "loss": 1.001671552658081, + "step": 1870 + }, + { + "epoch": 0.681470695303968, + "grad_norm": 15.0, + "learning_rate": 9.086386783463204e-07, + "loss": 1.1232812404632568, + "step": 1872 + }, + { + "epoch": 0.6821987622861303, + "grad_norm": 14.9375, + "learning_rate": 9.084426888467525e-07, + "loss": 1.3047420978546143, + "step": 1874 + }, + { + "epoch": 0.6829268292682927, + "grad_norm": 9.5625, + "learning_rate": 9.082465165171924e-07, + "loss": 1.1976680755615234, + "step": 1876 + }, + { + "epoch": 0.6836548962504551, + "grad_norm": 11.6875, + "learning_rate": 9.080501614739222e-07, + "loss": 1.4000158309936523, + "step": 1878 + }, + { + "epoch": 0.6843829632326174, + "grad_norm": 14.75, + "learning_rate": 9.078536238333314e-07, + "loss": 1.3848509788513184, + "step": 1880 + }, + { + "epoch": 0.6851110302147798, + "grad_norm": 13.0, + "learning_rate": 9.076569037119181e-07, + "loss": 1.1940157413482666, + "step": 1882 + }, + { + "epoch": 0.6858390971969421, + "grad_norm": 9.5625, + "learning_rate": 9.074600012262885e-07, + "loss": 1.1878256797790527, + "step": 1884 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 15.5625, + "learning_rate": 9.072629164931571e-07, + "loss": 1.7873784303665161, + "step": 1886 + }, + { + "epoch": 0.6872952311612668, + "grad_norm": 8.375, + "learning_rate": 9.070656496293461e-07, + "loss": 1.1342498064041138, + "step": 1888 + }, + { + "epoch": 0.6880232981434292, + "grad_norm": 21.625, + "learning_rate": 9.068682007517858e-07, + "loss": 1.6528568267822266, + "step": 1890 + }, + { + "epoch": 0.6887513651255915, + "grad_norm": 12.9375, + "learning_rate": 9.066705699775144e-07, + "loss": 1.4960280656814575, + "step": 1892 + }, + { + "epoch": 0.6894794321077539, + "grad_norm": 23.625, + "learning_rate": 9.064727574236781e-07, + "loss": 1.5311808586120605, + "step": 1894 + }, + { + "epoch": 0.6902074990899163, + "grad_norm": 16.5, + "learning_rate": 9.062747632075303e-07, + "loss": 1.3724381923675537, + "step": 1896 + }, + { + "epoch": 0.6909355660720786, + "grad_norm": 56.75, + "learning_rate": 9.060765874464329e-07, + "loss": 1.3301405906677246, + "step": 1898 + }, + { + "epoch": 0.691663633054241, + "grad_norm": 15.1875, + "learning_rate": 9.058782302578547e-07, + "loss": 1.1856707334518433, + "step": 1900 + }, + { + "epoch": 0.6923917000364034, + "grad_norm": 9.3125, + "learning_rate": 9.056796917593721e-07, + "loss": 1.117443323135376, + "step": 1902 + }, + { + "epoch": 0.6931197670185657, + "grad_norm": 15.5, + "learning_rate": 9.054809720686696e-07, + "loss": 1.4983378648757935, + "step": 1904 + }, + { + "epoch": 0.693847834000728, + "grad_norm": 22.375, + "learning_rate": 9.052820713035385e-07, + "loss": 1.2235255241394043, + "step": 1906 + }, + { + "epoch": 0.6945759009828905, + "grad_norm": 6.03125, + "learning_rate": 9.050829895818775e-07, + "loss": 1.4153095483779907, + "step": 1908 + }, + { + "epoch": 0.6953039679650528, + "grad_norm": 12.6875, + "learning_rate": 9.048837270216927e-07, + "loss": 1.5429484844207764, + "step": 1910 + }, + { + "epoch": 0.6960320349472151, + "grad_norm": 64.0, + "learning_rate": 9.046842837410976e-07, + "loss": 1.5345087051391602, + "step": 1912 + }, + { + "epoch": 0.6967601019293775, + "grad_norm": 11.5625, + "learning_rate": 9.044846598583123e-07, + "loss": 1.3023669719696045, + "step": 1914 + }, + { + "epoch": 0.6974881689115399, + "grad_norm": 17.5, + "learning_rate": 9.042848554916645e-07, + "loss": 0.9731202721595764, + "step": 1916 + }, + { + "epoch": 0.6982162358937022, + "grad_norm": 11.875, + "learning_rate": 9.040848707595882e-07, + "loss": 1.2548606395721436, + "step": 1918 + }, + { + "epoch": 0.6989443028758646, + "grad_norm": 24.5, + "learning_rate": 9.038847057806252e-07, + "loss": 0.797179639339447, + "step": 1920 + }, + { + "epoch": 0.6996723698580269, + "grad_norm": 17.75, + "learning_rate": 9.036843606734234e-07, + "loss": 1.4619134664535522, + "step": 1922 + }, + { + "epoch": 0.7004004368401893, + "grad_norm": 10.0625, + "learning_rate": 9.034838355567378e-07, + "loss": 1.3842718601226807, + "step": 1924 + }, + { + "epoch": 0.7011285038223517, + "grad_norm": 10.0, + "learning_rate": 9.032831305494301e-07, + "loss": 1.1962783336639404, + "step": 1926 + }, + { + "epoch": 0.701856570804514, + "grad_norm": 20.5, + "learning_rate": 9.030822457704685e-07, + "loss": 1.5028080940246582, + "step": 1928 + }, + { + "epoch": 0.7025846377866763, + "grad_norm": 14.6875, + "learning_rate": 9.028811813389278e-07, + "loss": 1.6164817810058594, + "step": 1930 + }, + { + "epoch": 0.7033127047688388, + "grad_norm": 10.3125, + "learning_rate": 9.026799373739897e-07, + "loss": 1.3629875183105469, + "step": 1932 + }, + { + "epoch": 0.7040407717510011, + "grad_norm": 13.5625, + "learning_rate": 9.024785139949413e-07, + "loss": 1.5785894393920898, + "step": 1934 + }, + { + "epoch": 0.7047688387331634, + "grad_norm": 10.625, + "learning_rate": 9.022769113211772e-07, + "loss": 1.3331024646759033, + "step": 1936 + }, + { + "epoch": 0.7054969057153259, + "grad_norm": 13.0, + "learning_rate": 9.020751294721976e-07, + "loss": 1.5846672058105469, + "step": 1938 + }, + { + "epoch": 0.7062249726974882, + "grad_norm": 11.8125, + "learning_rate": 9.018731685676091e-07, + "loss": 1.4124033451080322, + "step": 1940 + }, + { + "epoch": 0.7069530396796505, + "grad_norm": 13.0625, + "learning_rate": 9.016710287271243e-07, + "loss": 1.3312616348266602, + "step": 1942 + }, + { + "epoch": 0.7076811066618129, + "grad_norm": 14.3125, + "learning_rate": 9.01468710070562e-07, + "loss": 1.4166638851165771, + "step": 1944 + }, + { + "epoch": 0.7084091736439753, + "grad_norm": 12.75, + "learning_rate": 9.012662127178472e-07, + "loss": 1.6767387390136719, + "step": 1946 + }, + { + "epoch": 0.7091372406261376, + "grad_norm": 19.875, + "learning_rate": 9.010635367890102e-07, + "loss": 1.1992321014404297, + "step": 1948 + }, + { + "epoch": 0.7098653076082999, + "grad_norm": 20.625, + "learning_rate": 9.00860682404188e-07, + "loss": 1.4489197731018066, + "step": 1950 + }, + { + "epoch": 0.7105933745904623, + "grad_norm": 21.5, + "learning_rate": 9.006576496836227e-07, + "loss": 0.9472750425338745, + "step": 1952 + }, + { + "epoch": 0.7113214415726247, + "grad_norm": 21.75, + "learning_rate": 9.004544387476622e-07, + "loss": 1.3581335544586182, + "step": 1954 + }, + { + "epoch": 0.712049508554787, + "grad_norm": 12.0625, + "learning_rate": 9.002510497167603e-07, + "loss": 1.4068057537078857, + "step": 1956 + }, + { + "epoch": 0.7127775755369494, + "grad_norm": 23.75, + "learning_rate": 9.000474827114764e-07, + "loss": 1.4395711421966553, + "step": 1958 + }, + { + "epoch": 0.7135056425191117, + "grad_norm": 11.75, + "learning_rate": 8.998437378524748e-07, + "loss": 1.173989176750183, + "step": 1960 + }, + { + "epoch": 0.7142337095012741, + "grad_norm": 27.75, + "learning_rate": 8.99639815260526e-07, + "loss": 1.2158374786376953, + "step": 1962 + }, + { + "epoch": 0.7149617764834365, + "grad_norm": 10.4375, + "learning_rate": 8.994357150565055e-07, + "loss": 1.3894617557525635, + "step": 1964 + }, + { + "epoch": 0.7156898434655988, + "grad_norm": 9.625, + "learning_rate": 8.992314373613938e-07, + "loss": 1.170532464981079, + "step": 1966 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 14.25, + "learning_rate": 8.990269822962772e-07, + "loss": 1.6030280590057373, + "step": 1968 + }, + { + "epoch": 0.7171459774299236, + "grad_norm": 5.375, + "learning_rate": 8.988223499823468e-07, + "loss": 1.161352515220642, + "step": 1970 + }, + { + "epoch": 0.7178740444120859, + "grad_norm": 10.375, + "learning_rate": 8.986175405408987e-07, + "loss": 0.8275282382965088, + "step": 1972 + }, + { + "epoch": 0.7186021113942482, + "grad_norm": 13.875, + "learning_rate": 8.984125540933343e-07, + "loss": 1.6736547946929932, + "step": 1974 + }, + { + "epoch": 0.7193301783764107, + "grad_norm": 15.0625, + "learning_rate": 8.982073907611591e-07, + "loss": 1.3614916801452637, + "step": 1976 + }, + { + "epoch": 0.720058245358573, + "grad_norm": 26.0, + "learning_rate": 8.980020506659849e-07, + "loss": 1.5886743068695068, + "step": 1978 + }, + { + "epoch": 0.7207863123407353, + "grad_norm": 4.84375, + "learning_rate": 8.977965339295266e-07, + "loss": 1.1597849130630493, + "step": 1980 + }, + { + "epoch": 0.7215143793228977, + "grad_norm": 9.3125, + "learning_rate": 8.975908406736053e-07, + "loss": 1.4031915664672852, + "step": 1982 + }, + { + "epoch": 0.7222424463050601, + "grad_norm": 24.25, + "learning_rate": 8.973849710201458e-07, + "loss": 1.4797863960266113, + "step": 1984 + }, + { + "epoch": 0.7229705132872224, + "grad_norm": 24.875, + "learning_rate": 8.971789250911777e-07, + "loss": 1.3726301193237305, + "step": 1986 + }, + { + "epoch": 0.7236985802693848, + "grad_norm": 9.6875, + "learning_rate": 8.969727030088352e-07, + "loss": 1.0655194520950317, + "step": 1988 + }, + { + "epoch": 0.7244266472515472, + "grad_norm": 13.5625, + "learning_rate": 8.967663048953567e-07, + "loss": 1.270705223083496, + "step": 1990 + }, + { + "epoch": 0.7251547142337095, + "grad_norm": 11.875, + "learning_rate": 8.965597308730852e-07, + "loss": 1.3082408905029297, + "step": 1992 + }, + { + "epoch": 0.7258827812158719, + "grad_norm": 11.9375, + "learning_rate": 8.963529810644679e-07, + "loss": 1.3961737155914307, + "step": 1994 + }, + { + "epoch": 0.7266108481980342, + "grad_norm": 11.625, + "learning_rate": 8.961460555920559e-07, + "loss": 1.725907564163208, + "step": 1996 + }, + { + "epoch": 0.7273389151801966, + "grad_norm": 28.75, + "learning_rate": 8.959389545785048e-07, + "loss": 1.382678508758545, + "step": 1998 + }, + { + "epoch": 0.728066982162359, + "grad_norm": 18.625, + "learning_rate": 8.957316781465743e-07, + "loss": 1.6142023801803589, + "step": 2000 + }, + { + "epoch": 0.7287950491445213, + "grad_norm": 33.5, + "learning_rate": 8.955242264191276e-07, + "loss": 1.7216551303863525, + "step": 2002 + }, + { + "epoch": 0.7295231161266836, + "grad_norm": 8.375, + "learning_rate": 8.953165995191323e-07, + "loss": 1.3730015754699707, + "step": 2004 + }, + { + "epoch": 0.7302511831088461, + "grad_norm": 12.125, + "learning_rate": 8.951087975696596e-07, + "loss": 1.354269027709961, + "step": 2006 + }, + { + "epoch": 0.7309792500910084, + "grad_norm": 19.75, + "learning_rate": 8.949008206938846e-07, + "loss": 1.7062320709228516, + "step": 2008 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 11.5625, + "learning_rate": 8.946926690150857e-07, + "loss": 0.8354568481445312, + "step": 2010 + }, + { + "epoch": 0.732435384055333, + "grad_norm": 23.0, + "learning_rate": 8.944843426566455e-07, + "loss": 1.7684767246246338, + "step": 2012 + }, + { + "epoch": 0.7331634510374955, + "grad_norm": 7.15625, + "learning_rate": 8.9427584174205e-07, + "loss": 1.16962730884552, + "step": 2014 + }, + { + "epoch": 0.7338915180196578, + "grad_norm": 5.0625, + "learning_rate": 8.940671663948878e-07, + "loss": 1.1426913738250732, + "step": 2016 + }, + { + "epoch": 0.7346195850018201, + "grad_norm": 15.9375, + "learning_rate": 8.938583167388522e-07, + "loss": 0.926419734954834, + "step": 2018 + }, + { + "epoch": 0.7353476519839826, + "grad_norm": 19.125, + "learning_rate": 8.936492928977393e-07, + "loss": 1.5157438516616821, + "step": 2020 + }, + { + "epoch": 0.7360757189661449, + "grad_norm": 8.875, + "learning_rate": 8.934400949954477e-07, + "loss": 1.4122189283370972, + "step": 2022 + }, + { + "epoch": 0.7368037859483072, + "grad_norm": 16.25, + "learning_rate": 8.932307231559808e-07, + "loss": 1.5148566961288452, + "step": 2024 + }, + { + "epoch": 0.7375318529304696, + "grad_norm": 7.8125, + "learning_rate": 8.930211775034435e-07, + "loss": 1.348271131515503, + "step": 2026 + }, + { + "epoch": 0.738259919912632, + "grad_norm": 20.75, + "learning_rate": 8.928114581620445e-07, + "loss": 1.7178332805633545, + "step": 2028 + }, + { + "epoch": 0.7389879868947943, + "grad_norm": 10.6875, + "learning_rate": 8.926015652560954e-07, + "loss": 1.0974351167678833, + "step": 2030 + }, + { + "epoch": 0.7397160538769567, + "grad_norm": 14.25, + "learning_rate": 8.923914989100108e-07, + "loss": 1.7823076248168945, + "step": 2032 + }, + { + "epoch": 0.740444120859119, + "grad_norm": 16.0, + "learning_rate": 8.921812592483078e-07, + "loss": 1.5265107154846191, + "step": 2034 + }, + { + "epoch": 0.7411721878412814, + "grad_norm": 9.5625, + "learning_rate": 8.919708463956059e-07, + "loss": 1.3083072900772095, + "step": 2036 + }, + { + "epoch": 0.7419002548234438, + "grad_norm": 12.1875, + "learning_rate": 8.917602604766285e-07, + "loss": 1.4533865451812744, + "step": 2038 + }, + { + "epoch": 0.7426283218056061, + "grad_norm": 13.0, + "learning_rate": 8.915495016162004e-07, + "loss": 1.5961275100708008, + "step": 2040 + }, + { + "epoch": 0.7433563887877684, + "grad_norm": 35.75, + "learning_rate": 8.913385699392493e-07, + "loss": 0.984943151473999, + "step": 2042 + }, + { + "epoch": 0.7440844557699309, + "grad_norm": 14.5625, + "learning_rate": 8.911274655708055e-07, + "loss": 1.6076760292053223, + "step": 2044 + }, + { + "epoch": 0.7448125227520932, + "grad_norm": 32.5, + "learning_rate": 8.909161886360012e-07, + "loss": 1.4232481718063354, + "step": 2046 + }, + { + "epoch": 0.7455405897342555, + "grad_norm": 16.0, + "learning_rate": 8.907047392600718e-07, + "loss": 1.6762185096740723, + "step": 2048 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 11.375, + "learning_rate": 8.904931175683536e-07, + "loss": 1.5308107137680054, + "step": 2050 + }, + { + "epoch": 0.7469967236985803, + "grad_norm": 12.75, + "learning_rate": 8.902813236862863e-07, + "loss": 1.4931962490081787, + "step": 2052 + }, + { + "epoch": 0.7477247906807426, + "grad_norm": 8.625, + "learning_rate": 8.900693577394107e-07, + "loss": 1.4062120914459229, + "step": 2054 + }, + { + "epoch": 0.748452857662905, + "grad_norm": 19.125, + "learning_rate": 8.898572198533705e-07, + "loss": 1.0250781774520874, + "step": 2056 + }, + { + "epoch": 0.7491809246450674, + "grad_norm": 11.3125, + "learning_rate": 8.896449101539105e-07, + "loss": 1.1214791536331177, + "step": 2058 + }, + { + "epoch": 0.7499089916272297, + "grad_norm": 12.9375, + "learning_rate": 8.894324287668779e-07, + "loss": 1.545520305633545, + "step": 2060 + }, + { + "epoch": 0.7506370586093921, + "grad_norm": 13.875, + "learning_rate": 8.892197758182212e-07, + "loss": 1.336254358291626, + "step": 2062 + }, + { + "epoch": 0.7513651255915544, + "grad_norm": 9.625, + "learning_rate": 8.890069514339911e-07, + "loss": 1.7803747653961182, + "step": 2064 + }, + { + "epoch": 0.7520931925737168, + "grad_norm": 11.0, + "learning_rate": 8.887939557403395e-07, + "loss": 1.0765395164489746, + "step": 2066 + }, + { + "epoch": 0.7528212595558791, + "grad_norm": 23.125, + "learning_rate": 8.885807888635197e-07, + "loss": 1.2467716932296753, + "step": 2068 + }, + { + "epoch": 0.7535493265380415, + "grad_norm": 3.46875, + "learning_rate": 8.883674509298875e-07, + "loss": 1.3080624341964722, + "step": 2070 + }, + { + "epoch": 0.7542773935202038, + "grad_norm": 17.875, + "learning_rate": 8.881539420658988e-07, + "loss": 1.6158418655395508, + "step": 2072 + }, + { + "epoch": 0.7550054605023662, + "grad_norm": 13.5625, + "learning_rate": 8.879402623981116e-07, + "loss": 1.4543311595916748, + "step": 2074 + }, + { + "epoch": 0.7557335274845286, + "grad_norm": 15.3125, + "learning_rate": 8.877264120531849e-07, + "loss": 1.0276598930358887, + "step": 2076 + }, + { + "epoch": 0.7564615944666909, + "grad_norm": 9.3125, + "learning_rate": 8.875123911578788e-07, + "loss": 1.5012457370758057, + "step": 2078 + }, + { + "epoch": 0.7571896614488532, + "grad_norm": 12.5, + "learning_rate": 8.872981998390547e-07, + "loss": 1.3950307369232178, + "step": 2080 + }, + { + "epoch": 0.7579177284310157, + "grad_norm": 13.375, + "learning_rate": 8.870838382236749e-07, + "loss": 1.6168758869171143, + "step": 2082 + }, + { + "epoch": 0.758645795413178, + "grad_norm": 20.625, + "learning_rate": 8.868693064388026e-07, + "loss": 1.438460350036621, + "step": 2084 + }, + { + "epoch": 0.7593738623953403, + "grad_norm": 14.4375, + "learning_rate": 8.866546046116021e-07, + "loss": 1.2971091270446777, + "step": 2086 + }, + { + "epoch": 0.7601019293775028, + "grad_norm": 106.0, + "learning_rate": 8.86439732869338e-07, + "loss": 1.5227291584014893, + "step": 2088 + }, + { + "epoch": 0.7608299963596651, + "grad_norm": 15.9375, + "learning_rate": 8.862246913393765e-07, + "loss": 1.241652250289917, + "step": 2090 + }, + { + "epoch": 0.7615580633418274, + "grad_norm": 26.625, + "learning_rate": 8.860094801491831e-07, + "loss": 1.761426329612732, + "step": 2092 + }, + { + "epoch": 0.7622861303239898, + "grad_norm": 7.4375, + "learning_rate": 8.857940994263253e-07, + "loss": 1.3624589443206787, + "step": 2094 + }, + { + "epoch": 0.7630141973061522, + "grad_norm": 6.90625, + "learning_rate": 8.8557854929847e-07, + "loss": 1.2184607982635498, + "step": 2096 + }, + { + "epoch": 0.7637422642883145, + "grad_norm": 9.3125, + "learning_rate": 8.853628298933853e-07, + "loss": 1.28450608253479, + "step": 2098 + }, + { + "epoch": 0.7644703312704769, + "grad_norm": 22.0, + "learning_rate": 8.851469413389388e-07, + "loss": 1.6692204475402832, + "step": 2100 + }, + { + "epoch": 0.7651983982526392, + "grad_norm": 22.75, + "learning_rate": 8.849308837630995e-07, + "loss": 1.6794381141662598, + "step": 2102 + }, + { + "epoch": 0.7659264652348016, + "grad_norm": 15.4375, + "learning_rate": 8.847146572939355e-07, + "loss": 1.5040552616119385, + "step": 2104 + }, + { + "epoch": 0.766654532216964, + "grad_norm": 13.0625, + "learning_rate": 8.844982620596156e-07, + "loss": 1.2914185523986816, + "step": 2106 + }, + { + "epoch": 0.7673825991991263, + "grad_norm": 18.25, + "learning_rate": 8.842816981884082e-07, + "loss": 1.1447745561599731, + "step": 2108 + }, + { + "epoch": 0.7681106661812886, + "grad_norm": 13.25, + "learning_rate": 8.840649658086825e-07, + "loss": 1.2059820890426636, + "step": 2110 + }, + { + "epoch": 0.7688387331634511, + "grad_norm": 25.625, + "learning_rate": 8.838480650489066e-07, + "loss": 1.8234148025512695, + "step": 2112 + }, + { + "epoch": 0.7695668001456134, + "grad_norm": 47.75, + "learning_rate": 8.836309960376489e-07, + "loss": 1.4719223976135254, + "step": 2114 + }, + { + "epoch": 0.7702948671277757, + "grad_norm": 14.8125, + "learning_rate": 8.834137589035777e-07, + "loss": 1.2399178743362427, + "step": 2116 + }, + { + "epoch": 0.7710229341099382, + "grad_norm": 21.625, + "learning_rate": 8.831963537754603e-07, + "loss": 1.1088004112243652, + "step": 2118 + }, + { + "epoch": 0.7717510010921005, + "grad_norm": 14.5, + "learning_rate": 8.829787807821645e-07, + "loss": 1.5853427648544312, + "step": 2120 + }, + { + "epoch": 0.7724790680742628, + "grad_norm": 13.6875, + "learning_rate": 8.827610400526568e-07, + "loss": 1.2733540534973145, + "step": 2122 + }, + { + "epoch": 0.7732071350564252, + "grad_norm": 13.625, + "learning_rate": 8.825431317160035e-07, + "loss": 1.3817839622497559, + "step": 2124 + }, + { + "epoch": 0.7739352020385876, + "grad_norm": 13.0625, + "learning_rate": 8.823250559013702e-07, + "loss": 1.6347246170043945, + "step": 2126 + }, + { + "epoch": 0.7746632690207499, + "grad_norm": 15.375, + "learning_rate": 8.821068127380217e-07, + "loss": 1.6616439819335938, + "step": 2128 + }, + { + "epoch": 0.7753913360029122, + "grad_norm": 14.5625, + "learning_rate": 8.818884023553221e-07, + "loss": 1.3004727363586426, + "step": 2130 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 13.75, + "learning_rate": 8.816698248827348e-07, + "loss": 1.7452231645584106, + "step": 2132 + }, + { + "epoch": 0.776847469967237, + "grad_norm": 8.75, + "learning_rate": 8.814510804498214e-07, + "loss": 1.2688758373260498, + "step": 2134 + }, + { + "epoch": 0.7775755369493993, + "grad_norm": 9.3125, + "learning_rate": 8.812321691862436e-07, + "loss": 1.438370704650879, + "step": 2136 + }, + { + "epoch": 0.7783036039315617, + "grad_norm": 5.4375, + "learning_rate": 8.810130912217614e-07, + "loss": 1.0979642868041992, + "step": 2138 + }, + { + "epoch": 0.779031670913724, + "grad_norm": 11.5, + "learning_rate": 8.807938466862335e-07, + "loss": 1.3366403579711914, + "step": 2140 + }, + { + "epoch": 0.7797597378958864, + "grad_norm": 41.5, + "learning_rate": 8.805744357096176e-07, + "loss": 1.4364418983459473, + "step": 2142 + }, + { + "epoch": 0.7804878048780488, + "grad_norm": 11.625, + "learning_rate": 8.803548584219702e-07, + "loss": 1.1907100677490234, + "step": 2144 + }, + { + "epoch": 0.7812158718602111, + "grad_norm": 13.125, + "learning_rate": 8.801351149534458e-07, + "loss": 1.2334513664245605, + "step": 2146 + }, + { + "epoch": 0.7819439388423735, + "grad_norm": 8.625, + "learning_rate": 8.79915205434298e-07, + "loss": 1.5943961143493652, + "step": 2148 + }, + { + "epoch": 0.7826720058245359, + "grad_norm": 208.0, + "learning_rate": 8.796951299948785e-07, + "loss": 1.452537178993225, + "step": 2150 + }, + { + "epoch": 0.7834000728066982, + "grad_norm": 21.375, + "learning_rate": 8.794748887656372e-07, + "loss": 1.4900299310684204, + "step": 2152 + }, + { + "epoch": 0.7841281397888605, + "grad_norm": 9.375, + "learning_rate": 8.79254481877123e-07, + "loss": 1.3501627445220947, + "step": 2154 + }, + { + "epoch": 0.784856206771023, + "grad_norm": 9.375, + "learning_rate": 8.790339094599821e-07, + "loss": 1.2924821376800537, + "step": 2156 + }, + { + "epoch": 0.7855842737531853, + "grad_norm": 2.734375, + "learning_rate": 8.788131716449593e-07, + "loss": 1.234983205795288, + "step": 2158 + }, + { + "epoch": 0.7863123407353476, + "grad_norm": 10.3125, + "learning_rate": 8.785922685628977e-07, + "loss": 1.4036426544189453, + "step": 2160 + }, + { + "epoch": 0.78704040771751, + "grad_norm": 14.1875, + "learning_rate": 8.783712003447374e-07, + "loss": 1.7244772911071777, + "step": 2162 + }, + { + "epoch": 0.7877684746996724, + "grad_norm": 5.40625, + "learning_rate": 8.781499671215178e-07, + "loss": 1.0859498977661133, + "step": 2164 + }, + { + "epoch": 0.7884965416818347, + "grad_norm": 32.25, + "learning_rate": 8.779285690243747e-07, + "loss": 1.0254180431365967, + "step": 2166 + }, + { + "epoch": 0.7892246086639971, + "grad_norm": 10.0625, + "learning_rate": 8.777070061845424e-07, + "loss": 1.1202802658081055, + "step": 2168 + }, + { + "epoch": 0.7899526756461595, + "grad_norm": 7.75, + "learning_rate": 8.774852787333527e-07, + "loss": 1.2634648084640503, + "step": 2170 + }, + { + "epoch": 0.7906807426283218, + "grad_norm": 36.25, + "learning_rate": 8.772633868022348e-07, + "loss": 1.419220209121704, + "step": 2172 + }, + { + "epoch": 0.7914088096104842, + "grad_norm": 16.5, + "learning_rate": 8.77041330522716e-07, + "loss": 1.366602897644043, + "step": 2174 + }, + { + "epoch": 0.7921368765926465, + "grad_norm": 12.25, + "learning_rate": 8.768191100264202e-07, + "loss": 1.2290388345718384, + "step": 2176 + }, + { + "epoch": 0.7928649435748089, + "grad_norm": 6.25, + "learning_rate": 8.765967254450692e-07, + "loss": 1.4164923429489136, + "step": 2178 + }, + { + "epoch": 0.7935930105569713, + "grad_norm": 10.875, + "learning_rate": 8.763741769104818e-07, + "loss": 1.528586983680725, + "step": 2180 + }, + { + "epoch": 0.7943210775391336, + "grad_norm": 12.6875, + "learning_rate": 8.761514645545743e-07, + "loss": 1.3049019575119019, + "step": 2182 + }, + { + "epoch": 0.7950491445212959, + "grad_norm": 12.4375, + "learning_rate": 8.759285885093594e-07, + "loss": 1.412095308303833, + "step": 2184 + }, + { + "epoch": 0.7957772115034584, + "grad_norm": 10.4375, + "learning_rate": 8.757055489069478e-07, + "loss": 1.3520796298980713, + "step": 2186 + }, + { + "epoch": 0.7965052784856207, + "grad_norm": 15.875, + "learning_rate": 8.754823458795463e-07, + "loss": 1.4058706760406494, + "step": 2188 + }, + { + "epoch": 0.797233345467783, + "grad_norm": 11.0625, + "learning_rate": 8.752589795594592e-07, + "loss": 1.4258029460906982, + "step": 2190 + }, + { + "epoch": 0.7979614124499453, + "grad_norm": 10.75, + "learning_rate": 8.750354500790871e-07, + "loss": 1.3938333988189697, + "step": 2192 + }, + { + "epoch": 0.7986894794321078, + "grad_norm": 11.9375, + "learning_rate": 8.748117575709275e-07, + "loss": 1.4986026287078857, + "step": 2194 + }, + { + "epoch": 0.7994175464142701, + "grad_norm": 12.0625, + "learning_rate": 8.745879021675746e-07, + "loss": 0.9052993655204773, + "step": 2196 + }, + { + "epoch": 0.8001456133964324, + "grad_norm": 14.4375, + "learning_rate": 8.743638840017194e-07, + "loss": 1.3740031719207764, + "step": 2198 + }, + { + "epoch": 0.8008736803785949, + "grad_norm": 8.5, + "learning_rate": 8.741397032061486e-07, + "loss": 1.2580406665802002, + "step": 2200 + }, + { + "epoch": 0.8016017473607572, + "grad_norm": 19.5, + "learning_rate": 8.739153599137464e-07, + "loss": 1.4671556949615479, + "step": 2202 + }, + { + "epoch": 0.8023298143429195, + "grad_norm": 62.0, + "learning_rate": 8.736908542574919e-07, + "loss": 1.0512604713439941, + "step": 2204 + }, + { + "epoch": 0.8030578813250819, + "grad_norm": 10.4375, + "learning_rate": 8.734661863704622e-07, + "loss": 1.3953458070755005, + "step": 2206 + }, + { + "epoch": 0.8037859483072443, + "grad_norm": 58.25, + "learning_rate": 8.73241356385829e-07, + "loss": 0.9411048293113708, + "step": 2208 + }, + { + "epoch": 0.8045140152894066, + "grad_norm": 5.21875, + "learning_rate": 8.730163644368606e-07, + "loss": 1.1762077808380127, + "step": 2210 + }, + { + "epoch": 0.805242082271569, + "grad_norm": 11.6875, + "learning_rate": 8.727912106569219e-07, + "loss": 1.303157091140747, + "step": 2212 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 5.03125, + "learning_rate": 8.72565895179473e-07, + "loss": 1.275782585144043, + "step": 2214 + }, + { + "epoch": 0.8066982162358937, + "grad_norm": 19.625, + "learning_rate": 8.7234041813807e-07, + "loss": 1.5566943883895874, + "step": 2216 + }, + { + "epoch": 0.8074262832180561, + "grad_norm": 16.375, + "learning_rate": 8.721147796663649e-07, + "loss": 1.303802251815796, + "step": 2218 + }, + { + "epoch": 0.8081543502002184, + "grad_norm": 5.25, + "learning_rate": 8.718889798981052e-07, + "loss": 1.1706900596618652, + "step": 2220 + }, + { + "epoch": 0.8088824171823807, + "grad_norm": 20.75, + "learning_rate": 8.716630189671343e-07, + "loss": 1.6404458284378052, + "step": 2222 + }, + { + "epoch": 0.8096104841645432, + "grad_norm": 10.1875, + "learning_rate": 8.714368970073914e-07, + "loss": 1.4171723127365112, + "step": 2224 + }, + { + "epoch": 0.8103385511467055, + "grad_norm": 18.125, + "learning_rate": 8.712106141529099e-07, + "loss": 0.9651477932929993, + "step": 2226 + }, + { + "epoch": 0.8110666181288678, + "grad_norm": 3.796875, + "learning_rate": 8.709841705378198e-07, + "loss": 0.8411148190498352, + "step": 2228 + }, + { + "epoch": 0.8117946851110303, + "grad_norm": 43.5, + "learning_rate": 8.70757566296346e-07, + "loss": 1.620361566543579, + "step": 2230 + }, + { + "epoch": 0.8125227520931926, + "grad_norm": 17.0, + "learning_rate": 8.705308015628086e-07, + "loss": 2.0064477920532227, + "step": 2232 + }, + { + "epoch": 0.8132508190753549, + "grad_norm": 7.46875, + "learning_rate": 8.703038764716228e-07, + "loss": 1.4872006177902222, + "step": 2234 + }, + { + "epoch": 0.8139788860575173, + "grad_norm": 11.3125, + "learning_rate": 8.700767911572988e-07, + "loss": 1.4738264083862305, + "step": 2236 + }, + { + "epoch": 0.8147069530396797, + "grad_norm": 8.625, + "learning_rate": 8.698495457544419e-07, + "loss": 1.3313895463943481, + "step": 2238 + }, + { + "epoch": 0.815435020021842, + "grad_norm": 11.0, + "learning_rate": 8.696221403977524e-07, + "loss": 1.6007938385009766, + "step": 2240 + }, + { + "epoch": 0.8161630870040044, + "grad_norm": 9.9375, + "learning_rate": 8.693945752220251e-07, + "loss": 1.5220977067947388, + "step": 2242 + }, + { + "epoch": 0.8168911539861667, + "grad_norm": 15.3125, + "learning_rate": 8.691668503621497e-07, + "loss": 1.271368145942688, + "step": 2244 + }, + { + "epoch": 0.8176192209683291, + "grad_norm": 17.875, + "learning_rate": 8.689389659531103e-07, + "loss": 1.0644943714141846, + "step": 2246 + }, + { + "epoch": 0.8183472879504915, + "grad_norm": 13.6875, + "learning_rate": 8.687109221299864e-07, + "loss": 1.5870472192764282, + "step": 2248 + }, + { + "epoch": 0.8190753549326538, + "grad_norm": 23.875, + "learning_rate": 8.684827190279509e-07, + "loss": 1.6687254905700684, + "step": 2250 + }, + { + "epoch": 0.8198034219148161, + "grad_norm": 9.9375, + "learning_rate": 8.68254356782272e-07, + "loss": 1.210860013961792, + "step": 2252 + }, + { + "epoch": 0.8205314888969785, + "grad_norm": 12.1875, + "learning_rate": 8.680258355283116e-07, + "loss": 1.5567452907562256, + "step": 2254 + }, + { + "epoch": 0.8212595558791409, + "grad_norm": 13.125, + "learning_rate": 8.677971554015262e-07, + "loss": 1.0140738487243652, + "step": 2256 + }, + { + "epoch": 0.8219876228613032, + "grad_norm": 11.25, + "learning_rate": 8.675683165374663e-07, + "loss": 1.3238792419433594, + "step": 2258 + }, + { + "epoch": 0.8227156898434655, + "grad_norm": 51.0, + "learning_rate": 8.673393190717766e-07, + "loss": 1.35414457321167, + "step": 2260 + }, + { + "epoch": 0.823443756825628, + "grad_norm": 16.875, + "learning_rate": 8.671101631401958e-07, + "loss": 0.8817785978317261, + "step": 2262 + }, + { + "epoch": 0.8241718238077903, + "grad_norm": 10.8125, + "learning_rate": 8.668808488785567e-07, + "loss": 1.4515552520751953, + "step": 2264 + }, + { + "epoch": 0.8248998907899526, + "grad_norm": 6.34375, + "learning_rate": 8.666513764227854e-07, + "loss": 1.3207008838653564, + "step": 2266 + }, + { + "epoch": 0.8256279577721151, + "grad_norm": 9.75, + "learning_rate": 8.664217459089025e-07, + "loss": 1.2439181804656982, + "step": 2268 + }, + { + "epoch": 0.8263560247542774, + "grad_norm": 12.1875, + "learning_rate": 8.661919574730216e-07, + "loss": 1.4488630294799805, + "step": 2270 + }, + { + "epoch": 0.8270840917364397, + "grad_norm": 14.0, + "learning_rate": 8.659620112513506e-07, + "loss": 1.4173847436904907, + "step": 2272 + }, + { + "epoch": 0.8278121587186021, + "grad_norm": 13.875, + "learning_rate": 8.657319073801903e-07, + "loss": 1.2540509700775146, + "step": 2274 + }, + { + "epoch": 0.8285402257007645, + "grad_norm": 13.625, + "learning_rate": 8.65501645995935e-07, + "loss": 1.0589783191680908, + "step": 2276 + }, + { + "epoch": 0.8292682926829268, + "grad_norm": 31.375, + "learning_rate": 8.65271227235073e-07, + "loss": 1.6106246709823608, + "step": 2278 + }, + { + "epoch": 0.8299963596650892, + "grad_norm": 11.9375, + "learning_rate": 8.650406512341855e-07, + "loss": 1.4478310346603394, + "step": 2280 + }, + { + "epoch": 0.8307244266472515, + "grad_norm": 6.90625, + "learning_rate": 8.648099181299464e-07, + "loss": 1.2178795337677002, + "step": 2282 + }, + { + "epoch": 0.8314524936294139, + "grad_norm": 13.875, + "learning_rate": 8.645790280591236e-07, + "loss": 1.5422062873840332, + "step": 2284 + }, + { + "epoch": 0.8321805606115763, + "grad_norm": 10.8125, + "learning_rate": 8.643479811585774e-07, + "loss": 1.3758317232131958, + "step": 2286 + }, + { + "epoch": 0.8329086275937386, + "grad_norm": 2.8125, + "learning_rate": 8.641167775652614e-07, + "loss": 1.3345708847045898, + "step": 2288 + }, + { + "epoch": 0.833636694575901, + "grad_norm": 16.125, + "learning_rate": 8.63885417416222e-07, + "loss": 1.6120717525482178, + "step": 2290 + }, + { + "epoch": 0.8343647615580634, + "grad_norm": 15.625, + "learning_rate": 8.636539008485983e-07, + "loss": 1.502807855606079, + "step": 2292 + }, + { + "epoch": 0.8350928285402257, + "grad_norm": 23.5, + "learning_rate": 8.634222279996221e-07, + "loss": 1.5890371799468994, + "step": 2294 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 21.625, + "learning_rate": 8.63190399006618e-07, + "loss": 1.4459121227264404, + "step": 2296 + }, + { + "epoch": 0.8365489625045505, + "grad_norm": 13.375, + "learning_rate": 8.629584140070032e-07, + "loss": 1.0251116752624512, + "step": 2298 + }, + { + "epoch": 0.8372770294867128, + "grad_norm": 13.9375, + "learning_rate": 8.627262731382872e-07, + "loss": 1.6844968795776367, + "step": 2300 + }, + { + "epoch": 0.8380050964688751, + "grad_norm": 9.375, + "learning_rate": 8.624939765380716e-07, + "loss": 1.8155863285064697, + "step": 2302 + }, + { + "epoch": 0.8387331634510375, + "grad_norm": 16.125, + "learning_rate": 8.622615243440512e-07, + "loss": 1.3565492630004883, + "step": 2304 + }, + { + "epoch": 0.8394612304331999, + "grad_norm": 17.625, + "learning_rate": 8.620289166940122e-07, + "loss": 1.6126853227615356, + "step": 2306 + }, + { + "epoch": 0.8401892974153622, + "grad_norm": 3.921875, + "learning_rate": 8.617961537258331e-07, + "loss": 1.0827155113220215, + "step": 2308 + }, + { + "epoch": 0.8409173643975246, + "grad_norm": 16.125, + "learning_rate": 8.615632355774849e-07, + "loss": 1.4426319599151611, + "step": 2310 + }, + { + "epoch": 0.841645431379687, + "grad_norm": 4.90625, + "learning_rate": 8.613301623870302e-07, + "loss": 1.447786808013916, + "step": 2312 + }, + { + "epoch": 0.8423734983618493, + "grad_norm": 17.5, + "learning_rate": 8.610969342926233e-07, + "loss": 1.6906659603118896, + "step": 2314 + }, + { + "epoch": 0.8431015653440116, + "grad_norm": 29.625, + "learning_rate": 8.608635514325109e-07, + "loss": 1.900593638420105, + "step": 2316 + }, + { + "epoch": 0.843829632326174, + "grad_norm": 11.0, + "learning_rate": 8.60630013945031e-07, + "loss": 1.086639642715454, + "step": 2318 + }, + { + "epoch": 0.8445576993083364, + "grad_norm": 10.625, + "learning_rate": 8.603963219686135e-07, + "loss": 1.3610684871673584, + "step": 2320 + }, + { + "epoch": 0.8452857662904987, + "grad_norm": 13.375, + "learning_rate": 8.601624756417796e-07, + "loss": 1.3948695659637451, + "step": 2322 + }, + { + "epoch": 0.8460138332726611, + "grad_norm": 16.5, + "learning_rate": 8.599284751031421e-07, + "loss": 1.609932541847229, + "step": 2324 + }, + { + "epoch": 0.8467419002548234, + "grad_norm": 14.125, + "learning_rate": 8.596943204914058e-07, + "loss": 1.040374755859375, + "step": 2326 + }, + { + "epoch": 0.8474699672369858, + "grad_norm": 10.1875, + "learning_rate": 8.594600119453656e-07, + "loss": 1.318025827407837, + "step": 2328 + }, + { + "epoch": 0.8481980342191482, + "grad_norm": 22.625, + "learning_rate": 8.592255496039087e-07, + "loss": 1.387387990951538, + "step": 2330 + }, + { + "epoch": 0.8489261012013105, + "grad_norm": 4.8125, + "learning_rate": 8.589909336060131e-07, + "loss": 1.3046350479125977, + "step": 2332 + }, + { + "epoch": 0.8496541681834728, + "grad_norm": 21.0, + "learning_rate": 8.587561640907477e-07, + "loss": 1.324312686920166, + "step": 2334 + }, + { + "epoch": 0.8503822351656353, + "grad_norm": 14.9375, + "learning_rate": 8.585212411972727e-07, + "loss": 1.6144561767578125, + "step": 2336 + }, + { + "epoch": 0.8511103021477976, + "grad_norm": 23.375, + "learning_rate": 8.582861650648392e-07, + "loss": 1.6291710138320923, + "step": 2338 + }, + { + "epoch": 0.8518383691299599, + "grad_norm": 10.875, + "learning_rate": 8.58050935832789e-07, + "loss": 1.3175082206726074, + "step": 2340 + }, + { + "epoch": 0.8525664361121224, + "grad_norm": 8.125, + "learning_rate": 8.578155536405542e-07, + "loss": 1.4208762645721436, + "step": 2342 + }, + { + "epoch": 0.8532945030942847, + "grad_norm": 7.59375, + "learning_rate": 8.575800186276587e-07, + "loss": 1.0850070714950562, + "step": 2344 + }, + { + "epoch": 0.854022570076447, + "grad_norm": 9.125, + "learning_rate": 8.573443309337161e-07, + "loss": 1.6062126159667969, + "step": 2346 + }, + { + "epoch": 0.8547506370586094, + "grad_norm": 12.125, + "learning_rate": 8.571084906984308e-07, + "loss": 1.8593559265136719, + "step": 2348 + }, + { + "epoch": 0.8554787040407718, + "grad_norm": 15.0625, + "learning_rate": 8.568724980615972e-07, + "loss": 1.412750005722046, + "step": 2350 + }, + { + "epoch": 0.8562067710229341, + "grad_norm": 11.25, + "learning_rate": 8.566363531631007e-07, + "loss": 1.4419487714767456, + "step": 2352 + }, + { + "epoch": 0.8569348380050965, + "grad_norm": 14.3125, + "learning_rate": 8.564000561429168e-07, + "loss": 1.2945282459259033, + "step": 2354 + }, + { + "epoch": 0.8576629049872588, + "grad_norm": 25.5, + "learning_rate": 8.561636071411105e-07, + "loss": 1.125868320465088, + "step": 2356 + }, + { + "epoch": 0.8583909719694212, + "grad_norm": 10.875, + "learning_rate": 8.559270062978377e-07, + "loss": 1.0711567401885986, + "step": 2358 + }, + { + "epoch": 0.8591190389515836, + "grad_norm": 39.75, + "learning_rate": 8.556902537533439e-07, + "loss": 1.6194987297058105, + "step": 2360 + }, + { + "epoch": 0.8598471059337459, + "grad_norm": 17.5, + "learning_rate": 8.554533496479648e-07, + "loss": 1.6500740051269531, + "step": 2362 + }, + { + "epoch": 0.8605751729159082, + "grad_norm": 15.25, + "learning_rate": 8.55216294122126e-07, + "loss": 1.4962422847747803, + "step": 2364 + }, + { + "epoch": 0.8613032398980707, + "grad_norm": 10.875, + "learning_rate": 8.549790873163416e-07, + "loss": 1.3390296697616577, + "step": 2366 + }, + { + "epoch": 0.862031306880233, + "grad_norm": 14.6875, + "learning_rate": 8.547417293712176e-07, + "loss": 1.5214158296585083, + "step": 2368 + }, + { + "epoch": 0.8627593738623953, + "grad_norm": 12.375, + "learning_rate": 8.545042204274475e-07, + "loss": 1.1768289804458618, + "step": 2370 + }, + { + "epoch": 0.8634874408445578, + "grad_norm": 11.375, + "learning_rate": 8.542665606258157e-07, + "loss": 1.137587547302246, + "step": 2372 + }, + { + "epoch": 0.8642155078267201, + "grad_norm": 12.5, + "learning_rate": 8.540287501071952e-07, + "loss": 1.4847252368927002, + "step": 2374 + }, + { + "epoch": 0.8649435748088824, + "grad_norm": 20.375, + "learning_rate": 8.537907890125488e-07, + "loss": 1.4600777626037598, + "step": 2376 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 11.1875, + "learning_rate": 8.535526774829283e-07, + "loss": 1.4638421535491943, + "step": 2378 + }, + { + "epoch": 0.8663997087732072, + "grad_norm": 45.0, + "learning_rate": 8.533144156594749e-07, + "loss": 1.208853006362915, + "step": 2380 + }, + { + "epoch": 0.8671277757553695, + "grad_norm": 13.1875, + "learning_rate": 8.530760036834186e-07, + "loss": 1.5356422662734985, + "step": 2382 + }, + { + "epoch": 0.8678558427375318, + "grad_norm": 17.5, + "learning_rate": 8.528374416960786e-07, + "loss": 1.317340612411499, + "step": 2384 + }, + { + "epoch": 0.8685839097196942, + "grad_norm": 15.6875, + "learning_rate": 8.52598729838863e-07, + "loss": 1.4778132438659668, + "step": 2386 + }, + { + "epoch": 0.8693119767018566, + "grad_norm": 8.5625, + "learning_rate": 8.523598682532687e-07, + "loss": 1.3116151094436646, + "step": 2388 + }, + { + "epoch": 0.8700400436840189, + "grad_norm": 10.0625, + "learning_rate": 8.521208570808813e-07, + "loss": 1.2627559900283813, + "step": 2390 + }, + { + "epoch": 0.8707681106661813, + "grad_norm": 18.5, + "learning_rate": 8.518816964633752e-07, + "loss": 1.4627039432525635, + "step": 2392 + }, + { + "epoch": 0.8714961776483436, + "grad_norm": 16.625, + "learning_rate": 8.516423865425133e-07, + "loss": 1.4075419902801514, + "step": 2394 + }, + { + "epoch": 0.872224244630506, + "grad_norm": 13.5625, + "learning_rate": 8.514029274601469e-07, + "loss": 1.4367598295211792, + "step": 2396 + }, + { + "epoch": 0.8729523116126684, + "grad_norm": 11.5625, + "learning_rate": 8.511633193582162e-07, + "loss": 1.744253396987915, + "step": 2398 + }, + { + "epoch": 0.8736803785948307, + "grad_norm": 15.9375, + "learning_rate": 8.509235623787488e-07, + "loss": 1.5200308561325073, + "step": 2400 + }, + { + "epoch": 0.874408445576993, + "grad_norm": 4.84375, + "learning_rate": 8.506836566638615e-07, + "loss": 1.4426331520080566, + "step": 2402 + }, + { + "epoch": 0.8751365125591555, + "grad_norm": 13.5625, + "learning_rate": 8.504436023557588e-07, + "loss": 1.3117856979370117, + "step": 2404 + }, + { + "epoch": 0.8758645795413178, + "grad_norm": 19.5, + "learning_rate": 8.502033995967333e-07, + "loss": 1.1446746587753296, + "step": 2406 + }, + { + "epoch": 0.8765926465234801, + "grad_norm": 17.125, + "learning_rate": 8.499630485291655e-07, + "loss": 1.5068570375442505, + "step": 2408 + }, + { + "epoch": 0.8773207135056426, + "grad_norm": 17.75, + "learning_rate": 8.497225492955242e-07, + "loss": 1.7045989036560059, + "step": 2410 + }, + { + "epoch": 0.8780487804878049, + "grad_norm": 10.5, + "learning_rate": 8.494819020383659e-07, + "loss": 1.477632999420166, + "step": 2412 + }, + { + "epoch": 0.8787768474699672, + "grad_norm": 9.625, + "learning_rate": 8.492411069003343e-07, + "loss": 1.1641278266906738, + "step": 2414 + }, + { + "epoch": 0.8795049144521296, + "grad_norm": 21.875, + "learning_rate": 8.490001640241615e-07, + "loss": 1.360015869140625, + "step": 2416 + }, + { + "epoch": 0.880232981434292, + "grad_norm": 5.65625, + "learning_rate": 8.487590735526667e-07, + "loss": 1.2023588418960571, + "step": 2418 + }, + { + "epoch": 0.8809610484164543, + "grad_norm": 8.9375, + "learning_rate": 8.485178356287568e-07, + "loss": 1.1860418319702148, + "step": 2420 + }, + { + "epoch": 0.8816891153986167, + "grad_norm": 10.9375, + "learning_rate": 8.482764503954258e-07, + "loss": 1.143653154373169, + "step": 2422 + }, + { + "epoch": 0.882417182380779, + "grad_norm": 572.0, + "learning_rate": 8.480349179957557e-07, + "loss": 1.077162742614746, + "step": 2424 + }, + { + "epoch": 0.8831452493629414, + "grad_norm": 47.0, + "learning_rate": 8.477932385729152e-07, + "loss": 1.4315608739852905, + "step": 2426 + }, + { + "epoch": 0.8838733163451038, + "grad_norm": 17.125, + "learning_rate": 8.475514122701597e-07, + "loss": 1.2064841985702515, + "step": 2428 + }, + { + "epoch": 0.8846013833272661, + "grad_norm": 42.5, + "learning_rate": 8.473094392308328e-07, + "loss": 1.8022531270980835, + "step": 2430 + }, + { + "epoch": 0.8853294503094284, + "grad_norm": 4.8125, + "learning_rate": 8.470673195983644e-07, + "loss": 1.4968467950820923, + "step": 2432 + }, + { + "epoch": 0.8860575172915909, + "grad_norm": 18.25, + "learning_rate": 8.46825053516271e-07, + "loss": 0.8578945994377136, + "step": 2434 + }, + { + "epoch": 0.8867855842737532, + "grad_norm": 12.4375, + "learning_rate": 8.465826411281567e-07, + "loss": 1.155950665473938, + "step": 2436 + }, + { + "epoch": 0.8875136512559155, + "grad_norm": 10.5, + "learning_rate": 8.463400825777118e-07, + "loss": 1.470492959022522, + "step": 2438 + }, + { + "epoch": 0.8882417182380778, + "grad_norm": 8.5625, + "learning_rate": 8.460973780087133e-07, + "loss": 1.4155642986297607, + "step": 2440 + }, + { + "epoch": 0.8889697852202403, + "grad_norm": 9.875, + "learning_rate": 8.458545275650246e-07, + "loss": 1.1921131610870361, + "step": 2442 + }, + { + "epoch": 0.8896978522024026, + "grad_norm": 9.9375, + "learning_rate": 8.45611531390596e-07, + "loss": 1.3157901763916016, + "step": 2444 + }, + { + "epoch": 0.8904259191845649, + "grad_norm": 18.875, + "learning_rate": 8.453683896294642e-07, + "loss": 1.5719568729400635, + "step": 2446 + }, + { + "epoch": 0.8911539861667274, + "grad_norm": 14.0625, + "learning_rate": 8.451251024257511e-07, + "loss": 1.0341441631317139, + "step": 2448 + }, + { + "epoch": 0.8918820531488897, + "grad_norm": 3.84375, + "learning_rate": 8.448816699236665e-07, + "loss": 0.7408318519592285, + "step": 2450 + }, + { + "epoch": 0.892610120131052, + "grad_norm": 39.5, + "learning_rate": 8.446380922675051e-07, + "loss": 1.3424630165100098, + "step": 2452 + }, + { + "epoch": 0.8933381871132144, + "grad_norm": 11.75, + "learning_rate": 8.443943696016481e-07, + "loss": 1.118253469467163, + "step": 2454 + }, + { + "epoch": 0.8940662540953768, + "grad_norm": 10.75, + "learning_rate": 8.441505020705626e-07, + "loss": 1.5101635456085205, + "step": 2456 + }, + { + "epoch": 0.8947943210775391, + "grad_norm": 8.1875, + "learning_rate": 8.439064898188016e-07, + "loss": 1.4759283065795898, + "step": 2458 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 28.625, + "learning_rate": 8.436623329910036e-07, + "loss": 1.5489836931228638, + "step": 2460 + }, + { + "epoch": 0.8962504550418638, + "grad_norm": 30.0, + "learning_rate": 8.43418031731893e-07, + "loss": 1.2119265794754028, + "step": 2462 + }, + { + "epoch": 0.8969785220240262, + "grad_norm": 10.0, + "learning_rate": 8.4317358618628e-07, + "loss": 1.4687142372131348, + "step": 2464 + }, + { + "epoch": 0.8977065890061886, + "grad_norm": 59.5, + "learning_rate": 8.429289964990604e-07, + "loss": 1.5389881134033203, + "step": 2466 + }, + { + "epoch": 0.8984346559883509, + "grad_norm": 8.25, + "learning_rate": 8.426842628152149e-07, + "loss": 1.3656291961669922, + "step": 2468 + }, + { + "epoch": 0.8991627229705133, + "grad_norm": 12.4375, + "learning_rate": 8.424393852798099e-07, + "loss": 1.4480030536651611, + "step": 2470 + }, + { + "epoch": 0.8998907899526757, + "grad_norm": 6.8125, + "learning_rate": 8.421943640379971e-07, + "loss": 1.4476206302642822, + "step": 2472 + }, + { + "epoch": 0.900618856934838, + "grad_norm": 7.8125, + "learning_rate": 8.419491992350132e-07, + "loss": 1.2799961566925049, + "step": 2474 + }, + { + "epoch": 0.9013469239170003, + "grad_norm": 16.25, + "learning_rate": 8.417038910161802e-07, + "loss": 1.6170034408569336, + "step": 2476 + }, + { + "epoch": 0.9020749908991628, + "grad_norm": 8.8125, + "learning_rate": 8.414584395269049e-07, + "loss": 0.995517373085022, + "step": 2478 + }, + { + "epoch": 0.9028030578813251, + "grad_norm": 15.625, + "learning_rate": 8.412128449126792e-07, + "loss": 1.4871735572814941, + "step": 2480 + }, + { + "epoch": 0.9035311248634874, + "grad_norm": 22.875, + "learning_rate": 8.409671073190798e-07, + "loss": 1.2772575616836548, + "step": 2482 + }, + { + "epoch": 0.9042591918456498, + "grad_norm": 8.9375, + "learning_rate": 8.407212268917682e-07, + "loss": 1.1406450271606445, + "step": 2484 + }, + { + "epoch": 0.9049872588278122, + "grad_norm": 22.5, + "learning_rate": 8.404752037764904e-07, + "loss": 1.3448662757873535, + "step": 2486 + }, + { + "epoch": 0.9057153258099745, + "grad_norm": 12.9375, + "learning_rate": 8.402290381190771e-07, + "loss": 1.521700143814087, + "step": 2488 + }, + { + "epoch": 0.9064433927921369, + "grad_norm": 21.125, + "learning_rate": 8.399827300654434e-07, + "loss": 1.3910189867019653, + "step": 2490 + }, + { + "epoch": 0.9071714597742992, + "grad_norm": 60.0, + "learning_rate": 8.39736279761589e-07, + "loss": 1.6469383239746094, + "step": 2492 + }, + { + "epoch": 0.9078995267564616, + "grad_norm": 12.75, + "learning_rate": 8.394896873535974e-07, + "loss": 1.450331211090088, + "step": 2494 + }, + { + "epoch": 0.908627593738624, + "grad_norm": 25.875, + "learning_rate": 8.392429529876372e-07, + "loss": 1.6783294677734375, + "step": 2496 + }, + { + "epoch": 0.9093556607207863, + "grad_norm": 18.125, + "learning_rate": 8.389960768099606e-07, + "loss": 1.4257748126983643, + "step": 2498 + }, + { + "epoch": 0.9100837277029487, + "grad_norm": 17.5, + "learning_rate": 8.387490589669036e-07, + "loss": 0.9494454860687256, + "step": 2500 + }, + { + "epoch": 0.910811794685111, + "grad_norm": 13.375, + "learning_rate": 8.385018996048867e-07, + "loss": 0.9950515627861023, + "step": 2502 + }, + { + "epoch": 0.9115398616672734, + "grad_norm": 10.5625, + "learning_rate": 8.382545988704144e-07, + "loss": 1.5436673164367676, + "step": 2504 + }, + { + "epoch": 0.9122679286494357, + "grad_norm": 7.71875, + "learning_rate": 8.380071569100742e-07, + "loss": 1.2840231657028198, + "step": 2506 + }, + { + "epoch": 0.9129959956315981, + "grad_norm": 7.875, + "learning_rate": 8.377595738705376e-07, + "loss": 1.320253610610962, + "step": 2508 + }, + { + "epoch": 0.9137240626137605, + "grad_norm": 13.8125, + "learning_rate": 8.375118498985607e-07, + "loss": 1.3841376304626465, + "step": 2510 + }, + { + "epoch": 0.9144521295959228, + "grad_norm": 8.8125, + "learning_rate": 8.37263985140982e-07, + "loss": 1.4098219871520996, + "step": 2512 + }, + { + "epoch": 0.9151801965780851, + "grad_norm": 13.375, + "learning_rate": 8.370159797447234e-07, + "loss": 1.3886960744857788, + "step": 2514 + }, + { + "epoch": 0.9159082635602476, + "grad_norm": 11.625, + "learning_rate": 8.367678338567912e-07, + "loss": 1.5731256008148193, + "step": 2516 + }, + { + "epoch": 0.9166363305424099, + "grad_norm": 16.25, + "learning_rate": 8.365195476242739e-07, + "loss": 1.5181853771209717, + "step": 2518 + }, + { + "epoch": 0.9173643975245722, + "grad_norm": 30.375, + "learning_rate": 8.362711211943439e-07, + "loss": 1.5256881713867188, + "step": 2520 + }, + { + "epoch": 0.9180924645067347, + "grad_norm": 7.59375, + "learning_rate": 8.360225547142563e-07, + "loss": 1.1073682308197021, + "step": 2522 + }, + { + "epoch": 0.918820531488897, + "grad_norm": 8.625, + "learning_rate": 8.357738483313495e-07, + "loss": 1.5524561405181885, + "step": 2524 + }, + { + "epoch": 0.9195485984710593, + "grad_norm": 6.78125, + "learning_rate": 8.355250021930445e-07, + "loss": 1.2016141414642334, + "step": 2526 + }, + { + "epoch": 0.9202766654532217, + "grad_norm": 15.625, + "learning_rate": 8.352760164468453e-07, + "loss": 1.1225922107696533, + "step": 2528 + }, + { + "epoch": 0.921004732435384, + "grad_norm": 9.0625, + "learning_rate": 8.350268912403389e-07, + "loss": 1.2426254749298096, + "step": 2530 + }, + { + "epoch": 0.9217327994175464, + "grad_norm": 6.125, + "learning_rate": 8.347776267211945e-07, + "loss": 1.192581057548523, + "step": 2532 + }, + { + "epoch": 0.9224608663997088, + "grad_norm": 19.125, + "learning_rate": 8.345282230371644e-07, + "loss": 1.4481141567230225, + "step": 2534 + }, + { + "epoch": 0.9231889333818711, + "grad_norm": 9.6875, + "learning_rate": 8.342786803360828e-07, + "loss": 1.0403404235839844, + "step": 2536 + }, + { + "epoch": 0.9239170003640335, + "grad_norm": 8.4375, + "learning_rate": 8.340289987658671e-07, + "loss": 1.176543951034546, + "step": 2538 + }, + { + "epoch": 0.9246450673461959, + "grad_norm": 11.5625, + "learning_rate": 8.337791784745156e-07, + "loss": 1.106257677078247, + "step": 2540 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 7.46875, + "learning_rate": 8.335292196101109e-07, + "loss": 1.19044029712677, + "step": 2542 + }, + { + "epoch": 0.9261012013105205, + "grad_norm": 15.25, + "learning_rate": 8.332791223208157e-07, + "loss": 1.2715553045272827, + "step": 2544 + }, + { + "epoch": 0.926829268292683, + "grad_norm": 15.375, + "learning_rate": 8.33028886754876e-07, + "loss": 1.5048950910568237, + "step": 2546 + }, + { + "epoch": 0.9275573352748453, + "grad_norm": 8.5625, + "learning_rate": 8.327785130606198e-07, + "loss": 1.671478271484375, + "step": 2548 + }, + { + "epoch": 0.9282854022570076, + "grad_norm": 9.25, + "learning_rate": 8.325280013864557e-07, + "loss": 1.1253774166107178, + "step": 2550 + }, + { + "epoch": 0.92901346923917, + "grad_norm": 11.3125, + "learning_rate": 8.322773518808757e-07, + "loss": 1.2977187633514404, + "step": 2552 + }, + { + "epoch": 0.9297415362213324, + "grad_norm": 8.4375, + "learning_rate": 8.320265646924522e-07, + "loss": 1.6451172828674316, + "step": 2554 + }, + { + "epoch": 0.9304696032034947, + "grad_norm": 5.875, + "learning_rate": 8.317756399698404e-07, + "loss": 1.2246999740600586, + "step": 2556 + }, + { + "epoch": 0.9311976701856571, + "grad_norm": 14.125, + "learning_rate": 8.315245778617761e-07, + "loss": 1.142179250717163, + "step": 2558 + }, + { + "epoch": 0.9319257371678195, + "grad_norm": 4.84375, + "learning_rate": 8.312733785170766e-07, + "loss": 1.5658926963806152, + "step": 2560 + }, + { + "epoch": 0.9326538041499818, + "grad_norm": 10.5625, + "learning_rate": 8.310220420846414e-07, + "loss": 1.4307031631469727, + "step": 2562 + }, + { + "epoch": 0.9333818711321441, + "grad_norm": 8.0625, + "learning_rate": 8.3077056871345e-07, + "loss": 1.1920253038406372, + "step": 2564 + }, + { + "epoch": 0.9341099381143065, + "grad_norm": 16.375, + "learning_rate": 8.305189585525643e-07, + "loss": 1.194356083869934, + "step": 2566 + }, + { + "epoch": 0.9348380050964689, + "grad_norm": 11.0625, + "learning_rate": 8.302672117511262e-07, + "loss": 1.272974967956543, + "step": 2568 + }, + { + "epoch": 0.9355660720786312, + "grad_norm": 39.5, + "learning_rate": 8.300153284583593e-07, + "loss": 1.8199732303619385, + "step": 2570 + }, + { + "epoch": 0.9362941390607936, + "grad_norm": 19.875, + "learning_rate": 8.297633088235681e-07, + "loss": 1.5819928646087646, + "step": 2572 + }, + { + "epoch": 0.9370222060429559, + "grad_norm": 8.0625, + "learning_rate": 8.295111529961374e-07, + "loss": 1.1697304248809814, + "step": 2574 + }, + { + "epoch": 0.9377502730251183, + "grad_norm": 9.5, + "learning_rate": 8.292588611255331e-07, + "loss": 1.1478781700134277, + "step": 2576 + }, + { + "epoch": 0.9384783400072807, + "grad_norm": 13.125, + "learning_rate": 8.290064333613019e-07, + "loss": 1.5654715299606323, + "step": 2578 + }, + { + "epoch": 0.939206406989443, + "grad_norm": 8.9375, + "learning_rate": 8.287538698530708e-07, + "loss": 1.3304080963134766, + "step": 2580 + }, + { + "epoch": 0.9399344739716053, + "grad_norm": 21.125, + "learning_rate": 8.285011707505469e-07, + "loss": 1.274862289428711, + "step": 2582 + }, + { + "epoch": 0.9406625409537678, + "grad_norm": 16.625, + "learning_rate": 8.282483362035186e-07, + "loss": 1.236227035522461, + "step": 2584 + }, + { + "epoch": 0.9413906079359301, + "grad_norm": 17.125, + "learning_rate": 8.279953663618539e-07, + "loss": 1.5972484350204468, + "step": 2586 + }, + { + "epoch": 0.9421186749180924, + "grad_norm": 7.96875, + "learning_rate": 8.27742261375501e-07, + "loss": 1.3442740440368652, + "step": 2588 + }, + { + "epoch": 0.9428467419002549, + "grad_norm": 11.4375, + "learning_rate": 8.274890213944886e-07, + "loss": 1.4182988405227661, + "step": 2590 + }, + { + "epoch": 0.9435748088824172, + "grad_norm": 10.4375, + "learning_rate": 8.272356465689247e-07, + "loss": 1.2513564825057983, + "step": 2592 + }, + { + "epoch": 0.9443028758645795, + "grad_norm": 11.375, + "learning_rate": 8.269821370489982e-07, + "loss": 1.1049457788467407, + "step": 2594 + }, + { + "epoch": 0.9450309428467419, + "grad_norm": 7.0, + "learning_rate": 8.267284929849774e-07, + "loss": 1.2679188251495361, + "step": 2596 + }, + { + "epoch": 0.9457590098289043, + "grad_norm": 13.125, + "learning_rate": 8.264747145272099e-07, + "loss": 1.4669312238693237, + "step": 2598 + }, + { + "epoch": 0.9464870768110666, + "grad_norm": 10.4375, + "learning_rate": 8.262208018261236e-07, + "loss": 1.7974510192871094, + "step": 2600 + }, + { + "epoch": 0.947215143793229, + "grad_norm": 34.75, + "learning_rate": 8.25966755032226e-07, + "loss": 1.269831895828247, + "step": 2602 + }, + { + "epoch": 0.9479432107753913, + "grad_norm": 19.125, + "learning_rate": 8.257125742961031e-07, + "loss": 1.6039361953735352, + "step": 2604 + }, + { + "epoch": 0.9486712777575537, + "grad_norm": 11.125, + "learning_rate": 8.254582597684217e-07, + "loss": 1.277653694152832, + "step": 2606 + }, + { + "epoch": 0.9493993447397161, + "grad_norm": 12.875, + "learning_rate": 8.252038115999269e-07, + "loss": 1.4994066953659058, + "step": 2608 + }, + { + "epoch": 0.9501274117218784, + "grad_norm": 15.8125, + "learning_rate": 8.249492299414436e-07, + "loss": 0.9784266352653503, + "step": 2610 + }, + { + "epoch": 0.9508554787040407, + "grad_norm": 8.25, + "learning_rate": 8.246945149438751e-07, + "loss": 0.9560913443565369, + "step": 2612 + }, + { + "epoch": 0.9515835456862032, + "grad_norm": 11.625, + "learning_rate": 8.244396667582045e-07, + "loss": 1.2694449424743652, + "step": 2614 + }, + { + "epoch": 0.9523116126683655, + "grad_norm": 74.0, + "learning_rate": 8.241846855354937e-07, + "loss": 1.376665711402893, + "step": 2616 + }, + { + "epoch": 0.9530396796505278, + "grad_norm": 90.5, + "learning_rate": 8.23929571426883e-07, + "loss": 1.8077497482299805, + "step": 2618 + }, + { + "epoch": 0.9537677466326903, + "grad_norm": 12.3125, + "learning_rate": 8.236743245835918e-07, + "loss": 1.526737093925476, + "step": 2620 + }, + { + "epoch": 0.9544958136148526, + "grad_norm": 11.625, + "learning_rate": 8.234189451569182e-07, + "loss": 1.4959609508514404, + "step": 2622 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 9.3125, + "learning_rate": 8.231634332982392e-07, + "loss": 1.4442641735076904, + "step": 2624 + }, + { + "epoch": 0.9559519475791772, + "grad_norm": 13.5625, + "learning_rate": 8.229077891590094e-07, + "loss": 1.3356958627700806, + "step": 2626 + }, + { + "epoch": 0.9566800145613397, + "grad_norm": 18.625, + "learning_rate": 8.226520128907626e-07, + "loss": 1.6429953575134277, + "step": 2628 + }, + { + "epoch": 0.957408081543502, + "grad_norm": 17.5, + "learning_rate": 8.223961046451109e-07, + "loss": 1.6257344484329224, + "step": 2630 + }, + { + "epoch": 0.9581361485256643, + "grad_norm": 12.5, + "learning_rate": 8.22140064573744e-07, + "loss": 1.306258201599121, + "step": 2632 + }, + { + "epoch": 0.9588642155078267, + "grad_norm": 9.875, + "learning_rate": 8.218838928284302e-07, + "loss": 1.4769309759140015, + "step": 2634 + }, + { + "epoch": 0.9595922824899891, + "grad_norm": 12.875, + "learning_rate": 8.216275895610161e-07, + "loss": 1.0276743173599243, + "step": 2636 + }, + { + "epoch": 0.9603203494721514, + "grad_norm": 18.0, + "learning_rate": 8.213711549234254e-07, + "loss": 1.4604713916778564, + "step": 2638 + }, + { + "epoch": 0.9610484164543138, + "grad_norm": 13.9375, + "learning_rate": 8.211145890676607e-07, + "loss": 1.4298914670944214, + "step": 2640 + }, + { + "epoch": 0.9617764834364761, + "grad_norm": 6.78125, + "learning_rate": 8.20857892145802e-07, + "loss": 1.2925302982330322, + "step": 2642 + }, + { + "epoch": 0.9625045504186385, + "grad_norm": 22.25, + "learning_rate": 8.206010643100063e-07, + "loss": 1.2889964580535889, + "step": 2644 + }, + { + "epoch": 0.9632326174008009, + "grad_norm": 12.5625, + "learning_rate": 8.203441057125093e-07, + "loss": 0.8914753794670105, + "step": 2646 + }, + { + "epoch": 0.9639606843829632, + "grad_norm": 17.0, + "learning_rate": 8.200870165056237e-07, + "loss": 1.6695901155471802, + "step": 2648 + }, + { + "epoch": 0.9646887513651256, + "grad_norm": 20.25, + "learning_rate": 8.198297968417394e-07, + "loss": 0.9618207216262817, + "step": 2650 + }, + { + "epoch": 0.965416818347288, + "grad_norm": 11.0, + "learning_rate": 8.195724468733236e-07, + "loss": 1.3174715042114258, + "step": 2652 + }, + { + "epoch": 0.9661448853294503, + "grad_norm": 19.25, + "learning_rate": 8.193149667529216e-07, + "loss": 1.4354273080825806, + "step": 2654 + }, + { + "epoch": 0.9668729523116126, + "grad_norm": 7.46875, + "learning_rate": 8.190573566331547e-07, + "loss": 1.384384274482727, + "step": 2656 + }, + { + "epoch": 0.9676010192937751, + "grad_norm": 11.25, + "learning_rate": 8.187996166667222e-07, + "loss": 1.3514490127563477, + "step": 2658 + }, + { + "epoch": 0.9683290862759374, + "grad_norm": 9.9375, + "learning_rate": 8.185417470063996e-07, + "loss": 1.4639127254486084, + "step": 2660 + }, + { + "epoch": 0.9690571532580997, + "grad_norm": 13.6875, + "learning_rate": 8.1828374780504e-07, + "loss": 1.4599788188934326, + "step": 2662 + }, + { + "epoch": 0.9697852202402621, + "grad_norm": 9.6875, + "learning_rate": 8.180256192155723e-07, + "loss": 1.3529106378555298, + "step": 2664 + }, + { + "epoch": 0.9705132872224245, + "grad_norm": 42.0, + "learning_rate": 8.177673613910034e-07, + "loss": 1.4340229034423828, + "step": 2666 + }, + { + "epoch": 0.9712413542045868, + "grad_norm": 12.5625, + "learning_rate": 8.175089744844159e-07, + "loss": 1.6145766973495483, + "step": 2668 + }, + { + "epoch": 0.9719694211867492, + "grad_norm": 20.375, + "learning_rate": 8.172504586489693e-07, + "loss": 1.672612190246582, + "step": 2670 + }, + { + "epoch": 0.9726974881689116, + "grad_norm": 21.5, + "learning_rate": 8.169918140378988e-07, + "loss": 1.3867149353027344, + "step": 2672 + }, + { + "epoch": 0.9734255551510739, + "grad_norm": 14.25, + "learning_rate": 8.167330408045168e-07, + "loss": 1.2815804481506348, + "step": 2674 + }, + { + "epoch": 0.9741536221332363, + "grad_norm": 18.75, + "learning_rate": 8.164741391022116e-07, + "loss": 1.39165198802948, + "step": 2676 + }, + { + "epoch": 0.9748816891153986, + "grad_norm": 5.53125, + "learning_rate": 8.16215109084448e-07, + "loss": 1.1112006902694702, + "step": 2678 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 24.0, + "learning_rate": 8.15955950904766e-07, + "loss": 1.216090440750122, + "step": 2680 + }, + { + "epoch": 0.9763378230797234, + "grad_norm": 19.875, + "learning_rate": 8.156966647167824e-07, + "loss": 1.2461457252502441, + "step": 2682 + }, + { + "epoch": 0.9770658900618857, + "grad_norm": 19.5, + "learning_rate": 8.154372506741898e-07, + "loss": 1.5713176727294922, + "step": 2684 + }, + { + "epoch": 0.977793957044048, + "grad_norm": 9.5625, + "learning_rate": 8.151777089307559e-07, + "loss": 1.260617733001709, + "step": 2686 + }, + { + "epoch": 0.9785220240262104, + "grad_norm": 17.375, + "learning_rate": 8.149180396403247e-07, + "loss": 1.9782915115356445, + "step": 2688 + }, + { + "epoch": 0.9792500910083728, + "grad_norm": 77.5, + "learning_rate": 8.146582429568159e-07, + "loss": 1.091056227684021, + "step": 2690 + }, + { + "epoch": 0.9799781579905351, + "grad_norm": 185.0, + "learning_rate": 8.143983190342243e-07, + "loss": 1.454491376876831, + "step": 2692 + }, + { + "epoch": 0.9807062249726974, + "grad_norm": 23.625, + "learning_rate": 8.141382680266202e-07, + "loss": 0.8891265988349915, + "step": 2694 + }, + { + "epoch": 0.9814342919548599, + "grad_norm": 18.125, + "learning_rate": 8.138780900881496e-07, + "loss": 1.1875460147857666, + "step": 2696 + }, + { + "epoch": 0.9821623589370222, + "grad_norm": 5.8125, + "learning_rate": 8.136177853730333e-07, + "loss": 1.0526376962661743, + "step": 2698 + }, + { + "epoch": 0.9828904259191845, + "grad_norm": 10.6875, + "learning_rate": 8.133573540355676e-07, + "loss": 1.5471982955932617, + "step": 2700 + }, + { + "epoch": 0.983618492901347, + "grad_norm": 8.1875, + "learning_rate": 8.130967962301232e-07, + "loss": 1.2620494365692139, + "step": 2702 + }, + { + "epoch": 0.9843465598835093, + "grad_norm": 15.8125, + "learning_rate": 8.128361121111467e-07, + "loss": 1.6144521236419678, + "step": 2704 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 10.6875, + "learning_rate": 8.125753018331589e-07, + "loss": 1.2452671527862549, + "step": 2706 + }, + { + "epoch": 0.985802693847834, + "grad_norm": 13.0, + "learning_rate": 8.123143655507558e-07, + "loss": 1.3571007251739502, + "step": 2708 + }, + { + "epoch": 0.9865307608299964, + "grad_norm": 12.4375, + "learning_rate": 8.120533034186076e-07, + "loss": 0.5547715425491333, + "step": 2710 + }, + { + "epoch": 0.9872588278121587, + "grad_norm": 8.25, + "learning_rate": 8.117921155914596e-07, + "loss": 1.4436923265457153, + "step": 2712 + }, + { + "epoch": 0.9879868947943211, + "grad_norm": 8.75, + "learning_rate": 8.115308022241312e-07, + "loss": 1.3985812664031982, + "step": 2714 + }, + { + "epoch": 0.9887149617764834, + "grad_norm": 17.0, + "learning_rate": 8.112693634715165e-07, + "loss": 1.368992567062378, + "step": 2716 + }, + { + "epoch": 0.9894430287586458, + "grad_norm": 13.75, + "learning_rate": 8.110077994885838e-07, + "loss": 1.2773106098175049, + "step": 2718 + }, + { + "epoch": 0.9901710957408082, + "grad_norm": 50.5, + "learning_rate": 8.107461104303756e-07, + "loss": 1.2548387050628662, + "step": 2720 + }, + { + "epoch": 0.9908991627229705, + "grad_norm": 10.1875, + "learning_rate": 8.104842964520087e-07, + "loss": 1.148345947265625, + "step": 2722 + }, + { + "epoch": 0.9916272297051328, + "grad_norm": 8.25, + "learning_rate": 8.102223577086738e-07, + "loss": 1.3860697746276855, + "step": 2724 + }, + { + "epoch": 0.9923552966872953, + "grad_norm": 14.8125, + "learning_rate": 8.099602943556354e-07, + "loss": 1.4397653341293335, + "step": 2726 + }, + { + "epoch": 0.9930833636694576, + "grad_norm": 10.625, + "learning_rate": 8.096981065482324e-07, + "loss": 1.3697749376296997, + "step": 2728 + }, + { + "epoch": 0.9938114306516199, + "grad_norm": 21.875, + "learning_rate": 8.09435794441877e-07, + "loss": 1.6635875701904297, + "step": 2730 + }, + { + "epoch": 0.9945394976337824, + "grad_norm": 18.625, + "learning_rate": 8.091733581920548e-07, + "loss": 1.4166569709777832, + "step": 2732 + }, + { + "epoch": 0.9952675646159447, + "grad_norm": 32.5, + "learning_rate": 8.089107979543262e-07, + "loss": 1.2083587646484375, + "step": 2734 + }, + { + "epoch": 0.995995631598107, + "grad_norm": 36.25, + "learning_rate": 8.086481138843235e-07, + "loss": 1.4112759828567505, + "step": 2736 + }, + { + "epoch": 0.9967236985802694, + "grad_norm": 54.75, + "learning_rate": 8.083853061377534e-07, + "loss": 1.11830472946167, + "step": 2738 + }, + { + "epoch": 0.9974517655624318, + "grad_norm": 97.0, + "learning_rate": 8.08122374870396e-07, + "loss": 1.2512047290802002, + "step": 2740 + }, + { + "epoch": 0.9981798325445941, + "grad_norm": 11.6875, + "learning_rate": 8.078593202381041e-07, + "loss": 1.3382766246795654, + "step": 2742 + }, + { + "epoch": 0.9989078995267565, + "grad_norm": 7.59375, + "learning_rate": 8.075961423968039e-07, + "loss": 1.2923805713653564, + "step": 2744 + }, + { + "epoch": 0.9996359665089188, + "grad_norm": 12.5625, + "learning_rate": 8.073328415024942e-07, + "loss": 1.749849796295166, + "step": 2746 + }, + { + "epoch": 1.0003640334910813, + "grad_norm": 14.625, + "learning_rate": 8.070694177112476e-07, + "loss": 1.4299918413162231, + "step": 2748 + }, + { + "epoch": 1.0010921004732436, + "grad_norm": 21.25, + "learning_rate": 8.068058711792089e-07, + "loss": 1.1774227619171143, + "step": 2750 + }, + { + "epoch": 1.001820167455406, + "grad_norm": 23.375, + "learning_rate": 8.065422020625956e-07, + "loss": 1.2391502857208252, + "step": 2752 + }, + { + "epoch": 1.0025482344375682, + "grad_norm": 19.5, + "learning_rate": 8.062784105176983e-07, + "loss": 0.864935576915741, + "step": 2754 + }, + { + "epoch": 1.0032763014197306, + "grad_norm": 12.5625, + "learning_rate": 8.060144967008799e-07, + "loss": 1.3000609874725342, + "step": 2756 + }, + { + "epoch": 1.0040043684018929, + "grad_norm": 55.0, + "learning_rate": 8.057504607685759e-07, + "loss": 1.4930975437164307, + "step": 2758 + }, + { + "epoch": 1.0047324353840554, + "grad_norm": 9.5, + "learning_rate": 8.054863028772938e-07, + "loss": 1.3979196548461914, + "step": 2760 + }, + { + "epoch": 1.0054605023662178, + "grad_norm": 20.625, + "learning_rate": 8.05222023183614e-07, + "loss": 1.5246769189834595, + "step": 2762 + }, + { + "epoch": 1.00618856934838, + "grad_norm": 15.375, + "learning_rate": 8.049576218441887e-07, + "loss": 1.5889782905578613, + "step": 2764 + }, + { + "epoch": 1.0069166363305424, + "grad_norm": 19.125, + "learning_rate": 8.046930990157424e-07, + "loss": 1.2064764499664307, + "step": 2766 + }, + { + "epoch": 1.0076447033127047, + "grad_norm": 11.4375, + "learning_rate": 8.044284548550713e-07, + "loss": 1.489729404449463, + "step": 2768 + }, + { + "epoch": 1.008372770294867, + "grad_norm": 20.25, + "learning_rate": 8.041636895190435e-07, + "loss": 1.381157636642456, + "step": 2770 + }, + { + "epoch": 1.0091008372770296, + "grad_norm": 16.375, + "learning_rate": 8.038988031645999e-07, + "loss": 1.4584299325942993, + "step": 2772 + }, + { + "epoch": 1.009828904259192, + "grad_norm": 3.09375, + "learning_rate": 8.036337959487518e-07, + "loss": 1.1626390218734741, + "step": 2774 + }, + { + "epoch": 1.0105569712413542, + "grad_norm": 13.0, + "learning_rate": 8.033686680285832e-07, + "loss": 1.325387716293335, + "step": 2776 + }, + { + "epoch": 1.0112850382235166, + "grad_norm": 11.75, + "learning_rate": 8.031034195612487e-07, + "loss": 1.4932076930999756, + "step": 2778 + }, + { + "epoch": 1.0120131052056789, + "grad_norm": 14.9375, + "learning_rate": 8.028380507039747e-07, + "loss": 1.4184925556182861, + "step": 2780 + }, + { + "epoch": 1.0127411721878412, + "grad_norm": 6.46875, + "learning_rate": 8.025725616140598e-07, + "loss": 0.9352809190750122, + "step": 2782 + }, + { + "epoch": 1.0134692391700035, + "grad_norm": 13.375, + "learning_rate": 8.023069524488726e-07, + "loss": 1.4751240015029907, + "step": 2784 + }, + { + "epoch": 1.014197306152166, + "grad_norm": 14.125, + "learning_rate": 8.020412233658538e-07, + "loss": 1.3774054050445557, + "step": 2786 + }, + { + "epoch": 1.0149253731343284, + "grad_norm": 12.375, + "learning_rate": 8.017753745225144e-07, + "loss": 1.3511857986450195, + "step": 2788 + }, + { + "epoch": 1.0156534401164907, + "grad_norm": 8.625, + "learning_rate": 8.015094060764373e-07, + "loss": 1.256922721862793, + "step": 2790 + }, + { + "epoch": 1.016381507098653, + "grad_norm": 4.25, + "learning_rate": 8.012433181852756e-07, + "loss": 1.5246145725250244, + "step": 2792 + }, + { + "epoch": 1.0171095740808154, + "grad_norm": 16.75, + "learning_rate": 8.009771110067531e-07, + "loss": 1.5548666715621948, + "step": 2794 + }, + { + "epoch": 1.0178376410629777, + "grad_norm": 17.25, + "learning_rate": 8.007107846986649e-07, + "loss": 1.4889410734176636, + "step": 2796 + }, + { + "epoch": 1.0185657080451402, + "grad_norm": 13.0, + "learning_rate": 8.004443394188764e-07, + "loss": 1.356172800064087, + "step": 2798 + }, + { + "epoch": 1.0192937750273026, + "grad_norm": 15.4375, + "learning_rate": 8.001777753253238e-07, + "loss": 1.389439582824707, + "step": 2800 + }, + { + "epoch": 1.0200218420094649, + "grad_norm": 10.9375, + "learning_rate": 7.999110925760131e-07, + "loss": 1.351150631904602, + "step": 2802 + }, + { + "epoch": 1.0207499089916272, + "grad_norm": 15.625, + "learning_rate": 7.996442913290209e-07, + "loss": 1.3712376356124878, + "step": 2804 + }, + { + "epoch": 1.0214779759737895, + "grad_norm": 32.25, + "learning_rate": 7.993773717424948e-07, + "loss": 1.6385815143585205, + "step": 2806 + }, + { + "epoch": 1.0222060429559519, + "grad_norm": 9.875, + "learning_rate": 7.99110333974651e-07, + "loss": 1.0346195697784424, + "step": 2808 + }, + { + "epoch": 1.0229341099381144, + "grad_norm": 5.34375, + "learning_rate": 7.988431781837775e-07, + "loss": 1.2778658866882324, + "step": 2810 + }, + { + "epoch": 1.0236621769202767, + "grad_norm": 14.375, + "learning_rate": 7.985759045282307e-07, + "loss": 1.518615484237671, + "step": 2812 + }, + { + "epoch": 1.024390243902439, + "grad_norm": 15.3125, + "learning_rate": 7.983085131664382e-07, + "loss": 1.4899874925613403, + "step": 2814 + }, + { + "epoch": 1.0251183108846014, + "grad_norm": 8.6875, + "learning_rate": 7.980410042568963e-07, + "loss": 1.4484541416168213, + "step": 2816 + }, + { + "epoch": 1.0258463778667637, + "grad_norm": 6.34375, + "learning_rate": 7.977733779581715e-07, + "loss": 1.1634225845336914, + "step": 2818 + }, + { + "epoch": 1.026574444848926, + "grad_norm": 14.375, + "learning_rate": 7.975056344289e-07, + "loss": 1.1845849752426147, + "step": 2820 + }, + { + "epoch": 1.0273025118310886, + "grad_norm": 4.34375, + "learning_rate": 7.972377738277869e-07, + "loss": 0.8920297026634216, + "step": 2822 + }, + { + "epoch": 1.0280305788132509, + "grad_norm": 11.75, + "learning_rate": 7.969697963136075e-07, + "loss": 1.538903832435608, + "step": 2824 + }, + { + "epoch": 1.0287586457954132, + "grad_norm": 13.3125, + "learning_rate": 7.967017020452058e-07, + "loss": 1.5311305522918701, + "step": 2826 + }, + { + "epoch": 1.0294867127775755, + "grad_norm": 21.125, + "learning_rate": 7.964334911814952e-07, + "loss": 1.5196905136108398, + "step": 2828 + }, + { + "epoch": 1.0302147797597379, + "grad_norm": 16.375, + "learning_rate": 7.961651638814579e-07, + "loss": 1.3057652711868286, + "step": 2830 + }, + { + "epoch": 1.0309428467419002, + "grad_norm": 10.0, + "learning_rate": 7.958967203041457e-07, + "loss": 1.0468437671661377, + "step": 2832 + }, + { + "epoch": 1.0316709137240627, + "grad_norm": 44.5, + "learning_rate": 7.956281606086796e-07, + "loss": 1.2331998348236084, + "step": 2834 + }, + { + "epoch": 1.032398980706225, + "grad_norm": 14.1875, + "learning_rate": 7.95359484954248e-07, + "loss": 1.4627890586853027, + "step": 2836 + }, + { + "epoch": 1.0331270476883874, + "grad_norm": 23.0, + "learning_rate": 7.950906935001091e-07, + "loss": 1.5515284538269043, + "step": 2838 + }, + { + "epoch": 1.0338551146705497, + "grad_norm": 13.6875, + "learning_rate": 7.948217864055899e-07, + "loss": 1.2405390739440918, + "step": 2840 + }, + { + "epoch": 1.034583181652712, + "grad_norm": 15.25, + "learning_rate": 7.94552763830085e-07, + "loss": 1.3742467164993286, + "step": 2842 + }, + { + "epoch": 1.0353112486348743, + "grad_norm": 18.25, + "learning_rate": 7.942836259330587e-07, + "loss": 0.9180659055709839, + "step": 2844 + }, + { + "epoch": 1.0360393156170367, + "grad_norm": 7.1875, + "learning_rate": 7.940143728740427e-07, + "loss": 1.3987147808074951, + "step": 2846 + }, + { + "epoch": 1.0367673825991992, + "grad_norm": 11.3125, + "learning_rate": 7.937450048126373e-07, + "loss": 1.1654677391052246, + "step": 2848 + }, + { + "epoch": 1.0374954495813615, + "grad_norm": 9.5, + "learning_rate": 7.934755219085107e-07, + "loss": 1.151093602180481, + "step": 2850 + }, + { + "epoch": 1.0382235165635239, + "grad_norm": 29.125, + "learning_rate": 7.932059243214e-07, + "loss": 1.1527493000030518, + "step": 2852 + }, + { + "epoch": 1.0389515835456862, + "grad_norm": 5.09375, + "learning_rate": 7.929362122111089e-07, + "loss": 1.2163081169128418, + "step": 2854 + }, + { + "epoch": 1.0396796505278485, + "grad_norm": 39.25, + "learning_rate": 7.926663857375104e-07, + "loss": 1.443825364112854, + "step": 2856 + }, + { + "epoch": 1.0404077175100108, + "grad_norm": 5.78125, + "learning_rate": 7.923964450605443e-07, + "loss": 1.2925432920455933, + "step": 2858 + }, + { + "epoch": 1.0411357844921734, + "grad_norm": 17.75, + "learning_rate": 7.921263903402186e-07, + "loss": 1.4616130590438843, + "step": 2860 + }, + { + "epoch": 1.0418638514743357, + "grad_norm": 4.84375, + "learning_rate": 7.91856221736609e-07, + "loss": 1.3662967681884766, + "step": 2862 + }, + { + "epoch": 1.042591918456498, + "grad_norm": 22.25, + "learning_rate": 7.915859394098579e-07, + "loss": 1.327544927597046, + "step": 2864 + }, + { + "epoch": 1.0433199854386603, + "grad_norm": 10.375, + "learning_rate": 7.913155435201757e-07, + "loss": 1.3802189826965332, + "step": 2866 + }, + { + "epoch": 1.0440480524208227, + "grad_norm": 3.5625, + "learning_rate": 7.910450342278408e-07, + "loss": 1.2094066143035889, + "step": 2868 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 36.0, + "learning_rate": 7.907744116931974e-07, + "loss": 1.4857620000839233, + "step": 2870 + }, + { + "epoch": 1.0455041863851475, + "grad_norm": 10.4375, + "learning_rate": 7.905036760766577e-07, + "loss": 1.3177602291107178, + "step": 2872 + }, + { + "epoch": 1.0462322533673099, + "grad_norm": 18.5, + "learning_rate": 7.902328275387007e-07, + "loss": 1.5900269746780396, + "step": 2874 + }, + { + "epoch": 1.0469603203494722, + "grad_norm": 18.625, + "learning_rate": 7.899618662398729e-07, + "loss": 1.1586143970489502, + "step": 2876 + }, + { + "epoch": 1.0476883873316345, + "grad_norm": 17.0, + "learning_rate": 7.896907923407863e-07, + "loss": 1.7054274082183838, + "step": 2878 + }, + { + "epoch": 1.0484164543137968, + "grad_norm": 24.5, + "learning_rate": 7.89419606002121e-07, + "loss": 0.6297630071640015, + "step": 2880 + }, + { + "epoch": 1.0491445212959591, + "grad_norm": 12.3125, + "learning_rate": 7.89148307384623e-07, + "loss": 1.1905344724655151, + "step": 2882 + }, + { + "epoch": 1.0498725882781217, + "grad_norm": 5.625, + "learning_rate": 7.888768966491055e-07, + "loss": 1.0476936101913452, + "step": 2884 + }, + { + "epoch": 1.050600655260284, + "grad_norm": 7.21875, + "learning_rate": 7.886053739564471e-07, + "loss": 0.9628153443336487, + "step": 2886 + }, + { + "epoch": 1.0513287222424463, + "grad_norm": 17.25, + "learning_rate": 7.883337394675938e-07, + "loss": 1.00679612159729, + "step": 2888 + }, + { + "epoch": 1.0520567892246087, + "grad_norm": 17.375, + "learning_rate": 7.880619933435573e-07, + "loss": 1.7509620189666748, + "step": 2890 + }, + { + "epoch": 1.052784856206771, + "grad_norm": 23.625, + "learning_rate": 7.877901357454158e-07, + "loss": 1.6114308834075928, + "step": 2892 + }, + { + "epoch": 1.0535129231889333, + "grad_norm": 29.0, + "learning_rate": 7.875181668343132e-07, + "loss": 1.9089949131011963, + "step": 2894 + }, + { + "epoch": 1.0542409901710958, + "grad_norm": 14.5625, + "learning_rate": 7.872460867714597e-07, + "loss": 1.3066809177398682, + "step": 2896 + }, + { + "epoch": 1.0549690571532582, + "grad_norm": 6.46875, + "learning_rate": 7.869738957181314e-07, + "loss": 1.410874843597412, + "step": 2898 + }, + { + "epoch": 1.0556971241354205, + "grad_norm": 8.625, + "learning_rate": 7.867015938356696e-07, + "loss": 1.055498719215393, + "step": 2900 + }, + { + "epoch": 1.0564251911175828, + "grad_norm": 12.9375, + "learning_rate": 7.864291812854824e-07, + "loss": 1.5530226230621338, + "step": 2902 + }, + { + "epoch": 1.0571532580997451, + "grad_norm": 11.0625, + "learning_rate": 7.861566582290425e-07, + "loss": 1.0250847339630127, + "step": 2904 + }, + { + "epoch": 1.0578813250819075, + "grad_norm": 18.875, + "learning_rate": 7.858840248278884e-07, + "loss": 1.655206322669983, + "step": 2906 + }, + { + "epoch": 1.0586093920640698, + "grad_norm": 12.1875, + "learning_rate": 7.856112812436245e-07, + "loss": 0.8612507581710815, + "step": 2908 + }, + { + "epoch": 1.0593374590462323, + "grad_norm": 12.5, + "learning_rate": 7.853384276379193e-07, + "loss": 1.5975005626678467, + "step": 2910 + }, + { + "epoch": 1.0600655260283947, + "grad_norm": 14.25, + "learning_rate": 7.850654641725081e-07, + "loss": 1.8806583881378174, + "step": 2912 + }, + { + "epoch": 1.060793593010557, + "grad_norm": 11.3125, + "learning_rate": 7.8479239100919e-07, + "loss": 1.5122857093811035, + "step": 2914 + }, + { + "epoch": 1.0615216599927193, + "grad_norm": 16.25, + "learning_rate": 7.8451920830983e-07, + "loss": 1.4064913988113403, + "step": 2916 + }, + { + "epoch": 1.0622497269748816, + "grad_norm": 5.78125, + "learning_rate": 7.842459162363573e-07, + "loss": 1.3637027740478516, + "step": 2918 + }, + { + "epoch": 1.062977793957044, + "grad_norm": 12.0625, + "learning_rate": 7.839725149507666e-07, + "loss": 1.4520833492279053, + "step": 2920 + }, + { + "epoch": 1.0637058609392065, + "grad_norm": 14.75, + "learning_rate": 7.836990046151166e-07, + "loss": 1.713083028793335, + "step": 2922 + }, + { + "epoch": 1.0644339279213688, + "grad_norm": 5.375, + "learning_rate": 7.834253853915316e-07, + "loss": 1.3338745832443237, + "step": 2924 + }, + { + "epoch": 1.0651619949035311, + "grad_norm": 16.375, + "learning_rate": 7.831516574421996e-07, + "loss": 1.5786457061767578, + "step": 2926 + }, + { + "epoch": 1.0658900618856935, + "grad_norm": 14.5625, + "learning_rate": 7.828778209293738e-07, + "loss": 1.4788072109222412, + "step": 2928 + }, + { + "epoch": 1.0666181288678558, + "grad_norm": 7.5, + "learning_rate": 7.826038760153706e-07, + "loss": 1.172348976135254, + "step": 2930 + }, + { + "epoch": 1.067346195850018, + "grad_norm": 47.5, + "learning_rate": 7.823298228625719e-07, + "loss": 1.1002264022827148, + "step": 2932 + }, + { + "epoch": 1.0680742628321807, + "grad_norm": 14.4375, + "learning_rate": 7.820556616334231e-07, + "loss": 1.4739265441894531, + "step": 2934 + }, + { + "epoch": 1.068802329814343, + "grad_norm": 14.6875, + "learning_rate": 7.817813924904337e-07, + "loss": 1.5285440683364868, + "step": 2936 + }, + { + "epoch": 1.0695303967965053, + "grad_norm": 16.25, + "learning_rate": 7.815070155961773e-07, + "loss": 1.623680830001831, + "step": 2938 + }, + { + "epoch": 1.0702584637786676, + "grad_norm": 13.8125, + "learning_rate": 7.812325311132915e-07, + "loss": 1.3830218315124512, + "step": 2940 + }, + { + "epoch": 1.07098653076083, + "grad_norm": 13.6875, + "learning_rate": 7.809579392044772e-07, + "loss": 0.9909210205078125, + "step": 2942 + }, + { + "epoch": 1.0717145977429923, + "grad_norm": 12.5, + "learning_rate": 7.806832400324991e-07, + "loss": 1.31742525100708, + "step": 2944 + }, + { + "epoch": 1.0724426647251548, + "grad_norm": 15.0, + "learning_rate": 7.804084337601866e-07, + "loss": 1.5436780452728271, + "step": 2946 + }, + { + "epoch": 1.0731707317073171, + "grad_norm": 22.5, + "learning_rate": 7.801335205504307e-07, + "loss": 1.3621745109558105, + "step": 2948 + }, + { + "epoch": 1.0738987986894795, + "grad_norm": 13.0, + "learning_rate": 7.798585005661873e-07, + "loss": 1.5048787593841553, + "step": 2950 + }, + { + "epoch": 1.0746268656716418, + "grad_norm": 12.3125, + "learning_rate": 7.795833739704745e-07, + "loss": 1.5376381874084473, + "step": 2952 + }, + { + "epoch": 1.075354932653804, + "grad_norm": 26.375, + "learning_rate": 7.793081409263744e-07, + "loss": 1.364687442779541, + "step": 2954 + }, + { + "epoch": 1.0760829996359664, + "grad_norm": 35.25, + "learning_rate": 7.79032801597032e-07, + "loss": 1.6569695472717285, + "step": 2956 + }, + { + "epoch": 1.076811066618129, + "grad_norm": 16.125, + "learning_rate": 7.787573561456551e-07, + "loss": 1.3665719032287598, + "step": 2958 + }, + { + "epoch": 1.0775391336002913, + "grad_norm": 4.5, + "learning_rate": 7.784818047355146e-07, + "loss": 1.2708712816238403, + "step": 2960 + }, + { + "epoch": 1.0782672005824536, + "grad_norm": 64.5, + "learning_rate": 7.782061475299439e-07, + "loss": 1.6397353410720825, + "step": 2962 + }, + { + "epoch": 1.078995267564616, + "grad_norm": 11.875, + "learning_rate": 7.779303846923395e-07, + "loss": 1.4247617721557617, + "step": 2964 + }, + { + "epoch": 1.0797233345467783, + "grad_norm": 17.125, + "learning_rate": 7.776545163861606e-07, + "loss": 1.4974164962768555, + "step": 2966 + }, + { + "epoch": 1.0804514015289406, + "grad_norm": 42.5, + "learning_rate": 7.773785427749282e-07, + "loss": 1.426703929901123, + "step": 2968 + }, + { + "epoch": 1.081179468511103, + "grad_norm": 9.875, + "learning_rate": 7.771024640222263e-07, + "loss": 1.8366591930389404, + "step": 2970 + }, + { + "epoch": 1.0819075354932655, + "grad_norm": 20.375, + "learning_rate": 7.768262802917014e-07, + "loss": 0.9733418822288513, + "step": 2972 + }, + { + "epoch": 1.0826356024754278, + "grad_norm": 27.375, + "learning_rate": 7.765499917470617e-07, + "loss": 1.597959280014038, + "step": 2974 + }, + { + "epoch": 1.08336366945759, + "grad_norm": 13.0, + "learning_rate": 7.762735985520777e-07, + "loss": 1.1030209064483643, + "step": 2976 + }, + { + "epoch": 1.0840917364397524, + "grad_norm": 22.375, + "learning_rate": 7.759971008705819e-07, + "loss": 1.6733412742614746, + "step": 2978 + }, + { + "epoch": 1.0848198034219148, + "grad_norm": 63.0, + "learning_rate": 7.757204988664691e-07, + "loss": 1.6756718158721924, + "step": 2980 + }, + { + "epoch": 1.085547870404077, + "grad_norm": 12.9375, + "learning_rate": 7.754437927036959e-07, + "loss": 1.4230358600616455, + "step": 2982 + }, + { + "epoch": 1.0862759373862396, + "grad_norm": 9.4375, + "learning_rate": 7.751669825462797e-07, + "loss": 0.9148332476615906, + "step": 2984 + }, + { + "epoch": 1.087004004368402, + "grad_norm": 4.46875, + "learning_rate": 7.748900685583007e-07, + "loss": 0.9252538681030273, + "step": 2986 + }, + { + "epoch": 1.0877320713505643, + "grad_norm": 14.6875, + "learning_rate": 7.746130509039004e-07, + "loss": 1.4372045993804932, + "step": 2988 + }, + { + "epoch": 1.0884601383327266, + "grad_norm": 20.375, + "learning_rate": 7.743359297472809e-07, + "loss": 1.287437081336975, + "step": 2990 + }, + { + "epoch": 1.089188205314889, + "grad_norm": 10.5625, + "learning_rate": 7.740587052527068e-07, + "loss": 1.4651877880096436, + "step": 2992 + }, + { + "epoch": 1.0899162722970512, + "grad_norm": 19.125, + "learning_rate": 7.737813775845032e-07, + "loss": 1.935342788696289, + "step": 2994 + }, + { + "epoch": 1.0906443392792138, + "grad_norm": 14.0, + "learning_rate": 7.73503946907057e-07, + "loss": 1.33158540725708, + "step": 2996 + }, + { + "epoch": 1.091372406261376, + "grad_norm": 14.625, + "learning_rate": 7.732264133848152e-07, + "loss": 1.1597857475280762, + "step": 2998 + }, + { + "epoch": 1.0921004732435384, + "grad_norm": 14.1875, + "learning_rate": 7.729487771822866e-07, + "loss": 1.251415491104126, + "step": 3000 + }, + { + "epoch": 1.0928285402257008, + "grad_norm": 14.0, + "learning_rate": 7.726710384640407e-07, + "loss": 1.253814458847046, + "step": 3002 + }, + { + "epoch": 1.093556607207863, + "grad_norm": 15.0, + "learning_rate": 7.723931973947075e-07, + "loss": 1.5032384395599365, + "step": 3004 + }, + { + "epoch": 1.0942846741900254, + "grad_norm": 13.375, + "learning_rate": 7.721152541389779e-07, + "loss": 1.302152395248413, + "step": 3006 + }, + { + "epoch": 1.095012741172188, + "grad_norm": 33.5, + "learning_rate": 7.718372088616034e-07, + "loss": 1.6317181587219238, + "step": 3008 + }, + { + "epoch": 1.0957408081543503, + "grad_norm": 6.46875, + "learning_rate": 7.715590617273955e-07, + "loss": 1.575524926185608, + "step": 3010 + }, + { + "epoch": 1.0964688751365126, + "grad_norm": 10.8125, + "learning_rate": 7.712808129012267e-07, + "loss": 1.5187182426452637, + "step": 3012 + }, + { + "epoch": 1.097196942118675, + "grad_norm": 4.125, + "learning_rate": 7.710024625480296e-07, + "loss": 1.0924588441848755, + "step": 3014 + }, + { + "epoch": 1.0979250091008372, + "grad_norm": 10.875, + "learning_rate": 7.707240108327966e-07, + "loss": 1.3684775829315186, + "step": 3016 + }, + { + "epoch": 1.0986530760829996, + "grad_norm": 20.125, + "learning_rate": 7.704454579205808e-07, + "loss": 1.1852086782455444, + "step": 3018 + }, + { + "epoch": 1.099381143065162, + "grad_norm": 6.9375, + "learning_rate": 7.701668039764951e-07, + "loss": 1.4629124402999878, + "step": 3020 + }, + { + "epoch": 1.1001092100473244, + "grad_norm": 14.75, + "learning_rate": 7.698880491657116e-07, + "loss": 1.227146029472351, + "step": 3022 + }, + { + "epoch": 1.1008372770294867, + "grad_norm": 10.125, + "learning_rate": 7.69609193653463e-07, + "loss": 0.8923517465591431, + "step": 3024 + }, + { + "epoch": 1.101565344011649, + "grad_norm": 43.75, + "learning_rate": 7.693302376050416e-07, + "loss": 1.8073244094848633, + "step": 3026 + }, + { + "epoch": 1.1022934109938114, + "grad_norm": 27.125, + "learning_rate": 7.690511811857991e-07, + "loss": 1.5364580154418945, + "step": 3028 + }, + { + "epoch": 1.1030214779759737, + "grad_norm": 9.3125, + "learning_rate": 7.687720245611464e-07, + "loss": 1.1297965049743652, + "step": 3030 + }, + { + "epoch": 1.103749544958136, + "grad_norm": 10.1875, + "learning_rate": 7.684927678965547e-07, + "loss": 1.077048897743225, + "step": 3032 + }, + { + "epoch": 1.1044776119402986, + "grad_norm": 12.0, + "learning_rate": 7.682134113575533e-07, + "loss": 1.4534276723861694, + "step": 3034 + }, + { + "epoch": 1.105205678922461, + "grad_norm": 13.0625, + "learning_rate": 7.679339551097317e-07, + "loss": 1.5194637775421143, + "step": 3036 + }, + { + "epoch": 1.1059337459046232, + "grad_norm": 13.8125, + "learning_rate": 7.676543993187381e-07, + "loss": 1.740049123764038, + "step": 3038 + }, + { + "epoch": 1.1066618128867856, + "grad_norm": 12.5, + "learning_rate": 7.673747441502796e-07, + "loss": 1.509361982345581, + "step": 3040 + }, + { + "epoch": 1.1073898798689479, + "grad_norm": 15.3125, + "learning_rate": 7.670949897701224e-07, + "loss": 1.1496083736419678, + "step": 3042 + }, + { + "epoch": 1.1081179468511102, + "grad_norm": 3.90625, + "learning_rate": 7.668151363440915e-07, + "loss": 1.4118932485580444, + "step": 3044 + }, + { + "epoch": 1.1088460138332727, + "grad_norm": 24.375, + "learning_rate": 7.665351840380705e-07, + "loss": 1.7153046131134033, + "step": 3046 + }, + { + "epoch": 1.109574080815435, + "grad_norm": 6.28125, + "learning_rate": 7.662551330180016e-07, + "loss": 1.2397010326385498, + "step": 3048 + }, + { + "epoch": 1.1103021477975974, + "grad_norm": 9.0625, + "learning_rate": 7.659749834498854e-07, + "loss": 1.3957154750823975, + "step": 3050 + }, + { + "epoch": 1.1110302147797597, + "grad_norm": 16.5, + "learning_rate": 7.656947354997816e-07, + "loss": 1.4913358688354492, + "step": 3052 + }, + { + "epoch": 1.111758281761922, + "grad_norm": 9.0, + "learning_rate": 7.654143893338074e-07, + "loss": 1.1892744302749634, + "step": 3054 + }, + { + "epoch": 1.1124863487440844, + "grad_norm": 16.375, + "learning_rate": 7.651339451181384e-07, + "loss": 1.3129997253417969, + "step": 3056 + }, + { + "epoch": 1.113214415726247, + "grad_norm": 11.125, + "learning_rate": 7.648534030190088e-07, + "loss": 1.4636383056640625, + "step": 3058 + }, + { + "epoch": 1.1139424827084092, + "grad_norm": 39.75, + "learning_rate": 7.645727632027102e-07, + "loss": 1.650086760520935, + "step": 3060 + }, + { + "epoch": 1.1146705496905716, + "grad_norm": 16.75, + "learning_rate": 7.642920258355923e-07, + "loss": 1.565767526626587, + "step": 3062 + }, + { + "epoch": 1.1153986166727339, + "grad_norm": 32.75, + "learning_rate": 7.640111910840628e-07, + "loss": 1.6336307525634766, + "step": 3064 + }, + { + "epoch": 1.1161266836548962, + "grad_norm": 19.25, + "learning_rate": 7.637302591145873e-07, + "loss": 1.471846580505371, + "step": 3066 + }, + { + "epoch": 1.1168547506370585, + "grad_norm": 20.75, + "learning_rate": 7.634492300936886e-07, + "loss": 1.5384643077850342, + "step": 3068 + }, + { + "epoch": 1.117582817619221, + "grad_norm": 9.9375, + "learning_rate": 7.631681041879467e-07, + "loss": 1.3491666316986084, + "step": 3070 + }, + { + "epoch": 1.1183108846013834, + "grad_norm": 5.21875, + "learning_rate": 7.628868815640004e-07, + "loss": 1.2024714946746826, + "step": 3072 + }, + { + "epoch": 1.1190389515835457, + "grad_norm": 21.0, + "learning_rate": 7.626055623885444e-07, + "loss": 1.7679624557495117, + "step": 3074 + }, + { + "epoch": 1.119767018565708, + "grad_norm": 12.875, + "learning_rate": 7.623241468283312e-07, + "loss": 0.9320430755615234, + "step": 3076 + }, + { + "epoch": 1.1204950855478704, + "grad_norm": 20.625, + "learning_rate": 7.620426350501705e-07, + "loss": 0.9064109325408936, + "step": 3078 + }, + { + "epoch": 1.1212231525300327, + "grad_norm": 11.625, + "learning_rate": 7.61761027220929e-07, + "loss": 1.417181372642517, + "step": 3080 + }, + { + "epoch": 1.1219512195121952, + "grad_norm": 12.8125, + "learning_rate": 7.614793235075301e-07, + "loss": 1.5234074592590332, + "step": 3082 + }, + { + "epoch": 1.1226792864943576, + "grad_norm": 6.28125, + "learning_rate": 7.611975240769541e-07, + "loss": 1.6155809164047241, + "step": 3084 + }, + { + "epoch": 1.1234073534765199, + "grad_norm": 18.625, + "learning_rate": 7.609156290962389e-07, + "loss": 1.2171564102172852, + "step": 3086 + }, + { + "epoch": 1.1241354204586822, + "grad_norm": 9.5625, + "learning_rate": 7.606336387324775e-07, + "loss": 1.363595724105835, + "step": 3088 + }, + { + "epoch": 1.1248634874408445, + "grad_norm": 17.5, + "learning_rate": 7.603515531528202e-07, + "loss": 1.2106103897094727, + "step": 3090 + }, + { + "epoch": 1.1255915544230068, + "grad_norm": 21.0, + "learning_rate": 7.600693725244744e-07, + "loss": 1.4685299396514893, + "step": 3092 + }, + { + "epoch": 1.1263196214051692, + "grad_norm": 5.15625, + "learning_rate": 7.597870970147029e-07, + "loss": 1.169956088066101, + "step": 3094 + }, + { + "epoch": 1.1270476883873317, + "grad_norm": 11.375, + "learning_rate": 7.595047267908248e-07, + "loss": 1.410547137260437, + "step": 3096 + }, + { + "epoch": 1.127775755369494, + "grad_norm": 17.125, + "learning_rate": 7.592222620202158e-07, + "loss": 1.680928111076355, + "step": 3098 + }, + { + "epoch": 1.1285038223516564, + "grad_norm": 17.25, + "learning_rate": 7.589397028703073e-07, + "loss": 1.155842900276184, + "step": 3100 + }, + { + "epoch": 1.1292318893338187, + "grad_norm": 3.828125, + "learning_rate": 7.586570495085871e-07, + "loss": 1.3301548957824707, + "step": 3102 + }, + { + "epoch": 1.129959956315981, + "grad_norm": 38.5, + "learning_rate": 7.583743021025983e-07, + "loss": 0.9927915334701538, + "step": 3104 + }, + { + "epoch": 1.1306880232981436, + "grad_norm": 10.0, + "learning_rate": 7.5809146081994e-07, + "loss": 1.1140646934509277, + "step": 3106 + }, + { + "epoch": 1.1314160902803059, + "grad_norm": 14.4375, + "learning_rate": 7.578085258282667e-07, + "loss": 1.6815648078918457, + "step": 3108 + }, + { + "epoch": 1.1321441572624682, + "grad_norm": 20.0, + "learning_rate": 7.575254972952889e-07, + "loss": 1.5611786842346191, + "step": 3110 + }, + { + "epoch": 1.1328722242446305, + "grad_norm": 9.25, + "learning_rate": 7.572423753887725e-07, + "loss": 1.3067207336425781, + "step": 3112 + }, + { + "epoch": 1.1336002912267928, + "grad_norm": 16.0, + "learning_rate": 7.569591602765381e-07, + "loss": 1.182687759399414, + "step": 3114 + }, + { + "epoch": 1.1343283582089552, + "grad_norm": 15.9375, + "learning_rate": 7.566758521264622e-07, + "loss": 1.4300012588500977, + "step": 3116 + }, + { + "epoch": 1.1350564251911175, + "grad_norm": 17.0, + "learning_rate": 7.563924511064763e-07, + "loss": 1.6996471881866455, + "step": 3118 + }, + { + "epoch": 1.13578449217328, + "grad_norm": 22.625, + "learning_rate": 7.561089573845667e-07, + "loss": 0.9931000471115112, + "step": 3120 + }, + { + "epoch": 1.1365125591554424, + "grad_norm": 6.03125, + "learning_rate": 7.558253711287752e-07, + "loss": 1.2983801364898682, + "step": 3122 + }, + { + "epoch": 1.1372406261376047, + "grad_norm": 16.5, + "learning_rate": 7.555416925071976e-07, + "loss": 1.4925622940063477, + "step": 3124 + }, + { + "epoch": 1.137968693119767, + "grad_norm": 6.84375, + "learning_rate": 7.552579216879853e-07, + "loss": 1.3099830150604248, + "step": 3126 + }, + { + "epoch": 1.1386967601019293, + "grad_norm": 161.0, + "learning_rate": 7.549740588393439e-07, + "loss": 1.26654052734375, + "step": 3128 + }, + { + "epoch": 1.1394248270840917, + "grad_norm": 17.625, + "learning_rate": 7.546901041295337e-07, + "loss": 1.4521512985229492, + "step": 3130 + }, + { + "epoch": 1.140152894066254, + "grad_norm": 16.25, + "learning_rate": 7.544060577268693e-07, + "loss": 1.4011918306350708, + "step": 3132 + }, + { + "epoch": 1.1408809610484165, + "grad_norm": 16.25, + "learning_rate": 7.541219197997199e-07, + "loss": 1.4144008159637451, + "step": 3134 + }, + { + "epoch": 1.1416090280305788, + "grad_norm": 10.875, + "learning_rate": 7.538376905165084e-07, + "loss": 1.4438748359680176, + "step": 3136 + }, + { + "epoch": 1.1423370950127412, + "grad_norm": 17.5, + "learning_rate": 7.535533700457131e-07, + "loss": 1.7868558168411255, + "step": 3138 + }, + { + "epoch": 1.1430651619949035, + "grad_norm": 34.25, + "learning_rate": 7.532689585558645e-07, + "loss": 1.4058823585510254, + "step": 3140 + }, + { + "epoch": 1.1437932289770658, + "grad_norm": 13.25, + "learning_rate": 7.529844562155485e-07, + "loss": 1.2535533905029297, + "step": 3142 + }, + { + "epoch": 1.1445212959592284, + "grad_norm": 14.75, + "learning_rate": 7.526998631934048e-07, + "loss": 1.3866677284240723, + "step": 3144 + }, + { + "epoch": 1.1452493629413907, + "grad_norm": 16.125, + "learning_rate": 7.524151796581255e-07, + "loss": 1.68190598487854, + "step": 3146 + }, + { + "epoch": 1.145977429923553, + "grad_norm": 9.875, + "learning_rate": 7.521304057784584e-07, + "loss": 1.345764398574829, + "step": 3148 + }, + { + "epoch": 1.1467054969057153, + "grad_norm": 25.625, + "learning_rate": 7.518455417232028e-07, + "loss": 1.1426867246627808, + "step": 3150 + }, + { + "epoch": 1.1474335638878776, + "grad_norm": 17.75, + "learning_rate": 7.51560587661213e-07, + "loss": 1.9592173099517822, + "step": 3152 + }, + { + "epoch": 1.14816163087004, + "grad_norm": 20.0, + "learning_rate": 7.51275543761396e-07, + "loss": 1.166360855102539, + "step": 3154 + }, + { + "epoch": 1.1488896978522023, + "grad_norm": 20.5, + "learning_rate": 7.509904101927118e-07, + "loss": 1.3629536628723145, + "step": 3156 + }, + { + "epoch": 1.1496177648343648, + "grad_norm": 16.25, + "learning_rate": 7.507051871241743e-07, + "loss": 1.4413706064224243, + "step": 3158 + }, + { + "epoch": 1.1503458318165272, + "grad_norm": 9.6875, + "learning_rate": 7.504198747248495e-07, + "loss": 1.3213022947311401, + "step": 3160 + }, + { + "epoch": 1.1510738987986895, + "grad_norm": 13.6875, + "learning_rate": 7.501344731638575e-07, + "loss": 1.173963189125061, + "step": 3162 + }, + { + "epoch": 1.1518019657808518, + "grad_norm": 12.8125, + "learning_rate": 7.498489826103703e-07, + "loss": 1.249001383781433, + "step": 3164 + }, + { + "epoch": 1.1525300327630141, + "grad_norm": 21.375, + "learning_rate": 7.495634032336126e-07, + "loss": 1.4631551504135132, + "step": 3166 + }, + { + "epoch": 1.1532580997451767, + "grad_norm": 15.0625, + "learning_rate": 7.49277735202863e-07, + "loss": 1.4307080507278442, + "step": 3168 + }, + { + "epoch": 1.153986166727339, + "grad_norm": 18.0, + "learning_rate": 7.489919786874507e-07, + "loss": 1.4381202459335327, + "step": 3170 + }, + { + "epoch": 1.1547142337095013, + "grad_norm": 16.25, + "learning_rate": 7.487061338567595e-07, + "loss": 1.8269298076629639, + "step": 3172 + }, + { + "epoch": 1.1554423006916636, + "grad_norm": 17.125, + "learning_rate": 7.484202008802236e-07, + "loss": 1.4004244804382324, + "step": 3174 + }, + { + "epoch": 1.156170367673826, + "grad_norm": 13.9375, + "learning_rate": 7.481341799273306e-07, + "loss": 1.3714299201965332, + "step": 3176 + }, + { + "epoch": 1.1568984346559883, + "grad_norm": 17.0, + "learning_rate": 7.478480711676203e-07, + "loss": 1.2340729236602783, + "step": 3178 + }, + { + "epoch": 1.1576265016381506, + "grad_norm": 8.8125, + "learning_rate": 7.475618747706838e-07, + "loss": 0.9506329298019409, + "step": 3180 + }, + { + "epoch": 1.1583545686203132, + "grad_norm": 7.125, + "learning_rate": 7.472755909061645e-07, + "loss": 1.4006938934326172, + "step": 3182 + }, + { + "epoch": 1.1590826356024755, + "grad_norm": 21.375, + "learning_rate": 7.46989219743758e-07, + "loss": 1.3743348121643066, + "step": 3184 + }, + { + "epoch": 1.1598107025846378, + "grad_norm": 15.6875, + "learning_rate": 7.467027614532113e-07, + "loss": 1.398172378540039, + "step": 3186 + }, + { + "epoch": 1.1605387695668001, + "grad_norm": 17.5, + "learning_rate": 7.464162162043233e-07, + "loss": 1.6854944229125977, + "step": 3188 + }, + { + "epoch": 1.1612668365489625, + "grad_norm": 19.375, + "learning_rate": 7.461295841669435e-07, + "loss": 1.7926838397979736, + "step": 3190 + }, + { + "epoch": 1.1619949035311248, + "grad_norm": 6.65625, + "learning_rate": 7.458428655109745e-07, + "loss": 1.179733157157898, + "step": 3192 + }, + { + "epoch": 1.162722970513287, + "grad_norm": 5.28125, + "learning_rate": 7.455560604063687e-07, + "loss": 1.0275070667266846, + "step": 3194 + }, + { + "epoch": 1.1634510374954496, + "grad_norm": 11.375, + "learning_rate": 7.452691690231305e-07, + "loss": 1.2018892765045166, + "step": 3196 + }, + { + "epoch": 1.164179104477612, + "grad_norm": 14.5, + "learning_rate": 7.449821915313154e-07, + "loss": 1.514192819595337, + "step": 3198 + }, + { + "epoch": 1.1649071714597743, + "grad_norm": 14.875, + "learning_rate": 7.446951281010301e-07, + "loss": 1.4654862880706787, + "step": 3200 + }, + { + "epoch": 1.1656352384419366, + "grad_norm": 5.03125, + "learning_rate": 7.444079789024318e-07, + "loss": 1.0592668056488037, + "step": 3202 + }, + { + "epoch": 1.166363305424099, + "grad_norm": 18.625, + "learning_rate": 7.441207441057284e-07, + "loss": 1.1930619478225708, + "step": 3204 + }, + { + "epoch": 1.1670913724062615, + "grad_norm": 13.5625, + "learning_rate": 7.438334238811792e-07, + "loss": 1.378472089767456, + "step": 3206 + }, + { + "epoch": 1.1678194393884238, + "grad_norm": 21.875, + "learning_rate": 7.435460183990938e-07, + "loss": 1.5339888334274292, + "step": 3208 + }, + { + "epoch": 1.1685475063705861, + "grad_norm": 16.625, + "learning_rate": 7.432585278298324e-07, + "loss": 1.2837213277816772, + "step": 3210 + }, + { + "epoch": 1.1692755733527485, + "grad_norm": 21.0, + "learning_rate": 7.429709523438055e-07, + "loss": 1.6055446863174438, + "step": 3212 + }, + { + "epoch": 1.1700036403349108, + "grad_norm": 9.8125, + "learning_rate": 7.426832921114738e-07, + "loss": 1.656429409980774, + "step": 3214 + }, + { + "epoch": 1.170731707317073, + "grad_norm": 31.625, + "learning_rate": 7.423955473033488e-07, + "loss": 1.295668363571167, + "step": 3216 + }, + { + "epoch": 1.1714597742992354, + "grad_norm": 20.75, + "learning_rate": 7.421077180899915e-07, + "loss": 1.4649769067764282, + "step": 3218 + }, + { + "epoch": 1.172187841281398, + "grad_norm": 15.25, + "learning_rate": 7.418198046420137e-07, + "loss": 1.5466828346252441, + "step": 3220 + }, + { + "epoch": 1.1729159082635603, + "grad_norm": 12.9375, + "learning_rate": 7.415318071300762e-07, + "loss": 1.2343864440917969, + "step": 3222 + }, + { + "epoch": 1.1736439752457226, + "grad_norm": 10.375, + "learning_rate": 7.412437257248901e-07, + "loss": 1.4467864036560059, + "step": 3224 + }, + { + "epoch": 1.174372042227885, + "grad_norm": 15.625, + "learning_rate": 7.409555605972162e-07, + "loss": 1.602095603942871, + "step": 3226 + }, + { + "epoch": 1.1751001092100473, + "grad_norm": 6.59375, + "learning_rate": 7.406673119178653e-07, + "loss": 1.6207306385040283, + "step": 3228 + }, + { + "epoch": 1.1758281761922098, + "grad_norm": 37.25, + "learning_rate": 7.403789798576967e-07, + "loss": 1.4914159774780273, + "step": 3230 + }, + { + "epoch": 1.1765562431743721, + "grad_norm": 25.125, + "learning_rate": 7.400905645876203e-07, + "loss": 1.7298457622528076, + "step": 3232 + }, + { + "epoch": 1.1772843101565345, + "grad_norm": 10.875, + "learning_rate": 7.398020662785946e-07, + "loss": 1.2807588577270508, + "step": 3234 + }, + { + "epoch": 1.1780123771386968, + "grad_norm": 19.125, + "learning_rate": 7.395134851016276e-07, + "loss": 1.6460895538330078, + "step": 3236 + }, + { + "epoch": 1.178740444120859, + "grad_norm": 16.875, + "learning_rate": 7.392248212277763e-07, + "loss": 1.6865427494049072, + "step": 3238 + }, + { + "epoch": 1.1794685111030214, + "grad_norm": 6.40625, + "learning_rate": 7.389360748281468e-07, + "loss": 1.2292823791503906, + "step": 3240 + }, + { + "epoch": 1.1801965780851837, + "grad_norm": 18.125, + "learning_rate": 7.386472460738942e-07, + "loss": 1.1814864873886108, + "step": 3242 + }, + { + "epoch": 1.1809246450673463, + "grad_norm": 10.3125, + "learning_rate": 7.383583351362219e-07, + "loss": 1.4591881036758423, + "step": 3244 + }, + { + "epoch": 1.1816527120495086, + "grad_norm": 24.375, + "learning_rate": 7.380693421863829e-07, + "loss": 1.7987847328186035, + "step": 3246 + }, + { + "epoch": 1.182380779031671, + "grad_norm": 9.75, + "learning_rate": 7.377802673956779e-07, + "loss": 1.1932018995285034, + "step": 3248 + }, + { + "epoch": 1.1831088460138333, + "grad_norm": 19.875, + "learning_rate": 7.374911109354569e-07, + "loss": 0.9341808557510376, + "step": 3250 + }, + { + "epoch": 1.1838369129959956, + "grad_norm": 18.5, + "learning_rate": 7.372018729771176e-07, + "loss": 0.28610512614250183, + "step": 3252 + }, + { + "epoch": 1.184564979978158, + "grad_norm": 21.25, + "learning_rate": 7.369125536921064e-07, + "loss": 1.634751558303833, + "step": 3254 + }, + { + "epoch": 1.1852930469603202, + "grad_norm": 14.75, + "learning_rate": 7.366231532519183e-07, + "loss": 1.8596796989440918, + "step": 3256 + }, + { + "epoch": 1.1860211139424828, + "grad_norm": 14.125, + "learning_rate": 7.363336718280952e-07, + "loss": 1.4375503063201904, + "step": 3258 + }, + { + "epoch": 1.186749180924645, + "grad_norm": 10.625, + "learning_rate": 7.360441095922285e-07, + "loss": 1.6593804359436035, + "step": 3260 + }, + { + "epoch": 1.1874772479068074, + "grad_norm": 1600.0, + "learning_rate": 7.357544667159563e-07, + "loss": 1.4019763469696045, + "step": 3262 + }, + { + "epoch": 1.1882053148889697, + "grad_norm": 11.0, + "learning_rate": 7.354647433709653e-07, + "loss": 1.5358561277389526, + "step": 3264 + }, + { + "epoch": 1.188933381871132, + "grad_norm": 12.5, + "learning_rate": 7.351749397289893e-07, + "loss": 1.483398675918579, + "step": 3266 + }, + { + "epoch": 1.1896614488532946, + "grad_norm": 9.875, + "learning_rate": 7.348850559618101e-07, + "loss": 1.4304120540618896, + "step": 3268 + }, + { + "epoch": 1.190389515835457, + "grad_norm": 9.375, + "learning_rate": 7.345950922412567e-07, + "loss": 1.2050080299377441, + "step": 3270 + }, + { + "epoch": 1.1911175828176193, + "grad_norm": 10.5625, + "learning_rate": 7.34305048739206e-07, + "loss": 1.4453988075256348, + "step": 3272 + }, + { + "epoch": 1.1918456497997816, + "grad_norm": 24.625, + "learning_rate": 7.340149256275813e-07, + "loss": 1.4199376106262207, + "step": 3274 + }, + { + "epoch": 1.192573716781944, + "grad_norm": 15.0625, + "learning_rate": 7.337247230783541e-07, + "loss": 1.56522536277771, + "step": 3276 + }, + { + "epoch": 1.1933017837641062, + "grad_norm": 10.0, + "learning_rate": 7.334344412635423e-07, + "loss": 1.5458581447601318, + "step": 3278 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 6.875, + "learning_rate": 7.331440803552109e-07, + "loss": 1.009720802307129, + "step": 3280 + }, + { + "epoch": 1.194757917728431, + "grad_norm": 18.5, + "learning_rate": 7.328536405254722e-07, + "loss": 1.4484598636627197, + "step": 3282 + }, + { + "epoch": 1.1954859847105934, + "grad_norm": 15.0625, + "learning_rate": 7.325631219464847e-07, + "loss": 1.1317164897918701, + "step": 3284 + }, + { + "epoch": 1.1962140516927557, + "grad_norm": 25.75, + "learning_rate": 7.32272524790454e-07, + "loss": 1.3629131317138672, + "step": 3286 + }, + { + "epoch": 1.196942118674918, + "grad_norm": 13.5625, + "learning_rate": 7.31981849229632e-07, + "loss": 1.4233986139297485, + "step": 3288 + }, + { + "epoch": 1.1976701856570804, + "grad_norm": 13.3125, + "learning_rate": 7.316910954363174e-07, + "loss": 1.210113286972046, + "step": 3290 + }, + { + "epoch": 1.198398252639243, + "grad_norm": 3.21875, + "learning_rate": 7.314002635828547e-07, + "loss": 1.1558411121368408, + "step": 3292 + }, + { + "epoch": 1.1991263196214053, + "grad_norm": 13.0625, + "learning_rate": 7.31109353841635e-07, + "loss": 1.380199670791626, + "step": 3294 + }, + { + "epoch": 1.1998543866035676, + "grad_norm": 18.125, + "learning_rate": 7.308183663850962e-07, + "loss": 1.275402545928955, + "step": 3296 + }, + { + "epoch": 1.20058245358573, + "grad_norm": 14.3125, + "learning_rate": 7.305273013857214e-07, + "loss": 1.3778529167175293, + "step": 3298 + }, + { + "epoch": 1.2013105205678922, + "grad_norm": 12.5625, + "learning_rate": 7.302361590160395e-07, + "loss": 1.7238298654556274, + "step": 3300 + }, + { + "epoch": 1.2020385875500545, + "grad_norm": 11.875, + "learning_rate": 7.299449394486261e-07, + "loss": 1.4809215068817139, + "step": 3302 + }, + { + "epoch": 1.2027666545322169, + "grad_norm": 4.4375, + "learning_rate": 7.296536428561024e-07, + "loss": 1.3073759078979492, + "step": 3304 + }, + { + "epoch": 1.2034947215143794, + "grad_norm": 77.0, + "learning_rate": 7.293622694111345e-07, + "loss": 1.449177622795105, + "step": 3306 + }, + { + "epoch": 1.2042227884965417, + "grad_norm": 17.375, + "learning_rate": 7.290708192864347e-07, + "loss": 1.6258866786956787, + "step": 3308 + }, + { + "epoch": 1.204950855478704, + "grad_norm": 13.4375, + "learning_rate": 7.287792926547607e-07, + "loss": 1.3763225078582764, + "step": 3310 + }, + { + "epoch": 1.2056789224608664, + "grad_norm": 6.84375, + "learning_rate": 7.284876896889156e-07, + "loss": 0.9770424365997314, + "step": 3312 + }, + { + "epoch": 1.2064069894430287, + "grad_norm": 15.6875, + "learning_rate": 7.281960105617473e-07, + "loss": 1.029457926750183, + "step": 3314 + }, + { + "epoch": 1.207135056425191, + "grad_norm": 17.75, + "learning_rate": 7.279042554461494e-07, + "loss": 1.4875173568725586, + "step": 3316 + }, + { + "epoch": 1.2078631234073534, + "grad_norm": 15.6875, + "learning_rate": 7.276124245150603e-07, + "loss": 1.3963212966918945, + "step": 3318 + }, + { + "epoch": 1.208591190389516, + "grad_norm": 9.625, + "learning_rate": 7.273205179414633e-07, + "loss": 1.3795726299285889, + "step": 3320 + }, + { + "epoch": 1.2093192573716782, + "grad_norm": 5.84375, + "learning_rate": 7.270285358983863e-07, + "loss": 1.0082175731658936, + "step": 3322 + }, + { + "epoch": 1.2100473243538405, + "grad_norm": 14.25, + "learning_rate": 7.267364785589026e-07, + "loss": 0.7051547169685364, + "step": 3324 + }, + { + "epoch": 1.2107753913360029, + "grad_norm": 4.96875, + "learning_rate": 7.264443460961296e-07, + "loss": 1.1940947771072388, + "step": 3326 + }, + { + "epoch": 1.2115034583181652, + "grad_norm": 15.375, + "learning_rate": 7.261521386832294e-07, + "loss": 1.3826723098754883, + "step": 3328 + }, + { + "epoch": 1.2122315253003277, + "grad_norm": 73.0, + "learning_rate": 7.258598564934082e-07, + "loss": 0.9582946300506592, + "step": 3330 + }, + { + "epoch": 1.21295959228249, + "grad_norm": 42.75, + "learning_rate": 7.255674996999174e-07, + "loss": 1.3178778886795044, + "step": 3332 + }, + { + "epoch": 1.2136876592646524, + "grad_norm": 14.1875, + "learning_rate": 7.252750684760518e-07, + "loss": 1.4391541481018066, + "step": 3334 + }, + { + "epoch": 1.2144157262468147, + "grad_norm": 3.90625, + "learning_rate": 7.249825629951505e-07, + "loss": 1.195455551147461, + "step": 3336 + }, + { + "epoch": 1.215143793228977, + "grad_norm": 22.5, + "learning_rate": 7.246899834305964e-07, + "loss": 1.4570657014846802, + "step": 3338 + }, + { + "epoch": 1.2158718602111394, + "grad_norm": 10.125, + "learning_rate": 7.243973299558172e-07, + "loss": 1.7970304489135742, + "step": 3340 + }, + { + "epoch": 1.2165999271933017, + "grad_norm": 9.0, + "learning_rate": 7.241046027442832e-07, + "loss": 1.3954393863677979, + "step": 3342 + }, + { + "epoch": 1.2173279941754642, + "grad_norm": 21.625, + "learning_rate": 7.238118019695097e-07, + "loss": 1.4312398433685303, + "step": 3344 + }, + { + "epoch": 1.2180560611576265, + "grad_norm": 14.25, + "learning_rate": 7.235189278050542e-07, + "loss": 1.3827424049377441, + "step": 3346 + }, + { + "epoch": 1.2187841281397889, + "grad_norm": 25.375, + "learning_rate": 7.232259804245191e-07, + "loss": 0.8970829248428345, + "step": 3348 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 10.5, + "learning_rate": 7.229329600015488e-07, + "loss": 0.8581031560897827, + "step": 3350 + }, + { + "epoch": 1.2202402621041135, + "grad_norm": 16.5, + "learning_rate": 7.226398667098321e-07, + "loss": 1.2529585361480713, + "step": 3352 + }, + { + "epoch": 1.220968329086276, + "grad_norm": 44.25, + "learning_rate": 7.223467007231008e-07, + "loss": 1.3656203746795654, + "step": 3354 + }, + { + "epoch": 1.2216963960684384, + "grad_norm": 5.65625, + "learning_rate": 7.220534622151293e-07, + "loss": 1.2666774988174438, + "step": 3356 + }, + { + "epoch": 1.2224244630506007, + "grad_norm": 9.0625, + "learning_rate": 7.217601513597353e-07, + "loss": 1.1210920810699463, + "step": 3358 + }, + { + "epoch": 1.223152530032763, + "grad_norm": 11.5, + "learning_rate": 7.214667683307795e-07, + "loss": 1.019651174545288, + "step": 3360 + }, + { + "epoch": 1.2238805970149254, + "grad_norm": 14.5625, + "learning_rate": 7.211733133021652e-07, + "loss": 1.2983779907226562, + "step": 3362 + }, + { + "epoch": 1.2246086639970877, + "grad_norm": 44.25, + "learning_rate": 7.208797864478382e-07, + "loss": 1.4012659788131714, + "step": 3364 + }, + { + "epoch": 1.22533673097925, + "grad_norm": 17.125, + "learning_rate": 7.205861879417876e-07, + "loss": 1.0330891609191895, + "step": 3366 + }, + { + "epoch": 1.2260647979614125, + "grad_norm": 39.5, + "learning_rate": 7.202925179580441e-07, + "loss": 1.2626029253005981, + "step": 3368 + }, + { + "epoch": 1.2267928649435749, + "grad_norm": 6.21875, + "learning_rate": 7.199987766706812e-07, + "loss": 1.1249897480010986, + "step": 3370 + }, + { + "epoch": 1.2275209319257372, + "grad_norm": 9.375, + "learning_rate": 7.197049642538147e-07, + "loss": 1.238515019416809, + "step": 3372 + }, + { + "epoch": 1.2282489989078995, + "grad_norm": 9.8125, + "learning_rate": 7.194110808816025e-07, + "loss": 0.7849839925765991, + "step": 3374 + }, + { + "epoch": 1.2289770658900618, + "grad_norm": 13.5, + "learning_rate": 7.191171267282443e-07, + "loss": 1.597214698791504, + "step": 3376 + }, + { + "epoch": 1.2297051328722242, + "grad_norm": 9.375, + "learning_rate": 7.18823101967982e-07, + "loss": 1.302781581878662, + "step": 3378 + }, + { + "epoch": 1.2304331998543865, + "grad_norm": 13.4375, + "learning_rate": 7.185290067750998e-07, + "loss": 1.487734317779541, + "step": 3380 + }, + { + "epoch": 1.231161266836549, + "grad_norm": 10.8125, + "learning_rate": 7.182348413239226e-07, + "loss": 1.1858845949172974, + "step": 3382 + }, + { + "epoch": 1.2318893338187114, + "grad_norm": 15.9375, + "learning_rate": 7.179406057888178e-07, + "loss": 1.0438250303268433, + "step": 3384 + }, + { + "epoch": 1.2326174008008737, + "grad_norm": 21.625, + "learning_rate": 7.176463003441941e-07, + "loss": 1.0831880569458008, + "step": 3386 + }, + { + "epoch": 1.233345467783036, + "grad_norm": 10.0625, + "learning_rate": 7.173519251645015e-07, + "loss": 1.5226027965545654, + "step": 3388 + }, + { + "epoch": 1.2340735347651983, + "grad_norm": 11.625, + "learning_rate": 7.170574804242318e-07, + "loss": 1.4456634521484375, + "step": 3390 + }, + { + "epoch": 1.2348016017473609, + "grad_norm": 26.625, + "learning_rate": 7.167629662979171e-07, + "loss": 0.7379730343818665, + "step": 3392 + }, + { + "epoch": 1.2355296687295232, + "grad_norm": 14.5, + "learning_rate": 7.164683829601317e-07, + "loss": 1.152913212776184, + "step": 3394 + }, + { + "epoch": 1.2362577357116855, + "grad_norm": 42.0, + "learning_rate": 7.1617373058549e-07, + "loss": 0.8473098278045654, + "step": 3396 + }, + { + "epoch": 1.2369858026938478, + "grad_norm": 9.25, + "learning_rate": 7.158790093486482e-07, + "loss": 1.4110465049743652, + "step": 3398 + }, + { + "epoch": 1.2377138696760102, + "grad_norm": 35.75, + "learning_rate": 7.155842194243024e-07, + "loss": 1.5243785381317139, + "step": 3400 + }, + { + "epoch": 1.2384419366581725, + "grad_norm": 4.25, + "learning_rate": 7.152893609871906e-07, + "loss": 1.2991764545440674, + "step": 3402 + }, + { + "epoch": 1.2391700036403348, + "grad_norm": 16.5, + "learning_rate": 7.149944342120899e-07, + "loss": 1.3084542751312256, + "step": 3404 + }, + { + "epoch": 1.2398980706224974, + "grad_norm": 19.125, + "learning_rate": 7.146994392738193e-07, + "loss": 1.4175339937210083, + "step": 3406 + }, + { + "epoch": 1.2406261376046597, + "grad_norm": 15.8125, + "learning_rate": 7.144043763472371e-07, + "loss": 1.39274001121521, + "step": 3408 + }, + { + "epoch": 1.241354204586822, + "grad_norm": 19.625, + "learning_rate": 7.141092456072433e-07, + "loss": 1.577301263809204, + "step": 3410 + }, + { + "epoch": 1.2420822715689843, + "grad_norm": 26.375, + "learning_rate": 7.138140472287762e-07, + "loss": 1.5047677755355835, + "step": 3412 + }, + { + "epoch": 1.2428103385511466, + "grad_norm": 14.25, + "learning_rate": 7.135187813868156e-07, + "loss": 1.5578992366790771, + "step": 3414 + }, + { + "epoch": 1.2435384055333092, + "grad_norm": 16.5, + "learning_rate": 7.132234482563808e-07, + "loss": 1.5164861679077148, + "step": 3416 + }, + { + "epoch": 1.2442664725154715, + "grad_norm": 21.25, + "learning_rate": 7.129280480125313e-07, + "loss": 1.4224605560302734, + "step": 3418 + }, + { + "epoch": 1.2449945394976338, + "grad_norm": 10.0, + "learning_rate": 7.126325808303662e-07, + "loss": 1.4164526462554932, + "step": 3420 + }, + { + "epoch": 1.2457226064797962, + "grad_norm": 13.4375, + "learning_rate": 7.123370468850238e-07, + "loss": 1.6357030868530273, + "step": 3422 + }, + { + "epoch": 1.2464506734619585, + "grad_norm": 14.4375, + "learning_rate": 7.120414463516828e-07, + "loss": 1.5305184125900269, + "step": 3424 + }, + { + "epoch": 1.2471787404441208, + "grad_norm": 15.0625, + "learning_rate": 7.117457794055608e-07, + "loss": 1.34114670753479, + "step": 3426 + }, + { + "epoch": 1.2479068074262831, + "grad_norm": 12.4375, + "learning_rate": 7.11450046221915e-07, + "loss": 1.1727616786956787, + "step": 3428 + }, + { + "epoch": 1.2486348744084457, + "grad_norm": 20.25, + "learning_rate": 7.111542469760419e-07, + "loss": 1.3590433597564697, + "step": 3430 + }, + { + "epoch": 1.249362941390608, + "grad_norm": 9.5625, + "learning_rate": 7.108583818432772e-07, + "loss": 1.4366416931152344, + "step": 3432 + }, + { + "epoch": 1.2500910083727703, + "grad_norm": 17.125, + "learning_rate": 7.10562450998995e-07, + "loss": 1.3638899326324463, + "step": 3434 + }, + { + "epoch": 1.2508190753549326, + "grad_norm": 43.5, + "learning_rate": 7.10266454618609e-07, + "loss": 1.508440613746643, + "step": 3436 + }, + { + "epoch": 1.251547142337095, + "grad_norm": 16.375, + "learning_rate": 7.099703928775722e-07, + "loss": 1.468846082687378, + "step": 3438 + }, + { + "epoch": 1.2522752093192575, + "grad_norm": 25.25, + "learning_rate": 7.096742659513752e-07, + "loss": 1.5184085369110107, + "step": 3440 + }, + { + "epoch": 1.2530032763014196, + "grad_norm": 11.5625, + "learning_rate": 7.093780740155479e-07, + "loss": 1.4722588062286377, + "step": 3442 + }, + { + "epoch": 1.2537313432835822, + "grad_norm": 23.0, + "learning_rate": 7.090818172456587e-07, + "loss": 1.7995455265045166, + "step": 3444 + }, + { + "epoch": 1.2544594102657445, + "grad_norm": 24.125, + "learning_rate": 7.087854958173145e-07, + "loss": 1.644504189491272, + "step": 3446 + }, + { + "epoch": 1.2551874772479068, + "grad_norm": 72.0, + "learning_rate": 7.0848910990616e-07, + "loss": 1.4690901041030884, + "step": 3448 + }, + { + "epoch": 1.2559155442300691, + "grad_norm": 9.6875, + "learning_rate": 7.081926596878788e-07, + "loss": 1.4353034496307373, + "step": 3450 + }, + { + "epoch": 1.2566436112122314, + "grad_norm": 8.625, + "learning_rate": 7.078961453381926e-07, + "loss": 1.320157766342163, + "step": 3452 + }, + { + "epoch": 1.257371678194394, + "grad_norm": 17.0, + "learning_rate": 7.075995670328602e-07, + "loss": 1.3898496627807617, + "step": 3454 + }, + { + "epoch": 1.2580997451765563, + "grad_norm": 8.9375, + "learning_rate": 7.073029249476794e-07, + "loss": 1.196812629699707, + "step": 3456 + }, + { + "epoch": 1.2588278121587186, + "grad_norm": 9.0625, + "learning_rate": 7.070062192584853e-07, + "loss": 1.5040093660354614, + "step": 3458 + }, + { + "epoch": 1.259555879140881, + "grad_norm": 12.9375, + "learning_rate": 7.067094501411508e-07, + "loss": 1.6962318420410156, + "step": 3460 + }, + { + "epoch": 1.2602839461230433, + "grad_norm": 9.0625, + "learning_rate": 7.064126177715861e-07, + "loss": 1.4598841667175293, + "step": 3462 + }, + { + "epoch": 1.2610120131052056, + "grad_norm": 20.0, + "learning_rate": 7.061157223257395e-07, + "loss": 0.78590327501297, + "step": 3464 + }, + { + "epoch": 1.261740080087368, + "grad_norm": 6.5, + "learning_rate": 7.058187639795961e-07, + "loss": 1.4296863079071045, + "step": 3466 + }, + { + "epoch": 1.2624681470695305, + "grad_norm": 19.5, + "learning_rate": 7.055217429091784e-07, + "loss": 1.4968440532684326, + "step": 3468 + }, + { + "epoch": 1.2631962140516928, + "grad_norm": 8.5, + "learning_rate": 7.052246592905466e-07, + "loss": 1.1844947338104248, + "step": 3470 + }, + { + "epoch": 1.2639242810338551, + "grad_norm": 12.625, + "learning_rate": 7.049275132997973e-07, + "loss": 1.4284515380859375, + "step": 3472 + }, + { + "epoch": 1.2646523480160174, + "grad_norm": 5.28125, + "learning_rate": 7.046303051130646e-07, + "loss": 1.3739935159683228, + "step": 3474 + }, + { + "epoch": 1.2653804149981798, + "grad_norm": 51.5, + "learning_rate": 7.043330349065189e-07, + "loss": 1.4676437377929688, + "step": 3476 + }, + { + "epoch": 1.2661084819803423, + "grad_norm": 11.5, + "learning_rate": 7.04035702856368e-07, + "loss": 1.4145677089691162, + "step": 3478 + }, + { + "epoch": 1.2668365489625044, + "grad_norm": 10.25, + "learning_rate": 7.037383091388558e-07, + "loss": 1.4122488498687744, + "step": 3480 + }, + { + "epoch": 1.267564615944667, + "grad_norm": 21.875, + "learning_rate": 7.03440853930263e-07, + "loss": 1.4875725507736206, + "step": 3482 + }, + { + "epoch": 1.2682926829268293, + "grad_norm": 9.0, + "learning_rate": 7.031433374069069e-07, + "loss": 1.4630677700042725, + "step": 3484 + }, + { + "epoch": 1.2690207499089916, + "grad_norm": 7.28125, + "learning_rate": 7.02845759745141e-07, + "loss": 1.3706104755401611, + "step": 3486 + }, + { + "epoch": 1.269748816891154, + "grad_norm": 14.9375, + "learning_rate": 7.025481211213546e-07, + "loss": 1.4735506772994995, + "step": 3488 + }, + { + "epoch": 1.2704768838733163, + "grad_norm": 12.375, + "learning_rate": 7.022504217119743e-07, + "loss": 1.4279154539108276, + "step": 3490 + }, + { + "epoch": 1.2712049508554788, + "grad_norm": 11.5, + "learning_rate": 7.019526616934612e-07, + "loss": 1.7582213878631592, + "step": 3492 + }, + { + "epoch": 1.2719330178376411, + "grad_norm": 9.0625, + "learning_rate": 7.016548412423138e-07, + "loss": 1.6748790740966797, + "step": 3494 + }, + { + "epoch": 1.2726610848198034, + "grad_norm": 24.875, + "learning_rate": 7.013569605350653e-07, + "loss": 1.0103142261505127, + "step": 3496 + }, + { + "epoch": 1.2733891518019658, + "grad_norm": 4.71875, + "learning_rate": 7.010590197482853e-07, + "loss": 1.164278268814087, + "step": 3498 + }, + { + "epoch": 1.274117218784128, + "grad_norm": 34.5, + "learning_rate": 7.007610190585784e-07, + "loss": 1.5862723588943481, + "step": 3500 + }, + { + "epoch": 1.2748452857662906, + "grad_norm": 22.625, + "learning_rate": 7.004629586425857e-07, + "loss": 1.5077109336853027, + "step": 3502 + }, + { + "epoch": 1.2755733527484527, + "grad_norm": 16.375, + "learning_rate": 7.001648386769825e-07, + "loss": 1.7495529651641846, + "step": 3504 + }, + { + "epoch": 1.2763014197306153, + "grad_norm": 17.125, + "learning_rate": 6.998666593384801e-07, + "loss": 1.4484033584594727, + "step": 3506 + }, + { + "epoch": 1.2770294867127776, + "grad_norm": 5.46875, + "learning_rate": 6.995684208038253e-07, + "loss": 1.2957067489624023, + "step": 3508 + }, + { + "epoch": 1.27775755369494, + "grad_norm": 12.875, + "learning_rate": 6.992701232497991e-07, + "loss": 1.3490028381347656, + "step": 3510 + }, + { + "epoch": 1.2784856206771023, + "grad_norm": 22.125, + "learning_rate": 6.989717668532181e-07, + "loss": 1.716500163078308, + "step": 3512 + }, + { + "epoch": 1.2792136876592646, + "grad_norm": 5.625, + "learning_rate": 6.986733517909333e-07, + "loss": 1.439678430557251, + "step": 3514 + }, + { + "epoch": 1.2799417546414271, + "grad_norm": 8.9375, + "learning_rate": 6.983748782398314e-07, + "loss": 1.3623406887054443, + "step": 3516 + }, + { + "epoch": 1.2806698216235894, + "grad_norm": 10.5, + "learning_rate": 6.980763463768329e-07, + "loss": 1.1676409244537354, + "step": 3518 + }, + { + "epoch": 1.2813978886057518, + "grad_norm": 20.125, + "learning_rate": 6.977777563788929e-07, + "loss": 1.0829548835754395, + "step": 3520 + }, + { + "epoch": 1.282125955587914, + "grad_norm": 20.5, + "learning_rate": 6.974791084230014e-07, + "loss": 1.4518284797668457, + "step": 3522 + }, + { + "epoch": 1.2828540225700764, + "grad_norm": 14.8125, + "learning_rate": 6.971804026861827e-07, + "loss": 1.3861544132232666, + "step": 3524 + }, + { + "epoch": 1.2835820895522387, + "grad_norm": 15.0625, + "learning_rate": 6.96881639345495e-07, + "loss": 1.4500890970230103, + "step": 3526 + }, + { + "epoch": 1.284310156534401, + "grad_norm": 31.5, + "learning_rate": 6.965828185780307e-07, + "loss": 1.1326911449432373, + "step": 3528 + }, + { + "epoch": 1.2850382235165636, + "grad_norm": 70.5, + "learning_rate": 6.962839405609166e-07, + "loss": 1.288429617881775, + "step": 3530 + }, + { + "epoch": 1.285766290498726, + "grad_norm": 13.125, + "learning_rate": 6.959850054713135e-07, + "loss": 1.5106329917907715, + "step": 3532 + }, + { + "epoch": 1.2864943574808883, + "grad_norm": 23.375, + "learning_rate": 6.956860134864149e-07, + "loss": 1.188486099243164, + "step": 3534 + }, + { + "epoch": 1.2872224244630506, + "grad_norm": 14.75, + "learning_rate": 6.953869647834496e-07, + "loss": 1.4082539081573486, + "step": 3536 + }, + { + "epoch": 1.287950491445213, + "grad_norm": 12.125, + "learning_rate": 6.950878595396792e-07, + "loss": 1.4189486503601074, + "step": 3538 + }, + { + "epoch": 1.2886785584273754, + "grad_norm": 7.90625, + "learning_rate": 6.947886979323986e-07, + "loss": 1.1523163318634033, + "step": 3540 + }, + { + "epoch": 1.2894066254095375, + "grad_norm": 6.6875, + "learning_rate": 6.944894801389365e-07, + "loss": 1.2182693481445312, + "step": 3542 + }, + { + "epoch": 1.2901346923917, + "grad_norm": 10.9375, + "learning_rate": 6.941902063366552e-07, + "loss": 1.235659122467041, + "step": 3544 + }, + { + "epoch": 1.2908627593738624, + "grad_norm": 55.75, + "learning_rate": 6.938908767029492e-07, + "loss": 1.4465806484222412, + "step": 3546 + }, + { + "epoch": 1.2915908263560247, + "grad_norm": 8.875, + "learning_rate": 6.93591491415247e-07, + "loss": 1.2191390991210938, + "step": 3548 + }, + { + "epoch": 1.292318893338187, + "grad_norm": 7.90625, + "learning_rate": 6.9329205065101e-07, + "loss": 0.8912258148193359, + "step": 3550 + }, + { + "epoch": 1.2930469603203494, + "grad_norm": 52.75, + "learning_rate": 6.929925545877321e-07, + "loss": 1.4775289297103882, + "step": 3552 + }, + { + "epoch": 1.293775027302512, + "grad_norm": 11.875, + "learning_rate": 6.926930034029401e-07, + "loss": 1.4275285005569458, + "step": 3554 + }, + { + "epoch": 1.2945030942846742, + "grad_norm": 16.875, + "learning_rate": 6.923933972741937e-07, + "loss": 1.2589417695999146, + "step": 3556 + }, + { + "epoch": 1.2952311612668366, + "grad_norm": 9.5, + "learning_rate": 6.920937363790849e-07, + "loss": 1.2432568073272705, + "step": 3558 + }, + { + "epoch": 1.295959228248999, + "grad_norm": 14.5, + "learning_rate": 6.917940208952382e-07, + "loss": 1.2800997495651245, + "step": 3560 + }, + { + "epoch": 1.2966872952311612, + "grad_norm": 23.875, + "learning_rate": 6.914942510003105e-07, + "loss": 1.6592881679534912, + "step": 3562 + }, + { + "epoch": 1.2974153622133238, + "grad_norm": 72.0, + "learning_rate": 6.911944268719914e-07, + "loss": 1.1789777278900146, + "step": 3564 + }, + { + "epoch": 1.2981434291954859, + "grad_norm": 22.375, + "learning_rate": 6.908945486880016e-07, + "loss": 0.2618753910064697, + "step": 3566 + }, + { + "epoch": 1.2988714961776484, + "grad_norm": 17.5, + "learning_rate": 6.905946166260949e-07, + "loss": 1.0993835926055908, + "step": 3568 + }, + { + "epoch": 1.2995995631598107, + "grad_norm": 21.75, + "learning_rate": 6.902946308640565e-07, + "loss": 1.4687378406524658, + "step": 3570 + }, + { + "epoch": 1.300327630141973, + "grad_norm": 14.625, + "learning_rate": 6.899945915797035e-07, + "loss": 1.2178938388824463, + "step": 3572 + }, + { + "epoch": 1.3010556971241354, + "grad_norm": 18.25, + "learning_rate": 6.896944989508846e-07, + "loss": 1.0805833339691162, + "step": 3574 + }, + { + "epoch": 1.3017837641062977, + "grad_norm": 14.4375, + "learning_rate": 6.893943531554806e-07, + "loss": 1.3598343133926392, + "step": 3576 + }, + { + "epoch": 1.3025118310884602, + "grad_norm": 35.25, + "learning_rate": 6.890941543714033e-07, + "loss": 1.1867520809173584, + "step": 3578 + }, + { + "epoch": 1.3032398980706226, + "grad_norm": 31.5, + "learning_rate": 6.887939027765961e-07, + "loss": 1.4391090869903564, + "step": 3580 + }, + { + "epoch": 1.303967965052785, + "grad_norm": 24.25, + "learning_rate": 6.884935985490336e-07, + "loss": 1.3934730291366577, + "step": 3582 + }, + { + "epoch": 1.3046960320349472, + "grad_norm": 39.0, + "learning_rate": 6.88193241866722e-07, + "loss": 0.9448332786560059, + "step": 3584 + }, + { + "epoch": 1.3054240990171095, + "grad_norm": 25.75, + "learning_rate": 6.878928329076982e-07, + "loss": 0.9333141446113586, + "step": 3586 + }, + { + "epoch": 1.3061521659992719, + "grad_norm": 18.25, + "learning_rate": 6.8759237185003e-07, + "loss": 1.422725796699524, + "step": 3588 + }, + { + "epoch": 1.3068802329814342, + "grad_norm": 12.5, + "learning_rate": 6.872918588718165e-07, + "loss": 1.118701696395874, + "step": 3590 + }, + { + "epoch": 1.3076082999635967, + "grad_norm": 5.25, + "learning_rate": 6.869912941511872e-07, + "loss": 1.3205444812774658, + "step": 3592 + }, + { + "epoch": 1.308336366945759, + "grad_norm": 15.0, + "learning_rate": 6.866906778663024e-07, + "loss": 1.238567590713501, + "step": 3594 + }, + { + "epoch": 1.3090644339279214, + "grad_norm": 14.4375, + "learning_rate": 6.86390010195353e-07, + "loss": 0.9423297643661499, + "step": 3596 + }, + { + "epoch": 1.3097925009100837, + "grad_norm": 16.125, + "learning_rate": 6.860892913165603e-07, + "loss": 1.2012231349945068, + "step": 3598 + }, + { + "epoch": 1.310520567892246, + "grad_norm": 20.625, + "learning_rate": 6.857885214081762e-07, + "loss": 1.485168695449829, + "step": 3600 + }, + { + "epoch": 1.3112486348744086, + "grad_norm": 16.125, + "learning_rate": 6.854877006484823e-07, + "loss": 1.2250447273254395, + "step": 3602 + }, + { + "epoch": 1.3119767018565707, + "grad_norm": 7.03125, + "learning_rate": 6.85186829215791e-07, + "loss": 1.1251697540283203, + "step": 3604 + }, + { + "epoch": 1.3127047688387332, + "grad_norm": 8.9375, + "learning_rate": 6.848859072884441e-07, + "loss": 1.2670562267303467, + "step": 3606 + }, + { + "epoch": 1.3134328358208955, + "grad_norm": 14.5625, + "learning_rate": 6.845849350448138e-07, + "loss": 1.350937843322754, + "step": 3608 + }, + { + "epoch": 1.3141609028030579, + "grad_norm": 28.5, + "learning_rate": 6.842839126633022e-07, + "loss": 1.6243629455566406, + "step": 3610 + }, + { + "epoch": 1.3148889697852202, + "grad_norm": 5.5, + "learning_rate": 6.839828403223406e-07, + "loss": 1.2871955633163452, + "step": 3612 + }, + { + "epoch": 1.3156170367673825, + "grad_norm": 13.375, + "learning_rate": 6.836817182003902e-07, + "loss": 1.4146288633346558, + "step": 3614 + }, + { + "epoch": 1.316345103749545, + "grad_norm": 18.0, + "learning_rate": 6.833805464759419e-07, + "loss": 1.150397777557373, + "step": 3616 + }, + { + "epoch": 1.3170731707317074, + "grad_norm": 30.125, + "learning_rate": 6.830793253275156e-07, + "loss": 1.6404919624328613, + "step": 3618 + }, + { + "epoch": 1.3178012377138697, + "grad_norm": 12.25, + "learning_rate": 6.827780549336608e-07, + "loss": 1.798444151878357, + "step": 3620 + }, + { + "epoch": 1.318529304696032, + "grad_norm": 6.71875, + "learning_rate": 6.824767354729561e-07, + "loss": 1.138660192489624, + "step": 3622 + }, + { + "epoch": 1.3192573716781943, + "grad_norm": 16.125, + "learning_rate": 6.821753671240093e-07, + "loss": 1.4037799835205078, + "step": 3624 + }, + { + "epoch": 1.319985438660357, + "grad_norm": 25.25, + "learning_rate": 6.818739500654569e-07, + "loss": 1.3510794639587402, + "step": 3626 + }, + { + "epoch": 1.320713505642519, + "grad_norm": 6.4375, + "learning_rate": 6.815724844759645e-07, + "loss": 1.1344504356384277, + "step": 3628 + }, + { + "epoch": 1.3214415726246815, + "grad_norm": 9.875, + "learning_rate": 6.812709705342265e-07, + "loss": 1.498512864112854, + "step": 3630 + }, + { + "epoch": 1.3221696396068439, + "grad_norm": 15.125, + "learning_rate": 6.809694084189657e-07, + "loss": 1.2761714458465576, + "step": 3632 + }, + { + "epoch": 1.3228977065890062, + "grad_norm": 10.8125, + "learning_rate": 6.806677983089337e-07, + "loss": 0.8114470839500427, + "step": 3634 + }, + { + "epoch": 1.3236257735711685, + "grad_norm": 20.75, + "learning_rate": 6.803661403829104e-07, + "loss": 1.4368181228637695, + "step": 3636 + }, + { + "epoch": 1.3243538405533308, + "grad_norm": 15.25, + "learning_rate": 6.800644348197042e-07, + "loss": 1.5797152519226074, + "step": 3638 + }, + { + "epoch": 1.3250819075354934, + "grad_norm": 12.5, + "learning_rate": 6.797626817981515e-07, + "loss": 1.3563282489776611, + "step": 3640 + }, + { + "epoch": 1.3258099745176557, + "grad_norm": 14.6875, + "learning_rate": 6.794608814971174e-07, + "loss": 1.7799694538116455, + "step": 3642 + }, + { + "epoch": 1.326538041499818, + "grad_norm": 12.625, + "learning_rate": 6.79159034095494e-07, + "loss": 1.379778504371643, + "step": 3644 + }, + { + "epoch": 1.3272661084819803, + "grad_norm": 13.625, + "learning_rate": 6.788571397722023e-07, + "loss": 0.7234645485877991, + "step": 3646 + }, + { + "epoch": 1.3279941754641427, + "grad_norm": 16.0, + "learning_rate": 6.785551987061906e-07, + "loss": 1.5848326683044434, + "step": 3648 + }, + { + "epoch": 1.328722242446305, + "grad_norm": 47.25, + "learning_rate": 6.782532110764353e-07, + "loss": 1.3769917488098145, + "step": 3650 + }, + { + "epoch": 1.3294503094284673, + "grad_norm": 8.9375, + "learning_rate": 6.779511770619397e-07, + "loss": 0.962924599647522, + "step": 3652 + }, + { + "epoch": 1.3301783764106299, + "grad_norm": 14.9375, + "learning_rate": 6.776490968417355e-07, + "loss": 1.3547892570495605, + "step": 3654 + }, + { + "epoch": 1.3309064433927922, + "grad_norm": 13.5, + "learning_rate": 6.77346970594881e-07, + "loss": 1.4879813194274902, + "step": 3656 + }, + { + "epoch": 1.3316345103749545, + "grad_norm": 12.9375, + "learning_rate": 6.770447985004621e-07, + "loss": 1.646762728691101, + "step": 3658 + }, + { + "epoch": 1.3323625773571168, + "grad_norm": 33.75, + "learning_rate": 6.767425807375922e-07, + "loss": 0.9777312278747559, + "step": 3660 + }, + { + "epoch": 1.3330906443392792, + "grad_norm": 5.96875, + "learning_rate": 6.76440317485411e-07, + "loss": 0.973010778427124, + "step": 3662 + }, + { + "epoch": 1.3338187113214417, + "grad_norm": 10.5625, + "learning_rate": 6.761380089230861e-07, + "loss": 1.380549669265747, + "step": 3664 + }, + { + "epoch": 1.3345467783036038, + "grad_norm": 19.125, + "learning_rate": 6.758356552298113e-07, + "loss": 1.3709235191345215, + "step": 3666 + }, + { + "epoch": 1.3352748452857663, + "grad_norm": 13.625, + "learning_rate": 6.75533256584807e-07, + "loss": 1.0934035778045654, + "step": 3668 + }, + { + "epoch": 1.3360029122679287, + "grad_norm": 9.375, + "learning_rate": 6.75230813167321e-07, + "loss": 1.4153802394866943, + "step": 3670 + }, + { + "epoch": 1.336730979250091, + "grad_norm": 14.0, + "learning_rate": 6.74928325156627e-07, + "loss": 1.5877366065979004, + "step": 3672 + }, + { + "epoch": 1.3374590462322533, + "grad_norm": 30.75, + "learning_rate": 6.746257927320254e-07, + "loss": 1.4228789806365967, + "step": 3674 + }, + { + "epoch": 1.3381871132144156, + "grad_norm": 49.0, + "learning_rate": 6.743232160728429e-07, + "loss": 1.49105966091156, + "step": 3676 + }, + { + "epoch": 1.3389151801965782, + "grad_norm": 20.875, + "learning_rate": 6.740205953584326e-07, + "loss": 1.1155225038528442, + "step": 3678 + }, + { + "epoch": 1.3396432471787405, + "grad_norm": 27.625, + "learning_rate": 6.73717930768173e-07, + "loss": 1.4184553623199463, + "step": 3680 + }, + { + "epoch": 1.3403713141609028, + "grad_norm": 10.3125, + "learning_rate": 6.734152224814696e-07, + "loss": 1.1609480381011963, + "step": 3682 + }, + { + "epoch": 1.3410993811430652, + "grad_norm": 21.75, + "learning_rate": 6.731124706777529e-07, + "loss": 1.4837830066680908, + "step": 3684 + }, + { + "epoch": 1.3418274481252275, + "grad_norm": 18.0, + "learning_rate": 6.728096755364798e-07, + "loss": 1.6749844551086426, + "step": 3686 + }, + { + "epoch": 1.34255551510739, + "grad_norm": 18.75, + "learning_rate": 6.725068372371327e-07, + "loss": 1.717020869255066, + "step": 3688 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 27.75, + "learning_rate": 6.722039559592199e-07, + "loss": 0.9746057987213135, + "step": 3690 + }, + { + "epoch": 1.3440116490717147, + "grad_norm": 21.375, + "learning_rate": 6.719010318822743e-07, + "loss": 1.523344874382019, + "step": 3692 + }, + { + "epoch": 1.344739716053877, + "grad_norm": 60.0, + "learning_rate": 6.715980651858548e-07, + "loss": 1.5210598707199097, + "step": 3694 + }, + { + "epoch": 1.3454677830360393, + "grad_norm": 6.125, + "learning_rate": 6.712950560495458e-07, + "loss": 1.1968969106674194, + "step": 3696 + }, + { + "epoch": 1.3461958500182016, + "grad_norm": 14.25, + "learning_rate": 6.709920046529564e-07, + "loss": 0.8323822617530823, + "step": 3698 + }, + { + "epoch": 1.346923917000364, + "grad_norm": 14.875, + "learning_rate": 6.70688911175721e-07, + "loss": 1.4234132766723633, + "step": 3700 + }, + { + "epoch": 1.3476519839825265, + "grad_norm": 10.5, + "learning_rate": 6.703857757974988e-07, + "loss": 1.3329381942749023, + "step": 3702 + }, + { + "epoch": 1.3483800509646888, + "grad_norm": 3.40625, + "learning_rate": 6.700825986979737e-07, + "loss": 0.9677647352218628, + "step": 3704 + }, + { + "epoch": 1.3491081179468511, + "grad_norm": 5.28125, + "learning_rate": 6.697793800568545e-07, + "loss": 1.2242599725723267, + "step": 3706 + }, + { + "epoch": 1.3498361849290135, + "grad_norm": 14.375, + "learning_rate": 6.694761200538748e-07, + "loss": 1.245905876159668, + "step": 3708 + }, + { + "epoch": 1.3505642519111758, + "grad_norm": 10.5625, + "learning_rate": 6.691728188687924e-07, + "loss": 1.2302407026290894, + "step": 3710 + }, + { + "epoch": 1.3512923188933381, + "grad_norm": 9.5, + "learning_rate": 6.688694766813898e-07, + "loss": 1.286030650138855, + "step": 3712 + }, + { + "epoch": 1.3520203858755004, + "grad_norm": 21.25, + "learning_rate": 6.685660936714737e-07, + "loss": 1.409557819366455, + "step": 3714 + }, + { + "epoch": 1.352748452857663, + "grad_norm": 20.625, + "learning_rate": 6.682626700188747e-07, + "loss": 1.6719077825546265, + "step": 3716 + }, + { + "epoch": 1.3534765198398253, + "grad_norm": 27.5, + "learning_rate": 6.67959205903448e-07, + "loss": 1.3167437314987183, + "step": 3718 + }, + { + "epoch": 1.3542045868219876, + "grad_norm": 17.25, + "learning_rate": 6.676557015050723e-07, + "loss": 1.9144887924194336, + "step": 3720 + }, + { + "epoch": 1.35493265380415, + "grad_norm": 14.125, + "learning_rate": 6.673521570036507e-07, + "loss": 1.5709826946258545, + "step": 3722 + }, + { + "epoch": 1.3556607207863123, + "grad_norm": 11.1875, + "learning_rate": 6.670485725791095e-07, + "loss": 1.6061331033706665, + "step": 3724 + }, + { + "epoch": 1.3563887877684748, + "grad_norm": 22.875, + "learning_rate": 6.667449484113991e-07, + "loss": 1.3315695524215698, + "step": 3726 + }, + { + "epoch": 1.357116854750637, + "grad_norm": 15.5625, + "learning_rate": 6.664412846804933e-07, + "loss": 1.4914230108261108, + "step": 3728 + }, + { + "epoch": 1.3578449217327995, + "grad_norm": 17.0, + "learning_rate": 6.661375815663894e-07, + "loss": 1.6431195735931396, + "step": 3730 + }, + { + "epoch": 1.3585729887149618, + "grad_norm": 10.1875, + "learning_rate": 6.658338392491077e-07, + "loss": 1.3646483421325684, + "step": 3732 + }, + { + "epoch": 1.3593010556971241, + "grad_norm": 11.3125, + "learning_rate": 6.655300579086924e-07, + "loss": 0.8460204005241394, + "step": 3734 + }, + { + "epoch": 1.3600291226792864, + "grad_norm": 22.625, + "learning_rate": 6.652262377252104e-07, + "loss": 1.3707780838012695, + "step": 3736 + }, + { + "epoch": 1.3607571896614488, + "grad_norm": 10.125, + "learning_rate": 6.649223788787517e-07, + "loss": 1.4841562509536743, + "step": 3738 + }, + { + "epoch": 1.3614852566436113, + "grad_norm": 29.0, + "learning_rate": 6.646184815494288e-07, + "loss": 1.6636885404586792, + "step": 3740 + }, + { + "epoch": 1.3622133236257736, + "grad_norm": 15.1875, + "learning_rate": 6.643145459173782e-07, + "loss": 1.209041714668274, + "step": 3742 + }, + { + "epoch": 1.362941390607936, + "grad_norm": 13.0625, + "learning_rate": 6.640105721627575e-07, + "loss": 1.0198569297790527, + "step": 3744 + }, + { + "epoch": 1.3636694575900983, + "grad_norm": 12.6875, + "learning_rate": 6.637065604657483e-07, + "loss": 1.681509017944336, + "step": 3746 + }, + { + "epoch": 1.3643975245722606, + "grad_norm": 28.375, + "learning_rate": 6.63402511006554e-07, + "loss": 1.0157673358917236, + "step": 3748 + }, + { + "epoch": 1.3651255915544231, + "grad_norm": 4.875, + "learning_rate": 6.630984239654002e-07, + "loss": 1.1611175537109375, + "step": 3750 + }, + { + "epoch": 1.3658536585365852, + "grad_norm": 20.0, + "learning_rate": 6.627942995225355e-07, + "loss": 1.2832145690917969, + "step": 3752 + }, + { + "epoch": 1.3665817255187478, + "grad_norm": 22.125, + "learning_rate": 6.624901378582299e-07, + "loss": 1.367387056350708, + "step": 3754 + }, + { + "epoch": 1.3673097925009101, + "grad_norm": 16.25, + "learning_rate": 6.621859391527761e-07, + "loss": 1.3340249061584473, + "step": 3756 + }, + { + "epoch": 1.3680378594830724, + "grad_norm": 13.8125, + "learning_rate": 6.618817035864881e-07, + "loss": 1.0947965383529663, + "step": 3758 + }, + { + "epoch": 1.3687659264652348, + "grad_norm": 10.9375, + "learning_rate": 6.615774313397025e-07, + "loss": 1.327880859375, + "step": 3760 + }, + { + "epoch": 1.369493993447397, + "grad_norm": 12.0, + "learning_rate": 6.61273122592777e-07, + "loss": 1.3811143636703491, + "step": 3762 + }, + { + "epoch": 1.3702220604295596, + "grad_norm": 10.75, + "learning_rate": 6.609687775260912e-07, + "loss": 1.3871002197265625, + "step": 3764 + }, + { + "epoch": 1.370950127411722, + "grad_norm": 10.4375, + "learning_rate": 6.606643963200463e-07, + "loss": 1.2039234638214111, + "step": 3766 + }, + { + "epoch": 1.3716781943938843, + "grad_norm": 14.9375, + "learning_rate": 6.603599791550649e-07, + "loss": 1.4359140396118164, + "step": 3768 + }, + { + "epoch": 1.3724062613760466, + "grad_norm": 11.6875, + "learning_rate": 6.600555262115908e-07, + "loss": 1.449021577835083, + "step": 3770 + }, + { + "epoch": 1.373134328358209, + "grad_norm": 6.90625, + "learning_rate": 6.597510376700889e-07, + "loss": 1.3889931440353394, + "step": 3772 + }, + { + "epoch": 1.3738623953403712, + "grad_norm": 6.3125, + "learning_rate": 6.594465137110455e-07, + "loss": 1.0853608846664429, + "step": 3774 + }, + { + "epoch": 1.3745904623225336, + "grad_norm": 12.0625, + "learning_rate": 6.591419545149679e-07, + "loss": 1.504687786102295, + "step": 3776 + }, + { + "epoch": 1.3753185293046961, + "grad_norm": 35.25, + "learning_rate": 6.58837360262384e-07, + "loss": 1.7174909114837646, + "step": 3778 + }, + { + "epoch": 1.3760465962868584, + "grad_norm": 14.5, + "learning_rate": 6.585327311338425e-07, + "loss": 1.6571626663208008, + "step": 3780 + }, + { + "epoch": 1.3767746632690208, + "grad_norm": 72.5, + "learning_rate": 6.582280673099131e-07, + "loss": 1.8086985349655151, + "step": 3782 + }, + { + "epoch": 1.377502730251183, + "grad_norm": 19.875, + "learning_rate": 6.579233689711858e-07, + "loss": 1.5314476490020752, + "step": 3784 + }, + { + "epoch": 1.3782307972333454, + "grad_norm": 13.25, + "learning_rate": 6.57618636298271e-07, + "loss": 0.7431083917617798, + "step": 3786 + }, + { + "epoch": 1.378958864215508, + "grad_norm": 16.375, + "learning_rate": 6.573138694717996e-07, + "loss": 1.0177147388458252, + "step": 3788 + }, + { + "epoch": 1.37968693119767, + "grad_norm": 16.625, + "learning_rate": 6.57009068672423e-07, + "loss": 1.431829810142517, + "step": 3790 + }, + { + "epoch": 1.3804149981798326, + "grad_norm": 6.09375, + "learning_rate": 6.567042340808122e-07, + "loss": 1.1377356052398682, + "step": 3792 + }, + { + "epoch": 1.381143065161995, + "grad_norm": 12.625, + "learning_rate": 6.563993658776586e-07, + "loss": 1.508866310119629, + "step": 3794 + }, + { + "epoch": 1.3818711321441572, + "grad_norm": 24.75, + "learning_rate": 6.560944642436732e-07, + "loss": 1.5603351593017578, + "step": 3796 + }, + { + "epoch": 1.3825991991263196, + "grad_norm": 13.0, + "learning_rate": 6.557895293595875e-07, + "loss": 1.3629566431045532, + "step": 3798 + }, + { + "epoch": 1.383327266108482, + "grad_norm": 22.125, + "learning_rate": 6.554845614061515e-07, + "loss": 1.1047141551971436, + "step": 3800 + }, + { + "epoch": 1.3840553330906444, + "grad_norm": 21.5, + "learning_rate": 6.551795605641361e-07, + "loss": 1.4798870086669922, + "step": 3802 + }, + { + "epoch": 1.3847834000728068, + "grad_norm": 7.4375, + "learning_rate": 6.54874527014331e-07, + "loss": 1.0619583129882812, + "step": 3804 + }, + { + "epoch": 1.385511467054969, + "grad_norm": 9.8125, + "learning_rate": 6.545694609375452e-07, + "loss": 1.6426262855529785, + "step": 3806 + }, + { + "epoch": 1.3862395340371314, + "grad_norm": 19.125, + "learning_rate": 6.542643625146076e-07, + "loss": 1.7321751117706299, + "step": 3808 + }, + { + "epoch": 1.3869676010192937, + "grad_norm": 20.875, + "learning_rate": 6.539592319263656e-07, + "loss": 1.5332138538360596, + "step": 3810 + }, + { + "epoch": 1.387695668001456, + "grad_norm": 17.5, + "learning_rate": 6.53654069353686e-07, + "loss": 1.267845869064331, + "step": 3812 + }, + { + "epoch": 1.3884237349836184, + "grad_norm": 15.5625, + "learning_rate": 6.533488749774545e-07, + "loss": 1.571042537689209, + "step": 3814 + }, + { + "epoch": 1.389151801965781, + "grad_norm": 20.125, + "learning_rate": 6.530436489785757e-07, + "loss": 1.596555471420288, + "step": 3816 + }, + { + "epoch": 1.3898798689479432, + "grad_norm": 13.1875, + "learning_rate": 6.527383915379729e-07, + "loss": 1.1032521724700928, + "step": 3818 + }, + { + "epoch": 1.3906079359301056, + "grad_norm": 14.625, + "learning_rate": 6.524331028365877e-07, + "loss": 1.6202313899993896, + "step": 3820 + }, + { + "epoch": 1.3913360029122679, + "grad_norm": 6.375, + "learning_rate": 6.521277830553811e-07, + "loss": 1.3171429634094238, + "step": 3822 + }, + { + "epoch": 1.3920640698944302, + "grad_norm": 12.0625, + "learning_rate": 6.518224323753314e-07, + "loss": 1.462318778038025, + "step": 3824 + }, + { + "epoch": 1.3927921368765928, + "grad_norm": 16.375, + "learning_rate": 6.515170509774364e-07, + "loss": 1.223466396331787, + "step": 3826 + }, + { + "epoch": 1.393520203858755, + "grad_norm": 16.625, + "learning_rate": 6.512116390427112e-07, + "loss": 1.8018159866333008, + "step": 3828 + }, + { + "epoch": 1.3942482708409174, + "grad_norm": 17.75, + "learning_rate": 6.509061967521892e-07, + "loss": 1.519150972366333, + "step": 3830 + }, + { + "epoch": 1.3949763378230797, + "grad_norm": 18.5, + "learning_rate": 6.506007242869219e-07, + "loss": 1.2078821659088135, + "step": 3832 + }, + { + "epoch": 1.395704404805242, + "grad_norm": 6.40625, + "learning_rate": 6.50295221827979e-07, + "loss": 1.1076700687408447, + "step": 3834 + }, + { + "epoch": 1.3964324717874044, + "grad_norm": 13.0625, + "learning_rate": 6.499896895564474e-07, + "loss": 1.0756750106811523, + "step": 3836 + }, + { + "epoch": 1.3971605387695667, + "grad_norm": 23.5, + "learning_rate": 6.49684127653432e-07, + "loss": 1.6983544826507568, + "step": 3838 + }, + { + "epoch": 1.3978886057517292, + "grad_norm": 15.25, + "learning_rate": 6.493785363000552e-07, + "loss": 1.2538745403289795, + "step": 3840 + }, + { + "epoch": 1.3986166727338916, + "grad_norm": 9.5625, + "learning_rate": 6.490729156774569e-07, + "loss": 1.3573771715164185, + "step": 3842 + }, + { + "epoch": 1.3993447397160539, + "grad_norm": 11.75, + "learning_rate": 6.487672659667943e-07, + "loss": 1.1904714107513428, + "step": 3844 + }, + { + "epoch": 1.4000728066982162, + "grad_norm": 18.125, + "learning_rate": 6.484615873492419e-07, + "loss": 1.095213532447815, + "step": 3846 + }, + { + "epoch": 1.4008008736803785, + "grad_norm": 19.75, + "learning_rate": 6.48155880005991e-07, + "loss": 1.4790022373199463, + "step": 3848 + }, + { + "epoch": 1.401528940662541, + "grad_norm": 5.53125, + "learning_rate": 6.478501441182504e-07, + "loss": 1.2590405941009521, + "step": 3850 + }, + { + "epoch": 1.4022570076447032, + "grad_norm": 9.625, + "learning_rate": 6.475443798672458e-07, + "loss": 1.2084271907806396, + "step": 3852 + }, + { + "epoch": 1.4029850746268657, + "grad_norm": 15.5625, + "learning_rate": 6.472385874342193e-07, + "loss": 1.6172959804534912, + "step": 3854 + }, + { + "epoch": 1.403713141609028, + "grad_norm": 26.125, + "learning_rate": 6.4693276700043e-07, + "loss": 1.8665236234664917, + "step": 3856 + }, + { + "epoch": 1.4044412085911904, + "grad_norm": 11.9375, + "learning_rate": 6.466269187471538e-07, + "loss": 1.5053999423980713, + "step": 3858 + }, + { + "epoch": 1.4051692755733527, + "grad_norm": 11.0625, + "learning_rate": 6.463210428556825e-07, + "loss": 1.2505218982696533, + "step": 3860 + }, + { + "epoch": 1.405897342555515, + "grad_norm": 7.15625, + "learning_rate": 6.460151395073247e-07, + "loss": 1.1932175159454346, + "step": 3862 + }, + { + "epoch": 1.4066254095376776, + "grad_norm": 19.75, + "learning_rate": 6.45709208883405e-07, + "loss": 1.2861859798431396, + "step": 3864 + }, + { + "epoch": 1.4073534765198399, + "grad_norm": 14.5, + "learning_rate": 6.45403251165265e-07, + "loss": 1.566422462463379, + "step": 3866 + }, + { + "epoch": 1.4080815435020022, + "grad_norm": 17.125, + "learning_rate": 6.450972665342611e-07, + "loss": 1.4239284992218018, + "step": 3868 + }, + { + "epoch": 1.4088096104841645, + "grad_norm": 13.0625, + "learning_rate": 6.447912551717665e-07, + "loss": 1.4863903522491455, + "step": 3870 + }, + { + "epoch": 1.4095376774663269, + "grad_norm": 9.5625, + "learning_rate": 6.444852172591698e-07, + "loss": 1.4886350631713867, + "step": 3872 + }, + { + "epoch": 1.4102657444484892, + "grad_norm": 25.75, + "learning_rate": 6.44179152977876e-07, + "loss": 1.3763282299041748, + "step": 3874 + }, + { + "epoch": 1.4109938114306515, + "grad_norm": 10.125, + "learning_rate": 6.43873062509305e-07, + "loss": 1.4551891088485718, + "step": 3876 + }, + { + "epoch": 1.411721878412814, + "grad_norm": 10.1875, + "learning_rate": 6.435669460348926e-07, + "loss": 1.6057417392730713, + "step": 3878 + }, + { + "epoch": 1.4124499453949764, + "grad_norm": 18.625, + "learning_rate": 6.432608037360898e-07, + "loss": 1.463627576828003, + "step": 3880 + }, + { + "epoch": 1.4131780123771387, + "grad_norm": 8.5625, + "learning_rate": 6.429546357943632e-07, + "loss": 1.2950654029846191, + "step": 3882 + }, + { + "epoch": 1.413906079359301, + "grad_norm": 20.375, + "learning_rate": 6.426484423911942e-07, + "loss": 1.5979102849960327, + "step": 3884 + }, + { + "epoch": 1.4146341463414633, + "grad_norm": 14.125, + "learning_rate": 6.423422237080796e-07, + "loss": 1.4105210304260254, + "step": 3886 + }, + { + "epoch": 1.4153622133236259, + "grad_norm": 15.625, + "learning_rate": 6.420359799265314e-07, + "loss": 1.4112498760223389, + "step": 3888 + }, + { + "epoch": 1.4160902803057882, + "grad_norm": 12.125, + "learning_rate": 6.417297112280758e-07, + "loss": 1.4504566192626953, + "step": 3890 + }, + { + "epoch": 1.4168183472879505, + "grad_norm": 8.4375, + "learning_rate": 6.414234177942544e-07, + "loss": 1.4659658670425415, + "step": 3892 + }, + { + "epoch": 1.4175464142701129, + "grad_norm": 10.5625, + "learning_rate": 6.41117099806623e-07, + "loss": 1.2336108684539795, + "step": 3894 + }, + { + "epoch": 1.4182744812522752, + "grad_norm": 24.5, + "learning_rate": 6.408107574467525e-07, + "loss": 1.2613639831542969, + "step": 3896 + }, + { + "epoch": 1.4190025482344375, + "grad_norm": 12.875, + "learning_rate": 6.405043908962275e-07, + "loss": 1.4311940670013428, + "step": 3898 + }, + { + "epoch": 1.4197306152165998, + "grad_norm": 12.1875, + "learning_rate": 6.401980003366476e-07, + "loss": 1.5557042360305786, + "step": 3900 + }, + { + "epoch": 1.4204586821987624, + "grad_norm": 33.25, + "learning_rate": 6.398915859496265e-07, + "loss": 1.364403247833252, + "step": 3902 + }, + { + "epoch": 1.4211867491809247, + "grad_norm": 5.875, + "learning_rate": 6.395851479167916e-07, + "loss": 0.9844739437103271, + "step": 3904 + }, + { + "epoch": 1.421914816163087, + "grad_norm": 13.4375, + "learning_rate": 6.392786864197848e-07, + "loss": 0.9559416770935059, + "step": 3906 + }, + { + "epoch": 1.4226428831452493, + "grad_norm": 5.0, + "learning_rate": 6.389722016402619e-07, + "loss": 1.3423237800598145, + "step": 3908 + }, + { + "epoch": 1.4233709501274117, + "grad_norm": 14.0, + "learning_rate": 6.386656937598922e-07, + "loss": 1.2695932388305664, + "step": 3910 + }, + { + "epoch": 1.4240990171095742, + "grad_norm": 5.0, + "learning_rate": 6.383591629603586e-07, + "loss": 1.5015126466751099, + "step": 3912 + }, + { + "epoch": 1.4248270840917363, + "grad_norm": 17.125, + "learning_rate": 6.380526094233582e-07, + "loss": 1.3726730346679688, + "step": 3914 + }, + { + "epoch": 1.4255551510738989, + "grad_norm": 22.25, + "learning_rate": 6.377460333306009e-07, + "loss": 1.5401873588562012, + "step": 3916 + }, + { + "epoch": 1.4262832180560612, + "grad_norm": 14.6875, + "learning_rate": 6.374394348638104e-07, + "loss": 0.5006361603736877, + "step": 3918 + }, + { + "epoch": 1.4270112850382235, + "grad_norm": 13.0, + "learning_rate": 6.371328142047234e-07, + "loss": 1.0892443656921387, + "step": 3920 + }, + { + "epoch": 1.4277393520203858, + "grad_norm": 14.1875, + "learning_rate": 6.368261715350902e-07, + "loss": 1.7723450660705566, + "step": 3922 + }, + { + "epoch": 1.4284674190025481, + "grad_norm": 30.75, + "learning_rate": 6.365195070366736e-07, + "loss": 1.4715778827667236, + "step": 3924 + }, + { + "epoch": 1.4291954859847107, + "grad_norm": 7.5625, + "learning_rate": 6.362128208912494e-07, + "loss": 1.004819631576538, + "step": 3926 + }, + { + "epoch": 1.429923552966873, + "grad_norm": 13.125, + "learning_rate": 6.359061132806067e-07, + "loss": 1.406633734703064, + "step": 3928 + }, + { + "epoch": 1.4306516199490353, + "grad_norm": 46.5, + "learning_rate": 6.355993843865469e-07, + "loss": 1.3550755977630615, + "step": 3930 + }, + { + "epoch": 1.4313796869311977, + "grad_norm": 8.5625, + "learning_rate": 6.352926343908839e-07, + "loss": 1.3447465896606445, + "step": 3932 + }, + { + "epoch": 1.43210775391336, + "grad_norm": 10.9375, + "learning_rate": 6.349858634754449e-07, + "loss": 1.2751665115356445, + "step": 3934 + }, + { + "epoch": 1.4328358208955223, + "grad_norm": 13.125, + "learning_rate": 6.346790718220685e-07, + "loss": 0.8918267488479614, + "step": 3936 + }, + { + "epoch": 1.4335638878776846, + "grad_norm": 14.5625, + "learning_rate": 6.343722596126061e-07, + "loss": 1.3893500566482544, + "step": 3938 + }, + { + "epoch": 1.4342919548598472, + "grad_norm": 12.1875, + "learning_rate": 6.340654270289215e-07, + "loss": 1.52457594871521, + "step": 3940 + }, + { + "epoch": 1.4350200218420095, + "grad_norm": 12.5, + "learning_rate": 6.337585742528897e-07, + "loss": 1.5272226333618164, + "step": 3942 + }, + { + "epoch": 1.4357480888241718, + "grad_norm": 23.25, + "learning_rate": 6.33451701466399e-07, + "loss": 1.8061047792434692, + "step": 3944 + }, + { + "epoch": 1.4364761558063341, + "grad_norm": 14.125, + "learning_rate": 6.331448088513482e-07, + "loss": 1.4305622577667236, + "step": 3946 + }, + { + "epoch": 1.4372042227884965, + "grad_norm": 36.25, + "learning_rate": 6.32837896589649e-07, + "loss": 1.4935046434402466, + "step": 3948 + }, + { + "epoch": 1.437932289770659, + "grad_norm": 17.125, + "learning_rate": 6.325309648632237e-07, + "loss": 1.5713564157485962, + "step": 3950 + }, + { + "epoch": 1.4386603567528213, + "grad_norm": 7.09375, + "learning_rate": 6.322240138540072e-07, + "loss": 1.1652381420135498, + "step": 3952 + }, + { + "epoch": 1.4393884237349837, + "grad_norm": 10.6875, + "learning_rate": 6.31917043743945e-07, + "loss": 1.487485408782959, + "step": 3954 + }, + { + "epoch": 1.440116490717146, + "grad_norm": 9.5625, + "learning_rate": 6.316100547149943e-07, + "loss": 1.6507375240325928, + "step": 3956 + }, + { + "epoch": 1.4408445576993083, + "grad_norm": 18.5, + "learning_rate": 6.313030469491234e-07, + "loss": 1.5057244300842285, + "step": 3958 + }, + { + "epoch": 1.4415726246814706, + "grad_norm": 18.75, + "learning_rate": 6.309960206283117e-07, + "loss": 1.2892944812774658, + "step": 3960 + }, + { + "epoch": 1.442300691663633, + "grad_norm": 11.125, + "learning_rate": 6.306889759345497e-07, + "loss": 1.3819247484207153, + "step": 3962 + }, + { + "epoch": 1.4430287586457955, + "grad_norm": 7.21875, + "learning_rate": 6.303819130498386e-07, + "loss": 1.1314518451690674, + "step": 3964 + }, + { + "epoch": 1.4437568256279578, + "grad_norm": 26.5, + "learning_rate": 6.300748321561907e-07, + "loss": 1.6306374073028564, + "step": 3966 + }, + { + "epoch": 1.4444848926101201, + "grad_norm": 14.8125, + "learning_rate": 6.297677334356289e-07, + "loss": 1.4747436046600342, + "step": 3968 + }, + { + "epoch": 1.4452129595922825, + "grad_norm": 8.25, + "learning_rate": 6.294606170701859e-07, + "loss": 1.252538800239563, + "step": 3970 + }, + { + "epoch": 1.4459410265744448, + "grad_norm": 7.34375, + "learning_rate": 6.291534832419063e-07, + "loss": 1.2767164707183838, + "step": 3972 + }, + { + "epoch": 1.4466690935566073, + "grad_norm": 9.6875, + "learning_rate": 6.288463321328438e-07, + "loss": 1.7152031660079956, + "step": 3974 + }, + { + "epoch": 1.4473971605387694, + "grad_norm": 18.875, + "learning_rate": 6.28539163925063e-07, + "loss": 1.4015365839004517, + "step": 3976 + }, + { + "epoch": 1.448125227520932, + "grad_norm": 12.125, + "learning_rate": 6.282319788006382e-07, + "loss": 1.2811528444290161, + "step": 3978 + }, + { + "epoch": 1.4488532945030943, + "grad_norm": 41.25, + "learning_rate": 6.279247769416545e-07, + "loss": 1.4580905437469482, + "step": 3980 + }, + { + "epoch": 1.4495813614852566, + "grad_norm": 28.0, + "learning_rate": 6.276175585302057e-07, + "loss": 1.2499969005584717, + "step": 3982 + }, + { + "epoch": 1.450309428467419, + "grad_norm": 11.25, + "learning_rate": 6.273103237483966e-07, + "loss": 1.3543986082077026, + "step": 3984 + }, + { + "epoch": 1.4510374954495813, + "grad_norm": 17.125, + "learning_rate": 6.270030727783408e-07, + "loss": 1.4321038722991943, + "step": 3986 + }, + { + "epoch": 1.4517655624317438, + "grad_norm": 16.625, + "learning_rate": 6.266958058021623e-07, + "loss": 1.352935791015625, + "step": 3988 + }, + { + "epoch": 1.4524936294139061, + "grad_norm": 12.875, + "learning_rate": 6.26388523001994e-07, + "loss": 1.2796860933303833, + "step": 3990 + }, + { + "epoch": 1.4532216963960685, + "grad_norm": 51.75, + "learning_rate": 6.260812245599784e-07, + "loss": 1.2445205450057983, + "step": 3992 + }, + { + "epoch": 1.4539497633782308, + "grad_norm": 11.9375, + "learning_rate": 6.257739106582672e-07, + "loss": 1.2376993894577026, + "step": 3994 + }, + { + "epoch": 1.454677830360393, + "grad_norm": 10.9375, + "learning_rate": 6.254665814790214e-07, + "loss": 0.9253160953521729, + "step": 3996 + }, + { + "epoch": 1.4554058973425554, + "grad_norm": 26.625, + "learning_rate": 6.251592372044105e-07, + "loss": 1.5481733083724976, + "step": 3998 + }, + { + "epoch": 1.4561339643247178, + "grad_norm": 22.875, + "learning_rate": 6.248518780166143e-07, + "loss": 1.36992609500885, + "step": 4000 + }, + { + "epoch": 1.4568620313068803, + "grad_norm": 19.5, + "learning_rate": 6.245445040978198e-07, + "loss": 1.6849555969238281, + "step": 4002 + }, + { + "epoch": 1.4575900982890426, + "grad_norm": 13.625, + "learning_rate": 6.242371156302237e-07, + "loss": 1.500856876373291, + "step": 4004 + }, + { + "epoch": 1.458318165271205, + "grad_norm": 4.4375, + "learning_rate": 6.23929712796031e-07, + "loss": 1.2027935981750488, + "step": 4006 + }, + { + "epoch": 1.4590462322533673, + "grad_norm": 13.125, + "learning_rate": 6.236222957774556e-07, + "loss": 1.4808398485183716, + "step": 4008 + }, + { + "epoch": 1.4597742992355296, + "grad_norm": 11.5, + "learning_rate": 6.233148647567193e-07, + "loss": 1.2671887874603271, + "step": 4010 + }, + { + "epoch": 1.4605023662176921, + "grad_norm": 24.875, + "learning_rate": 6.230074199160522e-07, + "loss": 1.0237560272216797, + "step": 4012 + }, + { + "epoch": 1.4612304331998545, + "grad_norm": 13.5625, + "learning_rate": 6.226999614376934e-07, + "loss": 1.5391194820404053, + "step": 4014 + }, + { + "epoch": 1.4619585001820168, + "grad_norm": 17.625, + "learning_rate": 6.223924895038891e-07, + "loss": 1.4633374214172363, + "step": 4016 + }, + { + "epoch": 1.462686567164179, + "grad_norm": 12.0625, + "learning_rate": 6.220850042968938e-07, + "loss": 1.3611488342285156, + "step": 4018 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 15.375, + "learning_rate": 6.217775059989703e-07, + "loss": 1.4329619407653809, + "step": 4020 + }, + { + "epoch": 1.4641427011285038, + "grad_norm": 18.0, + "learning_rate": 6.214699947923885e-07, + "loss": 1.4241976737976074, + "step": 4022 + }, + { + "epoch": 1.464870768110666, + "grad_norm": 6.28125, + "learning_rate": 6.211624708594263e-07, + "loss": 1.322940707206726, + "step": 4024 + }, + { + "epoch": 1.4655988350928286, + "grad_norm": 20.25, + "learning_rate": 6.208549343823693e-07, + "loss": 1.64707612991333, + "step": 4026 + }, + { + "epoch": 1.466326902074991, + "grad_norm": 12.625, + "learning_rate": 6.205473855435099e-07, + "loss": 1.2618904113769531, + "step": 4028 + }, + { + "epoch": 1.4670549690571533, + "grad_norm": 6.0, + "learning_rate": 6.20239824525149e-07, + "loss": 1.1907159090042114, + "step": 4030 + }, + { + "epoch": 1.4677830360393156, + "grad_norm": 10.0625, + "learning_rate": 6.199322515095932e-07, + "loss": 1.319791316986084, + "step": 4032 + }, + { + "epoch": 1.468511103021478, + "grad_norm": 23.375, + "learning_rate": 6.196246666791573e-07, + "loss": 1.4296483993530273, + "step": 4034 + }, + { + "epoch": 1.4692391700036405, + "grad_norm": 11.75, + "learning_rate": 6.193170702161629e-07, + "loss": 1.3877637386322021, + "step": 4036 + }, + { + "epoch": 1.4699672369858026, + "grad_norm": 15.5, + "learning_rate": 6.190094623029383e-07, + "loss": 1.1940171718597412, + "step": 4038 + }, + { + "epoch": 1.470695303967965, + "grad_norm": 29.375, + "learning_rate": 6.187018431218187e-07, + "loss": 1.1363303661346436, + "step": 4040 + }, + { + "epoch": 1.4714233709501274, + "grad_norm": 12.875, + "learning_rate": 6.183942128551459e-07, + "loss": 1.4611146450042725, + "step": 4042 + }, + { + "epoch": 1.4721514379322898, + "grad_norm": 4.5625, + "learning_rate": 6.180865716852683e-07, + "loss": 1.1247847080230713, + "step": 4044 + }, + { + "epoch": 1.472879504914452, + "grad_norm": 8.8125, + "learning_rate": 6.17778919794541e-07, + "loss": 1.3958759307861328, + "step": 4046 + }, + { + "epoch": 1.4736075718966144, + "grad_norm": 4.625, + "learning_rate": 6.174712573653249e-07, + "loss": 1.18114173412323, + "step": 4048 + }, + { + "epoch": 1.474335638878777, + "grad_norm": 13.125, + "learning_rate": 6.171635845799881e-07, + "loss": 0.6044919490814209, + "step": 4050 + }, + { + "epoch": 1.4750637058609393, + "grad_norm": 11.375, + "learning_rate": 6.168559016209034e-07, + "loss": 1.0543203353881836, + "step": 4052 + }, + { + "epoch": 1.4757917728431016, + "grad_norm": 12.5625, + "learning_rate": 6.165482086704509e-07, + "loss": 1.566157341003418, + "step": 4054 + }, + { + "epoch": 1.476519839825264, + "grad_norm": 4.46875, + "learning_rate": 6.162405059110163e-07, + "loss": 1.0936524868011475, + "step": 4056 + }, + { + "epoch": 1.4772479068074262, + "grad_norm": 15.3125, + "learning_rate": 6.159327935249908e-07, + "loss": 1.403761625289917, + "step": 4058 + }, + { + "epoch": 1.4779759737895886, + "grad_norm": 25.875, + "learning_rate": 6.156250716947716e-07, + "loss": 1.6006920337677002, + "step": 4060 + }, + { + "epoch": 1.4787040407717509, + "grad_norm": 6.21875, + "learning_rate": 6.153173406027612e-07, + "loss": 1.2054849863052368, + "step": 4062 + }, + { + "epoch": 1.4794321077539134, + "grad_norm": 13.8125, + "learning_rate": 6.150096004313681e-07, + "loss": 1.4369503259658813, + "step": 4064 + }, + { + "epoch": 1.4801601747360758, + "grad_norm": 9.0625, + "learning_rate": 6.147018513630058e-07, + "loss": 0.953758955001831, + "step": 4066 + }, + { + "epoch": 1.480888241718238, + "grad_norm": 8.875, + "learning_rate": 6.14394093580093e-07, + "loss": 1.3431429862976074, + "step": 4068 + }, + { + "epoch": 1.4816163087004004, + "grad_norm": 17.75, + "learning_rate": 6.140863272650537e-07, + "loss": 1.4058783054351807, + "step": 4070 + }, + { + "epoch": 1.4823443756825627, + "grad_norm": 22.125, + "learning_rate": 6.137785526003172e-07, + "loss": 1.3462706804275513, + "step": 4072 + }, + { + "epoch": 1.4830724426647253, + "grad_norm": 28.0, + "learning_rate": 6.134707697683172e-07, + "loss": 1.3559824228286743, + "step": 4074 + }, + { + "epoch": 1.4838005096468876, + "grad_norm": 19.25, + "learning_rate": 6.131629789514929e-07, + "loss": 1.3563416004180908, + "step": 4076 + }, + { + "epoch": 1.48452857662905, + "grad_norm": 12.4375, + "learning_rate": 6.128551803322877e-07, + "loss": 1.371191143989563, + "step": 4078 + }, + { + "epoch": 1.4852566436112122, + "grad_norm": 28.625, + "learning_rate": 6.125473740931498e-07, + "loss": 1.5291110277175903, + "step": 4080 + }, + { + "epoch": 1.4859847105933746, + "grad_norm": 16.5, + "learning_rate": 6.12239560416532e-07, + "loss": 1.5274462699890137, + "step": 4082 + }, + { + "epoch": 1.4867127775755369, + "grad_norm": 12.5, + "learning_rate": 6.119317394848914e-07, + "loss": 1.406619906425476, + "step": 4084 + }, + { + "epoch": 1.4874408445576992, + "grad_norm": 14.9375, + "learning_rate": 6.116239114806896e-07, + "loss": 1.3195240497589111, + "step": 4086 + }, + { + "epoch": 1.4881689115398617, + "grad_norm": 13.875, + "learning_rate": 6.11316076586392e-07, + "loss": 1.057878017425537, + "step": 4088 + }, + { + "epoch": 1.488896978522024, + "grad_norm": 23.375, + "learning_rate": 6.110082349844685e-07, + "loss": 1.3967978954315186, + "step": 4090 + }, + { + "epoch": 1.4896250455041864, + "grad_norm": 17.5, + "learning_rate": 6.107003868573925e-07, + "loss": 1.6329823732376099, + "step": 4092 + }, + { + "epoch": 1.4903531124863487, + "grad_norm": 29.5, + "learning_rate": 6.10392532387642e-07, + "loss": 1.027433156967163, + "step": 4094 + }, + { + "epoch": 1.491081179468511, + "grad_norm": 6.8125, + "learning_rate": 6.10084671757698e-07, + "loss": 1.188098430633545, + "step": 4096 + }, + { + "epoch": 1.4918092464506736, + "grad_norm": 8.25, + "learning_rate": 6.097768051500459e-07, + "loss": 1.4085434675216675, + "step": 4098 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 16.375, + "learning_rate": 6.094689327471736e-07, + "loss": 1.3444855213165283, + "step": 4100 + }, + { + "epoch": 1.4932653804149982, + "grad_norm": 6.8125, + "learning_rate": 6.091610547315734e-07, + "loss": 1.3068437576293945, + "step": 4102 + }, + { + "epoch": 1.4939934473971606, + "grad_norm": 28.0, + "learning_rate": 6.088531712857405e-07, + "loss": 1.395565390586853, + "step": 4104 + }, + { + "epoch": 1.4947215143793229, + "grad_norm": 24.125, + "learning_rate": 6.085452825921736e-07, + "loss": 1.0218948125839233, + "step": 4106 + }, + { + "epoch": 1.4954495813614852, + "grad_norm": 11.25, + "learning_rate": 6.082373888333741e-07, + "loss": 1.3491613864898682, + "step": 4108 + }, + { + "epoch": 1.4961776483436475, + "grad_norm": 10.0625, + "learning_rate": 6.079294901918466e-07, + "loss": 1.2713433504104614, + "step": 4110 + }, + { + "epoch": 1.49690571532581, + "grad_norm": 14.5625, + "learning_rate": 6.076215868500986e-07, + "loss": 1.588794469833374, + "step": 4112 + }, + { + "epoch": 1.4976337823079724, + "grad_norm": 6.25, + "learning_rate": 6.073136789906406e-07, + "loss": 1.1561927795410156, + "step": 4114 + }, + { + "epoch": 1.4983618492901347, + "grad_norm": 14.1875, + "learning_rate": 6.070057667959854e-07, + "loss": 1.3972256183624268, + "step": 4116 + }, + { + "epoch": 1.499089916272297, + "grad_norm": 11.5, + "learning_rate": 6.066978504486485e-07, + "loss": 1.6003105640411377, + "step": 4118 + }, + { + "epoch": 1.4998179832544594, + "grad_norm": 9.75, + "learning_rate": 6.063899301311482e-07, + "loss": 1.4745633602142334, + "step": 4120 + }, + { + "epoch": 1.500546050236622, + "grad_norm": 12.375, + "learning_rate": 6.060820060260044e-07, + "loss": 1.6261639595031738, + "step": 4122 + }, + { + "epoch": 1.501274117218784, + "grad_norm": 29.0, + "learning_rate": 6.057740783157399e-07, + "loss": 1.3089203834533691, + "step": 4124 + }, + { + "epoch": 1.5020021842009466, + "grad_norm": 56.25, + "learning_rate": 6.054661471828796e-07, + "loss": 1.6007089614868164, + "step": 4126 + }, + { + "epoch": 1.5027302511831089, + "grad_norm": 7.1875, + "learning_rate": 6.051582128099501e-07, + "loss": 1.2423182725906372, + "step": 4128 + }, + { + "epoch": 1.5034583181652712, + "grad_norm": 15.0, + "learning_rate": 6.048502753794801e-07, + "loss": 1.2513182163238525, + "step": 4130 + }, + { + "epoch": 1.5041863851474335, + "grad_norm": 6.59375, + "learning_rate": 6.045423350740002e-07, + "loss": 1.1914944648742676, + "step": 4132 + }, + { + "epoch": 1.5049144521295958, + "grad_norm": 17.125, + "learning_rate": 6.042343920760424e-07, + "loss": 1.5736831426620483, + "step": 4134 + }, + { + "epoch": 1.5056425191117584, + "grad_norm": 6.5, + "learning_rate": 6.039264465681408e-07, + "loss": 1.1519869565963745, + "step": 4136 + }, + { + "epoch": 1.5063705860939205, + "grad_norm": 36.25, + "learning_rate": 6.036184987328304e-07, + "loss": 1.3402732610702515, + "step": 4138 + }, + { + "epoch": 1.507098653076083, + "grad_norm": 6.71875, + "learning_rate": 6.03310548752648e-07, + "loss": 1.2350085973739624, + "step": 4140 + }, + { + "epoch": 1.5078267200582454, + "grad_norm": 12.5, + "learning_rate": 6.030025968101315e-07, + "loss": 1.3648707866668701, + "step": 4142 + }, + { + "epoch": 1.5085547870404077, + "grad_norm": 7.03125, + "learning_rate": 6.026946430878201e-07, + "loss": 1.405854344367981, + "step": 4144 + }, + { + "epoch": 1.5092828540225702, + "grad_norm": 14.125, + "learning_rate": 6.023866877682537e-07, + "loss": 1.4388325214385986, + "step": 4146 + }, + { + "epoch": 1.5100109210047323, + "grad_norm": 18.125, + "learning_rate": 6.020787310339737e-07, + "loss": 0.9963948726654053, + "step": 4148 + }, + { + "epoch": 1.5107389879868949, + "grad_norm": 18.25, + "learning_rate": 6.017707730675219e-07, + "loss": 1.410457968711853, + "step": 4150 + }, + { + "epoch": 1.5114670549690572, + "grad_norm": 6.0, + "learning_rate": 6.014628140514408e-07, + "loss": 1.3712738752365112, + "step": 4152 + }, + { + "epoch": 1.5121951219512195, + "grad_norm": 10.75, + "learning_rate": 6.011548541682742e-07, + "loss": 1.1067781448364258, + "step": 4154 + }, + { + "epoch": 1.5129231889333818, + "grad_norm": 14.375, + "learning_rate": 6.008468936005655e-07, + "loss": 1.335561990737915, + "step": 4156 + }, + { + "epoch": 1.5136512559155442, + "grad_norm": 17.5, + "learning_rate": 6.005389325308589e-07, + "loss": 1.9020140171051025, + "step": 4158 + }, + { + "epoch": 1.5143793228977067, + "grad_norm": 16.5, + "learning_rate": 6.002309711416991e-07, + "loss": 1.986282467842102, + "step": 4160 + }, + { + "epoch": 1.5151073898798688, + "grad_norm": 17.125, + "learning_rate": 5.999230096156307e-07, + "loss": 1.0174099206924438, + "step": 4162 + }, + { + "epoch": 1.5158354568620314, + "grad_norm": 11.0, + "learning_rate": 5.996150481351984e-07, + "loss": 1.4091817140579224, + "step": 4164 + }, + { + "epoch": 1.5165635238441937, + "grad_norm": 17.75, + "learning_rate": 5.993070868829473e-07, + "loss": 1.4867862462997437, + "step": 4166 + }, + { + "epoch": 1.517291590826356, + "grad_norm": 15.8125, + "learning_rate": 5.989991260414214e-07, + "loss": 1.2250216007232666, + "step": 4168 + }, + { + "epoch": 1.5180196578085183, + "grad_norm": 8.4375, + "learning_rate": 5.986911657931657e-07, + "loss": 1.3905556201934814, + "step": 4170 + }, + { + "epoch": 1.5187477247906807, + "grad_norm": 29.125, + "learning_rate": 5.98383206320724e-07, + "loss": 1.1927540302276611, + "step": 4172 + }, + { + "epoch": 1.5194757917728432, + "grad_norm": 6.4375, + "learning_rate": 5.980752478066397e-07, + "loss": 1.226758360862732, + "step": 4174 + }, + { + "epoch": 1.5202038587550053, + "grad_norm": 19.75, + "learning_rate": 5.977672904334559e-07, + "loss": 1.1725494861602783, + "step": 4176 + }, + { + "epoch": 1.5209319257371678, + "grad_norm": 19.25, + "learning_rate": 5.974593343837152e-07, + "loss": 1.5460450649261475, + "step": 4178 + }, + { + "epoch": 1.5216599927193302, + "grad_norm": 12.5625, + "learning_rate": 5.971513798399585e-07, + "loss": 1.6132973432540894, + "step": 4180 + }, + { + "epoch": 1.5223880597014925, + "grad_norm": 22.75, + "learning_rate": 5.968434269847271e-07, + "loss": 1.6883294582366943, + "step": 4182 + }, + { + "epoch": 1.523116126683655, + "grad_norm": 10.5, + "learning_rate": 5.965354760005605e-07, + "loss": 1.5315957069396973, + "step": 4184 + }, + { + "epoch": 1.5238441936658171, + "grad_norm": 17.75, + "learning_rate": 5.962275270699968e-07, + "loss": 1.2015408277511597, + "step": 4186 + }, + { + "epoch": 1.5245722606479797, + "grad_norm": 16.25, + "learning_rate": 5.959195803755736e-07, + "loss": 1.8356841802597046, + "step": 4188 + }, + { + "epoch": 1.525300327630142, + "grad_norm": 12.75, + "learning_rate": 5.956116360998273e-07, + "loss": 1.3111207485198975, + "step": 4190 + }, + { + "epoch": 1.5260283946123043, + "grad_norm": 16.75, + "learning_rate": 5.953036944252919e-07, + "loss": 1.4179209470748901, + "step": 4192 + }, + { + "epoch": 1.5267564615944667, + "grad_norm": 118.0, + "learning_rate": 5.949957555345004e-07, + "loss": 0.997340977191925, + "step": 4194 + }, + { + "epoch": 1.527484528576629, + "grad_norm": 5.34375, + "learning_rate": 5.946878196099845e-07, + "loss": 1.3055473566055298, + "step": 4196 + }, + { + "epoch": 1.5282125955587915, + "grad_norm": 11.8125, + "learning_rate": 5.943798868342736e-07, + "loss": 1.4962043762207031, + "step": 4198 + }, + { + "epoch": 1.5289406625409536, + "grad_norm": 34.0, + "learning_rate": 5.940719573898956e-07, + "loss": 1.6376724243164062, + "step": 4200 + }, + { + "epoch": 1.5296687295231162, + "grad_norm": 18.25, + "learning_rate": 5.937640314593761e-07, + "loss": 1.6778538227081299, + "step": 4202 + }, + { + "epoch": 1.5303967965052785, + "grad_norm": 14.25, + "learning_rate": 5.934561092252388e-07, + "loss": 1.4029226303100586, + "step": 4204 + }, + { + "epoch": 1.5311248634874408, + "grad_norm": 12.625, + "learning_rate": 5.931481908700053e-07, + "loss": 1.393689513206482, + "step": 4206 + }, + { + "epoch": 1.5318529304696034, + "grad_norm": 52.75, + "learning_rate": 5.928402765761946e-07, + "loss": 1.4990143775939941, + "step": 4208 + }, + { + "epoch": 1.5325809974517655, + "grad_norm": 5.15625, + "learning_rate": 5.925323665263235e-07, + "loss": 1.165509581565857, + "step": 4210 + }, + { + "epoch": 1.533309064433928, + "grad_norm": 24.625, + "learning_rate": 5.922244609029068e-07, + "loss": 0.3872649371623993, + "step": 4212 + }, + { + "epoch": 1.5340371314160903, + "grad_norm": 7.3125, + "learning_rate": 5.919165598884554e-07, + "loss": 0.8612807393074036, + "step": 4214 + }, + { + "epoch": 1.5347651983982527, + "grad_norm": 11.75, + "learning_rate": 5.916086636654787e-07, + "loss": 1.4135667085647583, + "step": 4216 + }, + { + "epoch": 1.535493265380415, + "grad_norm": 15.75, + "learning_rate": 5.913007724164826e-07, + "loss": 1.493973970413208, + "step": 4218 + }, + { + "epoch": 1.5362213323625773, + "grad_norm": 22.125, + "learning_rate": 5.909928863239702e-07, + "loss": 1.5033578872680664, + "step": 4220 + }, + { + "epoch": 1.5369493993447398, + "grad_norm": 24.625, + "learning_rate": 5.906850055704415e-07, + "loss": 1.4366812705993652, + "step": 4222 + }, + { + "epoch": 1.537677466326902, + "grad_norm": 13.5, + "learning_rate": 5.903771303383935e-07, + "loss": 1.4734852313995361, + "step": 4224 + }, + { + "epoch": 1.5384055333090645, + "grad_norm": 9.4375, + "learning_rate": 5.900692608103201e-07, + "loss": 1.5671709775924683, + "step": 4226 + }, + { + "epoch": 1.5391336002912268, + "grad_norm": 15.375, + "learning_rate": 5.89761397168711e-07, + "loss": 1.3559931516647339, + "step": 4228 + }, + { + "epoch": 1.5398616672733891, + "grad_norm": 6.9375, + "learning_rate": 5.894535395960533e-07, + "loss": 1.2686572074890137, + "step": 4230 + }, + { + "epoch": 1.5405897342555515, + "grad_norm": 34.25, + "learning_rate": 5.891456882748302e-07, + "loss": 1.4788247346878052, + "step": 4232 + }, + { + "epoch": 1.5413178012377138, + "grad_norm": 21.75, + "learning_rate": 5.888378433875211e-07, + "loss": 1.6718707084655762, + "step": 4234 + }, + { + "epoch": 1.5420458682198763, + "grad_norm": 13.625, + "learning_rate": 5.885300051166016e-07, + "loss": 1.488576889038086, + "step": 4236 + }, + { + "epoch": 1.5427739352020384, + "grad_norm": 19.0, + "learning_rate": 5.882221736445434e-07, + "loss": 1.612959384918213, + "step": 4238 + }, + { + "epoch": 1.543502002184201, + "grad_norm": 24.375, + "learning_rate": 5.879143491538141e-07, + "loss": 1.9990922212600708, + "step": 4240 + }, + { + "epoch": 1.5442300691663633, + "grad_norm": 10.5625, + "learning_rate": 5.876065318268777e-07, + "loss": 1.3185300827026367, + "step": 4242 + }, + { + "epoch": 1.5449581361485256, + "grad_norm": 11.625, + "learning_rate": 5.872987218461929e-07, + "loss": 1.2862370014190674, + "step": 4244 + }, + { + "epoch": 1.5456862031306882, + "grad_norm": 16.75, + "learning_rate": 5.869909193942151e-07, + "loss": 0.7837271690368652, + "step": 4246 + }, + { + "epoch": 1.5464142701128503, + "grad_norm": 9.3125, + "learning_rate": 5.866831246533946e-07, + "loss": 1.3873531818389893, + "step": 4248 + }, + { + "epoch": 1.5471423370950128, + "grad_norm": 16.0, + "learning_rate": 5.863753378061772e-07, + "loss": 1.4325916767120361, + "step": 4250 + }, + { + "epoch": 1.5478704040771751, + "grad_norm": 9.3125, + "learning_rate": 5.860675590350045e-07, + "loss": 1.0660045146942139, + "step": 4252 + }, + { + "epoch": 1.5485984710593375, + "grad_norm": 8.75, + "learning_rate": 5.857597885223127e-07, + "loss": 1.330763816833496, + "step": 4254 + }, + { + "epoch": 1.5493265380414998, + "grad_norm": 18.875, + "learning_rate": 5.854520264505332e-07, + "loss": 1.6252425909042358, + "step": 4256 + }, + { + "epoch": 1.550054605023662, + "grad_norm": 19.25, + "learning_rate": 5.85144273002093e-07, + "loss": 1.4018521308898926, + "step": 4258 + }, + { + "epoch": 1.5507826720058246, + "grad_norm": 15.875, + "learning_rate": 5.848365283594132e-07, + "loss": 1.4854384660720825, + "step": 4260 + }, + { + "epoch": 1.5515107389879867, + "grad_norm": 7.78125, + "learning_rate": 5.8452879270491e-07, + "loss": 1.1960992813110352, + "step": 4262 + }, + { + "epoch": 1.5522388059701493, + "grad_norm": 14.75, + "learning_rate": 5.842210662209947e-07, + "loss": 1.3604114055633545, + "step": 4264 + }, + { + "epoch": 1.5529668729523116, + "grad_norm": 15.125, + "learning_rate": 5.839133490900723e-07, + "loss": 1.4990696907043457, + "step": 4266 + }, + { + "epoch": 1.553694939934474, + "grad_norm": 12.9375, + "learning_rate": 5.83605641494543e-07, + "loss": 1.3202805519104004, + "step": 4268 + }, + { + "epoch": 1.5544230069166365, + "grad_norm": 9.5, + "learning_rate": 5.83297943616801e-07, + "loss": 0.9178682565689087, + "step": 4270 + }, + { + "epoch": 1.5551510738987986, + "grad_norm": 33.5, + "learning_rate": 5.829902556392346e-07, + "loss": 1.5506095886230469, + "step": 4272 + }, + { + "epoch": 1.5558791408809611, + "grad_norm": 10.0, + "learning_rate": 5.826825777442266e-07, + "loss": 1.2502272129058838, + "step": 4274 + }, + { + "epoch": 1.5566072078631235, + "grad_norm": 39.0, + "learning_rate": 5.823749101141539e-07, + "loss": 0.861689567565918, + "step": 4276 + }, + { + "epoch": 1.5573352748452858, + "grad_norm": 32.5, + "learning_rate": 5.820672529313866e-07, + "loss": 0.9460551738739014, + "step": 4278 + }, + { + "epoch": 1.558063341827448, + "grad_norm": 29.875, + "learning_rate": 5.817596063782892e-07, + "loss": 1.4976201057434082, + "step": 4280 + }, + { + "epoch": 1.5587914088096104, + "grad_norm": 11.5625, + "learning_rate": 5.814519706372202e-07, + "loss": 1.4365921020507812, + "step": 4282 + }, + { + "epoch": 1.559519475791773, + "grad_norm": 9.75, + "learning_rate": 5.811443458905307e-07, + "loss": 1.4735729694366455, + "step": 4284 + }, + { + "epoch": 1.560247542773935, + "grad_norm": 12.0625, + "learning_rate": 5.808367323205662e-07, + "loss": 1.7493886947631836, + "step": 4286 + }, + { + "epoch": 1.5609756097560976, + "grad_norm": 26.375, + "learning_rate": 5.805291301096653e-07, + "loss": 1.4209492206573486, + "step": 4288 + }, + { + "epoch": 1.56170367673826, + "grad_norm": 9.75, + "learning_rate": 5.802215394401597e-07, + "loss": 1.09389328956604, + "step": 4290 + }, + { + "epoch": 1.5624317437204223, + "grad_norm": 32.25, + "learning_rate": 5.799139604943742e-07, + "loss": 0.9795057773590088, + "step": 4292 + }, + { + "epoch": 1.5631598107025846, + "grad_norm": 5.84375, + "learning_rate": 5.796063934546268e-07, + "loss": 1.315716028213501, + "step": 4294 + }, + { + "epoch": 1.563887877684747, + "grad_norm": 14.75, + "learning_rate": 5.792988385032287e-07, + "loss": 1.3092536926269531, + "step": 4296 + }, + { + "epoch": 1.5646159446669095, + "grad_norm": 13.3125, + "learning_rate": 5.789912958224837e-07, + "loss": 1.2734240293502808, + "step": 4298 + }, + { + "epoch": 1.5653440116490716, + "grad_norm": 16.375, + "learning_rate": 5.786837655946877e-07, + "loss": 1.4170877933502197, + "step": 4300 + }, + { + "epoch": 1.566072078631234, + "grad_norm": 16.75, + "learning_rate": 5.783762480021306e-07, + "loss": 1.4484753608703613, + "step": 4302 + }, + { + "epoch": 1.5668001456133964, + "grad_norm": 8.5, + "learning_rate": 5.780687432270937e-07, + "loss": 1.412430763244629, + "step": 4304 + }, + { + "epoch": 1.5675282125955587, + "grad_norm": 21.875, + "learning_rate": 5.777612514518509e-07, + "loss": 1.4254083633422852, + "step": 4306 + }, + { + "epoch": 1.5682562795777213, + "grad_norm": 9.125, + "learning_rate": 5.774537728586688e-07, + "loss": 1.0924150943756104, + "step": 4308 + }, + { + "epoch": 1.5689843465598834, + "grad_norm": 5.3125, + "learning_rate": 5.77146307629806e-07, + "loss": 1.2442469596862793, + "step": 4310 + }, + { + "epoch": 1.569712413542046, + "grad_norm": 17.375, + "learning_rate": 5.768388559475127e-07, + "loss": 1.304983377456665, + "step": 4312 + }, + { + "epoch": 1.5704404805242083, + "grad_norm": 20.375, + "learning_rate": 5.765314179940317e-07, + "loss": 1.0672584772109985, + "step": 4314 + }, + { + "epoch": 1.5711685475063706, + "grad_norm": 14.25, + "learning_rate": 5.762239939515976e-07, + "loss": 1.570826768875122, + "step": 4316 + }, + { + "epoch": 1.571896614488533, + "grad_norm": 3.578125, + "learning_rate": 5.759165840024363e-07, + "loss": 1.4954659938812256, + "step": 4318 + }, + { + "epoch": 1.5726246814706952, + "grad_norm": 11.1875, + "learning_rate": 5.756091883287657e-07, + "loss": 1.1822288036346436, + "step": 4320 + }, + { + "epoch": 1.5733527484528578, + "grad_norm": 14.0625, + "learning_rate": 5.753018071127953e-07, + "loss": 1.5350130796432495, + "step": 4322 + }, + { + "epoch": 1.5740808154350199, + "grad_norm": 36.75, + "learning_rate": 5.749944405367259e-07, + "loss": 0.986802875995636, + "step": 4324 + }, + { + "epoch": 1.5748088824171824, + "grad_norm": 11.9375, + "learning_rate": 5.746870887827493e-07, + "loss": 1.1030707359313965, + "step": 4326 + }, + { + "epoch": 1.5755369493993447, + "grad_norm": 20.0, + "learning_rate": 5.743797520330493e-07, + "loss": 1.5319020748138428, + "step": 4328 + }, + { + "epoch": 1.576265016381507, + "grad_norm": 11.1875, + "learning_rate": 5.740724304698001e-07, + "loss": 1.0915346145629883, + "step": 4330 + }, + { + "epoch": 1.5769930833636696, + "grad_norm": 3.46875, + "learning_rate": 5.737651242751674e-07, + "loss": 1.3357970714569092, + "step": 4332 + }, + { + "epoch": 1.5777211503458317, + "grad_norm": 15.625, + "learning_rate": 5.734578336313067e-07, + "loss": 1.3509914875030518, + "step": 4334 + }, + { + "epoch": 1.5784492173279943, + "grad_norm": 13.8125, + "learning_rate": 5.731505587203661e-07, + "loss": 1.4590164422988892, + "step": 4336 + }, + { + "epoch": 1.5791772843101566, + "grad_norm": 16.0, + "learning_rate": 5.728432997244829e-07, + "loss": 1.3732280731201172, + "step": 4338 + }, + { + "epoch": 1.579905351292319, + "grad_norm": 6.28125, + "learning_rate": 5.725360568257857e-07, + "loss": 1.1733324527740479, + "step": 4340 + }, + { + "epoch": 1.5806334182744812, + "grad_norm": 55.0, + "learning_rate": 5.722288302063931e-07, + "loss": 1.2268296480178833, + "step": 4342 + }, + { + "epoch": 1.5813614852566436, + "grad_norm": 19.75, + "learning_rate": 5.719216200484144e-07, + "loss": 0.7952990531921387, + "step": 4344 + }, + { + "epoch": 1.582089552238806, + "grad_norm": 13.125, + "learning_rate": 5.71614426533949e-07, + "loss": 1.4817588329315186, + "step": 4346 + }, + { + "epoch": 1.5828176192209682, + "grad_norm": 19.125, + "learning_rate": 5.713072498450861e-07, + "loss": 1.3785014152526855, + "step": 4348 + }, + { + "epoch": 1.5835456862031307, + "grad_norm": 12.3125, + "learning_rate": 5.71000090163906e-07, + "loss": 1.4064472913742065, + "step": 4350 + }, + { + "epoch": 1.584273753185293, + "grad_norm": 25.0, + "learning_rate": 5.706929476724776e-07, + "loss": 1.5639386177062988, + "step": 4352 + }, + { + "epoch": 1.5850018201674554, + "grad_norm": 16.375, + "learning_rate": 5.703858225528601e-07, + "loss": 1.6902164220809937, + "step": 4354 + }, + { + "epoch": 1.5857298871496177, + "grad_norm": 12.4375, + "learning_rate": 5.700787149871029e-07, + "loss": 1.4350606203079224, + "step": 4356 + }, + { + "epoch": 1.58645795413178, + "grad_norm": 14.0625, + "learning_rate": 5.697716251572444e-07, + "loss": 1.3333762884140015, + "step": 4358 + }, + { + "epoch": 1.5871860211139426, + "grad_norm": 19.125, + "learning_rate": 5.694645532453128e-07, + "loss": 1.4667965173721313, + "step": 4360 + }, + { + "epoch": 1.5879140880961047, + "grad_norm": 24.625, + "learning_rate": 5.691574994333255e-07, + "loss": 1.7051572799682617, + "step": 4362 + }, + { + "epoch": 1.5886421550782672, + "grad_norm": 10.6875, + "learning_rate": 5.688504639032889e-07, + "loss": 1.4676733016967773, + "step": 4364 + }, + { + "epoch": 1.5893702220604295, + "grad_norm": 29.625, + "learning_rate": 5.685434468371995e-07, + "loss": 1.2310655117034912, + "step": 4366 + }, + { + "epoch": 1.5900982890425919, + "grad_norm": 13.875, + "learning_rate": 5.682364484170421e-07, + "loss": 1.2109506130218506, + "step": 4368 + }, + { + "epoch": 1.5908263560247544, + "grad_norm": 8.875, + "learning_rate": 5.679294688247902e-07, + "loss": 0.8470150232315063, + "step": 4370 + }, + { + "epoch": 1.5915544230069165, + "grad_norm": 13.3125, + "learning_rate": 5.676225082424068e-07, + "loss": 1.5422285795211792, + "step": 4372 + }, + { + "epoch": 1.592282489989079, + "grad_norm": 19.5, + "learning_rate": 5.673155668518435e-07, + "loss": 1.6503697633743286, + "step": 4374 + }, + { + "epoch": 1.5930105569712414, + "grad_norm": 14.8125, + "learning_rate": 5.670086448350402e-07, + "loss": 1.7161517143249512, + "step": 4376 + }, + { + "epoch": 1.5937386239534037, + "grad_norm": 37.25, + "learning_rate": 5.667017423739257e-07, + "loss": 1.4237356185913086, + "step": 4378 + }, + { + "epoch": 1.594466690935566, + "grad_norm": 23.625, + "learning_rate": 5.663948596504168e-07, + "loss": 0.8836872577667236, + "step": 4380 + }, + { + "epoch": 1.5951947579177284, + "grad_norm": 6.53125, + "learning_rate": 5.660879968464187e-07, + "loss": 1.3664089441299438, + "step": 4382 + }, + { + "epoch": 1.595922824899891, + "grad_norm": 18.25, + "learning_rate": 5.657811541438255e-07, + "loss": 1.6608195304870605, + "step": 4384 + }, + { + "epoch": 1.596650891882053, + "grad_norm": 6.40625, + "learning_rate": 5.654743317245181e-07, + "loss": 1.0337456464767456, + "step": 4386 + }, + { + "epoch": 1.5973789588642155, + "grad_norm": 15.125, + "learning_rate": 5.651675297703667e-07, + "loss": 1.5579779148101807, + "step": 4388 + }, + { + "epoch": 1.5981070258463779, + "grad_norm": 19.75, + "learning_rate": 5.648607484632281e-07, + "loss": 1.2693969011306763, + "step": 4390 + }, + { + "epoch": 1.5988350928285402, + "grad_norm": 20.375, + "learning_rate": 5.645539879849481e-07, + "loss": 1.3188320398330688, + "step": 4392 + }, + { + "epoch": 1.5995631598107027, + "grad_norm": 17.25, + "learning_rate": 5.642472485173593e-07, + "loss": 1.3863551616668701, + "step": 4394 + }, + { + "epoch": 1.6002912267928648, + "grad_norm": 11.25, + "learning_rate": 5.63940530242282e-07, + "loss": 1.4948256015777588, + "step": 4396 + }, + { + "epoch": 1.6010192937750274, + "grad_norm": 5.3125, + "learning_rate": 5.63633833341524e-07, + "loss": 1.2730497121810913, + "step": 4398 + }, + { + "epoch": 1.6017473607571897, + "grad_norm": 8.1875, + "learning_rate": 5.633271579968809e-07, + "loss": 1.2641348838806152, + "step": 4400 + }, + { + "epoch": 1.602475427739352, + "grad_norm": 13.0, + "learning_rate": 5.630205043901348e-07, + "loss": 1.4917261600494385, + "step": 4402 + }, + { + "epoch": 1.6032034947215144, + "grad_norm": 19.625, + "learning_rate": 5.627138727030552e-07, + "loss": 1.6723511219024658, + "step": 4404 + }, + { + "epoch": 1.6039315617036767, + "grad_norm": 8.25, + "learning_rate": 5.624072631173984e-07, + "loss": 0.9682920575141907, + "step": 4406 + }, + { + "epoch": 1.6046596286858392, + "grad_norm": 10.0, + "learning_rate": 5.621006758149083e-07, + "loss": 1.3120651245117188, + "step": 4408 + }, + { + "epoch": 1.6053876956680013, + "grad_norm": 12.0, + "learning_rate": 5.617941109773148e-07, + "loss": 1.3957972526550293, + "step": 4410 + }, + { + "epoch": 1.6061157626501639, + "grad_norm": 11.9375, + "learning_rate": 5.614875687863347e-07, + "loss": 1.6103792190551758, + "step": 4412 + }, + { + "epoch": 1.6068438296323262, + "grad_norm": 20.625, + "learning_rate": 5.611810494236719e-07, + "loss": 2.062405824661255, + "step": 4414 + }, + { + "epoch": 1.6075718966144885, + "grad_norm": 24.75, + "learning_rate": 5.608745530710162e-07, + "loss": 1.4262094497680664, + "step": 4416 + }, + { + "epoch": 1.6082999635966508, + "grad_norm": 24.875, + "learning_rate": 5.605680799100435e-07, + "loss": 1.7564465999603271, + "step": 4418 + }, + { + "epoch": 1.6090280305788132, + "grad_norm": 5.4375, + "learning_rate": 5.602616301224166e-07, + "loss": 0.9250645637512207, + "step": 4420 + }, + { + "epoch": 1.6097560975609757, + "grad_norm": 24.625, + "learning_rate": 5.599552038897842e-07, + "loss": 1.0951513051986694, + "step": 4422 + }, + { + "epoch": 1.6104841645431378, + "grad_norm": 11.0625, + "learning_rate": 5.596488013937813e-07, + "loss": 1.5317528247833252, + "step": 4424 + }, + { + "epoch": 1.6112122315253004, + "grad_norm": 20.375, + "learning_rate": 5.59342422816028e-07, + "loss": 1.2601919174194336, + "step": 4426 + }, + { + "epoch": 1.6119402985074627, + "grad_norm": 78.5, + "learning_rate": 5.590360683381313e-07, + "loss": 1.4187424182891846, + "step": 4428 + }, + { + "epoch": 1.612668365489625, + "grad_norm": 9.5, + "learning_rate": 5.587297381416831e-07, + "loss": 1.4520632028579712, + "step": 4430 + }, + { + "epoch": 1.6133964324717875, + "grad_norm": 15.125, + "learning_rate": 5.584234324082611e-07, + "loss": 1.4374263286590576, + "step": 4432 + }, + { + "epoch": 1.6141244994539496, + "grad_norm": 33.5, + "learning_rate": 5.581171513194289e-07, + "loss": 1.509294033050537, + "step": 4434 + }, + { + "epoch": 1.6148525664361122, + "grad_norm": 3.78125, + "learning_rate": 5.578108950567353e-07, + "loss": 1.210991382598877, + "step": 4436 + }, + { + "epoch": 1.6155806334182745, + "grad_norm": 14.9375, + "learning_rate": 5.57504663801714e-07, + "loss": 1.047816514968872, + "step": 4438 + }, + { + "epoch": 1.6163087004004368, + "grad_norm": 16.875, + "learning_rate": 5.571984577358842e-07, + "loss": 1.6130690574645996, + "step": 4440 + }, + { + "epoch": 1.6170367673825992, + "grad_norm": 20.375, + "learning_rate": 5.568922770407504e-07, + "loss": 1.486649990081787, + "step": 4442 + }, + { + "epoch": 1.6177648343647615, + "grad_norm": 12.0, + "learning_rate": 5.565861218978014e-07, + "loss": 1.61936616897583, + "step": 4444 + }, + { + "epoch": 1.618492901346924, + "grad_norm": 7.15625, + "learning_rate": 5.562799924885113e-07, + "loss": 0.9443035125732422, + "step": 4446 + }, + { + "epoch": 1.6192209683290861, + "grad_norm": 10.5, + "learning_rate": 5.559738889943393e-07, + "loss": 1.236096978187561, + "step": 4448 + }, + { + "epoch": 1.6199490353112487, + "grad_norm": 3.859375, + "learning_rate": 5.556678115967284e-07, + "loss": 1.2083079814910889, + "step": 4450 + }, + { + "epoch": 1.620677102293411, + "grad_norm": 14.0625, + "learning_rate": 5.553617604771064e-07, + "loss": 1.1052048206329346, + "step": 4452 + }, + { + "epoch": 1.6214051692755733, + "grad_norm": 10.8125, + "learning_rate": 5.550557358168863e-07, + "loss": 1.2193975448608398, + "step": 4454 + }, + { + "epoch": 1.6221332362577359, + "grad_norm": 7.125, + "learning_rate": 5.547497377974644e-07, + "loss": 1.3210954666137695, + "step": 4456 + }, + { + "epoch": 1.622861303239898, + "grad_norm": 8.1875, + "learning_rate": 5.544437666002219e-07, + "loss": 1.0203919410705566, + "step": 4458 + }, + { + "epoch": 1.6235893702220605, + "grad_norm": 10.4375, + "learning_rate": 5.541378224065234e-07, + "loss": 1.4835970401763916, + "step": 4460 + }, + { + "epoch": 1.6243174372042228, + "grad_norm": 468.0, + "learning_rate": 5.538319053977181e-07, + "loss": 1.2525068521499634, + "step": 4462 + }, + { + "epoch": 1.6250455041863852, + "grad_norm": 11.25, + "learning_rate": 5.53526015755139e-07, + "loss": 1.1622049808502197, + "step": 4464 + }, + { + "epoch": 1.6257735711685475, + "grad_norm": 16.625, + "learning_rate": 5.532201536601026e-07, + "loss": 1.634397029876709, + "step": 4466 + }, + { + "epoch": 1.6265016381507098, + "grad_norm": 18.0, + "learning_rate": 5.529143192939092e-07, + "loss": 1.6307299137115479, + "step": 4468 + }, + { + "epoch": 1.6272297051328724, + "grad_norm": 8.75, + "learning_rate": 5.52608512837843e-07, + "loss": 1.4044524431228638, + "step": 4470 + }, + { + "epoch": 1.6279577721150345, + "grad_norm": 17.375, + "learning_rate": 5.523027344731712e-07, + "loss": 1.0537914037704468, + "step": 4472 + }, + { + "epoch": 1.628685839097197, + "grad_norm": 12.625, + "learning_rate": 5.519969843811443e-07, + "loss": 1.2850558757781982, + "step": 4474 + }, + { + "epoch": 1.6294139060793593, + "grad_norm": 15.9375, + "learning_rate": 5.516912627429966e-07, + "loss": 1.400985598564148, + "step": 4476 + }, + { + "epoch": 1.6301419730615216, + "grad_norm": 37.5, + "learning_rate": 5.513855697399449e-07, + "loss": 1.346358060836792, + "step": 4478 + }, + { + "epoch": 1.630870040043684, + "grad_norm": 12.75, + "learning_rate": 5.510799055531894e-07, + "loss": 1.3743152618408203, + "step": 4480 + }, + { + "epoch": 1.6315981070258463, + "grad_norm": 10.8125, + "learning_rate": 5.507742703639132e-07, + "loss": 1.226538062095642, + "step": 4482 + }, + { + "epoch": 1.6323261740080088, + "grad_norm": 20.125, + "learning_rate": 5.504686643532821e-07, + "loss": 1.6790469884872437, + "step": 4484 + }, + { + "epoch": 1.633054240990171, + "grad_norm": 13.3125, + "learning_rate": 5.501630877024446e-07, + "loss": 1.6266311407089233, + "step": 4486 + }, + { + "epoch": 1.6337823079723335, + "grad_norm": 13.0, + "learning_rate": 5.498575405925319e-07, + "loss": 1.4359632730484009, + "step": 4488 + }, + { + "epoch": 1.6345103749544958, + "grad_norm": 7.9375, + "learning_rate": 5.495520232046576e-07, + "loss": 1.213646650314331, + "step": 4490 + }, + { + "epoch": 1.6352384419366581, + "grad_norm": 26.25, + "learning_rate": 5.492465357199178e-07, + "loss": 1.0491834878921509, + "step": 4492 + }, + { + "epoch": 1.6359665089188207, + "grad_norm": 3.734375, + "learning_rate": 5.489410783193906e-07, + "loss": 1.2411267757415771, + "step": 4494 + }, + { + "epoch": 1.6366945759009828, + "grad_norm": 11.0, + "learning_rate": 5.486356511841366e-07, + "loss": 1.3546173572540283, + "step": 4496 + }, + { + "epoch": 1.6374226428831453, + "grad_norm": 9.25, + "learning_rate": 5.483302544951984e-07, + "loss": 1.2503242492675781, + "step": 4498 + }, + { + "epoch": 1.6381507098653076, + "grad_norm": 10.4375, + "learning_rate": 5.480248884336004e-07, + "loss": 1.3471622467041016, + "step": 4500 + }, + { + "epoch": 1.63887877684747, + "grad_norm": 9.0625, + "learning_rate": 5.477195531803487e-07, + "loss": 1.3567955493927002, + "step": 4502 + }, + { + "epoch": 1.6396068438296323, + "grad_norm": 14.875, + "learning_rate": 5.474142489164317e-07, + "loss": 1.338493824005127, + "step": 4504 + }, + { + "epoch": 1.6403349108117946, + "grad_norm": 21.5, + "learning_rate": 5.471089758228187e-07, + "loss": 1.509122610092163, + "step": 4506 + }, + { + "epoch": 1.6410629777939572, + "grad_norm": 21.0, + "learning_rate": 5.468037340804612e-07, + "loss": 1.5288268327713013, + "step": 4508 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 7.28125, + "learning_rate": 5.46498523870292e-07, + "loss": 1.415841817855835, + "step": 4510 + }, + { + "epoch": 1.6425191117582818, + "grad_norm": 23.0, + "learning_rate": 5.461933453732244e-07, + "loss": 1.3760550022125244, + "step": 4512 + }, + { + "epoch": 1.6432471787404441, + "grad_norm": 13.75, + "learning_rate": 5.45888198770154e-07, + "loss": 1.1817827224731445, + "step": 4514 + }, + { + "epoch": 1.6439752457226064, + "grad_norm": 21.875, + "learning_rate": 5.455830842419571e-07, + "loss": 1.48640775680542, + "step": 4516 + }, + { + "epoch": 1.644703312704769, + "grad_norm": 12.4375, + "learning_rate": 5.452780019694907e-07, + "loss": 1.4830236434936523, + "step": 4518 + }, + { + "epoch": 1.645431379686931, + "grad_norm": 7.96875, + "learning_rate": 5.44972952133593e-07, + "loss": 1.3118398189544678, + "step": 4520 + }, + { + "epoch": 1.6461594466690936, + "grad_norm": 14.0625, + "learning_rate": 5.446679349150829e-07, + "loss": 1.525355339050293, + "step": 4522 + }, + { + "epoch": 1.646887513651256, + "grad_norm": 11.1875, + "learning_rate": 5.443629504947596e-07, + "loss": 1.4672331809997559, + "step": 4524 + }, + { + "epoch": 1.6476155806334183, + "grad_norm": 5.875, + "learning_rate": 5.440579990534035e-07, + "loss": 1.2221566438674927, + "step": 4526 + }, + { + "epoch": 1.6483436476155806, + "grad_norm": 10.4375, + "learning_rate": 5.437530807717754e-07, + "loss": 0.8971381783485413, + "step": 4528 + }, + { + "epoch": 1.649071714597743, + "grad_norm": 19.875, + "learning_rate": 5.434481958306157e-07, + "loss": 1.1867766380310059, + "step": 4530 + }, + { + "epoch": 1.6497997815799055, + "grad_norm": 9.125, + "learning_rate": 5.431433444106456e-07, + "loss": 1.3045543432235718, + "step": 4532 + }, + { + "epoch": 1.6505278485620676, + "grad_norm": 7.5625, + "learning_rate": 5.428385266925671e-07, + "loss": 1.0093860626220703, + "step": 4534 + }, + { + "epoch": 1.6512559155442301, + "grad_norm": 12.625, + "learning_rate": 5.425337428570604e-07, + "loss": 1.5077139139175415, + "step": 4536 + }, + { + "epoch": 1.6519839825263924, + "grad_norm": 27.25, + "learning_rate": 5.422289930847872e-07, + "loss": 1.7245657444000244, + "step": 4538 + }, + { + "epoch": 1.6527120495085548, + "grad_norm": 13.75, + "learning_rate": 5.419242775563885e-07, + "loss": 1.3472990989685059, + "step": 4540 + }, + { + "epoch": 1.653440116490717, + "grad_norm": 15.25, + "learning_rate": 5.416195964524852e-07, + "loss": 1.573764681816101, + "step": 4542 + }, + { + "epoch": 1.6541681834728794, + "grad_norm": 21.5, + "learning_rate": 5.413149499536773e-07, + "loss": 1.5780576467514038, + "step": 4544 + }, + { + "epoch": 1.654896250455042, + "grad_norm": 9.6875, + "learning_rate": 5.410103382405447e-07, + "loss": 1.2108266353607178, + "step": 4546 + }, + { + "epoch": 1.655624317437204, + "grad_norm": 9.3125, + "learning_rate": 5.407057614936466e-07, + "loss": 1.2939231395721436, + "step": 4548 + }, + { + "epoch": 1.6563523844193666, + "grad_norm": 23.375, + "learning_rate": 5.404012198935215e-07, + "loss": 1.4897058010101318, + "step": 4550 + }, + { + "epoch": 1.657080451401529, + "grad_norm": 11.5625, + "learning_rate": 5.400967136206866e-07, + "loss": 1.5886950492858887, + "step": 4552 + }, + { + "epoch": 1.6578085183836913, + "grad_norm": 44.0, + "learning_rate": 5.397922428556391e-07, + "loss": 0.9262874722480774, + "step": 4554 + }, + { + "epoch": 1.6585365853658538, + "grad_norm": 14.25, + "learning_rate": 5.394878077788546e-07, + "loss": 1.5272457599639893, + "step": 4556 + }, + { + "epoch": 1.659264652348016, + "grad_norm": 4.375, + "learning_rate": 5.391834085707874e-07, + "loss": 0.9048165678977966, + "step": 4558 + }, + { + "epoch": 1.6599927193301784, + "grad_norm": 27.75, + "learning_rate": 5.388790454118703e-07, + "loss": 1.5270885229110718, + "step": 4560 + }, + { + "epoch": 1.6607207863123408, + "grad_norm": 11.75, + "learning_rate": 5.38574718482516e-07, + "loss": 1.52750825881958, + "step": 4562 + }, + { + "epoch": 1.661448853294503, + "grad_norm": 12.75, + "learning_rate": 5.382704279631142e-07, + "loss": 1.6246002912521362, + "step": 4564 + }, + { + "epoch": 1.6621769202766654, + "grad_norm": 10.8125, + "learning_rate": 5.379661740340341e-07, + "loss": 1.6588307619094849, + "step": 4566 + }, + { + "epoch": 1.6629049872588277, + "grad_norm": 11.3125, + "learning_rate": 5.376619568756227e-07, + "loss": 1.7106444835662842, + "step": 4568 + }, + { + "epoch": 1.6636330542409903, + "grad_norm": 15.5, + "learning_rate": 5.373577766682049e-07, + "loss": 1.0492030382156372, + "step": 4570 + }, + { + "epoch": 1.6643611212231524, + "grad_norm": 18.5, + "learning_rate": 5.370536335920846e-07, + "loss": 1.749925136566162, + "step": 4572 + }, + { + "epoch": 1.665089188205315, + "grad_norm": 18.25, + "learning_rate": 5.36749527827543e-07, + "loss": 1.4761316776275635, + "step": 4574 + }, + { + "epoch": 1.6658172551874773, + "grad_norm": 23.375, + "learning_rate": 5.364454595548393e-07, + "loss": 1.6586060523986816, + "step": 4576 + }, + { + "epoch": 1.6665453221696396, + "grad_norm": 16.125, + "learning_rate": 5.361414289542105e-07, + "loss": 1.3006761074066162, + "step": 4578 + }, + { + "epoch": 1.6672733891518021, + "grad_norm": 10.375, + "learning_rate": 5.358374362058718e-07, + "loss": 1.2601702213287354, + "step": 4580 + }, + { + "epoch": 1.6680014561339642, + "grad_norm": 18.375, + "learning_rate": 5.355334814900148e-07, + "loss": 1.8648641109466553, + "step": 4582 + }, + { + "epoch": 1.6687295231161268, + "grad_norm": 12.0, + "learning_rate": 5.352295649868098e-07, + "loss": 1.2928123474121094, + "step": 4584 + }, + { + "epoch": 1.669457590098289, + "grad_norm": 19.75, + "learning_rate": 5.349256868764033e-07, + "loss": 1.367138147354126, + "step": 4586 + }, + { + "epoch": 1.6701856570804514, + "grad_norm": 18.0, + "learning_rate": 5.346218473389203e-07, + "loss": 1.5187242031097412, + "step": 4588 + }, + { + "epoch": 1.6709137240626137, + "grad_norm": 20.5, + "learning_rate": 5.343180465544619e-07, + "loss": 1.942753553390503, + "step": 4590 + }, + { + "epoch": 1.671641791044776, + "grad_norm": 73.5, + "learning_rate": 5.340142847031066e-07, + "loss": 1.7075425386428833, + "step": 4592 + }, + { + "epoch": 1.6723698580269386, + "grad_norm": 11.125, + "learning_rate": 5.337105619649096e-07, + "loss": 1.1258366107940674, + "step": 4594 + }, + { + "epoch": 1.6730979250091007, + "grad_norm": 13.625, + "learning_rate": 5.334068785199038e-07, + "loss": 1.1058069467544556, + "step": 4596 + }, + { + "epoch": 1.6738259919912633, + "grad_norm": 24.125, + "learning_rate": 5.331032345480975e-07, + "loss": 1.3908824920654297, + "step": 4598 + }, + { + "epoch": 1.6745540589734256, + "grad_norm": 16.625, + "learning_rate": 5.327996302294764e-07, + "loss": 1.0268142223358154, + "step": 4600 + }, + { + "epoch": 1.675282125955588, + "grad_norm": 9.1875, + "learning_rate": 5.324960657440027e-07, + "loss": 1.6549043655395508, + "step": 4602 + }, + { + "epoch": 1.6760101929377502, + "grad_norm": 13.1875, + "learning_rate": 5.321925412716146e-07, + "loss": 1.0583324432373047, + "step": 4604 + }, + { + "epoch": 1.6767382599199125, + "grad_norm": 16.625, + "learning_rate": 5.31889056992227e-07, + "loss": 1.0769517421722412, + "step": 4606 + }, + { + "epoch": 1.677466326902075, + "grad_norm": 50.5, + "learning_rate": 5.315856130857309e-07, + "loss": 1.8755528926849365, + "step": 4608 + }, + { + "epoch": 1.6781943938842372, + "grad_norm": 9.0625, + "learning_rate": 5.312822097319929e-07, + "loss": 1.235159158706665, + "step": 4610 + }, + { + "epoch": 1.6789224608663997, + "grad_norm": 10.625, + "learning_rate": 5.309788471108562e-07, + "loss": 1.498553991317749, + "step": 4612 + }, + { + "epoch": 1.679650527848562, + "grad_norm": 9.125, + "learning_rate": 5.306755254021395e-07, + "loss": 1.2686576843261719, + "step": 4614 + }, + { + "epoch": 1.6803785948307244, + "grad_norm": 24.125, + "learning_rate": 5.303722447856372e-07, + "loss": 0.9211684465408325, + "step": 4616 + }, + { + "epoch": 1.681106661812887, + "grad_norm": 12.5625, + "learning_rate": 5.300690054411194e-07, + "loss": 1.0011937618255615, + "step": 4618 + }, + { + "epoch": 1.681834728795049, + "grad_norm": 7.5, + "learning_rate": 5.297658075483322e-07, + "loss": 1.2802972793579102, + "step": 4620 + }, + { + "epoch": 1.6825627957772116, + "grad_norm": 6.65625, + "learning_rate": 5.294626512869964e-07, + "loss": 1.2722283601760864, + "step": 4622 + }, + { + "epoch": 1.683290862759374, + "grad_norm": 11.75, + "learning_rate": 5.291595368368084e-07, + "loss": 1.366947889328003, + "step": 4624 + }, + { + "epoch": 1.6840189297415362, + "grad_norm": 21.25, + "learning_rate": 5.288564643774404e-07, + "loss": 1.4054360389709473, + "step": 4626 + }, + { + "epoch": 1.6847469967236985, + "grad_norm": 5.65625, + "learning_rate": 5.285534340885384e-07, + "loss": 1.1707695722579956, + "step": 4628 + }, + { + "epoch": 1.6854750637058609, + "grad_norm": 9.5, + "learning_rate": 5.282504461497248e-07, + "loss": 1.259289026260376, + "step": 4630 + }, + { + "epoch": 1.6862031306880234, + "grad_norm": 22.125, + "learning_rate": 5.279475007405958e-07, + "loss": 1.4635331630706787, + "step": 4632 + }, + { + "epoch": 1.6869311976701855, + "grad_norm": 9.875, + "learning_rate": 5.276445980407232e-07, + "loss": 1.4942362308502197, + "step": 4634 + }, + { + "epoch": 1.687659264652348, + "grad_norm": 15.0625, + "learning_rate": 5.273417382296532e-07, + "loss": 1.2410290241241455, + "step": 4636 + }, + { + "epoch": 1.6883873316345104, + "grad_norm": 23.0, + "learning_rate": 5.270389214869062e-07, + "loss": 1.587583303451538, + "step": 4638 + }, + { + "epoch": 1.6891153986166727, + "grad_norm": 12.8125, + "learning_rate": 5.267361479919777e-07, + "loss": 1.4919211864471436, + "step": 4640 + }, + { + "epoch": 1.6898434655988352, + "grad_norm": 16.75, + "learning_rate": 5.264334179243372e-07, + "loss": 1.4847959280014038, + "step": 4642 + }, + { + "epoch": 1.6905715325809973, + "grad_norm": 7.625, + "learning_rate": 5.261307314634284e-07, + "loss": 1.2777965068817139, + "step": 4644 + }, + { + "epoch": 1.69129959956316, + "grad_norm": 5.6875, + "learning_rate": 5.258280887886693e-07, + "loss": 1.2693015336990356, + "step": 4646 + }, + { + "epoch": 1.6920276665453222, + "grad_norm": 9.4375, + "learning_rate": 5.255254900794522e-07, + "loss": 1.07283616065979, + "step": 4648 + }, + { + "epoch": 1.6927557335274845, + "grad_norm": 25.875, + "learning_rate": 5.252229355151426e-07, + "loss": 1.516840934753418, + "step": 4650 + }, + { + "epoch": 1.6934838005096469, + "grad_norm": 16.375, + "learning_rate": 5.249204252750806e-07, + "loss": 1.019869327545166, + "step": 4652 + }, + { + "epoch": 1.6942118674918092, + "grad_norm": 12.625, + "learning_rate": 5.246179595385798e-07, + "loss": 1.452100396156311, + "step": 4654 + }, + { + "epoch": 1.6949399344739717, + "grad_norm": 12.9375, + "learning_rate": 5.243155384849271e-07, + "loss": 1.4474271535873413, + "step": 4656 + }, + { + "epoch": 1.6956680014561338, + "grad_norm": 43.0, + "learning_rate": 5.240131622933832e-07, + "loss": 1.5042743682861328, + "step": 4658 + }, + { + "epoch": 1.6963960684382964, + "grad_norm": 17.625, + "learning_rate": 5.237108311431825e-07, + "loss": 1.5193098783493042, + "step": 4660 + }, + { + "epoch": 1.6971241354204587, + "grad_norm": 14.625, + "learning_rate": 5.234085452135319e-07, + "loss": 0.9320967197418213, + "step": 4662 + }, + { + "epoch": 1.697852202402621, + "grad_norm": 13.875, + "learning_rate": 5.23106304683612e-07, + "loss": 1.2734155654907227, + "step": 4664 + }, + { + "epoch": 1.6985802693847833, + "grad_norm": 10.5625, + "learning_rate": 5.228041097325769e-07, + "loss": 0.9612847566604614, + "step": 4666 + }, + { + "epoch": 1.6993083363669457, + "grad_norm": 13.0, + "learning_rate": 5.225019605395529e-07, + "loss": 0.9934066534042358, + "step": 4668 + }, + { + "epoch": 1.7000364033491082, + "grad_norm": 16.875, + "learning_rate": 5.221998572836394e-07, + "loss": 1.5868362188339233, + "step": 4670 + }, + { + "epoch": 1.7007644703312703, + "grad_norm": 24.375, + "learning_rate": 5.218978001439089e-07, + "loss": 1.2157166004180908, + "step": 4672 + }, + { + "epoch": 1.7014925373134329, + "grad_norm": 16.75, + "learning_rate": 5.21595789299406e-07, + "loss": 1.2606585025787354, + "step": 4674 + }, + { + "epoch": 1.7022206042955952, + "grad_norm": 35.25, + "learning_rate": 5.21293824929149e-07, + "loss": 1.5949749946594238, + "step": 4676 + }, + { + "epoch": 1.7029486712777575, + "grad_norm": 23.25, + "learning_rate": 5.20991907212127e-07, + "loss": 1.393242597579956, + "step": 4678 + }, + { + "epoch": 1.70367673825992, + "grad_norm": 19.125, + "learning_rate": 5.206900363273025e-07, + "loss": 1.5257446765899658, + "step": 4680 + }, + { + "epoch": 1.7044048052420822, + "grad_norm": 14.75, + "learning_rate": 5.203882124536105e-07, + "loss": 1.5988425016403198, + "step": 4682 + }, + { + "epoch": 1.7051328722242447, + "grad_norm": 10.0625, + "learning_rate": 5.200864357699572e-07, + "loss": 1.328212857246399, + "step": 4684 + }, + { + "epoch": 1.705860939206407, + "grad_norm": 24.5, + "learning_rate": 5.197847064552214e-07, + "loss": 1.4430851936340332, + "step": 4686 + }, + { + "epoch": 1.7065890061885693, + "grad_norm": 6.03125, + "learning_rate": 5.19483024688254e-07, + "loss": 1.3649506568908691, + "step": 4688 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 23.625, + "learning_rate": 5.191813906478771e-07, + "loss": 1.310099482536316, + "step": 4690 + }, + { + "epoch": 1.708045140152894, + "grad_norm": 19.375, + "learning_rate": 5.188798045128848e-07, + "loss": 1.7040886878967285, + "step": 4692 + }, + { + "epoch": 1.7087732071350565, + "grad_norm": 7.15625, + "learning_rate": 5.185782664620433e-07, + "loss": 1.2587907314300537, + "step": 4694 + }, + { + "epoch": 1.7095012741172186, + "grad_norm": 7.59375, + "learning_rate": 5.182767766740896e-07, + "loss": 1.2869244813919067, + "step": 4696 + }, + { + "epoch": 1.7102293410993812, + "grad_norm": 10.625, + "learning_rate": 5.17975335327732e-07, + "loss": 1.2847518920898438, + "step": 4698 + }, + { + "epoch": 1.7109574080815435, + "grad_norm": 8.5, + "learning_rate": 5.176739426016512e-07, + "loss": 0.9488272070884705, + "step": 4700 + }, + { + "epoch": 1.7116854750637058, + "grad_norm": 8.25, + "learning_rate": 5.173725986744976e-07, + "loss": 1.4156451225280762, + "step": 4702 + }, + { + "epoch": 1.7124135420458684, + "grad_norm": 10.875, + "learning_rate": 5.170713037248937e-07, + "loss": 1.3787081241607666, + "step": 4704 + }, + { + "epoch": 1.7131416090280305, + "grad_norm": 10.125, + "learning_rate": 5.167700579314328e-07, + "loss": 1.3259871006011963, + "step": 4706 + }, + { + "epoch": 1.713869676010193, + "grad_norm": 22.5, + "learning_rate": 5.164688614726787e-07, + "loss": 1.09104323387146, + "step": 4708 + }, + { + "epoch": 1.7145977429923553, + "grad_norm": 18.25, + "learning_rate": 5.161677145271661e-07, + "loss": 1.3859983682632446, + "step": 4710 + }, + { + "epoch": 1.7153258099745177, + "grad_norm": 15.1875, + "learning_rate": 5.158666172734006e-07, + "loss": 1.4055299758911133, + "step": 4712 + }, + { + "epoch": 1.71605387695668, + "grad_norm": 16.75, + "learning_rate": 5.15565569889858e-07, + "loss": 1.3274331092834473, + "step": 4714 + }, + { + "epoch": 1.7167819439388423, + "grad_norm": 10.5625, + "learning_rate": 5.152645725549851e-07, + "loss": 1.4287595748901367, + "step": 4716 + }, + { + "epoch": 1.7175100109210049, + "grad_norm": 5.40625, + "learning_rate": 5.149636254471983e-07, + "loss": 0.9363580942153931, + "step": 4718 + }, + { + "epoch": 1.718238077903167, + "grad_norm": 12.4375, + "learning_rate": 5.146627287448844e-07, + "loss": 1.1392850875854492, + "step": 4720 + }, + { + "epoch": 1.7189661448853295, + "grad_norm": 11.5, + "learning_rate": 5.143618826264009e-07, + "loss": 1.514790654182434, + "step": 4722 + }, + { + "epoch": 1.7196942118674918, + "grad_norm": 8.5, + "learning_rate": 5.140610872700748e-07, + "loss": 1.3532181978225708, + "step": 4724 + }, + { + "epoch": 1.7204222788496542, + "grad_norm": 9.9375, + "learning_rate": 5.13760342854203e-07, + "loss": 1.4628684520721436, + "step": 4726 + }, + { + "epoch": 1.7211503458318165, + "grad_norm": 11.5625, + "learning_rate": 5.134596495570525e-07, + "loss": 1.1716821193695068, + "step": 4728 + }, + { + "epoch": 1.7218784128139788, + "grad_norm": 16.75, + "learning_rate": 5.131590075568594e-07, + "loss": 1.3539010286331177, + "step": 4730 + }, + { + "epoch": 1.7226064797961413, + "grad_norm": 17.125, + "learning_rate": 5.128584170318304e-07, + "loss": 1.686438798904419, + "step": 4732 + }, + { + "epoch": 1.7233345467783034, + "grad_norm": 23.875, + "learning_rate": 5.12557878160141e-07, + "loss": 0.9991534352302551, + "step": 4734 + }, + { + "epoch": 1.724062613760466, + "grad_norm": 7.03125, + "learning_rate": 5.122573911199357e-07, + "loss": 1.2460020780563354, + "step": 4736 + }, + { + "epoch": 1.7247906807426283, + "grad_norm": 14.3125, + "learning_rate": 5.119569560893293e-07, + "loss": 1.4083077907562256, + "step": 4738 + }, + { + "epoch": 1.7255187477247906, + "grad_norm": 14.625, + "learning_rate": 5.116565732464051e-07, + "loss": 1.2464728355407715, + "step": 4740 + }, + { + "epoch": 1.7262468147069532, + "grad_norm": 24.25, + "learning_rate": 5.113562427692153e-07, + "loss": 1.587965726852417, + "step": 4742 + }, + { + "epoch": 1.7269748816891153, + "grad_norm": 10.3125, + "learning_rate": 5.110559648357817e-07, + "loss": 1.4550707340240479, + "step": 4744 + }, + { + "epoch": 1.7277029486712778, + "grad_norm": 14.0, + "learning_rate": 5.107557396240947e-07, + "loss": 1.5006968975067139, + "step": 4746 + }, + { + "epoch": 1.7284310156534402, + "grad_norm": 17.375, + "learning_rate": 5.10455567312113e-07, + "loss": 1.6203792095184326, + "step": 4748 + }, + { + "epoch": 1.7291590826356025, + "grad_norm": 10.8125, + "learning_rate": 5.101554480777646e-07, + "loss": 1.6275056600570679, + "step": 4750 + }, + { + "epoch": 1.7298871496177648, + "grad_norm": 10.4375, + "learning_rate": 5.098553820989455e-07, + "loss": 1.2600164413452148, + "step": 4752 + }, + { + "epoch": 1.7306152165999271, + "grad_norm": 11.5, + "learning_rate": 5.095553695535206e-07, + "loss": 1.5540000200271606, + "step": 4754 + }, + { + "epoch": 1.7313432835820897, + "grad_norm": 20.0, + "learning_rate": 5.09255410619323e-07, + "loss": 1.1363813877105713, + "step": 4756 + }, + { + "epoch": 1.7320713505642518, + "grad_norm": 12.875, + "learning_rate": 5.089555054741537e-07, + "loss": 1.2778738737106323, + "step": 4758 + }, + { + "epoch": 1.7327994175464143, + "grad_norm": 5.46875, + "learning_rate": 5.08655654295782e-07, + "loss": 1.5923006534576416, + "step": 4760 + }, + { + "epoch": 1.7335274845285766, + "grad_norm": 15.0, + "learning_rate": 5.083558572619455e-07, + "loss": 1.2394068241119385, + "step": 4762 + }, + { + "epoch": 1.734255551510739, + "grad_norm": 10.125, + "learning_rate": 5.080561145503492e-07, + "loss": 1.0730369091033936, + "step": 4764 + }, + { + "epoch": 1.7349836184929015, + "grad_norm": 13.0625, + "learning_rate": 5.077564263386663e-07, + "loss": 1.0098779201507568, + "step": 4766 + }, + { + "epoch": 1.7357116854750636, + "grad_norm": 10.625, + "learning_rate": 5.074567928045377e-07, + "loss": 1.4257750511169434, + "step": 4768 + }, + { + "epoch": 1.7364397524572261, + "grad_norm": 18.625, + "learning_rate": 5.071572141255714e-07, + "loss": 1.4562413692474365, + "step": 4770 + }, + { + "epoch": 1.7371678194393885, + "grad_norm": 20.0, + "learning_rate": 5.068576904793435e-07, + "loss": 1.4765502214431763, + "step": 4772 + }, + { + "epoch": 1.7378958864215508, + "grad_norm": 15.5625, + "learning_rate": 5.065582220433972e-07, + "loss": 1.3236093521118164, + "step": 4774 + }, + { + "epoch": 1.7386239534037131, + "grad_norm": 11.75, + "learning_rate": 5.06258808995243e-07, + "loss": 1.4899080991744995, + "step": 4776 + }, + { + "epoch": 1.7393520203858754, + "grad_norm": 20.25, + "learning_rate": 5.059594515123585e-07, + "loss": 1.5791923999786377, + "step": 4778 + }, + { + "epoch": 1.740080087368038, + "grad_norm": 10.5, + "learning_rate": 5.056601497721882e-07, + "loss": 1.5152193307876587, + "step": 4780 + }, + { + "epoch": 1.7408081543502, + "grad_norm": 14.3125, + "learning_rate": 5.053609039521442e-07, + "loss": 1.4691245555877686, + "step": 4782 + }, + { + "epoch": 1.7415362213323626, + "grad_norm": 15.6875, + "learning_rate": 5.050617142296047e-07, + "loss": 1.257453203201294, + "step": 4784 + }, + { + "epoch": 1.742264288314525, + "grad_norm": 12.25, + "learning_rate": 5.047625807819152e-07, + "loss": 1.5568325519561768, + "step": 4786 + }, + { + "epoch": 1.7429923552966873, + "grad_norm": 13.9375, + "learning_rate": 5.044635037863875e-07, + "loss": 1.507300853729248, + "step": 4788 + }, + { + "epoch": 1.7437204222788496, + "grad_norm": 17.0, + "learning_rate": 5.041644834202999e-07, + "loss": 0.9536606073379517, + "step": 4790 + }, + { + "epoch": 1.744448489261012, + "grad_norm": 11.5625, + "learning_rate": 5.038655198608976e-07, + "loss": 1.551884651184082, + "step": 4792 + }, + { + "epoch": 1.7451765562431745, + "grad_norm": 13.5625, + "learning_rate": 5.035666132853917e-07, + "loss": 1.7289868593215942, + "step": 4794 + }, + { + "epoch": 1.7459046232253366, + "grad_norm": 13.6875, + "learning_rate": 5.032677638709599e-07, + "loss": 1.4637770652770996, + "step": 4796 + }, + { + "epoch": 1.7466326902074991, + "grad_norm": 19.625, + "learning_rate": 5.029689717947452e-07, + "loss": 1.4414632320404053, + "step": 4798 + }, + { + "epoch": 1.7473607571896614, + "grad_norm": 11.4375, + "learning_rate": 5.026702372338573e-07, + "loss": 1.4497214555740356, + "step": 4800 + }, + { + "epoch": 1.7480888241718238, + "grad_norm": 9.5, + "learning_rate": 5.023715603653722e-07, + "loss": 1.1246516704559326, + "step": 4802 + }, + { + "epoch": 1.7488168911539863, + "grad_norm": 29.5, + "learning_rate": 5.020729413663307e-07, + "loss": 0.9471349120140076, + "step": 4804 + }, + { + "epoch": 1.7495449581361484, + "grad_norm": 25.375, + "learning_rate": 5.017743804137397e-07, + "loss": 1.5081110000610352, + "step": 4806 + }, + { + "epoch": 1.750273025118311, + "grad_norm": 17.75, + "learning_rate": 5.014758776845722e-07, + "loss": 1.4541873931884766, + "step": 4808 + }, + { + "epoch": 1.7510010921004733, + "grad_norm": 13.375, + "learning_rate": 5.011774333557662e-07, + "loss": 1.4689409732818604, + "step": 4810 + }, + { + "epoch": 1.7517291590826356, + "grad_norm": 11.375, + "learning_rate": 5.008790476042247e-07, + "loss": 1.5493974685668945, + "step": 4812 + }, + { + "epoch": 1.752457226064798, + "grad_norm": 8.6875, + "learning_rate": 5.005807206068168e-07, + "loss": 0.9573611617088318, + "step": 4814 + }, + { + "epoch": 1.7531852930469602, + "grad_norm": 11.375, + "learning_rate": 5.002824525403764e-07, + "loss": 1.4235085248947144, + "step": 4816 + }, + { + "epoch": 1.7539133600291228, + "grad_norm": 35.25, + "learning_rate": 4.999842435817021e-07, + "loss": 1.4572174549102783, + "step": 4818 + }, + { + "epoch": 1.754641427011285, + "grad_norm": 18.75, + "learning_rate": 4.996860939075584e-07, + "loss": 1.3580149412155151, + "step": 4820 + }, + { + "epoch": 1.7553694939934474, + "grad_norm": 12.875, + "learning_rate": 4.993880036946736e-07, + "loss": 1.234640121459961, + "step": 4822 + }, + { + "epoch": 1.7560975609756098, + "grad_norm": 16.75, + "learning_rate": 4.990899731197415e-07, + "loss": 1.4872140884399414, + "step": 4824 + }, + { + "epoch": 1.756825627957772, + "grad_norm": 9.5625, + "learning_rate": 4.987920023594204e-07, + "loss": 1.2963769435882568, + "step": 4826 + }, + { + "epoch": 1.7575536949399346, + "grad_norm": 15.4375, + "learning_rate": 4.984940915903327e-07, + "loss": 1.5830564498901367, + "step": 4828 + }, + { + "epoch": 1.7582817619220967, + "grad_norm": 11.0625, + "learning_rate": 4.981962409890657e-07, + "loss": 1.487833023071289, + "step": 4830 + }, + { + "epoch": 1.7590098289042593, + "grad_norm": 10.25, + "learning_rate": 4.978984507321712e-07, + "loss": 1.3125817775726318, + "step": 4832 + }, + { + "epoch": 1.7597378958864216, + "grad_norm": 10.0, + "learning_rate": 4.976007209961643e-07, + "loss": 1.4098962545394897, + "step": 4834 + }, + { + "epoch": 1.760465962868584, + "grad_norm": 12.6875, + "learning_rate": 4.973030519575255e-07, + "loss": 1.323737382888794, + "step": 4836 + }, + { + "epoch": 1.7611940298507462, + "grad_norm": 24.0, + "learning_rate": 4.970054437926984e-07, + "loss": 1.5644478797912598, + "step": 4838 + }, + { + "epoch": 1.7619220968329086, + "grad_norm": 12.4375, + "learning_rate": 4.967078966780908e-07, + "loss": 1.5193032026290894, + "step": 4840 + }, + { + "epoch": 1.7626501638150711, + "grad_norm": 11.3125, + "learning_rate": 4.964104107900746e-07, + "loss": 1.294032096862793, + "step": 4842 + }, + { + "epoch": 1.7633782307972332, + "grad_norm": 13.9375, + "learning_rate": 4.961129863049848e-07, + "loss": 1.2603144645690918, + "step": 4844 + }, + { + "epoch": 1.7641062977793958, + "grad_norm": 18.25, + "learning_rate": 4.958156233991203e-07, + "loss": 1.3641313314437866, + "step": 4846 + }, + { + "epoch": 1.764834364761558, + "grad_norm": 17.875, + "learning_rate": 4.955183222487438e-07, + "loss": 1.7334001064300537, + "step": 4848 + }, + { + "epoch": 1.7655624317437204, + "grad_norm": 15.875, + "learning_rate": 4.95221083030081e-07, + "loss": 1.4627082347869873, + "step": 4850 + }, + { + "epoch": 1.7662904987258827, + "grad_norm": 37.25, + "learning_rate": 4.949239059193209e-07, + "loss": 1.3436360359191895, + "step": 4852 + }, + { + "epoch": 1.767018565708045, + "grad_norm": 10.4375, + "learning_rate": 4.94626791092616e-07, + "loss": 1.3054277896881104, + "step": 4854 + }, + { + "epoch": 1.7677466326902076, + "grad_norm": 11.5625, + "learning_rate": 4.943297387260814e-07, + "loss": 1.1454042196273804, + "step": 4856 + }, + { + "epoch": 1.7684746996723697, + "grad_norm": 25.75, + "learning_rate": 4.940327489957958e-07, + "loss": 1.3904287815093994, + "step": 4858 + }, + { + "epoch": 1.7692027666545322, + "grad_norm": 7.46875, + "learning_rate": 4.937358220778004e-07, + "loss": 1.6580021381378174, + "step": 4860 + }, + { + "epoch": 1.7699308336366946, + "grad_norm": 24.375, + "learning_rate": 4.934389581480988e-07, + "loss": 1.4910988807678223, + "step": 4862 + }, + { + "epoch": 1.770658900618857, + "grad_norm": 19.75, + "learning_rate": 4.93142157382658e-07, + "loss": 0.8184197545051575, + "step": 4864 + }, + { + "epoch": 1.7713869676010194, + "grad_norm": 13.5, + "learning_rate": 4.928454199574072e-07, + "loss": 1.550306797027588, + "step": 4866 + }, + { + "epoch": 1.7721150345831815, + "grad_norm": 6.125, + "learning_rate": 4.92548746048238e-07, + "loss": 1.4107385873794556, + "step": 4868 + }, + { + "epoch": 1.772843101565344, + "grad_norm": 10.3125, + "learning_rate": 4.922521358310044e-07, + "loss": 1.2893753051757812, + "step": 4870 + }, + { + "epoch": 1.7735711685475064, + "grad_norm": 20.5, + "learning_rate": 4.919555894815227e-07, + "loss": 1.5938329696655273, + "step": 4872 + }, + { + "epoch": 1.7742992355296687, + "grad_norm": 17.5, + "learning_rate": 4.916591071755714e-07, + "loss": 1.5677850246429443, + "step": 4874 + }, + { + "epoch": 1.775027302511831, + "grad_norm": 23.0, + "learning_rate": 4.913626890888909e-07, + "loss": 1.4362049102783203, + "step": 4876 + }, + { + "epoch": 1.7757553694939934, + "grad_norm": 20.25, + "learning_rate": 4.910663353971836e-07, + "loss": 1.4486889839172363, + "step": 4878 + }, + { + "epoch": 1.776483436476156, + "grad_norm": 10.3125, + "learning_rate": 4.907700462761135e-07, + "loss": 1.6277176141738892, + "step": 4880 + }, + { + "epoch": 1.777211503458318, + "grad_norm": 15.8125, + "learning_rate": 4.904738219013069e-07, + "loss": 1.2896448373794556, + "step": 4882 + }, + { + "epoch": 1.7779395704404806, + "grad_norm": 18.75, + "learning_rate": 4.901776624483511e-07, + "loss": 1.3544483184814453, + "step": 4884 + }, + { + "epoch": 1.7786676374226429, + "grad_norm": 11.5, + "learning_rate": 4.898815680927953e-07, + "loss": 1.124427080154419, + "step": 4886 + }, + { + "epoch": 1.7793957044048052, + "grad_norm": 9.875, + "learning_rate": 4.8958553901015e-07, + "loss": 1.2407491207122803, + "step": 4888 + }, + { + "epoch": 1.7801237713869678, + "grad_norm": 28.25, + "learning_rate": 4.892895753758869e-07, + "loss": 1.5007063150405884, + "step": 4890 + }, + { + "epoch": 1.7808518383691299, + "grad_norm": 13.375, + "learning_rate": 4.889936773654391e-07, + "loss": 1.121708631515503, + "step": 4892 + }, + { + "epoch": 1.7815799053512924, + "grad_norm": 16.125, + "learning_rate": 4.886978451542008e-07, + "loss": 1.5245016813278198, + "step": 4894 + }, + { + "epoch": 1.7823079723334547, + "grad_norm": 4.59375, + "learning_rate": 4.884020789175267e-07, + "loss": 1.3298766613006592, + "step": 4896 + }, + { + "epoch": 1.783036039315617, + "grad_norm": 101.0, + "learning_rate": 4.881063788307332e-07, + "loss": 1.4852957725524902, + "step": 4898 + }, + { + "epoch": 1.7837641062977794, + "grad_norm": 13.375, + "learning_rate": 4.878107450690972e-07, + "loss": 1.48579740524292, + "step": 4900 + }, + { + "epoch": 1.7844921732799417, + "grad_norm": 17.25, + "learning_rate": 4.875151778078557e-07, + "loss": 1.3079392910003662, + "step": 4902 + }, + { + "epoch": 1.7852202402621042, + "grad_norm": 19.125, + "learning_rate": 4.87219677222207e-07, + "loss": 1.270232915878296, + "step": 4904 + }, + { + "epoch": 1.7859483072442663, + "grad_norm": 7.84375, + "learning_rate": 4.869242434873099e-07, + "loss": 1.2085566520690918, + "step": 4906 + }, + { + "epoch": 1.7866763742264289, + "grad_norm": 13.0625, + "learning_rate": 4.866288767782829e-07, + "loss": 1.5448709726333618, + "step": 4908 + }, + { + "epoch": 1.7874044412085912, + "grad_norm": 31.875, + "learning_rate": 4.863335772702053e-07, + "loss": 1.3731622695922852, + "step": 4910 + }, + { + "epoch": 1.7881325081907535, + "grad_norm": 17.875, + "learning_rate": 4.860383451381167e-07, + "loss": 1.188496708869934, + "step": 4912 + }, + { + "epoch": 1.7888605751729159, + "grad_norm": 13.625, + "learning_rate": 4.857431805570161e-07, + "loss": 0.9090881943702698, + "step": 4914 + }, + { + "epoch": 1.7895886421550782, + "grad_norm": 14.625, + "learning_rate": 4.85448083701863e-07, + "loss": 1.3924635648727417, + "step": 4916 + }, + { + "epoch": 1.7903167091372407, + "grad_norm": 13.9375, + "learning_rate": 4.851530547475769e-07, + "loss": 1.2613779306411743, + "step": 4918 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 30.25, + "learning_rate": 4.848580938690364e-07, + "loss": 1.0955307483673096, + "step": 4920 + }, + { + "epoch": 1.7917728431015654, + "grad_norm": 6.59375, + "learning_rate": 4.845632012410803e-07, + "loss": 1.3571293354034424, + "step": 4922 + }, + { + "epoch": 1.7925009100837277, + "grad_norm": 12.0625, + "learning_rate": 4.842683770385069e-07, + "loss": 1.4757554531097412, + "step": 4924 + }, + { + "epoch": 1.79322897706589, + "grad_norm": 13.0, + "learning_rate": 4.839736214360735e-07, + "loss": 1.4330881834030151, + "step": 4926 + }, + { + "epoch": 1.7939570440480526, + "grad_norm": 9.0, + "learning_rate": 4.836789346084973e-07, + "loss": 1.3936123847961426, + "step": 4928 + }, + { + "epoch": 1.7946851110302147, + "grad_norm": 17.25, + "learning_rate": 4.833843167304544e-07, + "loss": 1.323575735092163, + "step": 4930 + }, + { + "epoch": 1.7954131780123772, + "grad_norm": 8.125, + "learning_rate": 4.8308976797658e-07, + "loss": 1.3632922172546387, + "step": 4932 + }, + { + "epoch": 1.7961412449945395, + "grad_norm": 7.875, + "learning_rate": 4.827952885214686e-07, + "loss": 1.329136848449707, + "step": 4934 + }, + { + "epoch": 1.7968693119767019, + "grad_norm": 12.375, + "learning_rate": 4.825008785396733e-07, + "loss": 1.39815354347229, + "step": 4936 + }, + { + "epoch": 1.7975973789588642, + "grad_norm": 13.9375, + "learning_rate": 4.822065382057063e-07, + "loss": 1.476628303527832, + "step": 4938 + }, + { + "epoch": 1.7983254459410265, + "grad_norm": 11.5625, + "learning_rate": 4.819122676940386e-07, + "loss": 1.362483263015747, + "step": 4940 + }, + { + "epoch": 1.799053512923189, + "grad_norm": 9.25, + "learning_rate": 4.81618067179099e-07, + "loss": 1.338100552558899, + "step": 4942 + }, + { + "epoch": 1.7997815799053511, + "grad_norm": 12.6875, + "learning_rate": 4.81323936835276e-07, + "loss": 0.9342026710510254, + "step": 4944 + }, + { + "epoch": 1.8005096468875137, + "grad_norm": 8.875, + "learning_rate": 4.810298768369157e-07, + "loss": 1.3586630821228027, + "step": 4946 + }, + { + "epoch": 1.801237713869676, + "grad_norm": 16.625, + "learning_rate": 4.807358873583226e-07, + "loss": 1.3932251930236816, + "step": 4948 + }, + { + "epoch": 1.8019657808518383, + "grad_norm": 25.75, + "learning_rate": 4.804419685737598e-07, + "loss": 1.2967803478240967, + "step": 4950 + }, + { + "epoch": 1.8026938478340009, + "grad_norm": 12.9375, + "learning_rate": 4.80148120657448e-07, + "loss": 1.2882816791534424, + "step": 4952 + }, + { + "epoch": 1.803421914816163, + "grad_norm": 45.0, + "learning_rate": 4.798543437835662e-07, + "loss": 0.8463873267173767, + "step": 4954 + }, + { + "epoch": 1.8041499817983255, + "grad_norm": 8.4375, + "learning_rate": 4.795606381262512e-07, + "loss": 1.1847871541976929, + "step": 4956 + }, + { + "epoch": 1.8048780487804879, + "grad_norm": 10.9375, + "learning_rate": 4.792670038595976e-07, + "loss": 1.1329805850982666, + "step": 4958 + }, + { + "epoch": 1.8056061157626502, + "grad_norm": 9.5625, + "learning_rate": 4.789734411576575e-07, + "loss": 1.3902671337127686, + "step": 4960 + }, + { + "epoch": 1.8063341827448125, + "grad_norm": 19.125, + "learning_rate": 4.786799501944409e-07, + "loss": 1.507314682006836, + "step": 4962 + }, + { + "epoch": 1.8070622497269748, + "grad_norm": 14.9375, + "learning_rate": 4.783865311439155e-07, + "loss": 1.2486763000488281, + "step": 4964 + }, + { + "epoch": 1.8077903167091374, + "grad_norm": 11.0, + "learning_rate": 4.780931841800052e-07, + "loss": 1.3369146585464478, + "step": 4966 + }, + { + "epoch": 1.8085183836912995, + "grad_norm": 14.5625, + "learning_rate": 4.777999094765927e-07, + "loss": 1.3120849132537842, + "step": 4968 + }, + { + "epoch": 1.809246450673462, + "grad_norm": 5.875, + "learning_rate": 4.775067072075166e-07, + "loss": 1.4481295347213745, + "step": 4970 + }, + { + "epoch": 1.8099745176556243, + "grad_norm": 4.78125, + "learning_rate": 4.772135775465735e-07, + "loss": 1.2433547973632812, + "step": 4972 + }, + { + "epoch": 1.8107025846377867, + "grad_norm": 10.375, + "learning_rate": 4.769205206675164e-07, + "loss": 0.8423742055892944, + "step": 4974 + }, + { + "epoch": 1.811430651619949, + "grad_norm": 18.0, + "learning_rate": 4.7662753674405524e-07, + "loss": 1.1339185237884521, + "step": 4976 + }, + { + "epoch": 1.8121587186021113, + "grad_norm": 9.625, + "learning_rate": 4.7633462594985695e-07, + "loss": 1.940333604812622, + "step": 4978 + }, + { + "epoch": 1.8128867855842739, + "grad_norm": 11.8125, + "learning_rate": 4.7604178845854504e-07, + "loss": 1.7707788944244385, + "step": 4980 + }, + { + "epoch": 1.813614852566436, + "grad_norm": 15.1875, + "learning_rate": 4.757490244436991e-07, + "loss": 1.4298672676086426, + "step": 4982 + }, + { + "epoch": 1.8143429195485985, + "grad_norm": 23.875, + "learning_rate": 4.7545633407885595e-07, + "loss": 1.5174107551574707, + "step": 4984 + }, + { + "epoch": 1.8150709865307608, + "grad_norm": 17.125, + "learning_rate": 4.7516371753750827e-07, + "loss": 1.2245168685913086, + "step": 4986 + }, + { + "epoch": 1.8157990535129231, + "grad_norm": 12.8125, + "learning_rate": 4.7487117499310493e-07, + "loss": 1.5885469913482666, + "step": 4988 + }, + { + "epoch": 1.8165271204950857, + "grad_norm": 12.25, + "learning_rate": 4.7457870661905125e-07, + "loss": 1.5243793725967407, + "step": 4990 + }, + { + "epoch": 1.8172551874772478, + "grad_norm": 16.375, + "learning_rate": 4.742863125887084e-07, + "loss": 0.8776862621307373, + "step": 4992 + }, + { + "epoch": 1.8179832544594103, + "grad_norm": 14.375, + "learning_rate": 4.7399399307539335e-07, + "loss": 1.5170053243637085, + "step": 4994 + }, + { + "epoch": 1.8187113214415727, + "grad_norm": 14.5625, + "learning_rate": 4.737017482523791e-07, + "loss": 1.6693642139434814, + "step": 4996 + }, + { + "epoch": 1.819439388423735, + "grad_norm": 10.8125, + "learning_rate": 4.7340957829289447e-07, + "loss": 1.4614213705062866, + "step": 4998 + }, + { + "epoch": 1.8201674554058973, + "grad_norm": 12.5625, + "learning_rate": 4.731174833701236e-07, + "loss": 1.2518202066421509, + "step": 5000 + }, + { + "epoch": 1.8208955223880596, + "grad_norm": 9.6875, + "learning_rate": 4.7282546365720634e-07, + "loss": 1.0787901878356934, + "step": 5002 + }, + { + "epoch": 1.8216235893702222, + "grad_norm": 12.5625, + "learning_rate": 4.7253351932723807e-07, + "loss": 1.3764450550079346, + "step": 5004 + }, + { + "epoch": 1.8223516563523843, + "grad_norm": 13.4375, + "learning_rate": 4.722416505532692e-07, + "loss": 1.3568028211593628, + "step": 5006 + }, + { + "epoch": 1.8230797233345468, + "grad_norm": 12.5, + "learning_rate": 4.7194985750830563e-07, + "loss": 0.8882184028625488, + "step": 5008 + }, + { + "epoch": 1.8238077903167091, + "grad_norm": 7.3125, + "learning_rate": 4.7165814036530806e-07, + "loss": 1.2430992126464844, + "step": 5010 + }, + { + "epoch": 1.8245358572988715, + "grad_norm": 19.875, + "learning_rate": 4.713664992971925e-07, + "loss": 1.5687246322631836, + "step": 5012 + }, + { + "epoch": 1.825263924281034, + "grad_norm": 12.625, + "learning_rate": 4.7107493447682973e-07, + "loss": 1.2302193641662598, + "step": 5014 + }, + { + "epoch": 1.8259919912631961, + "grad_norm": 17.25, + "learning_rate": 4.707834460770453e-07, + "loss": 1.2846306562423706, + "step": 5016 + }, + { + "epoch": 1.8267200582453587, + "grad_norm": 9.25, + "learning_rate": 4.7049203427061956e-07, + "loss": 1.4958398342132568, + "step": 5018 + }, + { + "epoch": 1.827448125227521, + "grad_norm": 12.625, + "learning_rate": 4.7020069923028757e-07, + "loss": 1.2902781963348389, + "step": 5020 + }, + { + "epoch": 1.8281761922096833, + "grad_norm": 6.4375, + "learning_rate": 4.6990944112873846e-07, + "loss": 1.0420807600021362, + "step": 5022 + }, + { + "epoch": 1.8289042591918456, + "grad_norm": 19.25, + "learning_rate": 4.6961826013861613e-07, + "loss": 1.4692292213439941, + "step": 5024 + }, + { + "epoch": 1.829632326174008, + "grad_norm": 11.125, + "learning_rate": 4.693271564325188e-07, + "loss": 1.4218440055847168, + "step": 5026 + }, + { + "epoch": 1.8303603931561705, + "grad_norm": 55.5, + "learning_rate": 4.6903613018299857e-07, + "loss": 1.4618947505950928, + "step": 5028 + }, + { + "epoch": 1.8310884601383326, + "grad_norm": 18.25, + "learning_rate": 4.687451815625618e-07, + "loss": 1.13822603225708, + "step": 5030 + }, + { + "epoch": 1.8318165271204951, + "grad_norm": 29.75, + "learning_rate": 4.6845431074366905e-07, + "loss": 1.5297691822052002, + "step": 5032 + }, + { + "epoch": 1.8325445941026575, + "grad_norm": 15.875, + "learning_rate": 4.6816351789873433e-07, + "loss": 1.4241657257080078, + "step": 5034 + }, + { + "epoch": 1.8332726610848198, + "grad_norm": 10.0, + "learning_rate": 4.6787280320012564e-07, + "loss": 1.2630621194839478, + "step": 5036 + }, + { + "epoch": 1.8340007280669821, + "grad_norm": 15.5625, + "learning_rate": 4.6758216682016493e-07, + "loss": 1.663926362991333, + "step": 5038 + }, + { + "epoch": 1.8347287950491444, + "grad_norm": 20.0, + "learning_rate": 4.6729160893112723e-07, + "loss": 1.3994669914245605, + "step": 5040 + }, + { + "epoch": 1.835456862031307, + "grad_norm": 13.5625, + "learning_rate": 4.670011297052413e-07, + "loss": 1.8440449237823486, + "step": 5042 + }, + { + "epoch": 1.836184929013469, + "grad_norm": 17.75, + "learning_rate": 4.6671072931468944e-07, + "loss": 1.0558710098266602, + "step": 5044 + }, + { + "epoch": 1.8369129959956316, + "grad_norm": 16.125, + "learning_rate": 4.6642040793160676e-07, + "loss": 1.1933045387268066, + "step": 5046 + }, + { + "epoch": 1.837641062977794, + "grad_norm": 24.125, + "learning_rate": 4.66130165728082e-07, + "loss": 2.015310287475586, + "step": 5048 + }, + { + "epoch": 1.8383691299599563, + "grad_norm": 21.375, + "learning_rate": 4.6584000287615685e-07, + "loss": 1.4095380306243896, + "step": 5050 + }, + { + "epoch": 1.8390971969421188, + "grad_norm": 20.875, + "learning_rate": 4.655499195478256e-07, + "loss": 1.4578955173492432, + "step": 5052 + }, + { + "epoch": 1.839825263924281, + "grad_norm": 9.1875, + "learning_rate": 4.652599159150361e-07, + "loss": 1.359964370727539, + "step": 5054 + }, + { + "epoch": 1.8405533309064435, + "grad_norm": 16.375, + "learning_rate": 4.649699921496882e-07, + "loss": 1.1143771409988403, + "step": 5056 + }, + { + "epoch": 1.8412813978886058, + "grad_norm": 12.375, + "learning_rate": 4.646801484236348e-07, + "loss": 1.6194716691970825, + "step": 5058 + }, + { + "epoch": 1.842009464870768, + "grad_norm": 14.0, + "learning_rate": 4.643903849086816e-07, + "loss": 1.5600671768188477, + "step": 5060 + }, + { + "epoch": 1.8427375318529304, + "grad_norm": 70.5, + "learning_rate": 4.641007017765862e-07, + "loss": 1.6275317668914795, + "step": 5062 + }, + { + "epoch": 1.8434655988350928, + "grad_norm": 15.875, + "learning_rate": 4.638110991990589e-07, + "loss": 1.7146220207214355, + "step": 5064 + }, + { + "epoch": 1.8441936658172553, + "grad_norm": 12.625, + "learning_rate": 4.635215773477623e-07, + "loss": 1.1357980966567993, + "step": 5066 + }, + { + "epoch": 1.8449217327994174, + "grad_norm": 10.25, + "learning_rate": 4.632321363943108e-07, + "loss": 1.2891511917114258, + "step": 5068 + }, + { + "epoch": 1.84564979978158, + "grad_norm": 21.25, + "learning_rate": 4.629427765102713e-07, + "loss": 1.5129547119140625, + "step": 5070 + }, + { + "epoch": 1.8463778667637423, + "grad_norm": 18.75, + "learning_rate": 4.626534978671624e-07, + "loss": 1.088972568511963, + "step": 5072 + }, + { + "epoch": 1.8471059337459046, + "grad_norm": 15.5, + "learning_rate": 4.6236430063645436e-07, + "loss": 1.3288090229034424, + "step": 5074 + }, + { + "epoch": 1.8478340007280671, + "grad_norm": 23.25, + "learning_rate": 4.620751849895696e-07, + "loss": 1.344456434249878, + "step": 5076 + }, + { + "epoch": 1.8485620677102292, + "grad_norm": 17.5, + "learning_rate": 4.61786151097882e-07, + "loss": 1.63629949092865, + "step": 5078 + }, + { + "epoch": 1.8492901346923918, + "grad_norm": 5.59375, + "learning_rate": 4.614971991327166e-07, + "loss": 0.8924068212509155, + "step": 5080 + }, + { + "epoch": 1.850018201674554, + "grad_norm": 17.75, + "learning_rate": 4.6120832926535066e-07, + "loss": 1.6737096309661865, + "step": 5082 + }, + { + "epoch": 1.8507462686567164, + "grad_norm": 16.0, + "learning_rate": 4.609195416670122e-07, + "loss": 1.4915194511413574, + "step": 5084 + }, + { + "epoch": 1.8514743356388788, + "grad_norm": 18.5, + "learning_rate": 4.606308365088805e-07, + "loss": 1.5298212766647339, + "step": 5086 + }, + { + "epoch": 1.852202402621041, + "grad_norm": 18.25, + "learning_rate": 4.6034221396208616e-07, + "loss": 1.381276249885559, + "step": 5088 + }, + { + "epoch": 1.8529304696032036, + "grad_norm": 5.03125, + "learning_rate": 4.6005367419771086e-07, + "loss": 1.2281761169433594, + "step": 5090 + }, + { + "epoch": 1.8536585365853657, + "grad_norm": 23.75, + "learning_rate": 4.5976521738678686e-07, + "loss": 1.2572712898254395, + "step": 5092 + }, + { + "epoch": 1.8543866035675283, + "grad_norm": 34.0, + "learning_rate": 4.5947684370029785e-07, + "loss": 1.8405920267105103, + "step": 5094 + }, + { + "epoch": 1.8551146705496906, + "grad_norm": 17.0, + "learning_rate": 4.591885533091773e-07, + "loss": 1.5841753482818604, + "step": 5096 + }, + { + "epoch": 1.855842737531853, + "grad_norm": 15.1875, + "learning_rate": 4.589003463843105e-07, + "loss": 1.439342975616455, + "step": 5098 + }, + { + "epoch": 1.8565708045140152, + "grad_norm": 6.59375, + "learning_rate": 4.5861222309653234e-07, + "loss": 1.214406967163086, + "step": 5100 + }, + { + "epoch": 1.8572988714961776, + "grad_norm": 17.125, + "learning_rate": 4.583241836166285e-07, + "loss": 1.4181333780288696, + "step": 5102 + }, + { + "epoch": 1.85802693847834, + "grad_norm": 11.5, + "learning_rate": 4.5803622811533494e-07, + "loss": 0.6286987662315369, + "step": 5104 + }, + { + "epoch": 1.8587550054605022, + "grad_norm": 25.125, + "learning_rate": 4.577483567633379e-07, + "loss": 1.5129811763763428, + "step": 5106 + }, + { + "epoch": 1.8594830724426648, + "grad_norm": 17.375, + "learning_rate": 4.574605697312735e-07, + "loss": 1.9072209596633911, + "step": 5108 + }, + { + "epoch": 1.860211139424827, + "grad_norm": 12.625, + "learning_rate": 4.5717286718972816e-07, + "loss": 1.461531400680542, + "step": 5110 + }, + { + "epoch": 1.8609392064069894, + "grad_norm": 8.0625, + "learning_rate": 4.5688524930923816e-07, + "loss": 1.3169548511505127, + "step": 5112 + }, + { + "epoch": 1.861667273389152, + "grad_norm": 13.1875, + "learning_rate": 4.5659771626028944e-07, + "loss": 1.479888916015625, + "step": 5114 + }, + { + "epoch": 1.862395340371314, + "grad_norm": 13.125, + "learning_rate": 4.5631026821331774e-07, + "loss": 1.362253189086914, + "step": 5116 + }, + { + "epoch": 1.8631234073534766, + "grad_norm": 39.25, + "learning_rate": 4.560229053387087e-07, + "loss": 0.8538451194763184, + "step": 5118 + }, + { + "epoch": 1.863851474335639, + "grad_norm": 18.5, + "learning_rate": 4.557356278067969e-07, + "loss": 1.5633865594863892, + "step": 5120 + }, + { + "epoch": 1.8645795413178012, + "grad_norm": 26.5, + "learning_rate": 4.5544843578786675e-07, + "loss": 1.3199468851089478, + "step": 5122 + }, + { + "epoch": 1.8653076082999636, + "grad_norm": 9.5, + "learning_rate": 4.5516132945215197e-07, + "loss": 1.555328369140625, + "step": 5124 + }, + { + "epoch": 1.8660356752821259, + "grad_norm": 9.1875, + "learning_rate": 4.5487430896983526e-07, + "loss": 1.2959411144256592, + "step": 5126 + }, + { + "epoch": 1.8667637422642884, + "grad_norm": 16.625, + "learning_rate": 4.5458737451104854e-07, + "loss": 1.385221242904663, + "step": 5128 + }, + { + "epoch": 1.8674918092464505, + "grad_norm": 13.5625, + "learning_rate": 4.54300526245873e-07, + "loss": 1.4149985313415527, + "step": 5130 + }, + { + "epoch": 1.868219876228613, + "grad_norm": 10.0625, + "learning_rate": 4.540137643443382e-07, + "loss": 1.5066630840301514, + "step": 5132 + }, + { + "epoch": 1.8689479432107754, + "grad_norm": 12.5625, + "learning_rate": 4.537270889764229e-07, + "loss": 1.3174116611480713, + "step": 5134 + }, + { + "epoch": 1.8696760101929377, + "grad_norm": 25.5, + "learning_rate": 4.534405003120545e-07, + "loss": 1.2631373405456543, + "step": 5136 + }, + { + "epoch": 1.8704040771751003, + "grad_norm": 16.125, + "learning_rate": 4.53153998521109e-07, + "loss": 1.2825340032577515, + "step": 5138 + }, + { + "epoch": 1.8711321441572624, + "grad_norm": 10.9375, + "learning_rate": 4.5286758377341093e-07, + "loss": 1.4608080387115479, + "step": 5140 + }, + { + "epoch": 1.871860211139425, + "grad_norm": 7.25, + "learning_rate": 4.525812562387329e-07, + "loss": 1.3504549264907837, + "step": 5142 + }, + { + "epoch": 1.8725882781215872, + "grad_norm": 15.4375, + "learning_rate": 4.5229501608679633e-07, + "loss": 1.4331270456314087, + "step": 5144 + }, + { + "epoch": 1.8733163451037496, + "grad_norm": 26.875, + "learning_rate": 4.520088634872707e-07, + "loss": 1.6306771039962769, + "step": 5146 + }, + { + "epoch": 1.8740444120859119, + "grad_norm": 12.9375, + "learning_rate": 4.5172279860977336e-07, + "loss": 1.7214951515197754, + "step": 5148 + }, + { + "epoch": 1.8747724790680742, + "grad_norm": 10.9375, + "learning_rate": 4.5143682162386976e-07, + "loss": 1.3194423913955688, + "step": 5150 + }, + { + "epoch": 1.8755005460502367, + "grad_norm": 12.625, + "learning_rate": 4.511509326990735e-07, + "loss": 1.0496890544891357, + "step": 5152 + }, + { + "epoch": 1.8762286130323989, + "grad_norm": 10.0, + "learning_rate": 4.5086513200484565e-07, + "loss": 1.3700191974639893, + "step": 5154 + }, + { + "epoch": 1.8769566800145614, + "grad_norm": 37.25, + "learning_rate": 4.5057941971059506e-07, + "loss": 1.5885138511657715, + "step": 5156 + }, + { + "epoch": 1.8776847469967237, + "grad_norm": 16.5, + "learning_rate": 4.5029379598567847e-07, + "loss": 1.7978395223617554, + "step": 5158 + }, + { + "epoch": 1.878412813978886, + "grad_norm": 22.25, + "learning_rate": 4.5000826099939965e-07, + "loss": 1.0766664743423462, + "step": 5160 + }, + { + "epoch": 1.8791408809610484, + "grad_norm": 24.75, + "learning_rate": 4.4972281492101e-07, + "loss": 1.61814284324646, + "step": 5162 + }, + { + "epoch": 1.8798689479432107, + "grad_norm": 27.125, + "learning_rate": 4.4943745791970845e-07, + "loss": 1.1292535066604614, + "step": 5164 + }, + { + "epoch": 1.8805970149253732, + "grad_norm": 9.75, + "learning_rate": 4.491521901646407e-07, + "loss": 0.9990778565406799, + "step": 5166 + }, + { + "epoch": 1.8813250819075353, + "grad_norm": 14.0, + "learning_rate": 4.4886701182489985e-07, + "loss": 1.3372540473937988, + "step": 5168 + }, + { + "epoch": 1.8820531488896979, + "grad_norm": 10.25, + "learning_rate": 4.4858192306952594e-07, + "loss": 1.0960527658462524, + "step": 5170 + }, + { + "epoch": 1.8827812158718602, + "grad_norm": 8.5, + "learning_rate": 4.4829692406750574e-07, + "loss": 1.0105725526809692, + "step": 5172 + }, + { + "epoch": 1.8835092828540225, + "grad_norm": 13.1875, + "learning_rate": 4.480120149877731e-07, + "loss": 1.161435842514038, + "step": 5174 + }, + { + "epoch": 1.884237349836185, + "grad_norm": 19.125, + "learning_rate": 4.477271959992084e-07, + "loss": 1.614087700843811, + "step": 5176 + }, + { + "epoch": 1.8849654168183472, + "grad_norm": 14.0625, + "learning_rate": 4.474424672706385e-07, + "loss": 1.87417733669281, + "step": 5178 + }, + { + "epoch": 1.8856934838005097, + "grad_norm": 61.25, + "learning_rate": 4.4715782897083703e-07, + "loss": 1.0084642171859741, + "step": 5180 + }, + { + "epoch": 1.886421550782672, + "grad_norm": 11.0, + "learning_rate": 4.4687328126852386e-07, + "loss": 0.9533799290657043, + "step": 5182 + }, + { + "epoch": 1.8871496177648344, + "grad_norm": 11.9375, + "learning_rate": 4.46588824332365e-07, + "loss": 1.384581446647644, + "step": 5184 + }, + { + "epoch": 1.8878776847469967, + "grad_norm": 22.625, + "learning_rate": 4.463044583309732e-07, + "loss": 1.5321415662765503, + "step": 5186 + }, + { + "epoch": 1.888605751729159, + "grad_norm": 19.0, + "learning_rate": 4.460201834329066e-07, + "loss": 1.216037631034851, + "step": 5188 + }, + { + "epoch": 1.8893338187113216, + "grad_norm": 9.6875, + "learning_rate": 4.4573599980667e-07, + "loss": 1.317460298538208, + "step": 5190 + }, + { + "epoch": 1.8900618856934837, + "grad_norm": 21.375, + "learning_rate": 4.454519076207136e-07, + "loss": 1.304356336593628, + "step": 5192 + }, + { + "epoch": 1.8907899526756462, + "grad_norm": 32.0, + "learning_rate": 4.4516790704343356e-07, + "loss": 1.4246207475662231, + "step": 5194 + }, + { + "epoch": 1.8915180196578085, + "grad_norm": 29.5, + "learning_rate": 4.448839982431718e-07, + "loss": 0.6813961267471313, + "step": 5196 + }, + { + "epoch": 1.8922460866399708, + "grad_norm": 12.25, + "learning_rate": 4.4460018138821597e-07, + "loss": 1.105685830116272, + "step": 5198 + }, + { + "epoch": 1.8929741536221334, + "grad_norm": 31.0, + "learning_rate": 4.4431645664679865e-07, + "loss": 1.1678590774536133, + "step": 5200 + }, + { + "epoch": 1.8937022206042955, + "grad_norm": 17.625, + "learning_rate": 4.4403282418709863e-07, + "loss": 1.501304030418396, + "step": 5202 + }, + { + "epoch": 1.894430287586458, + "grad_norm": 19.875, + "learning_rate": 4.4374928417723956e-07, + "loss": 1.3576005697250366, + "step": 5204 + }, + { + "epoch": 1.8951583545686204, + "grad_norm": 8.875, + "learning_rate": 4.4346583678529e-07, + "loss": 1.2684423923492432, + "step": 5206 + }, + { + "epoch": 1.8958864215507827, + "grad_norm": 19.5, + "learning_rate": 4.431824821792642e-07, + "loss": 1.6708173751831055, + "step": 5208 + }, + { + "epoch": 1.896614488532945, + "grad_norm": 18.5, + "learning_rate": 4.428992205271211e-07, + "loss": 1.297919511795044, + "step": 5210 + }, + { + "epoch": 1.8973425555151073, + "grad_norm": 16.125, + "learning_rate": 4.4261605199676444e-07, + "loss": 1.5115559101104736, + "step": 5212 + }, + { + "epoch": 1.8980706224972699, + "grad_norm": 14.75, + "learning_rate": 4.4233297675604304e-07, + "loss": 1.4550561904907227, + "step": 5214 + }, + { + "epoch": 1.898798689479432, + "grad_norm": 15.625, + "learning_rate": 4.420499949727504e-07, + "loss": 1.1901280879974365, + "step": 5216 + }, + { + "epoch": 1.8995267564615945, + "grad_norm": 16.125, + "learning_rate": 4.417671068146243e-07, + "loss": 1.7047991752624512, + "step": 5218 + }, + { + "epoch": 1.9002548234437568, + "grad_norm": 4.03125, + "learning_rate": 4.4148431244934746e-07, + "loss": 1.1716625690460205, + "step": 5220 + }, + { + "epoch": 1.9009828904259192, + "grad_norm": 17.5, + "learning_rate": 4.4120161204454645e-07, + "loss": 1.3201984167099, + "step": 5222 + }, + { + "epoch": 1.9017109574080815, + "grad_norm": 9.5, + "learning_rate": 4.4091900576779275e-07, + "loss": 1.4086216688156128, + "step": 5224 + }, + { + "epoch": 1.9024390243902438, + "grad_norm": 12.0625, + "learning_rate": 4.406364937866017e-07, + "loss": 1.2256436347961426, + "step": 5226 + }, + { + "epoch": 1.9031670913724064, + "grad_norm": 10.4375, + "learning_rate": 4.403540762684327e-07, + "loss": 1.3549785614013672, + "step": 5228 + }, + { + "epoch": 1.9038951583545685, + "grad_norm": 13.0625, + "learning_rate": 4.4007175338068943e-07, + "loss": 1.318088412284851, + "step": 5230 + }, + { + "epoch": 1.904623225336731, + "grad_norm": 13.4375, + "learning_rate": 4.3978952529071936e-07, + "loss": 1.0730161666870117, + "step": 5232 + }, + { + "epoch": 1.9053512923188933, + "grad_norm": 14.9375, + "learning_rate": 4.395073921658134e-07, + "loss": 1.5360090732574463, + "step": 5234 + }, + { + "epoch": 1.9060793593010557, + "grad_norm": 11.5, + "learning_rate": 4.3922535417320673e-07, + "loss": 1.4272119998931885, + "step": 5236 + }, + { + "epoch": 1.9068074262832182, + "grad_norm": 12.5625, + "learning_rate": 4.389434114800781e-07, + "loss": 1.292188048362732, + "step": 5238 + }, + { + "epoch": 1.9075354932653803, + "grad_norm": 19.5, + "learning_rate": 4.3866156425354935e-07, + "loss": 1.5705926418304443, + "step": 5240 + }, + { + "epoch": 1.9082635602475428, + "grad_norm": 14.8125, + "learning_rate": 4.383798126606858e-07, + "loss": 1.597636342048645, + "step": 5242 + }, + { + "epoch": 1.9089916272297052, + "grad_norm": 17.375, + "learning_rate": 4.3809815686849673e-07, + "loss": 1.6855599880218506, + "step": 5244 + }, + { + "epoch": 1.9097196942118675, + "grad_norm": 14.8125, + "learning_rate": 4.3781659704393367e-07, + "loss": 1.123718023300171, + "step": 5246 + }, + { + "epoch": 1.9104477611940298, + "grad_norm": 15.75, + "learning_rate": 4.3753513335389186e-07, + "loss": 0.755746603012085, + "step": 5248 + }, + { + "epoch": 1.9111758281761921, + "grad_norm": 16.0, + "learning_rate": 4.372537659652097e-07, + "loss": 1.6585173606872559, + "step": 5250 + }, + { + "epoch": 1.9119038951583547, + "grad_norm": 11.125, + "learning_rate": 4.3697249504466794e-07, + "loss": 1.3326367139816284, + "step": 5252 + }, + { + "epoch": 1.9126319621405168, + "grad_norm": 11.5, + "learning_rate": 4.366913207589905e-07, + "loss": 1.2440919876098633, + "step": 5254 + }, + { + "epoch": 1.9133600291226793, + "grad_norm": 14.75, + "learning_rate": 4.364102432748442e-07, + "loss": 1.2502678632736206, + "step": 5256 + }, + { + "epoch": 1.9140880961048417, + "grad_norm": 10.25, + "learning_rate": 4.3612926275883797e-07, + "loss": 1.4713858366012573, + "step": 5258 + }, + { + "epoch": 1.914816163087004, + "grad_norm": 7.6875, + "learning_rate": 4.358483793775237e-07, + "loss": 1.243448257446289, + "step": 5260 + }, + { + "epoch": 1.9155442300691665, + "grad_norm": 22.0, + "learning_rate": 4.355675932973957e-07, + "loss": 1.5587774515151978, + "step": 5262 + }, + { + "epoch": 1.9162722970513286, + "grad_norm": 12.75, + "learning_rate": 4.3528690468489e-07, + "loss": 1.5684515237808228, + "step": 5264 + }, + { + "epoch": 1.9170003640334912, + "grad_norm": 13.5, + "learning_rate": 4.350063137063857e-07, + "loss": 1.4960618019104004, + "step": 5266 + }, + { + "epoch": 1.9177284310156535, + "grad_norm": 12.9375, + "learning_rate": 4.347258205282036e-07, + "loss": 1.362812876701355, + "step": 5268 + }, + { + "epoch": 1.9184564979978158, + "grad_norm": 16.125, + "learning_rate": 4.344454253166063e-07, + "loss": 1.4091293811798096, + "step": 5270 + }, + { + "epoch": 1.9191845649799781, + "grad_norm": 3.484375, + "learning_rate": 4.34165128237799e-07, + "loss": 1.0174040794372559, + "step": 5272 + }, + { + "epoch": 1.9199126319621405, + "grad_norm": 8.3125, + "learning_rate": 4.33884929457928e-07, + "loss": 1.2516772747039795, + "step": 5274 + }, + { + "epoch": 1.920640698944303, + "grad_norm": 14.5, + "learning_rate": 4.336048291430817e-07, + "loss": 1.2578668594360352, + "step": 5276 + }, + { + "epoch": 1.921368765926465, + "grad_norm": 3.90625, + "learning_rate": 4.333248274592904e-07, + "loss": 1.2186925411224365, + "step": 5278 + }, + { + "epoch": 1.9220968329086277, + "grad_norm": 12.5625, + "learning_rate": 4.3304492457252517e-07, + "loss": 1.2952587604522705, + "step": 5280 + }, + { + "epoch": 1.92282489989079, + "grad_norm": 13.9375, + "learning_rate": 4.3276512064869935e-07, + "loss": 1.5164308547973633, + "step": 5282 + }, + { + "epoch": 1.9235529668729523, + "grad_norm": 13.4375, + "learning_rate": 4.3248541585366714e-07, + "loss": 0.9335699081420898, + "step": 5284 + }, + { + "epoch": 1.9242810338551146, + "grad_norm": 11.8125, + "learning_rate": 4.322058103532239e-07, + "loss": 0.9154990315437317, + "step": 5286 + }, + { + "epoch": 1.925009100837277, + "grad_norm": 9.0625, + "learning_rate": 4.319263043131065e-07, + "loss": 1.3241159915924072, + "step": 5288 + }, + { + "epoch": 1.9257371678194395, + "grad_norm": 17.75, + "learning_rate": 4.316468978989929e-07, + "loss": 1.077293038368225, + "step": 5290 + }, + { + "epoch": 1.9264652348016016, + "grad_norm": 6.59375, + "learning_rate": 4.3136759127650123e-07, + "loss": 1.3208149671554565, + "step": 5292 + }, + { + "epoch": 1.9271933017837641, + "grad_norm": 9.8125, + "learning_rate": 4.3108838461119144e-07, + "loss": 1.8432893753051758, + "step": 5294 + }, + { + "epoch": 1.9279213687659265, + "grad_norm": 14.75, + "learning_rate": 4.3080927806856383e-07, + "loss": 1.269153118133545, + "step": 5296 + }, + { + "epoch": 1.9286494357480888, + "grad_norm": 5.65625, + "learning_rate": 4.305302718140593e-07, + "loss": 1.1437751054763794, + "step": 5298 + }, + { + "epoch": 1.9293775027302513, + "grad_norm": 19.625, + "learning_rate": 4.3025136601305893e-07, + "loss": 1.6432139873504639, + "step": 5300 + }, + { + "epoch": 1.9301055697124134, + "grad_norm": 18.5, + "learning_rate": 4.2997256083088527e-07, + "loss": 1.3277668952941895, + "step": 5302 + }, + { + "epoch": 1.930833636694576, + "grad_norm": 18.0, + "learning_rate": 4.296938564328002e-07, + "loss": 0.9986153841018677, + "step": 5304 + }, + { + "epoch": 1.9315617036767383, + "grad_norm": 26.5, + "learning_rate": 4.294152529840066e-07, + "loss": 1.6669729948043823, + "step": 5306 + }, + { + "epoch": 1.9322897706589006, + "grad_norm": 15.125, + "learning_rate": 4.2913675064964685e-07, + "loss": 1.1953425407409668, + "step": 5308 + }, + { + "epoch": 1.933017837641063, + "grad_norm": 10.0625, + "learning_rate": 4.2885834959480394e-07, + "loss": 1.3593227863311768, + "step": 5310 + }, + { + "epoch": 1.9337459046232253, + "grad_norm": 5.40625, + "learning_rate": 4.285800499845007e-07, + "loss": 1.101417064666748, + "step": 5312 + }, + { + "epoch": 1.9344739716053878, + "grad_norm": 15.875, + "learning_rate": 4.283018519836997e-07, + "loss": 1.2339668273925781, + "step": 5314 + }, + { + "epoch": 1.93520203858755, + "grad_norm": 44.0, + "learning_rate": 4.280237557573031e-07, + "loss": 1.6686416864395142, + "step": 5316 + }, + { + "epoch": 1.9359301055697125, + "grad_norm": 12.9375, + "learning_rate": 4.2774576147015334e-07, + "loss": 1.624393105506897, + "step": 5318 + }, + { + "epoch": 1.9366581725518748, + "grad_norm": 12.375, + "learning_rate": 4.274678692870317e-07, + "loss": 1.5231093168258667, + "step": 5320 + }, + { + "epoch": 1.937386239534037, + "grad_norm": 3.6875, + "learning_rate": 4.271900793726595e-07, + "loss": 0.9599215984344482, + "step": 5322 + }, + { + "epoch": 1.9381143065161996, + "grad_norm": 15.875, + "learning_rate": 4.2691239189169736e-07, + "loss": 1.3052797317504883, + "step": 5324 + }, + { + "epoch": 1.9388423734983617, + "grad_norm": 13.5, + "learning_rate": 4.2663480700874495e-07, + "loss": 1.5621004104614258, + "step": 5326 + }, + { + "epoch": 1.9395704404805243, + "grad_norm": 21.375, + "learning_rate": 4.263573248883411e-07, + "loss": 1.3269169330596924, + "step": 5328 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 17.125, + "learning_rate": 4.2607994569496414e-07, + "loss": 1.0493719577789307, + "step": 5330 + }, + { + "epoch": 1.941026574444849, + "grad_norm": 19.125, + "learning_rate": 4.2580266959303075e-07, + "loss": 1.6125602722167969, + "step": 5332 + }, + { + "epoch": 1.9417546414270113, + "grad_norm": 18.125, + "learning_rate": 4.255254967468971e-07, + "loss": 1.4883863925933838, + "step": 5334 + }, + { + "epoch": 1.9424827084091736, + "grad_norm": 21.0, + "learning_rate": 4.252484273208581e-07, + "loss": 1.2816545963287354, + "step": 5336 + }, + { + "epoch": 1.9432107753913361, + "grad_norm": 12.875, + "learning_rate": 4.2497146147914677e-07, + "loss": 1.4210858345031738, + "step": 5338 + }, + { + "epoch": 1.9439388423734982, + "grad_norm": 16.25, + "learning_rate": 4.246945993859355e-07, + "loss": 0.9522960186004639, + "step": 5340 + }, + { + "epoch": 1.9446669093556608, + "grad_norm": 15.6875, + "learning_rate": 4.2441784120533475e-07, + "loss": 1.3234107494354248, + "step": 5342 + }, + { + "epoch": 1.945394976337823, + "grad_norm": 15.375, + "learning_rate": 4.2414118710139313e-07, + "loss": 1.3565607070922852, + "step": 5344 + }, + { + "epoch": 1.9461230433199854, + "grad_norm": 20.75, + "learning_rate": 4.238646372380984e-07, + "loss": 1.5031850337982178, + "step": 5346 + }, + { + "epoch": 1.9468511103021477, + "grad_norm": 9.5, + "learning_rate": 4.2358819177937555e-07, + "loss": 1.5888288021087646, + "step": 5348 + }, + { + "epoch": 1.94757917728431, + "grad_norm": 18.5, + "learning_rate": 4.233118508890884e-07, + "loss": 1.5596145391464233, + "step": 5350 + }, + { + "epoch": 1.9483072442664726, + "grad_norm": 5.625, + "learning_rate": 4.2303561473103865e-07, + "loss": 1.1608482599258423, + "step": 5352 + }, + { + "epoch": 1.9490353112486347, + "grad_norm": 10.25, + "learning_rate": 4.227594834689656e-07, + "loss": 1.5768556594848633, + "step": 5354 + }, + { + "epoch": 1.9497633782307973, + "grad_norm": 20.375, + "learning_rate": 4.2248345726654655e-07, + "loss": 1.2801625728607178, + "step": 5356 + }, + { + "epoch": 1.9504914452129596, + "grad_norm": 17.875, + "learning_rate": 4.222075362873969e-07, + "loss": 0.8807549476623535, + "step": 5358 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 5.5, + "learning_rate": 4.219317206950688e-07, + "loss": 1.023951768875122, + "step": 5360 + }, + { + "epoch": 1.9519475791772845, + "grad_norm": 12.0, + "learning_rate": 4.216560106530528e-07, + "loss": 1.4083223342895508, + "step": 5362 + }, + { + "epoch": 1.9526756461594466, + "grad_norm": 13.1875, + "learning_rate": 4.213804063247766e-07, + "loss": 1.3836915493011475, + "step": 5364 + }, + { + "epoch": 1.953403713141609, + "grad_norm": 17.5, + "learning_rate": 4.211049078736051e-07, + "loss": 1.7741522789001465, + "step": 5366 + }, + { + "epoch": 1.9541317801237714, + "grad_norm": 16.0, + "learning_rate": 4.208295154628405e-07, + "loss": 1.5273537635803223, + "step": 5368 + }, + { + "epoch": 1.9548598471059337, + "grad_norm": 12.9375, + "learning_rate": 4.205542292557223e-07, + "loss": 1.5348329544067383, + "step": 5370 + }, + { + "epoch": 1.955587914088096, + "grad_norm": 8.875, + "learning_rate": 4.2027904941542667e-07, + "loss": 1.1293091773986816, + "step": 5372 + }, + { + "epoch": 1.9563159810702584, + "grad_norm": 85.0, + "learning_rate": 4.2000397610506726e-07, + "loss": 1.7013676166534424, + "step": 5374 + }, + { + "epoch": 1.957044048052421, + "grad_norm": 14.625, + "learning_rate": 4.197290094876942e-07, + "loss": 1.6561744213104248, + "step": 5376 + }, + { + "epoch": 1.957772115034583, + "grad_norm": 6.96875, + "learning_rate": 4.194541497262944e-07, + "loss": 1.249587059020996, + "step": 5378 + }, + { + "epoch": 1.9585001820167456, + "grad_norm": 11.9375, + "learning_rate": 4.191793969837917e-07, + "loss": 1.5321943759918213, + "step": 5380 + }, + { + "epoch": 1.959228248998908, + "grad_norm": 5.25, + "learning_rate": 4.1890475142304616e-07, + "loss": 1.2402524948120117, + "step": 5382 + }, + { + "epoch": 1.9599563159810702, + "grad_norm": 21.0, + "learning_rate": 4.186302132068544e-07, + "loss": 1.2525988817214966, + "step": 5384 + }, + { + "epoch": 1.9606843829632328, + "grad_norm": 26.375, + "learning_rate": 4.183557824979495e-07, + "loss": 1.3800227642059326, + "step": 5386 + }, + { + "epoch": 1.9614124499453949, + "grad_norm": 10.1875, + "learning_rate": 4.1808145945900095e-07, + "loss": 1.4895873069763184, + "step": 5388 + }, + { + "epoch": 1.9621405169275574, + "grad_norm": 8.6875, + "learning_rate": 4.178072442526141e-07, + "loss": 1.1199052333831787, + "step": 5390 + }, + { + "epoch": 1.9628685839097197, + "grad_norm": 8.125, + "learning_rate": 4.175331370413307e-07, + "loss": 1.092468500137329, + "step": 5392 + }, + { + "epoch": 1.963596650891882, + "grad_norm": 28.125, + "learning_rate": 4.1725913798762804e-07, + "loss": 1.2565715312957764, + "step": 5394 + }, + { + "epoch": 1.9643247178740444, + "grad_norm": 6.625, + "learning_rate": 4.169852472539199e-07, + "loss": 1.3476146459579468, + "step": 5396 + }, + { + "epoch": 1.9650527848562067, + "grad_norm": 9.875, + "learning_rate": 4.167114650025554e-07, + "loss": 1.2127494812011719, + "step": 5398 + }, + { + "epoch": 1.9657808518383693, + "grad_norm": 12.625, + "learning_rate": 4.1643779139581926e-07, + "loss": 1.2300894260406494, + "step": 5400 + }, + { + "epoch": 1.9665089188205314, + "grad_norm": 19.5, + "learning_rate": 4.1616422659593227e-07, + "loss": 1.60358464717865, + "step": 5402 + }, + { + "epoch": 1.967236985802694, + "grad_norm": 13.4375, + "learning_rate": 4.1589077076505063e-07, + "loss": 1.2521380186080933, + "step": 5404 + }, + { + "epoch": 1.9679650527848562, + "grad_norm": 19.0, + "learning_rate": 4.156174240652655e-07, + "loss": 1.3757591247558594, + "step": 5406 + }, + { + "epoch": 1.9686931197670186, + "grad_norm": 18.5, + "learning_rate": 4.1534418665860375e-07, + "loss": 1.3038842678070068, + "step": 5408 + }, + { + "epoch": 1.9694211867491809, + "grad_norm": 13.6875, + "learning_rate": 4.150710587070276e-07, + "loss": 1.5610673427581787, + "step": 5410 + }, + { + "epoch": 1.9701492537313432, + "grad_norm": 12.1875, + "learning_rate": 4.1479804037243405e-07, + "loss": 1.3251566886901855, + "step": 5412 + }, + { + "epoch": 1.9708773207135057, + "grad_norm": 32.75, + "learning_rate": 4.145251318166549e-07, + "loss": 1.6915600299835205, + "step": 5414 + }, + { + "epoch": 1.9716053876956678, + "grad_norm": 73.5, + "learning_rate": 4.1425233320145777e-07, + "loss": 1.3950165510177612, + "step": 5416 + }, + { + "epoch": 1.9723334546778304, + "grad_norm": 19.125, + "learning_rate": 4.1397964468854396e-07, + "loss": 1.8406273126602173, + "step": 5418 + }, + { + "epoch": 1.9730615216599927, + "grad_norm": 34.5, + "learning_rate": 4.1370706643955056e-07, + "loss": 0.9898960590362549, + "step": 5420 + }, + { + "epoch": 1.973789588642155, + "grad_norm": 16.0, + "learning_rate": 4.134345986160488e-07, + "loss": 1.4294590950012207, + "step": 5422 + }, + { + "epoch": 1.9745176556243176, + "grad_norm": 4.15625, + "learning_rate": 4.131622413795444e-07, + "loss": 1.2870616912841797, + "step": 5424 + }, + { + "epoch": 1.9752457226064797, + "grad_norm": 10.5625, + "learning_rate": 4.1288999489147747e-07, + "loss": 1.1248412132263184, + "step": 5426 + }, + { + "epoch": 1.9759737895886422, + "grad_norm": 11.8125, + "learning_rate": 4.12617859313223e-07, + "loss": 1.0992767810821533, + "step": 5428 + }, + { + "epoch": 1.9767018565708045, + "grad_norm": 13.3125, + "learning_rate": 4.123458348060896e-07, + "loss": 1.4928805828094482, + "step": 5430 + }, + { + "epoch": 1.9774299235529669, + "grad_norm": 46.25, + "learning_rate": 4.1207392153132047e-07, + "loss": 1.507457971572876, + "step": 5432 + }, + { + "epoch": 1.9781579905351292, + "grad_norm": 23.125, + "learning_rate": 4.1180211965009247e-07, + "loss": 1.5621905326843262, + "step": 5434 + }, + { + "epoch": 1.9788860575172915, + "grad_norm": 18.5, + "learning_rate": 4.115304293235169e-07, + "loss": 1.564670205116272, + "step": 5436 + }, + { + "epoch": 1.979614124499454, + "grad_norm": 22.125, + "learning_rate": 4.1125885071263884e-07, + "loss": 1.18473219871521, + "step": 5438 + }, + { + "epoch": 1.9803421914816162, + "grad_norm": 11.9375, + "learning_rate": 4.1098738397843686e-07, + "loss": 1.353307843208313, + "step": 5440 + }, + { + "epoch": 1.9810702584637787, + "grad_norm": 7.65625, + "learning_rate": 4.107160292818233e-07, + "loss": 0.6641855835914612, + "step": 5442 + }, + { + "epoch": 1.981798325445941, + "grad_norm": 3.53125, + "learning_rate": 4.1044478678364424e-07, + "loss": 1.2503995895385742, + "step": 5444 + }, + { + "epoch": 1.9825263924281034, + "grad_norm": 11.0625, + "learning_rate": 4.1017365664467916e-07, + "loss": 1.2928166389465332, + "step": 5446 + }, + { + "epoch": 1.983254459410266, + "grad_norm": 7.0, + "learning_rate": 4.0990263902564104e-07, + "loss": 1.2950196266174316, + "step": 5448 + }, + { + "epoch": 1.983982526392428, + "grad_norm": 11.5, + "learning_rate": 4.0963173408717625e-07, + "loss": 1.5034847259521484, + "step": 5450 + }, + { + "epoch": 1.9847105933745905, + "grad_norm": 10.6875, + "learning_rate": 4.09360941989864e-07, + "loss": 1.515030026435852, + "step": 5452 + }, + { + "epoch": 1.9854386603567529, + "grad_norm": 11.8125, + "learning_rate": 4.090902628942167e-07, + "loss": 1.2822409868240356, + "step": 5454 + }, + { + "epoch": 1.9861667273389152, + "grad_norm": 23.25, + "learning_rate": 4.088196969606802e-07, + "loss": 0.9950661659240723, + "step": 5456 + }, + { + "epoch": 1.9868947943210775, + "grad_norm": 18.75, + "learning_rate": 4.085492443496327e-07, + "loss": 1.0565800666809082, + "step": 5458 + }, + { + "epoch": 1.9876228613032398, + "grad_norm": 28.0, + "learning_rate": 4.0827890522138567e-07, + "loss": 1.2607178688049316, + "step": 5460 + }, + { + "epoch": 1.9883509282854024, + "grad_norm": 11.625, + "learning_rate": 4.0800867973618316e-07, + "loss": 1.268911361694336, + "step": 5462 + }, + { + "epoch": 1.9890789952675645, + "grad_norm": 7.09375, + "learning_rate": 4.077385680542016e-07, + "loss": 1.3100900650024414, + "step": 5464 + }, + { + "epoch": 1.989807062249727, + "grad_norm": 8.6875, + "learning_rate": 4.0746857033555066e-07, + "loss": 1.3662540912628174, + "step": 5466 + }, + { + "epoch": 1.9905351292318894, + "grad_norm": 28.625, + "learning_rate": 4.071986867402717e-07, + "loss": 1.0228921175003052, + "step": 5468 + }, + { + "epoch": 1.9912631962140517, + "grad_norm": 18.0, + "learning_rate": 4.0692891742833857e-07, + "loss": 1.42484712600708, + "step": 5470 + }, + { + "epoch": 1.991991263196214, + "grad_norm": 11.1875, + "learning_rate": 4.0665926255965774e-07, + "loss": 1.3489406108856201, + "step": 5472 + }, + { + "epoch": 1.9927193301783763, + "grad_norm": 17.875, + "learning_rate": 4.0638972229406784e-07, + "loss": 1.4260890483856201, + "step": 5474 + }, + { + "epoch": 1.9934473971605389, + "grad_norm": 123.0, + "learning_rate": 4.061202967913389e-07, + "loss": 1.3785591125488281, + "step": 5476 + }, + { + "epoch": 1.994175464142701, + "grad_norm": 11.375, + "learning_rate": 4.05850986211174e-07, + "loss": 1.712681770324707, + "step": 5478 + }, + { + "epoch": 1.9949035311248635, + "grad_norm": 9.375, + "learning_rate": 4.055817907132071e-07, + "loss": 1.2652156352996826, + "step": 5480 + }, + { + "epoch": 1.9956315981070258, + "grad_norm": 7.46875, + "learning_rate": 4.0531271045700435e-07, + "loss": 1.3126335144042969, + "step": 5482 + }, + { + "epoch": 1.9963596650891882, + "grad_norm": 24.25, + "learning_rate": 4.0504374560206367e-07, + "loss": 1.0769325494766235, + "step": 5484 + }, + { + "epoch": 1.9970877320713507, + "grad_norm": 7.28125, + "learning_rate": 4.047748963078145e-07, + "loss": 1.3531816005706787, + "step": 5486 + }, + { + "epoch": 1.9978157990535128, + "grad_norm": 10.4375, + "learning_rate": 4.0450616273361763e-07, + "loss": 1.3040752410888672, + "step": 5488 + }, + { + "epoch": 1.9985438660356754, + "grad_norm": 9.1875, + "learning_rate": 4.0423754503876575e-07, + "loss": 1.3271496295928955, + "step": 5490 + }, + { + "epoch": 1.9992719330178377, + "grad_norm": 19.625, + "learning_rate": 4.039690433824821e-07, + "loss": 1.5347609519958496, + "step": 5492 + }, + { + "epoch": 2.0, + "grad_norm": 15.5, + "learning_rate": 4.037006579239219e-07, + "loss": 1.5290462970733643, + "step": 5494 + }, + { + "epoch": 2.0007280669821625, + "grad_norm": 26.375, + "learning_rate": 4.034323888221709e-07, + "loss": 1.1109535694122314, + "step": 5496 + }, + { + "epoch": 2.0014561339643246, + "grad_norm": 12.875, + "learning_rate": 4.031642362362462e-07, + "loss": 1.2557291984558105, + "step": 5498 + }, + { + "epoch": 2.002184200946487, + "grad_norm": 16.75, + "learning_rate": 4.028962003250956e-07, + "loss": 0.8921477198600769, + "step": 5500 + }, + { + "epoch": 2.0029122679286493, + "grad_norm": 7.53125, + "learning_rate": 4.026282812475983e-07, + "loss": 1.3243731260299683, + "step": 5502 + }, + { + "epoch": 2.003640334910812, + "grad_norm": 25.75, + "learning_rate": 4.0236047916256336e-07, + "loss": 1.5873147249221802, + "step": 5504 + }, + { + "epoch": 2.004368401892974, + "grad_norm": 11.25, + "learning_rate": 4.020927942287313e-07, + "loss": 1.2481932640075684, + "step": 5506 + }, + { + "epoch": 2.0050964688751365, + "grad_norm": 12.6875, + "learning_rate": 4.0182522660477283e-07, + "loss": 1.4221183061599731, + "step": 5508 + }, + { + "epoch": 2.005824535857299, + "grad_norm": 15.625, + "learning_rate": 4.0155777644928923e-07, + "loss": 1.4575718641281128, + "step": 5510 + }, + { + "epoch": 2.006552602839461, + "grad_norm": 6.75, + "learning_rate": 4.012904439208118e-07, + "loss": 1.436309814453125, + "step": 5512 + }, + { + "epoch": 2.0072806698216237, + "grad_norm": 35.5, + "learning_rate": 4.010232291778027e-07, + "loss": 1.3791894912719727, + "step": 5514 + }, + { + "epoch": 2.0080087368037858, + "grad_norm": 12.4375, + "learning_rate": 4.0075613237865374e-07, + "loss": 1.4860548973083496, + "step": 5516 + }, + { + "epoch": 2.0087368037859483, + "grad_norm": 13.375, + "learning_rate": 4.0048915368168736e-07, + "loss": 1.310502529144287, + "step": 5518 + }, + { + "epoch": 2.009464870768111, + "grad_norm": 5.75, + "learning_rate": 4.002222932451554e-07, + "loss": 1.36233651638031, + "step": 5520 + }, + { + "epoch": 2.010192937750273, + "grad_norm": 26.0, + "learning_rate": 3.9995555122724014e-07, + "loss": 1.0669679641723633, + "step": 5522 + }, + { + "epoch": 2.0109210047324355, + "grad_norm": 4.25, + "learning_rate": 3.9968892778605317e-07, + "loss": 1.5157203674316406, + "step": 5524 + }, + { + "epoch": 2.0116490717145976, + "grad_norm": 9.375, + "learning_rate": 3.9942242307963615e-07, + "loss": 1.480398416519165, + "step": 5526 + }, + { + "epoch": 2.01237713869676, + "grad_norm": 10.625, + "learning_rate": 3.991560372659602e-07, + "loss": 1.2819180488586426, + "step": 5528 + }, + { + "epoch": 2.0131052056789223, + "grad_norm": 9.9375, + "learning_rate": 3.988897705029263e-07, + "loss": 1.1487551927566528, + "step": 5530 + }, + { + "epoch": 2.013833272661085, + "grad_norm": 35.5, + "learning_rate": 3.9862362294836415e-07, + "loss": 1.3590065240859985, + "step": 5532 + }, + { + "epoch": 2.0145613396432474, + "grad_norm": 3.359375, + "learning_rate": 3.9835759476003343e-07, + "loss": 1.2764147520065308, + "step": 5534 + }, + { + "epoch": 2.0152894066254095, + "grad_norm": 5.0625, + "learning_rate": 3.9809168609562305e-07, + "loss": 1.31927490234375, + "step": 5536 + }, + { + "epoch": 2.016017473607572, + "grad_norm": 26.25, + "learning_rate": 3.9782589711275073e-07, + "loss": 1.4636015892028809, + "step": 5538 + }, + { + "epoch": 2.016745540589734, + "grad_norm": 9.1875, + "learning_rate": 3.975602279689632e-07, + "loss": 1.4883829355239868, + "step": 5540 + }, + { + "epoch": 2.0174736075718966, + "grad_norm": 5.375, + "learning_rate": 3.972946788217366e-07, + "loss": 1.2646048069000244, + "step": 5542 + }, + { + "epoch": 2.018201674554059, + "grad_norm": 12.0, + "learning_rate": 3.970292498284754e-07, + "loss": 1.672217607498169, + "step": 5544 + }, + { + "epoch": 2.0189297415362213, + "grad_norm": 20.375, + "learning_rate": 3.967639411465134e-07, + "loss": 1.40702486038208, + "step": 5546 + }, + { + "epoch": 2.019657808518384, + "grad_norm": 13.8125, + "learning_rate": 3.9649875293311294e-07, + "loss": 1.3253629207611084, + "step": 5548 + }, + { + "epoch": 2.020385875500546, + "grad_norm": 10.625, + "learning_rate": 3.9623368534546463e-07, + "loss": 1.2738597393035889, + "step": 5550 + }, + { + "epoch": 2.0211139424827085, + "grad_norm": 25.625, + "learning_rate": 3.9596873854068775e-07, + "loss": 1.4948830604553223, + "step": 5552 + }, + { + "epoch": 2.0218420094648706, + "grad_norm": 8.25, + "learning_rate": 3.957039126758302e-07, + "loss": 1.4053280353546143, + "step": 5554 + }, + { + "epoch": 2.022570076447033, + "grad_norm": 15.3125, + "learning_rate": 3.954392079078678e-07, + "loss": 1.2540547847747803, + "step": 5556 + }, + { + "epoch": 2.0232981434291957, + "grad_norm": 18.0, + "learning_rate": 3.9517462439370505e-07, + "loss": 1.3211976289749146, + "step": 5558 + }, + { + "epoch": 2.0240262104113578, + "grad_norm": 11.4375, + "learning_rate": 3.9491016229017404e-07, + "loss": 1.4880781173706055, + "step": 5560 + }, + { + "epoch": 2.0247542773935203, + "grad_norm": 13.3125, + "learning_rate": 3.946458217540354e-07, + "loss": 1.4179404973983765, + "step": 5562 + }, + { + "epoch": 2.0254823443756824, + "grad_norm": 14.3125, + "learning_rate": 3.943816029419775e-07, + "loss": 1.444284200668335, + "step": 5564 + }, + { + "epoch": 2.026210411357845, + "grad_norm": 15.75, + "learning_rate": 3.9411750601061654e-07, + "loss": 1.254011631011963, + "step": 5566 + }, + { + "epoch": 2.026938478340007, + "grad_norm": 7.28125, + "learning_rate": 3.938535311164963e-07, + "loss": 0.8216063976287842, + "step": 5568 + }, + { + "epoch": 2.0276665453221696, + "grad_norm": 32.25, + "learning_rate": 3.935896784160886e-07, + "loss": 1.3031662702560425, + "step": 5570 + }, + { + "epoch": 2.028394612304332, + "grad_norm": 18.625, + "learning_rate": 3.9332594806579244e-07, + "loss": 1.4534103870391846, + "step": 5572 + }, + { + "epoch": 2.0291226792864943, + "grad_norm": 13.0, + "learning_rate": 3.930623402219346e-07, + "loss": 1.517370343208313, + "step": 5574 + }, + { + "epoch": 2.029850746268657, + "grad_norm": 5.65625, + "learning_rate": 3.9279885504076934e-07, + "loss": 1.2749429941177368, + "step": 5576 + }, + { + "epoch": 2.030578813250819, + "grad_norm": 13.4375, + "learning_rate": 3.925354926784778e-07, + "loss": 1.0363596677780151, + "step": 5578 + }, + { + "epoch": 2.0313068802329814, + "grad_norm": 9.8125, + "learning_rate": 3.922722532911683e-07, + "loss": 1.439997673034668, + "step": 5580 + }, + { + "epoch": 2.032034947215144, + "grad_norm": 18.25, + "learning_rate": 3.9200913703487687e-07, + "loss": 1.3043384552001953, + "step": 5582 + }, + { + "epoch": 2.032763014197306, + "grad_norm": 8.3125, + "learning_rate": 3.917461440655659e-07, + "loss": 1.282688856124878, + "step": 5584 + }, + { + "epoch": 2.0334910811794686, + "grad_norm": 3.40625, + "learning_rate": 3.9148327453912506e-07, + "loss": 1.4656150341033936, + "step": 5586 + }, + { + "epoch": 2.0342191481616307, + "grad_norm": 22.5, + "learning_rate": 3.9122052861137095e-07, + "loss": 1.2365152835845947, + "step": 5588 + }, + { + "epoch": 2.0349472151437933, + "grad_norm": 12.625, + "learning_rate": 3.909579064380464e-07, + "loss": 1.6057485342025757, + "step": 5590 + }, + { + "epoch": 2.0356752821259554, + "grad_norm": 18.25, + "learning_rate": 3.906954081748215e-07, + "loss": 0.8633642792701721, + "step": 5592 + }, + { + "epoch": 2.036403349108118, + "grad_norm": 16.375, + "learning_rate": 3.9043303397729256e-07, + "loss": 1.437652349472046, + "step": 5594 + }, + { + "epoch": 2.0371314160902805, + "grad_norm": 3.796875, + "learning_rate": 3.90170784000982e-07, + "loss": 0.8546528220176697, + "step": 5596 + }, + { + "epoch": 2.0378594830724426, + "grad_norm": 4.96875, + "learning_rate": 3.8990865840133933e-07, + "loss": 1.3153935670852661, + "step": 5598 + }, + { + "epoch": 2.038587550054605, + "grad_norm": 12.875, + "learning_rate": 3.896466573337401e-07, + "loss": 1.294619083404541, + "step": 5600 + }, + { + "epoch": 2.0393156170367672, + "grad_norm": 6.4375, + "learning_rate": 3.8938478095348565e-07, + "loss": 1.0435724258422852, + "step": 5602 + }, + { + "epoch": 2.0400436840189298, + "grad_norm": 79.0, + "learning_rate": 3.89123029415804e-07, + "loss": 1.5791676044464111, + "step": 5604 + }, + { + "epoch": 2.0407717510010923, + "grad_norm": 14.4375, + "learning_rate": 3.888614028758486e-07, + "loss": 1.2550140619277954, + "step": 5606 + }, + { + "epoch": 2.0414998179832544, + "grad_norm": 16.375, + "learning_rate": 3.885999014886992e-07, + "loss": 1.487191081047058, + "step": 5608 + }, + { + "epoch": 2.042227884965417, + "grad_norm": 11.1875, + "learning_rate": 3.883385254093614e-07, + "loss": 1.3066599369049072, + "step": 5610 + }, + { + "epoch": 2.042955951947579, + "grad_norm": 23.375, + "learning_rate": 3.8807727479276596e-07, + "loss": 1.5767487287521362, + "step": 5612 + }, + { + "epoch": 2.0436840189297416, + "grad_norm": 22.125, + "learning_rate": 3.8781614979377e-07, + "loss": 1.1804451942443848, + "step": 5614 + }, + { + "epoch": 2.0444120859119037, + "grad_norm": 13.0625, + "learning_rate": 3.8755515056715594e-07, + "loss": 1.054856300354004, + "step": 5616 + }, + { + "epoch": 2.0451401528940663, + "grad_norm": 13.625, + "learning_rate": 3.872942772676312e-07, + "loss": 1.6444454193115234, + "step": 5618 + }, + { + "epoch": 2.045868219876229, + "grad_norm": 4.9375, + "learning_rate": 3.8703353004982935e-07, + "loss": 1.085721731185913, + "step": 5620 + }, + { + "epoch": 2.046596286858391, + "grad_norm": 21.0, + "learning_rate": 3.8677290906830863e-07, + "loss": 1.81964111328125, + "step": 5622 + }, + { + "epoch": 2.0473243538405534, + "grad_norm": 27.5, + "learning_rate": 3.865124144775524e-07, + "loss": 1.0447821617126465, + "step": 5624 + }, + { + "epoch": 2.0480524208227155, + "grad_norm": 5.90625, + "learning_rate": 3.862520464319695e-07, + "loss": 1.5539491176605225, + "step": 5626 + }, + { + "epoch": 2.048780487804878, + "grad_norm": 13.1875, + "learning_rate": 3.859918050858938e-07, + "loss": 0.7993375062942505, + "step": 5628 + }, + { + "epoch": 2.04950855478704, + "grad_norm": 12.125, + "learning_rate": 3.8573169059358353e-07, + "loss": 1.139165997505188, + "step": 5630 + }, + { + "epoch": 2.0502366217692027, + "grad_norm": 3.375, + "learning_rate": 3.854717031092221e-07, + "loss": 0.7222519516944885, + "step": 5632 + }, + { + "epoch": 2.0509646887513653, + "grad_norm": 18.0, + "learning_rate": 3.8521184278691786e-07, + "loss": 0.8356004357337952, + "step": 5634 + }, + { + "epoch": 2.0516927557335274, + "grad_norm": 17.25, + "learning_rate": 3.849521097807034e-07, + "loss": 1.5697500705718994, + "step": 5636 + }, + { + "epoch": 2.05242082271569, + "grad_norm": 3.203125, + "learning_rate": 3.8469250424453557e-07, + "loss": 1.4259190559387207, + "step": 5638 + }, + { + "epoch": 2.053148889697852, + "grad_norm": 25.125, + "learning_rate": 3.8443302633229656e-07, + "loss": 2.0834898948669434, + "step": 5640 + }, + { + "epoch": 2.0538769566800146, + "grad_norm": 4.375, + "learning_rate": 3.8417367619779207e-07, + "loss": 1.4688138961791992, + "step": 5642 + }, + { + "epoch": 2.054605023662177, + "grad_norm": 19.375, + "learning_rate": 3.839144539947526e-07, + "loss": 1.6396881341934204, + "step": 5644 + }, + { + "epoch": 2.0553330906443392, + "grad_norm": 10.0, + "learning_rate": 3.836553598768324e-07, + "loss": 1.0269465446472168, + "step": 5646 + }, + { + "epoch": 2.0560611576265018, + "grad_norm": 17.375, + "learning_rate": 3.8339639399761026e-07, + "loss": 1.372738242149353, + "step": 5648 + }, + { + "epoch": 2.056789224608664, + "grad_norm": 22.125, + "learning_rate": 3.8313755651058855e-07, + "loss": 1.0509620904922485, + "step": 5650 + }, + { + "epoch": 2.0575172915908264, + "grad_norm": 25.375, + "learning_rate": 3.828788475691935e-07, + "loss": 1.245283603668213, + "step": 5652 + }, + { + "epoch": 2.0582453585729885, + "grad_norm": 22.5, + "learning_rate": 3.8262026732677555e-07, + "loss": 1.8285698890686035, + "step": 5654 + }, + { + "epoch": 2.058973425555151, + "grad_norm": 20.25, + "learning_rate": 3.8236181593660865e-07, + "loss": 1.0072861909866333, + "step": 5656 + }, + { + "epoch": 2.0597014925373136, + "grad_norm": 34.75, + "learning_rate": 3.821034935518903e-07, + "loss": 1.5823345184326172, + "step": 5658 + }, + { + "epoch": 2.0604295595194757, + "grad_norm": 15.0625, + "learning_rate": 3.818453003257415e-07, + "loss": 1.8064422607421875, + "step": 5660 + }, + { + "epoch": 2.0611576265016383, + "grad_norm": 19.125, + "learning_rate": 3.815872364112069e-07, + "loss": 1.3519439697265625, + "step": 5662 + }, + { + "epoch": 2.0618856934838004, + "grad_norm": 11.625, + "learning_rate": 3.8132930196125454e-07, + "loss": 1.434523105621338, + "step": 5664 + }, + { + "epoch": 2.062613760465963, + "grad_norm": 16.125, + "learning_rate": 3.8107149712877504e-07, + "loss": 1.358332872390747, + "step": 5666 + }, + { + "epoch": 2.0633418274481254, + "grad_norm": 20.5, + "learning_rate": 3.808138220665833e-07, + "loss": 1.680306077003479, + "step": 5668 + }, + { + "epoch": 2.0640698944302875, + "grad_norm": 15.4375, + "learning_rate": 3.805562769274162e-07, + "loss": 1.549433946609497, + "step": 5670 + }, + { + "epoch": 2.06479796141245, + "grad_norm": 9.3125, + "learning_rate": 3.8029886186393436e-07, + "loss": 1.2657043933868408, + "step": 5672 + }, + { + "epoch": 2.065526028394612, + "grad_norm": 13.375, + "learning_rate": 3.8004157702872115e-07, + "loss": 1.5076112747192383, + "step": 5674 + }, + { + "epoch": 2.0662540953767747, + "grad_norm": 7.09375, + "learning_rate": 3.7978442257428254e-07, + "loss": 1.3771909475326538, + "step": 5676 + }, + { + "epoch": 2.066982162358937, + "grad_norm": 6.0625, + "learning_rate": 3.7952739865304715e-07, + "loss": 1.2591097354888916, + "step": 5678 + }, + { + "epoch": 2.0677102293410994, + "grad_norm": 21.125, + "learning_rate": 3.7927050541736676e-07, + "loss": 1.1954448223114014, + "step": 5680 + }, + { + "epoch": 2.068438296323262, + "grad_norm": 15.125, + "learning_rate": 3.790137430195149e-07, + "loss": 1.4263126850128174, + "step": 5682 + }, + { + "epoch": 2.069166363305424, + "grad_norm": 20.625, + "learning_rate": 3.787571116116884e-07, + "loss": 1.6574556827545166, + "step": 5684 + }, + { + "epoch": 2.0698944302875866, + "grad_norm": 15.6875, + "learning_rate": 3.7850061134600565e-07, + "loss": 1.5636816024780273, + "step": 5686 + }, + { + "epoch": 2.0706224972697487, + "grad_norm": 20.125, + "learning_rate": 3.782442423745078e-07, + "loss": 0.8436065316200256, + "step": 5688 + }, + { + "epoch": 2.071350564251911, + "grad_norm": 11.6875, + "learning_rate": 3.7798800484915823e-07, + "loss": 1.4080082178115845, + "step": 5690 + }, + { + "epoch": 2.0720786312340733, + "grad_norm": 13.9375, + "learning_rate": 3.7773189892184203e-07, + "loss": 1.3796714544296265, + "step": 5692 + }, + { + "epoch": 2.072806698216236, + "grad_norm": 12.75, + "learning_rate": 3.774759247443664e-07, + "loss": 1.1532148122787476, + "step": 5694 + }, + { + "epoch": 2.0735347651983984, + "grad_norm": 11.625, + "learning_rate": 3.772200824684608e-07, + "loss": 1.722951889038086, + "step": 5696 + }, + { + "epoch": 2.0742628321805605, + "grad_norm": 14.6875, + "learning_rate": 3.769643722457759e-07, + "loss": 1.5232830047607422, + "step": 5698 + }, + { + "epoch": 2.074990899162723, + "grad_norm": 15.5625, + "learning_rate": 3.767087942278846e-07, + "loss": 1.5313119888305664, + "step": 5700 + }, + { + "epoch": 2.075718966144885, + "grad_norm": 18.5, + "learning_rate": 3.764533485662814e-07, + "loss": 1.156836986541748, + "step": 5702 + }, + { + "epoch": 2.0764470331270477, + "grad_norm": 12.9375, + "learning_rate": 3.761980354123819e-07, + "loss": 1.7371861934661865, + "step": 5704 + }, + { + "epoch": 2.0771751001092102, + "grad_norm": 12.9375, + "learning_rate": 3.759428549175237e-07, + "loss": 1.3801919221878052, + "step": 5706 + }, + { + "epoch": 2.0779031670913723, + "grad_norm": 16.625, + "learning_rate": 3.7568780723296544e-07, + "loss": 1.2528457641601562, + "step": 5708 + }, + { + "epoch": 2.078631234073535, + "grad_norm": 12.5, + "learning_rate": 3.754328925098871e-07, + "loss": 1.612968921661377, + "step": 5710 + }, + { + "epoch": 2.079359301055697, + "grad_norm": 10.25, + "learning_rate": 3.7517811089938967e-07, + "loss": 1.5015103816986084, + "step": 5712 + }, + { + "epoch": 2.0800873680378595, + "grad_norm": 12.4375, + "learning_rate": 3.7492346255249596e-07, + "loss": 1.5169007778167725, + "step": 5714 + }, + { + "epoch": 2.0808154350200216, + "grad_norm": 17.0, + "learning_rate": 3.7466894762014866e-07, + "loss": 1.700662612915039, + "step": 5716 + }, + { + "epoch": 2.081543502002184, + "grad_norm": 13.75, + "learning_rate": 3.7441456625321253e-07, + "loss": 1.2173278331756592, + "step": 5718 + }, + { + "epoch": 2.0822715689843467, + "grad_norm": 15.0625, + "learning_rate": 3.7416031860247225e-07, + "loss": 1.1855030059814453, + "step": 5720 + }, + { + "epoch": 2.082999635966509, + "grad_norm": 36.75, + "learning_rate": 3.7390620481863366e-07, + "loss": 1.720398187637329, + "step": 5722 + }, + { + "epoch": 2.0837277029486714, + "grad_norm": 9.4375, + "learning_rate": 3.736522250523231e-07, + "loss": 0.9182982444763184, + "step": 5724 + }, + { + "epoch": 2.0844557699308335, + "grad_norm": 12.75, + "learning_rate": 3.7339837945408795e-07, + "loss": 1.721379041671753, + "step": 5726 + }, + { + "epoch": 2.085183836912996, + "grad_norm": 22.75, + "learning_rate": 3.7314466817439525e-07, + "loss": 1.7353403568267822, + "step": 5728 + }, + { + "epoch": 2.085911903895158, + "grad_norm": 37.0, + "learning_rate": 3.7289109136363314e-07, + "loss": 0.9806910157203674, + "step": 5730 + }, + { + "epoch": 2.0866399708773207, + "grad_norm": 14.8125, + "learning_rate": 3.7263764917210945e-07, + "loss": 0.9615619778633118, + "step": 5732 + }, + { + "epoch": 2.087368037859483, + "grad_norm": 11.5, + "learning_rate": 3.7238434175005284e-07, + "loss": 1.2300238609313965, + "step": 5734 + }, + { + "epoch": 2.0880961048416453, + "grad_norm": 12.4375, + "learning_rate": 3.721311692476117e-07, + "loss": 1.4142905473709106, + "step": 5736 + }, + { + "epoch": 2.088824171823808, + "grad_norm": 11.125, + "learning_rate": 3.718781318148542e-07, + "loss": 1.330254316329956, + "step": 5738 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 26.5, + "learning_rate": 3.71625229601769e-07, + "loss": 1.72505521774292, + "step": 5740 + }, + { + "epoch": 2.0902803057881325, + "grad_norm": 41.25, + "learning_rate": 3.7137246275826454e-07, + "loss": 1.894031047821045, + "step": 5742 + }, + { + "epoch": 2.091008372770295, + "grad_norm": 7.71875, + "learning_rate": 3.711198314341685e-07, + "loss": 0.8848753571510315, + "step": 5744 + }, + { + "epoch": 2.091736439752457, + "grad_norm": 12.3125, + "learning_rate": 3.7086733577922897e-07, + "loss": 1.0816885232925415, + "step": 5746 + }, + { + "epoch": 2.0924645067346197, + "grad_norm": 19.25, + "learning_rate": 3.7061497594311296e-07, + "loss": 1.1160202026367188, + "step": 5748 + }, + { + "epoch": 2.093192573716782, + "grad_norm": 17.375, + "learning_rate": 3.703627520754072e-07, + "loss": 1.6631357669830322, + "step": 5750 + }, + { + "epoch": 2.0939206406989443, + "grad_norm": 14.375, + "learning_rate": 3.7011066432561794e-07, + "loss": 1.2328541278839111, + "step": 5752 + }, + { + "epoch": 2.0946487076811064, + "grad_norm": 13.3125, + "learning_rate": 3.6985871284317084e-07, + "loss": 1.575582504272461, + "step": 5754 + }, + { + "epoch": 2.095376774663269, + "grad_norm": 52.25, + "learning_rate": 3.6960689777741034e-07, + "loss": 1.9062939882278442, + "step": 5756 + }, + { + "epoch": 2.0961048416454315, + "grad_norm": 13.9375, + "learning_rate": 3.693552192776004e-07, + "loss": 1.3193533420562744, + "step": 5758 + }, + { + "epoch": 2.0968329086275936, + "grad_norm": 4.78125, + "learning_rate": 3.691036774929242e-07, + "loss": 1.2268621921539307, + "step": 5760 + }, + { + "epoch": 2.097560975609756, + "grad_norm": 14.875, + "learning_rate": 3.6885227257248337e-07, + "loss": 1.2062790393829346, + "step": 5762 + }, + { + "epoch": 2.0982890425919183, + "grad_norm": 6.4375, + "learning_rate": 3.6860100466529853e-07, + "loss": 1.2729251384735107, + "step": 5764 + }, + { + "epoch": 2.099017109574081, + "grad_norm": 4.84375, + "learning_rate": 3.6834987392030945e-07, + "loss": 1.3873820304870605, + "step": 5766 + }, + { + "epoch": 2.0997451765562434, + "grad_norm": 5.03125, + "learning_rate": 3.680988804863742e-07, + "loss": 1.2745611667633057, + "step": 5768 + }, + { + "epoch": 2.1004732435384055, + "grad_norm": 37.25, + "learning_rate": 3.6784802451226973e-07, + "loss": 0.8715425729751587, + "step": 5770 + }, + { + "epoch": 2.101201310520568, + "grad_norm": 12.25, + "learning_rate": 3.675973061466913e-07, + "loss": 1.5539045333862305, + "step": 5772 + }, + { + "epoch": 2.10192937750273, + "grad_norm": 13.5625, + "learning_rate": 3.6734672553825264e-07, + "loss": 1.5709238052368164, + "step": 5774 + }, + { + "epoch": 2.1026574444848927, + "grad_norm": 14.0625, + "learning_rate": 3.670962828354862e-07, + "loss": 1.3460569381713867, + "step": 5776 + }, + { + "epoch": 2.1033855114670548, + "grad_norm": 38.5, + "learning_rate": 3.668459781868421e-07, + "loss": 1.3048439025878906, + "step": 5778 + }, + { + "epoch": 2.1041135784492173, + "grad_norm": 12.9375, + "learning_rate": 3.6659581174068887e-07, + "loss": 1.1163508892059326, + "step": 5780 + }, + { + "epoch": 2.10484164543138, + "grad_norm": 21.5, + "learning_rate": 3.6634578364531337e-07, + "loss": 1.419758677482605, + "step": 5782 + }, + { + "epoch": 2.105569712413542, + "grad_norm": 23.125, + "learning_rate": 3.660958940489199e-07, + "loss": 1.8683795928955078, + "step": 5784 + }, + { + "epoch": 2.1062977793957045, + "grad_norm": 18.0, + "learning_rate": 3.6584614309963124e-07, + "loss": 1.4845898151397705, + "step": 5786 + }, + { + "epoch": 2.1070258463778666, + "grad_norm": 8.5625, + "learning_rate": 3.655965309454878e-07, + "loss": 1.3220224380493164, + "step": 5788 + }, + { + "epoch": 2.107753913360029, + "grad_norm": 12.0625, + "learning_rate": 3.653470577344476e-07, + "loss": 1.2986137866973877, + "step": 5790 + }, + { + "epoch": 2.1084819803421917, + "grad_norm": 17.5, + "learning_rate": 3.650977236143862e-07, + "loss": 1.4755935668945312, + "step": 5792 + }, + { + "epoch": 2.109210047324354, + "grad_norm": 6.09375, + "learning_rate": 3.64848528733097e-07, + "loss": 1.4774565696716309, + "step": 5794 + }, + { + "epoch": 2.1099381143065163, + "grad_norm": 19.75, + "learning_rate": 3.6459947323829066e-07, + "loss": 1.3214068412780762, + "step": 5796 + }, + { + "epoch": 2.1106661812886784, + "grad_norm": 17.875, + "learning_rate": 3.643505572775953e-07, + "loss": 1.458134412765503, + "step": 5798 + }, + { + "epoch": 2.111394248270841, + "grad_norm": 9.5625, + "learning_rate": 3.6410178099855664e-07, + "loss": 1.2962615489959717, + "step": 5800 + }, + { + "epoch": 2.112122315253003, + "grad_norm": 21.625, + "learning_rate": 3.6385314454863683e-07, + "loss": 1.3345646858215332, + "step": 5802 + }, + { + "epoch": 2.1128503822351656, + "grad_norm": 11.0625, + "learning_rate": 3.63604648075216e-07, + "loss": 1.2648944854736328, + "step": 5804 + }, + { + "epoch": 2.113578449217328, + "grad_norm": 18.625, + "learning_rate": 3.633562917255908e-07, + "loss": 1.4601542949676514, + "step": 5806 + }, + { + "epoch": 2.1143065161994903, + "grad_norm": 13.8125, + "learning_rate": 3.631080756469746e-07, + "loss": 1.7035515308380127, + "step": 5808 + }, + { + "epoch": 2.115034583181653, + "grad_norm": 38.25, + "learning_rate": 3.628599999864983e-07, + "loss": 1.5035191774368286, + "step": 5810 + }, + { + "epoch": 2.115762650163815, + "grad_norm": 23.0, + "learning_rate": 3.626120648912092e-07, + "loss": 1.6442712545394897, + "step": 5812 + }, + { + "epoch": 2.1164907171459775, + "grad_norm": 19.25, + "learning_rate": 3.623642705080711e-07, + "loss": 1.5214929580688477, + "step": 5814 + }, + { + "epoch": 2.1172187841281396, + "grad_norm": 12.375, + "learning_rate": 3.621166169839649e-07, + "loss": 1.4096050262451172, + "step": 5816 + }, + { + "epoch": 2.117946851110302, + "grad_norm": 21.0, + "learning_rate": 3.618691044656875e-07, + "loss": 1.4189574718475342, + "step": 5818 + }, + { + "epoch": 2.1186749180924647, + "grad_norm": 28.0, + "learning_rate": 3.616217330999523e-07, + "loss": 1.199664831161499, + "step": 5820 + }, + { + "epoch": 2.1194029850746268, + "grad_norm": 3.828125, + "learning_rate": 3.6137450303338945e-07, + "loss": 1.5003137588500977, + "step": 5822 + }, + { + "epoch": 2.1201310520567893, + "grad_norm": 5.25, + "learning_rate": 3.6112741441254474e-07, + "loss": 0.8780217170715332, + "step": 5824 + }, + { + "epoch": 2.1208591190389514, + "grad_norm": 13.375, + "learning_rate": 3.6088046738388056e-07, + "loss": 1.1803760528564453, + "step": 5826 + }, + { + "epoch": 2.121587186021114, + "grad_norm": 13.1875, + "learning_rate": 3.6063366209377546e-07, + "loss": 1.4683301448822021, + "step": 5828 + }, + { + "epoch": 2.1223152530032765, + "grad_norm": 40.0, + "learning_rate": 3.603869986885233e-07, + "loss": 1.7922911643981934, + "step": 5830 + }, + { + "epoch": 2.1230433199854386, + "grad_norm": 10.625, + "learning_rate": 3.601404773143346e-07, + "loss": 1.156049370765686, + "step": 5832 + }, + { + "epoch": 2.123771386967601, + "grad_norm": 12.9375, + "learning_rate": 3.5989409811733536e-07, + "loss": 1.332781195640564, + "step": 5834 + }, + { + "epoch": 2.1244994539497633, + "grad_norm": 7.375, + "learning_rate": 3.596478612435671e-07, + "loss": 1.1141793727874756, + "step": 5836 + }, + { + "epoch": 2.125227520931926, + "grad_norm": 16.125, + "learning_rate": 3.5940176683898725e-07, + "loss": 1.457383394241333, + "step": 5838 + }, + { + "epoch": 2.125955587914088, + "grad_norm": 7.71875, + "learning_rate": 3.5915581504946906e-07, + "loss": 1.3185855150222778, + "step": 5840 + }, + { + "epoch": 2.1266836548962504, + "grad_norm": 15.625, + "learning_rate": 3.589100060208006e-07, + "loss": 1.2916820049285889, + "step": 5842 + }, + { + "epoch": 2.127411721878413, + "grad_norm": 25.0, + "learning_rate": 3.5866433989868573e-07, + "loss": 1.6884398460388184, + "step": 5844 + }, + { + "epoch": 2.128139788860575, + "grad_norm": 23.125, + "learning_rate": 3.5841881682874354e-07, + "loss": 1.2331486940383911, + "step": 5846 + }, + { + "epoch": 2.1288678558427376, + "grad_norm": 15.6875, + "learning_rate": 3.5817343695650815e-07, + "loss": 1.4563452005386353, + "step": 5848 + }, + { + "epoch": 2.1295959228248997, + "grad_norm": 11.6875, + "learning_rate": 3.579282004274291e-07, + "loss": 1.178283452987671, + "step": 5850 + }, + { + "epoch": 2.1303239898070623, + "grad_norm": 14.6875, + "learning_rate": 3.5768310738687087e-07, + "loss": 0.9473758935928345, + "step": 5852 + }, + { + "epoch": 2.1310520567892244, + "grad_norm": 11.625, + "learning_rate": 3.5743815798011256e-07, + "loss": 1.374763011932373, + "step": 5854 + }, + { + "epoch": 2.131780123771387, + "grad_norm": 17.25, + "learning_rate": 3.571933523523488e-07, + "loss": 1.441290259361267, + "step": 5856 + }, + { + "epoch": 2.1325081907535495, + "grad_norm": 9.8125, + "learning_rate": 3.5694869064868815e-07, + "loss": 1.730541467666626, + "step": 5858 + }, + { + "epoch": 2.1332362577357116, + "grad_norm": 6.75, + "learning_rate": 3.5670417301415467e-07, + "loss": 0.9487756490707397, + "step": 5860 + }, + { + "epoch": 2.133964324717874, + "grad_norm": 12.0, + "learning_rate": 3.564597995936865e-07, + "loss": 1.3873651027679443, + "step": 5862 + }, + { + "epoch": 2.134692391700036, + "grad_norm": 21.25, + "learning_rate": 3.5621557053213635e-07, + "loss": 1.67753005027771, + "step": 5864 + }, + { + "epoch": 2.1354204586821988, + "grad_norm": 7.71875, + "learning_rate": 3.559714859742715e-07, + "loss": 1.4054288864135742, + "step": 5866 + }, + { + "epoch": 2.1361485256643613, + "grad_norm": 24.25, + "learning_rate": 3.5572754606477374e-07, + "loss": 0.9921908378601074, + "step": 5868 + }, + { + "epoch": 2.1368765926465234, + "grad_norm": 19.25, + "learning_rate": 3.5548375094823867e-07, + "loss": 1.3330621719360352, + "step": 5870 + }, + { + "epoch": 2.137604659628686, + "grad_norm": 13.3125, + "learning_rate": 3.5524010076917637e-07, + "loss": 1.4355268478393555, + "step": 5872 + }, + { + "epoch": 2.138332726610848, + "grad_norm": 14.625, + "learning_rate": 3.549965956720112e-07, + "loss": 1.3277404308319092, + "step": 5874 + }, + { + "epoch": 2.1390607935930106, + "grad_norm": 17.875, + "learning_rate": 3.5475323580108105e-07, + "loss": 1.228518009185791, + "step": 5876 + }, + { + "epoch": 2.139788860575173, + "grad_norm": 9.75, + "learning_rate": 3.5451002130063776e-07, + "loss": 1.426896095275879, + "step": 5878 + }, + { + "epoch": 2.1405169275573352, + "grad_norm": 9.8125, + "learning_rate": 3.542669523148476e-07, + "loss": 1.4462436437606812, + "step": 5880 + }, + { + "epoch": 2.141244994539498, + "grad_norm": 22.125, + "learning_rate": 3.5402402898778975e-07, + "loss": 1.4087471961975098, + "step": 5882 + }, + { + "epoch": 2.14197306152166, + "grad_norm": 14.0, + "learning_rate": 3.537812514634577e-07, + "loss": 1.5123066902160645, + "step": 5884 + }, + { + "epoch": 2.1427011285038224, + "grad_norm": 17.875, + "learning_rate": 3.5353861988575844e-07, + "loss": 1.7802894115447998, + "step": 5886 + }, + { + "epoch": 2.1434291954859845, + "grad_norm": 17.25, + "learning_rate": 3.5329613439851214e-07, + "loss": 1.0886167287826538, + "step": 5888 + }, + { + "epoch": 2.144157262468147, + "grad_norm": 29.0, + "learning_rate": 3.530537951454524e-07, + "loss": 1.4794470071792603, + "step": 5890 + }, + { + "epoch": 2.1448853294503096, + "grad_norm": 4.84375, + "learning_rate": 3.528116022702266e-07, + "loss": 1.2189726829528809, + "step": 5892 + }, + { + "epoch": 2.1456133964324717, + "grad_norm": 5.3125, + "learning_rate": 3.525695559163946e-07, + "loss": 1.8609716892242432, + "step": 5894 + }, + { + "epoch": 2.1463414634146343, + "grad_norm": 3.65625, + "learning_rate": 3.5232765622743025e-07, + "loss": 1.2643247842788696, + "step": 5896 + }, + { + "epoch": 2.1470695303967964, + "grad_norm": 19.625, + "learning_rate": 3.520859033467198e-07, + "loss": 1.4180665016174316, + "step": 5898 + }, + { + "epoch": 2.147797597378959, + "grad_norm": 31.875, + "learning_rate": 3.518442974175627e-07, + "loss": 1.3640692234039307, + "step": 5900 + }, + { + "epoch": 2.148525664361121, + "grad_norm": 40.5, + "learning_rate": 3.516028385831716e-07, + "loss": 1.4376933574676514, + "step": 5902 + }, + { + "epoch": 2.1492537313432836, + "grad_norm": 32.0, + "learning_rate": 3.513615269866716e-07, + "loss": 1.376408576965332, + "step": 5904 + }, + { + "epoch": 2.149981798325446, + "grad_norm": 47.25, + "learning_rate": 3.5112036277110033e-07, + "loss": 1.5213499069213867, + "step": 5906 + }, + { + "epoch": 2.150709865307608, + "grad_norm": 19.0, + "learning_rate": 3.508793460794087e-07, + "loss": 0.929718017578125, + "step": 5908 + }, + { + "epoch": 2.1514379322897708, + "grad_norm": 6.46875, + "learning_rate": 3.5063847705445946e-07, + "loss": 1.3371130228042603, + "step": 5910 + }, + { + "epoch": 2.152165999271933, + "grad_norm": 87.0, + "learning_rate": 3.503977558390284e-07, + "loss": 1.1674563884735107, + "step": 5912 + }, + { + "epoch": 2.1528940662540954, + "grad_norm": 12.9375, + "learning_rate": 3.5015718257580363e-07, + "loss": 1.7327897548675537, + "step": 5914 + }, + { + "epoch": 2.153622133236258, + "grad_norm": 11.0, + "learning_rate": 3.4991675740738514e-07, + "loss": 1.34199059009552, + "step": 5916 + }, + { + "epoch": 2.15435020021842, + "grad_norm": 21.25, + "learning_rate": 3.496764804762854e-07, + "loss": 1.5773694515228271, + "step": 5918 + }, + { + "epoch": 2.1550782672005826, + "grad_norm": 14.0, + "learning_rate": 3.4943635192492926e-07, + "loss": 1.7211443185806274, + "step": 5920 + }, + { + "epoch": 2.1558063341827447, + "grad_norm": 12.625, + "learning_rate": 3.491963718956531e-07, + "loss": 1.355418086051941, + "step": 5922 + }, + { + "epoch": 2.1565344011649072, + "grad_norm": 5.46875, + "learning_rate": 3.4895654053070565e-07, + "loss": 1.2929091453552246, + "step": 5924 + }, + { + "epoch": 2.1572624681470693, + "grad_norm": 19.75, + "learning_rate": 3.487168579722477e-07, + "loss": 0.8512256145477295, + "step": 5926 + }, + { + "epoch": 2.157990535129232, + "grad_norm": 5.84375, + "learning_rate": 3.48477324362351e-07, + "loss": 1.407363772392273, + "step": 5928 + }, + { + "epoch": 2.1587186021113944, + "grad_norm": 7.40625, + "learning_rate": 3.48237939843e-07, + "loss": 1.2975246906280518, + "step": 5930 + }, + { + "epoch": 2.1594466690935565, + "grad_norm": 9.125, + "learning_rate": 3.4799870455609016e-07, + "loss": 1.4413752555847168, + "step": 5932 + }, + { + "epoch": 2.160174736075719, + "grad_norm": 16.625, + "learning_rate": 3.477596186434285e-07, + "loss": 1.522780418395996, + "step": 5934 + }, + { + "epoch": 2.160902803057881, + "grad_norm": 12.0625, + "learning_rate": 3.475206822467339e-07, + "loss": 1.6308672428131104, + "step": 5936 + }, + { + "epoch": 2.1616308700400437, + "grad_norm": 32.75, + "learning_rate": 3.4728189550763634e-07, + "loss": 1.6849615573883057, + "step": 5938 + }, + { + "epoch": 2.162358937022206, + "grad_norm": 8.75, + "learning_rate": 3.4704325856767685e-07, + "loss": 1.0895112752914429, + "step": 5940 + }, + { + "epoch": 2.1630870040043684, + "grad_norm": 7.03125, + "learning_rate": 3.4680477156830834e-07, + "loss": 0.9601244926452637, + "step": 5942 + }, + { + "epoch": 2.163815070986531, + "grad_norm": 11.8125, + "learning_rate": 3.465664346508942e-07, + "loss": 1.430390477180481, + "step": 5944 + }, + { + "epoch": 2.164543137968693, + "grad_norm": 16.125, + "learning_rate": 3.4632824795670894e-07, + "loss": 1.5006717443466187, + "step": 5946 + }, + { + "epoch": 2.1652712049508556, + "grad_norm": 5.21875, + "learning_rate": 3.460902116269385e-07, + "loss": 1.2711468935012817, + "step": 5948 + }, + { + "epoch": 2.1659992719330177, + "grad_norm": 30.5, + "learning_rate": 3.4585232580267913e-07, + "loss": 0.959444522857666, + "step": 5950 + }, + { + "epoch": 2.16672733891518, + "grad_norm": 76.5, + "learning_rate": 3.4561459062493817e-07, + "loss": 1.482452392578125, + "step": 5952 + }, + { + "epoch": 2.1674554058973428, + "grad_norm": 11.3125, + "learning_rate": 3.453770062346337e-07, + "loss": 1.4156203269958496, + "step": 5954 + }, + { + "epoch": 2.168183472879505, + "grad_norm": 28.25, + "learning_rate": 3.4513957277259417e-07, + "loss": 1.3052093982696533, + "step": 5956 + }, + { + "epoch": 2.1689115398616674, + "grad_norm": 11.9375, + "learning_rate": 3.4490229037955887e-07, + "loss": 1.4626572132110596, + "step": 5958 + }, + { + "epoch": 2.1696396068438295, + "grad_norm": 13.125, + "learning_rate": 3.446651591961774e-07, + "loss": 1.7663476467132568, + "step": 5960 + }, + { + "epoch": 2.170367673825992, + "grad_norm": 7.34375, + "learning_rate": 3.444281793630094e-07, + "loss": 1.3236771821975708, + "step": 5962 + }, + { + "epoch": 2.171095740808154, + "grad_norm": 11.3125, + "learning_rate": 3.4419135102052545e-07, + "loss": 1.3599928617477417, + "step": 5964 + }, + { + "epoch": 2.1718238077903167, + "grad_norm": 17.375, + "learning_rate": 3.4395467430910595e-07, + "loss": 1.5919923782348633, + "step": 5966 + }, + { + "epoch": 2.1725518747724792, + "grad_norm": 10.125, + "learning_rate": 3.437181493690413e-07, + "loss": 1.2534955739974976, + "step": 5968 + }, + { + "epoch": 2.1732799417546413, + "grad_norm": 12.1875, + "learning_rate": 3.434817763405322e-07, + "loss": 1.4775116443634033, + "step": 5970 + }, + { + "epoch": 2.174008008736804, + "grad_norm": 13.5, + "learning_rate": 3.432455553636895e-07, + "loss": 1.4447672367095947, + "step": 5972 + }, + { + "epoch": 2.174736075718966, + "grad_norm": 19.5, + "learning_rate": 3.4300948657853323e-07, + "loss": 1.7350258827209473, + "step": 5974 + }, + { + "epoch": 2.1754641427011285, + "grad_norm": 11.0625, + "learning_rate": 3.4277357012499376e-07, + "loss": 1.4197678565979004, + "step": 5976 + }, + { + "epoch": 2.1761922096832906, + "grad_norm": 21.75, + "learning_rate": 3.4253780614291106e-07, + "loss": 1.844627857208252, + "step": 5978 + }, + { + "epoch": 2.176920276665453, + "grad_norm": 11.3125, + "learning_rate": 3.423021947720346e-07, + "loss": 1.236377239227295, + "step": 5980 + }, + { + "epoch": 2.1776483436476157, + "grad_norm": 18.625, + "learning_rate": 3.420667361520235e-07, + "loss": 1.4828205108642578, + "step": 5982 + }, + { + "epoch": 2.178376410629778, + "grad_norm": 20.5, + "learning_rate": 3.4183143042244634e-07, + "loss": 1.843029260635376, + "step": 5984 + }, + { + "epoch": 2.1791044776119404, + "grad_norm": 11.8125, + "learning_rate": 3.4159627772278103e-07, + "loss": 1.3111939430236816, + "step": 5986 + }, + { + "epoch": 2.1798325445941025, + "grad_norm": 16.25, + "learning_rate": 3.4136127819241477e-07, + "loss": 1.1420940160751343, + "step": 5988 + }, + { + "epoch": 2.180560611576265, + "grad_norm": 11.9375, + "learning_rate": 3.4112643197064397e-07, + "loss": 1.3537771701812744, + "step": 5990 + }, + { + "epoch": 2.1812886785584276, + "grad_norm": 16.875, + "learning_rate": 3.4089173919667405e-07, + "loss": 1.4927430152893066, + "step": 5992 + }, + { + "epoch": 2.1820167455405897, + "grad_norm": 9.8125, + "learning_rate": 3.4065720000961995e-07, + "loss": 1.6122634410858154, + "step": 5994 + }, + { + "epoch": 2.182744812522752, + "grad_norm": 32.75, + "learning_rate": 3.404228145485047e-07, + "loss": 0.9808583855628967, + "step": 5996 + }, + { + "epoch": 2.1834728795049143, + "grad_norm": 8.5, + "learning_rate": 3.4018858295226125e-07, + "loss": 0.5637869238853455, + "step": 5998 + }, + { + "epoch": 2.184200946487077, + "grad_norm": 13.1875, + "learning_rate": 3.399545053597306e-07, + "loss": 1.0111210346221924, + "step": 6000 + }, + { + "epoch": 2.1849290134692394, + "grad_norm": 10.875, + "learning_rate": 3.3972058190966274e-07, + "loss": 1.6773240566253662, + "step": 6002 + }, + { + "epoch": 2.1856570804514015, + "grad_norm": 10.25, + "learning_rate": 3.394868127407161e-07, + "loss": 1.8925261497497559, + "step": 6004 + }, + { + "epoch": 2.186385147433564, + "grad_norm": 14.8125, + "learning_rate": 3.3925319799145797e-07, + "loss": 1.4878836870193481, + "step": 6006 + }, + { + "epoch": 2.187113214415726, + "grad_norm": 10.8125, + "learning_rate": 3.3901973780036373e-07, + "loss": 1.3106377124786377, + "step": 6008 + }, + { + "epoch": 2.1878412813978887, + "grad_norm": 11.9375, + "learning_rate": 3.387864323058175e-07, + "loss": 1.5790293216705322, + "step": 6010 + }, + { + "epoch": 2.188569348380051, + "grad_norm": 8.875, + "learning_rate": 3.3855328164611166e-07, + "loss": 1.4601773023605347, + "step": 6012 + }, + { + "epoch": 2.1892974153622133, + "grad_norm": 11.625, + "learning_rate": 3.383202859594466e-07, + "loss": 1.5023212432861328, + "step": 6014 + }, + { + "epoch": 2.190025482344376, + "grad_norm": 14.6875, + "learning_rate": 3.3808744538393084e-07, + "loss": 1.3071670532226562, + "step": 6016 + }, + { + "epoch": 2.190753549326538, + "grad_norm": 18.375, + "learning_rate": 3.378547600575814e-07, + "loss": 1.307171106338501, + "step": 6018 + }, + { + "epoch": 2.1914816163087005, + "grad_norm": 22.875, + "learning_rate": 3.376222301183225e-07, + "loss": 1.4086741209030151, + "step": 6020 + }, + { + "epoch": 2.1922096832908626, + "grad_norm": 13.125, + "learning_rate": 3.373898557039869e-07, + "loss": 1.465623140335083, + "step": 6022 + }, + { + "epoch": 2.192937750273025, + "grad_norm": 15.75, + "learning_rate": 3.3715763695231526e-07, + "loss": 1.7257227897644043, + "step": 6024 + }, + { + "epoch": 2.1936658172551873, + "grad_norm": 5.90625, + "learning_rate": 3.369255740009552e-07, + "loss": 1.0455222129821777, + "step": 6026 + }, + { + "epoch": 2.19439388423735, + "grad_norm": 24.75, + "learning_rate": 3.3669366698746295e-07, + "loss": 1.3297648429870605, + "step": 6028 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 6.65625, + "learning_rate": 3.364619160493015e-07, + "loss": 1.2396814823150635, + "step": 6030 + }, + { + "epoch": 2.1958500182016745, + "grad_norm": 6.46875, + "learning_rate": 3.3623032132384175e-07, + "loss": 1.1364678144454956, + "step": 6032 + }, + { + "epoch": 2.196578085183837, + "grad_norm": 25.25, + "learning_rate": 3.35998882948362e-07, + "loss": 1.5326740741729736, + "step": 6034 + }, + { + "epoch": 2.197306152165999, + "grad_norm": 10.4375, + "learning_rate": 3.357676010600475e-07, + "loss": 1.3663378953933716, + "step": 6036 + }, + { + "epoch": 2.1980342191481617, + "grad_norm": 9.5, + "learning_rate": 3.355364757959914e-07, + "loss": 1.2529218196868896, + "step": 6038 + }, + { + "epoch": 2.198762286130324, + "grad_norm": 6.25, + "learning_rate": 3.3530550729319364e-07, + "loss": 1.0498268604278564, + "step": 6040 + }, + { + "epoch": 2.1994903531124863, + "grad_norm": 14.125, + "learning_rate": 3.3507469568856095e-07, + "loss": 1.5576153993606567, + "step": 6042 + }, + { + "epoch": 2.200218420094649, + "grad_norm": 7.5625, + "learning_rate": 3.3484404111890754e-07, + "loss": 1.133960247039795, + "step": 6044 + }, + { + "epoch": 2.200946487076811, + "grad_norm": 17.75, + "learning_rate": 3.346135437209543e-07, + "loss": 1.5918620824813843, + "step": 6046 + }, + { + "epoch": 2.2016745540589735, + "grad_norm": 11.875, + "learning_rate": 3.3438320363132884e-07, + "loss": 1.6148746013641357, + "step": 6048 + }, + { + "epoch": 2.2024026210411356, + "grad_norm": 52.75, + "learning_rate": 3.3415302098656596e-07, + "loss": 1.3812808990478516, + "step": 6050 + }, + { + "epoch": 2.203130688023298, + "grad_norm": 9.0, + "learning_rate": 3.3392299592310675e-07, + "loss": 1.2168689966201782, + "step": 6052 + }, + { + "epoch": 2.2038587550054607, + "grad_norm": 14.0625, + "learning_rate": 3.3369312857729894e-07, + "loss": 1.5556004047393799, + "step": 6054 + }, + { + "epoch": 2.204586821987623, + "grad_norm": 10.5, + "learning_rate": 3.33463419085397e-07, + "loss": 1.6028212308883667, + "step": 6056 + }, + { + "epoch": 2.2053148889697853, + "grad_norm": 10.3125, + "learning_rate": 3.3323386758356163e-07, + "loss": 1.0775208473205566, + "step": 6058 + }, + { + "epoch": 2.2060429559519474, + "grad_norm": 4.03125, + "learning_rate": 3.330044742078597e-07, + "loss": 1.0133678913116455, + "step": 6060 + }, + { + "epoch": 2.20677102293411, + "grad_norm": 13.5625, + "learning_rate": 3.327752390942647e-07, + "loss": 1.2774269580841064, + "step": 6062 + }, + { + "epoch": 2.207499089916272, + "grad_norm": 18.75, + "learning_rate": 3.3254616237865634e-07, + "loss": 1.478995680809021, + "step": 6064 + }, + { + "epoch": 2.2082271568984346, + "grad_norm": 15.5, + "learning_rate": 3.323172441968201e-07, + "loss": 1.3658077716827393, + "step": 6066 + }, + { + "epoch": 2.208955223880597, + "grad_norm": 9.375, + "learning_rate": 3.3208848468444774e-07, + "loss": 1.2966485023498535, + "step": 6068 + }, + { + "epoch": 2.2096832908627593, + "grad_norm": 5.25, + "learning_rate": 3.3185988397713684e-07, + "loss": 0.8807618021965027, + "step": 6070 + }, + { + "epoch": 2.210411357844922, + "grad_norm": 8.375, + "learning_rate": 3.31631442210391e-07, + "loss": 0.9366064071655273, + "step": 6072 + }, + { + "epoch": 2.211139424827084, + "grad_norm": 9.0, + "learning_rate": 3.314031595196195e-07, + "loss": 1.1482748985290527, + "step": 6074 + }, + { + "epoch": 2.2118674918092465, + "grad_norm": 31.125, + "learning_rate": 3.311750360401372e-07, + "loss": 1.4485727548599243, + "step": 6076 + }, + { + "epoch": 2.212595558791409, + "grad_norm": 5.5, + "learning_rate": 3.3094707190716477e-07, + "loss": 0.7264808416366577, + "step": 6078 + }, + { + "epoch": 2.213323625773571, + "grad_norm": 17.0, + "learning_rate": 3.3071926725582854e-07, + "loss": 1.5123050212860107, + "step": 6080 + }, + { + "epoch": 2.2140516927557337, + "grad_norm": 10.0, + "learning_rate": 3.304916222211599e-07, + "loss": 1.4028503894805908, + "step": 6082 + }, + { + "epoch": 2.2147797597378958, + "grad_norm": 11.6875, + "learning_rate": 3.302641369380963e-07, + "loss": 1.1577377319335938, + "step": 6084 + }, + { + "epoch": 2.2155078267200583, + "grad_norm": 39.5, + "learning_rate": 3.3003681154147966e-07, + "loss": 1.8366436958312988, + "step": 6086 + }, + { + "epoch": 2.2162358937022204, + "grad_norm": 12.5, + "learning_rate": 3.2980964616605763e-07, + "loss": 1.431992769241333, + "step": 6088 + }, + { + "epoch": 2.216963960684383, + "grad_norm": 34.5, + "learning_rate": 3.2958264094648294e-07, + "loss": 1.4775954484939575, + "step": 6090 + }, + { + "epoch": 2.2176920276665455, + "grad_norm": 22.375, + "learning_rate": 3.293557960173135e-07, + "loss": 1.3089592456817627, + "step": 6092 + }, + { + "epoch": 2.2184200946487076, + "grad_norm": 7.28125, + "learning_rate": 3.2912911151301195e-07, + "loss": 1.2835646867752075, + "step": 6094 + }, + { + "epoch": 2.21914816163087, + "grad_norm": 13.3125, + "learning_rate": 3.2890258756794586e-07, + "loss": 0.6033116579055786, + "step": 6096 + }, + { + "epoch": 2.2198762286130322, + "grad_norm": 14.125, + "learning_rate": 3.28676224316388e-07, + "loss": 1.0272536277770996, + "step": 6098 + }, + { + "epoch": 2.220604295595195, + "grad_norm": 16.125, + "learning_rate": 3.284500218925155e-07, + "loss": 1.4798494577407837, + "step": 6100 + }, + { + "epoch": 2.221332362577357, + "grad_norm": 18.25, + "learning_rate": 3.2822398043041015e-07, + "loss": 1.3442927598953247, + "step": 6102 + }, + { + "epoch": 2.2220604295595194, + "grad_norm": 8.3125, + "learning_rate": 3.279981000640587e-07, + "loss": 1.127135157585144, + "step": 6104 + }, + { + "epoch": 2.222788496541682, + "grad_norm": 67.0, + "learning_rate": 3.277723809273519e-07, + "loss": 0.9369891881942749, + "step": 6106 + }, + { + "epoch": 2.223516563523844, + "grad_norm": 14.125, + "learning_rate": 3.275468231540854e-07, + "loss": 1.301914930343628, + "step": 6108 + }, + { + "epoch": 2.2242446305060066, + "grad_norm": 18.375, + "learning_rate": 3.2732142687795884e-07, + "loss": 1.401808738708496, + "step": 6110 + }, + { + "epoch": 2.2249726974881687, + "grad_norm": 6.15625, + "learning_rate": 3.270961922325766e-07, + "loss": 1.2389943599700928, + "step": 6112 + }, + { + "epoch": 2.2257007644703313, + "grad_norm": 34.25, + "learning_rate": 3.268711193514465e-07, + "loss": 0.8731416463851929, + "step": 6114 + }, + { + "epoch": 2.226428831452494, + "grad_norm": 4.78125, + "learning_rate": 3.2664620836798117e-07, + "loss": 1.3819515705108643, + "step": 6116 + }, + { + "epoch": 2.227156898434656, + "grad_norm": 31.5, + "learning_rate": 3.26421459415497e-07, + "loss": 1.2150681018829346, + "step": 6118 + }, + { + "epoch": 2.2278849654168185, + "grad_norm": 10.0, + "learning_rate": 3.261968726272145e-07, + "loss": 0.9173740744590759, + "step": 6120 + }, + { + "epoch": 2.2286130323989806, + "grad_norm": 12.375, + "learning_rate": 3.2597244813625757e-07, + "loss": 1.1662805080413818, + "step": 6122 + }, + { + "epoch": 2.229341099381143, + "grad_norm": 8.5, + "learning_rate": 3.2574818607565444e-07, + "loss": 1.5169520378112793, + "step": 6124 + }, + { + "epoch": 2.2300691663633057, + "grad_norm": 35.75, + "learning_rate": 3.255240865783371e-07, + "loss": 1.3416730165481567, + "step": 6126 + }, + { + "epoch": 2.2307972333454678, + "grad_norm": 9.875, + "learning_rate": 3.253001497771408e-07, + "loss": 1.3972629308700562, + "step": 6128 + }, + { + "epoch": 2.2315253003276303, + "grad_norm": 13.75, + "learning_rate": 3.250763758048043e-07, + "loss": 1.2388252019882202, + "step": 6130 + }, + { + "epoch": 2.2322533673097924, + "grad_norm": 7.625, + "learning_rate": 3.2485276479397027e-07, + "loss": 0.9875936508178711, + "step": 6132 + }, + { + "epoch": 2.232981434291955, + "grad_norm": 12.875, + "learning_rate": 3.246293168771844e-07, + "loss": 1.2345679998397827, + "step": 6134 + }, + { + "epoch": 2.233709501274117, + "grad_norm": 17.0, + "learning_rate": 3.2440603218689595e-07, + "loss": 1.4213932752609253, + "step": 6136 + }, + { + "epoch": 2.2344375682562796, + "grad_norm": 13.25, + "learning_rate": 3.2418291085545743e-07, + "loss": 1.208149790763855, + "step": 6138 + }, + { + "epoch": 2.235165635238442, + "grad_norm": 13.125, + "learning_rate": 3.2395995301512445e-07, + "loss": 0.8953011631965637, + "step": 6140 + }, + { + "epoch": 2.2358937022206042, + "grad_norm": 19.125, + "learning_rate": 3.237371587980555e-07, + "loss": 1.0315399169921875, + "step": 6142 + }, + { + "epoch": 2.236621769202767, + "grad_norm": 14.4375, + "learning_rate": 3.235145283363125e-07, + "loss": 1.0369327068328857, + "step": 6144 + }, + { + "epoch": 2.237349836184929, + "grad_norm": 12.5, + "learning_rate": 3.232920617618598e-07, + "loss": 1.3874719142913818, + "step": 6146 + }, + { + "epoch": 2.2380779031670914, + "grad_norm": 33.0, + "learning_rate": 3.230697592065651e-07, + "loss": 1.6869721412658691, + "step": 6148 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 12.25, + "learning_rate": 3.2284762080219876e-07, + "loss": 1.0316747426986694, + "step": 6150 + }, + { + "epoch": 2.239534037131416, + "grad_norm": 29.25, + "learning_rate": 3.2262564668043343e-07, + "loss": 1.4352704286575317, + "step": 6152 + }, + { + "epoch": 2.2402621041135786, + "grad_norm": 8.3125, + "learning_rate": 3.22403836972845e-07, + "loss": 1.1252211332321167, + "step": 6154 + }, + { + "epoch": 2.2409901710957407, + "grad_norm": 16.625, + "learning_rate": 3.2218219181091144e-07, + "loss": 1.7884085178375244, + "step": 6156 + }, + { + "epoch": 2.2417182380779033, + "grad_norm": 18.875, + "learning_rate": 3.219607113260133e-07, + "loss": 1.4039523601531982, + "step": 6158 + }, + { + "epoch": 2.2424463050600654, + "grad_norm": 25.0, + "learning_rate": 3.2173939564943366e-07, + "loss": 1.6456010341644287, + "step": 6160 + }, + { + "epoch": 2.243174372042228, + "grad_norm": 24.75, + "learning_rate": 3.215182449123577e-07, + "loss": 1.4541257619857788, + "step": 6162 + }, + { + "epoch": 2.2439024390243905, + "grad_norm": 18.0, + "learning_rate": 3.21297259245873e-07, + "loss": 1.4770210981369019, + "step": 6164 + }, + { + "epoch": 2.2446305060065526, + "grad_norm": 7.4375, + "learning_rate": 3.210764387809694e-07, + "loss": 1.3939791917800903, + "step": 6166 + }, + { + "epoch": 2.245358572988715, + "grad_norm": 41.0, + "learning_rate": 3.208557836485384e-07, + "loss": 1.6156976222991943, + "step": 6168 + }, + { + "epoch": 2.246086639970877, + "grad_norm": 26.625, + "learning_rate": 3.206352939793739e-07, + "loss": 1.5547261238098145, + "step": 6170 + }, + { + "epoch": 2.2468147069530398, + "grad_norm": 9.625, + "learning_rate": 3.204149699041716e-07, + "loss": 1.3419009447097778, + "step": 6172 + }, + { + "epoch": 2.247542773935202, + "grad_norm": 12.3125, + "learning_rate": 3.2019481155352877e-07, + "loss": 1.4498188495635986, + "step": 6174 + }, + { + "epoch": 2.2482708409173644, + "grad_norm": 10.6875, + "learning_rate": 3.1997481905794487e-07, + "loss": 0.9910714626312256, + "step": 6176 + }, + { + "epoch": 2.248998907899527, + "grad_norm": 11.0625, + "learning_rate": 3.1975499254782093e-07, + "loss": 1.5234737396240234, + "step": 6178 + }, + { + "epoch": 2.249726974881689, + "grad_norm": 14.5625, + "learning_rate": 3.1953533215345947e-07, + "loss": 1.4161707162857056, + "step": 6180 + }, + { + "epoch": 2.2504550418638516, + "grad_norm": 11.625, + "learning_rate": 3.193158380050647e-07, + "loss": 1.355402946472168, + "step": 6182 + }, + { + "epoch": 2.2511831088460137, + "grad_norm": 25.125, + "learning_rate": 3.19096510232742e-07, + "loss": 1.4452890157699585, + "step": 6184 + }, + { + "epoch": 2.2519111758281762, + "grad_norm": 11.3125, + "learning_rate": 3.188773489664984e-07, + "loss": 1.503868579864502, + "step": 6186 + }, + { + "epoch": 2.2526392428103383, + "grad_norm": 12.4375, + "learning_rate": 3.1865835433624204e-07, + "loss": 1.5211918354034424, + "step": 6188 + }, + { + "epoch": 2.253367309792501, + "grad_norm": 15.0625, + "learning_rate": 3.184395264717828e-07, + "loss": 1.554072380065918, + "step": 6190 + }, + { + "epoch": 2.2540953767746634, + "grad_norm": 12.125, + "learning_rate": 3.182208655028309e-07, + "loss": 1.8581982851028442, + "step": 6192 + }, + { + "epoch": 2.2548234437568255, + "grad_norm": 10.0, + "learning_rate": 3.180023715589983e-07, + "loss": 1.4516820907592773, + "step": 6194 + }, + { + "epoch": 2.255551510738988, + "grad_norm": 18.375, + "learning_rate": 3.1778404476979755e-07, + "loss": 1.578871250152588, + "step": 6196 + }, + { + "epoch": 2.25627957772115, + "grad_norm": 11.0625, + "learning_rate": 3.175658852646424e-07, + "loss": 1.3156059980392456, + "step": 6198 + }, + { + "epoch": 2.2570076447033127, + "grad_norm": 11.375, + "learning_rate": 3.173478931728473e-07, + "loss": 1.3464895486831665, + "step": 6200 + }, + { + "epoch": 2.2577357116854753, + "grad_norm": 14.9375, + "learning_rate": 3.1713006862362723e-07, + "loss": 1.2938745021820068, + "step": 6202 + }, + { + "epoch": 2.2584637786676374, + "grad_norm": 12.4375, + "learning_rate": 3.1691241174609834e-07, + "loss": 1.222761869430542, + "step": 6204 + }, + { + "epoch": 2.2591918456498, + "grad_norm": 13.1875, + "learning_rate": 3.1669492266927723e-07, + "loss": 1.817077875137329, + "step": 6206 + }, + { + "epoch": 2.259919912631962, + "grad_norm": 12.875, + "learning_rate": 3.164776015220807e-07, + "loss": 1.4663760662078857, + "step": 6208 + }, + { + "epoch": 2.2606479796141246, + "grad_norm": 7.125, + "learning_rate": 3.1626044843332653e-07, + "loss": 1.1294500827789307, + "step": 6210 + }, + { + "epoch": 2.261376046596287, + "grad_norm": 11.875, + "learning_rate": 3.1604346353173253e-07, + "loss": 1.1774624586105347, + "step": 6212 + }, + { + "epoch": 2.262104113578449, + "grad_norm": 11.125, + "learning_rate": 3.1582664694591684e-07, + "loss": 1.2954576015472412, + "step": 6214 + }, + { + "epoch": 2.2628321805606118, + "grad_norm": 6.125, + "learning_rate": 3.156099988043979e-07, + "loss": 1.3345730304718018, + "step": 6216 + }, + { + "epoch": 2.263560247542774, + "grad_norm": 11.4375, + "learning_rate": 3.1539351923559457e-07, + "loss": 1.3501250743865967, + "step": 6218 + }, + { + "epoch": 2.2642883145249364, + "grad_norm": 11.125, + "learning_rate": 3.151772083678252e-07, + "loss": 1.4467440843582153, + "step": 6220 + }, + { + "epoch": 2.2650163815070985, + "grad_norm": 12.375, + "learning_rate": 3.149610663293086e-07, + "loss": 1.1720068454742432, + "step": 6222 + }, + { + "epoch": 2.265744448489261, + "grad_norm": 16.5, + "learning_rate": 3.1474509324816347e-07, + "loss": 1.5980451107025146, + "step": 6224 + }, + { + "epoch": 2.266472515471423, + "grad_norm": 11.5, + "learning_rate": 3.1452928925240816e-07, + "loss": 1.454470157623291, + "step": 6226 + }, + { + "epoch": 2.2672005824535857, + "grad_norm": 29.875, + "learning_rate": 3.143136544699609e-07, + "loss": 1.5907888412475586, + "step": 6228 + }, + { + "epoch": 2.2679286494357482, + "grad_norm": 11.5, + "learning_rate": 3.1409818902863965e-07, + "loss": 1.2934598922729492, + "step": 6230 + }, + { + "epoch": 2.2686567164179103, + "grad_norm": 13.125, + "learning_rate": 3.138828930561619e-07, + "loss": 1.4860308170318604, + "step": 6232 + }, + { + "epoch": 2.269384783400073, + "grad_norm": 15.25, + "learning_rate": 3.136677666801448e-07, + "loss": 1.4810161590576172, + "step": 6234 + }, + { + "epoch": 2.270112850382235, + "grad_norm": 19.625, + "learning_rate": 3.1345281002810494e-07, + "loss": 1.3604376316070557, + "step": 6236 + }, + { + "epoch": 2.2708409173643975, + "grad_norm": 27.5, + "learning_rate": 3.132380232274582e-07, + "loss": 1.549137830734253, + "step": 6238 + }, + { + "epoch": 2.27156898434656, + "grad_norm": 17.25, + "learning_rate": 3.1302340640552004e-07, + "loss": 1.7333500385284424, + "step": 6240 + }, + { + "epoch": 2.272297051328722, + "grad_norm": 18.625, + "learning_rate": 3.1280895968950487e-07, + "loss": 1.3337620496749878, + "step": 6242 + }, + { + "epoch": 2.2730251183108847, + "grad_norm": 23.375, + "learning_rate": 3.125946832065263e-07, + "loss": 1.1779835224151611, + "step": 6244 + }, + { + "epoch": 2.273753185293047, + "grad_norm": 13.5, + "learning_rate": 3.1238057708359725e-07, + "loss": 1.1329046487808228, + "step": 6246 + }, + { + "epoch": 2.2744812522752094, + "grad_norm": 23.5, + "learning_rate": 3.121666414476295e-07, + "loss": 1.6390118598937988, + "step": 6248 + }, + { + "epoch": 2.275209319257372, + "grad_norm": 11.4375, + "learning_rate": 3.1195287642543377e-07, + "loss": 1.5591938495635986, + "step": 6250 + }, + { + "epoch": 2.275937386239534, + "grad_norm": 10.5, + "learning_rate": 3.117392821437198e-07, + "loss": 1.7455458641052246, + "step": 6252 + }, + { + "epoch": 2.2766654532216966, + "grad_norm": 9.25, + "learning_rate": 3.11525858729096e-07, + "loss": 1.409688115119934, + "step": 6254 + }, + { + "epoch": 2.2773935202038587, + "grad_norm": 11.1875, + "learning_rate": 3.1131260630806925e-07, + "loss": 1.1554585695266724, + "step": 6256 + }, + { + "epoch": 2.278121587186021, + "grad_norm": 16.625, + "learning_rate": 3.1109952500704567e-07, + "loss": 1.5241730213165283, + "step": 6258 + }, + { + "epoch": 2.2788496541681833, + "grad_norm": 31.125, + "learning_rate": 3.108866149523293e-07, + "loss": 1.7754125595092773, + "step": 6260 + }, + { + "epoch": 2.279577721150346, + "grad_norm": 13.3125, + "learning_rate": 3.106738762701232e-07, + "loss": 1.2551428079605103, + "step": 6262 + }, + { + "epoch": 2.280305788132508, + "grad_norm": 8.1875, + "learning_rate": 3.104613090865286e-07, + "loss": 1.1148711442947388, + "step": 6264 + }, + { + "epoch": 2.2810338551146705, + "grad_norm": 11.625, + "learning_rate": 3.1024891352754503e-07, + "loss": 1.3510916233062744, + "step": 6266 + }, + { + "epoch": 2.281761922096833, + "grad_norm": 23.5, + "learning_rate": 3.100366897190705e-07, + "loss": 1.2466145753860474, + "step": 6268 + }, + { + "epoch": 2.282489989078995, + "grad_norm": 15.6875, + "learning_rate": 3.0982463778690096e-07, + "loss": 1.2892639636993408, + "step": 6270 + }, + { + "epoch": 2.2832180560611577, + "grad_norm": 14.0625, + "learning_rate": 3.0961275785673044e-07, + "loss": 1.4508719444274902, + "step": 6272 + }, + { + "epoch": 2.28394612304332, + "grad_norm": 8.625, + "learning_rate": 3.094010500541514e-07, + "loss": 1.3325765132904053, + "step": 6274 + }, + { + "epoch": 2.2846741900254823, + "grad_norm": 7.125, + "learning_rate": 3.091895145046541e-07, + "loss": 1.1269574165344238, + "step": 6276 + }, + { + "epoch": 2.285402257007645, + "grad_norm": 10.75, + "learning_rate": 3.089781513336264e-07, + "loss": 1.4028818607330322, + "step": 6278 + }, + { + "epoch": 2.286130323989807, + "grad_norm": 31.25, + "learning_rate": 3.087669606663545e-07, + "loss": 1.332125186920166, + "step": 6280 + }, + { + "epoch": 2.2868583909719695, + "grad_norm": 7.5625, + "learning_rate": 3.085559426280219e-07, + "loss": 1.254866123199463, + "step": 6282 + }, + { + "epoch": 2.2875864579541316, + "grad_norm": 11.3125, + "learning_rate": 3.0834509734370986e-07, + "loss": 1.4101008176803589, + "step": 6284 + }, + { + "epoch": 2.288314524936294, + "grad_norm": 10.25, + "learning_rate": 3.0813442493839747e-07, + "loss": 1.4421353340148926, + "step": 6286 + }, + { + "epoch": 2.2890425919184567, + "grad_norm": 6.53125, + "learning_rate": 3.0792392553696115e-07, + "loss": 1.0523264408111572, + "step": 6288 + }, + { + "epoch": 2.289770658900619, + "grad_norm": 6.0, + "learning_rate": 3.077135992641748e-07, + "loss": 1.1850032806396484, + "step": 6290 + }, + { + "epoch": 2.2904987258827814, + "grad_norm": 13.5, + "learning_rate": 3.0750344624470986e-07, + "loss": 1.4040499925613403, + "step": 6292 + }, + { + "epoch": 2.2912267928649435, + "grad_norm": 12.75, + "learning_rate": 3.072934666031347e-07, + "loss": 1.3968956470489502, + "step": 6294 + }, + { + "epoch": 2.291954859847106, + "grad_norm": 16.75, + "learning_rate": 3.070836604639154e-07, + "loss": 0.8251042366027832, + "step": 6296 + }, + { + "epoch": 2.292682926829268, + "grad_norm": 12.5, + "learning_rate": 3.068740279514148e-07, + "loss": 1.4016964435577393, + "step": 6298 + }, + { + "epoch": 2.2934109938114307, + "grad_norm": 11.875, + "learning_rate": 3.0666456918989295e-07, + "loss": 1.3720228672027588, + "step": 6300 + }, + { + "epoch": 2.294139060793593, + "grad_norm": 9.625, + "learning_rate": 3.0645528430350694e-07, + "loss": 1.4037294387817383, + "step": 6302 + }, + { + "epoch": 2.2948671277757553, + "grad_norm": 6.84375, + "learning_rate": 3.0624617341631094e-07, + "loss": 1.1970716714859009, + "step": 6304 + }, + { + "epoch": 2.295595194757918, + "grad_norm": 20.0, + "learning_rate": 3.060372366522556e-07, + "loss": 1.3015151023864746, + "step": 6306 + }, + { + "epoch": 2.29632326174008, + "grad_norm": 13.4375, + "learning_rate": 3.0582847413518884e-07, + "loss": 1.3795393705368042, + "step": 6308 + }, + { + "epoch": 2.2970513287222425, + "grad_norm": 7.09375, + "learning_rate": 3.0561988598885486e-07, + "loss": 1.3986828327178955, + "step": 6310 + }, + { + "epoch": 2.2977793957044046, + "grad_norm": 14.875, + "learning_rate": 3.0541147233689457e-07, + "loss": 0.7174405455589294, + "step": 6312 + }, + { + "epoch": 2.298507462686567, + "grad_norm": 13.875, + "learning_rate": 3.052032333028458e-07, + "loss": 0.8590954542160034, + "step": 6314 + }, + { + "epoch": 2.2992355296687297, + "grad_norm": 24.375, + "learning_rate": 3.0499516901014263e-07, + "loss": 1.1754486560821533, + "step": 6316 + }, + { + "epoch": 2.299963596650892, + "grad_norm": 8.25, + "learning_rate": 3.047872795821153e-07, + "loss": 1.1667795181274414, + "step": 6318 + }, + { + "epoch": 2.3006916636330543, + "grad_norm": 13.0625, + "learning_rate": 3.04579565141991e-07, + "loss": 1.3826136589050293, + "step": 6320 + }, + { + "epoch": 2.3014197306152164, + "grad_norm": 15.5, + "learning_rate": 3.0437202581289255e-07, + "loss": 1.1737334728240967, + "step": 6322 + }, + { + "epoch": 2.302147797597379, + "grad_norm": 11.6875, + "learning_rate": 3.0416466171783963e-07, + "loss": 0.9265611171722412, + "step": 6324 + }, + { + "epoch": 2.3028758645795415, + "grad_norm": 15.625, + "learning_rate": 3.0395747297974746e-07, + "loss": 1.5897672176361084, + "step": 6326 + }, + { + "epoch": 2.3036039315617036, + "grad_norm": 18.25, + "learning_rate": 3.0375045972142746e-07, + "loss": 1.439791202545166, + "step": 6328 + }, + { + "epoch": 2.304331998543866, + "grad_norm": 19.25, + "learning_rate": 3.035436220655874e-07, + "loss": 1.105435848236084, + "step": 6330 + }, + { + "epoch": 2.3050600655260283, + "grad_norm": 50.0, + "learning_rate": 3.0333696013483054e-07, + "loss": 0.7961536049842834, + "step": 6332 + }, + { + "epoch": 2.305788132508191, + "grad_norm": 12.625, + "learning_rate": 3.0313047405165616e-07, + "loss": 1.2911365032196045, + "step": 6334 + }, + { + "epoch": 2.3065161994903534, + "grad_norm": 11.8125, + "learning_rate": 3.029241639384592e-07, + "loss": 1.2564961910247803, + "step": 6336 + }, + { + "epoch": 2.3072442664725155, + "grad_norm": 13.25, + "learning_rate": 3.027180299175307e-07, + "loss": 1.334195852279663, + "step": 6338 + }, + { + "epoch": 2.307972333454678, + "grad_norm": 14.125, + "learning_rate": 3.025120721110567e-07, + "loss": 1.198038101196289, + "step": 6340 + }, + { + "epoch": 2.30870040043684, + "grad_norm": 24.25, + "learning_rate": 3.0230629064111897e-07, + "loss": 1.292498230934143, + "step": 6342 + }, + { + "epoch": 2.3094284674190027, + "grad_norm": 8.0, + "learning_rate": 3.021006856296952e-07, + "loss": 0.8261981010437012, + "step": 6344 + }, + { + "epoch": 2.3101565344011648, + "grad_norm": 16.375, + "learning_rate": 3.018952571986579e-07, + "loss": 1.471326231956482, + "step": 6346 + }, + { + "epoch": 2.3108846013833273, + "grad_norm": 9.6875, + "learning_rate": 3.016900054697752e-07, + "loss": 1.3190181255340576, + "step": 6348 + }, + { + "epoch": 2.3116126683654894, + "grad_norm": 12.5625, + "learning_rate": 3.0148493056471063e-07, + "loss": 1.0911686420440674, + "step": 6350 + }, + { + "epoch": 2.312340735347652, + "grad_norm": 10.1875, + "learning_rate": 3.012800326050227e-07, + "loss": 1.293243408203125, + "step": 6352 + }, + { + "epoch": 2.3130688023298145, + "grad_norm": 12.3125, + "learning_rate": 3.0107531171216474e-07, + "loss": 1.152498722076416, + "step": 6354 + }, + { + "epoch": 2.3137968693119766, + "grad_norm": 28.625, + "learning_rate": 3.0087076800748587e-07, + "loss": 1.6850712299346924, + "step": 6356 + }, + { + "epoch": 2.314524936294139, + "grad_norm": 14.375, + "learning_rate": 3.006664016122294e-07, + "loss": 1.4269928932189941, + "step": 6358 + }, + { + "epoch": 2.3152530032763012, + "grad_norm": 12.25, + "learning_rate": 3.0046221264753414e-07, + "loss": 1.2763667106628418, + "step": 6360 + }, + { + "epoch": 2.315981070258464, + "grad_norm": 10.4375, + "learning_rate": 3.0025820123443347e-07, + "loss": 1.2223560810089111, + "step": 6362 + }, + { + "epoch": 2.3167091372406263, + "grad_norm": 16.625, + "learning_rate": 3.000543674938554e-07, + "loss": 1.3776737451553345, + "step": 6364 + }, + { + "epoch": 2.3174372042227884, + "grad_norm": 16.875, + "learning_rate": 2.998507115466231e-07, + "loss": 1.7370796203613281, + "step": 6366 + }, + { + "epoch": 2.318165271204951, + "grad_norm": 16.5, + "learning_rate": 2.9964723351345367e-07, + "loss": 1.5478923320770264, + "step": 6368 + }, + { + "epoch": 2.318893338187113, + "grad_norm": 16.75, + "learning_rate": 2.994439335149591e-07, + "loss": 1.288620114326477, + "step": 6370 + }, + { + "epoch": 2.3196214051692756, + "grad_norm": 23.375, + "learning_rate": 2.9924081167164615e-07, + "loss": 1.3628039360046387, + "step": 6372 + }, + { + "epoch": 2.320349472151438, + "grad_norm": 9.375, + "learning_rate": 2.990378681039154e-07, + "loss": 1.220543384552002, + "step": 6374 + }, + { + "epoch": 2.3210775391336003, + "grad_norm": 17.5, + "learning_rate": 2.9883510293206205e-07, + "loss": 1.337418794631958, + "step": 6376 + }, + { + "epoch": 2.321805606115763, + "grad_norm": 15.125, + "learning_rate": 2.9863251627627577e-07, + "loss": 1.282632827758789, + "step": 6378 + }, + { + "epoch": 2.322533673097925, + "grad_norm": 12.75, + "learning_rate": 2.9843010825664005e-07, + "loss": 1.2562873363494873, + "step": 6380 + }, + { + "epoch": 2.3232617400800875, + "grad_norm": 10.4375, + "learning_rate": 2.9822787899313255e-07, + "loss": 0.8646048307418823, + "step": 6382 + }, + { + "epoch": 2.3239898070622496, + "grad_norm": 16.125, + "learning_rate": 2.980258286056252e-07, + "loss": 1.526928424835205, + "step": 6384 + }, + { + "epoch": 2.324717874044412, + "grad_norm": 17.75, + "learning_rate": 2.9782395721388366e-07, + "loss": 1.5811107158660889, + "step": 6386 + }, + { + "epoch": 2.325445941026574, + "grad_norm": 28.625, + "learning_rate": 2.976222649375675e-07, + "loss": 1.4497644901275635, + "step": 6388 + }, + { + "epoch": 2.3261740080087367, + "grad_norm": 16.75, + "learning_rate": 2.9742075189623047e-07, + "loss": 1.7063870429992676, + "step": 6390 + }, + { + "epoch": 2.3269020749908993, + "grad_norm": 9.1875, + "learning_rate": 2.972194182093195e-07, + "loss": 1.013759732246399, + "step": 6392 + }, + { + "epoch": 2.3276301419730614, + "grad_norm": 21.25, + "learning_rate": 2.970182639961758e-07, + "loss": 1.1840308904647827, + "step": 6394 + }, + { + "epoch": 2.328358208955224, + "grad_norm": 14.0625, + "learning_rate": 2.9681728937603365e-07, + "loss": 1.4314606189727783, + "step": 6396 + }, + { + "epoch": 2.329086275937386, + "grad_norm": 9.9375, + "learning_rate": 2.9661649446802134e-07, + "loss": 1.173577070236206, + "step": 6398 + }, + { + "epoch": 2.3298143429195486, + "grad_norm": 28.0, + "learning_rate": 2.964158793911603e-07, + "loss": 0.9525898694992065, + "step": 6400 + }, + { + "epoch": 2.330542409901711, + "grad_norm": 12.625, + "learning_rate": 2.9621544426436563e-07, + "loss": 1.6011019945144653, + "step": 6402 + }, + { + "epoch": 2.3312704768838732, + "grad_norm": 20.125, + "learning_rate": 2.960151892064454e-07, + "loss": 1.6649761199951172, + "step": 6404 + }, + { + "epoch": 2.3319985438660358, + "grad_norm": 18.75, + "learning_rate": 2.9581511433610153e-07, + "loss": 1.290449857711792, + "step": 6406 + }, + { + "epoch": 2.332726610848198, + "grad_norm": 4.4375, + "learning_rate": 2.956152197719286e-07, + "loss": 0.806548535823822, + "step": 6408 + }, + { + "epoch": 2.3334546778303604, + "grad_norm": 9.375, + "learning_rate": 2.954155056324143e-07, + "loss": 1.290272831916809, + "step": 6410 + }, + { + "epoch": 2.334182744812523, + "grad_norm": 19.25, + "learning_rate": 2.952159720359399e-07, + "loss": 1.1331778764724731, + "step": 6412 + }, + { + "epoch": 2.334910811794685, + "grad_norm": 8.125, + "learning_rate": 2.95016619100779e-07, + "loss": 1.298039197921753, + "step": 6414 + }, + { + "epoch": 2.3356388787768476, + "grad_norm": 14.9375, + "learning_rate": 2.948174469450987e-07, + "loss": 1.472787618637085, + "step": 6416 + }, + { + "epoch": 2.3363669457590097, + "grad_norm": 14.6875, + "learning_rate": 2.9461845568695863e-07, + "loss": 1.3630478382110596, + "step": 6418 + }, + { + "epoch": 2.3370950127411723, + "grad_norm": 23.25, + "learning_rate": 2.944196454443111e-07, + "loss": 1.474900722503662, + "step": 6420 + }, + { + "epoch": 2.3378230797233344, + "grad_norm": 17.625, + "learning_rate": 2.9422101633500145e-07, + "loss": 1.633987545967102, + "step": 6422 + }, + { + "epoch": 2.338551146705497, + "grad_norm": 11.875, + "learning_rate": 2.9402256847676735e-07, + "loss": 0.9540705680847168, + "step": 6424 + }, + { + "epoch": 2.3392792136876595, + "grad_norm": 14.625, + "learning_rate": 2.938243019872391e-07, + "loss": 1.5863673686981201, + "step": 6426 + }, + { + "epoch": 2.3400072806698216, + "grad_norm": 12.5, + "learning_rate": 2.9362621698393947e-07, + "loss": 1.1388983726501465, + "step": 6428 + }, + { + "epoch": 2.340735347651984, + "grad_norm": 19.375, + "learning_rate": 2.9342831358428404e-07, + "loss": 1.2458865642547607, + "step": 6430 + }, + { + "epoch": 2.341463414634146, + "grad_norm": 18.75, + "learning_rate": 2.9323059190558014e-07, + "loss": 1.8429067134857178, + "step": 6432 + }, + { + "epoch": 2.3421914816163087, + "grad_norm": 8.6875, + "learning_rate": 2.930330520650277e-07, + "loss": 1.2974172830581665, + "step": 6434 + }, + { + "epoch": 2.342919548598471, + "grad_norm": 29.75, + "learning_rate": 2.92835694179719e-07, + "loss": 1.6267732381820679, + "step": 6436 + }, + { + "epoch": 2.3436476155806334, + "grad_norm": 13.4375, + "learning_rate": 2.9263851836663835e-07, + "loss": 1.1608515977859497, + "step": 6438 + }, + { + "epoch": 2.344375682562796, + "grad_norm": 14.0625, + "learning_rate": 2.924415247426617e-07, + "loss": 1.4824105501174927, + "step": 6440 + }, + { + "epoch": 2.345103749544958, + "grad_norm": 12.625, + "learning_rate": 2.922447134245578e-07, + "loss": 1.52664315700531, + "step": 6442 + }, + { + "epoch": 2.3458318165271206, + "grad_norm": 16.25, + "learning_rate": 2.9204808452898664e-07, + "loss": 0.9121277332305908, + "step": 6444 + }, + { + "epoch": 2.3465598835092827, + "grad_norm": 6.1875, + "learning_rate": 2.918516381725007e-07, + "loss": 1.0830274820327759, + "step": 6446 + }, + { + "epoch": 2.3472879504914452, + "grad_norm": 14.25, + "learning_rate": 2.916553744715435e-07, + "loss": 1.3859456777572632, + "step": 6448 + }, + { + "epoch": 2.3480160174736078, + "grad_norm": 7.59375, + "learning_rate": 2.914592935424512e-07, + "loss": 1.283904790878296, + "step": 6450 + }, + { + "epoch": 2.34874408445577, + "grad_norm": 14.5625, + "learning_rate": 2.9126339550145087e-07, + "loss": 1.0288848876953125, + "step": 6452 + }, + { + "epoch": 2.3494721514379324, + "grad_norm": 14.4375, + "learning_rate": 2.9106768046466134e-07, + "loss": 1.1657896041870117, + "step": 6454 + }, + { + "epoch": 2.3502002184200945, + "grad_norm": 10.625, + "learning_rate": 2.9087214854809323e-07, + "loss": 1.080242395401001, + "step": 6456 + }, + { + "epoch": 2.350928285402257, + "grad_norm": 12.3125, + "learning_rate": 2.906767998676486e-07, + "loss": 1.404772162437439, + "step": 6458 + }, + { + "epoch": 2.3516563523844196, + "grad_norm": 8.625, + "learning_rate": 2.904816345391204e-07, + "loss": 1.2640372514724731, + "step": 6460 + }, + { + "epoch": 2.3523844193665817, + "grad_norm": 11.375, + "learning_rate": 2.902866526781935e-07, + "loss": 1.611601710319519, + "step": 6462 + }, + { + "epoch": 2.3531124863487443, + "grad_norm": 13.875, + "learning_rate": 2.900918544004438e-07, + "loss": 1.2875325679779053, + "step": 6464 + }, + { + "epoch": 2.3538405533309064, + "grad_norm": 38.5, + "learning_rate": 2.898972398213383e-07, + "loss": 1.9911233186721802, + "step": 6466 + }, + { + "epoch": 2.354568620313069, + "grad_norm": 11.875, + "learning_rate": 2.89702809056235e-07, + "loss": 1.50071382522583, + "step": 6468 + }, + { + "epoch": 2.355296687295231, + "grad_norm": 20.875, + "learning_rate": 2.895085622203834e-07, + "loss": 1.664984941482544, + "step": 6470 + }, + { + "epoch": 2.3560247542773936, + "grad_norm": 25.75, + "learning_rate": 2.893144994289234e-07, + "loss": 1.3697190284729004, + "step": 6472 + }, + { + "epoch": 2.3567528212595557, + "grad_norm": 85.0, + "learning_rate": 2.8912062079688617e-07, + "loss": 1.4123151302337646, + "step": 6474 + }, + { + "epoch": 2.357480888241718, + "grad_norm": 18.375, + "learning_rate": 2.8892692643919373e-07, + "loss": 1.7595241069793701, + "step": 6476 + }, + { + "epoch": 2.3582089552238807, + "grad_norm": 14.875, + "learning_rate": 2.887334164706588e-07, + "loss": 1.4573192596435547, + "step": 6478 + }, + { + "epoch": 2.358937022206043, + "grad_norm": 26.375, + "learning_rate": 2.885400910059846e-07, + "loss": 0.8024557828903198, + "step": 6480 + }, + { + "epoch": 2.3596650891882054, + "grad_norm": 18.875, + "learning_rate": 2.8834695015976535e-07, + "loss": 1.423046350479126, + "step": 6482 + }, + { + "epoch": 2.3603931561703675, + "grad_norm": 25.25, + "learning_rate": 2.881539940464855e-07, + "loss": 1.2803380489349365, + "step": 6484 + }, + { + "epoch": 2.36112122315253, + "grad_norm": 12.5, + "learning_rate": 2.8796122278052025e-07, + "loss": 1.602477788925171, + "step": 6486 + }, + { + "epoch": 2.3618492901346926, + "grad_norm": 16.125, + "learning_rate": 2.877686364761352e-07, + "loss": 1.2951278686523438, + "step": 6488 + }, + { + "epoch": 2.3625773571168547, + "grad_norm": 16.375, + "learning_rate": 2.8757623524748607e-07, + "loss": 1.03763747215271, + "step": 6490 + }, + { + "epoch": 2.3633054240990172, + "grad_norm": 65.0, + "learning_rate": 2.873840192086193e-07, + "loss": 1.6155928373336792, + "step": 6492 + }, + { + "epoch": 2.3640334910811793, + "grad_norm": 12.25, + "learning_rate": 2.8719198847347117e-07, + "loss": 1.3214967250823975, + "step": 6494 + }, + { + "epoch": 2.364761558063342, + "grad_norm": 9.125, + "learning_rate": 2.8700014315586815e-07, + "loss": 1.102116584777832, + "step": 6496 + }, + { + "epoch": 2.3654896250455044, + "grad_norm": 6.40625, + "learning_rate": 2.868084833695271e-07, + "loss": 1.099461317062378, + "step": 6498 + }, + { + "epoch": 2.3662176920276665, + "grad_norm": 6.1875, + "learning_rate": 2.866170092280545e-07, + "loss": 1.3375420570373535, + "step": 6500 + }, + { + "epoch": 2.366945759009829, + "grad_norm": 7.71875, + "learning_rate": 2.864257208449473e-07, + "loss": 1.2983028888702393, + "step": 6502 + }, + { + "epoch": 2.367673825991991, + "grad_norm": 19.25, + "learning_rate": 2.862346183335919e-07, + "loss": 1.3711466789245605, + "step": 6504 + }, + { + "epoch": 2.3684018929741537, + "grad_norm": 10.625, + "learning_rate": 2.8604370180726477e-07, + "loss": 1.1283843517303467, + "step": 6506 + }, + { + "epoch": 2.369129959956316, + "grad_norm": 13.0, + "learning_rate": 2.858529713791319e-07, + "loss": 1.3351354598999023, + "step": 6508 + }, + { + "epoch": 2.3698580269384784, + "grad_norm": 15.75, + "learning_rate": 2.856624271622492e-07, + "loss": 1.3940119743347168, + "step": 6510 + }, + { + "epoch": 2.3705860939206405, + "grad_norm": 6.1875, + "learning_rate": 2.8547206926956206e-07, + "loss": 1.2587482929229736, + "step": 6512 + }, + { + "epoch": 2.371314160902803, + "grad_norm": 8.625, + "learning_rate": 2.8528189781390556e-07, + "loss": 1.3069417476654053, + "step": 6514 + }, + { + "epoch": 2.3720422278849655, + "grad_norm": 17.75, + "learning_rate": 2.8509191290800425e-07, + "loss": 1.5515795946121216, + "step": 6516 + }, + { + "epoch": 2.3727702948671276, + "grad_norm": 20.875, + "learning_rate": 2.8490211466447184e-07, + "loss": 1.4367560148239136, + "step": 6518 + }, + { + "epoch": 2.37349836184929, + "grad_norm": 12.5, + "learning_rate": 2.847125031958118e-07, + "loss": 1.0952129364013672, + "step": 6520 + }, + { + "epoch": 2.3742264288314523, + "grad_norm": 32.0, + "learning_rate": 2.8452307861441663e-07, + "loss": 1.3477327823638916, + "step": 6522 + }, + { + "epoch": 2.374954495813615, + "grad_norm": 20.75, + "learning_rate": 2.84333841032568e-07, + "loss": 1.617757797241211, + "step": 6524 + }, + { + "epoch": 2.3756825627957774, + "grad_norm": 20.125, + "learning_rate": 2.84144790562437e-07, + "loss": 1.785931944847107, + "step": 6526 + }, + { + "epoch": 2.3764106297779395, + "grad_norm": 11.375, + "learning_rate": 2.8395592731608364e-07, + "loss": 1.5056750774383545, + "step": 6528 + }, + { + "epoch": 2.377138696760102, + "grad_norm": 12.3125, + "learning_rate": 2.8376725140545676e-07, + "loss": 1.7157129049301147, + "step": 6530 + }, + { + "epoch": 2.377866763742264, + "grad_norm": 7.1875, + "learning_rate": 2.835787629423947e-07, + "loss": 1.2371762990951538, + "step": 6532 + }, + { + "epoch": 2.3785948307244267, + "grad_norm": 3.515625, + "learning_rate": 2.8339046203862405e-07, + "loss": 0.8035846948623657, + "step": 6534 + }, + { + "epoch": 2.3793228977065892, + "grad_norm": 9.9375, + "learning_rate": 2.8320234880576083e-07, + "loss": 1.1977893114089966, + "step": 6536 + }, + { + "epoch": 2.3800509646887513, + "grad_norm": 35.0, + "learning_rate": 2.8301442335530935e-07, + "loss": 1.441650390625, + "step": 6538 + }, + { + "epoch": 2.380779031670914, + "grad_norm": 5.28125, + "learning_rate": 2.828266857986626e-07, + "loss": 1.0540441274642944, + "step": 6540 + }, + { + "epoch": 2.381507098653076, + "grad_norm": 14.0625, + "learning_rate": 2.8263913624710263e-07, + "loss": 1.6574046611785889, + "step": 6542 + }, + { + "epoch": 2.3822351656352385, + "grad_norm": 6.28125, + "learning_rate": 2.8245177481179984e-07, + "loss": 1.4298672676086426, + "step": 6544 + }, + { + "epoch": 2.3829632326174006, + "grad_norm": 11.3125, + "learning_rate": 2.822646016038128e-07, + "loss": 1.335983157157898, + "step": 6546 + }, + { + "epoch": 2.383691299599563, + "grad_norm": 11.6875, + "learning_rate": 2.82077616734089e-07, + "loss": 1.1965689659118652, + "step": 6548 + }, + { + "epoch": 2.3844193665817257, + "grad_norm": 45.75, + "learning_rate": 2.8189082031346413e-07, + "loss": 1.277300238609314, + "step": 6550 + }, + { + "epoch": 2.385147433563888, + "grad_norm": 19.375, + "learning_rate": 2.8170421245266184e-07, + "loss": 1.3595154285430908, + "step": 6552 + }, + { + "epoch": 2.3858755005460504, + "grad_norm": 23.75, + "learning_rate": 2.8151779326229454e-07, + "loss": 1.6427298784255981, + "step": 6554 + }, + { + "epoch": 2.3866035675282125, + "grad_norm": 15.0625, + "learning_rate": 2.8133156285286254e-07, + "loss": 1.6696240901947021, + "step": 6556 + }, + { + "epoch": 2.387331634510375, + "grad_norm": 15.75, + "learning_rate": 2.8114552133475415e-07, + "loss": 1.4680750370025635, + "step": 6558 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 13.5, + "learning_rate": 2.809596688182459e-07, + "loss": 1.3531396389007568, + "step": 6560 + }, + { + "epoch": 2.3887877684746996, + "grad_norm": 14.0, + "learning_rate": 2.807740054135024e-07, + "loss": 1.5014060735702515, + "step": 6562 + }, + { + "epoch": 2.389515835456862, + "grad_norm": 11.9375, + "learning_rate": 2.8058853123057573e-07, + "loss": 1.3979357481002808, + "step": 6564 + }, + { + "epoch": 2.3902439024390243, + "grad_norm": 31.25, + "learning_rate": 2.8040324637940603e-07, + "loss": 1.2346593141555786, + "step": 6566 + }, + { + "epoch": 2.390971969421187, + "grad_norm": 14.875, + "learning_rate": 2.802181509698216e-07, + "loss": 1.6833527088165283, + "step": 6568 + }, + { + "epoch": 2.391700036403349, + "grad_norm": 9.4375, + "learning_rate": 2.8003324511153763e-07, + "loss": 1.2997976541519165, + "step": 6570 + }, + { + "epoch": 2.3924281033855115, + "grad_norm": 10.625, + "learning_rate": 2.798485289141579e-07, + "loss": 1.3649864196777344, + "step": 6572 + }, + { + "epoch": 2.393156170367674, + "grad_norm": 21.625, + "learning_rate": 2.796640024871728e-07, + "loss": 1.53578519821167, + "step": 6574 + }, + { + "epoch": 2.393884237349836, + "grad_norm": 60.75, + "learning_rate": 2.794796659399612e-07, + "loss": 1.430138349533081, + "step": 6576 + }, + { + "epoch": 2.3946123043319987, + "grad_norm": 7.6875, + "learning_rate": 2.7929551938178863e-07, + "loss": 1.4293309450149536, + "step": 6578 + }, + { + "epoch": 2.3953403713141608, + "grad_norm": 21.25, + "learning_rate": 2.7911156292180834e-07, + "loss": 1.4397828578948975, + "step": 6580 + }, + { + "epoch": 2.3960684382963233, + "grad_norm": 10.4375, + "learning_rate": 2.7892779666906087e-07, + "loss": 0.6977670192718506, + "step": 6582 + }, + { + "epoch": 2.396796505278486, + "grad_norm": 13.9375, + "learning_rate": 2.787442207324742e-07, + "loss": 1.5050909519195557, + "step": 6584 + }, + { + "epoch": 2.397524572260648, + "grad_norm": 12.1875, + "learning_rate": 2.785608352208631e-07, + "loss": 1.73898446559906, + "step": 6586 + }, + { + "epoch": 2.3982526392428105, + "grad_norm": 9.625, + "learning_rate": 2.7837764024292973e-07, + "loss": 1.1734671592712402, + "step": 6588 + }, + { + "epoch": 2.3989807062249726, + "grad_norm": 5.75, + "learning_rate": 2.781946359072635e-07, + "loss": 1.1589254140853882, + "step": 6590 + }, + { + "epoch": 2.399708773207135, + "grad_norm": 13.625, + "learning_rate": 2.780118223223403e-07, + "loss": 1.4095396995544434, + "step": 6592 + }, + { + "epoch": 2.4004368401892973, + "grad_norm": 10.0, + "learning_rate": 2.778291995965232e-07, + "loss": 1.073001503944397, + "step": 6594 + }, + { + "epoch": 2.40116490717146, + "grad_norm": 28.375, + "learning_rate": 2.7764676783806247e-07, + "loss": 1.331437587738037, + "step": 6596 + }, + { + "epoch": 2.401892974153622, + "grad_norm": 23.0, + "learning_rate": 2.7746452715509455e-07, + "loss": 1.1945278644561768, + "step": 6598 + }, + { + "epoch": 2.4026210411357845, + "grad_norm": 18.5, + "learning_rate": 2.7728247765564304e-07, + "loss": 1.4175152778625488, + "step": 6600 + }, + { + "epoch": 2.403349108117947, + "grad_norm": 129.0, + "learning_rate": 2.7710061944761835e-07, + "loss": 1.7667490243911743, + "step": 6602 + }, + { + "epoch": 2.404077175100109, + "grad_norm": 12.25, + "learning_rate": 2.769189526388171e-07, + "loss": 1.7042346000671387, + "step": 6604 + }, + { + "epoch": 2.4048052420822716, + "grad_norm": 7.03125, + "learning_rate": 2.7673747733692253e-07, + "loss": 1.365522027015686, + "step": 6606 + }, + { + "epoch": 2.4055333090644337, + "grad_norm": 10.125, + "learning_rate": 2.765561936495047e-07, + "loss": 1.2611021995544434, + "step": 6608 + }, + { + "epoch": 2.4062613760465963, + "grad_norm": 10.3125, + "learning_rate": 2.7637510168401966e-07, + "loss": 1.2829726934432983, + "step": 6610 + }, + { + "epoch": 2.406989443028759, + "grad_norm": 14.5625, + "learning_rate": 2.761942015478101e-07, + "loss": 1.3774306774139404, + "step": 6612 + }, + { + "epoch": 2.407717510010921, + "grad_norm": 10.5625, + "learning_rate": 2.760134933481051e-07, + "loss": 1.4485899209976196, + "step": 6614 + }, + { + "epoch": 2.4084455769930835, + "grad_norm": 24.5, + "learning_rate": 2.7583297719201946e-07, + "loss": 1.3933846950531006, + "step": 6616 + }, + { + "epoch": 2.4091736439752456, + "grad_norm": 11.25, + "learning_rate": 2.7565265318655477e-07, + "loss": 1.5879325866699219, + "step": 6618 + }, + { + "epoch": 2.409901710957408, + "grad_norm": 12.4375, + "learning_rate": 2.754725214385982e-07, + "loss": 1.2370315790176392, + "step": 6620 + }, + { + "epoch": 2.4106297779395707, + "grad_norm": 14.0, + "learning_rate": 2.7529258205492333e-07, + "loss": 1.5985283851623535, + "step": 6622 + }, + { + "epoch": 2.4113578449217328, + "grad_norm": 23.125, + "learning_rate": 2.7511283514218956e-07, + "loss": 1.568756103515625, + "step": 6624 + }, + { + "epoch": 2.4120859119038953, + "grad_norm": 17.25, + "learning_rate": 2.7493328080694206e-07, + "loss": 1.3071112632751465, + "step": 6626 + }, + { + "epoch": 2.4128139788860574, + "grad_norm": 11.3125, + "learning_rate": 2.7475391915561215e-07, + "loss": 1.5413353443145752, + "step": 6628 + }, + { + "epoch": 2.41354204586822, + "grad_norm": 20.375, + "learning_rate": 2.745747502945168e-07, + "loss": 1.372725248336792, + "step": 6630 + }, + { + "epoch": 2.414270112850382, + "grad_norm": 11.0, + "learning_rate": 2.7439577432985866e-07, + "loss": 1.6484804153442383, + "step": 6632 + }, + { + "epoch": 2.4149981798325446, + "grad_norm": 13.375, + "learning_rate": 2.7421699136772614e-07, + "loss": 1.264376163482666, + "step": 6634 + }, + { + "epoch": 2.4157262468147067, + "grad_norm": 15.1875, + "learning_rate": 2.740384015140931e-07, + "loss": 1.5684046745300293, + "step": 6636 + }, + { + "epoch": 2.4164543137968693, + "grad_norm": 22.875, + "learning_rate": 2.7386000487481897e-07, + "loss": 1.3650507926940918, + "step": 6638 + }, + { + "epoch": 2.417182380779032, + "grad_norm": 6.53125, + "learning_rate": 2.7368180155564876e-07, + "loss": 1.2446740865707397, + "step": 6640 + }, + { + "epoch": 2.417910447761194, + "grad_norm": 3.53125, + "learning_rate": 2.735037916622129e-07, + "loss": 1.334517478942871, + "step": 6642 + }, + { + "epoch": 2.4186385147433564, + "grad_norm": 12.75, + "learning_rate": 2.733259753000269e-07, + "loss": 1.2817590236663818, + "step": 6644 + }, + { + "epoch": 2.4193665817255186, + "grad_norm": 16.25, + "learning_rate": 2.73148352574492e-07, + "loss": 1.6067270040512085, + "step": 6646 + }, + { + "epoch": 2.420094648707681, + "grad_norm": 13.5, + "learning_rate": 2.7297092359089426e-07, + "loss": 1.4388861656188965, + "step": 6648 + }, + { + "epoch": 2.4208227156898436, + "grad_norm": 3.6875, + "learning_rate": 2.7279368845440494e-07, + "loss": 1.0828008651733398, + "step": 6650 + }, + { + "epoch": 2.4215507826720057, + "grad_norm": 29.625, + "learning_rate": 2.7261664727008067e-07, + "loss": 1.1791088581085205, + "step": 6652 + }, + { + "epoch": 2.4222788496541683, + "grad_norm": 24.875, + "learning_rate": 2.72439800142863e-07, + "loss": 1.0187509059906006, + "step": 6654 + }, + { + "epoch": 2.4230069166363304, + "grad_norm": 5.53125, + "learning_rate": 2.7226314717757825e-07, + "loss": 1.108128309249878, + "step": 6656 + }, + { + "epoch": 2.423734983618493, + "grad_norm": 70.0, + "learning_rate": 2.720866884789379e-07, + "loss": 1.7614941596984863, + "step": 6658 + }, + { + "epoch": 2.4244630506006555, + "grad_norm": 51.5, + "learning_rate": 2.719104241515381e-07, + "loss": 1.0399326086044312, + "step": 6660 + }, + { + "epoch": 2.4251911175828176, + "grad_norm": 13.3125, + "learning_rate": 2.7173435429986e-07, + "loss": 1.6098991632461548, + "step": 6662 + }, + { + "epoch": 2.42591918456498, + "grad_norm": 66.5, + "learning_rate": 2.715584790282692e-07, + "loss": 1.0722503662109375, + "step": 6664 + }, + { + "epoch": 2.4266472515471422, + "grad_norm": 18.125, + "learning_rate": 2.713827984410162e-07, + "loss": 0.5909181237220764, + "step": 6666 + }, + { + "epoch": 2.4273753185293048, + "grad_norm": 11.4375, + "learning_rate": 2.7120731264223583e-07, + "loss": 1.5101183652877808, + "step": 6668 + }, + { + "epoch": 2.428103385511467, + "grad_norm": 13.8125, + "learning_rate": 2.7103202173594784e-07, + "loss": 1.676002025604248, + "step": 6670 + }, + { + "epoch": 2.4288314524936294, + "grad_norm": 7.625, + "learning_rate": 2.708569258260561e-07, + "loss": 1.2747023105621338, + "step": 6672 + }, + { + "epoch": 2.429559519475792, + "grad_norm": 9.875, + "learning_rate": 2.706820250163493e-07, + "loss": 1.1483790874481201, + "step": 6674 + }, + { + "epoch": 2.430287586457954, + "grad_norm": 4.96875, + "learning_rate": 2.7050731941049993e-07, + "loss": 1.3670051097869873, + "step": 6676 + }, + { + "epoch": 2.4310156534401166, + "grad_norm": 10.1875, + "learning_rate": 2.7033280911206514e-07, + "loss": 1.3920660018920898, + "step": 6678 + }, + { + "epoch": 2.4317437204222787, + "grad_norm": 18.5, + "learning_rate": 2.7015849422448625e-07, + "loss": 1.3697346448898315, + "step": 6680 + }, + { + "epoch": 2.4324717874044413, + "grad_norm": 11.5, + "learning_rate": 2.699843748510889e-07, + "loss": 0.76566481590271, + "step": 6682 + }, + { + "epoch": 2.4331998543866034, + "grad_norm": 5.96875, + "learning_rate": 2.698104510950824e-07, + "loss": 1.3464720249176025, + "step": 6684 + }, + { + "epoch": 2.433927921368766, + "grad_norm": 16.875, + "learning_rate": 2.6963672305956065e-07, + "loss": 1.5206269025802612, + "step": 6686 + }, + { + "epoch": 2.4346559883509284, + "grad_norm": 29.0, + "learning_rate": 2.694631908475012e-07, + "loss": 1.3917434215545654, + "step": 6688 + }, + { + "epoch": 2.4353840553330905, + "grad_norm": 16.875, + "learning_rate": 2.6928985456176554e-07, + "loss": 1.8404608964920044, + "step": 6690 + }, + { + "epoch": 2.436112122315253, + "grad_norm": 9.625, + "learning_rate": 2.691167143050991e-07, + "loss": 1.5042699575424194, + "step": 6692 + }, + { + "epoch": 2.436840189297415, + "grad_norm": 46.5, + "learning_rate": 2.6894377018013113e-07, + "loss": 1.5168453454971313, + "step": 6694 + }, + { + "epoch": 2.4375682562795777, + "grad_norm": 22.5, + "learning_rate": 2.687710222893745e-07, + "loss": 1.437510371208191, + "step": 6696 + }, + { + "epoch": 2.4382963232617403, + "grad_norm": 8.875, + "learning_rate": 2.6859847073522594e-07, + "loss": 1.3662800788879395, + "step": 6698 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 13.3125, + "learning_rate": 2.6842611561996577e-07, + "loss": 1.442291498184204, + "step": 6700 + }, + { + "epoch": 2.439752457226065, + "grad_norm": 26.375, + "learning_rate": 2.682539570457576e-07, + "loss": 1.6629726886749268, + "step": 6702 + }, + { + "epoch": 2.440480524208227, + "grad_norm": 5.0, + "learning_rate": 2.680819951146491e-07, + "loss": 1.1361616849899292, + "step": 6704 + }, + { + "epoch": 2.4412085911903896, + "grad_norm": 9.5625, + "learning_rate": 2.679102299285707e-07, + "loss": 1.6242060661315918, + "step": 6706 + }, + { + "epoch": 2.441936658172552, + "grad_norm": 7.0, + "learning_rate": 2.6773866158933667e-07, + "loss": 1.1749210357666016, + "step": 6708 + }, + { + "epoch": 2.4426647251547142, + "grad_norm": 10.1875, + "learning_rate": 2.675672901986445e-07, + "loss": 1.4349305629730225, + "step": 6710 + }, + { + "epoch": 2.4433927921368768, + "grad_norm": 34.25, + "learning_rate": 2.673961158580748e-07, + "loss": 1.3483836650848389, + "step": 6712 + }, + { + "epoch": 2.444120859119039, + "grad_norm": 18.0, + "learning_rate": 2.6722513866909153e-07, + "loss": 1.689392328262329, + "step": 6714 + }, + { + "epoch": 2.4448489261012014, + "grad_norm": 6.25, + "learning_rate": 2.670543587330418e-07, + "loss": 1.27418851852417, + "step": 6716 + }, + { + "epoch": 2.4455769930833635, + "grad_norm": 10.1875, + "learning_rate": 2.6688377615115584e-07, + "loss": 1.2575088739395142, + "step": 6718 + }, + { + "epoch": 2.446305060065526, + "grad_norm": 23.75, + "learning_rate": 2.6671339102454654e-07, + "loss": 1.6780484914779663, + "step": 6720 + }, + { + "epoch": 2.447033127047688, + "grad_norm": 11.625, + "learning_rate": 2.665432034542103e-07, + "loss": 1.3777331113815308, + "step": 6722 + }, + { + "epoch": 2.4477611940298507, + "grad_norm": 11.625, + "learning_rate": 2.6637321354102576e-07, + "loss": 1.3844552040100098, + "step": 6724 + }, + { + "epoch": 2.4484892610120133, + "grad_norm": 9.375, + "learning_rate": 2.6620342138575507e-07, + "loss": 1.146431803703308, + "step": 6726 + }, + { + "epoch": 2.4492173279941754, + "grad_norm": 5.25, + "learning_rate": 2.6603382708904277e-07, + "loss": 1.4640130996704102, + "step": 6728 + }, + { + "epoch": 2.449945394976338, + "grad_norm": 10.125, + "learning_rate": 2.6586443075141616e-07, + "loss": 1.3453290462493896, + "step": 6730 + }, + { + "epoch": 2.4506734619585, + "grad_norm": 12.25, + "learning_rate": 2.656952324732854e-07, + "loss": 1.4111078977584839, + "step": 6732 + }, + { + "epoch": 2.4514015289406625, + "grad_norm": 12.75, + "learning_rate": 2.655262323549429e-07, + "loss": 1.3204175233840942, + "step": 6734 + }, + { + "epoch": 2.452129595922825, + "grad_norm": 14.25, + "learning_rate": 2.6535743049656384e-07, + "loss": 1.435326337814331, + "step": 6736 + }, + { + "epoch": 2.452857662904987, + "grad_norm": 16.75, + "learning_rate": 2.651888269982058e-07, + "loss": 1.2799218893051147, + "step": 6738 + }, + { + "epoch": 2.4535857298871497, + "grad_norm": 15.0625, + "learning_rate": 2.650204219598091e-07, + "loss": 1.2355027198791504, + "step": 6740 + }, + { + "epoch": 2.454313796869312, + "grad_norm": 14.1875, + "learning_rate": 2.648522154811958e-07, + "loss": 0.7619717121124268, + "step": 6742 + }, + { + "epoch": 2.4550418638514744, + "grad_norm": 14.0625, + "learning_rate": 2.646842076620708e-07, + "loss": 1.4025920629501343, + "step": 6744 + }, + { + "epoch": 2.455769930833637, + "grad_norm": 11.125, + "learning_rate": 2.645163986020211e-07, + "loss": 1.3862593173980713, + "step": 6746 + }, + { + "epoch": 2.456497997815799, + "grad_norm": 10.9375, + "learning_rate": 2.6434878840051556e-07, + "loss": 1.8202754259109497, + "step": 6748 + }, + { + "epoch": 2.4572260647979616, + "grad_norm": 18.75, + "learning_rate": 2.641813771569058e-07, + "loss": 1.4731407165527344, + "step": 6750 + }, + { + "epoch": 2.4579541317801237, + "grad_norm": 9.4375, + "learning_rate": 2.640141649704249e-07, + "loss": 1.313232421875, + "step": 6752 + }, + { + "epoch": 2.458682198762286, + "grad_norm": 7.90625, + "learning_rate": 2.638471519401883e-07, + "loss": 1.3598300218582153, + "step": 6754 + }, + { + "epoch": 2.4594102657444483, + "grad_norm": 9.625, + "learning_rate": 2.636803381651933e-07, + "loss": 1.3079771995544434, + "step": 6756 + }, + { + "epoch": 2.460138332726611, + "grad_norm": 10.9375, + "learning_rate": 2.6351372374431905e-07, + "loss": 1.1983091831207275, + "step": 6758 + }, + { + "epoch": 2.460866399708773, + "grad_norm": 14.4375, + "learning_rate": 2.6334730877632673e-07, + "loss": 1.2580256462097168, + "step": 6760 + }, + { + "epoch": 2.4615944666909355, + "grad_norm": 6.375, + "learning_rate": 2.631810933598589e-07, + "loss": 1.483885645866394, + "step": 6762 + }, + { + "epoch": 2.462322533673098, + "grad_norm": 10.5625, + "learning_rate": 2.630150775934401e-07, + "loss": 1.363759160041809, + "step": 6764 + }, + { + "epoch": 2.46305060065526, + "grad_norm": 14.4375, + "learning_rate": 2.628492615754765e-07, + "loss": 1.4348363876342773, + "step": 6766 + }, + { + "epoch": 2.4637786676374227, + "grad_norm": 14.8125, + "learning_rate": 2.62683645404256e-07, + "loss": 1.4219460487365723, + "step": 6768 + }, + { + "epoch": 2.464506734619585, + "grad_norm": 10.375, + "learning_rate": 2.625182291779478e-07, + "loss": 1.388803482055664, + "step": 6770 + }, + { + "epoch": 2.4652348016017473, + "grad_norm": 10.1875, + "learning_rate": 2.623530129946027e-07, + "loss": 1.2727341651916504, + "step": 6772 + }, + { + "epoch": 2.46596286858391, + "grad_norm": 7.78125, + "learning_rate": 2.6218799695215294e-07, + "loss": 1.553836703300476, + "step": 6774 + }, + { + "epoch": 2.466690935566072, + "grad_norm": 15.5, + "learning_rate": 2.6202318114841194e-07, + "loss": 1.3743908405303955, + "step": 6776 + }, + { + "epoch": 2.4674190025482345, + "grad_norm": 17.375, + "learning_rate": 2.6185856568107465e-07, + "loss": 1.1938812732696533, + "step": 6778 + }, + { + "epoch": 2.4681470695303966, + "grad_norm": 7.9375, + "learning_rate": 2.6169415064771744e-07, + "loss": 1.159532070159912, + "step": 6780 + }, + { + "epoch": 2.468875136512559, + "grad_norm": 11.75, + "learning_rate": 2.615299361457972e-07, + "loss": 1.7310431003570557, + "step": 6782 + }, + { + "epoch": 2.4696032034947217, + "grad_norm": 6.8125, + "learning_rate": 2.6136592227265286e-07, + "loss": 1.1602779626846313, + "step": 6784 + }, + { + "epoch": 2.470331270476884, + "grad_norm": 10.125, + "learning_rate": 2.6120210912550356e-07, + "loss": 1.207646369934082, + "step": 6786 + }, + { + "epoch": 2.4710593374590464, + "grad_norm": 13.0, + "learning_rate": 2.6103849680145014e-07, + "loss": 1.3086488246917725, + "step": 6788 + }, + { + "epoch": 2.4717874044412085, + "grad_norm": 97.5, + "learning_rate": 2.608750853974739e-07, + "loss": 1.2647979259490967, + "step": 6790 + }, + { + "epoch": 2.472515471423371, + "grad_norm": 13.3125, + "learning_rate": 2.607118750104374e-07, + "loss": 1.2398289442062378, + "step": 6792 + }, + { + "epoch": 2.473243538405533, + "grad_norm": 14.5, + "learning_rate": 2.6054886573708383e-07, + "loss": 1.2702267169952393, + "step": 6794 + }, + { + "epoch": 2.4739716053876957, + "grad_norm": 3.765625, + "learning_rate": 2.603860576740374e-07, + "loss": 1.0244336128234863, + "step": 6796 + }, + { + "epoch": 2.474699672369858, + "grad_norm": 39.75, + "learning_rate": 2.602234509178026e-07, + "loss": 0.5970927476882935, + "step": 6798 + }, + { + "epoch": 2.4754277393520203, + "grad_norm": 16.625, + "learning_rate": 2.6006104556476514e-07, + "loss": 1.37803053855896, + "step": 6800 + }, + { + "epoch": 2.476155806334183, + "grad_norm": 13.5, + "learning_rate": 2.598988417111911e-07, + "loss": 1.4695135354995728, + "step": 6802 + }, + { + "epoch": 2.476883873316345, + "grad_norm": 9.3125, + "learning_rate": 2.597368394532271e-07, + "loss": 1.1142771244049072, + "step": 6804 + }, + { + "epoch": 2.4776119402985075, + "grad_norm": 14.6875, + "learning_rate": 2.5957503888690014e-07, + "loss": 1.4116989374160767, + "step": 6806 + }, + { + "epoch": 2.4783400072806696, + "grad_norm": 14.1875, + "learning_rate": 2.5941344010811804e-07, + "loss": 1.5484392642974854, + "step": 6808 + }, + { + "epoch": 2.479068074262832, + "grad_norm": 10.4375, + "learning_rate": 2.592520432126686e-07, + "loss": 1.2474876642227173, + "step": 6810 + }, + { + "epoch": 2.4797961412449947, + "grad_norm": 5.0, + "learning_rate": 2.5909084829622016e-07, + "loss": 1.2281057834625244, + "step": 6812 + }, + { + "epoch": 2.480524208227157, + "grad_norm": 6.5, + "learning_rate": 2.589298554543214e-07, + "loss": 1.0700092315673828, + "step": 6814 + }, + { + "epoch": 2.4812522752093193, + "grad_norm": 5.46875, + "learning_rate": 2.587690647824012e-07, + "loss": 1.4618507623672485, + "step": 6816 + }, + { + "epoch": 2.4819803421914814, + "grad_norm": 42.5, + "learning_rate": 2.5860847637576826e-07, + "loss": 1.454222321510315, + "step": 6818 + }, + { + "epoch": 2.482708409173644, + "grad_norm": 11.5, + "learning_rate": 2.5844809032961196e-07, + "loss": 1.2014336585998535, + "step": 6820 + }, + { + "epoch": 2.4834364761558065, + "grad_norm": 12.6875, + "learning_rate": 2.582879067390012e-07, + "loss": 1.3950724601745605, + "step": 6822 + }, + { + "epoch": 2.4841645431379686, + "grad_norm": 14.3125, + "learning_rate": 2.581279256988853e-07, + "loss": 1.4030526876449585, + "step": 6824 + }, + { + "epoch": 2.484892610120131, + "grad_norm": 10.0625, + "learning_rate": 2.5796814730409327e-07, + "loss": 1.396420955657959, + "step": 6826 + }, + { + "epoch": 2.4856206771022933, + "grad_norm": 20.625, + "learning_rate": 2.578085716493339e-07, + "loss": 1.563439130783081, + "step": 6828 + }, + { + "epoch": 2.486348744084456, + "grad_norm": 12.4375, + "learning_rate": 2.5764919882919627e-07, + "loss": 1.4329544305801392, + "step": 6830 + }, + { + "epoch": 2.4870768110666184, + "grad_norm": 20.75, + "learning_rate": 2.5749002893814876e-07, + "loss": 1.3825175762176514, + "step": 6832 + }, + { + "epoch": 2.4878048780487805, + "grad_norm": 13.8125, + "learning_rate": 2.5733106207053945e-07, + "loss": 1.2603591680526733, + "step": 6834 + }, + { + "epoch": 2.488532945030943, + "grad_norm": 7.25, + "learning_rate": 2.5717229832059656e-07, + "loss": 1.109574794769287, + "step": 6836 + }, + { + "epoch": 2.489261012013105, + "grad_norm": 21.25, + "learning_rate": 2.5701373778242736e-07, + "loss": 1.568451166152954, + "step": 6838 + }, + { + "epoch": 2.4899890789952677, + "grad_norm": 20.375, + "learning_rate": 2.5685538055001905e-07, + "loss": 1.3908023834228516, + "step": 6840 + }, + { + "epoch": 2.4907171459774298, + "grad_norm": 13.5, + "learning_rate": 2.566972267172382e-07, + "loss": 1.040195345878601, + "step": 6842 + }, + { + "epoch": 2.4914452129595923, + "grad_norm": 36.5, + "learning_rate": 2.565392763778308e-07, + "loss": 1.2721433639526367, + "step": 6844 + }, + { + "epoch": 2.4921732799417544, + "grad_norm": 8.9375, + "learning_rate": 2.56381529625422e-07, + "loss": 1.2596639394760132, + "step": 6846 + }, + { + "epoch": 2.492901346923917, + "grad_norm": 10.25, + "learning_rate": 2.5622398655351677e-07, + "loss": 1.4886136054992676, + "step": 6848 + }, + { + "epoch": 2.4936294139060795, + "grad_norm": 23.125, + "learning_rate": 2.5606664725549886e-07, + "loss": 1.3428442478179932, + "step": 6850 + }, + { + "epoch": 2.4943574808882416, + "grad_norm": 15.4375, + "learning_rate": 2.559095118246315e-07, + "loss": 0.9194296598434448, + "step": 6852 + }, + { + "epoch": 2.495085547870404, + "grad_norm": 10.3125, + "learning_rate": 2.557525803540572e-07, + "loss": 1.493058681488037, + "step": 6854 + }, + { + "epoch": 2.4958136148525663, + "grad_norm": 11.875, + "learning_rate": 2.555958529367971e-07, + "loss": 1.2191228866577148, + "step": 6856 + }, + { + "epoch": 2.496541681834729, + "grad_norm": 25.625, + "learning_rate": 2.5543932966575187e-07, + "loss": 1.5052497386932373, + "step": 6858 + }, + { + "epoch": 2.4972697488168913, + "grad_norm": 14.6875, + "learning_rate": 2.5528301063370093e-07, + "loss": 1.3226182460784912, + "step": 6860 + }, + { + "epoch": 2.4979978157990534, + "grad_norm": 11.0, + "learning_rate": 2.5512689593330256e-07, + "loss": 1.2850534915924072, + "step": 6862 + }, + { + "epoch": 2.498725882781216, + "grad_norm": 20.0, + "learning_rate": 2.5497098565709416e-07, + "loss": 1.4015401601791382, + "step": 6864 + }, + { + "epoch": 2.499453949763378, + "grad_norm": 12.875, + "learning_rate": 2.548152798974919e-07, + "loss": 1.4337224960327148, + "step": 6866 + }, + { + "epoch": 2.5001820167455406, + "grad_norm": 13.625, + "learning_rate": 2.546597787467906e-07, + "loss": 1.776935338973999, + "step": 6868 + }, + { + "epoch": 2.500910083727703, + "grad_norm": 22.25, + "learning_rate": 2.545044822971639e-07, + "loss": 1.2928359508514404, + "step": 6870 + }, + { + "epoch": 2.5016381507098653, + "grad_norm": 13.25, + "learning_rate": 2.5434939064066397e-07, + "loss": 1.5125930309295654, + "step": 6872 + }, + { + "epoch": 2.502366217692028, + "grad_norm": 14.625, + "learning_rate": 2.541945038692217e-07, + "loss": 1.533915638923645, + "step": 6874 + }, + { + "epoch": 2.50309428467419, + "grad_norm": 12.75, + "learning_rate": 2.540398220746467e-07, + "loss": 1.1317052841186523, + "step": 6876 + }, + { + "epoch": 2.5038223516563525, + "grad_norm": 12.625, + "learning_rate": 2.5388534534862665e-07, + "loss": 1.2879853248596191, + "step": 6878 + }, + { + "epoch": 2.504550418638515, + "grad_norm": 11.3125, + "learning_rate": 2.5373107378272807e-07, + "loss": 1.2544724941253662, + "step": 6880 + }, + { + "epoch": 2.505278485620677, + "grad_norm": 7.75, + "learning_rate": 2.5357700746839574e-07, + "loss": 1.5285348892211914, + "step": 6882 + }, + { + "epoch": 2.506006552602839, + "grad_norm": 11.375, + "learning_rate": 2.5342314649695257e-07, + "loss": 1.1309514045715332, + "step": 6884 + }, + { + "epoch": 2.5067346195850018, + "grad_norm": 16.25, + "learning_rate": 2.532694909596003e-07, + "loss": 1.3745746612548828, + "step": 6886 + }, + { + "epoch": 2.5074626865671643, + "grad_norm": 15.8125, + "learning_rate": 2.531160409474182e-07, + "loss": 1.1510708332061768, + "step": 6888 + }, + { + "epoch": 2.5081907535493264, + "grad_norm": 16.0, + "learning_rate": 2.529627965513642e-07, + "loss": 1.5269010066986084, + "step": 6890 + }, + { + "epoch": 2.508918820531489, + "grad_norm": 12.0, + "learning_rate": 2.5280975786227416e-07, + "loss": 1.3847788572311401, + "step": 6892 + }, + { + "epoch": 2.509646887513651, + "grad_norm": 14.5, + "learning_rate": 2.526569249708622e-07, + "loss": 1.283866286277771, + "step": 6894 + }, + { + "epoch": 2.5103749544958136, + "grad_norm": 9.8125, + "learning_rate": 2.525042979677201e-07, + "loss": 1.0361875295639038, + "step": 6896 + }, + { + "epoch": 2.511103021477976, + "grad_norm": 24.5, + "learning_rate": 2.5235187694331786e-07, + "loss": 1.6150097846984863, + "step": 6898 + }, + { + "epoch": 2.5118310884601383, + "grad_norm": 12.75, + "learning_rate": 2.521996619880035e-07, + "loss": 1.1899933815002441, + "step": 6900 + }, + { + "epoch": 2.512559155442301, + "grad_norm": 35.75, + "learning_rate": 2.520476531920026e-07, + "loss": 0.8863071203231812, + "step": 6902 + }, + { + "epoch": 2.513287222424463, + "grad_norm": 18.875, + "learning_rate": 2.518958506454186e-07, + "loss": 1.7267930507659912, + "step": 6904 + }, + { + "epoch": 2.5140152894066254, + "grad_norm": 21.875, + "learning_rate": 2.517442544382329e-07, + "loss": 2.1500072479248047, + "step": 6906 + }, + { + "epoch": 2.514743356388788, + "grad_norm": 45.5, + "learning_rate": 2.515928646603043e-07, + "loss": 1.6365923881530762, + "step": 6908 + }, + { + "epoch": 2.51547142337095, + "grad_norm": 14.0625, + "learning_rate": 2.5144168140136936e-07, + "loss": 0.9392219185829163, + "step": 6910 + }, + { + "epoch": 2.5161994903531126, + "grad_norm": 28.875, + "learning_rate": 2.512907047510425e-07, + "loss": 1.469531774520874, + "step": 6912 + }, + { + "epoch": 2.5169275573352747, + "grad_norm": 32.25, + "learning_rate": 2.5113993479881516e-07, + "loss": 1.1782994270324707, + "step": 6914 + }, + { + "epoch": 2.5176556243174373, + "grad_norm": 5.90625, + "learning_rate": 2.5098937163405657e-07, + "loss": 1.4409124851226807, + "step": 6916 + }, + { + "epoch": 2.5183836912996, + "grad_norm": 11.5625, + "learning_rate": 2.508390153460134e-07, + "loss": 1.2031923532485962, + "step": 6918 + }, + { + "epoch": 2.519111758281762, + "grad_norm": 12.5625, + "learning_rate": 2.506888660238095e-07, + "loss": 1.43040132522583, + "step": 6920 + }, + { + "epoch": 2.519839825263924, + "grad_norm": 14.125, + "learning_rate": 2.5053892375644635e-07, + "loss": 1.1454442739486694, + "step": 6922 + }, + { + "epoch": 2.5205678922460866, + "grad_norm": 22.5, + "learning_rate": 2.5038918863280235e-07, + "loss": 1.3094165325164795, + "step": 6924 + }, + { + "epoch": 2.521295959228249, + "grad_norm": 59.5, + "learning_rate": 2.502396607416333e-07, + "loss": 1.5099796056747437, + "step": 6926 + }, + { + "epoch": 2.522024026210411, + "grad_norm": 23.625, + "learning_rate": 2.5009034017157235e-07, + "loss": 1.784252405166626, + "step": 6928 + }, + { + "epoch": 2.5227520931925738, + "grad_norm": 14.375, + "learning_rate": 2.499412270111293e-07, + "loss": 1.5066218376159668, + "step": 6930 + }, + { + "epoch": 2.523480160174736, + "grad_norm": 17.375, + "learning_rate": 2.4979232134869135e-07, + "loss": 1.4790408611297607, + "step": 6932 + }, + { + "epoch": 2.5242082271568984, + "grad_norm": 55.75, + "learning_rate": 2.4964362327252265e-07, + "loss": 1.448436975479126, + "step": 6934 + }, + { + "epoch": 2.524936294139061, + "grad_norm": 14.625, + "learning_rate": 2.4949513287076414e-07, + "loss": 1.4459888935089111, + "step": 6936 + }, + { + "epoch": 2.525664361121223, + "grad_norm": 54.75, + "learning_rate": 2.493468502314339e-07, + "loss": 1.5230753421783447, + "step": 6938 + }, + { + "epoch": 2.5263924281033856, + "grad_norm": 24.75, + "learning_rate": 2.4919877544242684e-07, + "loss": 1.1420352458953857, + "step": 6940 + }, + { + "epoch": 2.5271204950855477, + "grad_norm": 9.875, + "learning_rate": 2.4905090859151443e-07, + "loss": 1.158059000968933, + "step": 6942 + }, + { + "epoch": 2.5278485620677102, + "grad_norm": 21.0, + "learning_rate": 2.48903249766345e-07, + "loss": 1.373079776763916, + "step": 6944 + }, + { + "epoch": 2.528576629049873, + "grad_norm": 14.875, + "learning_rate": 2.4875579905444375e-07, + "loss": 1.5905284881591797, + "step": 6946 + }, + { + "epoch": 2.529304696032035, + "grad_norm": 13.375, + "learning_rate": 2.4860855654321224e-07, + "loss": 1.7122997045516968, + "step": 6948 + }, + { + "epoch": 2.5300327630141974, + "grad_norm": 14.5625, + "learning_rate": 2.4846152231992874e-07, + "loss": 1.4596366882324219, + "step": 6950 + }, + { + "epoch": 2.5307608299963595, + "grad_norm": 11.9375, + "learning_rate": 2.483146964717482e-07, + "loss": 1.3798186779022217, + "step": 6952 + }, + { + "epoch": 2.531488896978522, + "grad_norm": 26.5, + "learning_rate": 2.4816807908570173e-07, + "loss": 1.456946611404419, + "step": 6954 + }, + { + "epoch": 2.5322169639606846, + "grad_norm": 9.75, + "learning_rate": 2.4802167024869733e-07, + "loss": 1.5011539459228516, + "step": 6956 + }, + { + "epoch": 2.5329450309428467, + "grad_norm": 19.25, + "learning_rate": 2.478754700475189e-07, + "loss": 0.5429284572601318, + "step": 6958 + }, + { + "epoch": 2.533673097925009, + "grad_norm": 17.0, + "learning_rate": 2.477294785688269e-07, + "loss": 0.5535826086997986, + "step": 6960 + }, + { + "epoch": 2.5344011649071714, + "grad_norm": 70.0, + "learning_rate": 2.4758369589915825e-07, + "loss": 1.2512660026550293, + "step": 6962 + }, + { + "epoch": 2.535129231889334, + "grad_norm": 12.5, + "learning_rate": 2.474381221249256e-07, + "loss": 1.436976671218872, + "step": 6964 + }, + { + "epoch": 2.535857298871496, + "grad_norm": 15.0625, + "learning_rate": 2.472927573324183e-07, + "loss": 1.4649364948272705, + "step": 6966 + }, + { + "epoch": 2.5365853658536586, + "grad_norm": 24.75, + "learning_rate": 2.471476016078016e-07, + "loss": 1.4242216348648071, + "step": 6968 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 9.8125, + "learning_rate": 2.4700265503711674e-07, + "loss": 1.5034315586090088, + "step": 6970 + }, + { + "epoch": 2.538041499817983, + "grad_norm": 31.625, + "learning_rate": 2.468579177062811e-07, + "loss": 1.5997254848480225, + "step": 6972 + }, + { + "epoch": 2.5387695668001458, + "grad_norm": 9.6875, + "learning_rate": 2.4671338970108814e-07, + "loss": 1.3849997520446777, + "step": 6974 + }, + { + "epoch": 2.539497633782308, + "grad_norm": 10.625, + "learning_rate": 2.465690711072069e-07, + "loss": 1.4180588722229004, + "step": 6976 + }, + { + "epoch": 2.5402257007644704, + "grad_norm": 16.625, + "learning_rate": 2.4642496201018256e-07, + "loss": 1.1822429895401, + "step": 6978 + }, + { + "epoch": 2.5409537677466325, + "grad_norm": 9.9375, + "learning_rate": 2.4628106249543623e-07, + "loss": 1.6423015594482422, + "step": 6980 + }, + { + "epoch": 2.541681834728795, + "grad_norm": 24.25, + "learning_rate": 2.461373726482644e-07, + "loss": 1.6208796501159668, + "step": 6982 + }, + { + "epoch": 2.5424099017109576, + "grad_norm": 17.0, + "learning_rate": 2.4599389255383967e-07, + "loss": 1.5040407180786133, + "step": 6984 + }, + { + "epoch": 2.5431379686931197, + "grad_norm": 37.0, + "learning_rate": 2.4585062229721007e-07, + "loss": 1.9780805110931396, + "step": 6986 + }, + { + "epoch": 2.5438660356752822, + "grad_norm": 5.15625, + "learning_rate": 2.4570756196329927e-07, + "loss": 1.419926643371582, + "step": 6988 + }, + { + "epoch": 2.5445941026574443, + "grad_norm": 26.875, + "learning_rate": 2.455647116369066e-07, + "loss": 1.455362319946289, + "step": 6990 + }, + { + "epoch": 2.545322169639607, + "grad_norm": 22.625, + "learning_rate": 2.4542207140270686e-07, + "loss": 0.7543390989303589, + "step": 6992 + }, + { + "epoch": 2.5460502366217694, + "grad_norm": 15.75, + "learning_rate": 2.4527964134525037e-07, + "loss": 1.2983546257019043, + "step": 6994 + }, + { + "epoch": 2.5467783036039315, + "grad_norm": 9.125, + "learning_rate": 2.4513742154896274e-07, + "loss": 1.4985978603363037, + "step": 6996 + }, + { + "epoch": 2.547506370586094, + "grad_norm": 15.0, + "learning_rate": 2.4499541209814504e-07, + "loss": 1.0557861328125, + "step": 6998 + }, + { + "epoch": 2.548234437568256, + "grad_norm": 52.5, + "learning_rate": 2.4485361307697375e-07, + "loss": 1.3368124961853027, + "step": 7000 + }, + { + "epoch": 2.5489625045504187, + "grad_norm": 13.8125, + "learning_rate": 2.447120245695004e-07, + "loss": 1.2946007251739502, + "step": 7002 + }, + { + "epoch": 2.5496905715325813, + "grad_norm": 8.75, + "learning_rate": 2.445706466596518e-07, + "loss": 1.6215639114379883, + "step": 7004 + }, + { + "epoch": 2.5504186385147434, + "grad_norm": 31.25, + "learning_rate": 2.4442947943123e-07, + "loss": 1.3898422718048096, + "step": 7006 + }, + { + "epoch": 2.5511467054969055, + "grad_norm": 24.75, + "learning_rate": 2.4428852296791234e-07, + "loss": 1.495128870010376, + "step": 7008 + }, + { + "epoch": 2.551874772479068, + "grad_norm": 6.5, + "learning_rate": 2.4414777735325076e-07, + "loss": 1.0410609245300293, + "step": 7010 + }, + { + "epoch": 2.5526028394612306, + "grad_norm": 32.25, + "learning_rate": 2.440072426706727e-07, + "loss": 1.618689775466919, + "step": 7012 + }, + { + "epoch": 2.5533309064433927, + "grad_norm": 19.875, + "learning_rate": 2.4386691900348027e-07, + "loss": 1.4242198467254639, + "step": 7014 + }, + { + "epoch": 2.554058973425555, + "grad_norm": 19.375, + "learning_rate": 2.4372680643485055e-07, + "loss": 1.0284533500671387, + "step": 7016 + }, + { + "epoch": 2.5547870404077173, + "grad_norm": 13.75, + "learning_rate": 2.435869050478355e-07, + "loss": 1.2262368202209473, + "step": 7018 + }, + { + "epoch": 2.55551510738988, + "grad_norm": 15.875, + "learning_rate": 2.4344721492536214e-07, + "loss": 1.529994249343872, + "step": 7020 + }, + { + "epoch": 2.5562431743720424, + "grad_norm": 40.75, + "learning_rate": 2.4330773615023185e-07, + "loss": 1.0223617553710938, + "step": 7022 + }, + { + "epoch": 2.5569712413542045, + "grad_norm": 9.25, + "learning_rate": 2.43168468805121e-07, + "loss": 0.8236181735992432, + "step": 7024 + }, + { + "epoch": 2.557699308336367, + "grad_norm": 21.375, + "learning_rate": 2.430294129725807e-07, + "loss": 1.2377922534942627, + "step": 7026 + }, + { + "epoch": 2.558427375318529, + "grad_norm": 16.625, + "learning_rate": 2.428905687350364e-07, + "loss": 1.5116455554962158, + "step": 7028 + }, + { + "epoch": 2.5591554423006917, + "grad_norm": 22.0, + "learning_rate": 2.427519361747883e-07, + "loss": 1.4463073015213013, + "step": 7030 + }, + { + "epoch": 2.5598835092828542, + "grad_norm": 28.25, + "learning_rate": 2.426135153740113e-07, + "loss": 1.6984381675720215, + "step": 7032 + }, + { + "epoch": 2.5606115762650163, + "grad_norm": 9.75, + "learning_rate": 2.4247530641475424e-07, + "loss": 1.4462287425994873, + "step": 7034 + }, + { + "epoch": 2.561339643247179, + "grad_norm": 11.9375, + "learning_rate": 2.4233730937894096e-07, + "loss": 1.4010303020477295, + "step": 7036 + }, + { + "epoch": 2.562067710229341, + "grad_norm": 56.25, + "learning_rate": 2.421995243483696e-07, + "loss": 0.6116601824760437, + "step": 7038 + }, + { + "epoch": 2.5627957772115035, + "grad_norm": 44.5, + "learning_rate": 2.420619514047123e-07, + "loss": 1.3797173500061035, + "step": 7040 + }, + { + "epoch": 2.563523844193666, + "grad_norm": 8.75, + "learning_rate": 2.419245906295158e-07, + "loss": 1.2204258441925049, + "step": 7042 + }, + { + "epoch": 2.564251911175828, + "grad_norm": 8.6875, + "learning_rate": 2.417874421042008e-07, + "loss": 1.4763867855072021, + "step": 7044 + }, + { + "epoch": 2.5649799781579903, + "grad_norm": 11.1875, + "learning_rate": 2.416505059100625e-07, + "loss": 1.2450218200683594, + "step": 7046 + }, + { + "epoch": 2.565708045140153, + "grad_norm": 26.25, + "learning_rate": 2.4151378212827e-07, + "loss": 1.4100055694580078, + "step": 7048 + }, + { + "epoch": 2.5664361121223154, + "grad_norm": 21.0, + "learning_rate": 2.413772708398667e-07, + "loss": 1.6636126041412354, + "step": 7050 + }, + { + "epoch": 2.5671641791044775, + "grad_norm": 12.125, + "learning_rate": 2.4124097212576966e-07, + "loss": 1.2454887628555298, + "step": 7052 + }, + { + "epoch": 2.56789224608664, + "grad_norm": 21.25, + "learning_rate": 2.411048860667706e-07, + "loss": 1.2083666324615479, + "step": 7054 + }, + { + "epoch": 2.568620313068802, + "grad_norm": 21.5, + "learning_rate": 2.4096901274353456e-07, + "loss": 1.2242989540100098, + "step": 7056 + }, + { + "epoch": 2.5693483800509647, + "grad_norm": 14.625, + "learning_rate": 2.4083335223660066e-07, + "loss": 1.1409248113632202, + "step": 7058 + }, + { + "epoch": 2.570076447033127, + "grad_norm": 7.5625, + "learning_rate": 2.406979046263821e-07, + "loss": 1.2696894407272339, + "step": 7060 + }, + { + "epoch": 2.5708045140152893, + "grad_norm": 9.3125, + "learning_rate": 2.4056266999316553e-07, + "loss": 1.159008264541626, + "step": 7062 + }, + { + "epoch": 2.571532580997452, + "grad_norm": 23.375, + "learning_rate": 2.4042764841711156e-07, + "loss": 1.9418699741363525, + "step": 7064 + }, + { + "epoch": 2.572260647979614, + "grad_norm": 4.625, + "learning_rate": 2.402928399782547e-07, + "loss": 1.1063013076782227, + "step": 7066 + }, + { + "epoch": 2.5729887149617765, + "grad_norm": 16.0, + "learning_rate": 2.401582447565027e-07, + "loss": 1.429677963256836, + "step": 7068 + }, + { + "epoch": 2.573716781943939, + "grad_norm": 3.484375, + "learning_rate": 2.400238628316371e-07, + "loss": 1.2936146259307861, + "step": 7070 + }, + { + "epoch": 2.574444848926101, + "grad_norm": 21.125, + "learning_rate": 2.398896942833132e-07, + "loss": 0.8748884797096252, + "step": 7072 + }, + { + "epoch": 2.5751729159082637, + "grad_norm": 10.125, + "learning_rate": 2.3975573919105953e-07, + "loss": 1.4105935096740723, + "step": 7074 + }, + { + "epoch": 2.575900982890426, + "grad_norm": 45.5, + "learning_rate": 2.396219976342783e-07, + "loss": 1.055872917175293, + "step": 7076 + }, + { + "epoch": 2.5766290498725883, + "grad_norm": 8.1875, + "learning_rate": 2.3948846969224513e-07, + "loss": 1.4895720481872559, + "step": 7078 + }, + { + "epoch": 2.577357116854751, + "grad_norm": 14.9375, + "learning_rate": 2.393551554441088e-07, + "loss": 1.3146991729736328, + "step": 7080 + }, + { + "epoch": 2.578085183836913, + "grad_norm": 17.875, + "learning_rate": 2.3922205496889175e-07, + "loss": 1.5616636276245117, + "step": 7082 + }, + { + "epoch": 2.578813250819075, + "grad_norm": 18.25, + "learning_rate": 2.390891683454895e-07, + "loss": 1.3143680095672607, + "step": 7084 + }, + { + "epoch": 2.5795413178012376, + "grad_norm": 5.84375, + "learning_rate": 2.389564956526707e-07, + "loss": 1.247016191482544, + "step": 7086 + }, + { + "epoch": 2.5802693847834, + "grad_norm": 12.0625, + "learning_rate": 2.388240369690775e-07, + "loss": 1.2491657733917236, + "step": 7088 + }, + { + "epoch": 2.5809974517655623, + "grad_norm": 11.625, + "learning_rate": 2.38691792373225e-07, + "loss": 1.0192911624908447, + "step": 7090 + }, + { + "epoch": 2.581725518747725, + "grad_norm": 14.0625, + "learning_rate": 2.3855976194350144e-07, + "loss": 1.0934501886367798, + "step": 7092 + }, + { + "epoch": 2.582453585729887, + "grad_norm": 13.125, + "learning_rate": 2.3842794575816813e-07, + "loss": 1.2385969161987305, + "step": 7094 + }, + { + "epoch": 2.5831816527120495, + "grad_norm": 26.0, + "learning_rate": 2.3829634389535928e-07, + "loss": 1.7042624950408936, + "step": 7096 + }, + { + "epoch": 2.583909719694212, + "grad_norm": 20.125, + "learning_rate": 2.381649564330823e-07, + "loss": 1.3518667221069336, + "step": 7098 + }, + { + "epoch": 2.584637786676374, + "grad_norm": 55.5, + "learning_rate": 2.380337834492172e-07, + "loss": 1.6991522312164307, + "step": 7100 + }, + { + "epoch": 2.5853658536585367, + "grad_norm": 7.46875, + "learning_rate": 2.3790282502151703e-07, + "loss": 1.3755462169647217, + "step": 7102 + }, + { + "epoch": 2.5860939206406988, + "grad_norm": 13.0625, + "learning_rate": 2.3777208122760773e-07, + "loss": 1.4194140434265137, + "step": 7104 + }, + { + "epoch": 2.5868219876228613, + "grad_norm": 24.75, + "learning_rate": 2.376415521449879e-07, + "loss": 1.418677806854248, + "step": 7106 + }, + { + "epoch": 2.587550054605024, + "grad_norm": 24.25, + "learning_rate": 2.3751123785102885e-07, + "loss": 1.5151848793029785, + "step": 7108 + }, + { + "epoch": 2.588278121587186, + "grad_norm": 31.875, + "learning_rate": 2.373811384229748e-07, + "loss": 1.7025607824325562, + "step": 7110 + }, + { + "epoch": 2.5890061885693485, + "grad_norm": 7.4375, + "learning_rate": 2.3725125393794224e-07, + "loss": 1.2188754081726074, + "step": 7112 + }, + { + "epoch": 2.5897342555515106, + "grad_norm": 4.34375, + "learning_rate": 2.3712158447292046e-07, + "loss": 1.2868876457214355, + "step": 7114 + }, + { + "epoch": 2.590462322533673, + "grad_norm": 14.125, + "learning_rate": 2.3699213010477136e-07, + "loss": 0.9437057375907898, + "step": 7116 + }, + { + "epoch": 2.5911903895158357, + "grad_norm": 14.875, + "learning_rate": 2.3686289091022933e-07, + "loss": 1.2810871601104736, + "step": 7118 + }, + { + "epoch": 2.591918456497998, + "grad_norm": 32.75, + "learning_rate": 2.3673386696590097e-07, + "loss": 1.5987074375152588, + "step": 7120 + }, + { + "epoch": 2.5926465234801603, + "grad_norm": 20.0, + "learning_rate": 2.366050583482656e-07, + "loss": 1.6757781505584717, + "step": 7122 + }, + { + "epoch": 2.5933745904623224, + "grad_norm": 13.5, + "learning_rate": 2.364764651336747e-07, + "loss": 1.5706384181976318, + "step": 7124 + }, + { + "epoch": 2.594102657444485, + "grad_norm": 15.375, + "learning_rate": 2.3634808739835225e-07, + "loss": 1.242426872253418, + "step": 7126 + }, + { + "epoch": 2.5948307244266475, + "grad_norm": 15.3125, + "learning_rate": 2.362199252183943e-07, + "loss": 1.1405067443847656, + "step": 7128 + }, + { + "epoch": 2.5955587914088096, + "grad_norm": 14.25, + "learning_rate": 2.360919786697692e-07, + "loss": 1.3076536655426025, + "step": 7130 + }, + { + "epoch": 2.5962868583909717, + "grad_norm": 8.5, + "learning_rate": 2.3596424782831752e-07, + "loss": 1.3573139905929565, + "step": 7132 + }, + { + "epoch": 2.5970149253731343, + "grad_norm": 13.125, + "learning_rate": 2.3583673276975213e-07, + "loss": 1.3817039728164673, + "step": 7134 + }, + { + "epoch": 2.597742992355297, + "grad_norm": 32.75, + "learning_rate": 2.3570943356965757e-07, + "loss": 1.1740851402282715, + "step": 7136 + }, + { + "epoch": 2.598471059337459, + "grad_norm": 10.375, + "learning_rate": 2.3558235030349095e-07, + "loss": 1.5655245780944824, + "step": 7138 + }, + { + "epoch": 2.5991991263196215, + "grad_norm": 27.25, + "learning_rate": 2.3545548304658093e-07, + "loss": 1.2022862434387207, + "step": 7140 + }, + { + "epoch": 2.5999271933017836, + "grad_norm": 9.375, + "learning_rate": 2.3532883187412829e-07, + "loss": 1.436737060546875, + "step": 7142 + }, + { + "epoch": 2.600655260283946, + "grad_norm": 29.0, + "learning_rate": 2.3520239686120584e-07, + "loss": 1.5761299133300781, + "step": 7144 + }, + { + "epoch": 2.6013833272661087, + "grad_norm": 11.75, + "learning_rate": 2.3507617808275826e-07, + "loss": 1.2550275325775146, + "step": 7146 + }, + { + "epoch": 2.6021113942482708, + "grad_norm": 14.125, + "learning_rate": 2.3495017561360175e-07, + "loss": 1.297851324081421, + "step": 7148 + }, + { + "epoch": 2.6028394612304333, + "grad_norm": 17.0, + "learning_rate": 2.3482438952842467e-07, + "loss": 1.5516706705093384, + "step": 7150 + }, + { + "epoch": 2.6035675282125954, + "grad_norm": 6.125, + "learning_rate": 2.3469881990178696e-07, + "loss": 1.3336596488952637, + "step": 7152 + }, + { + "epoch": 2.604295595194758, + "grad_norm": 7.0, + "learning_rate": 2.345734668081203e-07, + "loss": 1.0623795986175537, + "step": 7154 + }, + { + "epoch": 2.6050236621769205, + "grad_norm": 16.875, + "learning_rate": 2.3444833032172785e-07, + "loss": 1.4837968349456787, + "step": 7156 + }, + { + "epoch": 2.6057517291590826, + "grad_norm": 23.375, + "learning_rate": 2.3432341051678452e-07, + "loss": 1.494350790977478, + "step": 7158 + }, + { + "epoch": 2.606479796141245, + "grad_norm": 92.0, + "learning_rate": 2.341987074673368e-07, + "loss": 1.8801259994506836, + "step": 7160 + }, + { + "epoch": 2.6072078631234072, + "grad_norm": 13.25, + "learning_rate": 2.3407422124730264e-07, + "loss": 1.6376819610595703, + "step": 7162 + }, + { + "epoch": 2.60793593010557, + "grad_norm": 58.0, + "learning_rate": 2.3394995193047158e-07, + "loss": 1.6991844177246094, + "step": 7164 + }, + { + "epoch": 2.6086639970877323, + "grad_norm": 14.9375, + "learning_rate": 2.3382589959050438e-07, + "loss": 1.2581121921539307, + "step": 7166 + }, + { + "epoch": 2.6093920640698944, + "grad_norm": 101.5, + "learning_rate": 2.3370206430093337e-07, + "loss": 0.7665554881095886, + "step": 7168 + }, + { + "epoch": 2.6101201310520565, + "grad_norm": 25.875, + "learning_rate": 2.3357844613516203e-07, + "loss": 1.5842291116714478, + "step": 7170 + }, + { + "epoch": 2.610848198034219, + "grad_norm": 14.3125, + "learning_rate": 2.3345504516646532e-07, + "loss": 1.3373420238494873, + "step": 7172 + }, + { + "epoch": 2.6115762650163816, + "grad_norm": 22.125, + "learning_rate": 2.3333186146798955e-07, + "loss": 1.3372244834899902, + "step": 7174 + }, + { + "epoch": 2.6123043319985437, + "grad_norm": 8.6875, + "learning_rate": 2.3320889511275191e-07, + "loss": 1.4575693607330322, + "step": 7176 + }, + { + "epoch": 2.6130323989807063, + "grad_norm": 10.5, + "learning_rate": 2.3308614617364101e-07, + "loss": 1.3619928359985352, + "step": 7178 + }, + { + "epoch": 2.6137604659628684, + "grad_norm": 18.375, + "learning_rate": 2.3296361472341656e-07, + "loss": 1.5471386909484863, + "step": 7180 + }, + { + "epoch": 2.614488532945031, + "grad_norm": 14.6875, + "learning_rate": 2.3284130083470927e-07, + "loss": 1.4692388772964478, + "step": 7182 + }, + { + "epoch": 2.6152165999271935, + "grad_norm": 5.65625, + "learning_rate": 2.3271920458002088e-07, + "loss": 1.0043506622314453, + "step": 7184 + }, + { + "epoch": 2.6159446669093556, + "grad_norm": 14.9375, + "learning_rate": 2.3259732603172426e-07, + "loss": 1.2813234329223633, + "step": 7186 + }, + { + "epoch": 2.616672733891518, + "grad_norm": 14.5, + "learning_rate": 2.3247566526206307e-07, + "loss": 1.4872498512268066, + "step": 7188 + }, + { + "epoch": 2.61740080087368, + "grad_norm": 29.375, + "learning_rate": 2.3235422234315197e-07, + "loss": 1.8540421724319458, + "step": 7190 + }, + { + "epoch": 2.6181288678558428, + "grad_norm": 37.75, + "learning_rate": 2.3223299734697665e-07, + "loss": 0.9119278192520142, + "step": 7192 + }, + { + "epoch": 2.6188569348380053, + "grad_norm": 13.625, + "learning_rate": 2.321119903453932e-07, + "loss": 1.4573297500610352, + "step": 7194 + }, + { + "epoch": 2.6195850018201674, + "grad_norm": 10.1875, + "learning_rate": 2.3199120141012898e-07, + "loss": 1.1274808645248413, + "step": 7196 + }, + { + "epoch": 2.62031306880233, + "grad_norm": 5.28125, + "learning_rate": 2.3187063061278176e-07, + "loss": 1.0032143592834473, + "step": 7198 + }, + { + "epoch": 2.621041135784492, + "grad_norm": 10.625, + "learning_rate": 2.3175027802482005e-07, + "loss": 1.2210203409194946, + "step": 7200 + }, + { + "epoch": 2.6217692027666546, + "grad_norm": 12.5, + "learning_rate": 2.3163014371758313e-07, + "loss": 1.3301470279693604, + "step": 7202 + }, + { + "epoch": 2.622497269748817, + "grad_norm": 7.3125, + "learning_rate": 2.3151022776228088e-07, + "loss": 1.0077489614486694, + "step": 7204 + }, + { + "epoch": 2.6232253367309792, + "grad_norm": 11.6875, + "learning_rate": 2.3139053022999364e-07, + "loss": 1.3058182001113892, + "step": 7206 + }, + { + "epoch": 2.6239534037131413, + "grad_norm": 9.125, + "learning_rate": 2.3127105119167245e-07, + "loss": 1.446943759918213, + "step": 7208 + }, + { + "epoch": 2.624681470695304, + "grad_norm": 21.875, + "learning_rate": 2.3115179071813868e-07, + "loss": 0.9458024501800537, + "step": 7210 + }, + { + "epoch": 2.6254095376774664, + "grad_norm": 10.25, + "learning_rate": 2.310327488800841e-07, + "loss": 1.4595839977264404, + "step": 7212 + }, + { + "epoch": 2.6261376046596285, + "grad_norm": 10.9375, + "learning_rate": 2.309139257480712e-07, + "loss": 1.5937573909759521, + "step": 7214 + }, + { + "epoch": 2.626865671641791, + "grad_norm": 13.3125, + "learning_rate": 2.307953213925324e-07, + "loss": 1.6360868215560913, + "step": 7216 + }, + { + "epoch": 2.627593738623953, + "grad_norm": 12.6875, + "learning_rate": 2.306769358837708e-07, + "loss": 1.3872673511505127, + "step": 7218 + }, + { + "epoch": 2.6283218056061157, + "grad_norm": 11.8125, + "learning_rate": 2.3055876929195963e-07, + "loss": 0.971673309803009, + "step": 7220 + }, + { + "epoch": 2.6290498725882783, + "grad_norm": 8.125, + "learning_rate": 2.3044082168714227e-07, + "loss": 1.3312809467315674, + "step": 7222 + }, + { + "epoch": 2.6297779395704404, + "grad_norm": 16.25, + "learning_rate": 2.303230931392325e-07, + "loss": 1.5589497089385986, + "step": 7224 + }, + { + "epoch": 2.630506006552603, + "grad_norm": 17.5, + "learning_rate": 2.3020558371801412e-07, + "loss": 1.2965642213821411, + "step": 7226 + }, + { + "epoch": 2.631234073534765, + "grad_norm": 3.421875, + "learning_rate": 2.3008829349314096e-07, + "loss": 1.1434199810028076, + "step": 7228 + }, + { + "epoch": 2.6319621405169276, + "grad_norm": 18.5, + "learning_rate": 2.299712225341371e-07, + "loss": 1.5938948392868042, + "step": 7230 + }, + { + "epoch": 2.63269020749909, + "grad_norm": 52.5, + "learning_rate": 2.298543709103966e-07, + "loss": 1.679323673248291, + "step": 7232 + }, + { + "epoch": 2.633418274481252, + "grad_norm": 10.875, + "learning_rate": 2.297377386911834e-07, + "loss": 1.4949226379394531, + "step": 7234 + }, + { + "epoch": 2.6341463414634148, + "grad_norm": 9.75, + "learning_rate": 2.2962132594563164e-07, + "loss": 1.4109604358673096, + "step": 7236 + }, + { + "epoch": 2.634874408445577, + "grad_norm": 32.25, + "learning_rate": 2.29505132742745e-07, + "loss": 1.122046709060669, + "step": 7238 + }, + { + "epoch": 2.6356024754277394, + "grad_norm": 11.6875, + "learning_rate": 2.2938915915139728e-07, + "loss": 1.126784086227417, + "step": 7240 + }, + { + "epoch": 2.636330542409902, + "grad_norm": 9.0625, + "learning_rate": 2.2927340524033205e-07, + "loss": 1.11781907081604, + "step": 7242 + }, + { + "epoch": 2.637058609392064, + "grad_norm": 13.5, + "learning_rate": 2.2915787107816275e-07, + "loss": 1.447349190711975, + "step": 7244 + }, + { + "epoch": 2.6377866763742266, + "grad_norm": 11.75, + "learning_rate": 2.290425567333724e-07, + "loss": 1.21555757522583, + "step": 7246 + }, + { + "epoch": 2.6385147433563887, + "grad_norm": 14.5625, + "learning_rate": 2.2892746227431372e-07, + "loss": 1.4273302555084229, + "step": 7248 + }, + { + "epoch": 2.6392428103385512, + "grad_norm": 10.875, + "learning_rate": 2.2881258776920942e-07, + "loss": 1.2944669723510742, + "step": 7250 + }, + { + "epoch": 2.639970877320714, + "grad_norm": 24.0, + "learning_rate": 2.286979332861513e-07, + "loss": 1.3552048206329346, + "step": 7252 + }, + { + "epoch": 2.640698944302876, + "grad_norm": 11.875, + "learning_rate": 2.2858349889310123e-07, + "loss": 1.4447144269943237, + "step": 7254 + }, + { + "epoch": 2.641427011285038, + "grad_norm": 18.125, + "learning_rate": 2.2846928465789023e-07, + "loss": 1.7058790922164917, + "step": 7256 + }, + { + "epoch": 2.6421550782672005, + "grad_norm": 11.875, + "learning_rate": 2.283552906482191e-07, + "loss": 1.259598731994629, + "step": 7258 + }, + { + "epoch": 2.642883145249363, + "grad_norm": 21.625, + "learning_rate": 2.2824151693165805e-07, + "loss": 1.3348042964935303, + "step": 7260 + }, + { + "epoch": 2.643611212231525, + "grad_norm": 12.0625, + "learning_rate": 2.2812796357564657e-07, + "loss": 1.2702785730361938, + "step": 7262 + }, + { + "epoch": 2.6443392792136877, + "grad_norm": 17.5, + "learning_rate": 2.280146306474937e-07, + "loss": 1.4606359004974365, + "step": 7264 + }, + { + "epoch": 2.64506734619585, + "grad_norm": 25.25, + "learning_rate": 2.2790151821437776e-07, + "loss": 1.4617960453033447, + "step": 7266 + }, + { + "epoch": 2.6457954131780124, + "grad_norm": 11.5625, + "learning_rate": 2.2778862634334636e-07, + "loss": 1.3792232275009155, + "step": 7268 + }, + { + "epoch": 2.646523480160175, + "grad_norm": 33.75, + "learning_rate": 2.2767595510131629e-07, + "loss": 1.5778677463531494, + "step": 7270 + }, + { + "epoch": 2.647251547142337, + "grad_norm": 12.25, + "learning_rate": 2.2756350455507373e-07, + "loss": 1.3343102931976318, + "step": 7272 + }, + { + "epoch": 2.6479796141244996, + "grad_norm": 6.5625, + "learning_rate": 2.274512747712739e-07, + "loss": 1.056491494178772, + "step": 7274 + }, + { + "epoch": 2.6487076811066617, + "grad_norm": 7.8125, + "learning_rate": 2.2733926581644122e-07, + "loss": 0.9071427583694458, + "step": 7276 + }, + { + "epoch": 2.649435748088824, + "grad_norm": 11.5, + "learning_rate": 2.2722747775696927e-07, + "loss": 1.3022968769073486, + "step": 7278 + }, + { + "epoch": 2.6501638150709868, + "grad_norm": 9.4375, + "learning_rate": 2.2711591065912064e-07, + "loss": 1.1783210039138794, + "step": 7280 + }, + { + "epoch": 2.650891882053149, + "grad_norm": 15.8125, + "learning_rate": 2.270045645890268e-07, + "loss": 1.260572910308838, + "step": 7282 + }, + { + "epoch": 2.6516199490353114, + "grad_norm": 15.6875, + "learning_rate": 2.2689343961268852e-07, + "loss": 1.5300710201263428, + "step": 7284 + }, + { + "epoch": 2.6523480160174735, + "grad_norm": 11.125, + "learning_rate": 2.2678253579597524e-07, + "loss": 1.5705375671386719, + "step": 7286 + }, + { + "epoch": 2.653076082999636, + "grad_norm": 13.4375, + "learning_rate": 2.2667185320462537e-07, + "loss": 1.4982233047485352, + "step": 7288 + }, + { + "epoch": 2.6538041499817986, + "grad_norm": 9.5625, + "learning_rate": 2.2656139190424637e-07, + "loss": 1.6170737743377686, + "step": 7290 + }, + { + "epoch": 2.6545322169639607, + "grad_norm": 12.0625, + "learning_rate": 2.2645115196031425e-07, + "loss": 1.2265284061431885, + "step": 7292 + }, + { + "epoch": 2.655260283946123, + "grad_norm": 12.5, + "learning_rate": 2.263411334381741e-07, + "loss": 1.288074254989624, + "step": 7294 + }, + { + "epoch": 2.6559883509282853, + "grad_norm": 16.125, + "learning_rate": 2.262313364030395e-07, + "loss": 1.4011592864990234, + "step": 7296 + }, + { + "epoch": 2.656716417910448, + "grad_norm": 25.25, + "learning_rate": 2.2612176091999276e-07, + "loss": 1.6951806545257568, + "step": 7298 + }, + { + "epoch": 2.65744448489261, + "grad_norm": 5.4375, + "learning_rate": 2.2601240705398516e-07, + "loss": 1.220876693725586, + "step": 7300 + }, + { + "epoch": 2.6581725518747725, + "grad_norm": 8.875, + "learning_rate": 2.2590327486983623e-07, + "loss": 1.105665922164917, + "step": 7302 + }, + { + "epoch": 2.6589006188569346, + "grad_norm": 30.125, + "learning_rate": 2.2579436443223441e-07, + "loss": 1.6200323104858398, + "step": 7304 + }, + { + "epoch": 2.659628685839097, + "grad_norm": 11.875, + "learning_rate": 2.2568567580573653e-07, + "loss": 0.760164737701416, + "step": 7306 + }, + { + "epoch": 2.6603567528212597, + "grad_norm": 11.8125, + "learning_rate": 2.2557720905476799e-07, + "loss": 1.5751066207885742, + "step": 7308 + }, + { + "epoch": 2.661084819803422, + "grad_norm": 19.625, + "learning_rate": 2.254689642436226e-07, + "loss": 1.5951497554779053, + "step": 7310 + }, + { + "epoch": 2.6618128867855844, + "grad_norm": 14.8125, + "learning_rate": 2.253609414364628e-07, + "loss": 1.7626490592956543, + "step": 7312 + }, + { + "epoch": 2.6625409537677465, + "grad_norm": 20.625, + "learning_rate": 2.2525314069731915e-07, + "loss": 1.7158010005950928, + "step": 7314 + }, + { + "epoch": 2.663269020749909, + "grad_norm": 12.375, + "learning_rate": 2.2514556209009084e-07, + "loss": 1.339233160018921, + "step": 7316 + }, + { + "epoch": 2.6639970877320716, + "grad_norm": 17.0, + "learning_rate": 2.2503820567854537e-07, + "loss": 1.1788867712020874, + "step": 7318 + }, + { + "epoch": 2.6647251547142337, + "grad_norm": 10.4375, + "learning_rate": 2.2493107152631833e-07, + "loss": 1.7363543510437012, + "step": 7320 + }, + { + "epoch": 2.665453221696396, + "grad_norm": 15.0625, + "learning_rate": 2.2482415969691374e-07, + "loss": 1.4273102283477783, + "step": 7322 + }, + { + "epoch": 2.6661812886785583, + "grad_norm": 41.25, + "learning_rate": 2.2471747025370386e-07, + "loss": 1.7893273830413818, + "step": 7324 + }, + { + "epoch": 2.666909355660721, + "grad_norm": 9.125, + "learning_rate": 2.2461100325992888e-07, + "loss": 1.0797994136810303, + "step": 7326 + }, + { + "epoch": 2.6676374226428834, + "grad_norm": 27.5, + "learning_rate": 2.2450475877869743e-07, + "loss": 1.5808453559875488, + "step": 7328 + }, + { + "epoch": 2.6683654896250455, + "grad_norm": 14.625, + "learning_rate": 2.243987368729862e-07, + "loss": 1.5170924663543701, + "step": 7330 + }, + { + "epoch": 2.6690935566072076, + "grad_norm": 10.4375, + "learning_rate": 2.2429293760563972e-07, + "loss": 1.3387362957000732, + "step": 7332 + }, + { + "epoch": 2.66982162358937, + "grad_norm": 18.875, + "learning_rate": 2.2418736103937087e-07, + "loss": 1.4531819820404053, + "step": 7334 + }, + { + "epoch": 2.6705496905715327, + "grad_norm": 58.5, + "learning_rate": 2.240820072367603e-07, + "loss": 1.6243340969085693, + "step": 7336 + }, + { + "epoch": 2.671277757553695, + "grad_norm": 30.875, + "learning_rate": 2.2397687626025653e-07, + "loss": 1.9622907638549805, + "step": 7338 + }, + { + "epoch": 2.6720058245358573, + "grad_norm": 17.5, + "learning_rate": 2.2387196817217636e-07, + "loss": 1.2821252346038818, + "step": 7340 + }, + { + "epoch": 2.6727338915180194, + "grad_norm": 73.0, + "learning_rate": 2.2376728303470412e-07, + "loss": 1.022587776184082, + "step": 7342 + }, + { + "epoch": 2.673461958500182, + "grad_norm": 13.3125, + "learning_rate": 2.2366282090989218e-07, + "loss": 1.5848478078842163, + "step": 7344 + }, + { + "epoch": 2.6741900254823445, + "grad_norm": 18.375, + "learning_rate": 2.2355858185966074e-07, + "loss": 0.8448590040206909, + "step": 7346 + }, + { + "epoch": 2.6749180924645066, + "grad_norm": 16.625, + "learning_rate": 2.2345456594579751e-07, + "loss": 1.6400840282440186, + "step": 7348 + }, + { + "epoch": 2.675646159446669, + "grad_norm": 4.90625, + "learning_rate": 2.2335077322995832e-07, + "loss": 1.3002060651779175, + "step": 7350 + }, + { + "epoch": 2.6763742264288313, + "grad_norm": 34.5, + "learning_rate": 2.232472037736664e-07, + "loss": 0.9854254722595215, + "step": 7352 + }, + { + "epoch": 2.677102293410994, + "grad_norm": 11.6875, + "learning_rate": 2.2314385763831266e-07, + "loss": 1.4152352809906006, + "step": 7354 + }, + { + "epoch": 2.6778303603931564, + "grad_norm": 10.625, + "learning_rate": 2.2304073488515587e-07, + "loss": 1.5970702171325684, + "step": 7356 + }, + { + "epoch": 2.6785584273753185, + "grad_norm": 11.4375, + "learning_rate": 2.2293783557532219e-07, + "loss": 1.4377365112304688, + "step": 7358 + }, + { + "epoch": 2.679286494357481, + "grad_norm": 9.8125, + "learning_rate": 2.2283515976980537e-07, + "loss": 1.4892995357513428, + "step": 7360 + }, + { + "epoch": 2.680014561339643, + "grad_norm": 11.25, + "learning_rate": 2.2273270752946662e-07, + "loss": 1.1512417793273926, + "step": 7362 + }, + { + "epoch": 2.6807426283218057, + "grad_norm": 14.8125, + "learning_rate": 2.2263047891503483e-07, + "loss": 0.9726536273956299, + "step": 7364 + }, + { + "epoch": 2.681470695303968, + "grad_norm": 23.0, + "learning_rate": 2.225284739871062e-07, + "loss": 1.0972918272018433, + "step": 7366 + }, + { + "epoch": 2.6821987622861303, + "grad_norm": 14.25, + "learning_rate": 2.224266928061442e-07, + "loss": 1.2929961681365967, + "step": 7368 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 10.1875, + "learning_rate": 2.2232513543248e-07, + "loss": 1.1788389682769775, + "step": 7370 + }, + { + "epoch": 2.683654896250455, + "grad_norm": 13.6875, + "learning_rate": 2.2222380192631175e-07, + "loss": 1.3784116506576538, + "step": 7372 + }, + { + "epoch": 2.6843829632326175, + "grad_norm": 21.25, + "learning_rate": 2.2212269234770515e-07, + "loss": 1.3584096431732178, + "step": 7374 + }, + { + "epoch": 2.68511103021478, + "grad_norm": 6.5, + "learning_rate": 2.2202180675659321e-07, + "loss": 1.1857936382293701, + "step": 7376 + }, + { + "epoch": 2.685839097196942, + "grad_norm": 15.125, + "learning_rate": 2.2192114521277594e-07, + "loss": 1.1737173795700073, + "step": 7378 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 16.875, + "learning_rate": 2.2182070777592065e-07, + "loss": 1.7598810195922852, + "step": 7380 + }, + { + "epoch": 2.687295231161267, + "grad_norm": 7.5, + "learning_rate": 2.2172049450556185e-07, + "loss": 1.1135178804397583, + "step": 7382 + }, + { + "epoch": 2.6880232981434293, + "grad_norm": 15.875, + "learning_rate": 2.2162050546110113e-07, + "loss": 1.6275099515914917, + "step": 7384 + }, + { + "epoch": 2.6887513651255914, + "grad_norm": 11.375, + "learning_rate": 2.2152074070180727e-07, + "loss": 1.4715653657913208, + "step": 7386 + }, + { + "epoch": 2.689479432107754, + "grad_norm": 33.0, + "learning_rate": 2.2142120028681588e-07, + "loss": 1.5158636569976807, + "step": 7388 + }, + { + "epoch": 2.690207499089916, + "grad_norm": 14.5, + "learning_rate": 2.213218842751298e-07, + "loss": 1.351304292678833, + "step": 7390 + }, + { + "epoch": 2.6909355660720786, + "grad_norm": 23.25, + "learning_rate": 2.2122279272561886e-07, + "loss": 1.3102582693099976, + "step": 7392 + }, + { + "epoch": 2.691663633054241, + "grad_norm": 10.125, + "learning_rate": 2.2112392569701977e-07, + "loss": 1.1773525476455688, + "step": 7394 + }, + { + "epoch": 2.6923917000364033, + "grad_norm": 10.875, + "learning_rate": 2.2102528324793592e-07, + "loss": 1.1017882823944092, + "step": 7396 + }, + { + "epoch": 2.693119767018566, + "grad_norm": 13.5625, + "learning_rate": 2.2092686543683807e-07, + "loss": 1.4752449989318848, + "step": 7398 + }, + { + "epoch": 2.693847834000728, + "grad_norm": 13.9375, + "learning_rate": 2.2082867232206342e-07, + "loss": 1.2021892070770264, + "step": 7400 + }, + { + "epoch": 2.6945759009828905, + "grad_norm": 8.4375, + "learning_rate": 2.2073070396181616e-07, + "loss": 1.4070212841033936, + "step": 7402 + }, + { + "epoch": 2.695303967965053, + "grad_norm": 17.875, + "learning_rate": 2.2063296041416724e-07, + "loss": 1.5133140087127686, + "step": 7404 + }, + { + "epoch": 2.696032034947215, + "grad_norm": 227.0, + "learning_rate": 2.2053544173705437e-07, + "loss": 1.51877760887146, + "step": 7406 + }, + { + "epoch": 2.6967601019293777, + "grad_norm": 15.5625, + "learning_rate": 2.2043814798828187e-07, + "loss": 1.2707483768463135, + "step": 7408 + }, + { + "epoch": 2.6974881689115398, + "grad_norm": 45.75, + "learning_rate": 2.203410792255208e-07, + "loss": 0.9503387212753296, + "step": 7410 + }, + { + "epoch": 2.6982162358937023, + "grad_norm": 25.375, + "learning_rate": 2.2024423550630884e-07, + "loss": 1.2451908588409424, + "step": 7412 + }, + { + "epoch": 2.698944302875865, + "grad_norm": 82.5, + "learning_rate": 2.2014761688805034e-07, + "loss": 0.7692964673042297, + "step": 7414 + }, + { + "epoch": 2.699672369858027, + "grad_norm": 13.125, + "learning_rate": 2.2005122342801618e-07, + "loss": 1.438571572303772, + "step": 7416 + }, + { + "epoch": 2.700400436840189, + "grad_norm": 9.1875, + "learning_rate": 2.1995505518334367e-07, + "loss": 1.366706132888794, + "step": 7418 + }, + { + "epoch": 2.7011285038223516, + "grad_norm": 19.0, + "learning_rate": 2.1985911221103685e-07, + "loss": 1.1790852546691895, + "step": 7420 + }, + { + "epoch": 2.701856570804514, + "grad_norm": 14.8125, + "learning_rate": 2.1976339456796599e-07, + "loss": 1.4763299226760864, + "step": 7422 + }, + { + "epoch": 2.7025846377866762, + "grad_norm": 22.125, + "learning_rate": 2.1966790231086797e-07, + "loss": 1.5880510807037354, + "step": 7424 + }, + { + "epoch": 2.703312704768839, + "grad_norm": 9.6875, + "learning_rate": 2.19572635496346e-07, + "loss": 1.3416132926940918, + "step": 7426 + }, + { + "epoch": 2.704040771751001, + "grad_norm": 15.4375, + "learning_rate": 2.1947759418086958e-07, + "loss": 1.557618498802185, + "step": 7428 + }, + { + "epoch": 2.7047688387331634, + "grad_norm": 6.0, + "learning_rate": 2.1938277842077475e-07, + "loss": 1.324063777923584, + "step": 7430 + }, + { + "epoch": 2.705496905715326, + "grad_norm": 21.5, + "learning_rate": 2.192881882722637e-07, + "loss": 1.5582811832427979, + "step": 7432 + }, + { + "epoch": 2.706224972697488, + "grad_norm": 11.625, + "learning_rate": 2.1919382379140495e-07, + "loss": 1.3855839967727661, + "step": 7434 + }, + { + "epoch": 2.7069530396796506, + "grad_norm": 11.6875, + "learning_rate": 2.190996850341331e-07, + "loss": 1.3150756359100342, + "step": 7436 + }, + { + "epoch": 2.7076811066618127, + "grad_norm": 23.625, + "learning_rate": 2.1900577205624917e-07, + "loss": 1.4014395475387573, + "step": 7438 + }, + { + "epoch": 2.7084091736439753, + "grad_norm": 12.1875, + "learning_rate": 2.1891208491342025e-07, + "loss": 1.6540253162384033, + "step": 7440 + }, + { + "epoch": 2.709137240626138, + "grad_norm": 24.0, + "learning_rate": 2.188186236611796e-07, + "loss": 1.181293249130249, + "step": 7442 + }, + { + "epoch": 2.7098653076083, + "grad_norm": 21.0, + "learning_rate": 2.187253883549265e-07, + "loss": 1.4235153198242188, + "step": 7444 + }, + { + "epoch": 2.7105933745904625, + "grad_norm": 14.0625, + "learning_rate": 2.186323790499264e-07, + "loss": 0.9217972755432129, + "step": 7446 + }, + { + "epoch": 2.7113214415726246, + "grad_norm": 97.0, + "learning_rate": 2.1853959580131073e-07, + "loss": 1.347278118133545, + "step": 7448 + }, + { + "epoch": 2.712049508554787, + "grad_norm": 9.875, + "learning_rate": 2.1844703866407693e-07, + "loss": 1.3894625902175903, + "step": 7450 + }, + { + "epoch": 2.7127775755369496, + "grad_norm": 29.875, + "learning_rate": 2.183547076930884e-07, + "loss": 1.4254738092422485, + "step": 7452 + }, + { + "epoch": 2.7135056425191117, + "grad_norm": 10.1875, + "learning_rate": 2.182626029430745e-07, + "loss": 1.1579289436340332, + "step": 7454 + }, + { + "epoch": 2.714233709501274, + "grad_norm": 16.5, + "learning_rate": 2.1817072446863057e-07, + "loss": 1.1942205429077148, + "step": 7456 + }, + { + "epoch": 2.7149617764834364, + "grad_norm": 11.75, + "learning_rate": 2.180790723242177e-07, + "loss": 1.3689872026443481, + "step": 7458 + }, + { + "epoch": 2.715689843465599, + "grad_norm": 4.96875, + "learning_rate": 2.1798764656416285e-07, + "loss": 1.153327465057373, + "step": 7460 + }, + { + "epoch": 2.716417910447761, + "grad_norm": 10.625, + "learning_rate": 2.1789644724265875e-07, + "loss": 1.5765061378479004, + "step": 7462 + }, + { + "epoch": 2.7171459774299236, + "grad_norm": 40.25, + "learning_rate": 2.178054744137642e-07, + "loss": 1.1502048969268799, + "step": 7464 + }, + { + "epoch": 2.7178740444120857, + "grad_norm": 16.875, + "learning_rate": 2.1771472813140335e-07, + "loss": 0.8112169504165649, + "step": 7466 + }, + { + "epoch": 2.7186021113942482, + "grad_norm": 13.75, + "learning_rate": 2.1762420844936615e-07, + "loss": 1.6496496200561523, + "step": 7468 + }, + { + "epoch": 2.719330178376411, + "grad_norm": 8.25, + "learning_rate": 2.1753391542130847e-07, + "loss": 1.3403403759002686, + "step": 7470 + }, + { + "epoch": 2.720058245358573, + "grad_norm": 76.0, + "learning_rate": 2.1744384910075166e-07, + "loss": 1.578129768371582, + "step": 7472 + }, + { + "epoch": 2.7207863123407354, + "grad_norm": 5.21875, + "learning_rate": 2.1735400954108262e-07, + "loss": 1.1504271030426025, + "step": 7474 + }, + { + "epoch": 2.7215143793228975, + "grad_norm": 9.375, + "learning_rate": 2.1726439679555406e-07, + "loss": 1.381467580795288, + "step": 7476 + }, + { + "epoch": 2.72224244630506, + "grad_norm": 14.875, + "learning_rate": 2.1717501091728397e-07, + "loss": 1.4673864841461182, + "step": 7478 + }, + { + "epoch": 2.7229705132872226, + "grad_norm": 10.875, + "learning_rate": 2.1708585195925598e-07, + "loss": 1.358340859413147, + "step": 7480 + }, + { + "epoch": 2.7236985802693847, + "grad_norm": 8.3125, + "learning_rate": 2.1699691997431933e-07, + "loss": 1.0377681255340576, + "step": 7482 + }, + { + "epoch": 2.7244266472515473, + "grad_norm": 15.5625, + "learning_rate": 2.169082150151886e-07, + "loss": 1.2621326446533203, + "step": 7484 + }, + { + "epoch": 2.7251547142337094, + "grad_norm": 15.0625, + "learning_rate": 2.168197371344438e-07, + "loss": 1.2916529178619385, + "step": 7486 + }, + { + "epoch": 2.725882781215872, + "grad_norm": 12.875, + "learning_rate": 2.1673148638453035e-07, + "loss": 1.3801265954971313, + "step": 7488 + }, + { + "epoch": 2.7266108481980345, + "grad_norm": 13.25, + "learning_rate": 2.1664346281775914e-07, + "loss": 1.7097845077514648, + "step": 7490 + }, + { + "epoch": 2.7273389151801966, + "grad_norm": 13.0625, + "learning_rate": 2.1655566648630623e-07, + "loss": 1.362863540649414, + "step": 7492 + }, + { + "epoch": 2.728066982162359, + "grad_norm": 17.5, + "learning_rate": 2.1646809744221304e-07, + "loss": 1.5974843502044678, + "step": 7494 + }, + { + "epoch": 2.728795049144521, + "grad_norm": 25.0, + "learning_rate": 2.1638075573738635e-07, + "loss": 1.7004129886627197, + "step": 7496 + }, + { + "epoch": 2.7295231161266837, + "grad_norm": 16.875, + "learning_rate": 2.1629364142359807e-07, + "loss": 1.3541200160980225, + "step": 7498 + }, + { + "epoch": 2.7302511831088463, + "grad_norm": 17.0, + "learning_rate": 2.1620675455248537e-07, + "loss": 1.3327021598815918, + "step": 7500 + }, + { + "epoch": 2.7309792500910084, + "grad_norm": 19.0, + "learning_rate": 2.1612009517555073e-07, + "loss": 1.6738231182098389, + "step": 7502 + }, + { + "epoch": 2.7317073170731705, + "grad_norm": 5.375, + "learning_rate": 2.1603366334416157e-07, + "loss": 0.8186761140823364, + "step": 7504 + }, + { + "epoch": 2.732435384055333, + "grad_norm": 76.5, + "learning_rate": 2.1594745910955047e-07, + "loss": 1.7562346458435059, + "step": 7506 + }, + { + "epoch": 2.7331634510374956, + "grad_norm": 7.25, + "learning_rate": 2.158614825228152e-07, + "loss": 1.1579656600952148, + "step": 7508 + }, + { + "epoch": 2.7338915180196577, + "grad_norm": 4.65625, + "learning_rate": 2.1577573363491856e-07, + "loss": 1.1276189088821411, + "step": 7510 + }, + { + "epoch": 2.7346195850018202, + "grad_norm": 9.375, + "learning_rate": 2.156902124966884e-07, + "loss": 0.902687668800354, + "step": 7512 + }, + { + "epoch": 2.7353476519839823, + "grad_norm": 27.125, + "learning_rate": 2.156049191588174e-07, + "loss": 1.490831732749939, + "step": 7514 + }, + { + "epoch": 2.736075718966145, + "grad_norm": 8.125, + "learning_rate": 2.1551985367186353e-07, + "loss": 1.3988157510757446, + "step": 7516 + }, + { + "epoch": 2.7368037859483074, + "grad_norm": 13.375, + "learning_rate": 2.1543501608624945e-07, + "loss": 1.4925776720046997, + "step": 7518 + }, + { + "epoch": 2.7375318529304695, + "grad_norm": 7.46875, + "learning_rate": 2.1535040645226276e-07, + "loss": 1.3290629386901855, + "step": 7520 + }, + { + "epoch": 2.738259919912632, + "grad_norm": 16.875, + "learning_rate": 2.15266024820056e-07, + "loss": 1.7000207901000977, + "step": 7522 + }, + { + "epoch": 2.738987986894794, + "grad_norm": 10.5, + "learning_rate": 2.151818712396466e-07, + "loss": 1.0862596035003662, + "step": 7524 + }, + { + "epoch": 2.7397160538769567, + "grad_norm": 16.75, + "learning_rate": 2.1509794576091661e-07, + "loss": 1.755163311958313, + "step": 7526 + }, + { + "epoch": 2.7404441208591193, + "grad_norm": 11.125, + "learning_rate": 2.1501424843361315e-07, + "loss": 1.503108024597168, + "step": 7528 + }, + { + "epoch": 2.7411721878412814, + "grad_norm": 9.125, + "learning_rate": 2.1493077930734797e-07, + "loss": 1.2934619188308716, + "step": 7530 + }, + { + "epoch": 2.741900254823444, + "grad_norm": 36.75, + "learning_rate": 2.1484753843159752e-07, + "loss": 1.4291629791259766, + "step": 7532 + }, + { + "epoch": 2.742628321805606, + "grad_norm": 16.625, + "learning_rate": 2.14764525855703e-07, + "loss": 1.5813286304473877, + "step": 7534 + }, + { + "epoch": 2.7433563887877686, + "grad_norm": 17.5, + "learning_rate": 2.1468174162887022e-07, + "loss": 0.9666913747787476, + "step": 7536 + }, + { + "epoch": 2.744084455769931, + "grad_norm": 16.0, + "learning_rate": 2.1459918580016972e-07, + "loss": 1.579573631286621, + "step": 7538 + }, + { + "epoch": 2.744812522752093, + "grad_norm": 17.625, + "learning_rate": 2.145168584185366e-07, + "loss": 1.4093801975250244, + "step": 7540 + }, + { + "epoch": 2.7455405897342553, + "grad_norm": 16.0, + "learning_rate": 2.1443475953277067e-07, + "loss": 1.6559526920318604, + "step": 7542 + }, + { + "epoch": 2.746268656716418, + "grad_norm": 9.5625, + "learning_rate": 2.1435288919153615e-07, + "loss": 1.5072202682495117, + "step": 7544 + }, + { + "epoch": 2.7469967236985804, + "grad_norm": 25.375, + "learning_rate": 2.1427124744336182e-07, + "loss": 1.468116044998169, + "step": 7546 + }, + { + "epoch": 2.7477247906807425, + "grad_norm": 17.0, + "learning_rate": 2.1418983433664098e-07, + "loss": 1.3867874145507812, + "step": 7548 + }, + { + "epoch": 2.748452857662905, + "grad_norm": 29.75, + "learning_rate": 2.1410864991963137e-07, + "loss": 1.0115633010864258, + "step": 7550 + }, + { + "epoch": 2.749180924645067, + "grad_norm": 10.9375, + "learning_rate": 2.140276942404553e-07, + "loss": 1.0994255542755127, + "step": 7552 + }, + { + "epoch": 2.7499089916272297, + "grad_norm": 13.3125, + "learning_rate": 2.1394696734709932e-07, + "loss": 1.5285828113555908, + "step": 7554 + }, + { + "epoch": 2.7506370586093922, + "grad_norm": 8.8125, + "learning_rate": 2.1386646928741448e-07, + "loss": 1.3204987049102783, + "step": 7556 + }, + { + "epoch": 2.7513651255915543, + "grad_norm": 9.9375, + "learning_rate": 2.1378620010911618e-07, + "loss": 1.7695538997650146, + "step": 7558 + }, + { + "epoch": 2.752093192573717, + "grad_norm": 8.4375, + "learning_rate": 2.1370615985978406e-07, + "loss": 1.063084602355957, + "step": 7560 + }, + { + "epoch": 2.752821259555879, + "grad_norm": 12.625, + "learning_rate": 2.1362634858686225e-07, + "loss": 1.235320806503296, + "step": 7562 + }, + { + "epoch": 2.7535493265380415, + "grad_norm": 5.8125, + "learning_rate": 2.13546766337659e-07, + "loss": 1.2969253063201904, + "step": 7564 + }, + { + "epoch": 2.754277393520204, + "grad_norm": 13.625, + "learning_rate": 2.1346741315934673e-07, + "loss": 1.6006529331207275, + "step": 7566 + }, + { + "epoch": 2.755005460502366, + "grad_norm": 11.875, + "learning_rate": 2.133882890989623e-07, + "loss": 1.4384597539901733, + "step": 7568 + }, + { + "epoch": 2.7557335274845287, + "grad_norm": 8.375, + "learning_rate": 2.133093942034066e-07, + "loss": 1.018803596496582, + "step": 7570 + }, + { + "epoch": 2.756461594466691, + "grad_norm": 8.6875, + "learning_rate": 2.132307285194447e-07, + "loss": 1.4950809478759766, + "step": 7572 + }, + { + "epoch": 2.7571896614488534, + "grad_norm": 11.0, + "learning_rate": 2.1315229209370593e-07, + "loss": 1.3826688528060913, + "step": 7574 + }, + { + "epoch": 2.757917728431016, + "grad_norm": 20.625, + "learning_rate": 2.130740849726836e-07, + "loss": 1.6009629964828491, + "step": 7576 + }, + { + "epoch": 2.758645795413178, + "grad_norm": 13.5, + "learning_rate": 2.12996107202735e-07, + "loss": 1.4260319471359253, + "step": 7578 + }, + { + "epoch": 2.75937386239534, + "grad_norm": 11.0625, + "learning_rate": 2.1291835883008178e-07, + "loss": 1.2778112888336182, + "step": 7580 + }, + { + "epoch": 2.7601019293775026, + "grad_norm": 30.625, + "learning_rate": 2.1284083990080937e-07, + "loss": 1.4995088577270508, + "step": 7582 + }, + { + "epoch": 2.760829996359665, + "grad_norm": 330.0, + "learning_rate": 2.1276355046086718e-07, + "loss": 1.220095157623291, + "step": 7584 + }, + { + "epoch": 2.7615580633418273, + "grad_norm": 29.375, + "learning_rate": 2.126864905560688e-07, + "loss": 1.7453235387802124, + "step": 7586 + }, + { + "epoch": 2.76228613032399, + "grad_norm": 8.0, + "learning_rate": 2.1260966023209161e-07, + "loss": 1.3503886461257935, + "step": 7588 + }, + { + "epoch": 2.763014197306152, + "grad_norm": 7.0625, + "learning_rate": 2.1253305953447696e-07, + "loss": 1.201380729675293, + "step": 7590 + }, + { + "epoch": 2.7637422642883145, + "grad_norm": 11.9375, + "learning_rate": 2.124566885086299e-07, + "loss": 1.2647595405578613, + "step": 7592 + }, + { + "epoch": 2.764470331270477, + "grad_norm": 11.4375, + "learning_rate": 2.1238054719981965e-07, + "loss": 1.6429897546768188, + "step": 7594 + }, + { + "epoch": 2.765198398252639, + "grad_norm": 15.75, + "learning_rate": 2.1230463565317902e-07, + "loss": 1.6520031690597534, + "step": 7596 + }, + { + "epoch": 2.7659264652348017, + "grad_norm": 9.125, + "learning_rate": 2.1222895391370487e-07, + "loss": 1.4836949110031128, + "step": 7598 + }, + { + "epoch": 2.766654532216964, + "grad_norm": 12.125, + "learning_rate": 2.1215350202625755e-07, + "loss": 1.2801647186279297, + "step": 7600 + }, + { + "epoch": 2.7673825991991263, + "grad_norm": 9.75, + "learning_rate": 2.1207828003556137e-07, + "loss": 1.1290583610534668, + "step": 7602 + }, + { + "epoch": 2.768110666181289, + "grad_norm": 7.125, + "learning_rate": 2.1200328798620428e-07, + "loss": 1.1919676065444946, + "step": 7604 + }, + { + "epoch": 2.768838733163451, + "grad_norm": 21.25, + "learning_rate": 2.1192852592263799e-07, + "loss": 1.80873441696167, + "step": 7606 + }, + { + "epoch": 2.7695668001456135, + "grad_norm": 30.125, + "learning_rate": 2.1185399388917788e-07, + "loss": 1.4534547328948975, + "step": 7608 + }, + { + "epoch": 2.7702948671277756, + "grad_norm": 11.75, + "learning_rate": 2.117796919300029e-07, + "loss": 1.2270896434783936, + "step": 7610 + }, + { + "epoch": 2.771022934109938, + "grad_norm": 12.8125, + "learning_rate": 2.1170562008915565e-07, + "loss": 1.0826168060302734, + "step": 7612 + }, + { + "epoch": 2.7717510010921007, + "grad_norm": 10.0, + "learning_rate": 2.1163177841054248e-07, + "loss": 1.5670509338378906, + "step": 7614 + }, + { + "epoch": 2.772479068074263, + "grad_norm": 17.5, + "learning_rate": 2.1155816693793307e-07, + "loss": 1.2586274147033691, + "step": 7616 + }, + { + "epoch": 2.7732071350564254, + "grad_norm": 16.875, + "learning_rate": 2.1148478571496085e-07, + "loss": 1.3610072135925293, + "step": 7618 + }, + { + "epoch": 2.7739352020385875, + "grad_norm": 11.4375, + "learning_rate": 2.1141163478512252e-07, + "loss": 1.6160258054733276, + "step": 7620 + }, + { + "epoch": 2.77466326902075, + "grad_norm": 18.625, + "learning_rate": 2.1133871419177867e-07, + "loss": 1.6389312744140625, + "step": 7622 + }, + { + "epoch": 2.775391336002912, + "grad_norm": 17.625, + "learning_rate": 2.1126602397815294e-07, + "loss": 1.2853684425354004, + "step": 7624 + }, + { + "epoch": 2.7761194029850746, + "grad_norm": 13.8125, + "learning_rate": 2.111935641873326e-07, + "loss": 1.7296972274780273, + "step": 7626 + }, + { + "epoch": 2.7768474699672367, + "grad_norm": 6.96875, + "learning_rate": 2.1112133486226845e-07, + "loss": 1.2558321952819824, + "step": 7628 + }, + { + "epoch": 2.7775755369493993, + "grad_norm": 6.59375, + "learning_rate": 2.1104933604577446e-07, + "loss": 1.4232282638549805, + "step": 7630 + }, + { + "epoch": 2.778303603931562, + "grad_norm": 15.375, + "learning_rate": 2.1097756778052812e-07, + "loss": 1.0822720527648926, + "step": 7632 + }, + { + "epoch": 2.779031670913724, + "grad_norm": 13.3125, + "learning_rate": 2.1090603010907013e-07, + "loss": 1.3213411569595337, + "step": 7634 + }, + { + "epoch": 2.7797597378958865, + "grad_norm": 18.625, + "learning_rate": 2.1083472307380462e-07, + "loss": 1.4246447086334229, + "step": 7636 + }, + { + "epoch": 2.7804878048780486, + "grad_norm": 4.75, + "learning_rate": 2.1076364671699905e-07, + "loss": 1.1790339946746826, + "step": 7638 + }, + { + "epoch": 2.781215871860211, + "grad_norm": 12.0, + "learning_rate": 2.1069280108078393e-07, + "loss": 1.2241756916046143, + "step": 7640 + }, + { + "epoch": 2.7819439388423737, + "grad_norm": 17.25, + "learning_rate": 2.1062218620715321e-07, + "loss": 1.5762999057769775, + "step": 7642 + }, + { + "epoch": 2.7826720058245358, + "grad_norm": 10.4375, + "learning_rate": 2.1055180213796403e-07, + "loss": 1.4413495063781738, + "step": 7644 + }, + { + "epoch": 2.7834000728066983, + "grad_norm": 13.0625, + "learning_rate": 2.104816489149367e-07, + "loss": 1.463597297668457, + "step": 7646 + }, + { + "epoch": 2.7841281397888604, + "grad_norm": 41.5, + "learning_rate": 2.1041172657965453e-07, + "loss": 1.334934949874878, + "step": 7648 + }, + { + "epoch": 2.784856206771023, + "grad_norm": 17.0, + "learning_rate": 2.103420351735642e-07, + "loss": 1.2795376777648926, + "step": 7650 + }, + { + "epoch": 2.7855842737531855, + "grad_norm": 3.3125, + "learning_rate": 2.102725747379755e-07, + "loss": 1.2288786172866821, + "step": 7652 + }, + { + "epoch": 2.7863123407353476, + "grad_norm": 11.3125, + "learning_rate": 2.1020334531406113e-07, + "loss": 1.383086085319519, + "step": 7654 + }, + { + "epoch": 2.78704040771751, + "grad_norm": 19.125, + "learning_rate": 2.1013434694285702e-07, + "loss": 1.70675528049469, + "step": 7656 + }, + { + "epoch": 2.7877684746996723, + "grad_norm": 13.25, + "learning_rate": 2.1006557966526206e-07, + "loss": 1.077530860900879, + "step": 7658 + }, + { + "epoch": 2.788496541681835, + "grad_norm": 26.75, + "learning_rate": 2.0999704352203818e-07, + "loss": 1.0089654922485352, + "step": 7660 + }, + { + "epoch": 2.7892246086639974, + "grad_norm": 7.1875, + "learning_rate": 2.0992873855381034e-07, + "loss": 1.1098432540893555, + "step": 7662 + }, + { + "epoch": 2.7899526756461595, + "grad_norm": 9.3125, + "learning_rate": 2.098606648010664e-07, + "loss": 1.2507038116455078, + "step": 7664 + }, + { + "epoch": 2.7906807426283216, + "grad_norm": 24.625, + "learning_rate": 2.0979282230415724e-07, + "loss": 1.3971524238586426, + "step": 7666 + }, + { + "epoch": 2.791408809610484, + "grad_norm": 15.625, + "learning_rate": 2.0972521110329665e-07, + "loss": 1.3533592224121094, + "step": 7668 + }, + { + "epoch": 2.7921368765926466, + "grad_norm": 12.5, + "learning_rate": 2.096578312385612e-07, + "loss": 1.2224736213684082, + "step": 7670 + }, + { + "epoch": 2.7928649435748087, + "grad_norm": 6.09375, + "learning_rate": 2.0959068274989055e-07, + "loss": 1.4037983417510986, + "step": 7672 + }, + { + "epoch": 2.7935930105569713, + "grad_norm": 11.1875, + "learning_rate": 2.0952376567708707e-07, + "loss": 1.5098259449005127, + "step": 7674 + }, + { + "epoch": 2.7943210775391334, + "grad_norm": 17.25, + "learning_rate": 2.0945708005981587e-07, + "loss": 1.2908456325531006, + "step": 7676 + }, + { + "epoch": 2.795049144521296, + "grad_norm": 10.5, + "learning_rate": 2.0939062593760516e-07, + "loss": 1.3910664319992065, + "step": 7678 + }, + { + "epoch": 2.7957772115034585, + "grad_norm": 16.125, + "learning_rate": 2.0932440334984558e-07, + "loss": 1.336108922958374, + "step": 7680 + }, + { + "epoch": 2.7965052784856206, + "grad_norm": 17.0, + "learning_rate": 2.0925841233579077e-07, + "loss": 1.389346718788147, + "step": 7682 + }, + { + "epoch": 2.797233345467783, + "grad_norm": 11.125, + "learning_rate": 2.091926529345571e-07, + "loss": 1.4019428491592407, + "step": 7684 + }, + { + "epoch": 2.7979614124499452, + "grad_norm": 10.0, + "learning_rate": 2.0912712518512344e-07, + "loss": 1.3728744983673096, + "step": 7686 + }, + { + "epoch": 2.7986894794321078, + "grad_norm": 13.125, + "learning_rate": 2.090618291263316e-07, + "loss": 1.4798692464828491, + "step": 7688 + }, + { + "epoch": 2.7994175464142703, + "grad_norm": 11.3125, + "learning_rate": 2.089967647968859e-07, + "loss": 0.8737449645996094, + "step": 7690 + }, + { + "epoch": 2.8001456133964324, + "grad_norm": 24.625, + "learning_rate": 2.0893193223535324e-07, + "loss": 1.3619287014007568, + "step": 7692 + }, + { + "epoch": 2.800873680378595, + "grad_norm": 94.0, + "learning_rate": 2.088673314801634e-07, + "loss": 1.2469432353973389, + "step": 7694 + }, + { + "epoch": 2.801601747360757, + "grad_norm": 12.375, + "learning_rate": 2.088029625696086e-07, + "loss": 1.4413604736328125, + "step": 7696 + }, + { + "epoch": 2.8023298143429196, + "grad_norm": 71.0, + "learning_rate": 2.0873882554184354e-07, + "loss": 1.0301682949066162, + "step": 7698 + }, + { + "epoch": 2.803057881325082, + "grad_norm": 13.0, + "learning_rate": 2.0867492043488562e-07, + "loss": 1.3798198699951172, + "step": 7700 + }, + { + "epoch": 2.8037859483072443, + "grad_norm": 5.5, + "learning_rate": 2.0861124728661476e-07, + "loss": 0.9265398979187012, + "step": 7702 + }, + { + "epoch": 2.8045140152894064, + "grad_norm": 4.25, + "learning_rate": 2.085478061347732e-07, + "loss": 1.168776035308838, + "step": 7704 + }, + { + "epoch": 2.805242082271569, + "grad_norm": 14.4375, + "learning_rate": 2.0848459701696588e-07, + "loss": 1.286316990852356, + "step": 7706 + }, + { + "epoch": 2.8059701492537314, + "grad_norm": 8.125, + "learning_rate": 2.0842161997066016e-07, + "loss": 1.2654032707214355, + "step": 7708 + }, + { + "epoch": 2.8066982162358936, + "grad_norm": 10.625, + "learning_rate": 2.083588750331858e-07, + "loss": 1.5422921180725098, + "step": 7710 + }, + { + "epoch": 2.807426283218056, + "grad_norm": 13.875, + "learning_rate": 2.082963622417349e-07, + "loss": 1.2891054153442383, + "step": 7712 + }, + { + "epoch": 2.808154350200218, + "grad_norm": 9.8125, + "learning_rate": 2.0823408163336215e-07, + "loss": 1.1589635610580444, + "step": 7714 + }, + { + "epoch": 2.8088824171823807, + "grad_norm": 28.5, + "learning_rate": 2.0817203324498443e-07, + "loss": 1.6251064538955688, + "step": 7716 + }, + { + "epoch": 2.8096104841645433, + "grad_norm": 8.625, + "learning_rate": 2.0811021711338105e-07, + "loss": 1.4030495882034302, + "step": 7718 + }, + { + "epoch": 2.8103385511467054, + "grad_norm": 9.4375, + "learning_rate": 2.0804863327519362e-07, + "loss": 0.9573622345924377, + "step": 7720 + }, + { + "epoch": 2.811066618128868, + "grad_norm": 6.25, + "learning_rate": 2.079872817669261e-07, + "loss": 0.8307903409004211, + "step": 7722 + }, + { + "epoch": 2.81179468511103, + "grad_norm": 18.5, + "learning_rate": 2.0792616262494478e-07, + "loss": 1.6118886470794678, + "step": 7724 + }, + { + "epoch": 2.8125227520931926, + "grad_norm": 13.5625, + "learning_rate": 2.0786527588547803e-07, + "loss": 1.9925298690795898, + "step": 7726 + }, + { + "epoch": 2.813250819075355, + "grad_norm": 5.6875, + "learning_rate": 2.0780462158461658e-07, + "loss": 1.4722847938537598, + "step": 7728 + }, + { + "epoch": 2.8139788860575172, + "grad_norm": 11.5, + "learning_rate": 2.0774419975831357e-07, + "loss": 1.4548743963241577, + "step": 7730 + }, + { + "epoch": 2.8147069530396798, + "grad_norm": 20.75, + "learning_rate": 2.0768401044238393e-07, + "loss": 1.3189722299575806, + "step": 7732 + }, + { + "epoch": 2.815435020021842, + "grad_norm": 8.375, + "learning_rate": 2.0762405367250505e-07, + "loss": 1.5916292667388916, + "step": 7734 + }, + { + "epoch": 2.8161630870040044, + "grad_norm": 9.9375, + "learning_rate": 2.0756432948421654e-07, + "loss": 1.5082693099975586, + "step": 7736 + }, + { + "epoch": 2.816891153986167, + "grad_norm": 19.125, + "learning_rate": 2.0750483791291985e-07, + "loss": 1.253009557723999, + "step": 7738 + }, + { + "epoch": 2.817619220968329, + "grad_norm": 23.75, + "learning_rate": 2.0744557899387886e-07, + "loss": 1.0534443855285645, + "step": 7740 + }, + { + "epoch": 2.8183472879504916, + "grad_norm": 40.5, + "learning_rate": 2.0738655276221937e-07, + "loss": 1.568723440170288, + "step": 7742 + }, + { + "epoch": 2.8190753549326537, + "grad_norm": 42.25, + "learning_rate": 2.0732775925292921e-07, + "loss": 1.650984287261963, + "step": 7744 + }, + { + "epoch": 2.8198034219148163, + "grad_norm": 9.875, + "learning_rate": 2.0726919850085847e-07, + "loss": 1.2015697956085205, + "step": 7746 + }, + { + "epoch": 2.8205314888969784, + "grad_norm": 13.1875, + "learning_rate": 2.072108705407191e-07, + "loss": 1.5403871536254883, + "step": 7748 + }, + { + "epoch": 2.821259555879141, + "grad_norm": 27.875, + "learning_rate": 2.0715277540708505e-07, + "loss": 1.0012352466583252, + "step": 7750 + }, + { + "epoch": 2.821987622861303, + "grad_norm": 14.875, + "learning_rate": 2.0709491313439233e-07, + "loss": 1.3092074394226074, + "step": 7752 + }, + { + "epoch": 2.8227156898434655, + "grad_norm": 21.625, + "learning_rate": 2.0703728375693908e-07, + "loss": 1.3419698476791382, + "step": 7754 + }, + { + "epoch": 2.823443756825628, + "grad_norm": 13.25, + "learning_rate": 2.0697988730888497e-07, + "loss": 0.8581709265708923, + "step": 7756 + }, + { + "epoch": 2.82417182380779, + "grad_norm": 20.25, + "learning_rate": 2.0692272382425206e-07, + "loss": 1.438256025314331, + "step": 7758 + }, + { + "epoch": 2.8248998907899527, + "grad_norm": 7.03125, + "learning_rate": 2.06865793336924e-07, + "loss": 1.309208869934082, + "step": 7760 + }, + { + "epoch": 2.825627957772115, + "grad_norm": 16.375, + "learning_rate": 2.068090958806465e-07, + "loss": 1.229744553565979, + "step": 7762 + }, + { + "epoch": 2.8263560247542774, + "grad_norm": 11.625, + "learning_rate": 2.0675263148902707e-07, + "loss": 1.4337972402572632, + "step": 7764 + }, + { + "epoch": 2.82708409173644, + "grad_norm": 17.75, + "learning_rate": 2.0669640019553505e-07, + "loss": 1.4025542736053467, + "step": 7766 + }, + { + "epoch": 2.827812158718602, + "grad_norm": 11.0625, + "learning_rate": 2.0664040203350167e-07, + "loss": 1.2403582334518433, + "step": 7768 + }, + { + "epoch": 2.8285402257007646, + "grad_norm": 10.375, + "learning_rate": 2.0658463703612e-07, + "loss": 1.0490819215774536, + "step": 7770 + }, + { + "epoch": 2.8292682926829267, + "grad_norm": 10.0, + "learning_rate": 2.0652910523644482e-07, + "loss": 1.5956296920776367, + "step": 7772 + }, + { + "epoch": 2.8299963596650892, + "grad_norm": 15.625, + "learning_rate": 2.0647380666739262e-07, + "loss": 1.433046579360962, + "step": 7774 + }, + { + "epoch": 2.8307244266472518, + "grad_norm": 7.09375, + "learning_rate": 2.0641874136174186e-07, + "loss": 1.2046175003051758, + "step": 7776 + }, + { + "epoch": 2.831452493629414, + "grad_norm": 11.625, + "learning_rate": 2.0636390935213253e-07, + "loss": 1.5242550373077393, + "step": 7778 + }, + { + "epoch": 2.8321805606115764, + "grad_norm": 13.375, + "learning_rate": 2.063093106710664e-07, + "loss": 1.360374093055725, + "step": 7780 + }, + { + "epoch": 2.8329086275937385, + "grad_norm": 2.65625, + "learning_rate": 2.06254945350907e-07, + "loss": 1.3210067749023438, + "step": 7782 + }, + { + "epoch": 2.833636694575901, + "grad_norm": 20.625, + "learning_rate": 2.062008134238794e-07, + "loss": 1.599290132522583, + "step": 7784 + }, + { + "epoch": 2.8343647615580636, + "grad_norm": 27.375, + "learning_rate": 2.0614691492207054e-07, + "loss": 1.4858996868133545, + "step": 7786 + }, + { + "epoch": 2.8350928285402257, + "grad_norm": 23.375, + "learning_rate": 2.0609324987742872e-07, + "loss": 1.578185796737671, + "step": 7788 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 7.125, + "learning_rate": 2.0603981832176403e-07, + "loss": 1.4383234977722168, + "step": 7790 + }, + { + "epoch": 2.8365489625045504, + "grad_norm": 10.5625, + "learning_rate": 2.0598662028674816e-07, + "loss": 1.020077109336853, + "step": 7792 + }, + { + "epoch": 2.837277029486713, + "grad_norm": 15.0, + "learning_rate": 2.0593365580391433e-07, + "loss": 1.6737645864486694, + "step": 7794 + }, + { + "epoch": 2.838005096468875, + "grad_norm": 14.1875, + "learning_rate": 2.0588092490465734e-07, + "loss": 1.7898913621902466, + "step": 7796 + }, + { + "epoch": 2.8387331634510375, + "grad_norm": 21.0, + "learning_rate": 2.0582842762023355e-07, + "loss": 1.3431928157806396, + "step": 7798 + }, + { + "epoch": 2.8394612304331996, + "grad_norm": 40.0, + "learning_rate": 2.0577616398176078e-07, + "loss": 1.593733787536621, + "step": 7800 + }, + { + "epoch": 2.840189297415362, + "grad_norm": 34.0, + "learning_rate": 2.0572413402021847e-07, + "loss": 1.0770912170410156, + "step": 7802 + }, + { + "epoch": 2.8409173643975247, + "grad_norm": 23.5, + "learning_rate": 2.0567233776644745e-07, + "loss": 1.430954098701477, + "step": 7804 + }, + { + "epoch": 2.841645431379687, + "grad_norm": 11.8125, + "learning_rate": 2.0562077525114997e-07, + "loss": 1.4358248710632324, + "step": 7806 + }, + { + "epoch": 2.8423734983618494, + "grad_norm": 34.0, + "learning_rate": 2.0556944650488983e-07, + "loss": 1.6799688339233398, + "step": 7808 + }, + { + "epoch": 2.8431015653440115, + "grad_norm": 21.25, + "learning_rate": 2.0551835155809238e-07, + "loss": 1.88125479221344, + "step": 7810 + }, + { + "epoch": 2.843829632326174, + "grad_norm": 10.625, + "learning_rate": 2.054674904410441e-07, + "loss": 1.068509817123413, + "step": 7812 + }, + { + "epoch": 2.8445576993083366, + "grad_norm": 14.0, + "learning_rate": 2.0541686318389307e-07, + "loss": 1.3479787111282349, + "step": 7814 + }, + { + "epoch": 2.8452857662904987, + "grad_norm": 23.875, + "learning_rate": 2.0536646981664873e-07, + "loss": 1.3775317668914795, + "step": 7816 + }, + { + "epoch": 2.846013833272661, + "grad_norm": 21.375, + "learning_rate": 2.053163103691818e-07, + "loss": 1.6020264625549316, + "step": 7818 + }, + { + "epoch": 2.8467419002548233, + "grad_norm": 13.6875, + "learning_rate": 2.0526638487122436e-07, + "loss": 1.0203572511672974, + "step": 7820 + }, + { + "epoch": 2.847469967236986, + "grad_norm": 12.0625, + "learning_rate": 2.0521669335236992e-07, + "loss": 1.2959376573562622, + "step": 7822 + }, + { + "epoch": 2.8481980342191484, + "grad_norm": 7.78125, + "learning_rate": 2.0516723584207307e-07, + "loss": 1.376861572265625, + "step": 7824 + }, + { + "epoch": 2.8489261012013105, + "grad_norm": 2.921875, + "learning_rate": 2.0511801236965006e-07, + "loss": 1.3009998798370361, + "step": 7826 + }, + { + "epoch": 2.8496541681834726, + "grad_norm": 16.375, + "learning_rate": 2.0506902296427808e-07, + "loss": 1.3086638450622559, + "step": 7828 + }, + { + "epoch": 2.850382235165635, + "grad_norm": 10.75, + "learning_rate": 2.0502026765499566e-07, + "loss": 1.5963079929351807, + "step": 7830 + }, + { + "epoch": 2.8511103021477977, + "grad_norm": 18.875, + "learning_rate": 2.049717464707027e-07, + "loss": 1.6178867816925049, + "step": 7832 + }, + { + "epoch": 2.85183836912996, + "grad_norm": 8.75, + "learning_rate": 2.049234594401601e-07, + "loss": 1.3013920783996582, + "step": 7834 + }, + { + "epoch": 2.8525664361121224, + "grad_norm": 17.25, + "learning_rate": 2.0487540659199022e-07, + "loss": 1.407470464706421, + "step": 7836 + }, + { + "epoch": 2.8532945030942845, + "grad_norm": 7.625, + "learning_rate": 2.0482758795467635e-07, + "loss": 1.0771088600158691, + "step": 7838 + }, + { + "epoch": 2.854022570076447, + "grad_norm": 33.25, + "learning_rate": 2.0478000355656322e-07, + "loss": 1.58818781375885, + "step": 7840 + }, + { + "epoch": 2.8547506370586095, + "grad_norm": 21.0, + "learning_rate": 2.047326534258565e-07, + "loss": 1.8508808612823486, + "step": 7842 + }, + { + "epoch": 2.8554787040407716, + "grad_norm": 12.8125, + "learning_rate": 2.0468553759062305e-07, + "loss": 1.3967604637145996, + "step": 7844 + }, + { + "epoch": 2.856206771022934, + "grad_norm": 22.5, + "learning_rate": 2.0463865607879085e-07, + "loss": 1.4228665828704834, + "step": 7846 + }, + { + "epoch": 2.8569348380050963, + "grad_norm": 13.9375, + "learning_rate": 2.045920089181491e-07, + "loss": 1.2838068008422852, + "step": 7848 + }, + { + "epoch": 2.857662904987259, + "grad_norm": 26.5, + "learning_rate": 2.0454559613634793e-07, + "loss": 1.1176376342773438, + "step": 7850 + }, + { + "epoch": 2.8583909719694214, + "grad_norm": 14.0, + "learning_rate": 2.0449941776089858e-07, + "loss": 1.05462646484375, + "step": 7852 + }, + { + "epoch": 2.8591190389515835, + "grad_norm": 13.625, + "learning_rate": 2.044534738191734e-07, + "loss": 1.6053094863891602, + "step": 7854 + }, + { + "epoch": 2.859847105933746, + "grad_norm": 14.5625, + "learning_rate": 2.0440776433840579e-07, + "loss": 1.629565954208374, + "step": 7856 + }, + { + "epoch": 2.860575172915908, + "grad_norm": 17.125, + "learning_rate": 2.043622893456901e-07, + "loss": 1.4796652793884277, + "step": 7858 + }, + { + "epoch": 2.8613032398980707, + "grad_norm": 12.5, + "learning_rate": 2.0431704886798164e-07, + "loss": 1.3246909379959106, + "step": 7860 + }, + { + "epoch": 2.862031306880233, + "grad_norm": 10.3125, + "learning_rate": 2.0427204293209696e-07, + "loss": 1.5011894702911377, + "step": 7862 + }, + { + "epoch": 2.8627593738623953, + "grad_norm": 14.0, + "learning_rate": 2.0422727156471324e-07, + "loss": 1.1526167392730713, + "step": 7864 + }, + { + "epoch": 2.863487440844558, + "grad_norm": 10.6875, + "learning_rate": 2.0418273479236892e-07, + "loss": 1.1082398891448975, + "step": 7866 + }, + { + "epoch": 2.86421550782672, + "grad_norm": 15.0625, + "learning_rate": 2.0413843264146316e-07, + "loss": 1.4643185138702393, + "step": 7868 + }, + { + "epoch": 2.8649435748088825, + "grad_norm": 31.625, + "learning_rate": 2.0409436513825627e-07, + "loss": 1.4510164260864258, + "step": 7870 + }, + { + "epoch": 2.8656716417910446, + "grad_norm": 11.3125, + "learning_rate": 2.040505323088692e-07, + "loss": 1.4438953399658203, + "step": 7872 + }, + { + "epoch": 2.866399708773207, + "grad_norm": 24.625, + "learning_rate": 2.0400693417928403e-07, + "loss": 1.195697546005249, + "step": 7874 + }, + { + "epoch": 2.8671277757553693, + "grad_norm": 12.4375, + "learning_rate": 2.039635707753436e-07, + "loss": 1.5196239948272705, + "step": 7876 + }, + { + "epoch": 2.867855842737532, + "grad_norm": 12.5625, + "learning_rate": 2.0392044212275167e-07, + "loss": 1.3054254055023193, + "step": 7878 + }, + { + "epoch": 2.8685839097196943, + "grad_norm": 26.75, + "learning_rate": 2.0387754824707284e-07, + "loss": 1.4637635946273804, + "step": 7880 + }, + { + "epoch": 2.8693119767018564, + "grad_norm": 6.40625, + "learning_rate": 2.0383488917373249e-07, + "loss": 1.3033356666564941, + "step": 7882 + }, + { + "epoch": 2.870040043684019, + "grad_norm": 12.8125, + "learning_rate": 2.0379246492801694e-07, + "loss": 1.24513840675354, + "step": 7884 + }, + { + "epoch": 2.870768110666181, + "grad_norm": 12.8125, + "learning_rate": 2.037502755350732e-07, + "loss": 1.447981834411621, + "step": 7886 + }, + { + "epoch": 2.8714961776483436, + "grad_norm": 14.375, + "learning_rate": 2.037083210199091e-07, + "loss": 1.3893728256225586, + "step": 7888 + }, + { + "epoch": 2.872224244630506, + "grad_norm": 10.625, + "learning_rate": 2.036666014073933e-07, + "loss": 1.426181674003601, + "step": 7890 + }, + { + "epoch": 2.8729523116126683, + "grad_norm": 13.5, + "learning_rate": 2.0362511672225514e-07, + "loss": 1.7361294031143188, + "step": 7892 + }, + { + "epoch": 2.873680378594831, + "grad_norm": 41.25, + "learning_rate": 2.035838669890848e-07, + "loss": 1.512399435043335, + "step": 7894 + }, + { + "epoch": 2.874408445576993, + "grad_norm": 6.78125, + "learning_rate": 2.0354285223233304e-07, + "loss": 1.435905933380127, + "step": 7896 + }, + { + "epoch": 2.8751365125591555, + "grad_norm": 14.25, + "learning_rate": 2.0350207247631152e-07, + "loss": 1.29866623878479, + "step": 7898 + }, + { + "epoch": 2.875864579541318, + "grad_norm": 10.75, + "learning_rate": 2.0346152774519246e-07, + "loss": 1.1237847805023193, + "step": 7900 + }, + { + "epoch": 2.87659264652348, + "grad_norm": 16.875, + "learning_rate": 2.0342121806300893e-07, + "loss": 1.4960262775421143, + "step": 7902 + }, + { + "epoch": 2.8773207135056427, + "grad_norm": 11.5, + "learning_rate": 2.0338114345365444e-07, + "loss": 1.6921610832214355, + "step": 7904 + }, + { + "epoch": 2.8780487804878048, + "grad_norm": 14.0625, + "learning_rate": 2.0334130394088333e-07, + "loss": 1.459842562675476, + "step": 7906 + }, + { + "epoch": 2.8787768474699673, + "grad_norm": 7.71875, + "learning_rate": 2.033016995483106e-07, + "loss": 1.1525046825408936, + "step": 7908 + }, + { + "epoch": 2.87950491445213, + "grad_norm": 13.5625, + "learning_rate": 2.0326233029941184e-07, + "loss": 1.3404717445373535, + "step": 7910 + }, + { + "epoch": 2.880232981434292, + "grad_norm": 2.875, + "learning_rate": 2.0322319621752319e-07, + "loss": 1.190673589706421, + "step": 7912 + }, + { + "epoch": 2.880961048416454, + "grad_norm": 7.59375, + "learning_rate": 2.0318429732584145e-07, + "loss": 1.1788434982299805, + "step": 7914 + }, + { + "epoch": 2.8816891153986166, + "grad_norm": 10.8125, + "learning_rate": 2.03145633647424e-07, + "loss": 1.1241605281829834, + "step": 7916 + }, + { + "epoch": 2.882417182380779, + "grad_norm": 13.5, + "learning_rate": 2.0310720520518886e-07, + "loss": 1.0601387023925781, + "step": 7918 + }, + { + "epoch": 2.8831452493629413, + "grad_norm": 18.25, + "learning_rate": 2.030690120219145e-07, + "loss": 1.4163811206817627, + "step": 7920 + }, + { + "epoch": 2.883873316345104, + "grad_norm": 24.75, + "learning_rate": 2.0303105412024014e-07, + "loss": 1.1954712867736816, + "step": 7922 + }, + { + "epoch": 2.884601383327266, + "grad_norm": 26.0, + "learning_rate": 2.029933315226652e-07, + "loss": 1.7862670421600342, + "step": 7924 + }, + { + "epoch": 2.8853294503094284, + "grad_norm": 5.28125, + "learning_rate": 2.0295584425155e-07, + "loss": 1.4910787343978882, + "step": 7926 + }, + { + "epoch": 2.886057517291591, + "grad_norm": 65.0, + "learning_rate": 2.0291859232911507e-07, + "loss": 0.8501677513122559, + "step": 7928 + }, + { + "epoch": 2.886785584273753, + "grad_norm": 12.8125, + "learning_rate": 2.0288157577744164e-07, + "loss": 1.1388781070709229, + "step": 7930 + }, + { + "epoch": 2.8875136512559156, + "grad_norm": 15.625, + "learning_rate": 2.0284479461847125e-07, + "loss": 1.462199330329895, + "step": 7932 + }, + { + "epoch": 2.8882417182380777, + "grad_norm": 14.0625, + "learning_rate": 2.0280824887400607e-07, + "loss": 1.4039278030395508, + "step": 7934 + }, + { + "epoch": 2.8889697852202403, + "grad_norm": 10.4375, + "learning_rate": 2.027719385657087e-07, + "loss": 1.176262617111206, + "step": 7936 + }, + { + "epoch": 2.889697852202403, + "grad_norm": 7.3125, + "learning_rate": 2.0273586371510204e-07, + "loss": 1.30606210231781, + "step": 7938 + }, + { + "epoch": 2.890425919184565, + "grad_norm": 22.625, + "learning_rate": 2.027000243435696e-07, + "loss": 1.5500578880310059, + "step": 7940 + }, + { + "epoch": 2.8911539861667275, + "grad_norm": 20.375, + "learning_rate": 2.026644204723552e-07, + "loss": 1.0178196430206299, + "step": 7942 + }, + { + "epoch": 2.8918820531488896, + "grad_norm": 3.984375, + "learning_rate": 2.0262905212256308e-07, + "loss": 0.7287222146987915, + "step": 7944 + }, + { + "epoch": 2.892610120131052, + "grad_norm": 14.3125, + "learning_rate": 2.0259391931515798e-07, + "loss": 1.33244788646698, + "step": 7946 + }, + { + "epoch": 2.8933381871132147, + "grad_norm": 11.5625, + "learning_rate": 2.025590220709649e-07, + "loss": 1.0921231508255005, + "step": 7948 + }, + { + "epoch": 2.8940662540953768, + "grad_norm": 13.5, + "learning_rate": 2.025243604106692e-07, + "loss": 1.495212435722351, + "step": 7950 + }, + { + "epoch": 2.894794321077539, + "grad_norm": 11.0625, + "learning_rate": 2.0248993435481668e-07, + "loss": 1.4628524780273438, + "step": 7952 + }, + { + "epoch": 2.8955223880597014, + "grad_norm": 32.0, + "learning_rate": 2.0245574392381348e-07, + "loss": 1.5423521995544434, + "step": 7954 + }, + { + "epoch": 2.896250455041864, + "grad_norm": 17.5, + "learning_rate": 2.0242178913792604e-07, + "loss": 1.1963269710540771, + "step": 7956 + }, + { + "epoch": 2.896978522024026, + "grad_norm": 27.25, + "learning_rate": 2.0238807001728105e-07, + "loss": 1.453611135482788, + "step": 7958 + }, + { + "epoch": 2.8977065890061886, + "grad_norm": 50.0, + "learning_rate": 2.0235458658186565e-07, + "loss": 1.5232495069503784, + "step": 7960 + }, + { + "epoch": 2.8984346559883507, + "grad_norm": 12.125, + "learning_rate": 2.023213388515272e-07, + "loss": 1.3508148193359375, + "step": 7962 + }, + { + "epoch": 2.8991627229705133, + "grad_norm": 9.1875, + "learning_rate": 2.0228832684597332e-07, + "loss": 1.4346957206726074, + "step": 7964 + }, + { + "epoch": 2.899890789952676, + "grad_norm": 17.25, + "learning_rate": 2.0225555058477203e-07, + "loss": 1.4392735958099365, + "step": 7966 + }, + { + "epoch": 2.900618856934838, + "grad_norm": 11.6875, + "learning_rate": 2.0222301008735144e-07, + "loss": 1.2762391567230225, + "step": 7968 + }, + { + "epoch": 2.9013469239170004, + "grad_norm": 19.75, + "learning_rate": 2.0219070537300001e-07, + "loss": 1.6005911827087402, + "step": 7970 + }, + { + "epoch": 2.9020749908991625, + "grad_norm": 11.1875, + "learning_rate": 2.0215863646086642e-07, + "loss": 0.98372882604599, + "step": 7972 + }, + { + "epoch": 2.902803057881325, + "grad_norm": 43.25, + "learning_rate": 2.0212680336995958e-07, + "loss": 1.4678664207458496, + "step": 7974 + }, + { + "epoch": 2.9035311248634876, + "grad_norm": 46.5, + "learning_rate": 2.020952061191486e-07, + "loss": 1.262685775756836, + "step": 7976 + }, + { + "epoch": 2.9042591918456497, + "grad_norm": 11.8125, + "learning_rate": 2.0206384472716287e-07, + "loss": 1.1299701929092407, + "step": 7978 + }, + { + "epoch": 2.9049872588278123, + "grad_norm": 16.5, + "learning_rate": 2.020327192125918e-07, + "loss": 1.3291025161743164, + "step": 7980 + }, + { + "epoch": 2.9057153258099744, + "grad_norm": 24.875, + "learning_rate": 2.0200182959388524e-07, + "loss": 1.5063763856887817, + "step": 7982 + }, + { + "epoch": 2.906443392792137, + "grad_norm": 14.125, + "learning_rate": 2.0197117588935297e-07, + "loss": 1.3754737377166748, + "step": 7984 + }, + { + "epoch": 2.9071714597742995, + "grad_norm": 68.5, + "learning_rate": 2.0194075811716506e-07, + "loss": 1.6347510814666748, + "step": 7986 + }, + { + "epoch": 2.9078995267564616, + "grad_norm": 13.6875, + "learning_rate": 2.0191057629535174e-07, + "loss": 1.445136547088623, + "step": 7988 + }, + { + "epoch": 2.908627593738624, + "grad_norm": 23.5, + "learning_rate": 2.0188063044180327e-07, + "loss": 1.661517858505249, + "step": 7990 + }, + { + "epoch": 2.909355660720786, + "grad_norm": 12.8125, + "learning_rate": 2.0185092057427013e-07, + "loss": 1.4164916276931763, + "step": 7992 + }, + { + "epoch": 2.9100837277029488, + "grad_norm": 14.0625, + "learning_rate": 2.0182144671036298e-07, + "loss": 0.9378368854522705, + "step": 7994 + }, + { + "epoch": 2.910811794685111, + "grad_norm": 23.375, + "learning_rate": 2.0179220886755245e-07, + "loss": 0.974675178527832, + "step": 7996 + }, + { + "epoch": 2.9115398616672734, + "grad_norm": 10.4375, + "learning_rate": 2.017632070631693e-07, + "loss": 1.529045581817627, + "step": 7998 + }, + { + "epoch": 2.9122679286494355, + "grad_norm": 9.125, + "learning_rate": 2.0173444131440443e-07, + "loss": 1.2718935012817383, + "step": 8000 + }, + { + "epoch": 2.912995995631598, + "grad_norm": 7.75, + "learning_rate": 2.0170591163830877e-07, + "loss": 1.3096375465393066, + "step": 8002 + }, + { + "epoch": 2.9137240626137606, + "grad_norm": 10.5625, + "learning_rate": 2.016776180517934e-07, + "loss": 1.3709384202957153, + "step": 8004 + }, + { + "epoch": 2.9144521295959227, + "grad_norm": 9.125, + "learning_rate": 2.0164956057162934e-07, + "loss": 1.3929789066314697, + "step": 8006 + }, + { + "epoch": 2.9151801965780852, + "grad_norm": 14.4375, + "learning_rate": 2.0162173921444776e-07, + "loss": 1.3755440711975098, + "step": 8008 + }, + { + "epoch": 2.9159082635602473, + "grad_norm": 12.4375, + "learning_rate": 2.0159415399673977e-07, + "loss": 1.5569086074829102, + "step": 8010 + }, + { + "epoch": 2.91663633054241, + "grad_norm": 10.9375, + "learning_rate": 2.0156680493485666e-07, + "loss": 1.4968904256820679, + "step": 8012 + }, + { + "epoch": 2.9173643975245724, + "grad_norm": 16.5, + "learning_rate": 2.015396920450095e-07, + "loss": 1.5117939710617065, + "step": 8014 + }, + { + "epoch": 2.9180924645067345, + "grad_norm": 5.9375, + "learning_rate": 2.0151281534326958e-07, + "loss": 1.0991700887680054, + "step": 8016 + }, + { + "epoch": 2.918820531488897, + "grad_norm": 10.0625, + "learning_rate": 2.0148617484556806e-07, + "loss": 1.5409727096557617, + "step": 8018 + }, + { + "epoch": 2.919548598471059, + "grad_norm": 9.875, + "learning_rate": 2.0145977056769627e-07, + "loss": 1.1974341869354248, + "step": 8020 + }, + { + "epoch": 2.9202766654532217, + "grad_norm": 21.0, + "learning_rate": 2.014336025253053e-07, + "loss": 1.1174416542053223, + "step": 8022 + }, + { + "epoch": 2.9210047324353843, + "grad_norm": 10.875, + "learning_rate": 2.0140767073390634e-07, + "loss": 1.2318964004516602, + "step": 8024 + }, + { + "epoch": 2.9217327994175464, + "grad_norm": 4.96875, + "learning_rate": 2.0138197520887047e-07, + "loss": 1.1861414909362793, + "step": 8026 + }, + { + "epoch": 2.922460866399709, + "grad_norm": 15.1875, + "learning_rate": 2.013565159654289e-07, + "loss": 1.4336538314819336, + "step": 8028 + }, + { + "epoch": 2.923188933381871, + "grad_norm": 7.625, + "learning_rate": 2.0133129301867241e-07, + "loss": 1.0287431478500366, + "step": 8030 + }, + { + "epoch": 2.9239170003640336, + "grad_norm": 11.1875, + "learning_rate": 2.0130630638355212e-07, + "loss": 1.1671830415725708, + "step": 8032 + }, + { + "epoch": 2.924645067346196, + "grad_norm": 13.375, + "learning_rate": 2.0128155607487884e-07, + "loss": 1.0935924053192139, + "step": 8034 + }, + { + "epoch": 2.925373134328358, + "grad_norm": 11.375, + "learning_rate": 2.012570421073234e-07, + "loss": 1.1813185214996338, + "step": 8036 + }, + { + "epoch": 2.9261012013105203, + "grad_norm": 12.375, + "learning_rate": 2.0123276449541647e-07, + "loss": 1.2623052597045898, + "step": 8038 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 21.0, + "learning_rate": 2.012087232535487e-07, + "loss": 1.4954427480697632, + "step": 8040 + }, + { + "epoch": 2.9275573352748454, + "grad_norm": 9.8125, + "learning_rate": 2.0118491839597052e-07, + "loss": 1.6538885831832886, + "step": 8042 + }, + { + "epoch": 2.9282854022570075, + "grad_norm": 7.15625, + "learning_rate": 2.0116134993679232e-07, + "loss": 1.119826316833496, + "step": 8044 + }, + { + "epoch": 2.92901346923917, + "grad_norm": 10.125, + "learning_rate": 2.011380178899844e-07, + "loss": 1.2889474630355835, + "step": 8046 + }, + { + "epoch": 2.929741536221332, + "grad_norm": 7.84375, + "learning_rate": 2.011149222693768e-07, + "loss": 1.6262377500534058, + "step": 8048 + }, + { + "epoch": 2.9304696032034947, + "grad_norm": 6.25, + "learning_rate": 2.0109206308865955e-07, + "loss": 1.217801570892334, + "step": 8050 + }, + { + "epoch": 2.9311976701856572, + "grad_norm": 13.25, + "learning_rate": 2.010694403613824e-07, + "loss": 1.1254884004592896, + "step": 8052 + }, + { + "epoch": 2.9319257371678193, + "grad_norm": 7.78125, + "learning_rate": 2.0104705410095515e-07, + "loss": 1.5584194660186768, + "step": 8054 + }, + { + "epoch": 2.932653804149982, + "grad_norm": 10.8125, + "learning_rate": 2.0102490432064717e-07, + "loss": 1.4174740314483643, + "step": 8056 + }, + { + "epoch": 2.933381871132144, + "grad_norm": 5.40625, + "learning_rate": 2.010029910335878e-07, + "loss": 1.1874972581863403, + "step": 8058 + }, + { + "epoch": 2.9341099381143065, + "grad_norm": 13.4375, + "learning_rate": 2.009813142527663e-07, + "loss": 1.182803750038147, + "step": 8060 + }, + { + "epoch": 2.934838005096469, + "grad_norm": 10.125, + "learning_rate": 2.0095987399103142e-07, + "loss": 1.2599871158599854, + "step": 8062 + }, + { + "epoch": 2.935566072078631, + "grad_norm": 15.75, + "learning_rate": 2.0093867026109213e-07, + "loss": 1.8108468055725098, + "step": 8064 + }, + { + "epoch": 2.9362941390607937, + "grad_norm": 11.6875, + "learning_rate": 2.0091770307551683e-07, + "loss": 1.561493158340454, + "step": 8066 + }, + { + "epoch": 2.937022206042956, + "grad_norm": 6.40625, + "learning_rate": 2.0089697244673394e-07, + "loss": 1.162840723991394, + "step": 8068 + }, + { + "epoch": 2.9377502730251184, + "grad_norm": 18.625, + "learning_rate": 2.0087647838703155e-07, + "loss": 1.142324447631836, + "step": 8070 + }, + { + "epoch": 2.938478340007281, + "grad_norm": 11.5, + "learning_rate": 2.0085622090855757e-07, + "loss": 1.5515663623809814, + "step": 8072 + }, + { + "epoch": 2.939206406989443, + "grad_norm": 15.4375, + "learning_rate": 2.008362000233196e-07, + "loss": 1.319309949874878, + "step": 8074 + }, + { + "epoch": 2.939934473971605, + "grad_norm": 10.125, + "learning_rate": 2.0081641574318508e-07, + "loss": 1.2555091381072998, + "step": 8076 + }, + { + "epoch": 2.9406625409537677, + "grad_norm": 11.6875, + "learning_rate": 2.007968680798813e-07, + "loss": 1.221092939376831, + "step": 8078 + }, + { + "epoch": 2.94139060793593, + "grad_norm": 26.625, + "learning_rate": 2.00777557044995e-07, + "loss": 1.5880602598190308, + "step": 8080 + }, + { + "epoch": 2.9421186749180923, + "grad_norm": 6.96875, + "learning_rate": 2.007584826499729e-07, + "loss": 1.3323652744293213, + "step": 8082 + }, + { + "epoch": 2.942846741900255, + "grad_norm": 17.75, + "learning_rate": 2.0073964490612145e-07, + "loss": 1.4056622982025146, + "step": 8084 + }, + { + "epoch": 2.943574808882417, + "grad_norm": 16.5, + "learning_rate": 2.0072104382460668e-07, + "loss": 1.2395819425582886, + "step": 8086 + }, + { + "epoch": 2.9443028758645795, + "grad_norm": 11.0625, + "learning_rate": 2.007026794164544e-07, + "loss": 1.0900870561599731, + "step": 8088 + }, + { + "epoch": 2.945030942846742, + "grad_norm": 7.0, + "learning_rate": 2.0068455169255022e-07, + "loss": 1.25791597366333, + "step": 8090 + }, + { + "epoch": 2.945759009828904, + "grad_norm": 14.625, + "learning_rate": 2.0066666066363935e-07, + "loss": 1.4491243362426758, + "step": 8092 + }, + { + "epoch": 2.9464870768110667, + "grad_norm": 11.625, + "learning_rate": 2.0064900634032675e-07, + "loss": 1.7856603860855103, + "step": 8094 + }, + { + "epoch": 2.947215143793229, + "grad_norm": 17.25, + "learning_rate": 2.00631588733077e-07, + "loss": 1.2640631198883057, + "step": 8096 + }, + { + "epoch": 2.9479432107753913, + "grad_norm": 7.28125, + "learning_rate": 2.0061440785221448e-07, + "loss": 1.59052574634552, + "step": 8098 + }, + { + "epoch": 2.948671277757554, + "grad_norm": 31.375, + "learning_rate": 2.0059746370792313e-07, + "loss": 1.271122932434082, + "step": 8100 + }, + { + "epoch": 2.949399344739716, + "grad_norm": 13.625, + "learning_rate": 2.0058075631024668e-07, + "loss": 1.4843995571136475, + "step": 8102 + }, + { + "epoch": 2.9501274117218785, + "grad_norm": 25.0, + "learning_rate": 2.0056428566908845e-07, + "loss": 0.9613720178604126, + "step": 8104 + }, + { + "epoch": 2.9508554787040406, + "grad_norm": 9.3125, + "learning_rate": 2.0054805179421142e-07, + "loss": 0.9434951543807983, + "step": 8106 + }, + { + "epoch": 2.951583545686203, + "grad_norm": 9.4375, + "learning_rate": 2.005320546952383e-07, + "loss": 1.2604600191116333, + "step": 8108 + }, + { + "epoch": 2.9523116126683657, + "grad_norm": 17.625, + "learning_rate": 2.0051629438165137e-07, + "loss": 1.3642584085464478, + "step": 8110 + }, + { + "epoch": 2.953039679650528, + "grad_norm": 15.75, + "learning_rate": 2.0050077086279257e-07, + "loss": 1.7942960262298584, + "step": 8112 + }, + { + "epoch": 2.9537677466326904, + "grad_norm": 15.9375, + "learning_rate": 2.0048548414786357e-07, + "loss": 1.5137214660644531, + "step": 8114 + }, + { + "epoch": 2.9544958136148525, + "grad_norm": 20.125, + "learning_rate": 2.004704342459255e-07, + "loss": 1.4830505847930908, + "step": 8116 + }, + { + "epoch": 2.955223880597015, + "grad_norm": 10.5, + "learning_rate": 2.0045562116589927e-07, + "loss": 1.4270108938217163, + "step": 8118 + }, + { + "epoch": 2.955951947579177, + "grad_norm": 17.5, + "learning_rate": 2.0044104491656537e-07, + "loss": 1.327343463897705, + "step": 8120 + }, + { + "epoch": 2.9566800145613397, + "grad_norm": 15.625, + "learning_rate": 2.0042670550656389e-07, + "loss": 1.6308454275131226, + "step": 8122 + }, + { + "epoch": 2.9574080815435018, + "grad_norm": 61.0, + "learning_rate": 2.0041260294439448e-07, + "loss": 1.6157197952270508, + "step": 8124 + }, + { + "epoch": 2.9581361485256643, + "grad_norm": 14.8125, + "learning_rate": 2.0039873723841658e-07, + "loss": 1.2945884466171265, + "step": 8126 + }, + { + "epoch": 2.958864215507827, + "grad_norm": 11.4375, + "learning_rate": 2.0038510839684904e-07, + "loss": 1.4645934104919434, + "step": 8128 + }, + { + "epoch": 2.959592282489989, + "grad_norm": 12.4375, + "learning_rate": 2.0037171642777034e-07, + "loss": 1.0213232040405273, + "step": 8130 + }, + { + "epoch": 2.9603203494721515, + "grad_norm": 16.5, + "learning_rate": 2.0035856133911874e-07, + "loss": 1.4488648176193237, + "step": 8132 + }, + { + "epoch": 2.9610484164543136, + "grad_norm": 13.9375, + "learning_rate": 2.0034564313869176e-07, + "loss": 1.417912244796753, + "step": 8134 + }, + { + "epoch": 2.961776483436476, + "grad_norm": 4.78125, + "learning_rate": 2.0033296183414687e-07, + "loss": 1.284705400466919, + "step": 8136 + }, + { + "epoch": 2.9625045504186387, + "grad_norm": 9.375, + "learning_rate": 2.0032051743300076e-07, + "loss": 1.2809350490570068, + "step": 8138 + }, + { + "epoch": 2.963232617400801, + "grad_norm": 23.0, + "learning_rate": 2.0030830994263004e-07, + "loss": 0.8776845932006836, + "step": 8140 + }, + { + "epoch": 2.9639606843829633, + "grad_norm": 24.0, + "learning_rate": 2.0029633937027057e-07, + "loss": 1.6562855243682861, + "step": 8142 + }, + { + "epoch": 2.9646887513651254, + "grad_norm": 32.25, + "learning_rate": 2.0028460572301805e-07, + "loss": 0.9472579956054688, + "step": 8144 + }, + { + "epoch": 2.965416818347288, + "grad_norm": 8.9375, + "learning_rate": 2.0027310900782757e-07, + "loss": 1.3056223392486572, + "step": 8146 + }, + { + "epoch": 2.9661448853294505, + "grad_norm": 27.75, + "learning_rate": 2.0026184923151386e-07, + "loss": 1.4213675260543823, + "step": 8148 + }, + { + "epoch": 2.9668729523116126, + "grad_norm": 8.0625, + "learning_rate": 2.002508264007511e-07, + "loss": 1.3773770332336426, + "step": 8150 + }, + { + "epoch": 2.967601019293775, + "grad_norm": 14.1875, + "learning_rate": 2.002400405220732e-07, + "loss": 1.345163106918335, + "step": 8152 + }, + { + "epoch": 2.9683290862759373, + "grad_norm": 8.5, + "learning_rate": 2.002294916018734e-07, + "loss": 1.4528335332870483, + "step": 8154 + }, + { + "epoch": 2.9690571532581, + "grad_norm": 15.375, + "learning_rate": 2.0021917964640466e-07, + "loss": 1.4538681507110596, + "step": 8156 + }, + { + "epoch": 2.9697852202402624, + "grad_norm": 8.875, + "learning_rate": 2.0020910466177937e-07, + "loss": 1.345186471939087, + "step": 8158 + }, + { + "epoch": 2.9705132872224245, + "grad_norm": 14.625, + "learning_rate": 2.0019926665396954e-07, + "loss": 1.4220768213272095, + "step": 8160 + }, + { + "epoch": 2.9712413542045866, + "grad_norm": 12.125, + "learning_rate": 2.0018966562880666e-07, + "loss": 1.6046688556671143, + "step": 8162 + }, + { + "epoch": 2.971969421186749, + "grad_norm": 16.5, + "learning_rate": 2.0018030159198176e-07, + "loss": 1.6569523811340332, + "step": 8164 + }, + { + "epoch": 2.9726974881689117, + "grad_norm": 18.625, + "learning_rate": 2.0017117454904538e-07, + "loss": 1.3723821640014648, + "step": 8166 + }, + { + "epoch": 2.9734255551510738, + "grad_norm": 16.0, + "learning_rate": 2.0016228450540753e-07, + "loss": 1.2696672677993774, + "step": 8168 + }, + { + "epoch": 2.9741536221332363, + "grad_norm": 11.25, + "learning_rate": 2.001536314663379e-07, + "loss": 1.3820139169692993, + "step": 8170 + }, + { + "epoch": 2.9748816891153984, + "grad_norm": 5.625, + "learning_rate": 2.001452154369656e-07, + "loss": 1.1052289009094238, + "step": 8172 + }, + { + "epoch": 2.975609756097561, + "grad_norm": 32.75, + "learning_rate": 2.0013703642227916e-07, + "loss": 1.206451654434204, + "step": 8174 + }, + { + "epoch": 2.9763378230797235, + "grad_norm": 33.5, + "learning_rate": 2.0012909442712674e-07, + "loss": 1.2332465648651123, + "step": 8176 + }, + { + "epoch": 2.9770658900618856, + "grad_norm": 84.0, + "learning_rate": 2.0012138945621597e-07, + "loss": 1.5577616691589355, + "step": 8178 + }, + { + "epoch": 2.977793957044048, + "grad_norm": 10.6875, + "learning_rate": 2.0011392151411402e-07, + "loss": 1.2479498386383057, + "step": 8180 + }, + { + "epoch": 2.9785220240262102, + "grad_norm": 22.0, + "learning_rate": 2.0010669060524754e-07, + "loss": 1.9723505973815918, + "step": 8182 + }, + { + "epoch": 2.979250091008373, + "grad_norm": 59.25, + "learning_rate": 2.000996967339026e-07, + "loss": 1.0834282636642456, + "step": 8184 + }, + { + "epoch": 2.9799781579905353, + "grad_norm": 21.125, + "learning_rate": 2.0009293990422483e-07, + "loss": 1.4358558654785156, + "step": 8186 + }, + { + "epoch": 2.9807062249726974, + "grad_norm": 22.875, + "learning_rate": 2.0008642012021938e-07, + "loss": 0.871586799621582, + "step": 8188 + }, + { + "epoch": 2.98143429195486, + "grad_norm": 15.5, + "learning_rate": 2.0008013738575095e-07, + "loss": 1.1824800968170166, + "step": 8190 + }, + { + "epoch": 2.982162358937022, + "grad_norm": 8.875, + "learning_rate": 2.000740917045435e-07, + "loss": 1.048284649848938, + "step": 8192 + }, + { + "epoch": 2.9828904259191846, + "grad_norm": 11.5625, + "learning_rate": 2.0006828308018068e-07, + "loss": 1.532322645187378, + "step": 8194 + }, + { + "epoch": 2.983618492901347, + "grad_norm": 8.5625, + "learning_rate": 2.0006271151610559e-07, + "loss": 1.2561935186386108, + "step": 8196 + }, + { + "epoch": 2.9843465598835093, + "grad_norm": 13.9375, + "learning_rate": 2.0005737701562076e-07, + "loss": 1.5992321968078613, + "step": 8198 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 8.6875, + "learning_rate": 2.0005227958188819e-07, + "loss": 1.2346701622009277, + "step": 8200 + }, + { + "epoch": 2.985802693847834, + "grad_norm": 46.5, + "learning_rate": 2.0004741921792946e-07, + "loss": 1.3434027433395386, + "step": 8202 + }, + { + "epoch": 2.9865307608299965, + "grad_norm": 19.25, + "learning_rate": 2.0004279592662552e-07, + "loss": 0.5224310159683228, + "step": 8204 + }, + { + "epoch": 2.9872588278121586, + "grad_norm": 24.375, + "learning_rate": 2.0003840971071685e-07, + "loss": 1.4334192276000977, + "step": 8206 + }, + { + "epoch": 2.987986894794321, + "grad_norm": 9.875, + "learning_rate": 2.0003426057280335e-07, + "loss": 1.3835971355438232, + "step": 8208 + }, + { + "epoch": 2.988714961776483, + "grad_norm": 22.125, + "learning_rate": 2.0003034851534447e-07, + "loss": 1.3636739253997803, + "step": 8210 + }, + { + "epoch": 2.9894430287586458, + "grad_norm": 14.0, + "learning_rate": 2.0002667354065912e-07, + "loss": 1.2687699794769287, + "step": 8212 + }, + { + "epoch": 2.9901710957408083, + "grad_norm": 11.125, + "learning_rate": 2.0002323565092555e-07, + "loss": 1.2468979358673096, + "step": 8214 + }, + { + "epoch": 2.9908991627229704, + "grad_norm": 10.125, + "learning_rate": 2.0002003484818166e-07, + "loss": 1.1343700885772705, + "step": 8216 + }, + { + "epoch": 2.991627229705133, + "grad_norm": 17.0, + "learning_rate": 2.0001707113432474e-07, + "loss": 1.3773187398910522, + "step": 8218 + }, + { + "epoch": 2.992355296687295, + "grad_norm": 12.0, + "learning_rate": 2.0001434451111144e-07, + "loss": 1.424833059310913, + "step": 8220 + }, + { + "epoch": 2.9930833636694576, + "grad_norm": 18.5, + "learning_rate": 2.0001185498015808e-07, + "loss": 1.360491394996643, + "step": 8222 + }, + { + "epoch": 2.99381143065162, + "grad_norm": 27.625, + "learning_rate": 2.0000960254294027e-07, + "loss": 1.6592447757720947, + "step": 8224 + }, + { + "epoch": 2.9945394976337822, + "grad_norm": 15.75, + "learning_rate": 2.0000758720079314e-07, + "loss": 1.401857614517212, + "step": 8226 + }, + { + "epoch": 2.995267564615945, + "grad_norm": 53.75, + "learning_rate": 2.0000580895491137e-07, + "loss": 1.2038774490356445, + "step": 8228 + }, + { + "epoch": 2.995995631598107, + "grad_norm": 34.0, + "learning_rate": 2.000042678063489e-07, + "loss": 1.4040567874908447, + "step": 8230 + }, + { + "epoch": 2.9967236985802694, + "grad_norm": 20.625, + "learning_rate": 2.0000296375601933e-07, + "loss": 1.103801965713501, + "step": 8232 + }, + { + "epoch": 2.997451765562432, + "grad_norm": 18.375, + "learning_rate": 2.000018968046956e-07, + "loss": 1.243554711341858, + "step": 8234 + }, + { + "epoch": 2.998179832544594, + "grad_norm": 16.375, + "learning_rate": 2.0000106695301024e-07, + "loss": 1.325512409210205, + "step": 8236 + }, + { + "epoch": 2.9989078995267566, + "grad_norm": 12.5, + "learning_rate": 2.00000474201455e-07, + "loss": 1.2837302684783936, + "step": 8238 + }, + { + "epoch": 2.9996359665089187, + "grad_norm": 12.0625, + "learning_rate": 2.0000011855038136e-07, + "loss": 1.7418195009231567, + "step": 8240 + }, + { + "epoch": 3.0, + "step": 8241, + "total_flos": 3.2303951327715656e+18, + "train_loss": 1.376050503663886, + "train_runtime": 17603.8516, + "train_samples_per_second": 1.872, + "train_steps_per_second": 0.468 + } + ], + "logging_steps": 2, + "max_steps": 8241, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.2303951327715656e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}