{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 6664, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 8.257464408874512, "eval_runtime": 3786.6179, "eval_samples_per_second": 12.236, "eval_steps_per_second": 0.765, "step": 0 }, { "epoch": 0.00030012004801920766, "grad_norm": 18.56479835510254, "learning_rate": 0.0, "loss": 4.7242, "step": 1 }, { "epoch": 0.0006002400960384153, "grad_norm": 49.334659576416016, "learning_rate": 3.003003003003003e-07, "loss": 5.5074, "step": 2 }, { "epoch": 0.000900360144057623, "grad_norm": 20.299957275390625, "learning_rate": 6.006006006006006e-07, "loss": 4.7201, "step": 3 }, { "epoch": 0.0012004801920768306, "grad_norm": 21.796030044555664, "learning_rate": 9.00900900900901e-07, "loss": 4.9276, "step": 4 }, { "epoch": 0.0015006002400960385, "grad_norm": 61.833614349365234, "learning_rate": 1.2012012012012013e-06, "loss": 5.5694, "step": 5 }, { "epoch": 0.001800720288115246, "grad_norm": 19.953771591186523, "learning_rate": 1.5015015015015015e-06, "loss": 4.7228, "step": 6 }, { "epoch": 0.0021008403361344537, "grad_norm": 26.493492126464844, "learning_rate": 1.801801801801802e-06, "loss": 5.1359, "step": 7 }, { "epoch": 0.0024009603841536613, "grad_norm": 16.626035690307617, "learning_rate": 2.102102102102102e-06, "loss": 4.5624, "step": 8 }, { "epoch": 0.0027010804321728693, "grad_norm": 19.844329833984375, "learning_rate": 2.4024024024024026e-06, "loss": 4.5288, "step": 9 }, { "epoch": 0.003001200480192077, "grad_norm": 19.03563690185547, "learning_rate": 2.702702702702703e-06, "loss": 4.8509, "step": 10 }, { "epoch": 0.0033013205282112846, "grad_norm": 12.096508979797363, "learning_rate": 3.003003003003003e-06, "loss": 4.272, "step": 11 }, { "epoch": 0.003601440576230492, "grad_norm": 13.593870162963867, "learning_rate": 3.3033033033033035e-06, "loss": 4.7412, "step": 12 }, { "epoch": 0.0039015606242496998, "grad_norm": 10.979002952575684, "learning_rate": 3.603603603603604e-06, "loss": 4.236, "step": 13 }, { "epoch": 0.004201680672268907, "grad_norm": 7.361409664154053, "learning_rate": 3.903903903903904e-06, "loss": 4.085, "step": 14 }, { "epoch": 0.004501800720288115, "grad_norm": 6.058810234069824, "learning_rate": 4.204204204204204e-06, "loss": 3.9739, "step": 15 }, { "epoch": 0.004801920768307323, "grad_norm": 4.1888227462768555, "learning_rate": 4.504504504504505e-06, "loss": 3.7215, "step": 16 }, { "epoch": 0.00510204081632653, "grad_norm": 3.432919979095459, "learning_rate": 4.804804804804805e-06, "loss": 3.7775, "step": 17 }, { "epoch": 0.005402160864345739, "grad_norm": 2.8430354595184326, "learning_rate": 5.105105105105106e-06, "loss": 3.692, "step": 18 }, { "epoch": 0.005702280912364946, "grad_norm": 2.4966061115264893, "learning_rate": 5.405405405405406e-06, "loss": 3.5742, "step": 19 }, { "epoch": 0.006002400960384154, "grad_norm": 2.3826422691345215, "learning_rate": 5.705705705705706e-06, "loss": 3.4681, "step": 20 }, { "epoch": 0.0063025210084033615, "grad_norm": 2.0872232913970947, "learning_rate": 6.006006006006006e-06, "loss": 3.5767, "step": 21 }, { "epoch": 0.006602641056422569, "grad_norm": 1.654006838798523, "learning_rate": 6.306306306306306e-06, "loss": 3.0309, "step": 22 }, { "epoch": 0.006902761104441777, "grad_norm": 1.6882315874099731, "learning_rate": 6.606606606606607e-06, "loss": 3.2576, "step": 23 }, { "epoch": 0.007202881152460984, "grad_norm": 1.6118031740188599, "learning_rate": 6.906906906906907e-06, "loss": 3.0787, "step": 24 }, { "epoch": 0.007503001200480192, "grad_norm": 1.591232419013977, "learning_rate": 7.207207207207208e-06, "loss": 3.1461, "step": 25 }, { "epoch": 0.0078031212484993995, "grad_norm": 1.4966458082199097, "learning_rate": 7.507507507507508e-06, "loss": 3.165, "step": 26 }, { "epoch": 0.008103241296518607, "grad_norm": 1.3096973896026611, "learning_rate": 7.807807807807808e-06, "loss": 3.0777, "step": 27 }, { "epoch": 0.008403361344537815, "grad_norm": 1.3433228731155396, "learning_rate": 8.108108108108109e-06, "loss": 3.1098, "step": 28 }, { "epoch": 0.008703481392557022, "grad_norm": 1.3368873596191406, "learning_rate": 8.408408408408409e-06, "loss": 2.9624, "step": 29 }, { "epoch": 0.00900360144057623, "grad_norm": 1.3983137607574463, "learning_rate": 8.708708708708708e-06, "loss": 2.434, "step": 30 }, { "epoch": 0.009303721488595438, "grad_norm": 1.2248708009719849, "learning_rate": 9.00900900900901e-06, "loss": 2.7106, "step": 31 }, { "epoch": 0.009603841536614645, "grad_norm": 1.2953872680664062, "learning_rate": 9.309309309309309e-06, "loss": 3.0944, "step": 32 }, { "epoch": 0.009903961584633853, "grad_norm": 1.2448747158050537, "learning_rate": 9.60960960960961e-06, "loss": 3.0083, "step": 33 }, { "epoch": 0.01020408163265306, "grad_norm": 1.2225483655929565, "learning_rate": 9.90990990990991e-06, "loss": 2.8514, "step": 34 }, { "epoch": 0.01050420168067227, "grad_norm": 1.0415458679199219, "learning_rate": 1.0210210210210211e-05, "loss": 2.5665, "step": 35 }, { "epoch": 0.010804321728691477, "grad_norm": 1.1476035118103027, "learning_rate": 1.051051051051051e-05, "loss": 2.6508, "step": 36 }, { "epoch": 0.011104441776710685, "grad_norm": 1.2470183372497559, "learning_rate": 1.0810810810810812e-05, "loss": 2.5615, "step": 37 }, { "epoch": 0.011404561824729893, "grad_norm": 0.9873207807540894, "learning_rate": 1.1111111111111112e-05, "loss": 2.371, "step": 38 }, { "epoch": 0.0117046818727491, "grad_norm": 1.2748939990997314, "learning_rate": 1.1411411411411411e-05, "loss": 2.4716, "step": 39 }, { "epoch": 0.012004801920768308, "grad_norm": 1.439241886138916, "learning_rate": 1.1711711711711713e-05, "loss": 2.4789, "step": 40 }, { "epoch": 0.012304921968787515, "grad_norm": 1.3756464719772339, "learning_rate": 1.2012012012012012e-05, "loss": 2.3341, "step": 41 }, { "epoch": 0.012605042016806723, "grad_norm": 0.8937436938285828, "learning_rate": 1.2312312312312313e-05, "loss": 1.9548, "step": 42 }, { "epoch": 0.01290516206482593, "grad_norm": 1.1486737728118896, "learning_rate": 1.2612612612612611e-05, "loss": 2.2497, "step": 43 }, { "epoch": 0.013205282112845138, "grad_norm": 1.0456668138504028, "learning_rate": 1.2912912912912914e-05, "loss": 2.256, "step": 44 }, { "epoch": 0.013505402160864346, "grad_norm": 1.137314796447754, "learning_rate": 1.3213213213213214e-05, "loss": 2.0677, "step": 45 }, { "epoch": 0.013805522208883553, "grad_norm": 1.262323021888733, "learning_rate": 1.3513513513513515e-05, "loss": 2.0702, "step": 46 }, { "epoch": 0.014105642256902761, "grad_norm": 0.7741314768791199, "learning_rate": 1.3813813813813815e-05, "loss": 1.7912, "step": 47 }, { "epoch": 0.014405762304921969, "grad_norm": 1.0190695524215698, "learning_rate": 1.4114114114114116e-05, "loss": 1.9836, "step": 48 }, { "epoch": 0.014705882352941176, "grad_norm": 1.171444296836853, "learning_rate": 1.4414414414414416e-05, "loss": 2.2635, "step": 49 }, { "epoch": 0.015006002400960384, "grad_norm": 0.9180420637130737, "learning_rate": 1.4714714714714713e-05, "loss": 2.0183, "step": 50 }, { "epoch": 0.015306122448979591, "grad_norm": 0.8563716411590576, "learning_rate": 1.5015015015015016e-05, "loss": 1.8753, "step": 51 }, { "epoch": 0.015606242496998799, "grad_norm": 0.8259108662605286, "learning_rate": 1.5315315315315316e-05, "loss": 1.771, "step": 52 }, { "epoch": 0.015906362545018007, "grad_norm": 0.8018758296966553, "learning_rate": 1.5615615615615616e-05, "loss": 1.6323, "step": 53 }, { "epoch": 0.016206482593037214, "grad_norm": 0.985905647277832, "learning_rate": 1.5915915915915915e-05, "loss": 1.8725, "step": 54 }, { "epoch": 0.016506602641056422, "grad_norm": 0.8832401037216187, "learning_rate": 1.6216216216216218e-05, "loss": 1.7508, "step": 55 }, { "epoch": 0.01680672268907563, "grad_norm": 0.8357267379760742, "learning_rate": 1.6516516516516518e-05, "loss": 1.5477, "step": 56 }, { "epoch": 0.017106842737094837, "grad_norm": 0.8062194585800171, "learning_rate": 1.6816816816816817e-05, "loss": 1.4678, "step": 57 }, { "epoch": 0.017406962785114045, "grad_norm": 0.9643778800964355, "learning_rate": 1.7117117117117117e-05, "loss": 1.5877, "step": 58 }, { "epoch": 0.017707082833133252, "grad_norm": 0.8363720774650574, "learning_rate": 1.7417417417417416e-05, "loss": 1.474, "step": 59 }, { "epoch": 0.01800720288115246, "grad_norm": 0.770699143409729, "learning_rate": 1.771771771771772e-05, "loss": 1.3455, "step": 60 }, { "epoch": 0.018307322929171668, "grad_norm": 0.948675274848938, "learning_rate": 1.801801801801802e-05, "loss": 1.4573, "step": 61 }, { "epoch": 0.018607442977190875, "grad_norm": 0.7970764636993408, "learning_rate": 1.831831831831832e-05, "loss": 1.1989, "step": 62 }, { "epoch": 0.018907563025210083, "grad_norm": 0.7488381862640381, "learning_rate": 1.8618618618618618e-05, "loss": 1.2167, "step": 63 }, { "epoch": 0.01920768307322929, "grad_norm": 0.7332976460456848, "learning_rate": 1.891891891891892e-05, "loss": 1.2067, "step": 64 }, { "epoch": 0.019507803121248498, "grad_norm": 0.6228386163711548, "learning_rate": 1.921921921921922e-05, "loss": 1.0552, "step": 65 }, { "epoch": 0.019807923169267706, "grad_norm": 0.5040204524993896, "learning_rate": 1.951951951951952e-05, "loss": 1.0225, "step": 66 }, { "epoch": 0.020108043217286913, "grad_norm": 0.638465404510498, "learning_rate": 1.981981981981982e-05, "loss": 1.1401, "step": 67 }, { "epoch": 0.02040816326530612, "grad_norm": 0.5351008176803589, "learning_rate": 2.012012012012012e-05, "loss": 1.0391, "step": 68 }, { "epoch": 0.02070828331332533, "grad_norm": 0.491290420293808, "learning_rate": 2.0420420420420422e-05, "loss": 1.0132, "step": 69 }, { "epoch": 0.02100840336134454, "grad_norm": 0.6342625617980957, "learning_rate": 2.0720720720720722e-05, "loss": 0.9974, "step": 70 }, { "epoch": 0.021308523409363747, "grad_norm": 0.4186214804649353, "learning_rate": 2.102102102102102e-05, "loss": 0.8443, "step": 71 }, { "epoch": 0.021608643457382955, "grad_norm": 0.4734509289264679, "learning_rate": 2.132132132132132e-05, "loss": 0.9307, "step": 72 }, { "epoch": 0.021908763505402162, "grad_norm": 0.39881131052970886, "learning_rate": 2.1621621621621624e-05, "loss": 0.9182, "step": 73 }, { "epoch": 0.02220888355342137, "grad_norm": 0.37704476714134216, "learning_rate": 2.1921921921921924e-05, "loss": 0.905, "step": 74 }, { "epoch": 0.022509003601440578, "grad_norm": 0.3518320918083191, "learning_rate": 2.2222222222222223e-05, "loss": 0.847, "step": 75 }, { "epoch": 0.022809123649459785, "grad_norm": 0.4406863749027252, "learning_rate": 2.2522522522522523e-05, "loss": 0.8292, "step": 76 }, { "epoch": 0.023109243697478993, "grad_norm": 0.31686627864837646, "learning_rate": 2.2822822822822822e-05, "loss": 0.893, "step": 77 }, { "epoch": 0.0234093637454982, "grad_norm": 0.3058432936668396, "learning_rate": 2.3123123123123125e-05, "loss": 0.8069, "step": 78 }, { "epoch": 0.023709483793517408, "grad_norm": 0.32039502263069153, "learning_rate": 2.3423423423423425e-05, "loss": 0.8222, "step": 79 }, { "epoch": 0.024009603841536616, "grad_norm": 0.29504913091659546, "learning_rate": 2.3723723723723725e-05, "loss": 0.8688, "step": 80 }, { "epoch": 0.024309723889555823, "grad_norm": 0.26919642090797424, "learning_rate": 2.4024024024024024e-05, "loss": 0.7322, "step": 81 }, { "epoch": 0.02460984393757503, "grad_norm": 0.32023561000823975, "learning_rate": 2.4324324324324327e-05, "loss": 0.7903, "step": 82 }, { "epoch": 0.02490996398559424, "grad_norm": 0.30055469274520874, "learning_rate": 2.4624624624624627e-05, "loss": 0.8126, "step": 83 }, { "epoch": 0.025210084033613446, "grad_norm": 0.2616680860519409, "learning_rate": 2.4924924924924926e-05, "loss": 0.7882, "step": 84 }, { "epoch": 0.025510204081632654, "grad_norm": 0.2315160185098648, "learning_rate": 2.5225225225225222e-05, "loss": 0.7482, "step": 85 }, { "epoch": 0.02581032412965186, "grad_norm": 0.28771960735321045, "learning_rate": 2.552552552552553e-05, "loss": 0.8175, "step": 86 }, { "epoch": 0.02611044417767107, "grad_norm": 0.240019753575325, "learning_rate": 2.582582582582583e-05, "loss": 0.7051, "step": 87 }, { "epoch": 0.026410564225690276, "grad_norm": 0.4383765161037445, "learning_rate": 2.6126126126126128e-05, "loss": 0.72, "step": 88 }, { "epoch": 0.026710684273709484, "grad_norm": 0.23066319525241852, "learning_rate": 2.6426426426426428e-05, "loss": 0.7018, "step": 89 }, { "epoch": 0.02701080432172869, "grad_norm": 0.2606968581676483, "learning_rate": 2.672672672672673e-05, "loss": 0.7069, "step": 90 }, { "epoch": 0.0273109243697479, "grad_norm": 0.234344020485878, "learning_rate": 2.702702702702703e-05, "loss": 0.737, "step": 91 }, { "epoch": 0.027611044417767107, "grad_norm": 0.2612946629524231, "learning_rate": 2.732732732732733e-05, "loss": 0.7118, "step": 92 }, { "epoch": 0.027911164465786314, "grad_norm": 0.23396819829940796, "learning_rate": 2.762762762762763e-05, "loss": 0.6997, "step": 93 }, { "epoch": 0.028211284513805522, "grad_norm": 0.2197222113609314, "learning_rate": 2.7927927927927926e-05, "loss": 0.6989, "step": 94 }, { "epoch": 0.02851140456182473, "grad_norm": 0.21125797927379608, "learning_rate": 2.8228228228228232e-05, "loss": 0.7102, "step": 95 }, { "epoch": 0.028811524609843937, "grad_norm": 0.18250249326229095, "learning_rate": 2.852852852852853e-05, "loss": 0.6388, "step": 96 }, { "epoch": 0.029111644657863145, "grad_norm": 0.2123304009437561, "learning_rate": 2.882882882882883e-05, "loss": 0.7159, "step": 97 }, { "epoch": 0.029411764705882353, "grad_norm": 0.19415165483951569, "learning_rate": 2.912912912912913e-05, "loss": 0.6594, "step": 98 }, { "epoch": 0.02971188475390156, "grad_norm": 0.1903056651353836, "learning_rate": 2.9429429429429427e-05, "loss": 0.712, "step": 99 }, { "epoch": 0.030012004801920768, "grad_norm": 0.17540313303470612, "learning_rate": 2.9729729729729733e-05, "loss": 0.608, "step": 100 }, { "epoch": 0.030312124849939975, "grad_norm": 0.1896171122789383, "learning_rate": 3.0030030030030033e-05, "loss": 0.711, "step": 101 }, { "epoch": 0.030612244897959183, "grad_norm": 0.18761886656284332, "learning_rate": 3.0330330330330332e-05, "loss": 0.6884, "step": 102 }, { "epoch": 0.03091236494597839, "grad_norm": 0.20305292308330536, "learning_rate": 3.063063063063063e-05, "loss": 0.6427, "step": 103 }, { "epoch": 0.031212484993997598, "grad_norm": 0.1764855533838272, "learning_rate": 3.093093093093093e-05, "loss": 0.6063, "step": 104 }, { "epoch": 0.031512605042016806, "grad_norm": 0.19730336964130402, "learning_rate": 3.123123123123123e-05, "loss": 0.6546, "step": 105 }, { "epoch": 0.03181272509003601, "grad_norm": 0.24252024292945862, "learning_rate": 3.153153153153153e-05, "loss": 0.6414, "step": 106 }, { "epoch": 0.03211284513805522, "grad_norm": 0.17647583782672882, "learning_rate": 3.183183183183183e-05, "loss": 0.6475, "step": 107 }, { "epoch": 0.03241296518607443, "grad_norm": 0.22339290380477905, "learning_rate": 3.213213213213213e-05, "loss": 0.6326, "step": 108 }, { "epoch": 0.032713085234093636, "grad_norm": 0.18274854123592377, "learning_rate": 3.2432432432432436e-05, "loss": 0.6822, "step": 109 }, { "epoch": 0.033013205282112844, "grad_norm": 0.19241338968276978, "learning_rate": 3.2732732732732736e-05, "loss": 0.6295, "step": 110 }, { "epoch": 0.03331332533013205, "grad_norm": 0.16756710410118103, "learning_rate": 3.3033033033033035e-05, "loss": 0.5989, "step": 111 }, { "epoch": 0.03361344537815126, "grad_norm": 0.1821005493402481, "learning_rate": 3.3333333333333335e-05, "loss": 0.6185, "step": 112 }, { "epoch": 0.03391356542617047, "grad_norm": 0.19916538894176483, "learning_rate": 3.3633633633633635e-05, "loss": 0.62, "step": 113 }, { "epoch": 0.034213685474189674, "grad_norm": 0.18223139643669128, "learning_rate": 3.3933933933933934e-05, "loss": 0.6281, "step": 114 }, { "epoch": 0.03451380552220888, "grad_norm": 0.20026777684688568, "learning_rate": 3.4234234234234234e-05, "loss": 0.6476, "step": 115 }, { "epoch": 0.03481392557022809, "grad_norm": 0.18175727128982544, "learning_rate": 3.453453453453453e-05, "loss": 0.6849, "step": 116 }, { "epoch": 0.0351140456182473, "grad_norm": 0.2390904575586319, "learning_rate": 3.483483483483483e-05, "loss": 0.6154, "step": 117 }, { "epoch": 0.035414165666266505, "grad_norm": 0.16057820618152618, "learning_rate": 3.513513513513514e-05, "loss": 0.5825, "step": 118 }, { "epoch": 0.03571428571428571, "grad_norm": 0.15922337770462036, "learning_rate": 3.543543543543544e-05, "loss": 0.62, "step": 119 }, { "epoch": 0.03601440576230492, "grad_norm": 0.23435887694358826, "learning_rate": 3.573573573573574e-05, "loss": 0.6075, "step": 120 }, { "epoch": 0.03631452581032413, "grad_norm": 0.1574699729681015, "learning_rate": 3.603603603603604e-05, "loss": 0.6137, "step": 121 }, { "epoch": 0.036614645858343335, "grad_norm": 0.18464826047420502, "learning_rate": 3.633633633633634e-05, "loss": 0.586, "step": 122 }, { "epoch": 0.03691476590636254, "grad_norm": 0.19280335307121277, "learning_rate": 3.663663663663664e-05, "loss": 0.6323, "step": 123 }, { "epoch": 0.03721488595438175, "grad_norm": 0.17928992211818695, "learning_rate": 3.693693693693694e-05, "loss": 0.6187, "step": 124 }, { "epoch": 0.03751500600240096, "grad_norm": 0.16199256479740143, "learning_rate": 3.7237237237237236e-05, "loss": 0.6087, "step": 125 }, { "epoch": 0.037815126050420166, "grad_norm": 0.18012242019176483, "learning_rate": 3.7537537537537536e-05, "loss": 0.6379, "step": 126 }, { "epoch": 0.03811524609843937, "grad_norm": 0.16345292329788208, "learning_rate": 3.783783783783784e-05, "loss": 0.5887, "step": 127 }, { "epoch": 0.03841536614645858, "grad_norm": 0.19358745217323303, "learning_rate": 3.813813813813814e-05, "loss": 0.5929, "step": 128 }, { "epoch": 0.03871548619447779, "grad_norm": 0.1952536702156067, "learning_rate": 3.843843843843844e-05, "loss": 0.6674, "step": 129 }, { "epoch": 0.039015606242496996, "grad_norm": 0.16455616056919098, "learning_rate": 3.873873873873874e-05, "loss": 0.5442, "step": 130 }, { "epoch": 0.039315726290516204, "grad_norm": 0.19198055565357208, "learning_rate": 3.903903903903904e-05, "loss": 0.6454, "step": 131 }, { "epoch": 0.03961584633853541, "grad_norm": 0.1654767543077469, "learning_rate": 3.933933933933934e-05, "loss": 0.6032, "step": 132 }, { "epoch": 0.03991596638655462, "grad_norm": 0.17861434817314148, "learning_rate": 3.963963963963964e-05, "loss": 0.6412, "step": 133 }, { "epoch": 0.040216086434573826, "grad_norm": 0.1698954701423645, "learning_rate": 3.993993993993994e-05, "loss": 0.5854, "step": 134 }, { "epoch": 0.040516206482593034, "grad_norm": 0.18049751222133636, "learning_rate": 4.024024024024024e-05, "loss": 0.6572, "step": 135 }, { "epoch": 0.04081632653061224, "grad_norm": 0.15890692174434662, "learning_rate": 4.0540540540540545e-05, "loss": 0.5808, "step": 136 }, { "epoch": 0.04111644657863145, "grad_norm": 0.15884089469909668, "learning_rate": 4.0840840840840845e-05, "loss": 0.5729, "step": 137 }, { "epoch": 0.04141656662665066, "grad_norm": 0.188554048538208, "learning_rate": 4.1141141141141144e-05, "loss": 0.5937, "step": 138 }, { "epoch": 0.04171668667466987, "grad_norm": 0.16957660019397736, "learning_rate": 4.1441441441441444e-05, "loss": 0.6158, "step": 139 }, { "epoch": 0.04201680672268908, "grad_norm": 0.17422978579998016, "learning_rate": 4.1741741741741744e-05, "loss": 0.6554, "step": 140 }, { "epoch": 0.04231692677070829, "grad_norm": 0.16660812497138977, "learning_rate": 4.204204204204204e-05, "loss": 0.7306, "step": 141 }, { "epoch": 0.042617046818727494, "grad_norm": 0.17347297072410583, "learning_rate": 4.234234234234234e-05, "loss": 0.5936, "step": 142 }, { "epoch": 0.0429171668667467, "grad_norm": 0.15610091388225555, "learning_rate": 4.264264264264264e-05, "loss": 0.5666, "step": 143 }, { "epoch": 0.04321728691476591, "grad_norm": 0.1987975686788559, "learning_rate": 4.294294294294294e-05, "loss": 0.6298, "step": 144 }, { "epoch": 0.04351740696278512, "grad_norm": 0.16874830424785614, "learning_rate": 4.324324324324325e-05, "loss": 0.5792, "step": 145 }, { "epoch": 0.043817527010804325, "grad_norm": 0.1750735342502594, "learning_rate": 4.354354354354355e-05, "loss": 0.5878, "step": 146 }, { "epoch": 0.04411764705882353, "grad_norm": 0.1773807853460312, "learning_rate": 4.384384384384385e-05, "loss": 0.576, "step": 147 }, { "epoch": 0.04441776710684274, "grad_norm": 0.1598764806985855, "learning_rate": 4.414414414414415e-05, "loss": 0.5881, "step": 148 }, { "epoch": 0.04471788715486195, "grad_norm": 0.1856532096862793, "learning_rate": 4.4444444444444447e-05, "loss": 0.5463, "step": 149 }, { "epoch": 0.045018007202881155, "grad_norm": 0.16291286051273346, "learning_rate": 4.4744744744744746e-05, "loss": 0.5138, "step": 150 }, { "epoch": 0.04531812725090036, "grad_norm": 0.19795602560043335, "learning_rate": 4.5045045045045046e-05, "loss": 0.5249, "step": 151 }, { "epoch": 0.04561824729891957, "grad_norm": 0.2006046324968338, "learning_rate": 4.5345345345345345e-05, "loss": 0.6039, "step": 152 }, { "epoch": 0.04591836734693878, "grad_norm": 0.17838355898857117, "learning_rate": 4.5645645645645645e-05, "loss": 0.5634, "step": 153 }, { "epoch": 0.046218487394957986, "grad_norm": 0.160551056265831, "learning_rate": 4.594594594594595e-05, "loss": 0.5825, "step": 154 }, { "epoch": 0.04651860744297719, "grad_norm": 0.16036823391914368, "learning_rate": 4.624624624624625e-05, "loss": 0.54, "step": 155 }, { "epoch": 0.0468187274909964, "grad_norm": 0.16031847894191742, "learning_rate": 4.654654654654655e-05, "loss": 0.5655, "step": 156 }, { "epoch": 0.04711884753901561, "grad_norm": 0.17296087741851807, "learning_rate": 4.684684684684685e-05, "loss": 0.6163, "step": 157 }, { "epoch": 0.047418967587034816, "grad_norm": 0.15543188154697418, "learning_rate": 4.714714714714715e-05, "loss": 0.5416, "step": 158 }, { "epoch": 0.047719087635054024, "grad_norm": 0.16270007193088531, "learning_rate": 4.744744744744745e-05, "loss": 0.5561, "step": 159 }, { "epoch": 0.04801920768307323, "grad_norm": 0.15587948262691498, "learning_rate": 4.774774774774775e-05, "loss": 0.5877, "step": 160 }, { "epoch": 0.04831932773109244, "grad_norm": 0.16889938712120056, "learning_rate": 4.804804804804805e-05, "loss": 0.5613, "step": 161 }, { "epoch": 0.048619447779111646, "grad_norm": 0.15977361798286438, "learning_rate": 4.834834834834835e-05, "loss": 0.5806, "step": 162 }, { "epoch": 0.048919567827130854, "grad_norm": 0.1628406047821045, "learning_rate": 4.8648648648648654e-05, "loss": 0.584, "step": 163 }, { "epoch": 0.04921968787515006, "grad_norm": 0.1554487943649292, "learning_rate": 4.8948948948948954e-05, "loss": 0.5566, "step": 164 }, { "epoch": 0.04951980792316927, "grad_norm": 0.16129934787750244, "learning_rate": 4.9249249249249253e-05, "loss": 0.5373, "step": 165 }, { "epoch": 0.04981992797118848, "grad_norm": 0.16680221259593964, "learning_rate": 4.954954954954955e-05, "loss": 0.5526, "step": 166 }, { "epoch": 0.050120048019207684, "grad_norm": 0.22599273920059204, "learning_rate": 4.984984984984985e-05, "loss": 0.5677, "step": 167 }, { "epoch": 0.05042016806722689, "grad_norm": 0.15940923988819122, "learning_rate": 5.015015015015015e-05, "loss": 0.5585, "step": 168 }, { "epoch": 0.0507202881152461, "grad_norm": 0.21803030371665955, "learning_rate": 5.0450450450450445e-05, "loss": 0.5465, "step": 169 }, { "epoch": 0.05102040816326531, "grad_norm": 0.155521959066391, "learning_rate": 5.075075075075075e-05, "loss": 0.5617, "step": 170 }, { "epoch": 0.051320528211284515, "grad_norm": 0.16137102246284485, "learning_rate": 5.105105105105106e-05, "loss": 0.5666, "step": 171 }, { "epoch": 0.05162064825930372, "grad_norm": 0.17119623720645905, "learning_rate": 5.135135135135135e-05, "loss": 0.5736, "step": 172 }, { "epoch": 0.05192076830732293, "grad_norm": 0.1570027768611908, "learning_rate": 5.165165165165166e-05, "loss": 0.5682, "step": 173 }, { "epoch": 0.05222088835534214, "grad_norm": 0.17013399302959442, "learning_rate": 5.195195195195195e-05, "loss": 0.5806, "step": 174 }, { "epoch": 0.052521008403361345, "grad_norm": 0.16840900480747223, "learning_rate": 5.2252252252252256e-05, "loss": 0.524, "step": 175 }, { "epoch": 0.05282112845138055, "grad_norm": 0.19608522951602936, "learning_rate": 5.2552552552552556e-05, "loss": 0.558, "step": 176 }, { "epoch": 0.05312124849939976, "grad_norm": 0.16340655088424683, "learning_rate": 5.2852852852852855e-05, "loss": 0.5473, "step": 177 }, { "epoch": 0.05342136854741897, "grad_norm": 0.17019999027252197, "learning_rate": 5.3153153153153155e-05, "loss": 0.6102, "step": 178 }, { "epoch": 0.053721488595438176, "grad_norm": 0.18055562674999237, "learning_rate": 5.345345345345346e-05, "loss": 0.5499, "step": 179 }, { "epoch": 0.05402160864345738, "grad_norm": 0.18409283459186554, "learning_rate": 5.3753753753753754e-05, "loss": 0.5542, "step": 180 }, { "epoch": 0.05432172869147659, "grad_norm": 0.15948091447353363, "learning_rate": 5.405405405405406e-05, "loss": 0.5801, "step": 181 }, { "epoch": 0.0546218487394958, "grad_norm": 0.15832237899303436, "learning_rate": 5.435435435435435e-05, "loss": 0.5363, "step": 182 }, { "epoch": 0.054921968787515006, "grad_norm": 0.16188354790210724, "learning_rate": 5.465465465465466e-05, "loss": 0.5323, "step": 183 }, { "epoch": 0.055222088835534214, "grad_norm": 0.17373915016651154, "learning_rate": 5.4954954954954966e-05, "loss": 0.5252, "step": 184 }, { "epoch": 0.05552220888355342, "grad_norm": 0.1621847003698349, "learning_rate": 5.525525525525526e-05, "loss": 0.5449, "step": 185 }, { "epoch": 0.05582232893157263, "grad_norm": 0.16235846281051636, "learning_rate": 5.555555555555556e-05, "loss": 0.5933, "step": 186 }, { "epoch": 0.05612244897959184, "grad_norm": 0.20305441319942474, "learning_rate": 5.585585585585585e-05, "loss": 0.5164, "step": 187 }, { "epoch": 0.056422569027611044, "grad_norm": 0.1579139083623886, "learning_rate": 5.615615615615616e-05, "loss": 0.5778, "step": 188 }, { "epoch": 0.05672268907563025, "grad_norm": 0.14820082485675812, "learning_rate": 5.6456456456456464e-05, "loss": 0.5317, "step": 189 }, { "epoch": 0.05702280912364946, "grad_norm": 0.4610205590724945, "learning_rate": 5.6756756756756757e-05, "loss": 0.5196, "step": 190 }, { "epoch": 0.05732292917166867, "grad_norm": 0.1503513902425766, "learning_rate": 5.705705705705706e-05, "loss": 0.5291, "step": 191 }, { "epoch": 0.057623049219687875, "grad_norm": 0.17538698017597198, "learning_rate": 5.7357357357357356e-05, "loss": 0.5559, "step": 192 }, { "epoch": 0.05792316926770708, "grad_norm": 0.17536750435829163, "learning_rate": 5.765765765765766e-05, "loss": 0.5611, "step": 193 }, { "epoch": 0.05822328931572629, "grad_norm": 0.23853377997875214, "learning_rate": 5.795795795795796e-05, "loss": 0.5556, "step": 194 }, { "epoch": 0.0585234093637455, "grad_norm": 0.16671805083751678, "learning_rate": 5.825825825825826e-05, "loss": 0.5689, "step": 195 }, { "epoch": 0.058823529411764705, "grad_norm": 0.18346083164215088, "learning_rate": 5.855855855855856e-05, "loss": 0.6339, "step": 196 }, { "epoch": 0.05912364945978391, "grad_norm": 0.16490010917186737, "learning_rate": 5.8858858858858854e-05, "loss": 0.5744, "step": 197 }, { "epoch": 0.05942376950780312, "grad_norm": 0.15952922403812408, "learning_rate": 5.915915915915916e-05, "loss": 0.5466, "step": 198 }, { "epoch": 0.05972388955582233, "grad_norm": 0.16180749237537384, "learning_rate": 5.9459459459459466e-05, "loss": 0.5249, "step": 199 }, { "epoch": 0.060024009603841535, "grad_norm": 0.1582827866077423, "learning_rate": 5.975975975975976e-05, "loss": 0.517, "step": 200 }, { "epoch": 0.06032412965186074, "grad_norm": 0.17207732796669006, "learning_rate": 6.0060060060060066e-05, "loss": 0.5024, "step": 201 }, { "epoch": 0.06062424969987995, "grad_norm": 0.1727384626865387, "learning_rate": 6.0360360360360365e-05, "loss": 0.5579, "step": 202 }, { "epoch": 0.06092436974789916, "grad_norm": 0.165960893034935, "learning_rate": 6.0660660660660665e-05, "loss": 0.556, "step": 203 }, { "epoch": 0.061224489795918366, "grad_norm": 0.1521759182214737, "learning_rate": 6.0960960960960964e-05, "loss": 0.5119, "step": 204 }, { "epoch": 0.061524609843937574, "grad_norm": 0.2733907401561737, "learning_rate": 6.126126126126126e-05, "loss": 0.5353, "step": 205 }, { "epoch": 0.06182472989195678, "grad_norm": 0.164889395236969, "learning_rate": 6.156156156156156e-05, "loss": 0.5342, "step": 206 }, { "epoch": 0.06212484993997599, "grad_norm": 0.15708684921264648, "learning_rate": 6.186186186186186e-05, "loss": 0.5417, "step": 207 }, { "epoch": 0.062424969987995196, "grad_norm": 0.14472995698451996, "learning_rate": 6.216216216216216e-05, "loss": 0.4934, "step": 208 }, { "epoch": 0.06272509003601441, "grad_norm": 0.168310284614563, "learning_rate": 6.246246246246246e-05, "loss": 0.521, "step": 209 }, { "epoch": 0.06302521008403361, "grad_norm": 0.14763006567955017, "learning_rate": 6.276276276276276e-05, "loss": 0.5292, "step": 210 }, { "epoch": 0.06332533013205283, "grad_norm": 0.14773689210414886, "learning_rate": 6.306306306306306e-05, "loss": 0.4917, "step": 211 }, { "epoch": 0.06362545018007203, "grad_norm": 0.15610907971858978, "learning_rate": 6.336336336336337e-05, "loss": 0.5573, "step": 212 }, { "epoch": 0.06392557022809124, "grad_norm": 0.15105871856212616, "learning_rate": 6.366366366366366e-05, "loss": 0.5192, "step": 213 }, { "epoch": 0.06422569027611044, "grad_norm": 0.15257810056209564, "learning_rate": 6.396396396396397e-05, "loss": 0.5685, "step": 214 }, { "epoch": 0.06452581032412966, "grad_norm": 0.16696257889270782, "learning_rate": 6.426426426426426e-05, "loss": 0.5774, "step": 215 }, { "epoch": 0.06482593037214886, "grad_norm": 0.15298417210578918, "learning_rate": 6.456456456456457e-05, "loss": 0.547, "step": 216 }, { "epoch": 0.06512605042016807, "grad_norm": 0.1644119918346405, "learning_rate": 6.486486486486487e-05, "loss": 0.5713, "step": 217 }, { "epoch": 0.06542617046818727, "grad_norm": 0.21756312251091003, "learning_rate": 6.516516516516516e-05, "loss": 0.5501, "step": 218 }, { "epoch": 0.06572629051620649, "grad_norm": 0.1546732485294342, "learning_rate": 6.546546546546547e-05, "loss": 0.5478, "step": 219 }, { "epoch": 0.06602641056422569, "grad_norm": 0.15699245035648346, "learning_rate": 6.576576576576577e-05, "loss": 0.5431, "step": 220 }, { "epoch": 0.0663265306122449, "grad_norm": 0.1714629828929901, "learning_rate": 6.606606606606607e-05, "loss": 0.6054, "step": 221 }, { "epoch": 0.0666266506602641, "grad_norm": 0.1773165464401245, "learning_rate": 6.636636636636637e-05, "loss": 0.5467, "step": 222 }, { "epoch": 0.06692677070828332, "grad_norm": 0.1482841670513153, "learning_rate": 6.666666666666667e-05, "loss": 0.5377, "step": 223 }, { "epoch": 0.06722689075630252, "grad_norm": 0.17168587446212769, "learning_rate": 6.696696696696697e-05, "loss": 0.5223, "step": 224 }, { "epoch": 0.06752701080432173, "grad_norm": 0.15758070349693298, "learning_rate": 6.726726726726727e-05, "loss": 0.54, "step": 225 }, { "epoch": 0.06782713085234093, "grad_norm": 0.1595706045627594, "learning_rate": 6.756756756756757e-05, "loss": 0.5675, "step": 226 }, { "epoch": 0.06812725090036015, "grad_norm": 0.14920708537101746, "learning_rate": 6.786786786786787e-05, "loss": 0.5844, "step": 227 }, { "epoch": 0.06842737094837935, "grad_norm": 0.15330851078033447, "learning_rate": 6.816816816816817e-05, "loss": 0.5548, "step": 228 }, { "epoch": 0.06872749099639856, "grad_norm": 0.16282516717910767, "learning_rate": 6.846846846846847e-05, "loss": 0.4937, "step": 229 }, { "epoch": 0.06902761104441776, "grad_norm": 0.15990369021892548, "learning_rate": 6.876876876876878e-05, "loss": 0.5599, "step": 230 }, { "epoch": 0.06932773109243698, "grad_norm": 0.14240749180316925, "learning_rate": 6.906906906906907e-05, "loss": 0.5104, "step": 231 }, { "epoch": 0.06962785114045618, "grad_norm": 0.18181869387626648, "learning_rate": 6.936936936936938e-05, "loss": 0.566, "step": 232 }, { "epoch": 0.0699279711884754, "grad_norm": 0.16088823974132538, "learning_rate": 6.966966966966967e-05, "loss": 0.529, "step": 233 }, { "epoch": 0.0702280912364946, "grad_norm": 0.152555450797081, "learning_rate": 6.996996996996998e-05, "loss": 0.6098, "step": 234 }, { "epoch": 0.07052821128451381, "grad_norm": 0.14834022521972656, "learning_rate": 7.027027027027028e-05, "loss": 0.4863, "step": 235 }, { "epoch": 0.07082833133253301, "grad_norm": 0.156663715839386, "learning_rate": 7.057057057057056e-05, "loss": 0.4761, "step": 236 }, { "epoch": 0.07112845138055222, "grad_norm": 0.14352193474769592, "learning_rate": 7.087087087087088e-05, "loss": 0.55, "step": 237 }, { "epoch": 0.07142857142857142, "grad_norm": 0.14808101952075958, "learning_rate": 7.117117117117116e-05, "loss": 0.5447, "step": 238 }, { "epoch": 0.07172869147659064, "grad_norm": 0.15214911103248596, "learning_rate": 7.147147147147148e-05, "loss": 0.5245, "step": 239 }, { "epoch": 0.07202881152460984, "grad_norm": 0.1910678893327713, "learning_rate": 7.177177177177178e-05, "loss": 0.5181, "step": 240 }, { "epoch": 0.07232893157262905, "grad_norm": 0.1260625272989273, "learning_rate": 7.207207207207208e-05, "loss": 0.474, "step": 241 }, { "epoch": 0.07262905162064826, "grad_norm": 0.167415589094162, "learning_rate": 7.237237237237238e-05, "loss": 0.5691, "step": 242 }, { "epoch": 0.07292917166866747, "grad_norm": 0.33259207010269165, "learning_rate": 7.267267267267268e-05, "loss": 0.5795, "step": 243 }, { "epoch": 0.07322929171668667, "grad_norm": 0.13412445783615112, "learning_rate": 7.297297297297297e-05, "loss": 0.5308, "step": 244 }, { "epoch": 0.07352941176470588, "grad_norm": 0.1500299721956253, "learning_rate": 7.327327327327327e-05, "loss": 0.5355, "step": 245 }, { "epoch": 0.07382953181272509, "grad_norm": 0.15848858654499054, "learning_rate": 7.357357357357357e-05, "loss": 0.5417, "step": 246 }, { "epoch": 0.0741296518607443, "grad_norm": 0.1430959850549698, "learning_rate": 7.387387387387387e-05, "loss": 0.473, "step": 247 }, { "epoch": 0.0744297719087635, "grad_norm": 0.14406929910182953, "learning_rate": 7.417417417417419e-05, "loss": 0.5497, "step": 248 }, { "epoch": 0.07472989195678272, "grad_norm": 0.34863534569740295, "learning_rate": 7.447447447447447e-05, "loss": 0.564, "step": 249 }, { "epoch": 0.07503001200480192, "grad_norm": 0.2568702697753906, "learning_rate": 7.477477477477479e-05, "loss": 0.5526, "step": 250 }, { "epoch": 0.07533013205282113, "grad_norm": 0.21536609530448914, "learning_rate": 7.507507507507507e-05, "loss": 0.5499, "step": 251 }, { "epoch": 0.07563025210084033, "grad_norm": 0.14825338125228882, "learning_rate": 7.537537537537538e-05, "loss": 0.5292, "step": 252 }, { "epoch": 0.07593037214885955, "grad_norm": 0.14850740134716034, "learning_rate": 7.567567567567568e-05, "loss": 0.5686, "step": 253 }, { "epoch": 0.07623049219687875, "grad_norm": 0.16904740035533905, "learning_rate": 7.597597597597597e-05, "loss": 0.5328, "step": 254 }, { "epoch": 0.07653061224489796, "grad_norm": 0.15750734508037567, "learning_rate": 7.627627627627628e-05, "loss": 0.5588, "step": 255 }, { "epoch": 0.07683073229291716, "grad_norm": 0.1463199406862259, "learning_rate": 7.657657657657657e-05, "loss": 0.5541, "step": 256 }, { "epoch": 0.07713085234093638, "grad_norm": 0.7012650370597839, "learning_rate": 7.687687687687688e-05, "loss": 0.535, "step": 257 }, { "epoch": 0.07743097238895558, "grad_norm": 0.14505982398986816, "learning_rate": 7.717717717717718e-05, "loss": 0.5628, "step": 258 }, { "epoch": 0.07773109243697479, "grad_norm": 0.13912923634052277, "learning_rate": 7.747747747747748e-05, "loss": 0.5189, "step": 259 }, { "epoch": 0.07803121248499399, "grad_norm": 0.1794511377811432, "learning_rate": 7.777777777777778e-05, "loss": 0.5081, "step": 260 }, { "epoch": 0.0783313325330132, "grad_norm": 1.2760982513427734, "learning_rate": 7.807807807807808e-05, "loss": 0.6025, "step": 261 }, { "epoch": 0.07863145258103241, "grad_norm": 0.14869123697280884, "learning_rate": 7.837837837837838e-05, "loss": 0.5255, "step": 262 }, { "epoch": 0.07893157262905162, "grad_norm": 0.15125605463981628, "learning_rate": 7.867867867867868e-05, "loss": 0.5463, "step": 263 }, { "epoch": 0.07923169267707082, "grad_norm": 0.1474657654762268, "learning_rate": 7.897897897897898e-05, "loss": 0.5469, "step": 264 }, { "epoch": 0.07953181272509004, "grad_norm": 0.14748071134090424, "learning_rate": 7.927927927927928e-05, "loss": 0.5293, "step": 265 }, { "epoch": 0.07983193277310924, "grad_norm": 0.14991365373134613, "learning_rate": 7.957957957957959e-05, "loss": 0.5938, "step": 266 }, { "epoch": 0.08013205282112845, "grad_norm": 0.15257029235363007, "learning_rate": 7.987987987987988e-05, "loss": 0.5606, "step": 267 }, { "epoch": 0.08043217286914765, "grad_norm": 0.13995935022830963, "learning_rate": 8.018018018018019e-05, "loss": 0.525, "step": 268 }, { "epoch": 0.08073229291716687, "grad_norm": 0.13580486178398132, "learning_rate": 8.048048048048048e-05, "loss": 0.5221, "step": 269 }, { "epoch": 0.08103241296518607, "grad_norm": 0.137712761759758, "learning_rate": 8.078078078078079e-05, "loss": 0.4955, "step": 270 }, { "epoch": 0.08133253301320528, "grad_norm": 0.14473161101341248, "learning_rate": 8.108108108108109e-05, "loss": 0.5652, "step": 271 }, { "epoch": 0.08163265306122448, "grad_norm": 0.1528928130865097, "learning_rate": 8.138138138138138e-05, "loss": 0.538, "step": 272 }, { "epoch": 0.0819327731092437, "grad_norm": 0.5879867672920227, "learning_rate": 8.168168168168169e-05, "loss": 0.523, "step": 273 }, { "epoch": 0.0822328931572629, "grad_norm": 0.14671894907951355, "learning_rate": 8.198198198198198e-05, "loss": 0.5658, "step": 274 }, { "epoch": 0.08253301320528211, "grad_norm": 0.23328697681427002, "learning_rate": 8.228228228228229e-05, "loss": 0.5037, "step": 275 }, { "epoch": 0.08283313325330131, "grad_norm": 0.1431223601102829, "learning_rate": 8.258258258258259e-05, "loss": 0.5596, "step": 276 }, { "epoch": 0.08313325330132053, "grad_norm": 0.13177427649497986, "learning_rate": 8.288288288288289e-05, "loss": 0.4746, "step": 277 }, { "epoch": 0.08343337334933974, "grad_norm": 0.13189871609210968, "learning_rate": 8.318318318318319e-05, "loss": 0.4897, "step": 278 }, { "epoch": 0.08373349339735894, "grad_norm": 0.12248660624027252, "learning_rate": 8.348348348348349e-05, "loss": 0.4442, "step": 279 }, { "epoch": 0.08403361344537816, "grad_norm": 0.13041585683822632, "learning_rate": 8.378378378378379e-05, "loss": 0.5096, "step": 280 }, { "epoch": 0.08433373349339736, "grad_norm": 0.13134099543094635, "learning_rate": 8.408408408408409e-05, "loss": 0.506, "step": 281 }, { "epoch": 0.08463385354141657, "grad_norm": 0.1440073847770691, "learning_rate": 8.438438438438439e-05, "loss": 0.5683, "step": 282 }, { "epoch": 0.08493397358943577, "grad_norm": 0.1371690332889557, "learning_rate": 8.468468468468469e-05, "loss": 0.5403, "step": 283 }, { "epoch": 0.08523409363745499, "grad_norm": 0.1403842717409134, "learning_rate": 8.4984984984985e-05, "loss": 0.5708, "step": 284 }, { "epoch": 0.08553421368547419, "grad_norm": 0.13641871511936188, "learning_rate": 8.528528528528528e-05, "loss": 0.5176, "step": 285 }, { "epoch": 0.0858343337334934, "grad_norm": 0.12627846002578735, "learning_rate": 8.55855855855856e-05, "loss": 0.4752, "step": 286 }, { "epoch": 0.0861344537815126, "grad_norm": 0.1365559846162796, "learning_rate": 8.588588588588588e-05, "loss": 0.5224, "step": 287 }, { "epoch": 0.08643457382953182, "grad_norm": 0.1484965831041336, "learning_rate": 8.61861861861862e-05, "loss": 0.5857, "step": 288 }, { "epoch": 0.08673469387755102, "grad_norm": 0.13150005042552948, "learning_rate": 8.64864864864865e-05, "loss": 0.5204, "step": 289 }, { "epoch": 0.08703481392557023, "grad_norm": 0.13024255633354187, "learning_rate": 8.678678678678678e-05, "loss": 0.4807, "step": 290 }, { "epoch": 0.08733493397358943, "grad_norm": 0.1377982497215271, "learning_rate": 8.70870870870871e-05, "loss": 0.582, "step": 291 }, { "epoch": 0.08763505402160865, "grad_norm": 0.13690394163131714, "learning_rate": 8.738738738738738e-05, "loss": 0.5142, "step": 292 }, { "epoch": 0.08793517406962785, "grad_norm": 0.1395886093378067, "learning_rate": 8.76876876876877e-05, "loss": 0.5291, "step": 293 }, { "epoch": 0.08823529411764706, "grad_norm": 0.174406498670578, "learning_rate": 8.7987987987988e-05, "loss": 0.5506, "step": 294 }, { "epoch": 0.08853541416566627, "grad_norm": 0.14555053412914276, "learning_rate": 8.82882882882883e-05, "loss": 0.5302, "step": 295 }, { "epoch": 0.08883553421368548, "grad_norm": 0.13706374168395996, "learning_rate": 8.85885885885886e-05, "loss": 0.529, "step": 296 }, { "epoch": 0.08913565426170468, "grad_norm": 0.16008315980434418, "learning_rate": 8.888888888888889e-05, "loss": 0.6128, "step": 297 }, { "epoch": 0.0894357743097239, "grad_norm": 0.12834343314170837, "learning_rate": 8.918918918918919e-05, "loss": 0.5424, "step": 298 }, { "epoch": 0.0897358943577431, "grad_norm": 0.15433359146118164, "learning_rate": 8.948948948948949e-05, "loss": 0.5354, "step": 299 }, { "epoch": 0.09003601440576231, "grad_norm": 0.1307957023382187, "learning_rate": 8.978978978978979e-05, "loss": 0.4998, "step": 300 }, { "epoch": 0.09033613445378151, "grad_norm": 0.12451066076755524, "learning_rate": 9.009009009009009e-05, "loss": 0.4458, "step": 301 }, { "epoch": 0.09063625450180073, "grad_norm": 0.1300276517868042, "learning_rate": 9.039039039039039e-05, "loss": 0.5064, "step": 302 }, { "epoch": 0.09093637454981993, "grad_norm": 0.19848628342151642, "learning_rate": 9.069069069069069e-05, "loss": 0.5372, "step": 303 }, { "epoch": 0.09123649459783914, "grad_norm": 0.12379094213247299, "learning_rate": 9.0990990990991e-05, "loss": 0.5418, "step": 304 }, { "epoch": 0.09153661464585834, "grad_norm": 0.1259545385837555, "learning_rate": 9.129129129129129e-05, "loss": 0.4933, "step": 305 }, { "epoch": 0.09183673469387756, "grad_norm": 0.14117270708084106, "learning_rate": 9.15915915915916e-05, "loss": 0.5039, "step": 306 }, { "epoch": 0.09213685474189676, "grad_norm": 0.16921882331371307, "learning_rate": 9.18918918918919e-05, "loss": 0.5283, "step": 307 }, { "epoch": 0.09243697478991597, "grad_norm": 0.1352926343679428, "learning_rate": 9.219219219219219e-05, "loss": 0.529, "step": 308 }, { "epoch": 0.09273709483793517, "grad_norm": 0.12800848484039307, "learning_rate": 9.24924924924925e-05, "loss": 0.5578, "step": 309 }, { "epoch": 0.09303721488595439, "grad_norm": 0.12968765199184418, "learning_rate": 9.279279279279279e-05, "loss": 0.498, "step": 310 }, { "epoch": 0.09333733493397359, "grad_norm": 0.13185739517211914, "learning_rate": 9.30930930930931e-05, "loss": 0.5547, "step": 311 }, { "epoch": 0.0936374549819928, "grad_norm": 0.1271120011806488, "learning_rate": 9.33933933933934e-05, "loss": 0.5198, "step": 312 }, { "epoch": 0.093937575030012, "grad_norm": 0.13291484117507935, "learning_rate": 9.36936936936937e-05, "loss": 0.5277, "step": 313 }, { "epoch": 0.09423769507803122, "grad_norm": 0.1287054568529129, "learning_rate": 9.3993993993994e-05, "loss": 0.5788, "step": 314 }, { "epoch": 0.09453781512605042, "grad_norm": 0.13590851426124573, "learning_rate": 9.42942942942943e-05, "loss": 0.5443, "step": 315 }, { "epoch": 0.09483793517406963, "grad_norm": 0.12829774618148804, "learning_rate": 9.45945945945946e-05, "loss": 0.4761, "step": 316 }, { "epoch": 0.09513805522208883, "grad_norm": 0.12990467250347137, "learning_rate": 9.48948948948949e-05, "loss": 0.5363, "step": 317 }, { "epoch": 0.09543817527010805, "grad_norm": 0.12929613888263702, "learning_rate": 9.51951951951952e-05, "loss": 0.5325, "step": 318 }, { "epoch": 0.09573829531812725, "grad_norm": 0.12171660363674164, "learning_rate": 9.54954954954955e-05, "loss": 0.4698, "step": 319 }, { "epoch": 0.09603841536614646, "grad_norm": 0.12339114397764206, "learning_rate": 9.57957957957958e-05, "loss": 0.512, "step": 320 }, { "epoch": 0.09633853541416566, "grad_norm": 0.12217818200588226, "learning_rate": 9.60960960960961e-05, "loss": 0.4688, "step": 321 }, { "epoch": 0.09663865546218488, "grad_norm": 0.1298341602087021, "learning_rate": 9.639639639639641e-05, "loss": 0.5217, "step": 322 }, { "epoch": 0.09693877551020408, "grad_norm": 0.1366010457277298, "learning_rate": 9.66966966966967e-05, "loss": 0.5411, "step": 323 }, { "epoch": 0.09723889555822329, "grad_norm": 0.11594495177268982, "learning_rate": 9.699699699699701e-05, "loss": 0.457, "step": 324 }, { "epoch": 0.0975390156062425, "grad_norm": 0.12001260370016098, "learning_rate": 9.729729729729731e-05, "loss": 0.4871, "step": 325 }, { "epoch": 0.09783913565426171, "grad_norm": 0.1352197229862213, "learning_rate": 9.75975975975976e-05, "loss": 0.5627, "step": 326 }, { "epoch": 0.09813925570228091, "grad_norm": 0.14736923575401306, "learning_rate": 9.789789789789791e-05, "loss": 0.4938, "step": 327 }, { "epoch": 0.09843937575030012, "grad_norm": 0.13322827219963074, "learning_rate": 9.81981981981982e-05, "loss": 0.5221, "step": 328 }, { "epoch": 0.09873949579831932, "grad_norm": 0.1316617876291275, "learning_rate": 9.849849849849851e-05, "loss": 0.4848, "step": 329 }, { "epoch": 0.09903961584633854, "grad_norm": 0.12125842273235321, "learning_rate": 9.87987987987988e-05, "loss": 0.5035, "step": 330 }, { "epoch": 0.09933973589435774, "grad_norm": 0.13165231049060822, "learning_rate": 9.90990990990991e-05, "loss": 0.4913, "step": 331 }, { "epoch": 0.09963985594237695, "grad_norm": 0.14205414056777954, "learning_rate": 9.93993993993994e-05, "loss": 0.4759, "step": 332 }, { "epoch": 0.09993997599039615, "grad_norm": 0.8694428205490112, "learning_rate": 9.96996996996997e-05, "loss": 0.527, "step": 333 }, { "epoch": 0.10024009603841537, "grad_norm": 0.13224342465400696, "learning_rate": 0.0001, "loss": 0.4765, "step": 334 }, { "epoch": 0.10054021608643457, "grad_norm": 0.12250874191522598, "learning_rate": 0.0001003003003003003, "loss": 0.4789, "step": 335 }, { "epoch": 0.10084033613445378, "grad_norm": 0.2647605836391449, "learning_rate": 0.00010060060060060062, "loss": 0.5888, "step": 336 }, { "epoch": 0.10114045618247298, "grad_norm": 0.16100604832172394, "learning_rate": 0.00010090090090090089, "loss": 0.4987, "step": 337 }, { "epoch": 0.1014405762304922, "grad_norm": 0.1282327026128769, "learning_rate": 0.0001012012012012012, "loss": 0.542, "step": 338 }, { "epoch": 0.1017406962785114, "grad_norm": 0.1321956217288971, "learning_rate": 0.0001015015015015015, "loss": 0.4768, "step": 339 }, { "epoch": 0.10204081632653061, "grad_norm": 0.1253899186849594, "learning_rate": 0.00010180180180180182, "loss": 0.5044, "step": 340 }, { "epoch": 0.10234093637454982, "grad_norm": 0.12141785025596619, "learning_rate": 0.00010210210210210212, "loss": 0.4857, "step": 341 }, { "epoch": 0.10264105642256903, "grad_norm": 0.12600919604301453, "learning_rate": 0.0001024024024024024, "loss": 0.5238, "step": 342 }, { "epoch": 0.10294117647058823, "grad_norm": 0.12873877584934235, "learning_rate": 0.0001027027027027027, "loss": 0.4718, "step": 343 }, { "epoch": 0.10324129651860744, "grad_norm": 0.11940476298332214, "learning_rate": 0.000103003003003003, "loss": 0.5153, "step": 344 }, { "epoch": 0.10354141656662665, "grad_norm": 1.817137598991394, "learning_rate": 0.00010330330330330331, "loss": 0.4968, "step": 345 }, { "epoch": 0.10384153661464586, "grad_norm": 0.13736383616924286, "learning_rate": 0.00010360360360360361, "loss": 0.4911, "step": 346 }, { "epoch": 0.10414165666266506, "grad_norm": 0.13706469535827637, "learning_rate": 0.0001039039039039039, "loss": 0.5602, "step": 347 }, { "epoch": 0.10444177671068428, "grad_norm": 0.11970093101263046, "learning_rate": 0.0001042042042042042, "loss": 0.4455, "step": 348 }, { "epoch": 0.10474189675870348, "grad_norm": 0.12620647251605988, "learning_rate": 0.00010450450450450451, "loss": 0.4818, "step": 349 }, { "epoch": 0.10504201680672269, "grad_norm": 0.1250682771205902, "learning_rate": 0.00010480480480480481, "loss": 0.536, "step": 350 }, { "epoch": 0.10534213685474189, "grad_norm": 0.1511351764202118, "learning_rate": 0.00010510510510510511, "loss": 0.5285, "step": 351 }, { "epoch": 0.1056422569027611, "grad_norm": 0.19505727291107178, "learning_rate": 0.0001054054054054054, "loss": 0.4803, "step": 352 }, { "epoch": 0.1059423769507803, "grad_norm": 0.12491302192211151, "learning_rate": 0.00010570570570570571, "loss": 0.5079, "step": 353 }, { "epoch": 0.10624249699879952, "grad_norm": 0.12418210506439209, "learning_rate": 0.00010600600600600601, "loss": 0.518, "step": 354 }, { "epoch": 0.10654261704681872, "grad_norm": 0.13729716837406158, "learning_rate": 0.00010630630630630631, "loss": 0.5257, "step": 355 }, { "epoch": 0.10684273709483794, "grad_norm": 0.12184851616621017, "learning_rate": 0.00010660660660660662, "loss": 0.5152, "step": 356 }, { "epoch": 0.10714285714285714, "grad_norm": 0.12144186347723007, "learning_rate": 0.00010690690690690692, "loss": 0.4442, "step": 357 }, { "epoch": 0.10744297719087635, "grad_norm": 0.12210109084844589, "learning_rate": 0.00010720720720720721, "loss": 0.4776, "step": 358 }, { "epoch": 0.10774309723889555, "grad_norm": 0.12035097181797028, "learning_rate": 0.00010750750750750751, "loss": 0.5004, "step": 359 }, { "epoch": 0.10804321728691477, "grad_norm": 0.4249653220176697, "learning_rate": 0.00010780780780780782, "loss": 0.5689, "step": 360 }, { "epoch": 0.10834333733493397, "grad_norm": 0.1250094324350357, "learning_rate": 0.00010810810810810812, "loss": 0.5051, "step": 361 }, { "epoch": 0.10864345738295318, "grad_norm": 0.12944050133228302, "learning_rate": 0.00010840840840840842, "loss": 0.5163, "step": 362 }, { "epoch": 0.10894357743097238, "grad_norm": 0.17619773745536804, "learning_rate": 0.0001087087087087087, "loss": 0.4903, "step": 363 }, { "epoch": 0.1092436974789916, "grad_norm": 0.13504037261009216, "learning_rate": 0.000109009009009009, "loss": 0.5294, "step": 364 }, { "epoch": 0.1095438175270108, "grad_norm": 0.14919887483119965, "learning_rate": 0.00010930930930930932, "loss": 0.5045, "step": 365 }, { "epoch": 0.10984393757503001, "grad_norm": 0.12428752332925797, "learning_rate": 0.00010960960960960962, "loss": 0.5324, "step": 366 }, { "epoch": 0.11014405762304921, "grad_norm": 0.1244334876537323, "learning_rate": 0.00010990990990990993, "loss": 0.471, "step": 367 }, { "epoch": 0.11044417767106843, "grad_norm": 0.12586592137813568, "learning_rate": 0.0001102102102102102, "loss": 0.5102, "step": 368 }, { "epoch": 0.11074429771908763, "grad_norm": 0.11768530309200287, "learning_rate": 0.00011051051051051052, "loss": 0.5072, "step": 369 }, { "epoch": 0.11104441776710684, "grad_norm": 0.17226853966712952, "learning_rate": 0.00011081081081081082, "loss": 0.4927, "step": 370 }, { "epoch": 0.11134453781512606, "grad_norm": 0.12605972588062286, "learning_rate": 0.00011111111111111112, "loss": 0.5093, "step": 371 }, { "epoch": 0.11164465786314526, "grad_norm": 0.13863171637058258, "learning_rate": 0.00011141141141141143, "loss": 0.5109, "step": 372 }, { "epoch": 0.11194477791116447, "grad_norm": 0.11750851571559906, "learning_rate": 0.0001117117117117117, "loss": 0.509, "step": 373 }, { "epoch": 0.11224489795918367, "grad_norm": 0.12177072465419769, "learning_rate": 0.00011201201201201202, "loss": 0.4714, "step": 374 }, { "epoch": 0.11254501800720289, "grad_norm": 0.12073265016078949, "learning_rate": 0.00011231231231231231, "loss": 0.5356, "step": 375 }, { "epoch": 0.11284513805522209, "grad_norm": 0.1178957149386406, "learning_rate": 0.00011261261261261263, "loss": 0.5507, "step": 376 }, { "epoch": 0.1131452581032413, "grad_norm": 0.13189667463302612, "learning_rate": 0.00011291291291291293, "loss": 0.5367, "step": 377 }, { "epoch": 0.1134453781512605, "grad_norm": 0.12463075667619705, "learning_rate": 0.00011321321321321321, "loss": 0.5351, "step": 378 }, { "epoch": 0.11374549819927972, "grad_norm": 0.11254709959030151, "learning_rate": 0.00011351351351351351, "loss": 0.4595, "step": 379 }, { "epoch": 0.11404561824729892, "grad_norm": 0.1292826384305954, "learning_rate": 0.00011381381381381381, "loss": 0.4878, "step": 380 }, { "epoch": 0.11434573829531813, "grad_norm": 0.12284654378890991, "learning_rate": 0.00011411411411411413, "loss": 0.499, "step": 381 }, { "epoch": 0.11464585834333733, "grad_norm": 0.12618692219257355, "learning_rate": 0.00011441441441441443, "loss": 0.5046, "step": 382 }, { "epoch": 0.11494597839135655, "grad_norm": 0.123695969581604, "learning_rate": 0.00011471471471471471, "loss": 0.5389, "step": 383 }, { "epoch": 0.11524609843937575, "grad_norm": 0.11941318213939667, "learning_rate": 0.00011501501501501501, "loss": 0.5136, "step": 384 }, { "epoch": 0.11554621848739496, "grad_norm": 0.12415286153554916, "learning_rate": 0.00011531531531531532, "loss": 0.5181, "step": 385 }, { "epoch": 0.11584633853541416, "grad_norm": 0.11182846873998642, "learning_rate": 0.00011561561561561562, "loss": 0.47, "step": 386 }, { "epoch": 0.11614645858343338, "grad_norm": 0.21233585476875305, "learning_rate": 0.00011591591591591592, "loss": 0.4958, "step": 387 }, { "epoch": 0.11644657863145258, "grad_norm": 0.11492254585027695, "learning_rate": 0.00011621621621621621, "loss": 0.4552, "step": 388 }, { "epoch": 0.1167466986794718, "grad_norm": 0.12788830697536469, "learning_rate": 0.00011651651651651652, "loss": 0.5562, "step": 389 }, { "epoch": 0.117046818727491, "grad_norm": 0.12814950942993164, "learning_rate": 0.00011681681681681682, "loss": 0.4751, "step": 390 }, { "epoch": 0.11734693877551021, "grad_norm": 0.11689490079879761, "learning_rate": 0.00011711711711711712, "loss": 0.5067, "step": 391 }, { "epoch": 0.11764705882352941, "grad_norm": 0.11457665264606476, "learning_rate": 0.00011741741741741743, "loss": 0.5146, "step": 392 }, { "epoch": 0.11794717887154862, "grad_norm": 0.1177125945687294, "learning_rate": 0.00011771771771771771, "loss": 0.5285, "step": 393 }, { "epoch": 0.11824729891956783, "grad_norm": 0.11953035742044449, "learning_rate": 0.00011801801801801802, "loss": 0.5156, "step": 394 }, { "epoch": 0.11854741896758704, "grad_norm": 0.11999611556529999, "learning_rate": 0.00011831831831831832, "loss": 0.5099, "step": 395 }, { "epoch": 0.11884753901560624, "grad_norm": 0.29343682527542114, "learning_rate": 0.00011861861861861863, "loss": 0.528, "step": 396 }, { "epoch": 0.11914765906362546, "grad_norm": 0.15137045085430145, "learning_rate": 0.00011891891891891893, "loss": 0.5343, "step": 397 }, { "epoch": 0.11944777911164466, "grad_norm": 0.12200610339641571, "learning_rate": 0.00011921921921921923, "loss": 0.4978, "step": 398 }, { "epoch": 0.11974789915966387, "grad_norm": 0.11575727164745331, "learning_rate": 0.00011951951951951952, "loss": 0.5404, "step": 399 }, { "epoch": 0.12004801920768307, "grad_norm": 0.12000302225351334, "learning_rate": 0.00011981981981981982, "loss": 0.4837, "step": 400 }, { "epoch": 0.12034813925570229, "grad_norm": 0.13822582364082336, "learning_rate": 0.00012012012012012013, "loss": 0.5004, "step": 401 }, { "epoch": 0.12064825930372149, "grad_norm": 0.14978677034378052, "learning_rate": 0.00012042042042042043, "loss": 0.5117, "step": 402 }, { "epoch": 0.1209483793517407, "grad_norm": 0.12508971989154816, "learning_rate": 0.00012072072072072073, "loss": 0.5458, "step": 403 }, { "epoch": 0.1212484993997599, "grad_norm": 0.12150216102600098, "learning_rate": 0.00012102102102102102, "loss": 0.5195, "step": 404 }, { "epoch": 0.12154861944777912, "grad_norm": 0.147630512714386, "learning_rate": 0.00012132132132132133, "loss": 0.5179, "step": 405 }, { "epoch": 0.12184873949579832, "grad_norm": 0.11386080831289291, "learning_rate": 0.00012162162162162163, "loss": 0.5021, "step": 406 }, { "epoch": 0.12214885954381753, "grad_norm": 0.11933228373527527, "learning_rate": 0.00012192192192192193, "loss": 0.5113, "step": 407 }, { "epoch": 0.12244897959183673, "grad_norm": 0.16394619643688202, "learning_rate": 0.00012222222222222224, "loss": 0.4879, "step": 408 }, { "epoch": 0.12274909963985595, "grad_norm": 0.12113256007432938, "learning_rate": 0.00012252252252252253, "loss": 0.4934, "step": 409 }, { "epoch": 0.12304921968787515, "grad_norm": 0.11715859174728394, "learning_rate": 0.00012282282282282281, "loss": 0.4911, "step": 410 }, { "epoch": 0.12334933973589436, "grad_norm": 0.1113106906414032, "learning_rate": 0.00012312312312312313, "loss": 0.443, "step": 411 }, { "epoch": 0.12364945978391356, "grad_norm": 0.11617186665534973, "learning_rate": 0.00012342342342342344, "loss": 0.5064, "step": 412 }, { "epoch": 0.12394957983193278, "grad_norm": 0.12112707644701004, "learning_rate": 0.00012372372372372373, "loss": 0.5287, "step": 413 }, { "epoch": 0.12424969987995198, "grad_norm": 0.12584644556045532, "learning_rate": 0.000124024024024024, "loss": 0.4628, "step": 414 }, { "epoch": 0.12454981992797119, "grad_norm": 0.12498998641967773, "learning_rate": 0.00012432432432432433, "loss": 0.524, "step": 415 }, { "epoch": 0.12484993997599039, "grad_norm": 0.12376196682453156, "learning_rate": 0.00012462462462462464, "loss": 0.5042, "step": 416 }, { "epoch": 0.1251500600240096, "grad_norm": 0.11971154063940048, "learning_rate": 0.00012492492492492492, "loss": 0.5125, "step": 417 }, { "epoch": 0.12545018007202882, "grad_norm": 0.12637357413768768, "learning_rate": 0.00012522522522522524, "loss": 0.5476, "step": 418 }, { "epoch": 0.125750300120048, "grad_norm": 0.11340111494064331, "learning_rate": 0.00012552552552552552, "loss": 0.4854, "step": 419 }, { "epoch": 0.12605042016806722, "grad_norm": 0.13555923104286194, "learning_rate": 0.00012582582582582584, "loss": 0.5619, "step": 420 }, { "epoch": 0.12635054021608644, "grad_norm": 0.11894300580024719, "learning_rate": 0.00012612612612612612, "loss": 0.518, "step": 421 }, { "epoch": 0.12665066026410565, "grad_norm": 0.1221066564321518, "learning_rate": 0.00012642642642642644, "loss": 0.4481, "step": 422 }, { "epoch": 0.12695078031212484, "grad_norm": 0.1434706300497055, "learning_rate": 0.00012672672672672675, "loss": 0.4978, "step": 423 }, { "epoch": 0.12725090036014405, "grad_norm": 0.11965423077344894, "learning_rate": 0.00012702702702702703, "loss": 0.4808, "step": 424 }, { "epoch": 0.12755102040816327, "grad_norm": 0.13010963797569275, "learning_rate": 0.00012732732732732732, "loss": 0.5166, "step": 425 }, { "epoch": 0.12785114045618248, "grad_norm": 0.11788228154182434, "learning_rate": 0.00012762762762762763, "loss": 0.4836, "step": 426 }, { "epoch": 0.12815126050420167, "grad_norm": 0.11451204121112823, "learning_rate": 0.00012792792792792795, "loss": 0.5056, "step": 427 }, { "epoch": 0.12845138055222088, "grad_norm": 0.11950941383838654, "learning_rate": 0.00012822822822822823, "loss": 0.492, "step": 428 }, { "epoch": 0.1287515006002401, "grad_norm": 0.11622516065835953, "learning_rate": 0.00012852852852852852, "loss": 0.5266, "step": 429 }, { "epoch": 0.1290516206482593, "grad_norm": 0.10531352460384369, "learning_rate": 0.00012882882882882883, "loss": 0.455, "step": 430 }, { "epoch": 0.1293517406962785, "grad_norm": 0.11804667860269547, "learning_rate": 0.00012912912912912915, "loss": 0.5124, "step": 431 }, { "epoch": 0.12965186074429771, "grad_norm": 0.1157531887292862, "learning_rate": 0.00012942942942942943, "loss": 0.5081, "step": 432 }, { "epoch": 0.12995198079231693, "grad_norm": 0.12014187127351761, "learning_rate": 0.00012972972972972974, "loss": 0.5014, "step": 433 }, { "epoch": 0.13025210084033614, "grad_norm": 0.11232215166091919, "learning_rate": 0.00013003003003003003, "loss": 0.4866, "step": 434 }, { "epoch": 0.13055222088835533, "grad_norm": 0.12502364814281464, "learning_rate": 0.00013033033033033032, "loss": 0.5622, "step": 435 }, { "epoch": 0.13085234093637454, "grad_norm": 0.10632560402154922, "learning_rate": 0.00013063063063063063, "loss": 0.446, "step": 436 }, { "epoch": 0.13115246098439376, "grad_norm": 0.12490659952163696, "learning_rate": 0.00013093093093093094, "loss": 0.4816, "step": 437 }, { "epoch": 0.13145258103241297, "grad_norm": 0.11873575299978256, "learning_rate": 0.00013123123123123126, "loss": 0.4863, "step": 438 }, { "epoch": 0.13175270108043216, "grad_norm": 0.11742313206195831, "learning_rate": 0.00013153153153153154, "loss": 0.5057, "step": 439 }, { "epoch": 0.13205282112845138, "grad_norm": 0.11808551102876663, "learning_rate": 0.00013183183183183183, "loss": 0.5425, "step": 440 }, { "epoch": 0.1323529411764706, "grad_norm": 0.12108122557401657, "learning_rate": 0.00013213213213213214, "loss": 0.505, "step": 441 }, { "epoch": 0.1326530612244898, "grad_norm": 0.11308103799819946, "learning_rate": 0.00013243243243243243, "loss": 0.4761, "step": 442 }, { "epoch": 0.132953181272509, "grad_norm": 0.11376778036355972, "learning_rate": 0.00013273273273273274, "loss": 0.4999, "step": 443 }, { "epoch": 0.1332533013205282, "grad_norm": 0.1098993644118309, "learning_rate": 0.00013303303303303305, "loss": 0.4636, "step": 444 }, { "epoch": 0.13355342136854742, "grad_norm": 0.12314711511135101, "learning_rate": 0.00013333333333333334, "loss": 0.4813, "step": 445 }, { "epoch": 0.13385354141656663, "grad_norm": 0.11199234426021576, "learning_rate": 0.00013363363363363363, "loss": 0.4767, "step": 446 }, { "epoch": 0.13415366146458582, "grad_norm": 0.11720983684062958, "learning_rate": 0.00013393393393393394, "loss": 0.4833, "step": 447 }, { "epoch": 0.13445378151260504, "grad_norm": 0.11430398374795914, "learning_rate": 0.00013423423423423425, "loss": 0.449, "step": 448 }, { "epoch": 0.13475390156062425, "grad_norm": 0.2317732721567154, "learning_rate": 0.00013453453453453454, "loss": 0.5283, "step": 449 }, { "epoch": 0.13505402160864347, "grad_norm": 0.11623187363147736, "learning_rate": 0.00013483483483483482, "loss": 0.4559, "step": 450 }, { "epoch": 0.13535414165666265, "grad_norm": 0.12079429626464844, "learning_rate": 0.00013513513513513514, "loss": 0.5142, "step": 451 }, { "epoch": 0.13565426170468187, "grad_norm": 0.1218424141407013, "learning_rate": 0.00013543543543543545, "loss": 0.4925, "step": 452 }, { "epoch": 0.13595438175270108, "grad_norm": 0.11799100786447525, "learning_rate": 0.00013573573573573574, "loss": 0.5254, "step": 453 }, { "epoch": 0.1362545018007203, "grad_norm": 0.11575998365879059, "learning_rate": 0.00013603603603603605, "loss": 0.504, "step": 454 }, { "epoch": 0.13655462184873948, "grad_norm": 0.697054386138916, "learning_rate": 0.00013633633633633634, "loss": 0.5308, "step": 455 }, { "epoch": 0.1368547418967587, "grad_norm": 0.11700598895549774, "learning_rate": 0.00013663663663663665, "loss": 0.4937, "step": 456 }, { "epoch": 0.1371548619447779, "grad_norm": 0.9261333346366882, "learning_rate": 0.00013693693693693693, "loss": 0.5606, "step": 457 }, { "epoch": 0.13745498199279713, "grad_norm": 0.13678379356861115, "learning_rate": 0.00013723723723723725, "loss": 0.558, "step": 458 }, { "epoch": 0.1377551020408163, "grad_norm": 0.13666567206382751, "learning_rate": 0.00013753753753753756, "loss": 0.4878, "step": 459 }, { "epoch": 0.13805522208883553, "grad_norm": 0.14556999504566193, "learning_rate": 0.00013783783783783785, "loss": 0.497, "step": 460 }, { "epoch": 0.13835534213685474, "grad_norm": 0.1318492740392685, "learning_rate": 0.00013813813813813813, "loss": 0.5445, "step": 461 }, { "epoch": 0.13865546218487396, "grad_norm": 0.1597602367401123, "learning_rate": 0.00013843843843843845, "loss": 0.4491, "step": 462 }, { "epoch": 0.13895558223289317, "grad_norm": 0.12518081068992615, "learning_rate": 0.00013873873873873876, "loss": 0.5233, "step": 463 }, { "epoch": 0.13925570228091236, "grad_norm": 0.20594088733196259, "learning_rate": 0.00013903903903903905, "loss": 0.5249, "step": 464 }, { "epoch": 0.13955582232893157, "grad_norm": 0.1464836448431015, "learning_rate": 0.00013933933933933933, "loss": 0.4862, "step": 465 }, { "epoch": 0.1398559423769508, "grad_norm": 0.1688799411058426, "learning_rate": 0.00013963963963963964, "loss": 0.5128, "step": 466 }, { "epoch": 0.14015606242497, "grad_norm": 0.1309269368648529, "learning_rate": 0.00013993993993993996, "loss": 0.5293, "step": 467 }, { "epoch": 0.1404561824729892, "grad_norm": 0.18867067992687225, "learning_rate": 0.00014024024024024024, "loss": 0.4913, "step": 468 }, { "epoch": 0.1407563025210084, "grad_norm": 0.1426246464252472, "learning_rate": 0.00014054054054054056, "loss": 0.5017, "step": 469 }, { "epoch": 0.14105642256902762, "grad_norm": 0.12447234243154526, "learning_rate": 0.00014084084084084084, "loss": 0.4744, "step": 470 }, { "epoch": 0.14135654261704683, "grad_norm": 0.1284492313861847, "learning_rate": 0.00014114114114114113, "loss": 0.4824, "step": 471 }, { "epoch": 0.14165666266506602, "grad_norm": 0.13119681179523468, "learning_rate": 0.00014144144144144144, "loss": 0.534, "step": 472 }, { "epoch": 0.14195678271308523, "grad_norm": 0.17826277017593384, "learning_rate": 0.00014174174174174176, "loss": 0.5433, "step": 473 }, { "epoch": 0.14225690276110445, "grad_norm": 0.14619775116443634, "learning_rate": 0.00014204204204204207, "loss": 0.5719, "step": 474 }, { "epoch": 0.14255702280912366, "grad_norm": 0.13723814487457275, "learning_rate": 0.00014234234234234233, "loss": 0.4814, "step": 475 }, { "epoch": 0.14285714285714285, "grad_norm": 0.1651460975408554, "learning_rate": 0.00014264264264264264, "loss": 0.5306, "step": 476 }, { "epoch": 0.14315726290516206, "grad_norm": 0.1289675384759903, "learning_rate": 0.00014294294294294295, "loss": 0.5254, "step": 477 }, { "epoch": 0.14345738295318128, "grad_norm": 0.15384627878665924, "learning_rate": 0.00014324324324324324, "loss": 0.5559, "step": 478 }, { "epoch": 0.1437575030012005, "grad_norm": 0.14855274558067322, "learning_rate": 0.00014354354354354355, "loss": 0.4941, "step": 479 }, { "epoch": 0.14405762304921968, "grad_norm": 0.1353788524866104, "learning_rate": 0.00014384384384384387, "loss": 0.4631, "step": 480 }, { "epoch": 0.1443577430972389, "grad_norm": 0.12929096817970276, "learning_rate": 0.00014414414414414415, "loss": 0.4867, "step": 481 }, { "epoch": 0.1446578631452581, "grad_norm": 0.13982702791690826, "learning_rate": 0.00014444444444444444, "loss": 0.5191, "step": 482 }, { "epoch": 0.14495798319327732, "grad_norm": 0.12878680229187012, "learning_rate": 0.00014474474474474475, "loss": 0.4988, "step": 483 }, { "epoch": 0.1452581032412965, "grad_norm": 0.11960715055465698, "learning_rate": 0.00014504504504504506, "loss": 0.4968, "step": 484 }, { "epoch": 0.14555822328931572, "grad_norm": 0.1115783229470253, "learning_rate": 0.00014534534534534535, "loss": 0.4758, "step": 485 }, { "epoch": 0.14585834333733494, "grad_norm": 0.1291203647851944, "learning_rate": 0.00014564564564564564, "loss": 0.4929, "step": 486 }, { "epoch": 0.14615846338535415, "grad_norm": 0.10866429656744003, "learning_rate": 0.00014594594594594595, "loss": 0.4278, "step": 487 }, { "epoch": 0.14645858343337334, "grad_norm": 0.1162528246641159, "learning_rate": 0.00014624624624624626, "loss": 0.4836, "step": 488 }, { "epoch": 0.14675870348139256, "grad_norm": 0.12269286066293716, "learning_rate": 0.00014654654654654655, "loss": 0.5048, "step": 489 }, { "epoch": 0.14705882352941177, "grad_norm": 0.11308015137910843, "learning_rate": 0.00014684684684684686, "loss": 0.4922, "step": 490 }, { "epoch": 0.14735894357743098, "grad_norm": 0.11391475796699524, "learning_rate": 0.00014714714714714715, "loss": 0.5073, "step": 491 }, { "epoch": 0.14765906362545017, "grad_norm": 0.1623706966638565, "learning_rate": 0.00014744744744744746, "loss": 0.486, "step": 492 }, { "epoch": 0.14795918367346939, "grad_norm": 0.12103123962879181, "learning_rate": 0.00014774774774774775, "loss": 0.5211, "step": 493 }, { "epoch": 0.1482593037214886, "grad_norm": 0.1285637617111206, "learning_rate": 0.00014804804804804806, "loss": 0.4595, "step": 494 }, { "epoch": 0.14855942376950781, "grad_norm": 0.13039712607860565, "learning_rate": 0.00014834834834834837, "loss": 0.5023, "step": 495 }, { "epoch": 0.148859543817527, "grad_norm": 0.11426712572574615, "learning_rate": 0.00014864864864864866, "loss": 0.4792, "step": 496 }, { "epoch": 0.14915966386554622, "grad_norm": 0.11875149607658386, "learning_rate": 0.00014894894894894895, "loss": 0.5282, "step": 497 }, { "epoch": 0.14945978391356543, "grad_norm": 0.12333963811397552, "learning_rate": 0.00014924924924924926, "loss": 0.5092, "step": 498 }, { "epoch": 0.14975990396158465, "grad_norm": 0.10604982823133469, "learning_rate": 0.00014954954954954957, "loss": 0.4027, "step": 499 }, { "epoch": 0.15006002400960383, "grad_norm": 0.11714020371437073, "learning_rate": 0.00014984984984984986, "loss": 0.5226, "step": 500 }, { "epoch": 0.15036014405762305, "grad_norm": 0.10923773050308228, "learning_rate": 0.00015015015015015014, "loss": 0.4432, "step": 501 }, { "epoch": 0.15066026410564226, "grad_norm": 0.17080309987068176, "learning_rate": 0.00015045045045045046, "loss": 0.5269, "step": 502 }, { "epoch": 0.15096038415366148, "grad_norm": 0.12028209120035172, "learning_rate": 0.00015075075075075077, "loss": 0.4871, "step": 503 }, { "epoch": 0.15126050420168066, "grad_norm": 0.12861275672912598, "learning_rate": 0.00015105105105105106, "loss": 0.4997, "step": 504 }, { "epoch": 0.15156062424969988, "grad_norm": 0.11345670372247696, "learning_rate": 0.00015135135135135137, "loss": 0.4506, "step": 505 }, { "epoch": 0.1518607442977191, "grad_norm": 0.12062987685203552, "learning_rate": 0.00015165165165165165, "loss": 0.5536, "step": 506 }, { "epoch": 0.1521608643457383, "grad_norm": 0.11072537302970886, "learning_rate": 0.00015195195195195194, "loss": 0.4368, "step": 507 }, { "epoch": 0.1524609843937575, "grad_norm": 0.11545488238334656, "learning_rate": 0.00015225225225225225, "loss": 0.4591, "step": 508 }, { "epoch": 0.1527611044417767, "grad_norm": 0.10316600650548935, "learning_rate": 0.00015255255255255257, "loss": 0.4261, "step": 509 }, { "epoch": 0.15306122448979592, "grad_norm": 0.11584563553333282, "learning_rate": 0.00015285285285285288, "loss": 0.5289, "step": 510 }, { "epoch": 0.15336134453781514, "grad_norm": 0.11215624213218689, "learning_rate": 0.00015315315315315314, "loss": 0.4747, "step": 511 }, { "epoch": 0.15366146458583432, "grad_norm": 0.10986870527267456, "learning_rate": 0.00015345345345345345, "loss": 0.4486, "step": 512 }, { "epoch": 0.15396158463385354, "grad_norm": 0.14615389704704285, "learning_rate": 0.00015375375375375377, "loss": 0.4973, "step": 513 }, { "epoch": 0.15426170468187275, "grad_norm": 0.11121159791946411, "learning_rate": 0.00015405405405405405, "loss": 0.4692, "step": 514 }, { "epoch": 0.15456182472989197, "grad_norm": 0.1426669806241989, "learning_rate": 0.00015435435435435436, "loss": 0.4925, "step": 515 }, { "epoch": 0.15486194477791115, "grad_norm": 0.1225138008594513, "learning_rate": 0.00015465465465465465, "loss": 0.4686, "step": 516 }, { "epoch": 0.15516206482593037, "grad_norm": 0.11575968563556671, "learning_rate": 0.00015495495495495496, "loss": 0.4677, "step": 517 }, { "epoch": 0.15546218487394958, "grad_norm": 0.1262788027524948, "learning_rate": 0.00015525525525525525, "loss": 0.4947, "step": 518 }, { "epoch": 0.1557623049219688, "grad_norm": 0.11267603933811188, "learning_rate": 0.00015555555555555556, "loss": 0.5223, "step": 519 }, { "epoch": 0.15606242496998798, "grad_norm": 0.11654146015644073, "learning_rate": 0.00015585585585585588, "loss": 0.5202, "step": 520 }, { "epoch": 0.1563625450180072, "grad_norm": 0.11901742964982986, "learning_rate": 0.00015615615615615616, "loss": 0.5293, "step": 521 }, { "epoch": 0.1566626650660264, "grad_norm": 0.12037361413240433, "learning_rate": 0.00015645645645645645, "loss": 0.5164, "step": 522 }, { "epoch": 0.15696278511404563, "grad_norm": 0.11640309542417526, "learning_rate": 0.00015675675675675676, "loss": 0.4944, "step": 523 }, { "epoch": 0.15726290516206481, "grad_norm": 0.1187070906162262, "learning_rate": 0.00015705705705705707, "loss": 0.5435, "step": 524 }, { "epoch": 0.15756302521008403, "grad_norm": 0.11369931697845459, "learning_rate": 0.00015735735735735736, "loss": 0.4934, "step": 525 }, { "epoch": 0.15786314525810324, "grad_norm": 0.1534053087234497, "learning_rate": 0.00015765765765765767, "loss": 0.4869, "step": 526 }, { "epoch": 0.15816326530612246, "grad_norm": 0.11573463678359985, "learning_rate": 0.00015795795795795796, "loss": 0.5013, "step": 527 }, { "epoch": 0.15846338535414164, "grad_norm": 0.1068471297621727, "learning_rate": 0.00015825825825825827, "loss": 0.4565, "step": 528 }, { "epoch": 0.15876350540216086, "grad_norm": 0.12096768617630005, "learning_rate": 0.00015855855855855856, "loss": 0.54, "step": 529 }, { "epoch": 0.15906362545018007, "grad_norm": 0.17728646099567413, "learning_rate": 0.00015885885885885887, "loss": 0.526, "step": 530 }, { "epoch": 0.1593637454981993, "grad_norm": 0.11367742717266083, "learning_rate": 0.00015915915915915919, "loss": 0.5088, "step": 531 }, { "epoch": 0.15966386554621848, "grad_norm": 0.12228825688362122, "learning_rate": 0.00015945945945945947, "loss": 0.4922, "step": 532 }, { "epoch": 0.1599639855942377, "grad_norm": 0.12022542208433151, "learning_rate": 0.00015975975975975976, "loss": 0.5108, "step": 533 }, { "epoch": 0.1602641056422569, "grad_norm": 0.11787454783916473, "learning_rate": 0.00016006006006006007, "loss": 0.5062, "step": 534 }, { "epoch": 0.16056422569027612, "grad_norm": 0.1120874211192131, "learning_rate": 0.00016036036036036038, "loss": 0.4568, "step": 535 }, { "epoch": 0.1608643457382953, "grad_norm": 0.10945652425289154, "learning_rate": 0.00016066066066066067, "loss": 0.4865, "step": 536 }, { "epoch": 0.16116446578631452, "grad_norm": 0.11467491835355759, "learning_rate": 0.00016096096096096096, "loss": 0.5116, "step": 537 }, { "epoch": 0.16146458583433373, "grad_norm": 0.14926712214946747, "learning_rate": 0.00016126126126126127, "loss": 0.5284, "step": 538 }, { "epoch": 0.16176470588235295, "grad_norm": 0.10946252942085266, "learning_rate": 0.00016156156156156158, "loss": 0.4367, "step": 539 }, { "epoch": 0.16206482593037214, "grad_norm": 0.10662908852100372, "learning_rate": 0.00016186186186186187, "loss": 0.4311, "step": 540 }, { "epoch": 0.16236494597839135, "grad_norm": 0.11896120011806488, "learning_rate": 0.00016216216216216218, "loss": 0.5218, "step": 541 }, { "epoch": 0.16266506602641057, "grad_norm": 0.13229900598526, "learning_rate": 0.00016246246246246247, "loss": 0.4867, "step": 542 }, { "epoch": 0.16296518607442978, "grad_norm": 0.11431475728750229, "learning_rate": 0.00016276276276276275, "loss": 0.4592, "step": 543 }, { "epoch": 0.16326530612244897, "grad_norm": 0.15720653533935547, "learning_rate": 0.00016306306306306307, "loss": 0.519, "step": 544 }, { "epoch": 0.16356542617046818, "grad_norm": 0.11554212868213654, "learning_rate": 0.00016336336336336338, "loss": 0.5232, "step": 545 }, { "epoch": 0.1638655462184874, "grad_norm": 0.10940536856651306, "learning_rate": 0.0001636636636636637, "loss": 0.4515, "step": 546 }, { "epoch": 0.1641656662665066, "grad_norm": 0.11184670031070709, "learning_rate": 0.00016396396396396395, "loss": 0.477, "step": 547 }, { "epoch": 0.1644657863145258, "grad_norm": 0.13043731451034546, "learning_rate": 0.00016426426426426426, "loss": 0.5079, "step": 548 }, { "epoch": 0.164765906362545, "grad_norm": 0.1191568523645401, "learning_rate": 0.00016456456456456458, "loss": 0.4987, "step": 549 }, { "epoch": 0.16506602641056423, "grad_norm": 0.10903244465589523, "learning_rate": 0.00016486486486486486, "loss": 0.4772, "step": 550 }, { "epoch": 0.16536614645858344, "grad_norm": 0.15818609297275543, "learning_rate": 0.00016516516516516518, "loss": 0.5382, "step": 551 }, { "epoch": 0.16566626650660263, "grad_norm": 0.11612758785486221, "learning_rate": 0.00016546546546546546, "loss": 0.5043, "step": 552 }, { "epoch": 0.16596638655462184, "grad_norm": 0.11419732123613358, "learning_rate": 0.00016576576576576578, "loss": 0.4823, "step": 553 }, { "epoch": 0.16626650660264106, "grad_norm": 0.1097126379609108, "learning_rate": 0.00016606606606606606, "loss": 0.4657, "step": 554 }, { "epoch": 0.16656662665066027, "grad_norm": 0.11375732719898224, "learning_rate": 0.00016636636636636638, "loss": 0.4976, "step": 555 }, { "epoch": 0.16686674669867949, "grad_norm": 0.14020946621894836, "learning_rate": 0.0001666666666666667, "loss": 0.5031, "step": 556 }, { "epoch": 0.16716686674669867, "grad_norm": 0.11537282168865204, "learning_rate": 0.00016696696696696697, "loss": 0.5307, "step": 557 }, { "epoch": 0.1674669867947179, "grad_norm": 0.11510564386844635, "learning_rate": 0.00016726726726726726, "loss": 0.4818, "step": 558 }, { "epoch": 0.1677671068427371, "grad_norm": 0.14965474605560303, "learning_rate": 0.00016756756756756757, "loss": 0.4733, "step": 559 }, { "epoch": 0.16806722689075632, "grad_norm": 0.11072426289319992, "learning_rate": 0.0001678678678678679, "loss": 0.4945, "step": 560 }, { "epoch": 0.1683673469387755, "grad_norm": 0.13948391377925873, "learning_rate": 0.00016816816816816817, "loss": 0.5384, "step": 561 }, { "epoch": 0.16866746698679472, "grad_norm": 0.12210942059755325, "learning_rate": 0.00016846846846846846, "loss": 0.5396, "step": 562 }, { "epoch": 0.16896758703481393, "grad_norm": 0.1163802221417427, "learning_rate": 0.00016876876876876877, "loss": 0.5014, "step": 563 }, { "epoch": 0.16926770708283315, "grad_norm": 0.15363533794879913, "learning_rate": 0.00016906906906906908, "loss": 0.4728, "step": 564 }, { "epoch": 0.16956782713085233, "grad_norm": 0.13578364253044128, "learning_rate": 0.00016936936936936937, "loss": 0.473, "step": 565 }, { "epoch": 0.16986794717887155, "grad_norm": 0.11850472539663315, "learning_rate": 0.00016966966966966968, "loss": 0.5105, "step": 566 }, { "epoch": 0.17016806722689076, "grad_norm": 0.11600814759731293, "learning_rate": 0.00016996996996997, "loss": 0.5138, "step": 567 }, { "epoch": 0.17046818727490998, "grad_norm": 0.11694512516260147, "learning_rate": 0.00017027027027027028, "loss": 0.4818, "step": 568 }, { "epoch": 0.17076830732292916, "grad_norm": 0.1589028388261795, "learning_rate": 0.00017057057057057057, "loss": 0.5286, "step": 569 }, { "epoch": 0.17106842737094838, "grad_norm": 0.11037249118089676, "learning_rate": 0.00017087087087087088, "loss": 0.5065, "step": 570 }, { "epoch": 0.1713685474189676, "grad_norm": 0.11246825754642487, "learning_rate": 0.0001711711711711712, "loss": 0.4572, "step": 571 }, { "epoch": 0.1716686674669868, "grad_norm": 0.12457087635993958, "learning_rate": 0.00017147147147147148, "loss": 0.5341, "step": 572 }, { "epoch": 0.171968787515006, "grad_norm": 0.11229050159454346, "learning_rate": 0.00017177177177177177, "loss": 0.4152, "step": 573 }, { "epoch": 0.1722689075630252, "grad_norm": 0.10759669542312622, "learning_rate": 0.00017207207207207208, "loss": 0.4719, "step": 574 }, { "epoch": 0.17256902761104442, "grad_norm": 0.1949225217103958, "learning_rate": 0.0001723723723723724, "loss": 0.4645, "step": 575 }, { "epoch": 0.17286914765906364, "grad_norm": 0.11539410054683685, "learning_rate": 0.00017267267267267268, "loss": 0.5257, "step": 576 }, { "epoch": 0.17316926770708282, "grad_norm": 0.11130233108997345, "learning_rate": 0.000172972972972973, "loss": 0.4651, "step": 577 }, { "epoch": 0.17346938775510204, "grad_norm": 0.11778189241886139, "learning_rate": 0.00017327327327327328, "loss": 0.5365, "step": 578 }, { "epoch": 0.17376950780312125, "grad_norm": 0.11627792567014694, "learning_rate": 0.00017357357357357357, "loss": 0.5052, "step": 579 }, { "epoch": 0.17406962785114047, "grad_norm": 0.13903485238552094, "learning_rate": 0.00017387387387387388, "loss": 0.5194, "step": 580 }, { "epoch": 0.17436974789915966, "grad_norm": 0.12274301797151566, "learning_rate": 0.0001741741741741742, "loss": 0.506, "step": 581 }, { "epoch": 0.17466986794717887, "grad_norm": 0.11964797973632812, "learning_rate": 0.0001744744744744745, "loss": 0.4422, "step": 582 }, { "epoch": 0.17496998799519808, "grad_norm": 0.11528735607862473, "learning_rate": 0.00017477477477477476, "loss": 0.5045, "step": 583 }, { "epoch": 0.1752701080432173, "grad_norm": 0.12178003042936325, "learning_rate": 0.00017507507507507508, "loss": 0.4316, "step": 584 }, { "epoch": 0.17557022809123649, "grad_norm": 0.10782955586910248, "learning_rate": 0.0001753753753753754, "loss": 0.4541, "step": 585 }, { "epoch": 0.1758703481392557, "grad_norm": 0.11372298747301102, "learning_rate": 0.00017567567567567568, "loss": 0.4489, "step": 586 }, { "epoch": 0.17617046818727491, "grad_norm": 0.16553634405136108, "learning_rate": 0.000175975975975976, "loss": 0.4732, "step": 587 }, { "epoch": 0.17647058823529413, "grad_norm": 0.12725332379341125, "learning_rate": 0.00017627627627627627, "loss": 0.5046, "step": 588 }, { "epoch": 0.17677070828331332, "grad_norm": 0.11720646917819977, "learning_rate": 0.0001765765765765766, "loss": 0.5264, "step": 589 }, { "epoch": 0.17707082833133253, "grad_norm": 0.11271098256111145, "learning_rate": 0.00017687687687687687, "loss": 0.4926, "step": 590 }, { "epoch": 0.17737094837935174, "grad_norm": 0.12022582441568375, "learning_rate": 0.0001771771771771772, "loss": 0.5234, "step": 591 }, { "epoch": 0.17767106842737096, "grad_norm": 0.12866370379924774, "learning_rate": 0.0001774774774774775, "loss": 0.4921, "step": 592 }, { "epoch": 0.17797118847539015, "grad_norm": 0.11018224060535431, "learning_rate": 0.00017777777777777779, "loss": 0.4588, "step": 593 }, { "epoch": 0.17827130852340936, "grad_norm": 0.12001125514507294, "learning_rate": 0.00017807807807807807, "loss": 0.5136, "step": 594 }, { "epoch": 0.17857142857142858, "grad_norm": 0.12296062707901001, "learning_rate": 0.00017837837837837839, "loss": 0.4919, "step": 595 }, { "epoch": 0.1788715486194478, "grad_norm": 0.11281454563140869, "learning_rate": 0.0001786786786786787, "loss": 0.494, "step": 596 }, { "epoch": 0.17917166866746698, "grad_norm": 0.10556471347808838, "learning_rate": 0.00017897897897897898, "loss": 0.4229, "step": 597 }, { "epoch": 0.1794717887154862, "grad_norm": 0.1097581535577774, "learning_rate": 0.00017927927927927927, "loss": 0.4967, "step": 598 }, { "epoch": 0.1797719087635054, "grad_norm": 0.10584639757871628, "learning_rate": 0.00017957957957957958, "loss": 0.4255, "step": 599 }, { "epoch": 0.18007202881152462, "grad_norm": 0.11400244385004044, "learning_rate": 0.0001798798798798799, "loss": 0.4282, "step": 600 }, { "epoch": 0.1803721488595438, "grad_norm": 0.11473983526229858, "learning_rate": 0.00018018018018018018, "loss": 0.557, "step": 601 }, { "epoch": 0.18067226890756302, "grad_norm": 0.24360248446464539, "learning_rate": 0.0001804804804804805, "loss": 0.4693, "step": 602 }, { "epoch": 0.18097238895558224, "grad_norm": 0.11157704144716263, "learning_rate": 0.00018078078078078078, "loss": 0.4937, "step": 603 }, { "epoch": 0.18127250900360145, "grad_norm": 0.12321450561285019, "learning_rate": 0.0001810810810810811, "loss": 0.5592, "step": 604 }, { "epoch": 0.18157262905162064, "grad_norm": 0.11670718342065811, "learning_rate": 0.00018138138138138138, "loss": 0.507, "step": 605 }, { "epoch": 0.18187274909963985, "grad_norm": 0.10992065072059631, "learning_rate": 0.0001816816816816817, "loss": 0.4762, "step": 606 }, { "epoch": 0.18217286914765907, "grad_norm": 0.11405187100172043, "learning_rate": 0.000181981981981982, "loss": 0.4785, "step": 607 }, { "epoch": 0.18247298919567828, "grad_norm": 0.11776269227266312, "learning_rate": 0.0001822822822822823, "loss": 0.438, "step": 608 }, { "epoch": 0.18277310924369747, "grad_norm": 0.29667896032333374, "learning_rate": 0.00018258258258258258, "loss": 0.4869, "step": 609 }, { "epoch": 0.18307322929171668, "grad_norm": 0.11483728140592575, "learning_rate": 0.0001828828828828829, "loss": 0.5124, "step": 610 }, { "epoch": 0.1833733493397359, "grad_norm": 0.15464454889297485, "learning_rate": 0.0001831831831831832, "loss": 0.49, "step": 611 }, { "epoch": 0.1836734693877551, "grad_norm": 0.1975976973772049, "learning_rate": 0.0001834834834834835, "loss": 0.4841, "step": 612 }, { "epoch": 0.1839735894357743, "grad_norm": 0.18922753632068634, "learning_rate": 0.0001837837837837838, "loss": 0.461, "step": 613 }, { "epoch": 0.1842737094837935, "grad_norm": 0.11498506367206573, "learning_rate": 0.0001840840840840841, "loss": 0.4768, "step": 614 }, { "epoch": 0.18457382953181273, "grad_norm": 0.11605051159858704, "learning_rate": 0.00018438438438438438, "loss": 0.4853, "step": 615 }, { "epoch": 0.18487394957983194, "grad_norm": 0.13284830749034882, "learning_rate": 0.0001846846846846847, "loss": 0.521, "step": 616 }, { "epoch": 0.18517406962785113, "grad_norm": 0.11343251913785934, "learning_rate": 0.000184984984984985, "loss": 0.4741, "step": 617 }, { "epoch": 0.18547418967587034, "grad_norm": 0.11391028016805649, "learning_rate": 0.00018528528528528532, "loss": 0.464, "step": 618 }, { "epoch": 0.18577430972388956, "grad_norm": 0.14338339865207672, "learning_rate": 0.00018558558558558558, "loss": 0.4896, "step": 619 }, { "epoch": 0.18607442977190877, "grad_norm": 0.11370062828063965, "learning_rate": 0.0001858858858858859, "loss": 0.4137, "step": 620 }, { "epoch": 0.18637454981992796, "grad_norm": 0.20215009152889252, "learning_rate": 0.0001861861861861862, "loss": 0.4753, "step": 621 }, { "epoch": 0.18667466986794717, "grad_norm": 0.11335323750972748, "learning_rate": 0.0001864864864864865, "loss": 0.484, "step": 622 }, { "epoch": 0.1869747899159664, "grad_norm": 0.12075243145227432, "learning_rate": 0.0001867867867867868, "loss": 0.5086, "step": 623 }, { "epoch": 0.1872749099639856, "grad_norm": 0.1137528046965599, "learning_rate": 0.0001870870870870871, "loss": 0.4862, "step": 624 }, { "epoch": 0.1875750300120048, "grad_norm": 0.11862170696258545, "learning_rate": 0.0001873873873873874, "loss": 0.4472, "step": 625 }, { "epoch": 0.187875150060024, "grad_norm": 0.10449140518903732, "learning_rate": 0.00018768768768768769, "loss": 0.4252, "step": 626 }, { "epoch": 0.18817527010804322, "grad_norm": 0.12479628622531891, "learning_rate": 0.000187987987987988, "loss": 0.5151, "step": 627 }, { "epoch": 0.18847539015606243, "grad_norm": 0.11015018075704575, "learning_rate": 0.0001882882882882883, "loss": 0.4688, "step": 628 }, { "epoch": 0.18877551020408162, "grad_norm": 0.11410848796367645, "learning_rate": 0.0001885885885885886, "loss": 0.4999, "step": 629 }, { "epoch": 0.18907563025210083, "grad_norm": 0.1426745504140854, "learning_rate": 0.00018888888888888888, "loss": 0.4098, "step": 630 }, { "epoch": 0.18937575030012005, "grad_norm": 0.10985606908798218, "learning_rate": 0.0001891891891891892, "loss": 0.4525, "step": 631 }, { "epoch": 0.18967587034813926, "grad_norm": 0.11787772178649902, "learning_rate": 0.0001894894894894895, "loss": 0.4897, "step": 632 }, { "epoch": 0.18997599039615845, "grad_norm": 0.12313798815011978, "learning_rate": 0.0001897897897897898, "loss": 0.5339, "step": 633 }, { "epoch": 0.19027611044417767, "grad_norm": 0.1817101240158081, "learning_rate": 0.00019009009009009008, "loss": 0.5494, "step": 634 }, { "epoch": 0.19057623049219688, "grad_norm": 0.11306577175855637, "learning_rate": 0.0001903903903903904, "loss": 0.4803, "step": 635 }, { "epoch": 0.1908763505402161, "grad_norm": 0.11045024544000626, "learning_rate": 0.0001906906906906907, "loss": 0.4431, "step": 636 }, { "epoch": 0.19117647058823528, "grad_norm": 0.11855772882699966, "learning_rate": 0.000190990990990991, "loss": 0.5276, "step": 637 }, { "epoch": 0.1914765906362545, "grad_norm": 0.2188035398721695, "learning_rate": 0.0001912912912912913, "loss": 0.4671, "step": 638 }, { "epoch": 0.1917767106842737, "grad_norm": 0.1268298178911209, "learning_rate": 0.0001915915915915916, "loss": 0.4684, "step": 639 }, { "epoch": 0.19207683073229292, "grad_norm": 0.11935406178236008, "learning_rate": 0.0001918918918918919, "loss": 0.4585, "step": 640 }, { "epoch": 0.1923769507803121, "grad_norm": 0.12008475512266159, "learning_rate": 0.0001921921921921922, "loss": 0.4586, "step": 641 }, { "epoch": 0.19267707082833133, "grad_norm": 0.1218239888548851, "learning_rate": 0.0001924924924924925, "loss": 0.5081, "step": 642 }, { "epoch": 0.19297719087635054, "grad_norm": 0.11091190576553345, "learning_rate": 0.00019279279279279282, "loss": 0.5079, "step": 643 }, { "epoch": 0.19327731092436976, "grad_norm": 0.1120649129152298, "learning_rate": 0.00019309309309309308, "loss": 0.512, "step": 644 }, { "epoch": 0.19357743097238894, "grad_norm": 0.12589308619499207, "learning_rate": 0.0001933933933933934, "loss": 0.5332, "step": 645 }, { "epoch": 0.19387755102040816, "grad_norm": 0.20976229012012482, "learning_rate": 0.0001936936936936937, "loss": 0.4552, "step": 646 }, { "epoch": 0.19417767106842737, "grad_norm": 0.10452635586261749, "learning_rate": 0.00019399399399399402, "loss": 0.4325, "step": 647 }, { "epoch": 0.19447779111644659, "grad_norm": 0.12973804771900177, "learning_rate": 0.0001942942942942943, "loss": 0.4784, "step": 648 }, { "epoch": 0.1947779111644658, "grad_norm": 0.11603761464357376, "learning_rate": 0.00019459459459459462, "loss": 0.4605, "step": 649 }, { "epoch": 0.195078031212485, "grad_norm": 0.11491604894399643, "learning_rate": 0.0001948948948948949, "loss": 0.4921, "step": 650 }, { "epoch": 0.1953781512605042, "grad_norm": 0.11520378291606903, "learning_rate": 0.0001951951951951952, "loss": 0.5079, "step": 651 }, { "epoch": 0.19567827130852342, "grad_norm": 0.16177918016910553, "learning_rate": 0.0001954954954954955, "loss": 0.5376, "step": 652 }, { "epoch": 0.19597839135654263, "grad_norm": 0.11226612329483032, "learning_rate": 0.00019579579579579582, "loss": 0.4742, "step": 653 }, { "epoch": 0.19627851140456182, "grad_norm": 0.11720920354127884, "learning_rate": 0.00019609609609609613, "loss": 0.4879, "step": 654 }, { "epoch": 0.19657863145258103, "grad_norm": 0.12089575827121735, "learning_rate": 0.0001963963963963964, "loss": 0.4753, "step": 655 }, { "epoch": 0.19687875150060025, "grad_norm": 0.11939114332199097, "learning_rate": 0.0001966966966966967, "loss": 0.51, "step": 656 }, { "epoch": 0.19717887154861946, "grad_norm": 0.11049570888280869, "learning_rate": 0.00019699699699699701, "loss": 0.4316, "step": 657 }, { "epoch": 0.19747899159663865, "grad_norm": 0.11864820122718811, "learning_rate": 0.0001972972972972973, "loss": 0.5286, "step": 658 }, { "epoch": 0.19777911164465786, "grad_norm": 0.13813814520835876, "learning_rate": 0.0001975975975975976, "loss": 0.4973, "step": 659 }, { "epoch": 0.19807923169267708, "grad_norm": 0.10893283039331436, "learning_rate": 0.0001978978978978979, "loss": 0.4862, "step": 660 }, { "epoch": 0.1983793517406963, "grad_norm": 0.11592289805412292, "learning_rate": 0.0001981981981981982, "loss": 0.4995, "step": 661 }, { "epoch": 0.19867947178871548, "grad_norm": 0.10914364457130432, "learning_rate": 0.0001984984984984985, "loss": 0.4387, "step": 662 }, { "epoch": 0.1989795918367347, "grad_norm": 0.11548639833927155, "learning_rate": 0.0001987987987987988, "loss": 0.4889, "step": 663 }, { "epoch": 0.1992797118847539, "grad_norm": 0.1200682744383812, "learning_rate": 0.00019909909909909912, "loss": 0.5314, "step": 664 }, { "epoch": 0.19957983193277312, "grad_norm": 0.11693330109119415, "learning_rate": 0.0001993993993993994, "loss": 0.4713, "step": 665 }, { "epoch": 0.1998799519807923, "grad_norm": 0.11875863373279572, "learning_rate": 0.0001996996996996997, "loss": 0.494, "step": 666 }, { "epoch": 0.20018007202881152, "grad_norm": 0.11220604181289673, "learning_rate": 0.0002, "loss": 0.429, "step": 667 }, { "epoch": 0.20048019207683074, "grad_norm": 0.11995385587215424, "learning_rate": 0.00019999998628307335, "loss": 0.4765, "step": 668 }, { "epoch": 0.20078031212484995, "grad_norm": 0.14676910638809204, "learning_rate": 0.0001999999451322971, "loss": 0.5261, "step": 669 }, { "epoch": 0.20108043217286914, "grad_norm": 0.12924613058567047, "learning_rate": 0.00019999987654768255, "loss": 0.4744, "step": 670 }, { "epoch": 0.20138055222088835, "grad_norm": 0.34157878160476685, "learning_rate": 0.00019999978052924851, "loss": 0.4935, "step": 671 }, { "epoch": 0.20168067226890757, "grad_norm": 0.11561044305562973, "learning_rate": 0.0001999996570770214, "loss": 0.4938, "step": 672 }, { "epoch": 0.20198079231692678, "grad_norm": 0.12380159646272659, "learning_rate": 0.000199999506191035, "loss": 0.5206, "step": 673 }, { "epoch": 0.20228091236494597, "grad_norm": 0.12147749960422516, "learning_rate": 0.00019999932787133072, "loss": 0.419, "step": 674 }, { "epoch": 0.20258103241296518, "grad_norm": 0.12487666308879852, "learning_rate": 0.00019999912211795748, "loss": 0.482, "step": 675 }, { "epoch": 0.2028811524609844, "grad_norm": 0.11645001918077469, "learning_rate": 0.00019999888893097175, "loss": 0.495, "step": 676 }, { "epoch": 0.2031812725090036, "grad_norm": 0.14186392724514008, "learning_rate": 0.00019999862831043748, "loss": 0.4577, "step": 677 }, { "epoch": 0.2034813925570228, "grad_norm": 0.11139611899852753, "learning_rate": 0.00019999834025642618, "loss": 0.4751, "step": 678 }, { "epoch": 0.20378151260504201, "grad_norm": 0.1277855485677719, "learning_rate": 0.00019999802476901687, "loss": 0.5092, "step": 679 }, { "epoch": 0.20408163265306123, "grad_norm": 0.1118745356798172, "learning_rate": 0.0001999976818482961, "loss": 0.4756, "step": 680 }, { "epoch": 0.20438175270108044, "grad_norm": 0.12182165682315826, "learning_rate": 0.00019999731149435794, "loss": 0.4777, "step": 681 }, { "epoch": 0.20468187274909963, "grad_norm": 0.1322716474533081, "learning_rate": 0.00019999691370730402, "loss": 0.4691, "step": 682 }, { "epoch": 0.20498199279711884, "grad_norm": 0.12655936181545258, "learning_rate": 0.00019999648848724344, "loss": 0.4575, "step": 683 }, { "epoch": 0.20528211284513806, "grad_norm": 0.10949641466140747, "learning_rate": 0.00019999603583429284, "loss": 0.4587, "step": 684 }, { "epoch": 0.20558223289315727, "grad_norm": 0.11819802969694138, "learning_rate": 0.00019999555574857646, "loss": 0.5044, "step": 685 }, { "epoch": 0.20588235294117646, "grad_norm": 0.12319862842559814, "learning_rate": 0.00019999504823022592, "loss": 0.5049, "step": 686 }, { "epoch": 0.20618247298919568, "grad_norm": 0.12791141867637634, "learning_rate": 0.00019999451327938053, "loss": 0.5069, "step": 687 }, { "epoch": 0.2064825930372149, "grad_norm": 0.11336719244718552, "learning_rate": 0.00019999395089618702, "loss": 0.4507, "step": 688 }, { "epoch": 0.2067827130852341, "grad_norm": 0.15613985061645508, "learning_rate": 0.00019999336108079968, "loss": 0.498, "step": 689 }, { "epoch": 0.2070828331332533, "grad_norm": 0.11534976214170456, "learning_rate": 0.00019999274383338027, "loss": 0.4632, "step": 690 }, { "epoch": 0.2073829531812725, "grad_norm": 0.12534739077091217, "learning_rate": 0.0001999920991540982, "loss": 0.472, "step": 691 }, { "epoch": 0.20768307322929172, "grad_norm": 0.11476431041955948, "learning_rate": 0.00019999142704313027, "loss": 0.4315, "step": 692 }, { "epoch": 0.20798319327731093, "grad_norm": 0.13909682631492615, "learning_rate": 0.0001999907275006609, "loss": 0.4957, "step": 693 }, { "epoch": 0.20828331332533012, "grad_norm": 0.11797010153532028, "learning_rate": 0.00019999000052688202, "loss": 0.4987, "step": 694 }, { "epoch": 0.20858343337334934, "grad_norm": 0.12070614844560623, "learning_rate": 0.000199989246121993, "loss": 0.536, "step": 695 }, { "epoch": 0.20888355342136855, "grad_norm": 0.13717220723628998, "learning_rate": 0.00019998846428620089, "loss": 0.4716, "step": 696 }, { "epoch": 0.20918367346938777, "grad_norm": 0.11131128668785095, "learning_rate": 0.00019998765501972007, "loss": 0.47, "step": 697 }, { "epoch": 0.20948379351740695, "grad_norm": 0.1145501509308815, "learning_rate": 0.00019998681832277267, "loss": 0.4592, "step": 698 }, { "epoch": 0.20978391356542617, "grad_norm": 0.1326148509979248, "learning_rate": 0.00019998595419558812, "loss": 0.487, "step": 699 }, { "epoch": 0.21008403361344538, "grad_norm": 0.1202148050069809, "learning_rate": 0.00019998506263840354, "loss": 0.4405, "step": 700 }, { "epoch": 0.2103841536614646, "grad_norm": 0.12064465880393982, "learning_rate": 0.00019998414365146353, "loss": 0.4919, "step": 701 }, { "epoch": 0.21068427370948378, "grad_norm": 0.19440989196300507, "learning_rate": 0.00019998319723502019, "loss": 0.4824, "step": 702 }, { "epoch": 0.210984393757503, "grad_norm": 0.12534023821353912, "learning_rate": 0.00019998222338933315, "loss": 0.534, "step": 703 }, { "epoch": 0.2112845138055222, "grad_norm": 0.13558034598827362, "learning_rate": 0.00019998122211466957, "loss": 0.5248, "step": 704 }, { "epoch": 0.21158463385354143, "grad_norm": 0.2575712502002716, "learning_rate": 0.00019998019341130416, "loss": 0.4579, "step": 705 }, { "epoch": 0.2118847539015606, "grad_norm": 0.116855688393116, "learning_rate": 0.0001999791372795191, "loss": 0.4579, "step": 706 }, { "epoch": 0.21218487394957983, "grad_norm": 0.12518906593322754, "learning_rate": 0.00019997805371960417, "loss": 0.5, "step": 707 }, { "epoch": 0.21248499399759904, "grad_norm": 0.12143798172473907, "learning_rate": 0.00019997694273185662, "loss": 0.4403, "step": 708 }, { "epoch": 0.21278511404561826, "grad_norm": 0.11407876759767532, "learning_rate": 0.0001999758043165812, "loss": 0.4452, "step": 709 }, { "epoch": 0.21308523409363744, "grad_norm": 0.10889005661010742, "learning_rate": 0.00019997463847409023, "loss": 0.4205, "step": 710 }, { "epoch": 0.21338535414165666, "grad_norm": 0.12497064471244812, "learning_rate": 0.00019997344520470358, "loss": 0.446, "step": 711 }, { "epoch": 0.21368547418967587, "grad_norm": 0.14738810062408447, "learning_rate": 0.0001999722245087486, "loss": 0.4782, "step": 712 }, { "epoch": 0.2139855942376951, "grad_norm": 0.1568441241979599, "learning_rate": 0.00019997097638656014, "loss": 0.4735, "step": 713 }, { "epoch": 0.21428571428571427, "grad_norm": 0.13330377638339996, "learning_rate": 0.00019996970083848066, "loss": 0.5534, "step": 714 }, { "epoch": 0.2145858343337335, "grad_norm": 0.13056230545043945, "learning_rate": 0.00019996839786486006, "loss": 0.4948, "step": 715 }, { "epoch": 0.2148859543817527, "grad_norm": 0.1922270804643631, "learning_rate": 0.00019996706746605583, "loss": 0.5532, "step": 716 }, { "epoch": 0.21518607442977192, "grad_norm": 0.12903466820716858, "learning_rate": 0.00019996570964243287, "loss": 0.5108, "step": 717 }, { "epoch": 0.2154861944777911, "grad_norm": 0.11882127076387405, "learning_rate": 0.00019996432439436376, "loss": 0.5006, "step": 718 }, { "epoch": 0.21578631452581032, "grad_norm": 0.13730670511722565, "learning_rate": 0.00019996291172222848, "loss": 0.508, "step": 719 }, { "epoch": 0.21608643457382953, "grad_norm": 0.13259319961071014, "learning_rate": 0.00019996147162641464, "loss": 0.5398, "step": 720 }, { "epoch": 0.21638655462184875, "grad_norm": 0.2860109508037567, "learning_rate": 0.00019996000410731725, "loss": 0.5913, "step": 721 }, { "epoch": 0.21668667466986793, "grad_norm": 0.1916925013065338, "learning_rate": 0.00019995850916533896, "loss": 0.4511, "step": 722 }, { "epoch": 0.21698679471788715, "grad_norm": 0.31316134333610535, "learning_rate": 0.00019995698680088983, "loss": 0.5074, "step": 723 }, { "epoch": 0.21728691476590636, "grad_norm": 0.14046156406402588, "learning_rate": 0.00019995543701438757, "loss": 0.4805, "step": 724 }, { "epoch": 0.21758703481392558, "grad_norm": 0.11503440886735916, "learning_rate": 0.00019995385980625728, "loss": 0.4434, "step": 725 }, { "epoch": 0.21788715486194477, "grad_norm": 0.11567936092615128, "learning_rate": 0.00019995225517693174, "loss": 0.4499, "step": 726 }, { "epoch": 0.21818727490996398, "grad_norm": 0.11768448352813721, "learning_rate": 0.00019995062312685104, "loss": 0.4508, "step": 727 }, { "epoch": 0.2184873949579832, "grad_norm": 0.10805214941501617, "learning_rate": 0.000199948963656463, "loss": 0.4069, "step": 728 }, { "epoch": 0.2187875150060024, "grad_norm": 0.5363399982452393, "learning_rate": 0.00019994727676622286, "loss": 0.4972, "step": 729 }, { "epoch": 0.2190876350540216, "grad_norm": 0.11812508851289749, "learning_rate": 0.00019994556245659338, "loss": 0.4807, "step": 730 }, { "epoch": 0.2193877551020408, "grad_norm": 0.20644044876098633, "learning_rate": 0.00019994382072804489, "loss": 0.4612, "step": 731 }, { "epoch": 0.21968787515006002, "grad_norm": 0.11623464524745941, "learning_rate": 0.00019994205158105517, "loss": 0.4529, "step": 732 }, { "epoch": 0.21998799519807924, "grad_norm": 0.12570780515670776, "learning_rate": 0.00019994025501610962, "loss": 0.5192, "step": 733 }, { "epoch": 0.22028811524609843, "grad_norm": 0.12002886086702347, "learning_rate": 0.00019993843103370104, "loss": 0.4486, "step": 734 }, { "epoch": 0.22058823529411764, "grad_norm": 0.12708617746829987, "learning_rate": 0.0001999365796343299, "loss": 0.4994, "step": 735 }, { "epoch": 0.22088835534213686, "grad_norm": 0.14321328699588776, "learning_rate": 0.00019993470081850406, "loss": 0.5461, "step": 736 }, { "epoch": 0.22118847539015607, "grad_norm": 0.5809831619262695, "learning_rate": 0.00019993279458673896, "loss": 0.4908, "step": 737 }, { "epoch": 0.22148859543817526, "grad_norm": 0.12648674845695496, "learning_rate": 0.00019993086093955754, "loss": 0.5008, "step": 738 }, { "epoch": 0.22178871548619447, "grad_norm": 0.12926629185676575, "learning_rate": 0.00019992889987749033, "loss": 0.4538, "step": 739 }, { "epoch": 0.22208883553421369, "grad_norm": 0.13172459602355957, "learning_rate": 0.00019992691140107525, "loss": 0.4697, "step": 740 }, { "epoch": 0.2223889555822329, "grad_norm": 0.20475362241268158, "learning_rate": 0.00019992489551085783, "loss": 0.5028, "step": 741 }, { "epoch": 0.22268907563025211, "grad_norm": 0.11884354799985886, "learning_rate": 0.00019992285220739114, "loss": 0.4742, "step": 742 }, { "epoch": 0.2229891956782713, "grad_norm": 0.1247478723526001, "learning_rate": 0.0001999207814912357, "loss": 0.4895, "step": 743 }, { "epoch": 0.22328931572629052, "grad_norm": 0.13872456550598145, "learning_rate": 0.00019991868336295964, "loss": 0.4687, "step": 744 }, { "epoch": 0.22358943577430973, "grad_norm": 0.12662111222743988, "learning_rate": 0.00019991655782313853, "loss": 0.4898, "step": 745 }, { "epoch": 0.22388955582232895, "grad_norm": 0.12319193035364151, "learning_rate": 0.0001999144048723555, "loss": 0.5516, "step": 746 }, { "epoch": 0.22418967587034813, "grad_norm": 0.11385344713926315, "learning_rate": 0.0001999122245112011, "loss": 0.4497, "step": 747 }, { "epoch": 0.22448979591836735, "grad_norm": 0.13391318917274475, "learning_rate": 0.0001999100167402736, "loss": 0.4507, "step": 748 }, { "epoch": 0.22478991596638656, "grad_norm": 0.13216985762119293, "learning_rate": 0.00019990778156017864, "loss": 0.4939, "step": 749 }, { "epoch": 0.22509003601440578, "grad_norm": 0.13763558864593506, "learning_rate": 0.0001999055189715294, "loss": 0.513, "step": 750 }, { "epoch": 0.22539015606242496, "grad_norm": 0.1200341135263443, "learning_rate": 0.0001999032289749466, "loss": 0.4783, "step": 751 }, { "epoch": 0.22569027611044418, "grad_norm": 0.13042916357517242, "learning_rate": 0.00019990091157105847, "loss": 0.469, "step": 752 }, { "epoch": 0.2259903961584634, "grad_norm": 0.12593336403369904, "learning_rate": 0.0001998985667605008, "loss": 0.5282, "step": 753 }, { "epoch": 0.2262905162064826, "grad_norm": 0.12227312475442886, "learning_rate": 0.00019989619454391684, "loss": 0.4582, "step": 754 }, { "epoch": 0.2265906362545018, "grad_norm": 0.12027385085821152, "learning_rate": 0.00019989379492195734, "loss": 0.4634, "step": 755 }, { "epoch": 0.226890756302521, "grad_norm": 0.1255529671907425, "learning_rate": 0.0001998913678952807, "loss": 0.5003, "step": 756 }, { "epoch": 0.22719087635054022, "grad_norm": 0.12932389974594116, "learning_rate": 0.00019988891346455262, "loss": 0.4656, "step": 757 }, { "epoch": 0.22749099639855944, "grad_norm": 0.12482926994562149, "learning_rate": 0.00019988643163044656, "loss": 0.474, "step": 758 }, { "epoch": 0.22779111644657862, "grad_norm": 0.125040665268898, "learning_rate": 0.00019988392239364333, "loss": 0.4825, "step": 759 }, { "epoch": 0.22809123649459784, "grad_norm": 0.12448814511299133, "learning_rate": 0.0001998813857548313, "loss": 0.4414, "step": 760 }, { "epoch": 0.22839135654261705, "grad_norm": 0.15806365013122559, "learning_rate": 0.0001998788217147064, "loss": 0.5386, "step": 761 }, { "epoch": 0.22869147659063627, "grad_norm": 0.17397083342075348, "learning_rate": 0.00019987623027397207, "loss": 0.5484, "step": 762 }, { "epoch": 0.22899159663865545, "grad_norm": 0.15767407417297363, "learning_rate": 0.00019987361143333917, "loss": 0.4764, "step": 763 }, { "epoch": 0.22929171668667467, "grad_norm": 0.13692127168178558, "learning_rate": 0.00019987096519352617, "loss": 0.4733, "step": 764 }, { "epoch": 0.22959183673469388, "grad_norm": 0.17084169387817383, "learning_rate": 0.00019986829155525907, "loss": 0.4945, "step": 765 }, { "epoch": 0.2298919567827131, "grad_norm": 0.1780541092157364, "learning_rate": 0.0001998655905192713, "loss": 0.4839, "step": 766 }, { "epoch": 0.23019207683073228, "grad_norm": 0.4227488934993744, "learning_rate": 0.0001998628620863039, "loss": 0.4601, "step": 767 }, { "epoch": 0.2304921968787515, "grad_norm": 0.12997548282146454, "learning_rate": 0.0001998601062571054, "loss": 0.4737, "step": 768 }, { "epoch": 0.2307923169267707, "grad_norm": 0.12152252346277237, "learning_rate": 0.00019985732303243178, "loss": 0.4925, "step": 769 }, { "epoch": 0.23109243697478993, "grad_norm": 0.15152108669281006, "learning_rate": 0.0001998545124130466, "loss": 0.4779, "step": 770 }, { "epoch": 0.23139255702280911, "grad_norm": 0.11856577545404434, "learning_rate": 0.00019985167439972096, "loss": 0.4509, "step": 771 }, { "epoch": 0.23169267707082833, "grad_norm": 0.1667887270450592, "learning_rate": 0.0001998488089932334, "loss": 0.5191, "step": 772 }, { "epoch": 0.23199279711884754, "grad_norm": 0.127762109041214, "learning_rate": 0.00019984591619437, "loss": 0.5351, "step": 773 }, { "epoch": 0.23229291716686676, "grad_norm": 0.13002650439739227, "learning_rate": 0.0001998429960039244, "loss": 0.4875, "step": 774 }, { "epoch": 0.23259303721488594, "grad_norm": 0.1593828648328781, "learning_rate": 0.00019984004842269767, "loss": 0.4812, "step": 775 }, { "epoch": 0.23289315726290516, "grad_norm": 0.14525969326496124, "learning_rate": 0.0001998370734514985, "loss": 0.5652, "step": 776 }, { "epoch": 0.23319327731092437, "grad_norm": 0.12786003947257996, "learning_rate": 0.00019983407109114306, "loss": 0.417, "step": 777 }, { "epoch": 0.2334933973589436, "grad_norm": 0.12177425622940063, "learning_rate": 0.0001998310413424549, "loss": 0.4973, "step": 778 }, { "epoch": 0.23379351740696278, "grad_norm": 0.12841089069843292, "learning_rate": 0.0001998279842062653, "loss": 0.5296, "step": 779 }, { "epoch": 0.234093637454982, "grad_norm": 0.40901222825050354, "learning_rate": 0.00019982489968341292, "loss": 0.4556, "step": 780 }, { "epoch": 0.2343937575030012, "grad_norm": 0.12318484485149384, "learning_rate": 0.000199821787774744, "loss": 0.4793, "step": 781 }, { "epoch": 0.23469387755102042, "grad_norm": 0.13381503522396088, "learning_rate": 0.00019981864848111217, "loss": 0.4677, "step": 782 }, { "epoch": 0.2349939975990396, "grad_norm": 0.13625945150852203, "learning_rate": 0.00019981548180337874, "loss": 0.4417, "step": 783 }, { "epoch": 0.23529411764705882, "grad_norm": 0.14159783720970154, "learning_rate": 0.00019981228774241242, "loss": 0.479, "step": 784 }, { "epoch": 0.23559423769507803, "grad_norm": 0.1346697211265564, "learning_rate": 0.0001998090662990894, "loss": 0.4177, "step": 785 }, { "epoch": 0.23589435774309725, "grad_norm": 0.12795834243297577, "learning_rate": 0.00019980581747429358, "loss": 0.5186, "step": 786 }, { "epoch": 0.23619447779111644, "grad_norm": 0.12394683063030243, "learning_rate": 0.00019980254126891614, "loss": 0.4767, "step": 787 }, { "epoch": 0.23649459783913565, "grad_norm": 0.12592841684818268, "learning_rate": 0.0001997992376838559, "loss": 0.4718, "step": 788 }, { "epoch": 0.23679471788715487, "grad_norm": 0.136773481965065, "learning_rate": 0.00019979590672001917, "loss": 0.4687, "step": 789 }, { "epoch": 0.23709483793517408, "grad_norm": 0.15789003670215607, "learning_rate": 0.00019979254837831976, "loss": 0.5294, "step": 790 }, { "epoch": 0.23739495798319327, "grad_norm": 0.15132829546928406, "learning_rate": 0.00019978916265967896, "loss": 0.533, "step": 791 }, { "epoch": 0.23769507803121248, "grad_norm": 0.13565434515476227, "learning_rate": 0.00019978574956502562, "loss": 0.4872, "step": 792 }, { "epoch": 0.2379951980792317, "grad_norm": 0.12341909855604172, "learning_rate": 0.0001997823090952961, "loss": 0.4661, "step": 793 }, { "epoch": 0.2382953181272509, "grad_norm": 0.13288845121860504, "learning_rate": 0.00019977884125143422, "loss": 0.4695, "step": 794 }, { "epoch": 0.2385954381752701, "grad_norm": 0.11384455114603043, "learning_rate": 0.0001997753460343914, "loss": 0.4839, "step": 795 }, { "epoch": 0.2388955582232893, "grad_norm": 0.13420362770557404, "learning_rate": 0.00019977182344512647, "loss": 0.5234, "step": 796 }, { "epoch": 0.23919567827130853, "grad_norm": 0.11861885339021683, "learning_rate": 0.0001997682734846058, "loss": 0.4593, "step": 797 }, { "epoch": 0.23949579831932774, "grad_norm": 0.11524344980716705, "learning_rate": 0.00019976469615380334, "loss": 0.4461, "step": 798 }, { "epoch": 0.23979591836734693, "grad_norm": 0.1178302988409996, "learning_rate": 0.00019976109145370042, "loss": 0.4366, "step": 799 }, { "epoch": 0.24009603841536614, "grad_norm": 0.13384704291820526, "learning_rate": 0.00019975745938528597, "loss": 0.471, "step": 800 }, { "epoch": 0.24039615846338536, "grad_norm": 0.12177698314189911, "learning_rate": 0.00019975379994955644, "loss": 0.4967, "step": 801 }, { "epoch": 0.24069627851140457, "grad_norm": 0.16466966271400452, "learning_rate": 0.0001997501131475157, "loss": 0.4926, "step": 802 }, { "epoch": 0.24099639855942376, "grad_norm": 0.14508703351020813, "learning_rate": 0.00019974639898017525, "loss": 0.5903, "step": 803 }, { "epoch": 0.24129651860744297, "grad_norm": 0.11782081425189972, "learning_rate": 0.00019974265744855397, "loss": 0.4213, "step": 804 }, { "epoch": 0.2415966386554622, "grad_norm": 0.1257871836423874, "learning_rate": 0.0001997388885536783, "loss": 0.4696, "step": 805 }, { "epoch": 0.2418967587034814, "grad_norm": 0.13063441216945648, "learning_rate": 0.00019973509229658225, "loss": 0.5095, "step": 806 }, { "epoch": 0.2421968787515006, "grad_norm": 0.1315949410200119, "learning_rate": 0.00019973126867830728, "loss": 0.5238, "step": 807 }, { "epoch": 0.2424969987995198, "grad_norm": 0.16877683997154236, "learning_rate": 0.00019972741769990228, "loss": 0.4921, "step": 808 }, { "epoch": 0.24279711884753902, "grad_norm": 0.11835787445306778, "learning_rate": 0.00019972353936242377, "loss": 0.4703, "step": 809 }, { "epoch": 0.24309723889555823, "grad_norm": 0.14192615449428558, "learning_rate": 0.00019971963366693574, "loss": 0.4702, "step": 810 }, { "epoch": 0.24339735894357742, "grad_norm": 0.13355907797813416, "learning_rate": 0.00019971570061450963, "loss": 0.4859, "step": 811 }, { "epoch": 0.24369747899159663, "grad_norm": 0.125127911567688, "learning_rate": 0.00019971174020622448, "loss": 0.4636, "step": 812 }, { "epoch": 0.24399759903961585, "grad_norm": 0.14461135864257812, "learning_rate": 0.00019970775244316675, "loss": 0.4712, "step": 813 }, { "epoch": 0.24429771908763506, "grad_norm": 0.12992002069950104, "learning_rate": 0.00019970373732643046, "loss": 0.5124, "step": 814 }, { "epoch": 0.24459783913565425, "grad_norm": 0.12226548790931702, "learning_rate": 0.0001996996948571171, "loss": 0.4968, "step": 815 }, { "epoch": 0.24489795918367346, "grad_norm": 0.1229477971792221, "learning_rate": 0.00019969562503633563, "loss": 0.4902, "step": 816 }, { "epoch": 0.24519807923169268, "grad_norm": 0.12690682709217072, "learning_rate": 0.00019969152786520264, "loss": 0.485, "step": 817 }, { "epoch": 0.2454981992797119, "grad_norm": 0.12483441084623337, "learning_rate": 0.00019968740334484205, "loss": 0.4781, "step": 818 }, { "epoch": 0.24579831932773108, "grad_norm": 0.7821135520935059, "learning_rate": 0.00019968325147638548, "loss": 0.4798, "step": 819 }, { "epoch": 0.2460984393757503, "grad_norm": 0.13457560539245605, "learning_rate": 0.0001996790722609719, "loss": 0.446, "step": 820 }, { "epoch": 0.2463985594237695, "grad_norm": 0.16727985441684723, "learning_rate": 0.00019967486569974778, "loss": 0.5498, "step": 821 }, { "epoch": 0.24669867947178872, "grad_norm": 0.17841017246246338, "learning_rate": 0.00019967063179386721, "loss": 0.5494, "step": 822 }, { "epoch": 0.2469987995198079, "grad_norm": 0.17153313755989075, "learning_rate": 0.0001996663705444917, "loss": 0.5141, "step": 823 }, { "epoch": 0.24729891956782712, "grad_norm": 0.13014820218086243, "learning_rate": 0.00019966208195279023, "loss": 0.4661, "step": 824 }, { "epoch": 0.24759903961584634, "grad_norm": 0.20828866958618164, "learning_rate": 0.00019965776601993938, "loss": 0.474, "step": 825 }, { "epoch": 0.24789915966386555, "grad_norm": 0.14066027104854584, "learning_rate": 0.00019965342274712316, "loss": 0.5127, "step": 826 }, { "epoch": 0.24819927971188474, "grad_norm": 0.13748477399349213, "learning_rate": 0.0001996490521355331, "loss": 0.5034, "step": 827 }, { "epoch": 0.24849939975990396, "grad_norm": 0.14567652344703674, "learning_rate": 0.00019964465418636823, "loss": 0.4954, "step": 828 }, { "epoch": 0.24879951980792317, "grad_norm": 0.15060196816921234, "learning_rate": 0.00019964022890083503, "loss": 0.4874, "step": 829 }, { "epoch": 0.24909963985594238, "grad_norm": 0.1204688772559166, "learning_rate": 0.00019963577628014757, "loss": 0.4384, "step": 830 }, { "epoch": 0.24939975990396157, "grad_norm": 0.12509852647781372, "learning_rate": 0.0001996312963255274, "loss": 0.4729, "step": 831 }, { "epoch": 0.24969987995198079, "grad_norm": 0.22054116427898407, "learning_rate": 0.00019962678903820348, "loss": 0.4813, "step": 832 }, { "epoch": 0.25, "grad_norm": 0.13491611182689667, "learning_rate": 0.0001996222544194124, "loss": 0.51, "step": 833 }, { "epoch": 0.2503001200480192, "grad_norm": 0.14967964589595795, "learning_rate": 0.00019961769247039813, "loss": 0.4411, "step": 834 }, { "epoch": 0.25060024009603843, "grad_norm": 0.13337065279483795, "learning_rate": 0.0001996131031924122, "loss": 0.4768, "step": 835 }, { "epoch": 0.25090036014405764, "grad_norm": 0.17040053009986877, "learning_rate": 0.00019960848658671365, "loss": 0.4829, "step": 836 }, { "epoch": 0.25120048019207686, "grad_norm": 0.14164522290229797, "learning_rate": 0.00019960384265456894, "loss": 0.4903, "step": 837 }, { "epoch": 0.251500600240096, "grad_norm": 0.1331344097852707, "learning_rate": 0.00019959917139725212, "loss": 0.5193, "step": 838 }, { "epoch": 0.25180072028811523, "grad_norm": 0.1266501247882843, "learning_rate": 0.00019959447281604474, "loss": 0.461, "step": 839 }, { "epoch": 0.25210084033613445, "grad_norm": 0.13748733699321747, "learning_rate": 0.00019958974691223572, "loss": 0.547, "step": 840 }, { "epoch": 0.25240096038415366, "grad_norm": 0.1415606290102005, "learning_rate": 0.00019958499368712156, "loss": 0.4919, "step": 841 }, { "epoch": 0.2527010804321729, "grad_norm": 0.20114395022392273, "learning_rate": 0.00019958021314200633, "loss": 0.4776, "step": 842 }, { "epoch": 0.2530012004801921, "grad_norm": 0.13372939825057983, "learning_rate": 0.00019957540527820142, "loss": 0.5162, "step": 843 }, { "epoch": 0.2533013205282113, "grad_norm": 0.12159455567598343, "learning_rate": 0.0001995705700970259, "loss": 0.4015, "step": 844 }, { "epoch": 0.2536014405762305, "grad_norm": 0.12221530824899673, "learning_rate": 0.00019956570759980621, "loss": 0.4822, "step": 845 }, { "epoch": 0.2539015606242497, "grad_norm": 0.1302434504032135, "learning_rate": 0.0001995608177878763, "loss": 0.5038, "step": 846 }, { "epoch": 0.2542016806722689, "grad_norm": 0.12383106350898743, "learning_rate": 0.00019955590066257766, "loss": 0.4843, "step": 847 }, { "epoch": 0.2545018007202881, "grad_norm": 0.13749061524868011, "learning_rate": 0.00019955095622525924, "loss": 0.5388, "step": 848 }, { "epoch": 0.2548019207683073, "grad_norm": 0.13634788990020752, "learning_rate": 0.00019954598447727748, "loss": 0.5423, "step": 849 }, { "epoch": 0.25510204081632654, "grad_norm": 0.13687866926193237, "learning_rate": 0.00019954098541999634, "loss": 0.4582, "step": 850 }, { "epoch": 0.25540216086434575, "grad_norm": 0.12292606383562088, "learning_rate": 0.00019953595905478725, "loss": 0.4771, "step": 851 }, { "epoch": 0.25570228091236497, "grad_norm": 0.30889269709587097, "learning_rate": 0.0001995309053830291, "loss": 0.519, "step": 852 }, { "epoch": 0.2560024009603842, "grad_norm": 0.13533712923526764, "learning_rate": 0.0001995258244061084, "loss": 0.5394, "step": 853 }, { "epoch": 0.25630252100840334, "grad_norm": 0.2249276041984558, "learning_rate": 0.00019952071612541893, "loss": 0.5533, "step": 854 }, { "epoch": 0.25660264105642255, "grad_norm": 0.1300957351922989, "learning_rate": 0.00019951558054236222, "loss": 0.4871, "step": 855 }, { "epoch": 0.25690276110444177, "grad_norm": 0.12137411534786224, "learning_rate": 0.00019951041765834703, "loss": 0.4811, "step": 856 }, { "epoch": 0.257202881152461, "grad_norm": 0.11973215639591217, "learning_rate": 0.0001995052274747898, "loss": 0.4272, "step": 857 }, { "epoch": 0.2575030012004802, "grad_norm": 0.14288219809532166, "learning_rate": 0.00019950000999311443, "loss": 0.5364, "step": 858 }, { "epoch": 0.2578031212484994, "grad_norm": 0.13095241785049438, "learning_rate": 0.00019949476521475225, "loss": 0.4968, "step": 859 }, { "epoch": 0.2581032412965186, "grad_norm": 0.12350008636713028, "learning_rate": 0.00019948949314114208, "loss": 0.4447, "step": 860 }, { "epoch": 0.25840336134453784, "grad_norm": 0.11974304169416428, "learning_rate": 0.0001994841937737303, "loss": 0.4127, "step": 861 }, { "epoch": 0.258703481392557, "grad_norm": 0.11944734305143356, "learning_rate": 0.0001994788671139707, "loss": 0.4226, "step": 862 }, { "epoch": 0.2590036014405762, "grad_norm": 0.12920348346233368, "learning_rate": 0.00019947351316332453, "loss": 0.4136, "step": 863 }, { "epoch": 0.25930372148859543, "grad_norm": 0.12352780997753143, "learning_rate": 0.0001994681319232607, "loss": 0.4503, "step": 864 }, { "epoch": 0.25960384153661464, "grad_norm": 0.14253178238868713, "learning_rate": 0.00019946272339525542, "loss": 0.4836, "step": 865 }, { "epoch": 0.25990396158463386, "grad_norm": 0.12619134783744812, "learning_rate": 0.00019945728758079248, "loss": 0.4841, "step": 866 }, { "epoch": 0.2602040816326531, "grad_norm": 0.14023606479167938, "learning_rate": 0.0001994518244813631, "loss": 0.4952, "step": 867 }, { "epoch": 0.2605042016806723, "grad_norm": 0.1295354962348938, "learning_rate": 0.00019944633409846606, "loss": 0.5012, "step": 868 }, { "epoch": 0.2608043217286915, "grad_norm": 0.12856262922286987, "learning_rate": 0.0001994408164336076, "loss": 0.4753, "step": 869 }, { "epoch": 0.26110444177671066, "grad_norm": 0.12840332090854645, "learning_rate": 0.00019943527148830138, "loss": 0.5248, "step": 870 }, { "epoch": 0.2614045618247299, "grad_norm": 0.12111607193946838, "learning_rate": 0.0001994296992640686, "loss": 0.4126, "step": 871 }, { "epoch": 0.2617046818727491, "grad_norm": 0.1936316192150116, "learning_rate": 0.00019942409976243797, "loss": 0.4748, "step": 872 }, { "epoch": 0.2620048019207683, "grad_norm": 0.1337498128414154, "learning_rate": 0.00019941847298494557, "loss": 0.4875, "step": 873 }, { "epoch": 0.2623049219687875, "grad_norm": 0.1343999058008194, "learning_rate": 0.00019941281893313514, "loss": 0.5657, "step": 874 }, { "epoch": 0.26260504201680673, "grad_norm": 0.1383265107870102, "learning_rate": 0.0001994071376085578, "loss": 0.3998, "step": 875 }, { "epoch": 0.26290516206482595, "grad_norm": 0.1333858221769333, "learning_rate": 0.00019940142901277207, "loss": 0.4655, "step": 876 }, { "epoch": 0.26320528211284516, "grad_norm": 0.12730911374092102, "learning_rate": 0.00019939569314734407, "loss": 0.4565, "step": 877 }, { "epoch": 0.2635054021608643, "grad_norm": 0.12576372921466827, "learning_rate": 0.0001993899300138474, "loss": 0.4825, "step": 878 }, { "epoch": 0.26380552220888354, "grad_norm": 0.12902750074863434, "learning_rate": 0.0001993841396138631, "loss": 0.4152, "step": 879 }, { "epoch": 0.26410564225690275, "grad_norm": 0.1310381442308426, "learning_rate": 0.00019937832194897968, "loss": 0.498, "step": 880 }, { "epoch": 0.26440576230492197, "grad_norm": 0.13245545327663422, "learning_rate": 0.00019937247702079322, "loss": 0.4811, "step": 881 }, { "epoch": 0.2647058823529412, "grad_norm": 0.1286735236644745, "learning_rate": 0.00019936660483090707, "loss": 0.48, "step": 882 }, { "epoch": 0.2650060024009604, "grad_norm": 0.38595524430274963, "learning_rate": 0.0001993607053809323, "loss": 0.4453, "step": 883 }, { "epoch": 0.2653061224489796, "grad_norm": 0.1547454297542572, "learning_rate": 0.00019935477867248739, "loss": 0.5371, "step": 884 }, { "epoch": 0.2656062424969988, "grad_norm": 0.14364950358867645, "learning_rate": 0.00019934882470719817, "loss": 0.4814, "step": 885 }, { "epoch": 0.265906362545018, "grad_norm": 0.1367032378911972, "learning_rate": 0.0001993428434866981, "loss": 0.4894, "step": 886 }, { "epoch": 0.2662064825930372, "grad_norm": 0.14121219515800476, "learning_rate": 0.00019933683501262804, "loss": 0.4671, "step": 887 }, { "epoch": 0.2665066026410564, "grad_norm": 0.13421456515789032, "learning_rate": 0.0001993307992866363, "loss": 0.5321, "step": 888 }, { "epoch": 0.2668067226890756, "grad_norm": 0.13314856588840485, "learning_rate": 0.00019932473631037882, "loss": 0.541, "step": 889 }, { "epoch": 0.26710684273709484, "grad_norm": 0.13720490038394928, "learning_rate": 0.00019931864608551886, "loss": 0.5456, "step": 890 }, { "epoch": 0.26740696278511406, "grad_norm": 0.21238237619400024, "learning_rate": 0.00019931252861372714, "loss": 0.5126, "step": 891 }, { "epoch": 0.26770708283313327, "grad_norm": 0.1344596892595291, "learning_rate": 0.000199306383896682, "loss": 0.4836, "step": 892 }, { "epoch": 0.2680072028811525, "grad_norm": 0.16728933155536652, "learning_rate": 0.00019930021193606914, "loss": 0.5175, "step": 893 }, { "epoch": 0.26830732292917164, "grad_norm": 0.1329762041568756, "learning_rate": 0.00019929401273358177, "loss": 0.4533, "step": 894 }, { "epoch": 0.26860744297719086, "grad_norm": 0.13503654301166534, "learning_rate": 0.00019928778629092056, "loss": 0.4785, "step": 895 }, { "epoch": 0.2689075630252101, "grad_norm": 0.12425743043422699, "learning_rate": 0.00019928153260979366, "loss": 0.4158, "step": 896 }, { "epoch": 0.2692076830732293, "grad_norm": 0.12247985601425171, "learning_rate": 0.00019927525169191674, "loss": 0.4642, "step": 897 }, { "epoch": 0.2695078031212485, "grad_norm": 0.13523244857788086, "learning_rate": 0.00019926894353901288, "loss": 0.4824, "step": 898 }, { "epoch": 0.2698079231692677, "grad_norm": 0.1293213963508606, "learning_rate": 0.00019926260815281258, "loss": 0.5019, "step": 899 }, { "epoch": 0.27010804321728693, "grad_norm": 0.13876238465309143, "learning_rate": 0.000199256245535054, "loss": 0.498, "step": 900 }, { "epoch": 0.27040816326530615, "grad_norm": 0.13814818859100342, "learning_rate": 0.00019924985568748254, "loss": 0.536, "step": 901 }, { "epoch": 0.2707082833133253, "grad_norm": 0.12295421212911606, "learning_rate": 0.00019924343861185123, "loss": 0.467, "step": 902 }, { "epoch": 0.2710084033613445, "grad_norm": 0.12685605883598328, "learning_rate": 0.00019923699430992055, "loss": 0.4507, "step": 903 }, { "epoch": 0.27130852340936373, "grad_norm": 0.126492440700531, "learning_rate": 0.00019923052278345837, "loss": 0.4906, "step": 904 }, { "epoch": 0.27160864345738295, "grad_norm": 0.13927894830703735, "learning_rate": 0.0001992240240342401, "loss": 0.4868, "step": 905 }, { "epoch": 0.27190876350540216, "grad_norm": 0.1388770341873169, "learning_rate": 0.00019921749806404862, "loss": 0.5374, "step": 906 }, { "epoch": 0.2722088835534214, "grad_norm": 0.2512054145336151, "learning_rate": 0.0001992109448746742, "loss": 0.4949, "step": 907 }, { "epoch": 0.2725090036014406, "grad_norm": 0.22620728611946106, "learning_rate": 0.0001992043644679147, "loss": 0.4882, "step": 908 }, { "epoch": 0.2728091236494598, "grad_norm": 0.13367605209350586, "learning_rate": 0.0001991977568455753, "loss": 0.4923, "step": 909 }, { "epoch": 0.27310924369747897, "grad_norm": 0.14091336727142334, "learning_rate": 0.00019919112200946878, "loss": 0.5282, "step": 910 }, { "epoch": 0.2734093637454982, "grad_norm": 0.14546939730644226, "learning_rate": 0.00019918445996141535, "loss": 0.4407, "step": 911 }, { "epoch": 0.2737094837935174, "grad_norm": 0.1297386884689331, "learning_rate": 0.0001991777707032426, "loss": 0.4682, "step": 912 }, { "epoch": 0.2740096038415366, "grad_norm": 0.1352037638425827, "learning_rate": 0.00019917105423678574, "loss": 0.4778, "step": 913 }, { "epoch": 0.2743097238895558, "grad_norm": 0.1269834339618683, "learning_rate": 0.00019916431056388724, "loss": 0.4584, "step": 914 }, { "epoch": 0.27460984393757504, "grad_norm": 0.14803633093833923, "learning_rate": 0.00019915753968639726, "loss": 0.4682, "step": 915 }, { "epoch": 0.27490996398559425, "grad_norm": 0.1348196566104889, "learning_rate": 0.00019915074160617324, "loss": 0.4941, "step": 916 }, { "epoch": 0.27521008403361347, "grad_norm": 0.15063588321208954, "learning_rate": 0.0001991439163250802, "loss": 0.5334, "step": 917 }, { "epoch": 0.2755102040816326, "grad_norm": 0.16838787496089935, "learning_rate": 0.00019913706384499055, "loss": 0.4767, "step": 918 }, { "epoch": 0.27581032412965184, "grad_norm": 0.15116068720817566, "learning_rate": 0.00019913018416778419, "loss": 0.4798, "step": 919 }, { "epoch": 0.27611044417767105, "grad_norm": 0.1270640790462494, "learning_rate": 0.0001991232772953485, "loss": 0.4905, "step": 920 }, { "epoch": 0.27641056422569027, "grad_norm": 0.12478886544704437, "learning_rate": 0.00019911634322957827, "loss": 0.464, "step": 921 }, { "epoch": 0.2767106842737095, "grad_norm": 0.18658436834812164, "learning_rate": 0.00019910938197237582, "loss": 0.4703, "step": 922 }, { "epoch": 0.2770108043217287, "grad_norm": 0.14810101687908173, "learning_rate": 0.00019910239352565086, "loss": 0.4689, "step": 923 }, { "epoch": 0.2773109243697479, "grad_norm": 0.26814642548561096, "learning_rate": 0.00019909537789132063, "loss": 0.4454, "step": 924 }, { "epoch": 0.2776110444177671, "grad_norm": 0.15047408640384674, "learning_rate": 0.00019908833507130972, "loss": 0.4585, "step": 925 }, { "epoch": 0.27791116446578634, "grad_norm": 0.12900130450725555, "learning_rate": 0.0001990812650675503, "loss": 0.4725, "step": 926 }, { "epoch": 0.2782112845138055, "grad_norm": 0.28590309619903564, "learning_rate": 0.00019907416788198196, "loss": 0.5181, "step": 927 }, { "epoch": 0.2785114045618247, "grad_norm": 0.2776770293712616, "learning_rate": 0.00019906704351655167, "loss": 0.4349, "step": 928 }, { "epoch": 0.27881152460984393, "grad_norm": 0.7942174673080444, "learning_rate": 0.00019905989197321398, "loss": 0.4483, "step": 929 }, { "epoch": 0.27911164465786314, "grad_norm": 0.20447468757629395, "learning_rate": 0.0001990527132539308, "loss": 0.4817, "step": 930 }, { "epoch": 0.27941176470588236, "grad_norm": 0.14460432529449463, "learning_rate": 0.0001990455073606715, "loss": 0.4645, "step": 931 }, { "epoch": 0.2797118847539016, "grad_norm": 0.16081230342388153, "learning_rate": 0.00019903827429541303, "loss": 0.4403, "step": 932 }, { "epoch": 0.2800120048019208, "grad_norm": 0.18377286195755005, "learning_rate": 0.0001990310140601396, "loss": 0.4532, "step": 933 }, { "epoch": 0.28031212484994, "grad_norm": 0.20535436272621155, "learning_rate": 0.000199023726656843, "loss": 0.5203, "step": 934 }, { "epoch": 0.28061224489795916, "grad_norm": 0.16945256292819977, "learning_rate": 0.00019901641208752246, "loss": 0.4573, "step": 935 }, { "epoch": 0.2809123649459784, "grad_norm": 0.15890268981456757, "learning_rate": 0.00019900907035418465, "loss": 0.4381, "step": 936 }, { "epoch": 0.2812124849939976, "grad_norm": 0.15431177616119385, "learning_rate": 0.0001990017014588437, "loss": 0.5099, "step": 937 }, { "epoch": 0.2815126050420168, "grad_norm": 0.16148389875888824, "learning_rate": 0.00019899430540352118, "loss": 0.5215, "step": 938 }, { "epoch": 0.281812725090036, "grad_norm": 0.1503535807132721, "learning_rate": 0.00019898688219024605, "loss": 0.4793, "step": 939 }, { "epoch": 0.28211284513805523, "grad_norm": 0.15371309220790863, "learning_rate": 0.00019897943182105486, "loss": 0.4807, "step": 940 }, { "epoch": 0.28241296518607445, "grad_norm": 0.14984077215194702, "learning_rate": 0.0001989719542979915, "loss": 0.4534, "step": 941 }, { "epoch": 0.28271308523409366, "grad_norm": 0.1417742371559143, "learning_rate": 0.00019896444962310737, "loss": 0.4728, "step": 942 }, { "epoch": 0.2830132052821128, "grad_norm": 0.14261500537395477, "learning_rate": 0.00019895691779846125, "loss": 0.4636, "step": 943 }, { "epoch": 0.28331332533013204, "grad_norm": 0.15536335110664368, "learning_rate": 0.00019894935882611942, "loss": 0.5089, "step": 944 }, { "epoch": 0.28361344537815125, "grad_norm": 0.4062034785747528, "learning_rate": 0.00019894177270815563, "loss": 0.5466, "step": 945 }, { "epoch": 0.28391356542617047, "grad_norm": 0.12950041890144348, "learning_rate": 0.00019893415944665098, "loss": 0.4514, "step": 946 }, { "epoch": 0.2842136854741897, "grad_norm": 0.15430551767349243, "learning_rate": 0.0001989265190436942, "loss": 0.4482, "step": 947 }, { "epoch": 0.2845138055222089, "grad_norm": 0.14366622269153595, "learning_rate": 0.0001989188515013812, "loss": 0.4811, "step": 948 }, { "epoch": 0.2848139255702281, "grad_norm": 0.20464728772640228, "learning_rate": 0.0001989111568218156, "loss": 0.5033, "step": 949 }, { "epoch": 0.2851140456182473, "grad_norm": 0.1582338809967041, "learning_rate": 0.00019890343500710827, "loss": 0.5106, "step": 950 }, { "epoch": 0.2854141656662665, "grad_norm": 0.14050230383872986, "learning_rate": 0.00019889568605937761, "loss": 0.4848, "step": 951 }, { "epoch": 0.2857142857142857, "grad_norm": 0.12696197628974915, "learning_rate": 0.00019888790998074952, "loss": 0.4441, "step": 952 }, { "epoch": 0.2860144057623049, "grad_norm": 0.18210993707180023, "learning_rate": 0.0001988801067733572, "loss": 0.5396, "step": 953 }, { "epoch": 0.2863145258103241, "grad_norm": 0.8190274834632874, "learning_rate": 0.00019887227643934142, "loss": 0.4312, "step": 954 }, { "epoch": 0.28661464585834334, "grad_norm": 0.13277262449264526, "learning_rate": 0.00019886441898085035, "loss": 0.4746, "step": 955 }, { "epoch": 0.28691476590636256, "grad_norm": 0.16039538383483887, "learning_rate": 0.00019885653440003954, "loss": 0.498, "step": 956 }, { "epoch": 0.28721488595438177, "grad_norm": 0.14940515160560608, "learning_rate": 0.00019884862269907205, "loss": 0.5033, "step": 957 }, { "epoch": 0.287515006002401, "grad_norm": 0.12827421724796295, "learning_rate": 0.0001988406838801184, "loss": 0.4444, "step": 958 }, { "epoch": 0.28781512605042014, "grad_norm": 0.16131725907325745, "learning_rate": 0.00019883271794535648, "loss": 0.508, "step": 959 }, { "epoch": 0.28811524609843936, "grad_norm": 0.20877015590667725, "learning_rate": 0.0001988247248969717, "loss": 0.5394, "step": 960 }, { "epoch": 0.2884153661464586, "grad_norm": 0.13646623492240906, "learning_rate": 0.0001988167047371568, "loss": 0.4716, "step": 961 }, { "epoch": 0.2887154861944778, "grad_norm": 0.13780243694782257, "learning_rate": 0.00019880865746811207, "loss": 0.5101, "step": 962 }, { "epoch": 0.289015606242497, "grad_norm": 0.13552388548851013, "learning_rate": 0.00019880058309204514, "loss": 0.4945, "step": 963 }, { "epoch": 0.2893157262905162, "grad_norm": 0.1313784420490265, "learning_rate": 0.00019879248161117113, "loss": 0.449, "step": 964 }, { "epoch": 0.28961584633853543, "grad_norm": 0.17604967951774597, "learning_rate": 0.0001987843530277126, "loss": 0.483, "step": 965 }, { "epoch": 0.28991596638655465, "grad_norm": 0.13665145635604858, "learning_rate": 0.00019877619734389956, "loss": 0.4529, "step": 966 }, { "epoch": 0.2902160864345738, "grad_norm": 0.13970158994197845, "learning_rate": 0.00019876801456196943, "loss": 0.4481, "step": 967 }, { "epoch": 0.290516206482593, "grad_norm": 0.13947072625160217, "learning_rate": 0.000198759804684167, "loss": 0.479, "step": 968 }, { "epoch": 0.29081632653061223, "grad_norm": 0.14099106192588806, "learning_rate": 0.0001987515677127446, "loss": 0.4454, "step": 969 }, { "epoch": 0.29111644657863145, "grad_norm": 0.1705184131860733, "learning_rate": 0.00019874330364996192, "loss": 0.509, "step": 970 }, { "epoch": 0.29141656662665066, "grad_norm": 0.14040535688400269, "learning_rate": 0.00019873501249808616, "loss": 0.4816, "step": 971 }, { "epoch": 0.2917166866746699, "grad_norm": 0.17384812235832214, "learning_rate": 0.00019872669425939185, "loss": 0.5492, "step": 972 }, { "epoch": 0.2920168067226891, "grad_norm": 0.15088632702827454, "learning_rate": 0.00019871834893616107, "loss": 0.5129, "step": 973 }, { "epoch": 0.2923169267707083, "grad_norm": 0.13551321625709534, "learning_rate": 0.0001987099765306832, "loss": 0.4778, "step": 974 }, { "epoch": 0.29261704681872747, "grad_norm": 0.18222583830356598, "learning_rate": 0.0001987015770452551, "loss": 0.4995, "step": 975 }, { "epoch": 0.2929171668667467, "grad_norm": 0.1446659415960312, "learning_rate": 0.00019869315048218116, "loss": 0.4742, "step": 976 }, { "epoch": 0.2932172869147659, "grad_norm": 0.14516235888004303, "learning_rate": 0.00019868469684377306, "loss": 0.4851, "step": 977 }, { "epoch": 0.2935174069627851, "grad_norm": 0.13349777460098267, "learning_rate": 0.00019867621613234993, "loss": 0.4479, "step": 978 }, { "epoch": 0.2938175270108043, "grad_norm": 0.14506781101226807, "learning_rate": 0.00019866770835023836, "loss": 0.4712, "step": 979 }, { "epoch": 0.29411764705882354, "grad_norm": 0.18141105771064758, "learning_rate": 0.00019865917349977242, "loss": 0.4869, "step": 980 }, { "epoch": 0.29441776710684275, "grad_norm": 0.12961047887802124, "learning_rate": 0.00019865061158329353, "loss": 0.4201, "step": 981 }, { "epoch": 0.29471788715486197, "grad_norm": 0.1588519662618637, "learning_rate": 0.0001986420226031505, "loss": 0.4782, "step": 982 }, { "epoch": 0.2950180072028811, "grad_norm": 0.14325343072414398, "learning_rate": 0.00019863340656169965, "loss": 0.5089, "step": 983 }, { "epoch": 0.29531812725090034, "grad_norm": 0.17272049188613892, "learning_rate": 0.00019862476346130473, "loss": 0.4914, "step": 984 }, { "epoch": 0.29561824729891956, "grad_norm": 0.17687954008579254, "learning_rate": 0.00019861609330433684, "loss": 0.4788, "step": 985 }, { "epoch": 0.29591836734693877, "grad_norm": 0.1358242630958557, "learning_rate": 0.00019860739609317457, "loss": 0.4454, "step": 986 }, { "epoch": 0.296218487394958, "grad_norm": 0.14220671355724335, "learning_rate": 0.0001985986718302038, "loss": 0.4397, "step": 987 }, { "epoch": 0.2965186074429772, "grad_norm": 0.13197766244411469, "learning_rate": 0.00019858992051781805, "loss": 0.4539, "step": 988 }, { "epoch": 0.2968187274909964, "grad_norm": 0.1444684863090515, "learning_rate": 0.0001985811421584181, "loss": 0.5167, "step": 989 }, { "epoch": 0.29711884753901563, "grad_norm": 0.13593900203704834, "learning_rate": 0.00019857233675441217, "loss": 0.4868, "step": 990 }, { "epoch": 0.2974189675870348, "grad_norm": 0.12780894339084625, "learning_rate": 0.00019856350430821594, "loss": 0.4521, "step": 991 }, { "epoch": 0.297719087635054, "grad_norm": 0.17024055123329163, "learning_rate": 0.0001985546448222525, "loss": 0.4543, "step": 992 }, { "epoch": 0.2980192076830732, "grad_norm": 0.1440305858850479, "learning_rate": 0.00019854575829895233, "loss": 0.5076, "step": 993 }, { "epoch": 0.29831932773109243, "grad_norm": 0.13460861146450043, "learning_rate": 0.00019853684474075337, "loss": 0.4522, "step": 994 }, { "epoch": 0.29861944777911165, "grad_norm": 0.1406427025794983, "learning_rate": 0.00019852790415010092, "loss": 0.518, "step": 995 }, { "epoch": 0.29891956782713086, "grad_norm": 0.13395576179027557, "learning_rate": 0.00019851893652944776, "loss": 0.4416, "step": 996 }, { "epoch": 0.2992196878751501, "grad_norm": 0.138408824801445, "learning_rate": 0.00019850994188125401, "loss": 0.5226, "step": 997 }, { "epoch": 0.2995198079231693, "grad_norm": 0.16668342053890228, "learning_rate": 0.00019850092020798728, "loss": 0.4604, "step": 998 }, { "epoch": 0.29981992797118845, "grad_norm": 0.14884911477565765, "learning_rate": 0.00019849187151212258, "loss": 0.5255, "step": 999 }, { "epoch": 0.30012004801920766, "grad_norm": 0.13796059787273407, "learning_rate": 0.0001984827957961423, "loss": 0.5336, "step": 1000 }, { "epoch": 0.3004201680672269, "grad_norm": 0.15053890645503998, "learning_rate": 0.00019847369306253624, "loss": 0.5012, "step": 1001 }, { "epoch": 0.3007202881152461, "grad_norm": 0.13115812838077545, "learning_rate": 0.00019846456331380167, "loss": 0.4925, "step": 1002 }, { "epoch": 0.3010204081632653, "grad_norm": 0.1401546746492386, "learning_rate": 0.00019845540655244318, "loss": 0.5165, "step": 1003 }, { "epoch": 0.3013205282112845, "grad_norm": 0.1433335393667221, "learning_rate": 0.00019844622278097286, "loss": 0.4357, "step": 1004 }, { "epoch": 0.30162064825930374, "grad_norm": 0.1426486372947693, "learning_rate": 0.00019843701200191016, "loss": 0.5084, "step": 1005 }, { "epoch": 0.30192076830732295, "grad_norm": 0.13518790900707245, "learning_rate": 0.00019842777421778197, "loss": 0.4797, "step": 1006 }, { "epoch": 0.3022208883553421, "grad_norm": 0.12768086791038513, "learning_rate": 0.0001984185094311225, "loss": 0.4687, "step": 1007 }, { "epoch": 0.3025210084033613, "grad_norm": 0.12086029350757599, "learning_rate": 0.00019840921764447357, "loss": 0.4326, "step": 1008 }, { "epoch": 0.30282112845138054, "grad_norm": 0.13988471031188965, "learning_rate": 0.00019839989886038416, "loss": 0.5151, "step": 1009 }, { "epoch": 0.30312124849939975, "grad_norm": 0.1398009955883026, "learning_rate": 0.00019839055308141078, "loss": 0.4604, "step": 1010 }, { "epoch": 0.30342136854741897, "grad_norm": 0.27455607056617737, "learning_rate": 0.00019838118031011742, "loss": 0.4402, "step": 1011 }, { "epoch": 0.3037214885954382, "grad_norm": 0.14252755045890808, "learning_rate": 0.0001983717805490753, "loss": 0.438, "step": 1012 }, { "epoch": 0.3040216086434574, "grad_norm": 0.1371551901102066, "learning_rate": 0.0001983623538008632, "loss": 0.4083, "step": 1013 }, { "epoch": 0.3043217286914766, "grad_norm": 0.14904725551605225, "learning_rate": 0.0001983529000680672, "loss": 0.5038, "step": 1014 }, { "epoch": 0.30462184873949577, "grad_norm": 0.1304553896188736, "learning_rate": 0.00019834341935328086, "loss": 0.4621, "step": 1015 }, { "epoch": 0.304921968787515, "grad_norm": 0.12457938492298126, "learning_rate": 0.00019833391165910503, "loss": 0.4339, "step": 1016 }, { "epoch": 0.3052220888355342, "grad_norm": 0.1293737143278122, "learning_rate": 0.00019832437698814813, "loss": 0.4279, "step": 1017 }, { "epoch": 0.3055222088835534, "grad_norm": 0.1546356976032257, "learning_rate": 0.00019831481534302584, "loss": 0.4467, "step": 1018 }, { "epoch": 0.30582232893157263, "grad_norm": 0.13538290560245514, "learning_rate": 0.00019830522672636128, "loss": 0.4969, "step": 1019 }, { "epoch": 0.30612244897959184, "grad_norm": 0.1218218058347702, "learning_rate": 0.00019829561114078503, "loss": 0.4241, "step": 1020 }, { "epoch": 0.30642256902761106, "grad_norm": 0.1502799242734909, "learning_rate": 0.00019828596858893495, "loss": 0.4992, "step": 1021 }, { "epoch": 0.3067226890756303, "grad_norm": 0.22983862459659576, "learning_rate": 0.0001982762990734564, "loss": 0.4829, "step": 1022 }, { "epoch": 0.3070228091236495, "grad_norm": 0.1303202360868454, "learning_rate": 0.00019826660259700208, "loss": 0.4841, "step": 1023 }, { "epoch": 0.30732292917166865, "grad_norm": 0.13013200461864471, "learning_rate": 0.00019825687916223217, "loss": 0.4883, "step": 1024 }, { "epoch": 0.30762304921968786, "grad_norm": 0.138884037733078, "learning_rate": 0.0001982471287718141, "loss": 0.4776, "step": 1025 }, { "epoch": 0.3079231692677071, "grad_norm": 0.1241547241806984, "learning_rate": 0.0001982373514284228, "loss": 0.4507, "step": 1026 }, { "epoch": 0.3082232893157263, "grad_norm": 0.12917166948318481, "learning_rate": 0.00019822754713474057, "loss": 0.4002, "step": 1027 }, { "epoch": 0.3085234093637455, "grad_norm": 0.13225270807743073, "learning_rate": 0.00019821771589345713, "loss": 0.4844, "step": 1028 }, { "epoch": 0.3088235294117647, "grad_norm": 0.13052913546562195, "learning_rate": 0.0001982078577072696, "loss": 0.4794, "step": 1029 }, { "epoch": 0.30912364945978393, "grad_norm": 0.12789681553840637, "learning_rate": 0.00019819797257888237, "loss": 0.4012, "step": 1030 }, { "epoch": 0.30942376950780315, "grad_norm": 0.13375438749790192, "learning_rate": 0.00019818806051100736, "loss": 0.4654, "step": 1031 }, { "epoch": 0.3097238895558223, "grad_norm": 0.3724403977394104, "learning_rate": 0.00019817812150636383, "loss": 0.4445, "step": 1032 }, { "epoch": 0.3100240096038415, "grad_norm": 0.13455694913864136, "learning_rate": 0.00019816815556767848, "loss": 0.497, "step": 1033 }, { "epoch": 0.31032412965186074, "grad_norm": 0.13548237085342407, "learning_rate": 0.00019815816269768525, "loss": 0.437, "step": 1034 }, { "epoch": 0.31062424969987995, "grad_norm": 0.13410355150699615, "learning_rate": 0.00019814814289912565, "loss": 0.4331, "step": 1035 }, { "epoch": 0.31092436974789917, "grad_norm": 0.12189717590808868, "learning_rate": 0.00019813809617474844, "loss": 0.4196, "step": 1036 }, { "epoch": 0.3112244897959184, "grad_norm": 0.13920094072818756, "learning_rate": 0.00019812802252730988, "loss": 0.541, "step": 1037 }, { "epoch": 0.3115246098439376, "grad_norm": 0.13746348023414612, "learning_rate": 0.00019811792195957353, "loss": 0.4779, "step": 1038 }, { "epoch": 0.3118247298919568, "grad_norm": 0.15016810595989227, "learning_rate": 0.00019810779447431036, "loss": 0.4794, "step": 1039 }, { "epoch": 0.31212484993997597, "grad_norm": 0.14637719094753265, "learning_rate": 0.00019809764007429874, "loss": 0.553, "step": 1040 }, { "epoch": 0.3124249699879952, "grad_norm": 0.1545702964067459, "learning_rate": 0.0001980874587623244, "loss": 0.4634, "step": 1041 }, { "epoch": 0.3127250900360144, "grad_norm": 0.13468553125858307, "learning_rate": 0.0001980772505411805, "loss": 0.5257, "step": 1042 }, { "epoch": 0.3130252100840336, "grad_norm": 0.1812884509563446, "learning_rate": 0.00019806701541366753, "loss": 0.4869, "step": 1043 }, { "epoch": 0.3133253301320528, "grad_norm": 0.14042791724205017, "learning_rate": 0.00019805675338259335, "loss": 0.5218, "step": 1044 }, { "epoch": 0.31362545018007204, "grad_norm": 0.13315889239311218, "learning_rate": 0.00019804646445077326, "loss": 0.4418, "step": 1045 }, { "epoch": 0.31392557022809126, "grad_norm": 0.15402880311012268, "learning_rate": 0.0001980361486210299, "loss": 0.4833, "step": 1046 }, { "epoch": 0.31422569027611047, "grad_norm": 0.12961941957473755, "learning_rate": 0.00019802580589619334, "loss": 0.4996, "step": 1047 }, { "epoch": 0.31452581032412963, "grad_norm": 0.4248056411743164, "learning_rate": 0.0001980154362791009, "loss": 0.4639, "step": 1048 }, { "epoch": 0.31482593037214884, "grad_norm": 0.1287326067686081, "learning_rate": 0.00019800503977259747, "loss": 0.4019, "step": 1049 }, { "epoch": 0.31512605042016806, "grad_norm": 0.19719886779785156, "learning_rate": 0.00019799461637953517, "loss": 0.4509, "step": 1050 }, { "epoch": 0.3154261704681873, "grad_norm": 0.13943341374397278, "learning_rate": 0.00019798416610277347, "loss": 0.5395, "step": 1051 }, { "epoch": 0.3157262905162065, "grad_norm": 0.1295875459909439, "learning_rate": 0.00019797368894517939, "loss": 0.4619, "step": 1052 }, { "epoch": 0.3160264105642257, "grad_norm": 0.13652342557907104, "learning_rate": 0.00019796318490962716, "loss": 0.4586, "step": 1053 }, { "epoch": 0.3163265306122449, "grad_norm": 0.14109963178634644, "learning_rate": 0.00019795265399899842, "loss": 0.5335, "step": 1054 }, { "epoch": 0.31662665066026413, "grad_norm": 0.13764730095863342, "learning_rate": 0.0001979420962161823, "loss": 0.4758, "step": 1055 }, { "epoch": 0.3169267707082833, "grad_norm": 0.14121860265731812, "learning_rate": 0.0001979315115640751, "loss": 0.454, "step": 1056 }, { "epoch": 0.3172268907563025, "grad_norm": 0.12700317800045013, "learning_rate": 0.00019792090004558066, "loss": 0.4472, "step": 1057 }, { "epoch": 0.3175270108043217, "grad_norm": 0.14510038495063782, "learning_rate": 0.0001979102616636101, "loss": 0.5273, "step": 1058 }, { "epoch": 0.31782713085234093, "grad_norm": 0.1315007358789444, "learning_rate": 0.00019789959642108195, "loss": 0.4757, "step": 1059 }, { "epoch": 0.31812725090036015, "grad_norm": 0.14424051344394684, "learning_rate": 0.00019788890432092211, "loss": 0.4913, "step": 1060 }, { "epoch": 0.31842737094837936, "grad_norm": 0.12802337110042572, "learning_rate": 0.0001978781853660638, "loss": 0.4166, "step": 1061 }, { "epoch": 0.3187274909963986, "grad_norm": 0.13996674120426178, "learning_rate": 0.00019786743955944769, "loss": 0.476, "step": 1062 }, { "epoch": 0.3190276110444178, "grad_norm": 0.18760690093040466, "learning_rate": 0.00019785666690402175, "loss": 0.4954, "step": 1063 }, { "epoch": 0.31932773109243695, "grad_norm": 0.13369061052799225, "learning_rate": 0.00019784586740274128, "loss": 0.4986, "step": 1064 }, { "epoch": 0.31962785114045617, "grad_norm": 0.12482760846614838, "learning_rate": 0.00019783504105856908, "loss": 0.4301, "step": 1065 }, { "epoch": 0.3199279711884754, "grad_norm": 0.13145141303539276, "learning_rate": 0.00019782418787447518, "loss": 0.4374, "step": 1066 }, { "epoch": 0.3202280912364946, "grad_norm": 0.14006423950195312, "learning_rate": 0.00019781330785343705, "loss": 0.5104, "step": 1067 }, { "epoch": 0.3205282112845138, "grad_norm": 0.14591243863105774, "learning_rate": 0.00019780240099843952, "loss": 0.4999, "step": 1068 }, { "epoch": 0.320828331332533, "grad_norm": 0.1316094547510147, "learning_rate": 0.0001977914673124747, "loss": 0.4804, "step": 1069 }, { "epoch": 0.32112845138055224, "grad_norm": 0.13612405955791473, "learning_rate": 0.0001977805067985422, "loss": 0.5368, "step": 1070 }, { "epoch": 0.32142857142857145, "grad_norm": 0.1620555967092514, "learning_rate": 0.0001977695194596488, "loss": 0.4883, "step": 1071 }, { "epoch": 0.3217286914765906, "grad_norm": 0.13244867324829102, "learning_rate": 0.00019775850529880887, "loss": 0.4626, "step": 1072 }, { "epoch": 0.3220288115246098, "grad_norm": 0.1393079161643982, "learning_rate": 0.00019774746431904395, "loss": 0.4577, "step": 1073 }, { "epoch": 0.32232893157262904, "grad_norm": 0.1493871957063675, "learning_rate": 0.00019773639652338306, "loss": 0.4419, "step": 1074 }, { "epoch": 0.32262905162064826, "grad_norm": 0.14268308877944946, "learning_rate": 0.00019772530191486244, "loss": 0.482, "step": 1075 }, { "epoch": 0.32292917166866747, "grad_norm": 0.12896285951137543, "learning_rate": 0.00019771418049652586, "loss": 0.4488, "step": 1076 }, { "epoch": 0.3232292917166867, "grad_norm": 0.1597939133644104, "learning_rate": 0.00019770303227142425, "loss": 0.5328, "step": 1077 }, { "epoch": 0.3235294117647059, "grad_norm": 0.12893250584602356, "learning_rate": 0.00019769185724261611, "loss": 0.4659, "step": 1078 }, { "epoch": 0.3238295318127251, "grad_norm": 0.14060784876346588, "learning_rate": 0.00019768065541316712, "loss": 0.4903, "step": 1079 }, { "epoch": 0.3241296518607443, "grad_norm": 0.2835310995578766, "learning_rate": 0.00019766942678615035, "loss": 0.4688, "step": 1080 }, { "epoch": 0.3244297719087635, "grad_norm": 0.12974952161312103, "learning_rate": 0.0001976581713646463, "loss": 0.4589, "step": 1081 }, { "epoch": 0.3247298919567827, "grad_norm": 0.1387726068496704, "learning_rate": 0.00019764688915174274, "loss": 0.4869, "step": 1082 }, { "epoch": 0.3250300120048019, "grad_norm": 0.1310199350118637, "learning_rate": 0.00019763558015053483, "loss": 0.4745, "step": 1083 }, { "epoch": 0.32533013205282113, "grad_norm": 0.12792225182056427, "learning_rate": 0.00019762424436412502, "loss": 0.444, "step": 1084 }, { "epoch": 0.32563025210084034, "grad_norm": 0.12761080265045166, "learning_rate": 0.0001976128817956232, "loss": 0.4397, "step": 1085 }, { "epoch": 0.32593037214885956, "grad_norm": 0.13653092086315155, "learning_rate": 0.00019760149244814655, "loss": 0.4884, "step": 1086 }, { "epoch": 0.3262304921968788, "grad_norm": 0.14195531606674194, "learning_rate": 0.0001975900763248196, "loss": 0.4732, "step": 1087 }, { "epoch": 0.32653061224489793, "grad_norm": 0.13525669276714325, "learning_rate": 0.00019757863342877423, "loss": 0.4426, "step": 1088 }, { "epoch": 0.32683073229291715, "grad_norm": 0.12862493097782135, "learning_rate": 0.0001975671637631497, "loss": 0.4438, "step": 1089 }, { "epoch": 0.32713085234093636, "grad_norm": 0.1356307566165924, "learning_rate": 0.00019755566733109251, "loss": 0.4471, "step": 1090 }, { "epoch": 0.3274309723889556, "grad_norm": 0.14373083412647247, "learning_rate": 0.00019754414413575665, "loss": 0.488, "step": 1091 }, { "epoch": 0.3277310924369748, "grad_norm": 0.2767897844314575, "learning_rate": 0.00019753259418030334, "loss": 0.4432, "step": 1092 }, { "epoch": 0.328031212484994, "grad_norm": 0.1524336040019989, "learning_rate": 0.00019752101746790118, "loss": 0.491, "step": 1093 }, { "epoch": 0.3283313325330132, "grad_norm": 0.15589389204978943, "learning_rate": 0.0001975094140017261, "loss": 0.5193, "step": 1094 }, { "epoch": 0.32863145258103243, "grad_norm": 0.12623004615306854, "learning_rate": 0.00019749778378496142, "loss": 0.4119, "step": 1095 }, { "epoch": 0.3289315726290516, "grad_norm": 0.1387036293745041, "learning_rate": 0.0001974861268207977, "loss": 0.4313, "step": 1096 }, { "epoch": 0.3292316926770708, "grad_norm": 0.14449627697467804, "learning_rate": 0.0001974744431124329, "loss": 0.5194, "step": 1097 }, { "epoch": 0.32953181272509, "grad_norm": 0.1603599190711975, "learning_rate": 0.00019746273266307238, "loss": 0.4854, "step": 1098 }, { "epoch": 0.32983193277310924, "grad_norm": 0.13858915865421295, "learning_rate": 0.00019745099547592866, "loss": 0.5116, "step": 1099 }, { "epoch": 0.33013205282112845, "grad_norm": 0.13826991617679596, "learning_rate": 0.0001974392315542218, "loss": 0.4898, "step": 1100 }, { "epoch": 0.33043217286914767, "grad_norm": 0.2203083485364914, "learning_rate": 0.00019742744090117906, "loss": 0.475, "step": 1101 }, { "epoch": 0.3307322929171669, "grad_norm": 0.1723625510931015, "learning_rate": 0.00019741562352003508, "loss": 0.4794, "step": 1102 }, { "epoch": 0.3310324129651861, "grad_norm": 0.16649343073368073, "learning_rate": 0.0001974037794140318, "loss": 0.5317, "step": 1103 }, { "epoch": 0.33133253301320525, "grad_norm": 0.1325429528951645, "learning_rate": 0.00019739190858641853, "loss": 0.4915, "step": 1104 }, { "epoch": 0.33163265306122447, "grad_norm": 0.28730836510658264, "learning_rate": 0.00019738001104045185, "loss": 0.4688, "step": 1105 }, { "epoch": 0.3319327731092437, "grad_norm": 0.14861632883548737, "learning_rate": 0.0001973680867793958, "loss": 0.455, "step": 1106 }, { "epoch": 0.3322328931572629, "grad_norm": 0.20707793533802032, "learning_rate": 0.00019735613580652159, "loss": 0.4811, "step": 1107 }, { "epoch": 0.3325330132052821, "grad_norm": 0.16124333441257477, "learning_rate": 0.0001973441581251079, "loss": 0.5491, "step": 1108 }, { "epoch": 0.3328331332533013, "grad_norm": 0.14531394839286804, "learning_rate": 0.00019733215373844064, "loss": 0.4206, "step": 1109 }, { "epoch": 0.33313325330132054, "grad_norm": 0.12873904407024384, "learning_rate": 0.000197320122649813, "loss": 0.4752, "step": 1110 }, { "epoch": 0.33343337334933976, "grad_norm": 0.13601170480251312, "learning_rate": 0.0001973080648625257, "loss": 0.5011, "step": 1111 }, { "epoch": 0.33373349339735897, "grad_norm": 0.15537546575069427, "learning_rate": 0.00019729598037988662, "loss": 0.4942, "step": 1112 }, { "epoch": 0.33403361344537813, "grad_norm": 0.14289778470993042, "learning_rate": 0.0001972838692052109, "loss": 0.5334, "step": 1113 }, { "epoch": 0.33433373349339734, "grad_norm": 0.14202508330345154, "learning_rate": 0.00019727173134182123, "loss": 0.5125, "step": 1114 }, { "epoch": 0.33463385354141656, "grad_norm": 0.15394620597362518, "learning_rate": 0.00019725956679304742, "loss": 0.5323, "step": 1115 }, { "epoch": 0.3349339735894358, "grad_norm": 0.15147684514522552, "learning_rate": 0.00019724737556222672, "loss": 0.4669, "step": 1116 }, { "epoch": 0.335234093637455, "grad_norm": 0.13286633789539337, "learning_rate": 0.0001972351576527036, "loss": 0.4936, "step": 1117 }, { "epoch": 0.3355342136854742, "grad_norm": 0.14134052395820618, "learning_rate": 0.00019722291306782997, "loss": 0.5268, "step": 1118 }, { "epoch": 0.3358343337334934, "grad_norm": 0.8179460167884827, "learning_rate": 0.00019721064181096493, "loss": 0.5263, "step": 1119 }, { "epoch": 0.33613445378151263, "grad_norm": 0.1837950497865677, "learning_rate": 0.000197198343885475, "loss": 0.4431, "step": 1120 }, { "epoch": 0.3364345738295318, "grad_norm": 0.13986214995384216, "learning_rate": 0.00019718601929473393, "loss": 0.4894, "step": 1121 }, { "epoch": 0.336734693877551, "grad_norm": 0.15578554570674896, "learning_rate": 0.00019717366804212287, "loss": 0.4619, "step": 1122 }, { "epoch": 0.3370348139255702, "grad_norm": 0.15266619622707367, "learning_rate": 0.00019716129013103024, "loss": 0.4898, "step": 1123 }, { "epoch": 0.33733493397358943, "grad_norm": 0.17838340997695923, "learning_rate": 0.00019714888556485177, "loss": 0.5273, "step": 1124 }, { "epoch": 0.33763505402160865, "grad_norm": 0.1382509022951126, "learning_rate": 0.0001971364543469905, "loss": 0.4752, "step": 1125 }, { "epoch": 0.33793517406962786, "grad_norm": 0.1678454428911209, "learning_rate": 0.0001971239964808568, "loss": 0.4108, "step": 1126 }, { "epoch": 0.3382352941176471, "grad_norm": 0.14172542095184326, "learning_rate": 0.00019711151196986836, "loss": 0.433, "step": 1127 }, { "epoch": 0.3385354141656663, "grad_norm": 0.1336435079574585, "learning_rate": 0.00019709900081745014, "loss": 0.4269, "step": 1128 }, { "epoch": 0.33883553421368545, "grad_norm": 0.1344050168991089, "learning_rate": 0.00019708646302703446, "loss": 0.4533, "step": 1129 }, { "epoch": 0.33913565426170467, "grad_norm": 0.1460920125246048, "learning_rate": 0.00019707389860206087, "loss": 0.4229, "step": 1130 }, { "epoch": 0.3394357743097239, "grad_norm": 0.13166992366313934, "learning_rate": 0.00019706130754597632, "loss": 0.4411, "step": 1131 }, { "epoch": 0.3397358943577431, "grad_norm": 0.20385001599788666, "learning_rate": 0.000197048689862235, "loss": 0.448, "step": 1132 }, { "epoch": 0.3400360144057623, "grad_norm": 0.15308980643749237, "learning_rate": 0.00019703604555429844, "loss": 0.4357, "step": 1133 }, { "epoch": 0.3403361344537815, "grad_norm": 0.14577004313468933, "learning_rate": 0.00019702337462563545, "loss": 0.444, "step": 1134 }, { "epoch": 0.34063625450180074, "grad_norm": 0.1675492376089096, "learning_rate": 0.00019701067707972216, "loss": 0.4727, "step": 1135 }, { "epoch": 0.34093637454981995, "grad_norm": 0.1404566764831543, "learning_rate": 0.000196997952920042, "loss": 0.4681, "step": 1136 }, { "epoch": 0.3412364945978391, "grad_norm": 0.18300069868564606, "learning_rate": 0.00019698520215008568, "loss": 0.4652, "step": 1137 }, { "epoch": 0.3415366146458583, "grad_norm": 0.14181415736675262, "learning_rate": 0.00019697242477335127, "loss": 0.4962, "step": 1138 }, { "epoch": 0.34183673469387754, "grad_norm": 0.13853693008422852, "learning_rate": 0.00019695962079334405, "loss": 0.4491, "step": 1139 }, { "epoch": 0.34213685474189676, "grad_norm": 0.14697465300559998, "learning_rate": 0.00019694679021357666, "loss": 0.4479, "step": 1140 }, { "epoch": 0.34243697478991597, "grad_norm": 0.18280468881130219, "learning_rate": 0.000196933933037569, "loss": 0.4262, "step": 1141 }, { "epoch": 0.3427370948379352, "grad_norm": 0.1395919919013977, "learning_rate": 0.00019692104926884833, "loss": 0.4608, "step": 1142 }, { "epoch": 0.3430372148859544, "grad_norm": 0.13262638449668884, "learning_rate": 0.00019690813891094916, "loss": 0.4743, "step": 1143 }, { "epoch": 0.3433373349339736, "grad_norm": 0.13647164404392242, "learning_rate": 0.00019689520196741324, "loss": 0.4362, "step": 1144 }, { "epoch": 0.3436374549819928, "grad_norm": 0.13809150457382202, "learning_rate": 0.00019688223844178975, "loss": 0.4411, "step": 1145 }, { "epoch": 0.343937575030012, "grad_norm": 0.20118433237075806, "learning_rate": 0.00019686924833763506, "loss": 0.5154, "step": 1146 }, { "epoch": 0.3442376950780312, "grad_norm": 0.14372298121452332, "learning_rate": 0.00019685623165851285, "loss": 0.5373, "step": 1147 }, { "epoch": 0.3445378151260504, "grad_norm": 0.4217036962509155, "learning_rate": 0.00019684318840799408, "loss": 0.4773, "step": 1148 }, { "epoch": 0.34483793517406963, "grad_norm": 0.14791075885295868, "learning_rate": 0.00019683011858965703, "loss": 0.5459, "step": 1149 }, { "epoch": 0.34513805522208885, "grad_norm": 0.1378076672554016, "learning_rate": 0.00019681702220708725, "loss": 0.4692, "step": 1150 }, { "epoch": 0.34543817527010806, "grad_norm": 0.1290854811668396, "learning_rate": 0.00019680389926387762, "loss": 0.4516, "step": 1151 }, { "epoch": 0.3457382953181273, "grad_norm": 0.14721539616584778, "learning_rate": 0.0001967907497636282, "loss": 0.4897, "step": 1152 }, { "epoch": 0.34603841536614643, "grad_norm": 0.1549588143825531, "learning_rate": 0.00019677757370994647, "loss": 0.5171, "step": 1153 }, { "epoch": 0.34633853541416565, "grad_norm": 0.13510900735855103, "learning_rate": 0.00019676437110644707, "loss": 0.4619, "step": 1154 }, { "epoch": 0.34663865546218486, "grad_norm": 0.16830456256866455, "learning_rate": 0.00019675114195675205, "loss": 0.4727, "step": 1155 }, { "epoch": 0.3469387755102041, "grad_norm": 0.1219051405787468, "learning_rate": 0.00019673788626449064, "loss": 0.4297, "step": 1156 }, { "epoch": 0.3472388955582233, "grad_norm": 0.13119164109230042, "learning_rate": 0.00019672460403329935, "loss": 0.4646, "step": 1157 }, { "epoch": 0.3475390156062425, "grad_norm": 0.13671201467514038, "learning_rate": 0.00019671129526682205, "loss": 0.457, "step": 1158 }, { "epoch": 0.3478391356542617, "grad_norm": 0.1703466922044754, "learning_rate": 0.00019669795996870987, "loss": 0.5045, "step": 1159 }, { "epoch": 0.34813925570228094, "grad_norm": 0.18858648836612701, "learning_rate": 0.00019668459814262116, "loss": 0.4698, "step": 1160 }, { "epoch": 0.3484393757503001, "grad_norm": 0.13836561143398285, "learning_rate": 0.0001966712097922216, "loss": 0.4736, "step": 1161 }, { "epoch": 0.3487394957983193, "grad_norm": 0.15764808654785156, "learning_rate": 0.0001966577949211841, "loss": 0.4802, "step": 1162 }, { "epoch": 0.3490396158463385, "grad_norm": 0.1542765349149704, "learning_rate": 0.0001966443535331889, "loss": 0.4649, "step": 1163 }, { "epoch": 0.34933973589435774, "grad_norm": 0.13806657493114471, "learning_rate": 0.00019663088563192348, "loss": 0.4928, "step": 1164 }, { "epoch": 0.34963985594237695, "grad_norm": 0.2369850128889084, "learning_rate": 0.00019661739122108263, "loss": 0.525, "step": 1165 }, { "epoch": 0.34993997599039617, "grad_norm": 0.15411926805973053, "learning_rate": 0.00019660387030436837, "loss": 0.5242, "step": 1166 }, { "epoch": 0.3502400960384154, "grad_norm": 0.12959231436252594, "learning_rate": 0.00019659032288549003, "loss": 0.4718, "step": 1167 }, { "epoch": 0.3505402160864346, "grad_norm": 0.1349993348121643, "learning_rate": 0.00019657674896816414, "loss": 0.4333, "step": 1168 }, { "epoch": 0.35084033613445376, "grad_norm": 0.15307295322418213, "learning_rate": 0.00019656314855611456, "loss": 0.473, "step": 1169 }, { "epoch": 0.35114045618247297, "grad_norm": 0.13697105646133423, "learning_rate": 0.00019654952165307245, "loss": 0.4935, "step": 1170 }, { "epoch": 0.3514405762304922, "grad_norm": 0.14775574207305908, "learning_rate": 0.00019653586826277617, "loss": 0.5187, "step": 1171 }, { "epoch": 0.3517406962785114, "grad_norm": 0.135857954621315, "learning_rate": 0.00019652218838897136, "loss": 0.4352, "step": 1172 }, { "epoch": 0.3520408163265306, "grad_norm": 0.13962113857269287, "learning_rate": 0.00019650848203541093, "loss": 0.4965, "step": 1173 }, { "epoch": 0.35234093637454983, "grad_norm": 0.14366528391838074, "learning_rate": 0.00019649474920585512, "loss": 0.5181, "step": 1174 }, { "epoch": 0.35264105642256904, "grad_norm": 0.1447860300540924, "learning_rate": 0.0001964809899040713, "loss": 0.5149, "step": 1175 }, { "epoch": 0.35294117647058826, "grad_norm": 0.13299641013145447, "learning_rate": 0.0001964672041338342, "loss": 0.4445, "step": 1176 }, { "epoch": 0.3532412965186074, "grad_norm": 0.1375095099210739, "learning_rate": 0.0001964533918989258, "loss": 0.4906, "step": 1177 }, { "epoch": 0.35354141656662663, "grad_norm": 0.14751577377319336, "learning_rate": 0.00019643955320313534, "loss": 0.4753, "step": 1178 }, { "epoch": 0.35384153661464585, "grad_norm": 0.16620895266532898, "learning_rate": 0.00019642568805025928, "loss": 0.482, "step": 1179 }, { "epoch": 0.35414165666266506, "grad_norm": 0.1296658217906952, "learning_rate": 0.00019641179644410136, "loss": 0.4524, "step": 1180 }, { "epoch": 0.3544417767106843, "grad_norm": 0.24293987452983856, "learning_rate": 0.00019639787838847265, "loss": 0.5122, "step": 1181 }, { "epoch": 0.3547418967587035, "grad_norm": 0.1454843431711197, "learning_rate": 0.00019638393388719133, "loss": 0.5021, "step": 1182 }, { "epoch": 0.3550420168067227, "grad_norm": 0.13670776784420013, "learning_rate": 0.00019636996294408293, "loss": 0.4903, "step": 1183 }, { "epoch": 0.3553421368547419, "grad_norm": 0.14824290573596954, "learning_rate": 0.00019635596556298024, "loss": 0.5182, "step": 1184 }, { "epoch": 0.3556422569027611, "grad_norm": 0.1408873349428177, "learning_rate": 0.00019634194174772326, "loss": 0.4947, "step": 1185 }, { "epoch": 0.3559423769507803, "grad_norm": 0.12988775968551636, "learning_rate": 0.00019632789150215928, "loss": 0.4538, "step": 1186 }, { "epoch": 0.3562424969987995, "grad_norm": 0.148466095328331, "learning_rate": 0.00019631381483014283, "loss": 0.507, "step": 1187 }, { "epoch": 0.3565426170468187, "grad_norm": 0.13022229075431824, "learning_rate": 0.00019629971173553567, "loss": 0.4291, "step": 1188 }, { "epoch": 0.35684273709483794, "grad_norm": 0.15934327244758606, "learning_rate": 0.00019628558222220682, "loss": 0.502, "step": 1189 }, { "epoch": 0.35714285714285715, "grad_norm": 0.1747078150510788, "learning_rate": 0.00019627142629403258, "loss": 0.4984, "step": 1190 }, { "epoch": 0.35744297719087637, "grad_norm": 0.7439995408058167, "learning_rate": 0.0001962572439548964, "loss": 0.4476, "step": 1191 }, { "epoch": 0.3577430972388956, "grad_norm": 0.1325507014989853, "learning_rate": 0.0001962430352086891, "loss": 0.4926, "step": 1192 }, { "epoch": 0.35804321728691474, "grad_norm": 0.14991387724876404, "learning_rate": 0.00019622880005930866, "loss": 0.5043, "step": 1193 }, { "epoch": 0.35834333733493395, "grad_norm": 0.1313386708498001, "learning_rate": 0.00019621453851066036, "loss": 0.4202, "step": 1194 }, { "epoch": 0.35864345738295317, "grad_norm": 0.13692636787891388, "learning_rate": 0.00019620025056665664, "loss": 0.488, "step": 1195 }, { "epoch": 0.3589435774309724, "grad_norm": 0.13470913469791412, "learning_rate": 0.00019618593623121731, "loss": 0.4661, "step": 1196 }, { "epoch": 0.3592436974789916, "grad_norm": 0.16752992570400238, "learning_rate": 0.00019617159550826927, "loss": 0.475, "step": 1197 }, { "epoch": 0.3595438175270108, "grad_norm": 0.15151073038578033, "learning_rate": 0.00019615722840174676, "loss": 0.4865, "step": 1198 }, { "epoch": 0.35984393757503, "grad_norm": 0.14800961315631866, "learning_rate": 0.00019614283491559123, "loss": 0.4617, "step": 1199 }, { "epoch": 0.36014405762304924, "grad_norm": 0.25916588306427, "learning_rate": 0.00019612841505375138, "loss": 0.3924, "step": 1200 }, { "epoch": 0.3604441776710684, "grad_norm": 0.13493013381958008, "learning_rate": 0.00019611396882018313, "loss": 0.4551, "step": 1201 }, { "epoch": 0.3607442977190876, "grad_norm": 0.1315002739429474, "learning_rate": 0.00019609949621884966, "loss": 0.4402, "step": 1202 }, { "epoch": 0.36104441776710683, "grad_norm": 0.14526283740997314, "learning_rate": 0.00019608499725372127, "loss": 0.4859, "step": 1203 }, { "epoch": 0.36134453781512604, "grad_norm": 0.15128463506698608, "learning_rate": 0.0001960704719287757, "loss": 0.4719, "step": 1204 }, { "epoch": 0.36164465786314526, "grad_norm": 0.13079828023910522, "learning_rate": 0.00019605592024799772, "loss": 0.4258, "step": 1205 }, { "epoch": 0.3619447779111645, "grad_norm": 0.14006419479846954, "learning_rate": 0.0001960413422153795, "loss": 0.478, "step": 1206 }, { "epoch": 0.3622448979591837, "grad_norm": 0.2214106172323227, "learning_rate": 0.00019602673783492027, "loss": 0.4962, "step": 1207 }, { "epoch": 0.3625450180072029, "grad_norm": 0.14224013686180115, "learning_rate": 0.00019601210711062662, "loss": 0.5082, "step": 1208 }, { "epoch": 0.3628451380552221, "grad_norm": 0.17071548104286194, "learning_rate": 0.00019599745004651235, "loss": 0.4933, "step": 1209 }, { "epoch": 0.3631452581032413, "grad_norm": 0.14860154688358307, "learning_rate": 0.0001959827666465984, "loss": 0.4828, "step": 1210 }, { "epoch": 0.3634453781512605, "grad_norm": 0.15933696925640106, "learning_rate": 0.000195968056914913, "loss": 0.4377, "step": 1211 }, { "epoch": 0.3637454981992797, "grad_norm": 0.20036178827285767, "learning_rate": 0.00019595332085549163, "loss": 0.5222, "step": 1212 }, { "epoch": 0.3640456182472989, "grad_norm": 0.15371794998645782, "learning_rate": 0.00019593855847237693, "loss": 0.4734, "step": 1213 }, { "epoch": 0.36434573829531813, "grad_norm": 0.14381040632724762, "learning_rate": 0.0001959237697696188, "loss": 0.4294, "step": 1214 }, { "epoch": 0.36464585834333735, "grad_norm": 0.1590506136417389, "learning_rate": 0.00019590895475127436, "loss": 0.4526, "step": 1215 }, { "epoch": 0.36494597839135656, "grad_norm": 0.1287468820810318, "learning_rate": 0.00019589411342140793, "loss": 0.4134, "step": 1216 }, { "epoch": 0.3652460984393758, "grad_norm": 0.1266324520111084, "learning_rate": 0.00019587924578409104, "loss": 0.3931, "step": 1217 }, { "epoch": 0.36554621848739494, "grad_norm": 0.13392604887485504, "learning_rate": 0.0001958643518434025, "loss": 0.4429, "step": 1218 }, { "epoch": 0.36584633853541415, "grad_norm": 0.14387540519237518, "learning_rate": 0.0001958494316034283, "loss": 0.4916, "step": 1219 }, { "epoch": 0.36614645858343337, "grad_norm": 0.14918778836727142, "learning_rate": 0.00019583448506826155, "loss": 0.4862, "step": 1220 }, { "epoch": 0.3664465786314526, "grad_norm": 0.14455977082252502, "learning_rate": 0.00019581951224200274, "loss": 0.4743, "step": 1221 }, { "epoch": 0.3667466986794718, "grad_norm": 0.1635429561138153, "learning_rate": 0.00019580451312875945, "loss": 0.4825, "step": 1222 }, { "epoch": 0.367046818727491, "grad_norm": 0.1514321118593216, "learning_rate": 0.00019578948773264657, "loss": 0.465, "step": 1223 }, { "epoch": 0.3673469387755102, "grad_norm": 0.13055840134620667, "learning_rate": 0.00019577443605778606, "loss": 0.4603, "step": 1224 }, { "epoch": 0.36764705882352944, "grad_norm": 0.14448609948158264, "learning_rate": 0.00019575935810830724, "loss": 0.4908, "step": 1225 }, { "epoch": 0.3679471788715486, "grad_norm": 0.162399023771286, "learning_rate": 0.00019574425388834657, "loss": 0.4625, "step": 1226 }, { "epoch": 0.3682472989195678, "grad_norm": 0.2367607057094574, "learning_rate": 0.00019572912340204773, "loss": 0.499, "step": 1227 }, { "epoch": 0.368547418967587, "grad_norm": 0.14852334558963776, "learning_rate": 0.00019571396665356153, "loss": 0.4791, "step": 1228 }, { "epoch": 0.36884753901560624, "grad_norm": 0.14649629592895508, "learning_rate": 0.00019569878364704613, "loss": 0.4526, "step": 1229 }, { "epoch": 0.36914765906362546, "grad_norm": 0.14013509452342987, "learning_rate": 0.00019568357438666675, "loss": 0.4865, "step": 1230 }, { "epoch": 0.36944777911164467, "grad_norm": 0.13371364772319794, "learning_rate": 0.0001956683388765959, "loss": 0.4612, "step": 1231 }, { "epoch": 0.3697478991596639, "grad_norm": 0.1385035365819931, "learning_rate": 0.00019565307712101325, "loss": 0.456, "step": 1232 }, { "epoch": 0.3700480192076831, "grad_norm": 0.1443071961402893, "learning_rate": 0.00019563778912410574, "loss": 0.5028, "step": 1233 }, { "epoch": 0.37034813925570226, "grad_norm": 0.18161094188690186, "learning_rate": 0.00019562247489006738, "loss": 0.4309, "step": 1234 }, { "epoch": 0.3706482593037215, "grad_norm": 0.21723665297031403, "learning_rate": 0.00019560713442309954, "loss": 0.4589, "step": 1235 }, { "epoch": 0.3709483793517407, "grad_norm": 0.13557744026184082, "learning_rate": 0.00019559176772741065, "loss": 0.4953, "step": 1236 }, { "epoch": 0.3712484993997599, "grad_norm": 0.1500803530216217, "learning_rate": 0.0001955763748072164, "loss": 0.542, "step": 1237 }, { "epoch": 0.3715486194477791, "grad_norm": 0.1344929188489914, "learning_rate": 0.00019556095566673962, "loss": 0.5093, "step": 1238 }, { "epoch": 0.37184873949579833, "grad_norm": 0.15922845900058746, "learning_rate": 0.00019554551031021044, "loss": 0.4529, "step": 1239 }, { "epoch": 0.37214885954381755, "grad_norm": 0.1892634481191635, "learning_rate": 0.00019553003874186607, "loss": 0.4942, "step": 1240 }, { "epoch": 0.37244897959183676, "grad_norm": 0.16977249085903168, "learning_rate": 0.00019551454096595097, "loss": 0.4628, "step": 1241 }, { "epoch": 0.3727490996398559, "grad_norm": 0.1330426037311554, "learning_rate": 0.00019549901698671685, "loss": 0.4938, "step": 1242 }, { "epoch": 0.37304921968787513, "grad_norm": 0.12905094027519226, "learning_rate": 0.0001954834668084224, "loss": 0.3876, "step": 1243 }, { "epoch": 0.37334933973589435, "grad_norm": 0.1504504233598709, "learning_rate": 0.0001954678904353337, "loss": 0.538, "step": 1244 }, { "epoch": 0.37364945978391356, "grad_norm": 0.1254984438419342, "learning_rate": 0.00019545228787172397, "loss": 0.3784, "step": 1245 }, { "epoch": 0.3739495798319328, "grad_norm": 0.1318364143371582, "learning_rate": 0.00019543665912187357, "loss": 0.4715, "step": 1246 }, { "epoch": 0.374249699879952, "grad_norm": 0.15896952152252197, "learning_rate": 0.00019542100419007007, "loss": 0.4601, "step": 1247 }, { "epoch": 0.3745498199279712, "grad_norm": 0.14802490174770355, "learning_rate": 0.00019540532308060825, "loss": 0.4695, "step": 1248 }, { "epoch": 0.3748499399759904, "grad_norm": 0.14402136206626892, "learning_rate": 0.00019538961579778998, "loss": 0.4744, "step": 1249 }, { "epoch": 0.3751500600240096, "grad_norm": 0.18700093030929565, "learning_rate": 0.00019537388234592442, "loss": 0.5243, "step": 1250 }, { "epoch": 0.3754501800720288, "grad_norm": 0.13441616296768188, "learning_rate": 0.00019535812272932786, "loss": 0.4646, "step": 1251 }, { "epoch": 0.375750300120048, "grad_norm": 0.1384831815958023, "learning_rate": 0.00019534233695232375, "loss": 0.4393, "step": 1252 }, { "epoch": 0.3760504201680672, "grad_norm": 0.15414471924304962, "learning_rate": 0.00019532652501924277, "loss": 0.471, "step": 1253 }, { "epoch": 0.37635054021608644, "grad_norm": 0.14955464005470276, "learning_rate": 0.0001953106869344227, "loss": 0.4703, "step": 1254 }, { "epoch": 0.37665066026410565, "grad_norm": 0.13937316834926605, "learning_rate": 0.00019529482270220857, "loss": 0.4557, "step": 1255 }, { "epoch": 0.37695078031212487, "grad_norm": 0.1474233865737915, "learning_rate": 0.00019527893232695252, "loss": 0.4807, "step": 1256 }, { "epoch": 0.3772509003601441, "grad_norm": 0.14132389426231384, "learning_rate": 0.00019526301581301392, "loss": 0.4805, "step": 1257 }, { "epoch": 0.37755102040816324, "grad_norm": 0.15703056752681732, "learning_rate": 0.00019524707316475928, "loss": 0.4785, "step": 1258 }, { "epoch": 0.37785114045618245, "grad_norm": 0.13659298419952393, "learning_rate": 0.00019523110438656228, "loss": 0.4925, "step": 1259 }, { "epoch": 0.37815126050420167, "grad_norm": 0.20109659433364868, "learning_rate": 0.00019521510948280373, "loss": 0.4768, "step": 1260 }, { "epoch": 0.3784513805522209, "grad_norm": 0.19589029252529144, "learning_rate": 0.0001951990884578717, "loss": 0.5204, "step": 1261 }, { "epoch": 0.3787515006002401, "grad_norm": 0.1271110624074936, "learning_rate": 0.00019518304131616138, "loss": 0.4295, "step": 1262 }, { "epoch": 0.3790516206482593, "grad_norm": 0.2087676227092743, "learning_rate": 0.0001951669680620751, "loss": 0.5067, "step": 1263 }, { "epoch": 0.3793517406962785, "grad_norm": 0.18466299772262573, "learning_rate": 0.00019515086870002234, "loss": 0.5341, "step": 1264 }, { "epoch": 0.37965186074429774, "grad_norm": 0.12623387575149536, "learning_rate": 0.00019513474323441986, "loss": 0.4194, "step": 1265 }, { "epoch": 0.3799519807923169, "grad_norm": 0.14969417452812195, "learning_rate": 0.00019511859166969142, "loss": 0.4693, "step": 1266 }, { "epoch": 0.3802521008403361, "grad_norm": 0.14682205021381378, "learning_rate": 0.00019510241401026802, "loss": 0.5259, "step": 1267 }, { "epoch": 0.38055222088835533, "grad_norm": 0.20016992092132568, "learning_rate": 0.00019508621026058785, "loss": 0.4574, "step": 1268 }, { "epoch": 0.38085234093637454, "grad_norm": 0.1485625058412552, "learning_rate": 0.00019506998042509622, "loss": 0.441, "step": 1269 }, { "epoch": 0.38115246098439376, "grad_norm": 0.1220528706908226, "learning_rate": 0.0001950537245082456, "loss": 0.4228, "step": 1270 }, { "epoch": 0.381452581032413, "grad_norm": 0.131525918841362, "learning_rate": 0.00019503744251449557, "loss": 0.4669, "step": 1271 }, { "epoch": 0.3817527010804322, "grad_norm": 0.1620159149169922, "learning_rate": 0.00019502113444831297, "loss": 0.5135, "step": 1272 }, { "epoch": 0.3820528211284514, "grad_norm": 0.1345333456993103, "learning_rate": 0.00019500480031417166, "loss": 0.4917, "step": 1273 }, { "epoch": 0.38235294117647056, "grad_norm": 0.15267445147037506, "learning_rate": 0.0001949884401165528, "loss": 0.4456, "step": 1274 }, { "epoch": 0.3826530612244898, "grad_norm": 0.18665656447410583, "learning_rate": 0.00019497205385994457, "loss": 0.5099, "step": 1275 }, { "epoch": 0.382953181272509, "grad_norm": 0.139787957072258, "learning_rate": 0.0001949556415488424, "loss": 0.4554, "step": 1276 }, { "epoch": 0.3832533013205282, "grad_norm": 0.14109259843826294, "learning_rate": 0.00019493920318774873, "loss": 0.4776, "step": 1277 }, { "epoch": 0.3835534213685474, "grad_norm": 0.1319589614868164, "learning_rate": 0.00019492273878117335, "loss": 0.4415, "step": 1278 }, { "epoch": 0.38385354141656663, "grad_norm": 0.16108369827270508, "learning_rate": 0.000194906248333633, "loss": 0.523, "step": 1279 }, { "epoch": 0.38415366146458585, "grad_norm": 0.14197713136672974, "learning_rate": 0.0001948897318496517, "loss": 0.4603, "step": 1280 }, { "epoch": 0.38445378151260506, "grad_norm": 0.1432972252368927, "learning_rate": 0.00019487318933376048, "loss": 0.4849, "step": 1281 }, { "epoch": 0.3847539015606242, "grad_norm": 0.13725528120994568, "learning_rate": 0.00019485662079049768, "loss": 0.4327, "step": 1282 }, { "epoch": 0.38505402160864344, "grad_norm": 0.14719566702842712, "learning_rate": 0.0001948400262244086, "loss": 0.4813, "step": 1283 }, { "epoch": 0.38535414165666265, "grad_norm": 0.13413792848587036, "learning_rate": 0.00019482340564004586, "loss": 0.4021, "step": 1284 }, { "epoch": 0.38565426170468187, "grad_norm": 0.14877858757972717, "learning_rate": 0.00019480675904196907, "loss": 0.4527, "step": 1285 }, { "epoch": 0.3859543817527011, "grad_norm": 0.1463140845298767, "learning_rate": 0.00019479008643474505, "loss": 0.4249, "step": 1286 }, { "epoch": 0.3862545018007203, "grad_norm": 0.14037886261940002, "learning_rate": 0.00019477338782294772, "loss": 0.4336, "step": 1287 }, { "epoch": 0.3865546218487395, "grad_norm": 0.42916664481163025, "learning_rate": 0.0001947566632111582, "loss": 0.5011, "step": 1288 }, { "epoch": 0.3868547418967587, "grad_norm": 0.26069143414497375, "learning_rate": 0.00019473991260396463, "loss": 0.4952, "step": 1289 }, { "epoch": 0.3871548619447779, "grad_norm": 0.16868044435977936, "learning_rate": 0.0001947231360059624, "loss": 0.4922, "step": 1290 }, { "epoch": 0.3874549819927971, "grad_norm": 0.13486826419830322, "learning_rate": 0.00019470633342175394, "loss": 0.4639, "step": 1291 }, { "epoch": 0.3877551020408163, "grad_norm": 0.131912499666214, "learning_rate": 0.00019468950485594888, "loss": 0.4263, "step": 1292 }, { "epoch": 0.3880552220888355, "grad_norm": 0.13306498527526855, "learning_rate": 0.0001946726503131639, "loss": 0.4493, "step": 1293 }, { "epoch": 0.38835534213685474, "grad_norm": 0.1558721959590912, "learning_rate": 0.00019465576979802292, "loss": 0.4647, "step": 1294 }, { "epoch": 0.38865546218487396, "grad_norm": 0.14165359735488892, "learning_rate": 0.00019463886331515685, "loss": 0.4208, "step": 1295 }, { "epoch": 0.38895558223289317, "grad_norm": 0.14463305473327637, "learning_rate": 0.00019462193086920384, "loss": 0.4406, "step": 1296 }, { "epoch": 0.3892557022809124, "grad_norm": 0.1373589187860489, "learning_rate": 0.00019460497246480903, "loss": 0.4195, "step": 1297 }, { "epoch": 0.3895558223289316, "grad_norm": 0.13842253386974335, "learning_rate": 0.00019458798810662487, "loss": 0.4675, "step": 1298 }, { "epoch": 0.38985594237695076, "grad_norm": 0.17589081823825836, "learning_rate": 0.00019457097779931076, "loss": 0.5043, "step": 1299 }, { "epoch": 0.39015606242497, "grad_norm": 0.1493915170431137, "learning_rate": 0.0001945539415475333, "loss": 0.4974, "step": 1300 }, { "epoch": 0.3904561824729892, "grad_norm": 0.16712096333503723, "learning_rate": 0.00019453687935596617, "loss": 0.5679, "step": 1301 }, { "epoch": 0.3907563025210084, "grad_norm": 0.15799379348754883, "learning_rate": 0.00019451979122929024, "loss": 0.4574, "step": 1302 }, { "epoch": 0.3910564225690276, "grad_norm": 0.13828876614570618, "learning_rate": 0.0001945026771721934, "loss": 0.4455, "step": 1303 }, { "epoch": 0.39135654261704683, "grad_norm": 0.13726243376731873, "learning_rate": 0.00019448553718937067, "loss": 0.4973, "step": 1304 }, { "epoch": 0.39165666266506605, "grad_norm": 0.6387799382209778, "learning_rate": 0.00019446837128552424, "loss": 0.464, "step": 1305 }, { "epoch": 0.39195678271308526, "grad_norm": 0.13185249269008636, "learning_rate": 0.00019445117946536339, "loss": 0.4391, "step": 1306 }, { "epoch": 0.3922569027611044, "grad_norm": 0.20164337754249573, "learning_rate": 0.0001944339617336045, "loss": 0.445, "step": 1307 }, { "epoch": 0.39255702280912363, "grad_norm": 0.15695242583751678, "learning_rate": 0.00019441671809497104, "loss": 0.4472, "step": 1308 }, { "epoch": 0.39285714285714285, "grad_norm": 0.19588248431682587, "learning_rate": 0.00019439944855419362, "loss": 0.4808, "step": 1309 }, { "epoch": 0.39315726290516206, "grad_norm": 0.42993319034576416, "learning_rate": 0.00019438215311600989, "loss": 0.4679, "step": 1310 }, { "epoch": 0.3934573829531813, "grad_norm": 0.15529251098632812, "learning_rate": 0.0001943648317851647, "loss": 0.4399, "step": 1311 }, { "epoch": 0.3937575030012005, "grad_norm": 0.17741309106349945, "learning_rate": 0.00019434748456641, "loss": 0.4295, "step": 1312 }, { "epoch": 0.3940576230492197, "grad_norm": 0.3504805266857147, "learning_rate": 0.0001943301114645047, "loss": 0.4933, "step": 1313 }, { "epoch": 0.3943577430972389, "grad_norm": 0.14857859909534454, "learning_rate": 0.00019431271248421497, "loss": 0.4353, "step": 1314 }, { "epoch": 0.3946578631452581, "grad_norm": 0.19789060950279236, "learning_rate": 0.00019429528763031403, "loss": 0.5175, "step": 1315 }, { "epoch": 0.3949579831932773, "grad_norm": 0.15028391778469086, "learning_rate": 0.00019427783690758216, "loss": 0.5012, "step": 1316 }, { "epoch": 0.3952581032412965, "grad_norm": 0.14596232771873474, "learning_rate": 0.0001942603603208068, "loss": 0.4382, "step": 1317 }, { "epoch": 0.3955582232893157, "grad_norm": 0.14369916915893555, "learning_rate": 0.00019424285787478243, "loss": 0.3947, "step": 1318 }, { "epoch": 0.39585834333733494, "grad_norm": 0.14337095618247986, "learning_rate": 0.00019422532957431062, "loss": 0.4126, "step": 1319 }, { "epoch": 0.39615846338535415, "grad_norm": 0.16140638291835785, "learning_rate": 0.0001942077754242001, "loss": 0.4489, "step": 1320 }, { "epoch": 0.39645858343337337, "grad_norm": 0.16274520754814148, "learning_rate": 0.00019419019542926664, "loss": 0.4399, "step": 1321 }, { "epoch": 0.3967587034813926, "grad_norm": 0.15587328374385834, "learning_rate": 0.0001941725895943331, "loss": 0.4705, "step": 1322 }, { "epoch": 0.39705882352941174, "grad_norm": 0.1563081294298172, "learning_rate": 0.00019415495792422945, "loss": 0.4875, "step": 1323 }, { "epoch": 0.39735894357743096, "grad_norm": 0.15606364607810974, "learning_rate": 0.0001941373004237927, "loss": 0.5291, "step": 1324 }, { "epoch": 0.39765906362545017, "grad_norm": 0.16615036129951477, "learning_rate": 0.00019411961709786703, "loss": 0.53, "step": 1325 }, { "epoch": 0.3979591836734694, "grad_norm": 0.14595146477222443, "learning_rate": 0.00019410190795130365, "loss": 0.4094, "step": 1326 }, { "epoch": 0.3982593037214886, "grad_norm": 0.1788942515850067, "learning_rate": 0.00019408417298896085, "loss": 0.4767, "step": 1327 }, { "epoch": 0.3985594237695078, "grad_norm": 0.36024177074432373, "learning_rate": 0.00019406641221570402, "loss": 0.4652, "step": 1328 }, { "epoch": 0.39885954381752703, "grad_norm": 0.15379220247268677, "learning_rate": 0.00019404862563640558, "loss": 0.4679, "step": 1329 }, { "epoch": 0.39915966386554624, "grad_norm": 0.14786048233509064, "learning_rate": 0.00019403081325594516, "loss": 0.4564, "step": 1330 }, { "epoch": 0.3994597839135654, "grad_norm": 0.16695883870124817, "learning_rate": 0.0001940129750792093, "loss": 0.4398, "step": 1331 }, { "epoch": 0.3997599039615846, "grad_norm": 0.14832280576229095, "learning_rate": 0.00019399511111109176, "loss": 0.4871, "step": 1332 }, { "epoch": 0.40006002400960383, "grad_norm": 0.18180081248283386, "learning_rate": 0.00019397722135649326, "loss": 0.4901, "step": 1333 }, { "epoch": 0.40036014405762305, "grad_norm": 0.1705753356218338, "learning_rate": 0.0001939593058203217, "loss": 0.4704, "step": 1334 }, { "epoch": 0.40066026410564226, "grad_norm": 0.16212989389896393, "learning_rate": 0.00019394136450749197, "loss": 0.5103, "step": 1335 }, { "epoch": 0.4009603841536615, "grad_norm": 0.15347157418727875, "learning_rate": 0.00019392339742292612, "loss": 0.4944, "step": 1336 }, { "epoch": 0.4012605042016807, "grad_norm": 0.13810157775878906, "learning_rate": 0.00019390540457155312, "loss": 0.4947, "step": 1337 }, { "epoch": 0.4015606242496999, "grad_norm": 0.22768346965312958, "learning_rate": 0.00019388738595830916, "loss": 0.5081, "step": 1338 }, { "epoch": 0.40186074429771906, "grad_norm": 0.1549905240535736, "learning_rate": 0.00019386934158813744, "loss": 0.442, "step": 1339 }, { "epoch": 0.4021608643457383, "grad_norm": 0.1581527441740036, "learning_rate": 0.0001938512714659882, "loss": 0.5218, "step": 1340 }, { "epoch": 0.4024609843937575, "grad_norm": 0.14964637160301208, "learning_rate": 0.0001938331755968188, "loss": 0.5056, "step": 1341 }, { "epoch": 0.4027611044417767, "grad_norm": 0.15961486101150513, "learning_rate": 0.0001938150539855936, "loss": 0.5259, "step": 1342 }, { "epoch": 0.4030612244897959, "grad_norm": 0.15585605800151825, "learning_rate": 0.0001937969066372841, "loss": 0.4805, "step": 1343 }, { "epoch": 0.40336134453781514, "grad_norm": 0.14608198404312134, "learning_rate": 0.00019377873355686879, "loss": 0.4866, "step": 1344 }, { "epoch": 0.40366146458583435, "grad_norm": 0.16492308676242828, "learning_rate": 0.00019376053474933324, "loss": 0.4777, "step": 1345 }, { "epoch": 0.40396158463385357, "grad_norm": 0.1487661898136139, "learning_rate": 0.00019374231021967013, "loss": 0.5075, "step": 1346 }, { "epoch": 0.4042617046818727, "grad_norm": 0.1344498097896576, "learning_rate": 0.00019372405997287908, "loss": 0.4452, "step": 1347 }, { "epoch": 0.40456182472989194, "grad_norm": 0.13872256875038147, "learning_rate": 0.00019370578401396688, "loss": 0.467, "step": 1348 }, { "epoch": 0.40486194477791115, "grad_norm": 0.17656131088733673, "learning_rate": 0.00019368748234794731, "loss": 0.5042, "step": 1349 }, { "epoch": 0.40516206482593037, "grad_norm": 0.15703557431697845, "learning_rate": 0.00019366915497984126, "loss": 0.4602, "step": 1350 }, { "epoch": 0.4054621848739496, "grad_norm": 0.14739936590194702, "learning_rate": 0.0001936508019146766, "loss": 0.4794, "step": 1351 }, { "epoch": 0.4057623049219688, "grad_norm": 0.1558038592338562, "learning_rate": 0.00019363242315748828, "loss": 0.5116, "step": 1352 }, { "epoch": 0.406062424969988, "grad_norm": 0.14387072622776031, "learning_rate": 0.0001936140187133183, "loss": 0.5056, "step": 1353 }, { "epoch": 0.4063625450180072, "grad_norm": 0.14097860455513, "learning_rate": 0.00019359558858721574, "loss": 0.5079, "step": 1354 }, { "epoch": 0.4066626650660264, "grad_norm": 0.13414618372917175, "learning_rate": 0.00019357713278423666, "loss": 0.4497, "step": 1355 }, { "epoch": 0.4069627851140456, "grad_norm": 0.1423015296459198, "learning_rate": 0.0001935586513094442, "loss": 0.4772, "step": 1356 }, { "epoch": 0.4072629051620648, "grad_norm": 0.14633090794086456, "learning_rate": 0.0001935401441679086, "loss": 0.4682, "step": 1357 }, { "epoch": 0.40756302521008403, "grad_norm": 0.13317428529262543, "learning_rate": 0.00019352161136470698, "loss": 0.4852, "step": 1358 }, { "epoch": 0.40786314525810324, "grad_norm": 0.15108975768089294, "learning_rate": 0.00019350305290492367, "loss": 0.503, "step": 1359 }, { "epoch": 0.40816326530612246, "grad_norm": 0.13461902737617493, "learning_rate": 0.00019348446879364998, "loss": 0.4551, "step": 1360 }, { "epoch": 0.4084633853541417, "grad_norm": 0.14803214371204376, "learning_rate": 0.0001934658590359842, "loss": 0.5164, "step": 1361 }, { "epoch": 0.4087635054021609, "grad_norm": 0.17757324874401093, "learning_rate": 0.00019344722363703174, "loss": 0.4634, "step": 1362 }, { "epoch": 0.40906362545018005, "grad_norm": 0.15838757157325745, "learning_rate": 0.000193428562601905, "loss": 0.5696, "step": 1363 }, { "epoch": 0.40936374549819926, "grad_norm": 0.13884811103343964, "learning_rate": 0.0001934098759357234, "loss": 0.4073, "step": 1364 }, { "epoch": 0.4096638655462185, "grad_norm": 0.15501153469085693, "learning_rate": 0.00019339116364361342, "loss": 0.4901, "step": 1365 }, { "epoch": 0.4099639855942377, "grad_norm": 0.1534595936536789, "learning_rate": 0.00019337242573070858, "loss": 0.5782, "step": 1366 }, { "epoch": 0.4102641056422569, "grad_norm": 0.39484915137290955, "learning_rate": 0.00019335366220214943, "loss": 0.4158, "step": 1367 }, { "epoch": 0.4105642256902761, "grad_norm": 0.1517392247915268, "learning_rate": 0.0001933348730630835, "loss": 0.4379, "step": 1368 }, { "epoch": 0.41086434573829533, "grad_norm": 0.14215496182441711, "learning_rate": 0.00019331605831866534, "loss": 0.4906, "step": 1369 }, { "epoch": 0.41116446578631455, "grad_norm": 0.1274791955947876, "learning_rate": 0.00019329721797405665, "loss": 0.435, "step": 1370 }, { "epoch": 0.4114645858343337, "grad_norm": 0.14641650021076202, "learning_rate": 0.00019327835203442596, "loss": 0.5223, "step": 1371 }, { "epoch": 0.4117647058823529, "grad_norm": 0.15195374190807343, "learning_rate": 0.000193259460504949, "loss": 0.5157, "step": 1372 }, { "epoch": 0.41206482593037214, "grad_norm": 0.14944523572921753, "learning_rate": 0.00019324054339080838, "loss": 0.5046, "step": 1373 }, { "epoch": 0.41236494597839135, "grad_norm": 0.20714707672595978, "learning_rate": 0.00019322160069719388, "loss": 0.5317, "step": 1374 }, { "epoch": 0.41266506602641057, "grad_norm": 0.14786897599697113, "learning_rate": 0.00019320263242930214, "loss": 0.4566, "step": 1375 }, { "epoch": 0.4129651860744298, "grad_norm": 0.15220782160758972, "learning_rate": 0.00019318363859233693, "loss": 0.5205, "step": 1376 }, { "epoch": 0.413265306122449, "grad_norm": 0.12114045768976212, "learning_rate": 0.00019316461919150895, "loss": 0.3673, "step": 1377 }, { "epoch": 0.4135654261704682, "grad_norm": 0.14805008471012115, "learning_rate": 0.00019314557423203595, "loss": 0.4754, "step": 1378 }, { "epoch": 0.41386554621848737, "grad_norm": 0.15739750862121582, "learning_rate": 0.00019312650371914277, "loss": 0.447, "step": 1379 }, { "epoch": 0.4141656662665066, "grad_norm": 0.15176567435264587, "learning_rate": 0.00019310740765806112, "loss": 0.4614, "step": 1380 }, { "epoch": 0.4144657863145258, "grad_norm": 0.14173932373523712, "learning_rate": 0.0001930882860540298, "loss": 0.4456, "step": 1381 }, { "epoch": 0.414765906362545, "grad_norm": 0.1475897878408432, "learning_rate": 0.00019306913891229462, "loss": 0.4743, "step": 1382 }, { "epoch": 0.4150660264105642, "grad_norm": 0.1380489021539688, "learning_rate": 0.00019304996623810834, "loss": 0.4284, "step": 1383 }, { "epoch": 0.41536614645858344, "grad_norm": 0.1423666626214981, "learning_rate": 0.0001930307680367308, "loss": 0.5312, "step": 1384 }, { "epoch": 0.41566626650660266, "grad_norm": 0.159433051943779, "learning_rate": 0.0001930115443134288, "loss": 0.5392, "step": 1385 }, { "epoch": 0.41596638655462187, "grad_norm": 0.13836570084095, "learning_rate": 0.00019299229507347614, "loss": 0.4596, "step": 1386 }, { "epoch": 0.41626650660264103, "grad_norm": 0.1377885788679123, "learning_rate": 0.00019297302032215364, "loss": 0.4446, "step": 1387 }, { "epoch": 0.41656662665066024, "grad_norm": 0.13649654388427734, "learning_rate": 0.00019295372006474906, "loss": 0.4496, "step": 1388 }, { "epoch": 0.41686674669867946, "grad_norm": 0.1478634625673294, "learning_rate": 0.00019293439430655726, "loss": 0.5175, "step": 1389 }, { "epoch": 0.4171668667466987, "grad_norm": 0.16393691301345825, "learning_rate": 0.00019291504305288005, "loss": 0.5568, "step": 1390 }, { "epoch": 0.4174669867947179, "grad_norm": 0.1378331482410431, "learning_rate": 0.00019289566630902619, "loss": 0.449, "step": 1391 }, { "epoch": 0.4177671068427371, "grad_norm": 0.1320262998342514, "learning_rate": 0.00019287626408031147, "loss": 0.4264, "step": 1392 }, { "epoch": 0.4180672268907563, "grad_norm": 0.13636216521263123, "learning_rate": 0.00019285683637205864, "loss": 0.5008, "step": 1393 }, { "epoch": 0.41836734693877553, "grad_norm": 0.14212261140346527, "learning_rate": 0.00019283738318959752, "loss": 0.451, "step": 1394 }, { "epoch": 0.41866746698679475, "grad_norm": 0.1388690024614334, "learning_rate": 0.00019281790453826484, "loss": 0.4727, "step": 1395 }, { "epoch": 0.4189675870348139, "grad_norm": 0.13603146374225616, "learning_rate": 0.0001927984004234044, "loss": 0.4617, "step": 1396 }, { "epoch": 0.4192677070828331, "grad_norm": 0.1306666135787964, "learning_rate": 0.00019277887085036684, "loss": 0.5171, "step": 1397 }, { "epoch": 0.41956782713085233, "grad_norm": 0.1543315351009369, "learning_rate": 0.0001927593158245099, "loss": 0.4892, "step": 1398 }, { "epoch": 0.41986794717887155, "grad_norm": 0.3799855411052704, "learning_rate": 0.00019273973535119835, "loss": 0.5509, "step": 1399 }, { "epoch": 0.42016806722689076, "grad_norm": 0.1430201381444931, "learning_rate": 0.00019272012943580383, "loss": 0.5026, "step": 1400 }, { "epoch": 0.42046818727491, "grad_norm": 0.13826300203800201, "learning_rate": 0.00019270049808370492, "loss": 0.453, "step": 1401 }, { "epoch": 0.4207683073229292, "grad_norm": 0.14627663791179657, "learning_rate": 0.00019268084130028736, "loss": 0.4964, "step": 1402 }, { "epoch": 0.4210684273709484, "grad_norm": 0.16021540760993958, "learning_rate": 0.00019266115909094368, "loss": 0.4424, "step": 1403 }, { "epoch": 0.42136854741896757, "grad_norm": 0.13889935612678528, "learning_rate": 0.00019264145146107356, "loss": 0.4589, "step": 1404 }, { "epoch": 0.4216686674669868, "grad_norm": 0.14017444849014282, "learning_rate": 0.00019262171841608348, "loss": 0.4843, "step": 1405 }, { "epoch": 0.421968787515006, "grad_norm": 0.14206208288669586, "learning_rate": 0.00019260195996138703, "loss": 0.5191, "step": 1406 }, { "epoch": 0.4222689075630252, "grad_norm": 0.14236919581890106, "learning_rate": 0.00019258217610240467, "loss": 0.4836, "step": 1407 }, { "epoch": 0.4225690276110444, "grad_norm": 0.13786455988883972, "learning_rate": 0.0001925623668445639, "loss": 0.4229, "step": 1408 }, { "epoch": 0.42286914765906364, "grad_norm": 0.1529259830713272, "learning_rate": 0.0001925425321932992, "loss": 0.4672, "step": 1409 }, { "epoch": 0.42316926770708285, "grad_norm": 0.14073142409324646, "learning_rate": 0.00019252267215405188, "loss": 0.4777, "step": 1410 }, { "epoch": 0.42346938775510207, "grad_norm": 0.1450301557779312, "learning_rate": 0.00019250278673227042, "loss": 0.4815, "step": 1411 }, { "epoch": 0.4237695078031212, "grad_norm": 0.13629521429538727, "learning_rate": 0.0001924828759334101, "loss": 0.4118, "step": 1412 }, { "epoch": 0.42406962785114044, "grad_norm": 0.1464170515537262, "learning_rate": 0.0001924629397629332, "loss": 0.5251, "step": 1413 }, { "epoch": 0.42436974789915966, "grad_norm": 0.14412495493888855, "learning_rate": 0.00019244297822630906, "loss": 0.461, "step": 1414 }, { "epoch": 0.42466986794717887, "grad_norm": 0.13709735870361328, "learning_rate": 0.0001924229913290138, "loss": 0.5078, "step": 1415 }, { "epoch": 0.4249699879951981, "grad_norm": 0.14858576655387878, "learning_rate": 0.0001924029790765307, "loss": 0.5115, "step": 1416 }, { "epoch": 0.4252701080432173, "grad_norm": 0.12990660965442657, "learning_rate": 0.0001923829414743498, "loss": 0.417, "step": 1417 }, { "epoch": 0.4255702280912365, "grad_norm": 0.1423971801996231, "learning_rate": 0.00019236287852796821, "loss": 0.4614, "step": 1418 }, { "epoch": 0.4258703481392557, "grad_norm": 0.14663192629814148, "learning_rate": 0.00019234279024289003, "loss": 0.4758, "step": 1419 }, { "epoch": 0.4261704681872749, "grad_norm": 0.13479375839233398, "learning_rate": 0.00019232267662462618, "loss": 0.4386, "step": 1420 }, { "epoch": 0.4264705882352941, "grad_norm": 0.1321115493774414, "learning_rate": 0.0001923025376786946, "loss": 0.428, "step": 1421 }, { "epoch": 0.4267707082833133, "grad_norm": 0.14896945655345917, "learning_rate": 0.00019228237341062024, "loss": 0.5236, "step": 1422 }, { "epoch": 0.42707082833133253, "grad_norm": 0.1456255316734314, "learning_rate": 0.00019226218382593487, "loss": 0.5066, "step": 1423 }, { "epoch": 0.42737094837935174, "grad_norm": 0.13657549023628235, "learning_rate": 0.0001922419689301773, "loss": 0.4759, "step": 1424 }, { "epoch": 0.42767106842737096, "grad_norm": 0.14950986206531525, "learning_rate": 0.00019222172872889327, "loss": 0.4289, "step": 1425 }, { "epoch": 0.4279711884753902, "grad_norm": 0.1470792442560196, "learning_rate": 0.00019220146322763545, "loss": 0.4723, "step": 1426 }, { "epoch": 0.4282713085234094, "grad_norm": 0.15343989431858063, "learning_rate": 0.0001921811724319634, "loss": 0.4901, "step": 1427 }, { "epoch": 0.42857142857142855, "grad_norm": 0.16162234544754028, "learning_rate": 0.0001921608563474437, "loss": 0.4972, "step": 1428 }, { "epoch": 0.42887154861944776, "grad_norm": 0.1257782131433487, "learning_rate": 0.00019214051497964984, "loss": 0.413, "step": 1429 }, { "epoch": 0.429171668667467, "grad_norm": 0.15724851191043854, "learning_rate": 0.00019212014833416222, "loss": 0.5128, "step": 1430 }, { "epoch": 0.4294717887154862, "grad_norm": 0.1418631672859192, "learning_rate": 0.0001920997564165682, "loss": 0.4481, "step": 1431 }, { "epoch": 0.4297719087635054, "grad_norm": 0.13407331705093384, "learning_rate": 0.0001920793392324621, "loss": 0.4786, "step": 1432 }, { "epoch": 0.4300720288115246, "grad_norm": 0.13005974888801575, "learning_rate": 0.00019205889678744514, "loss": 0.4658, "step": 1433 }, { "epoch": 0.43037214885954383, "grad_norm": 0.13358049094676971, "learning_rate": 0.0001920384290871254, "loss": 0.4239, "step": 1434 }, { "epoch": 0.43067226890756305, "grad_norm": 0.14098550379276276, "learning_rate": 0.00019201793613711802, "loss": 0.4933, "step": 1435 }, { "epoch": 0.4309723889555822, "grad_norm": 0.1439901441335678, "learning_rate": 0.000191997417943045, "loss": 0.4727, "step": 1436 }, { "epoch": 0.4312725090036014, "grad_norm": 0.15355221927165985, "learning_rate": 0.00019197687451053526, "loss": 0.4613, "step": 1437 }, { "epoch": 0.43157262905162064, "grad_norm": 0.38890597224235535, "learning_rate": 0.00019195630584522465, "loss": 0.5481, "step": 1438 }, { "epoch": 0.43187274909963985, "grad_norm": 0.3396846055984497, "learning_rate": 0.00019193571195275596, "loss": 0.4547, "step": 1439 }, { "epoch": 0.43217286914765907, "grad_norm": 0.12148111313581467, "learning_rate": 0.00019191509283877892, "loss": 0.4084, "step": 1440 }, { "epoch": 0.4324729891956783, "grad_norm": 0.15273834764957428, "learning_rate": 0.00019189444850895008, "loss": 0.4986, "step": 1441 }, { "epoch": 0.4327731092436975, "grad_norm": 0.17270119488239288, "learning_rate": 0.000191873778968933, "loss": 0.533, "step": 1442 }, { "epoch": 0.4330732292917167, "grad_norm": 0.14104081690311432, "learning_rate": 0.00019185308422439815, "loss": 0.4491, "step": 1443 }, { "epoch": 0.43337334933973587, "grad_norm": 0.1499965637922287, "learning_rate": 0.00019183236428102287, "loss": 0.5033, "step": 1444 }, { "epoch": 0.4336734693877551, "grad_norm": 0.14168301224708557, "learning_rate": 0.00019181161914449146, "loss": 0.4739, "step": 1445 }, { "epoch": 0.4339735894357743, "grad_norm": 0.14708314836025238, "learning_rate": 0.00019179084882049513, "loss": 0.4672, "step": 1446 }, { "epoch": 0.4342737094837935, "grad_norm": 0.1471162885427475, "learning_rate": 0.00019177005331473193, "loss": 0.4717, "step": 1447 }, { "epoch": 0.4345738295318127, "grad_norm": 0.15595698356628418, "learning_rate": 0.0001917492326329069, "loss": 0.4544, "step": 1448 }, { "epoch": 0.43487394957983194, "grad_norm": 0.14595715701580048, "learning_rate": 0.00019172838678073193, "loss": 0.489, "step": 1449 }, { "epoch": 0.43517406962785116, "grad_norm": 0.14658313989639282, "learning_rate": 0.00019170751576392587, "loss": 0.4284, "step": 1450 }, { "epoch": 0.43547418967587037, "grad_norm": 0.1370433270931244, "learning_rate": 0.00019168661958821441, "loss": 0.4149, "step": 1451 }, { "epoch": 0.43577430972388953, "grad_norm": 0.15391285717487335, "learning_rate": 0.00019166569825933025, "loss": 0.482, "step": 1452 }, { "epoch": 0.43607442977190874, "grad_norm": 0.1594506800174713, "learning_rate": 0.00019164475178301283, "loss": 0.5282, "step": 1453 }, { "epoch": 0.43637454981992796, "grad_norm": 0.5735306739807129, "learning_rate": 0.0001916237801650086, "loss": 0.4453, "step": 1454 }, { "epoch": 0.4366746698679472, "grad_norm": 0.16294141113758087, "learning_rate": 0.00019160278341107093, "loss": 0.4922, "step": 1455 }, { "epoch": 0.4369747899159664, "grad_norm": 0.15014778077602386, "learning_rate": 0.00019158176152695998, "loss": 0.4767, "step": 1456 }, { "epoch": 0.4372749099639856, "grad_norm": 0.1422508955001831, "learning_rate": 0.00019156071451844288, "loss": 0.4987, "step": 1457 }, { "epoch": 0.4375750300120048, "grad_norm": 0.2151377648115158, "learning_rate": 0.00019153964239129365, "loss": 0.4407, "step": 1458 }, { "epoch": 0.43787515006002403, "grad_norm": 0.16766145825386047, "learning_rate": 0.00019151854515129317, "loss": 0.5277, "step": 1459 }, { "epoch": 0.4381752701080432, "grad_norm": 0.15800268948078156, "learning_rate": 0.00019149742280422924, "loss": 0.4434, "step": 1460 }, { "epoch": 0.4384753901560624, "grad_norm": 0.14400655031204224, "learning_rate": 0.00019147627535589653, "loss": 0.4703, "step": 1461 }, { "epoch": 0.4387755102040816, "grad_norm": 0.15325772762298584, "learning_rate": 0.0001914551028120966, "loss": 0.4469, "step": 1462 }, { "epoch": 0.43907563025210083, "grad_norm": 0.16910713911056519, "learning_rate": 0.00019143390517863788, "loss": 0.4803, "step": 1463 }, { "epoch": 0.43937575030012005, "grad_norm": 0.1375475972890854, "learning_rate": 0.00019141268246133572, "loss": 0.4597, "step": 1464 }, { "epoch": 0.43967587034813926, "grad_norm": 0.14966444671154022, "learning_rate": 0.00019139143466601231, "loss": 0.5085, "step": 1465 }, { "epoch": 0.4399759903961585, "grad_norm": 0.13662435114383698, "learning_rate": 0.00019137016179849673, "loss": 0.4712, "step": 1466 }, { "epoch": 0.4402761104441777, "grad_norm": 0.15421763062477112, "learning_rate": 0.00019134886386462497, "loss": 0.4656, "step": 1467 }, { "epoch": 0.44057623049219685, "grad_norm": 0.14279350638389587, "learning_rate": 0.0001913275408702399, "loss": 0.4611, "step": 1468 }, { "epoch": 0.44087635054021607, "grad_norm": 0.1423294097185135, "learning_rate": 0.00019130619282119117, "loss": 0.4791, "step": 1469 }, { "epoch": 0.4411764705882353, "grad_norm": 0.15571759641170502, "learning_rate": 0.00019128481972333544, "loss": 0.4612, "step": 1470 }, { "epoch": 0.4414765906362545, "grad_norm": 0.14783750474452972, "learning_rate": 0.00019126342158253614, "loss": 0.4404, "step": 1471 }, { "epoch": 0.4417767106842737, "grad_norm": 0.2501294016838074, "learning_rate": 0.0001912419984046636, "loss": 0.4526, "step": 1472 }, { "epoch": 0.4420768307322929, "grad_norm": 0.13411974906921387, "learning_rate": 0.00019122055019559503, "loss": 0.461, "step": 1473 }, { "epoch": 0.44237695078031214, "grad_norm": 0.1492329239845276, "learning_rate": 0.0001911990769612145, "loss": 0.4456, "step": 1474 }, { "epoch": 0.44267707082833135, "grad_norm": 0.15338747203350067, "learning_rate": 0.00019117757870741294, "loss": 0.55, "step": 1475 }, { "epoch": 0.4429771908763505, "grad_norm": 0.30840009450912476, "learning_rate": 0.0001911560554400882, "loss": 0.469, "step": 1476 }, { "epoch": 0.4432773109243697, "grad_norm": 0.14980410039424896, "learning_rate": 0.00019113450716514487, "loss": 0.4939, "step": 1477 }, { "epoch": 0.44357743097238894, "grad_norm": 0.16766425967216492, "learning_rate": 0.00019111293388849449, "loss": 0.4491, "step": 1478 }, { "epoch": 0.44387755102040816, "grad_norm": 0.13539418578147888, "learning_rate": 0.00019109133561605546, "loss": 0.4329, "step": 1479 }, { "epoch": 0.44417767106842737, "grad_norm": 0.19142962992191315, "learning_rate": 0.00019106971235375298, "loss": 0.5242, "step": 1480 }, { "epoch": 0.4444777911164466, "grad_norm": 0.1418309360742569, "learning_rate": 0.00019104806410751924, "loss": 0.49, "step": 1481 }, { "epoch": 0.4447779111644658, "grad_norm": 0.14699430763721466, "learning_rate": 0.00019102639088329308, "loss": 0.489, "step": 1482 }, { "epoch": 0.445078031212485, "grad_norm": 0.14769864082336426, "learning_rate": 0.00019100469268702036, "loss": 0.4961, "step": 1483 }, { "epoch": 0.44537815126050423, "grad_norm": 0.22757241129875183, "learning_rate": 0.0001909829695246537, "loss": 0.4027, "step": 1484 }, { "epoch": 0.4456782713085234, "grad_norm": 0.1623411476612091, "learning_rate": 0.00019096122140215262, "loss": 0.4917, "step": 1485 }, { "epoch": 0.4459783913565426, "grad_norm": 0.13646866381168365, "learning_rate": 0.00019093944832548348, "loss": 0.411, "step": 1486 }, { "epoch": 0.4462785114045618, "grad_norm": 0.13902203738689423, "learning_rate": 0.00019091765030061943, "loss": 0.4696, "step": 1487 }, { "epoch": 0.44657863145258103, "grad_norm": 0.15161632001399994, "learning_rate": 0.00019089582733354055, "loss": 0.4859, "step": 1488 }, { "epoch": 0.44687875150060025, "grad_norm": 0.169401615858078, "learning_rate": 0.0001908739794302337, "loss": 0.48, "step": 1489 }, { "epoch": 0.44717887154861946, "grad_norm": 0.1369437575340271, "learning_rate": 0.0001908521065966926, "loss": 0.5028, "step": 1490 }, { "epoch": 0.4474789915966387, "grad_norm": 0.14216217398643494, "learning_rate": 0.00019083020883891783, "loss": 0.506, "step": 1491 }, { "epoch": 0.4477791116446579, "grad_norm": 0.13778440654277802, "learning_rate": 0.0001908082861629168, "loss": 0.4893, "step": 1492 }, { "epoch": 0.44807923169267705, "grad_norm": 0.13807134330272675, "learning_rate": 0.0001907863385747037, "loss": 0.4029, "step": 1493 }, { "epoch": 0.44837935174069626, "grad_norm": 0.15739092230796814, "learning_rate": 0.0001907643660802996, "loss": 0.4539, "step": 1494 }, { "epoch": 0.4486794717887155, "grad_norm": 0.17259110510349274, "learning_rate": 0.00019074236868573245, "loss": 0.475, "step": 1495 }, { "epoch": 0.4489795918367347, "grad_norm": 0.1383882761001587, "learning_rate": 0.00019072034639703694, "loss": 0.439, "step": 1496 }, { "epoch": 0.4492797118847539, "grad_norm": 0.13202977180480957, "learning_rate": 0.00019069829922025466, "loss": 0.4226, "step": 1497 }, { "epoch": 0.4495798319327731, "grad_norm": 0.12966297566890717, "learning_rate": 0.00019067622716143398, "loss": 0.4331, "step": 1498 }, { "epoch": 0.44987995198079234, "grad_norm": 0.14526928961277008, "learning_rate": 0.00019065413022663013, "loss": 0.5089, "step": 1499 }, { "epoch": 0.45018007202881155, "grad_norm": 0.1366245597600937, "learning_rate": 0.00019063200842190514, "loss": 0.3964, "step": 1500 }, { "epoch": 0.4504801920768307, "grad_norm": 0.1469152271747589, "learning_rate": 0.00019060986175332788, "loss": 0.5168, "step": 1501 }, { "epoch": 0.4507803121248499, "grad_norm": 0.7257106304168701, "learning_rate": 0.00019058769022697406, "loss": 0.4882, "step": 1502 }, { "epoch": 0.45108043217286914, "grad_norm": 0.14178527891635895, "learning_rate": 0.00019056549384892612, "loss": 0.5005, "step": 1503 }, { "epoch": 0.45138055222088835, "grad_norm": 0.1433630883693695, "learning_rate": 0.00019054327262527345, "loss": 0.4629, "step": 1504 }, { "epoch": 0.45168067226890757, "grad_norm": 0.1455744206905365, "learning_rate": 0.00019052102656211216, "loss": 0.4764, "step": 1505 }, { "epoch": 0.4519807923169268, "grad_norm": 0.16220326721668243, "learning_rate": 0.00019049875566554518, "loss": 0.5098, "step": 1506 }, { "epoch": 0.452280912364946, "grad_norm": 0.208993062376976, "learning_rate": 0.0001904764599416823, "loss": 0.5381, "step": 1507 }, { "epoch": 0.4525810324129652, "grad_norm": 0.14417657256126404, "learning_rate": 0.0001904541393966401, "loss": 0.4596, "step": 1508 }, { "epoch": 0.45288115246098437, "grad_norm": 0.14887084066867828, "learning_rate": 0.00019043179403654191, "loss": 0.49, "step": 1509 }, { "epoch": 0.4531812725090036, "grad_norm": 0.14634130895137787, "learning_rate": 0.00019040942386751804, "loss": 0.4513, "step": 1510 }, { "epoch": 0.4534813925570228, "grad_norm": 0.14948152005672455, "learning_rate": 0.0001903870288957054, "loss": 0.4613, "step": 1511 }, { "epoch": 0.453781512605042, "grad_norm": 0.14923574030399323, "learning_rate": 0.0001903646091272478, "loss": 0.5116, "step": 1512 }, { "epoch": 0.45408163265306123, "grad_norm": 0.13415589928627014, "learning_rate": 0.00019034216456829584, "loss": 0.4443, "step": 1513 }, { "epoch": 0.45438175270108044, "grad_norm": 0.1644652783870697, "learning_rate": 0.00019031969522500695, "loss": 0.5325, "step": 1514 }, { "epoch": 0.45468187274909966, "grad_norm": 0.16179829835891724, "learning_rate": 0.00019029720110354535, "loss": 0.5353, "step": 1515 }, { "epoch": 0.4549819927971189, "grad_norm": 0.14486360549926758, "learning_rate": 0.00019027468221008203, "loss": 0.4886, "step": 1516 }, { "epoch": 0.45528211284513803, "grad_norm": 0.18260490894317627, "learning_rate": 0.0001902521385507948, "loss": 0.4648, "step": 1517 }, { "epoch": 0.45558223289315725, "grad_norm": 0.17621013522148132, "learning_rate": 0.00019022957013186821, "loss": 0.5227, "step": 1518 }, { "epoch": 0.45588235294117646, "grad_norm": 0.17596934735774994, "learning_rate": 0.00019020697695949372, "loss": 0.4976, "step": 1519 }, { "epoch": 0.4561824729891957, "grad_norm": 0.16129888594150543, "learning_rate": 0.00019018435903986943, "loss": 0.4665, "step": 1520 }, { "epoch": 0.4564825930372149, "grad_norm": 0.20698495209217072, "learning_rate": 0.00019016171637920034, "loss": 0.4531, "step": 1521 }, { "epoch": 0.4567827130852341, "grad_norm": 0.5277857780456543, "learning_rate": 0.00019013904898369826, "loss": 0.4494, "step": 1522 }, { "epoch": 0.4570828331332533, "grad_norm": 0.17072156071662903, "learning_rate": 0.00019011635685958162, "loss": 0.4635, "step": 1523 }, { "epoch": 0.45738295318127253, "grad_norm": 0.1649957001209259, "learning_rate": 0.00019009364001307586, "loss": 0.487, "step": 1524 }, { "epoch": 0.4576830732292917, "grad_norm": 0.1527813822031021, "learning_rate": 0.00019007089845041297, "loss": 0.4954, "step": 1525 }, { "epoch": 0.4579831932773109, "grad_norm": 0.14998187124729156, "learning_rate": 0.00019004813217783192, "loss": 0.4788, "step": 1526 }, { "epoch": 0.4582833133253301, "grad_norm": 0.2669890820980072, "learning_rate": 0.00019002534120157835, "loss": 0.4909, "step": 1527 }, { "epoch": 0.45858343337334934, "grad_norm": 0.17830884456634521, "learning_rate": 0.0001900025255279047, "loss": 0.4735, "step": 1528 }, { "epoch": 0.45888355342136855, "grad_norm": 0.1925525665283203, "learning_rate": 0.00018997968516307022, "loss": 0.5461, "step": 1529 }, { "epoch": 0.45918367346938777, "grad_norm": 0.15895669162273407, "learning_rate": 0.00018995682011334087, "loss": 0.4522, "step": 1530 }, { "epoch": 0.459483793517407, "grad_norm": 0.14554451406002045, "learning_rate": 0.00018993393038498941, "loss": 0.4931, "step": 1531 }, { "epoch": 0.4597839135654262, "grad_norm": 0.2117418497800827, "learning_rate": 0.0001899110159842954, "loss": 0.4882, "step": 1532 }, { "epoch": 0.46008403361344535, "grad_norm": 0.16560959815979004, "learning_rate": 0.0001898880769175451, "loss": 0.5318, "step": 1533 }, { "epoch": 0.46038415366146457, "grad_norm": 0.14025753736495972, "learning_rate": 0.0001898651131910316, "loss": 0.4266, "step": 1534 }, { "epoch": 0.4606842737094838, "grad_norm": 0.1386413872241974, "learning_rate": 0.00018984212481105476, "loss": 0.4467, "step": 1535 }, { "epoch": 0.460984393757503, "grad_norm": 0.15134479105472565, "learning_rate": 0.00018981911178392116, "loss": 0.4654, "step": 1536 }, { "epoch": 0.4612845138055222, "grad_norm": 0.14854450523853302, "learning_rate": 0.00018979607411594417, "loss": 0.5032, "step": 1537 }, { "epoch": 0.4615846338535414, "grad_norm": 0.1487223356962204, "learning_rate": 0.0001897730118134439, "loss": 0.4023, "step": 1538 }, { "epoch": 0.46188475390156064, "grad_norm": 0.1386006474494934, "learning_rate": 0.0001897499248827472, "loss": 0.4373, "step": 1539 }, { "epoch": 0.46218487394957986, "grad_norm": 0.14046189188957214, "learning_rate": 0.00018972681333018776, "loss": 0.421, "step": 1540 }, { "epoch": 0.462484993997599, "grad_norm": 0.1442844271659851, "learning_rate": 0.00018970367716210593, "loss": 0.4859, "step": 1541 }, { "epoch": 0.46278511404561823, "grad_norm": 0.2659657895565033, "learning_rate": 0.00018968051638484888, "loss": 0.4487, "step": 1542 }, { "epoch": 0.46308523409363744, "grad_norm": 0.14862355589866638, "learning_rate": 0.00018965733100477044, "loss": 0.489, "step": 1543 }, { "epoch": 0.46338535414165666, "grad_norm": 0.1391323059797287, "learning_rate": 0.00018963412102823138, "loss": 0.4772, "step": 1544 }, { "epoch": 0.4636854741896759, "grad_norm": 0.17410308122634888, "learning_rate": 0.00018961088646159897, "loss": 0.5178, "step": 1545 }, { "epoch": 0.4639855942376951, "grad_norm": 0.16075238585472107, "learning_rate": 0.0001895876273112474, "loss": 0.5296, "step": 1546 }, { "epoch": 0.4642857142857143, "grad_norm": 0.14295291900634766, "learning_rate": 0.00018956434358355755, "loss": 0.51, "step": 1547 }, { "epoch": 0.4645858343337335, "grad_norm": 0.14986905455589294, "learning_rate": 0.000189541035284917, "loss": 0.4853, "step": 1548 }, { "epoch": 0.4648859543817527, "grad_norm": 0.14428989589214325, "learning_rate": 0.00018951770242172018, "loss": 0.4914, "step": 1549 }, { "epoch": 0.4651860744297719, "grad_norm": 0.1400524526834488, "learning_rate": 0.00018949434500036816, "loss": 0.47, "step": 1550 }, { "epoch": 0.4654861944777911, "grad_norm": 0.1384679079055786, "learning_rate": 0.00018947096302726876, "loss": 0.47, "step": 1551 }, { "epoch": 0.4657863145258103, "grad_norm": 0.13552284240722656, "learning_rate": 0.0001894475565088366, "loss": 0.4475, "step": 1552 }, { "epoch": 0.46608643457382953, "grad_norm": 0.1343267560005188, "learning_rate": 0.00018942412545149297, "loss": 0.4762, "step": 1553 }, { "epoch": 0.46638655462184875, "grad_norm": 0.16411390900611877, "learning_rate": 0.00018940066986166592, "loss": 0.4922, "step": 1554 }, { "epoch": 0.46668667466986796, "grad_norm": 0.14296169579029083, "learning_rate": 0.0001893771897457902, "loss": 0.4421, "step": 1555 }, { "epoch": 0.4669867947178872, "grad_norm": 0.1436728686094284, "learning_rate": 0.00018935368511030734, "loss": 0.4694, "step": 1556 }, { "epoch": 0.46728691476590634, "grad_norm": 0.14206938445568085, "learning_rate": 0.00018933015596166554, "loss": 0.4481, "step": 1557 }, { "epoch": 0.46758703481392555, "grad_norm": 0.14524251222610474, "learning_rate": 0.00018930660230631976, "loss": 0.459, "step": 1558 }, { "epoch": 0.46788715486194477, "grad_norm": 0.1441972553730011, "learning_rate": 0.0001892830241507317, "loss": 0.4591, "step": 1559 }, { "epoch": 0.468187274909964, "grad_norm": 0.14738333225250244, "learning_rate": 0.0001892594215013697, "loss": 0.528, "step": 1560 }, { "epoch": 0.4684873949579832, "grad_norm": 0.15935885906219482, "learning_rate": 0.00018923579436470894, "loss": 0.5305, "step": 1561 }, { "epoch": 0.4687875150060024, "grad_norm": 0.13202980160713196, "learning_rate": 0.0001892121427472312, "loss": 0.4403, "step": 1562 }, { "epoch": 0.4690876350540216, "grad_norm": 0.13312020897865295, "learning_rate": 0.00018918846665542507, "loss": 0.3999, "step": 1563 }, { "epoch": 0.46938775510204084, "grad_norm": 0.14565500617027283, "learning_rate": 0.00018916476609578582, "loss": 0.486, "step": 1564 }, { "epoch": 0.46968787515006, "grad_norm": 0.16923834383487701, "learning_rate": 0.00018914104107481538, "loss": 0.4643, "step": 1565 }, { "epoch": 0.4699879951980792, "grad_norm": 0.13346311450004578, "learning_rate": 0.00018911729159902247, "loss": 0.429, "step": 1566 }, { "epoch": 0.4702881152460984, "grad_norm": 0.18116609752178192, "learning_rate": 0.0001890935176749225, "loss": 0.4765, "step": 1567 }, { "epoch": 0.47058823529411764, "grad_norm": 0.1305547058582306, "learning_rate": 0.0001890697193090375, "loss": 0.4561, "step": 1568 }, { "epoch": 0.47088835534213686, "grad_norm": 0.14567294716835022, "learning_rate": 0.00018904589650789642, "loss": 0.569, "step": 1569 }, { "epoch": 0.47118847539015607, "grad_norm": 0.14942717552185059, "learning_rate": 0.00018902204927803462, "loss": 0.4551, "step": 1570 }, { "epoch": 0.4714885954381753, "grad_norm": 0.14293356239795685, "learning_rate": 0.0001889981776259944, "loss": 0.4829, "step": 1571 }, { "epoch": 0.4717887154861945, "grad_norm": 0.17996667325496674, "learning_rate": 0.00018897428155832465, "loss": 0.4385, "step": 1572 }, { "epoch": 0.47208883553421366, "grad_norm": 0.14416664838790894, "learning_rate": 0.000188950361081581, "loss": 0.5011, "step": 1573 }, { "epoch": 0.4723889555822329, "grad_norm": 0.14845241606235504, "learning_rate": 0.00018892641620232574, "loss": 0.4433, "step": 1574 }, { "epoch": 0.4726890756302521, "grad_norm": 0.14318734407424927, "learning_rate": 0.00018890244692712786, "loss": 0.4827, "step": 1575 }, { "epoch": 0.4729891956782713, "grad_norm": 0.1348116546869278, "learning_rate": 0.00018887845326256308, "loss": 0.4357, "step": 1576 }, { "epoch": 0.4732893157262905, "grad_norm": 0.1515018492937088, "learning_rate": 0.00018885443521521377, "loss": 0.5059, "step": 1577 }, { "epoch": 0.47358943577430973, "grad_norm": 0.1488025188446045, "learning_rate": 0.00018883039279166903, "loss": 0.4918, "step": 1578 }, { "epoch": 0.47388955582232895, "grad_norm": 0.1464966982603073, "learning_rate": 0.0001888063259985246, "loss": 0.4734, "step": 1579 }, { "epoch": 0.47418967587034816, "grad_norm": 0.15424638986587524, "learning_rate": 0.00018878223484238295, "loss": 0.4756, "step": 1580 }, { "epoch": 0.4744897959183674, "grad_norm": 0.14891758561134338, "learning_rate": 0.00018875811932985318, "loss": 0.5217, "step": 1581 }, { "epoch": 0.47478991596638653, "grad_norm": 0.1575443595647812, "learning_rate": 0.00018873397946755116, "loss": 0.4884, "step": 1582 }, { "epoch": 0.47509003601440575, "grad_norm": 0.13640117645263672, "learning_rate": 0.00018870981526209932, "loss": 0.4691, "step": 1583 }, { "epoch": 0.47539015606242496, "grad_norm": 0.1447877287864685, "learning_rate": 0.00018868562672012687, "loss": 0.4475, "step": 1584 }, { "epoch": 0.4756902761104442, "grad_norm": 0.16914622485637665, "learning_rate": 0.0001886614138482697, "loss": 0.4971, "step": 1585 }, { "epoch": 0.4759903961584634, "grad_norm": 0.14456208050251007, "learning_rate": 0.0001886371766531702, "loss": 0.475, "step": 1586 }, { "epoch": 0.4762905162064826, "grad_norm": 0.13137194514274597, "learning_rate": 0.00018861291514147768, "loss": 0.4211, "step": 1587 }, { "epoch": 0.4765906362545018, "grad_norm": 0.1403452455997467, "learning_rate": 0.000188588629319848, "loss": 0.4933, "step": 1588 }, { "epoch": 0.47689075630252103, "grad_norm": 0.1483708769083023, "learning_rate": 0.00018856431919494365, "loss": 0.5357, "step": 1589 }, { "epoch": 0.4771908763505402, "grad_norm": 0.16883468627929688, "learning_rate": 0.00018853998477343385, "loss": 0.4721, "step": 1590 }, { "epoch": 0.4774909963985594, "grad_norm": 0.16150923073291779, "learning_rate": 0.0001885156260619945, "loss": 0.4541, "step": 1591 }, { "epoch": 0.4777911164465786, "grad_norm": 0.15299014747142792, "learning_rate": 0.0001884912430673081, "loss": 0.4627, "step": 1592 }, { "epoch": 0.47809123649459784, "grad_norm": 0.1427268385887146, "learning_rate": 0.0001884668357960639, "loss": 0.4164, "step": 1593 }, { "epoch": 0.47839135654261705, "grad_norm": 0.14231397211551666, "learning_rate": 0.00018844240425495767, "loss": 0.4666, "step": 1594 }, { "epoch": 0.47869147659063627, "grad_norm": 0.14517545700073242, "learning_rate": 0.00018841794845069195, "loss": 0.4795, "step": 1595 }, { "epoch": 0.4789915966386555, "grad_norm": 0.14669832587242126, "learning_rate": 0.00018839346838997594, "loss": 0.523, "step": 1596 }, { "epoch": 0.4792917166866747, "grad_norm": 0.14146114885807037, "learning_rate": 0.00018836896407952548, "loss": 0.4394, "step": 1597 }, { "epoch": 0.47959183673469385, "grad_norm": 0.4386819303035736, "learning_rate": 0.000188344435526063, "loss": 0.4967, "step": 1598 }, { "epoch": 0.47989195678271307, "grad_norm": 0.1534242480993271, "learning_rate": 0.00018831988273631763, "loss": 0.5025, "step": 1599 }, { "epoch": 0.4801920768307323, "grad_norm": 0.14127042889595032, "learning_rate": 0.00018829530571702515, "loss": 0.4793, "step": 1600 }, { "epoch": 0.4804921968787515, "grad_norm": 0.1541667878627777, "learning_rate": 0.00018827070447492803, "loss": 0.4118, "step": 1601 }, { "epoch": 0.4807923169267707, "grad_norm": 0.29758089780807495, "learning_rate": 0.00018824607901677526, "loss": 0.5182, "step": 1602 }, { "epoch": 0.4810924369747899, "grad_norm": 0.13616251945495605, "learning_rate": 0.00018822142934932261, "loss": 0.4765, "step": 1603 }, { "epoch": 0.48139255702280914, "grad_norm": 0.14783746004104614, "learning_rate": 0.00018819675547933243, "loss": 0.4987, "step": 1604 }, { "epoch": 0.48169267707082836, "grad_norm": 0.1417647898197174, "learning_rate": 0.0001881720574135737, "loss": 0.4763, "step": 1605 }, { "epoch": 0.4819927971188475, "grad_norm": 0.1706775277853012, "learning_rate": 0.000188147335158822, "loss": 0.4747, "step": 1606 }, { "epoch": 0.48229291716686673, "grad_norm": 0.13298912346363068, "learning_rate": 0.0001881225887218597, "loss": 0.4387, "step": 1607 }, { "epoch": 0.48259303721488594, "grad_norm": 0.13805459439754486, "learning_rate": 0.00018809781810947564, "loss": 0.5038, "step": 1608 }, { "epoch": 0.48289315726290516, "grad_norm": 0.1466389149427414, "learning_rate": 0.00018807302332846538, "loss": 0.3996, "step": 1609 }, { "epoch": 0.4831932773109244, "grad_norm": 0.14047665894031525, "learning_rate": 0.000188048204385631, "loss": 0.4513, "step": 1610 }, { "epoch": 0.4834933973589436, "grad_norm": 0.1531951129436493, "learning_rate": 0.00018802336128778143, "loss": 0.5707, "step": 1611 }, { "epoch": 0.4837935174069628, "grad_norm": 0.19110116362571716, "learning_rate": 0.000187998494041732, "loss": 0.5178, "step": 1612 }, { "epoch": 0.484093637454982, "grad_norm": 0.13959315419197083, "learning_rate": 0.00018797360265430474, "loss": 0.4237, "step": 1613 }, { "epoch": 0.4843937575030012, "grad_norm": 0.15554258227348328, "learning_rate": 0.0001879486871323284, "loss": 0.5351, "step": 1614 }, { "epoch": 0.4846938775510204, "grad_norm": 0.1472192108631134, "learning_rate": 0.00018792374748263817, "loss": 0.4626, "step": 1615 }, { "epoch": 0.4849939975990396, "grad_norm": 0.14186328649520874, "learning_rate": 0.00018789878371207604, "loss": 0.4982, "step": 1616 }, { "epoch": 0.4852941176470588, "grad_norm": 0.14574813842773438, "learning_rate": 0.00018787379582749046, "loss": 0.4851, "step": 1617 }, { "epoch": 0.48559423769507803, "grad_norm": 0.15164965391159058, "learning_rate": 0.00018784878383573664, "loss": 0.4579, "step": 1618 }, { "epoch": 0.48589435774309725, "grad_norm": 0.13515906035900116, "learning_rate": 0.00018782374774367627, "loss": 0.4339, "step": 1619 }, { "epoch": 0.48619447779111646, "grad_norm": 0.15253469347953796, "learning_rate": 0.00018779868755817777, "loss": 0.5466, "step": 1620 }, { "epoch": 0.4864945978391357, "grad_norm": 0.1400163322687149, "learning_rate": 0.00018777360328611607, "loss": 0.4704, "step": 1621 }, { "epoch": 0.48679471788715484, "grad_norm": 0.13967138528823853, "learning_rate": 0.0001877484949343728, "loss": 0.4404, "step": 1622 }, { "epoch": 0.48709483793517405, "grad_norm": 0.14153768122196198, "learning_rate": 0.00018772336250983608, "loss": 0.489, "step": 1623 }, { "epoch": 0.48739495798319327, "grad_norm": 0.14577268064022064, "learning_rate": 0.0001876982060194008, "loss": 0.4514, "step": 1624 }, { "epoch": 0.4876950780312125, "grad_norm": 0.44660818576812744, "learning_rate": 0.00018767302546996825, "loss": 0.4758, "step": 1625 }, { "epoch": 0.4879951980792317, "grad_norm": 0.14724412560462952, "learning_rate": 0.00018764782086844647, "loss": 0.4163, "step": 1626 }, { "epoch": 0.4882953181272509, "grad_norm": 0.1457844376564026, "learning_rate": 0.0001876225922217501, "loss": 0.4749, "step": 1627 }, { "epoch": 0.4885954381752701, "grad_norm": 0.14120006561279297, "learning_rate": 0.00018759733953680025, "loss": 0.5085, "step": 1628 }, { "epoch": 0.48889555822328934, "grad_norm": 0.13881921768188477, "learning_rate": 0.00018757206282052474, "loss": 0.4957, "step": 1629 }, { "epoch": 0.4891956782713085, "grad_norm": 0.19758376479148865, "learning_rate": 0.00018754676207985798, "loss": 0.4463, "step": 1630 }, { "epoch": 0.4894957983193277, "grad_norm": 0.1473570466041565, "learning_rate": 0.00018752143732174087, "loss": 0.5127, "step": 1631 }, { "epoch": 0.4897959183673469, "grad_norm": 0.13440637290477753, "learning_rate": 0.000187496088553121, "loss": 0.4459, "step": 1632 }, { "epoch": 0.49009603841536614, "grad_norm": 0.14222657680511475, "learning_rate": 0.00018747071578095254, "loss": 0.4833, "step": 1633 }, { "epoch": 0.49039615846338536, "grad_norm": 0.1522848904132843, "learning_rate": 0.00018744531901219617, "loss": 0.5005, "step": 1634 }, { "epoch": 0.49069627851140457, "grad_norm": 0.1476879119873047, "learning_rate": 0.00018741989825381928, "loss": 0.4714, "step": 1635 }, { "epoch": 0.4909963985594238, "grad_norm": 0.1372116506099701, "learning_rate": 0.00018739445351279566, "loss": 0.4604, "step": 1636 }, { "epoch": 0.491296518607443, "grad_norm": 0.1398506462574005, "learning_rate": 0.00018736898479610584, "loss": 0.4741, "step": 1637 }, { "epoch": 0.49159663865546216, "grad_norm": 0.19208797812461853, "learning_rate": 0.0001873434921107369, "loss": 0.4623, "step": 1638 }, { "epoch": 0.4918967587034814, "grad_norm": 0.16721111536026, "learning_rate": 0.00018731797546368243, "loss": 0.4741, "step": 1639 }, { "epoch": 0.4921968787515006, "grad_norm": 0.1624535322189331, "learning_rate": 0.00018729243486194258, "loss": 0.5396, "step": 1640 }, { "epoch": 0.4924969987995198, "grad_norm": 0.16763810813426971, "learning_rate": 0.0001872668703125242, "loss": 0.4297, "step": 1641 }, { "epoch": 0.492797118847539, "grad_norm": 0.13680794835090637, "learning_rate": 0.00018724128182244062, "loss": 0.4482, "step": 1642 }, { "epoch": 0.49309723889555823, "grad_norm": 0.14087484776973724, "learning_rate": 0.00018721566939871172, "loss": 0.4711, "step": 1643 }, { "epoch": 0.49339735894357745, "grad_norm": 0.14618854224681854, "learning_rate": 0.000187190033048364, "loss": 0.4466, "step": 1644 }, { "epoch": 0.49369747899159666, "grad_norm": 0.13818864524364471, "learning_rate": 0.00018716437277843046, "loss": 0.4859, "step": 1645 }, { "epoch": 0.4939975990396158, "grad_norm": 0.14081290364265442, "learning_rate": 0.00018713868859595074, "loss": 0.4132, "step": 1646 }, { "epoch": 0.49429771908763503, "grad_norm": 0.16399510204792023, "learning_rate": 0.00018711298050797098, "loss": 0.4874, "step": 1647 }, { "epoch": 0.49459783913565425, "grad_norm": 0.14325211942195892, "learning_rate": 0.0001870872485215439, "loss": 0.4684, "step": 1648 }, { "epoch": 0.49489795918367346, "grad_norm": 0.19651995599269867, "learning_rate": 0.0001870614926437288, "loss": 0.4346, "step": 1649 }, { "epoch": 0.4951980792316927, "grad_norm": 0.15420101583003998, "learning_rate": 0.0001870357128815915, "loss": 0.4813, "step": 1650 }, { "epoch": 0.4954981992797119, "grad_norm": 0.13209006190299988, "learning_rate": 0.0001870099092422043, "loss": 0.411, "step": 1651 }, { "epoch": 0.4957983193277311, "grad_norm": 0.37966907024383545, "learning_rate": 0.00018698408173264627, "loss": 0.4548, "step": 1652 }, { "epoch": 0.4960984393757503, "grad_norm": 0.1550043672323227, "learning_rate": 0.0001869582303600028, "loss": 0.4663, "step": 1653 }, { "epoch": 0.4963985594237695, "grad_norm": 0.14909137785434723, "learning_rate": 0.00018693235513136597, "loss": 0.4915, "step": 1654 }, { "epoch": 0.4966986794717887, "grad_norm": 0.12357178330421448, "learning_rate": 0.00018690645605383432, "loss": 0.4396, "step": 1655 }, { "epoch": 0.4969987995198079, "grad_norm": 0.2299381047487259, "learning_rate": 0.00018688053313451296, "loss": 0.4607, "step": 1656 }, { "epoch": 0.4972989195678271, "grad_norm": 0.1735653132200241, "learning_rate": 0.00018685458638051361, "loss": 0.4232, "step": 1657 }, { "epoch": 0.49759903961584634, "grad_norm": 0.14225593209266663, "learning_rate": 0.00018682861579895436, "loss": 0.4265, "step": 1658 }, { "epoch": 0.49789915966386555, "grad_norm": 0.15407206118106842, "learning_rate": 0.00018680262139695997, "loss": 0.4791, "step": 1659 }, { "epoch": 0.49819927971188477, "grad_norm": 0.13994479179382324, "learning_rate": 0.00018677660318166178, "loss": 0.4441, "step": 1660 }, { "epoch": 0.498499399759904, "grad_norm": 0.15098072588443756, "learning_rate": 0.00018675056116019753, "loss": 0.4234, "step": 1661 }, { "epoch": 0.49879951980792314, "grad_norm": 0.17504270374774933, "learning_rate": 0.00018672449533971156, "loss": 0.487, "step": 1662 }, { "epoch": 0.49909963985594236, "grad_norm": 0.14800134301185608, "learning_rate": 0.00018669840572735472, "loss": 0.4558, "step": 1663 }, { "epoch": 0.49939975990396157, "grad_norm": 0.13439930975437164, "learning_rate": 0.0001866722923302844, "loss": 0.4529, "step": 1664 }, { "epoch": 0.4996998799519808, "grad_norm": 0.13524958491325378, "learning_rate": 0.0001866461551556645, "loss": 0.423, "step": 1665 }, { "epoch": 0.5, "grad_norm": 0.1444150060415268, "learning_rate": 0.0001866199942106655, "loss": 0.4748, "step": 1666 }, { "epoch": 0.5003001200480192, "grad_norm": 0.14408966898918152, "learning_rate": 0.00018659380950246434, "loss": 0.4791, "step": 1667 }, { "epoch": 0.5006002400960384, "grad_norm": 0.18511995673179626, "learning_rate": 0.0001865676010382444, "loss": 0.4899, "step": 1668 }, { "epoch": 0.5009003601440576, "grad_norm": 0.14206524193286896, "learning_rate": 0.00018654136882519578, "loss": 0.4371, "step": 1669 }, { "epoch": 0.5012004801920769, "grad_norm": 0.1453809291124344, "learning_rate": 0.000186515112870515, "loss": 0.4839, "step": 1670 }, { "epoch": 0.501500600240096, "grad_norm": 0.13603635132312775, "learning_rate": 0.000186488833181405, "loss": 0.4679, "step": 1671 }, { "epoch": 0.5018007202881153, "grad_norm": 0.2923451066017151, "learning_rate": 0.00018646252976507537, "loss": 0.5172, "step": 1672 }, { "epoch": 0.5021008403361344, "grad_norm": 0.14126873016357422, "learning_rate": 0.0001864362026287421, "loss": 0.4173, "step": 1673 }, { "epoch": 0.5024009603841537, "grad_norm": 0.16036510467529297, "learning_rate": 0.00018640985177962783, "loss": 0.4706, "step": 1674 }, { "epoch": 0.5027010804321729, "grad_norm": 0.18069399893283844, "learning_rate": 0.0001863834772249615, "loss": 0.5106, "step": 1675 }, { "epoch": 0.503001200480192, "grad_norm": 0.17470668256282806, "learning_rate": 0.00018635707897197873, "loss": 0.4521, "step": 1676 }, { "epoch": 0.5033013205282113, "grad_norm": 0.14301811158657074, "learning_rate": 0.00018633065702792153, "loss": 0.4877, "step": 1677 }, { "epoch": 0.5036014405762305, "grad_norm": 0.14429409801959991, "learning_rate": 0.00018630421140003854, "loss": 0.5012, "step": 1678 }, { "epoch": 0.5039015606242497, "grad_norm": 0.1368817389011383, "learning_rate": 0.0001862777420955847, "loss": 0.439, "step": 1679 }, { "epoch": 0.5042016806722689, "grad_norm": 0.1479075849056244, "learning_rate": 0.0001862512491218217, "loss": 0.4278, "step": 1680 }, { "epoch": 0.5045018007202882, "grad_norm": 0.13655149936676025, "learning_rate": 0.00018622473248601748, "loss": 0.4818, "step": 1681 }, { "epoch": 0.5048019207683073, "grad_norm": 0.7065068483352661, "learning_rate": 0.00018619819219544662, "loss": 0.492, "step": 1682 }, { "epoch": 0.5051020408163265, "grad_norm": 0.1435791552066803, "learning_rate": 0.00018617162825739013, "loss": 0.4943, "step": 1683 }, { "epoch": 0.5054021608643458, "grad_norm": 0.12881679832935333, "learning_rate": 0.0001861450406791355, "loss": 0.4289, "step": 1684 }, { "epoch": 0.5057022809123649, "grad_norm": 0.15346279740333557, "learning_rate": 0.00018611842946797676, "loss": 0.4951, "step": 1685 }, { "epoch": 0.5060024009603842, "grad_norm": 0.1458282768726349, "learning_rate": 0.0001860917946312144, "loss": 0.4993, "step": 1686 }, { "epoch": 0.5063025210084033, "grad_norm": 0.17145638167858124, "learning_rate": 0.00018606513617615533, "loss": 0.4638, "step": 1687 }, { "epoch": 0.5066026410564226, "grad_norm": 0.1460111290216446, "learning_rate": 0.00018603845411011303, "loss": 0.4975, "step": 1688 }, { "epoch": 0.5069027611044418, "grad_norm": 0.13614998757839203, "learning_rate": 0.00018601174844040742, "loss": 0.3831, "step": 1689 }, { "epoch": 0.507202881152461, "grad_norm": 0.13792163133621216, "learning_rate": 0.00018598501917436487, "loss": 0.4625, "step": 1690 }, { "epoch": 0.5075030012004802, "grad_norm": 0.19221265614032745, "learning_rate": 0.0001859582663193183, "loss": 0.4357, "step": 1691 }, { "epoch": 0.5078031212484994, "grad_norm": 0.1561165750026703, "learning_rate": 0.000185931489882607, "loss": 0.4401, "step": 1692 }, { "epoch": 0.5081032412965186, "grad_norm": 0.19886402785778046, "learning_rate": 0.00018590468987157678, "loss": 0.4883, "step": 1693 }, { "epoch": 0.5084033613445378, "grad_norm": 0.14886119961738586, "learning_rate": 0.00018587786629357993, "loss": 0.468, "step": 1694 }, { "epoch": 0.508703481392557, "grad_norm": 0.13450387120246887, "learning_rate": 0.00018585101915597518, "loss": 0.4569, "step": 1695 }, { "epoch": 0.5090036014405762, "grad_norm": 0.14112287759780884, "learning_rate": 0.00018582414846612775, "loss": 0.467, "step": 1696 }, { "epoch": 0.5093037214885955, "grad_norm": 0.15156422555446625, "learning_rate": 0.0001857972542314093, "loss": 0.4965, "step": 1697 }, { "epoch": 0.5096038415366146, "grad_norm": 0.13746501505374908, "learning_rate": 0.00018577033645919794, "loss": 0.4749, "step": 1698 }, { "epoch": 0.5099039615846338, "grad_norm": 0.1336117684841156, "learning_rate": 0.0001857433951568783, "loss": 0.46, "step": 1699 }, { "epoch": 0.5102040816326531, "grad_norm": 0.1843332201242447, "learning_rate": 0.00018571643033184136, "loss": 0.4509, "step": 1700 }, { "epoch": 0.5105042016806722, "grad_norm": 0.148481085896492, "learning_rate": 0.00018568944199148462, "loss": 0.4617, "step": 1701 }, { "epoch": 0.5108043217286915, "grad_norm": 0.13783060014247894, "learning_rate": 0.00018566243014321205, "loss": 0.4891, "step": 1702 }, { "epoch": 0.5111044417767107, "grad_norm": 0.1354529857635498, "learning_rate": 0.00018563539479443404, "loss": 0.4588, "step": 1703 }, { "epoch": 0.5114045618247299, "grad_norm": 0.16476033627986908, "learning_rate": 0.0001856083359525674, "loss": 0.5441, "step": 1704 }, { "epoch": 0.5117046818727491, "grad_norm": 0.18820038437843323, "learning_rate": 0.00018558125362503543, "loss": 0.4573, "step": 1705 }, { "epoch": 0.5120048019207684, "grad_norm": 0.14193595945835114, "learning_rate": 0.00018555414781926786, "loss": 0.4989, "step": 1706 }, { "epoch": 0.5123049219687875, "grad_norm": 0.13493549823760986, "learning_rate": 0.00018552701854270082, "loss": 0.4456, "step": 1707 }, { "epoch": 0.5126050420168067, "grad_norm": 0.138336643576622, "learning_rate": 0.000185499865802777, "loss": 0.4975, "step": 1708 }, { "epoch": 0.512905162064826, "grad_norm": 0.1490534096956253, "learning_rate": 0.00018547268960694533, "loss": 0.469, "step": 1709 }, { "epoch": 0.5132052821128451, "grad_norm": 0.18119500577449799, "learning_rate": 0.00018544548996266138, "loss": 0.4968, "step": 1710 }, { "epoch": 0.5135054021608644, "grad_norm": 0.14145724475383759, "learning_rate": 0.000185418266877387, "loss": 0.4767, "step": 1711 }, { "epoch": 0.5138055222088835, "grad_norm": 0.151786208152771, "learning_rate": 0.00018539102035859057, "loss": 0.5035, "step": 1712 }, { "epoch": 0.5141056422569028, "grad_norm": 0.33611804246902466, "learning_rate": 0.00018536375041374684, "loss": 0.4856, "step": 1713 }, { "epoch": 0.514405762304922, "grad_norm": 0.1260775625705719, "learning_rate": 0.00018533645705033703, "loss": 0.3688, "step": 1714 }, { "epoch": 0.5147058823529411, "grad_norm": 0.1975342482328415, "learning_rate": 0.00018530914027584875, "loss": 0.5317, "step": 1715 }, { "epoch": 0.5150060024009604, "grad_norm": 0.14410541951656342, "learning_rate": 0.00018528180009777601, "loss": 0.4889, "step": 1716 }, { "epoch": 0.5153061224489796, "grad_norm": 0.15689077973365784, "learning_rate": 0.00018525443652361935, "loss": 0.5142, "step": 1717 }, { "epoch": 0.5156062424969988, "grad_norm": 0.1498923897743225, "learning_rate": 0.00018522704956088558, "loss": 0.5111, "step": 1718 }, { "epoch": 0.515906362545018, "grad_norm": 0.13396055996418, "learning_rate": 0.00018519963921708805, "loss": 0.4741, "step": 1719 }, { "epoch": 0.5162064825930373, "grad_norm": 0.13936327397823334, "learning_rate": 0.00018517220549974642, "loss": 0.4694, "step": 1720 }, { "epoch": 0.5165066026410564, "grad_norm": 0.16161802411079407, "learning_rate": 0.00018514474841638685, "loss": 0.4651, "step": 1721 }, { "epoch": 0.5168067226890757, "grad_norm": 0.1398150771856308, "learning_rate": 0.00018511726797454189, "loss": 0.4367, "step": 1722 }, { "epoch": 0.5171068427370948, "grad_norm": 0.1277453601360321, "learning_rate": 0.00018508976418175045, "loss": 0.4314, "step": 1723 }, { "epoch": 0.517406962785114, "grad_norm": 0.14832301437854767, "learning_rate": 0.0001850622370455579, "loss": 0.5337, "step": 1724 }, { "epoch": 0.5177070828331333, "grad_norm": 0.1403554230928421, "learning_rate": 0.000185034686573516, "loss": 0.4656, "step": 1725 }, { "epoch": 0.5180072028811524, "grad_norm": 0.14238956570625305, "learning_rate": 0.00018500711277318288, "loss": 0.4612, "step": 1726 }, { "epoch": 0.5183073229291717, "grad_norm": 0.13832856714725494, "learning_rate": 0.0001849795156521231, "loss": 0.4691, "step": 1727 }, { "epoch": 0.5186074429771909, "grad_norm": 0.17772655189037323, "learning_rate": 0.00018495189521790766, "loss": 0.4763, "step": 1728 }, { "epoch": 0.5189075630252101, "grad_norm": 0.1370764821767807, "learning_rate": 0.00018492425147811385, "loss": 0.4635, "step": 1729 }, { "epoch": 0.5192076830732293, "grad_norm": 0.15430189669132233, "learning_rate": 0.00018489658444032544, "loss": 0.5373, "step": 1730 }, { "epoch": 0.5195078031212484, "grad_norm": 0.1377253532409668, "learning_rate": 0.0001848688941121326, "loss": 0.4291, "step": 1731 }, { "epoch": 0.5198079231692677, "grad_norm": 0.1475270688533783, "learning_rate": 0.00018484118050113177, "loss": 0.4561, "step": 1732 }, { "epoch": 0.5201080432172869, "grad_norm": 0.17185480892658234, "learning_rate": 0.00018481344361492592, "loss": 0.5085, "step": 1733 }, { "epoch": 0.5204081632653061, "grad_norm": 0.14446613192558289, "learning_rate": 0.00018478568346112434, "loss": 0.4896, "step": 1734 }, { "epoch": 0.5207082833133253, "grad_norm": 0.1468825787305832, "learning_rate": 0.0001847579000473427, "loss": 0.4509, "step": 1735 }, { "epoch": 0.5210084033613446, "grad_norm": 0.12412779033184052, "learning_rate": 0.00018473009338120308, "loss": 0.4299, "step": 1736 }, { "epoch": 0.5213085234093637, "grad_norm": 0.19421713054180145, "learning_rate": 0.00018470226347033387, "loss": 0.5167, "step": 1737 }, { "epoch": 0.521608643457383, "grad_norm": 0.2185223549604416, "learning_rate": 0.00018467441032236995, "loss": 0.4709, "step": 1738 }, { "epoch": 0.5219087635054022, "grad_norm": 0.17113527655601501, "learning_rate": 0.0001846465339449525, "loss": 0.489, "step": 1739 }, { "epoch": 0.5222088835534213, "grad_norm": 0.1450916826725006, "learning_rate": 0.00018461863434572905, "loss": 0.5109, "step": 1740 }, { "epoch": 0.5225090036014406, "grad_norm": 0.13966700434684753, "learning_rate": 0.00018459071153235356, "loss": 0.4244, "step": 1741 }, { "epoch": 0.5228091236494598, "grad_norm": 0.26031041145324707, "learning_rate": 0.0001845627655124863, "loss": 0.5413, "step": 1742 }, { "epoch": 0.523109243697479, "grad_norm": 0.14094829559326172, "learning_rate": 0.000184534796293794, "loss": 0.4504, "step": 1743 }, { "epoch": 0.5234093637454982, "grad_norm": 0.29313942790031433, "learning_rate": 0.00018450680388394967, "loss": 0.428, "step": 1744 }, { "epoch": 0.5237094837935174, "grad_norm": 0.135672926902771, "learning_rate": 0.00018447878829063268, "loss": 0.4074, "step": 1745 }, { "epoch": 0.5240096038415366, "grad_norm": 0.1842111051082611, "learning_rate": 0.0001844507495215288, "loss": 0.4195, "step": 1746 }, { "epoch": 0.5243097238895558, "grad_norm": 0.2752435505390167, "learning_rate": 0.0001844226875843302, "loss": 0.4208, "step": 1747 }, { "epoch": 0.524609843937575, "grad_norm": 0.15792347490787506, "learning_rate": 0.00018439460248673522, "loss": 0.4752, "step": 1748 }, { "epoch": 0.5249099639855942, "grad_norm": 0.732960045337677, "learning_rate": 0.00018436649423644882, "loss": 0.4421, "step": 1749 }, { "epoch": 0.5252100840336135, "grad_norm": 0.23590520024299622, "learning_rate": 0.0001843383628411821, "loss": 0.4635, "step": 1750 }, { "epoch": 0.5255102040816326, "grad_norm": 0.23066602647304535, "learning_rate": 0.0001843102083086526, "loss": 0.4581, "step": 1751 }, { "epoch": 0.5258103241296519, "grad_norm": 0.20254339277744293, "learning_rate": 0.00018428203064658422, "loss": 0.497, "step": 1752 }, { "epoch": 0.526110444177671, "grad_norm": 0.24237699806690216, "learning_rate": 0.00018425382986270717, "loss": 0.4573, "step": 1753 }, { "epoch": 0.5264105642256903, "grad_norm": 0.16338394582271576, "learning_rate": 0.000184225605964758, "loss": 0.4818, "step": 1754 }, { "epoch": 0.5267106842737095, "grad_norm": 0.2194354236125946, "learning_rate": 0.0001841973589604796, "loss": 0.5307, "step": 1755 }, { "epoch": 0.5270108043217286, "grad_norm": 0.15968391299247742, "learning_rate": 0.00018416908885762122, "loss": 0.4751, "step": 1756 }, { "epoch": 0.5273109243697479, "grad_norm": 0.3560429513454437, "learning_rate": 0.00018414079566393844, "loss": 0.5122, "step": 1757 }, { "epoch": 0.5276110444177671, "grad_norm": 0.2543269097805023, "learning_rate": 0.0001841124793871932, "loss": 0.462, "step": 1758 }, { "epoch": 0.5279111644657863, "grad_norm": 0.2932210862636566, "learning_rate": 0.00018408414003515371, "loss": 0.452, "step": 1759 }, { "epoch": 0.5282112845138055, "grad_norm": 0.1889122724533081, "learning_rate": 0.00018405577761559453, "loss": 0.4586, "step": 1760 }, { "epoch": 0.5285114045618248, "grad_norm": 0.16669833660125732, "learning_rate": 0.00018402739213629665, "loss": 0.4901, "step": 1761 }, { "epoch": 0.5288115246098439, "grad_norm": 0.15147271752357483, "learning_rate": 0.0001839989836050472, "loss": 0.4456, "step": 1762 }, { "epoch": 0.5291116446578632, "grad_norm": 0.24772000312805176, "learning_rate": 0.00018397055202963982, "loss": 0.4918, "step": 1763 }, { "epoch": 0.5294117647058824, "grad_norm": 0.19363531470298767, "learning_rate": 0.0001839420974178743, "loss": 0.4948, "step": 1764 }, { "epoch": 0.5297118847539015, "grad_norm": 0.17379561066627502, "learning_rate": 0.00018391361977755693, "loss": 0.4648, "step": 1765 }, { "epoch": 0.5300120048019208, "grad_norm": 0.38105544447898865, "learning_rate": 0.00018388511911650014, "loss": 0.5332, "step": 1766 }, { "epoch": 0.53031212484994, "grad_norm": 0.20324936509132385, "learning_rate": 0.00018385659544252283, "loss": 0.4776, "step": 1767 }, { "epoch": 0.5306122448979592, "grad_norm": 0.22118645906448364, "learning_rate": 0.00018382804876345007, "loss": 0.5479, "step": 1768 }, { "epoch": 0.5309123649459784, "grad_norm": 0.15739142894744873, "learning_rate": 0.00018379947908711336, "loss": 0.4636, "step": 1769 }, { "epoch": 0.5312124849939976, "grad_norm": 0.16198864579200745, "learning_rate": 0.0001837708864213505, "loss": 0.465, "step": 1770 }, { "epoch": 0.5315126050420168, "grad_norm": 0.1969040334224701, "learning_rate": 0.0001837422707740055, "loss": 0.4949, "step": 1771 }, { "epoch": 0.531812725090036, "grad_norm": 0.19768664240837097, "learning_rate": 0.00018371363215292873, "loss": 0.4686, "step": 1772 }, { "epoch": 0.5321128451380552, "grad_norm": 0.1620958149433136, "learning_rate": 0.00018368497056597688, "loss": 0.4895, "step": 1773 }, { "epoch": 0.5324129651860744, "grad_norm": 0.19767068326473236, "learning_rate": 0.00018365628602101295, "loss": 0.5202, "step": 1774 }, { "epoch": 0.5327130852340937, "grad_norm": 0.15351001918315887, "learning_rate": 0.0001836275785259062, "loss": 0.4893, "step": 1775 }, { "epoch": 0.5330132052821128, "grad_norm": 0.15768110752105713, "learning_rate": 0.00018359884808853222, "loss": 0.4987, "step": 1776 }, { "epoch": 0.5333133253301321, "grad_norm": 0.18536004424095154, "learning_rate": 0.00018357009471677284, "loss": 0.4503, "step": 1777 }, { "epoch": 0.5336134453781513, "grad_norm": 0.1558152437210083, "learning_rate": 0.00018354131841851623, "loss": 0.4222, "step": 1778 }, { "epoch": 0.5339135654261705, "grad_norm": 0.18492354452610016, "learning_rate": 0.00018351251920165686, "loss": 0.4214, "step": 1779 }, { "epoch": 0.5342136854741897, "grad_norm": 0.21126689016819, "learning_rate": 0.00018348369707409546, "loss": 0.4899, "step": 1780 }, { "epoch": 0.5345138055222088, "grad_norm": 0.30342015624046326, "learning_rate": 0.00018345485204373905, "loss": 0.5045, "step": 1781 }, { "epoch": 0.5348139255702281, "grad_norm": 0.20529039204120636, "learning_rate": 0.00018342598411850088, "loss": 0.5035, "step": 1782 }, { "epoch": 0.5351140456182473, "grad_norm": 0.16132153570652008, "learning_rate": 0.0001833970933063006, "loss": 0.4648, "step": 1783 }, { "epoch": 0.5354141656662665, "grad_norm": 0.17530177533626556, "learning_rate": 0.00018336817961506408, "loss": 0.5205, "step": 1784 }, { "epoch": 0.5357142857142857, "grad_norm": 0.3235676884651184, "learning_rate": 0.0001833392430527234, "loss": 0.5159, "step": 1785 }, { "epoch": 0.536014405762305, "grad_norm": 0.228851780295372, "learning_rate": 0.00018331028362721701, "loss": 0.5307, "step": 1786 }, { "epoch": 0.5363145258103241, "grad_norm": 0.16981162130832672, "learning_rate": 0.0001832813013464896, "loss": 0.5155, "step": 1787 }, { "epoch": 0.5366146458583433, "grad_norm": 0.15694698691368103, "learning_rate": 0.0001832522962184921, "loss": 0.4797, "step": 1788 }, { "epoch": 0.5369147659063626, "grad_norm": 0.15145984292030334, "learning_rate": 0.00018322326825118176, "loss": 0.4455, "step": 1789 }, { "epoch": 0.5372148859543817, "grad_norm": 0.20638592541217804, "learning_rate": 0.00018319421745252208, "loss": 0.4699, "step": 1790 }, { "epoch": 0.537515006002401, "grad_norm": 0.17048896849155426, "learning_rate": 0.0001831651438304828, "loss": 0.4974, "step": 1791 }, { "epoch": 0.5378151260504201, "grad_norm": 0.1427546590566635, "learning_rate": 0.00018313604739303988, "loss": 0.4574, "step": 1792 }, { "epoch": 0.5381152460984394, "grad_norm": 0.19892023503780365, "learning_rate": 0.00018310692814817569, "loss": 0.4983, "step": 1793 }, { "epoch": 0.5384153661464586, "grad_norm": 0.22113119065761566, "learning_rate": 0.00018307778610387868, "loss": 0.5172, "step": 1794 }, { "epoch": 0.5387154861944778, "grad_norm": 0.1393192708492279, "learning_rate": 0.0001830486212681437, "loss": 0.4397, "step": 1795 }, { "epoch": 0.539015606242497, "grad_norm": 0.17644095420837402, "learning_rate": 0.00018301943364897177, "loss": 0.4744, "step": 1796 }, { "epoch": 0.5393157262905162, "grad_norm": 0.1647583246231079, "learning_rate": 0.00018299022325437013, "loss": 0.4958, "step": 1797 }, { "epoch": 0.5396158463385354, "grad_norm": 0.19911617040634155, "learning_rate": 0.0001829609900923524, "loss": 0.4451, "step": 1798 }, { "epoch": 0.5399159663865546, "grad_norm": 0.1554144322872162, "learning_rate": 0.00018293173417093826, "loss": 0.5257, "step": 1799 }, { "epoch": 0.5402160864345739, "grad_norm": 0.245337575674057, "learning_rate": 0.00018290245549815385, "loss": 0.4839, "step": 1800 }, { "epoch": 0.540516206482593, "grad_norm": 0.17131870985031128, "learning_rate": 0.00018287315408203135, "loss": 0.4841, "step": 1801 }, { "epoch": 0.5408163265306123, "grad_norm": 0.30908921360969543, "learning_rate": 0.00018284382993060931, "loss": 0.4453, "step": 1802 }, { "epoch": 0.5411164465786314, "grad_norm": 0.15892593562602997, "learning_rate": 0.00018281448305193244, "loss": 0.4909, "step": 1803 }, { "epoch": 0.5414165666266506, "grad_norm": 0.20572002232074738, "learning_rate": 0.00018278511345405177, "loss": 0.4391, "step": 1804 }, { "epoch": 0.5417166866746699, "grad_norm": 0.16379763185977936, "learning_rate": 0.0001827557211450245, "loss": 0.478, "step": 1805 }, { "epoch": 0.542016806722689, "grad_norm": 0.19017498195171356, "learning_rate": 0.00018272630613291405, "loss": 0.4413, "step": 1806 }, { "epoch": 0.5423169267707083, "grad_norm": 0.1718524992465973, "learning_rate": 0.0001826968684257901, "loss": 0.5658, "step": 1807 }, { "epoch": 0.5426170468187275, "grad_norm": 0.3137906491756439, "learning_rate": 0.0001826674080317285, "loss": 0.3869, "step": 1808 }, { "epoch": 0.5429171668667467, "grad_norm": 0.2052076756954193, "learning_rate": 0.00018263792495881148, "loss": 0.4597, "step": 1809 }, { "epoch": 0.5432172869147659, "grad_norm": 0.2138308882713318, "learning_rate": 0.0001826084192151273, "loss": 0.4839, "step": 1810 }, { "epoch": 0.5435174069627852, "grad_norm": 0.17791545391082764, "learning_rate": 0.00018257889080877055, "loss": 0.4989, "step": 1811 }, { "epoch": 0.5438175270108043, "grad_norm": 0.1574055552482605, "learning_rate": 0.000182549339747842, "loss": 0.4785, "step": 1812 }, { "epoch": 0.5441176470588235, "grad_norm": 0.33668404817581177, "learning_rate": 0.00018251976604044868, "loss": 0.486, "step": 1813 }, { "epoch": 0.5444177671068428, "grad_norm": 0.1609889566898346, "learning_rate": 0.0001824901696947037, "loss": 0.5451, "step": 1814 }, { "epoch": 0.5447178871548619, "grad_norm": 0.14856895804405212, "learning_rate": 0.0001824605507187266, "loss": 0.4601, "step": 1815 }, { "epoch": 0.5450180072028812, "grad_norm": 0.3381519913673401, "learning_rate": 0.00018243090912064294, "loss": 0.4626, "step": 1816 }, { "epoch": 0.5453181272509003, "grad_norm": 0.1419222056865692, "learning_rate": 0.00018240124490858457, "loss": 0.4175, "step": 1817 }, { "epoch": 0.5456182472989196, "grad_norm": 0.15159635245800018, "learning_rate": 0.0001823715580906895, "loss": 0.4754, "step": 1818 }, { "epoch": 0.5459183673469388, "grad_norm": 0.17327073216438293, "learning_rate": 0.000182341848675102, "loss": 0.4694, "step": 1819 }, { "epoch": 0.5462184873949579, "grad_norm": 0.18374492228031158, "learning_rate": 0.00018231211666997247, "loss": 0.4907, "step": 1820 }, { "epoch": 0.5465186074429772, "grad_norm": 0.16119374334812164, "learning_rate": 0.00018228236208345762, "loss": 0.4843, "step": 1821 }, { "epoch": 0.5468187274909964, "grad_norm": 0.15638823807239532, "learning_rate": 0.00018225258492372018, "loss": 0.4957, "step": 1822 }, { "epoch": 0.5471188475390156, "grad_norm": 0.15086990594863892, "learning_rate": 0.00018222278519892926, "loss": 0.4562, "step": 1823 }, { "epoch": 0.5474189675870348, "grad_norm": 0.2456623613834381, "learning_rate": 0.00018219296291726003, "loss": 0.4646, "step": 1824 }, { "epoch": 0.5477190876350541, "grad_norm": 1.42583429813385, "learning_rate": 0.00018216311808689388, "loss": 0.4726, "step": 1825 }, { "epoch": 0.5480192076830732, "grad_norm": 0.16602733731269836, "learning_rate": 0.0001821332507160184, "loss": 0.4781, "step": 1826 }, { "epoch": 0.5483193277310925, "grad_norm": 0.16486293077468872, "learning_rate": 0.00018210336081282743, "loss": 0.4647, "step": 1827 }, { "epoch": 0.5486194477791116, "grad_norm": 0.1668195128440857, "learning_rate": 0.00018207344838552084, "loss": 0.4584, "step": 1828 }, { "epoch": 0.5489195678271308, "grad_norm": 0.1644868105649948, "learning_rate": 0.00018204351344230474, "loss": 0.4652, "step": 1829 }, { "epoch": 0.5492196878751501, "grad_norm": 0.1930205076932907, "learning_rate": 0.00018201355599139154, "loss": 0.4959, "step": 1830 }, { "epoch": 0.5495198079231692, "grad_norm": 0.16677197813987732, "learning_rate": 0.00018198357604099966, "loss": 0.4816, "step": 1831 }, { "epoch": 0.5498199279711885, "grad_norm": 0.2054395079612732, "learning_rate": 0.00018195357359935375, "loss": 0.4158, "step": 1832 }, { "epoch": 0.5501200480192077, "grad_norm": 0.18362215161323547, "learning_rate": 0.00018192354867468467, "loss": 0.5312, "step": 1833 }, { "epoch": 0.5504201680672269, "grad_norm": 0.16130132973194122, "learning_rate": 0.00018189350127522936, "loss": 0.4718, "step": 1834 }, { "epoch": 0.5507202881152461, "grad_norm": 0.24162361025810242, "learning_rate": 0.00018186343140923106, "loss": 0.4735, "step": 1835 }, { "epoch": 0.5510204081632653, "grad_norm": 0.13708935678005219, "learning_rate": 0.00018183333908493903, "loss": 0.4089, "step": 1836 }, { "epoch": 0.5513205282112845, "grad_norm": 0.20122745633125305, "learning_rate": 0.00018180322431060874, "loss": 0.4965, "step": 1837 }, { "epoch": 0.5516206482593037, "grad_norm": 0.1756470650434494, "learning_rate": 0.00018177308709450192, "loss": 0.5196, "step": 1838 }, { "epoch": 0.551920768307323, "grad_norm": 0.142266184091568, "learning_rate": 0.00018174292744488628, "loss": 0.4741, "step": 1839 }, { "epoch": 0.5522208883553421, "grad_norm": 0.13735970854759216, "learning_rate": 0.0001817127453700358, "loss": 0.3803, "step": 1840 }, { "epoch": 0.5525210084033614, "grad_norm": 0.14827807247638702, "learning_rate": 0.00018168254087823062, "loss": 0.4642, "step": 1841 }, { "epoch": 0.5528211284513805, "grad_norm": 0.15369684994220734, "learning_rate": 0.00018165231397775696, "loss": 0.5226, "step": 1842 }, { "epoch": 0.5531212484993998, "grad_norm": 0.17196229100227356, "learning_rate": 0.0001816220646769072, "loss": 0.542, "step": 1843 }, { "epoch": 0.553421368547419, "grad_norm": 0.15198828279972076, "learning_rate": 0.00018159179298397996, "loss": 0.4516, "step": 1844 }, { "epoch": 0.5537214885954381, "grad_norm": 0.14250534772872925, "learning_rate": 0.0001815614989072799, "loss": 0.4312, "step": 1845 }, { "epoch": 0.5540216086434574, "grad_norm": 0.190664604306221, "learning_rate": 0.00018153118245511785, "loss": 0.4878, "step": 1846 }, { "epoch": 0.5543217286914766, "grad_norm": 0.16536127030849457, "learning_rate": 0.00018150084363581075, "loss": 0.4951, "step": 1847 }, { "epoch": 0.5546218487394958, "grad_norm": 0.1848268061876297, "learning_rate": 0.00018147048245768175, "loss": 0.4926, "step": 1848 }, { "epoch": 0.554921968787515, "grad_norm": 0.19543147087097168, "learning_rate": 0.00018144009892906006, "loss": 0.4962, "step": 1849 }, { "epoch": 0.5552220888355343, "grad_norm": 0.23135653138160706, "learning_rate": 0.00018140969305828106, "loss": 0.5021, "step": 1850 }, { "epoch": 0.5555222088835534, "grad_norm": 0.13906599581241608, "learning_rate": 0.0001813792648536863, "loss": 0.4226, "step": 1851 }, { "epoch": 0.5558223289315727, "grad_norm": 0.14647486805915833, "learning_rate": 0.00018134881432362336, "loss": 0.4676, "step": 1852 }, { "epoch": 0.5561224489795918, "grad_norm": 0.1806657612323761, "learning_rate": 0.000181318341476446, "loss": 0.4747, "step": 1853 }, { "epoch": 0.556422569027611, "grad_norm": 0.2328762412071228, "learning_rate": 0.00018128784632051408, "loss": 0.4944, "step": 1854 }, { "epoch": 0.5567226890756303, "grad_norm": 0.14552703499794006, "learning_rate": 0.00018125732886419362, "loss": 0.4999, "step": 1855 }, { "epoch": 0.5570228091236494, "grad_norm": 0.15310394763946533, "learning_rate": 0.00018122678911585677, "loss": 0.4801, "step": 1856 }, { "epoch": 0.5573229291716687, "grad_norm": 0.2084084302186966, "learning_rate": 0.00018119622708388165, "loss": 0.5182, "step": 1857 }, { "epoch": 0.5576230492196879, "grad_norm": 0.19386032223701477, "learning_rate": 0.00018116564277665274, "loss": 0.4938, "step": 1858 }, { "epoch": 0.5579231692677071, "grad_norm": 0.1554754376411438, "learning_rate": 0.0001811350362025604, "loss": 0.4759, "step": 1859 }, { "epoch": 0.5582232893157263, "grad_norm": 0.14368936419487, "learning_rate": 0.00018110440737000122, "loss": 0.4267, "step": 1860 }, { "epoch": 0.5585234093637454, "grad_norm": 0.16357707977294922, "learning_rate": 0.00018107375628737785, "loss": 0.5091, "step": 1861 }, { "epoch": 0.5588235294117647, "grad_norm": 0.14437748491764069, "learning_rate": 0.00018104308296309913, "loss": 0.4361, "step": 1862 }, { "epoch": 0.5591236494597839, "grad_norm": 0.1523028463125229, "learning_rate": 0.00018101238740557985, "loss": 0.4523, "step": 1863 }, { "epoch": 0.5594237695078031, "grad_norm": 0.1871221661567688, "learning_rate": 0.000180981669623241, "loss": 0.5747, "step": 1864 }, { "epoch": 0.5597238895558223, "grad_norm": 0.14608240127563477, "learning_rate": 0.00018095092962450973, "loss": 0.4509, "step": 1865 }, { "epoch": 0.5600240096038416, "grad_norm": 0.16460062563419342, "learning_rate": 0.0001809201674178191, "loss": 0.4713, "step": 1866 }, { "epoch": 0.5603241296518607, "grad_norm": 0.1632416993379593, "learning_rate": 0.00018088938301160843, "loss": 0.4592, "step": 1867 }, { "epoch": 0.56062424969988, "grad_norm": 0.15670481324195862, "learning_rate": 0.0001808585764143231, "loss": 0.4528, "step": 1868 }, { "epoch": 0.5609243697478992, "grad_norm": 0.14016574621200562, "learning_rate": 0.00018082774763441444, "loss": 0.3966, "step": 1869 }, { "epoch": 0.5612244897959183, "grad_norm": 0.15027348697185516, "learning_rate": 0.00018079689668034005, "loss": 0.4706, "step": 1870 }, { "epoch": 0.5615246098439376, "grad_norm": 0.16850395500659943, "learning_rate": 0.00018076602356056353, "loss": 0.4728, "step": 1871 }, { "epoch": 0.5618247298919568, "grad_norm": 0.14605863392353058, "learning_rate": 0.00018073512828355458, "loss": 0.4838, "step": 1872 }, { "epoch": 0.562124849939976, "grad_norm": 0.15179717540740967, "learning_rate": 0.0001807042108577889, "loss": 0.4864, "step": 1873 }, { "epoch": 0.5624249699879952, "grad_norm": 0.13549867272377014, "learning_rate": 0.0001806732712917484, "loss": 0.3831, "step": 1874 }, { "epoch": 0.5627250900360145, "grad_norm": 0.14311176538467407, "learning_rate": 0.00018064230959392096, "loss": 0.4374, "step": 1875 }, { "epoch": 0.5630252100840336, "grad_norm": 0.16444933414459229, "learning_rate": 0.00018061132577280056, "loss": 0.5191, "step": 1876 }, { "epoch": 0.5633253301320528, "grad_norm": 0.2601252496242523, "learning_rate": 0.0001805803198368873, "loss": 0.4561, "step": 1877 }, { "epoch": 0.563625450180072, "grad_norm": 0.14315593242645264, "learning_rate": 0.00018054929179468724, "loss": 0.4424, "step": 1878 }, { "epoch": 0.5639255702280912, "grad_norm": 0.1418648660182953, "learning_rate": 0.00018051824165471263, "loss": 0.4463, "step": 1879 }, { "epoch": 0.5642256902761105, "grad_norm": 0.1637127697467804, "learning_rate": 0.00018048716942548168, "loss": 0.4576, "step": 1880 }, { "epoch": 0.5645258103241296, "grad_norm": 0.1702413707971573, "learning_rate": 0.00018045607511551869, "loss": 0.5403, "step": 1881 }, { "epoch": 0.5648259303721489, "grad_norm": 0.15511217713356018, "learning_rate": 0.0001804249587333541, "loss": 0.4783, "step": 1882 }, { "epoch": 0.5651260504201681, "grad_norm": 0.166106715798378, "learning_rate": 0.00018039382028752426, "loss": 0.4398, "step": 1883 }, { "epoch": 0.5654261704681873, "grad_norm": 0.1387651264667511, "learning_rate": 0.00018036265978657164, "loss": 0.4556, "step": 1884 }, { "epoch": 0.5657262905162065, "grad_norm": 0.15797646343708038, "learning_rate": 0.0001803314772390448, "loss": 0.4892, "step": 1885 }, { "epoch": 0.5660264105642256, "grad_norm": 0.20617718994617462, "learning_rate": 0.00018030027265349835, "loss": 0.5124, "step": 1886 }, { "epoch": 0.5663265306122449, "grad_norm": 0.14816582202911377, "learning_rate": 0.0001802690460384928, "loss": 0.5076, "step": 1887 }, { "epoch": 0.5666266506602641, "grad_norm": 0.1775469332933426, "learning_rate": 0.00018023779740259494, "loss": 0.4847, "step": 1888 }, { "epoch": 0.5669267707082833, "grad_norm": 0.1552363783121109, "learning_rate": 0.0001802065267543774, "loss": 0.5234, "step": 1889 }, { "epoch": 0.5672268907563025, "grad_norm": 0.1534537822008133, "learning_rate": 0.00018017523410241893, "loss": 0.4943, "step": 1890 }, { "epoch": 0.5675270108043218, "grad_norm": 0.15913110971450806, "learning_rate": 0.0001801439194553043, "loss": 0.4902, "step": 1891 }, { "epoch": 0.5678271308523409, "grad_norm": 0.16289573907852173, "learning_rate": 0.0001801125828216244, "loss": 0.4366, "step": 1892 }, { "epoch": 0.5681272509003601, "grad_norm": 0.14295737445354462, "learning_rate": 0.00018008122420997598, "loss": 0.435, "step": 1893 }, { "epoch": 0.5684273709483794, "grad_norm": 0.1722162663936615, "learning_rate": 0.00018004984362896196, "loss": 0.4712, "step": 1894 }, { "epoch": 0.5687274909963985, "grad_norm": 0.15125919878482819, "learning_rate": 0.00018001844108719124, "loss": 0.4776, "step": 1895 }, { "epoch": 0.5690276110444178, "grad_norm": 0.14587509632110596, "learning_rate": 0.00017998701659327875, "loss": 0.4654, "step": 1896 }, { "epoch": 0.569327731092437, "grad_norm": 0.1562846153974533, "learning_rate": 0.00017995557015584542, "loss": 0.4874, "step": 1897 }, { "epoch": 0.5696278511404562, "grad_norm": 0.18704284727573395, "learning_rate": 0.00017992410178351825, "loss": 0.4313, "step": 1898 }, { "epoch": 0.5699279711884754, "grad_norm": 0.1448519378900528, "learning_rate": 0.00017989261148493023, "loss": 0.4397, "step": 1899 }, { "epoch": 0.5702280912364946, "grad_norm": 0.1406678855419159, "learning_rate": 0.00017986109926872032, "loss": 0.4486, "step": 1900 }, { "epoch": 0.5705282112845138, "grad_norm": 0.1617930382490158, "learning_rate": 0.0001798295651435336, "loss": 0.5144, "step": 1901 }, { "epoch": 0.570828331332533, "grad_norm": 0.14808118343353271, "learning_rate": 0.000179798009118021, "loss": 0.4841, "step": 1902 }, { "epoch": 0.5711284513805522, "grad_norm": 0.23392923176288605, "learning_rate": 0.00017976643120083964, "loss": 0.4073, "step": 1903 }, { "epoch": 0.5714285714285714, "grad_norm": 0.14798158407211304, "learning_rate": 0.00017973483140065253, "loss": 0.4561, "step": 1904 }, { "epoch": 0.5717286914765907, "grad_norm": 0.16208335757255554, "learning_rate": 0.00017970320972612869, "loss": 0.4918, "step": 1905 }, { "epoch": 0.5720288115246098, "grad_norm": 0.1403273195028305, "learning_rate": 0.00017967156618594322, "loss": 0.426, "step": 1906 }, { "epoch": 0.5723289315726291, "grad_norm": 0.1615184247493744, "learning_rate": 0.0001796399007887771, "loss": 0.4749, "step": 1907 }, { "epoch": 0.5726290516206483, "grad_norm": 0.2007116824388504, "learning_rate": 0.00017960821354331738, "loss": 0.4807, "step": 1908 }, { "epoch": 0.5729291716686674, "grad_norm": 0.158131405711174, "learning_rate": 0.00017957650445825713, "loss": 0.5156, "step": 1909 }, { "epoch": 0.5732292917166867, "grad_norm": 0.15085642039775848, "learning_rate": 0.00017954477354229536, "loss": 0.4494, "step": 1910 }, { "epoch": 0.5735294117647058, "grad_norm": 0.16237707436084747, "learning_rate": 0.00017951302080413707, "loss": 0.4955, "step": 1911 }, { "epoch": 0.5738295318127251, "grad_norm": 0.1413777470588684, "learning_rate": 0.00017948124625249328, "loss": 0.4761, "step": 1912 }, { "epoch": 0.5741296518607443, "grad_norm": 0.1585283726453781, "learning_rate": 0.00017944944989608096, "loss": 0.5372, "step": 1913 }, { "epoch": 0.5744297719087635, "grad_norm": 0.13110783696174622, "learning_rate": 0.00017941763174362304, "loss": 0.4157, "step": 1914 }, { "epoch": 0.5747298919567827, "grad_norm": 0.14906930923461914, "learning_rate": 0.00017938579180384854, "loss": 0.4632, "step": 1915 }, { "epoch": 0.575030012004802, "grad_norm": 0.13836872577667236, "learning_rate": 0.00017935393008549228, "loss": 0.4291, "step": 1916 }, { "epoch": 0.5753301320528211, "grad_norm": 0.16051827371120453, "learning_rate": 0.0001793220465972953, "loss": 0.4826, "step": 1917 }, { "epoch": 0.5756302521008403, "grad_norm": 0.14186377823352814, "learning_rate": 0.00017929014134800432, "loss": 0.4562, "step": 1918 }, { "epoch": 0.5759303721488596, "grad_norm": 0.17933087050914764, "learning_rate": 0.0001792582143463723, "loss": 0.4544, "step": 1919 }, { "epoch": 0.5762304921968787, "grad_norm": 0.5719718337059021, "learning_rate": 0.00017922626560115798, "loss": 0.4773, "step": 1920 }, { "epoch": 0.576530612244898, "grad_norm": 0.15085569024085999, "learning_rate": 0.0001791942951211261, "loss": 0.494, "step": 1921 }, { "epoch": 0.5768307322929171, "grad_norm": 0.1766566038131714, "learning_rate": 0.0001791623029150475, "loss": 0.5268, "step": 1922 }, { "epoch": 0.5771308523409364, "grad_norm": 0.34068360924720764, "learning_rate": 0.00017913028899169882, "loss": 0.4105, "step": 1923 }, { "epoch": 0.5774309723889556, "grad_norm": 0.2693949341773987, "learning_rate": 0.00017909825335986267, "loss": 0.4243, "step": 1924 }, { "epoch": 0.5777310924369747, "grad_norm": 0.23097528517246246, "learning_rate": 0.00017906619602832774, "loss": 0.4775, "step": 1925 }, { "epoch": 0.578031212484994, "grad_norm": 0.3153398334980011, "learning_rate": 0.0001790341170058885, "loss": 0.4802, "step": 1926 }, { "epoch": 0.5783313325330132, "grad_norm": 0.17524994909763336, "learning_rate": 0.00017900201630134555, "loss": 0.5036, "step": 1927 }, { "epoch": 0.5786314525810324, "grad_norm": 0.19373682141304016, "learning_rate": 0.0001789698939235053, "loss": 0.465, "step": 1928 }, { "epoch": 0.5789315726290516, "grad_norm": 0.19627533853054047, "learning_rate": 0.00017893774988118015, "loss": 0.4986, "step": 1929 }, { "epoch": 0.5792316926770709, "grad_norm": 0.15393482148647308, "learning_rate": 0.0001789055841831885, "loss": 0.4475, "step": 1930 }, { "epoch": 0.57953181272509, "grad_norm": 0.15590296685695648, "learning_rate": 0.00017887339683835457, "loss": 0.4688, "step": 1931 }, { "epoch": 0.5798319327731093, "grad_norm": 0.2071526050567627, "learning_rate": 0.00017884118785550866, "loss": 0.502, "step": 1932 }, { "epoch": 0.5801320528211285, "grad_norm": 0.14130908250808716, "learning_rate": 0.00017880895724348687, "loss": 0.446, "step": 1933 }, { "epoch": 0.5804321728691476, "grad_norm": 0.14672444760799408, "learning_rate": 0.00017877670501113135, "loss": 0.4557, "step": 1934 }, { "epoch": 0.5807322929171669, "grad_norm": 0.162306010723114, "learning_rate": 0.00017874443116729013, "loss": 0.5119, "step": 1935 }, { "epoch": 0.581032412965186, "grad_norm": 0.15669575333595276, "learning_rate": 0.0001787121357208171, "loss": 0.5024, "step": 1936 }, { "epoch": 0.5813325330132053, "grad_norm": 0.22228948771953583, "learning_rate": 0.00017867981868057223, "loss": 0.4616, "step": 1937 }, { "epoch": 0.5816326530612245, "grad_norm": 0.15275882184505463, "learning_rate": 0.0001786474800554213, "loss": 0.4786, "step": 1938 }, { "epoch": 0.5819327731092437, "grad_norm": 0.14351172745227814, "learning_rate": 0.000178615119854236, "loss": 0.421, "step": 1939 }, { "epoch": 0.5822328931572629, "grad_norm": 0.13428664207458496, "learning_rate": 0.00017858273808589402, "loss": 0.3897, "step": 1940 }, { "epoch": 0.5825330132052821, "grad_norm": 0.1439991444349289, "learning_rate": 0.00017855033475927895, "loss": 0.4672, "step": 1941 }, { "epoch": 0.5828331332533013, "grad_norm": 0.161099374294281, "learning_rate": 0.00017851790988328024, "loss": 0.4948, "step": 1942 }, { "epoch": 0.5831332533013205, "grad_norm": 0.14966915547847748, "learning_rate": 0.00017848546346679327, "loss": 0.4741, "step": 1943 }, { "epoch": 0.5834333733493398, "grad_norm": 0.17617997527122498, "learning_rate": 0.00017845299551871936, "loss": 0.4958, "step": 1944 }, { "epoch": 0.5837334933973589, "grad_norm": 0.14991939067840576, "learning_rate": 0.0001784205060479657, "loss": 0.4269, "step": 1945 }, { "epoch": 0.5840336134453782, "grad_norm": 0.15833768248558044, "learning_rate": 0.00017838799506344544, "loss": 0.4718, "step": 1946 }, { "epoch": 0.5843337334933973, "grad_norm": 0.1564813107252121, "learning_rate": 0.0001783554625740776, "loss": 0.4885, "step": 1947 }, { "epoch": 0.5846338535414166, "grad_norm": 0.9468123912811279, "learning_rate": 0.00017832290858878704, "loss": 0.4513, "step": 1948 }, { "epoch": 0.5849339735894358, "grad_norm": 0.14726650714874268, "learning_rate": 0.00017829033311650462, "loss": 0.4731, "step": 1949 }, { "epoch": 0.5852340936374549, "grad_norm": 5.142820358276367, "learning_rate": 0.00017825773616616703, "loss": 0.5011, "step": 1950 }, { "epoch": 0.5855342136854742, "grad_norm": 0.17839622497558594, "learning_rate": 0.00017822511774671687, "loss": 0.4732, "step": 1951 }, { "epoch": 0.5858343337334934, "grad_norm": 0.2276451140642166, "learning_rate": 0.00017819247786710264, "loss": 0.5079, "step": 1952 }, { "epoch": 0.5861344537815126, "grad_norm": 0.34687280654907227, "learning_rate": 0.0001781598165362787, "loss": 0.4788, "step": 1953 }, { "epoch": 0.5864345738295318, "grad_norm": 0.16111914813518524, "learning_rate": 0.0001781271337632053, "loss": 0.4534, "step": 1954 }, { "epoch": 0.5867346938775511, "grad_norm": 0.19313451647758484, "learning_rate": 0.00017809442955684862, "loss": 0.495, "step": 1955 }, { "epoch": 0.5870348139255702, "grad_norm": 0.18948647379875183, "learning_rate": 0.00017806170392618067, "loss": 0.5002, "step": 1956 }, { "epoch": 0.5873349339735895, "grad_norm": 0.19670206308364868, "learning_rate": 0.00017802895688017936, "loss": 0.4845, "step": 1957 }, { "epoch": 0.5876350540216086, "grad_norm": 0.18781894445419312, "learning_rate": 0.00017799618842782844, "loss": 0.4335, "step": 1958 }, { "epoch": 0.5879351740696278, "grad_norm": 0.17452123761177063, "learning_rate": 0.00017796339857811756, "loss": 0.4928, "step": 1959 }, { "epoch": 0.5882352941176471, "grad_norm": 0.2590268552303314, "learning_rate": 0.0001779305873400423, "loss": 0.5229, "step": 1960 }, { "epoch": 0.5885354141656662, "grad_norm": 4.169418811798096, "learning_rate": 0.00017789775472260396, "loss": 0.5012, "step": 1961 }, { "epoch": 0.5888355342136855, "grad_norm": 4.5325212478637695, "learning_rate": 0.00017786490073480984, "loss": 2.4917, "step": 1962 }, { "epoch": 0.5891356542617047, "grad_norm": 1.494038701057434, "learning_rate": 0.00017783202538567308, "loss": 0.6653, "step": 1963 }, { "epoch": 0.5894357743097239, "grad_norm": 2.7276971340179443, "learning_rate": 0.0001777991286842126, "loss": 0.579, "step": 1964 }, { "epoch": 0.5897358943577431, "grad_norm": 6.630552291870117, "learning_rate": 0.00017776621063945322, "loss": 1.2808, "step": 1965 }, { "epoch": 0.5900360144057623, "grad_norm": 1.0019192695617676, "learning_rate": 0.0001777332712604257, "loss": 0.5761, "step": 1966 }, { "epoch": 0.5903361344537815, "grad_norm": 0.37464189529418945, "learning_rate": 0.00017770031055616654, "loss": 0.5019, "step": 1967 }, { "epoch": 0.5906362545018007, "grad_norm": 0.32812222838401794, "learning_rate": 0.00017766732853571814, "loss": 0.5611, "step": 1968 }, { "epoch": 0.59093637454982, "grad_norm": 0.4193577170372009, "learning_rate": 0.00017763432520812874, "loss": 0.5191, "step": 1969 }, { "epoch": 0.5912364945978391, "grad_norm": 0.32257169485092163, "learning_rate": 0.00017760130058245242, "loss": 0.5225, "step": 1970 }, { "epoch": 0.5915366146458584, "grad_norm": 0.2552598714828491, "learning_rate": 0.00017756825466774912, "loss": 0.407, "step": 1971 }, { "epoch": 0.5918367346938775, "grad_norm": 0.2669002115726471, "learning_rate": 0.00017753518747308454, "loss": 0.568, "step": 1972 }, { "epoch": 0.5921368547418968, "grad_norm": 0.2475007325410843, "learning_rate": 0.0001775020990075304, "loss": 0.5357, "step": 1973 }, { "epoch": 0.592436974789916, "grad_norm": 0.285609632730484, "learning_rate": 0.00017746898928016404, "loss": 0.5008, "step": 1974 }, { "epoch": 0.5927370948379351, "grad_norm": 0.24300764501094818, "learning_rate": 0.0001774358583000688, "loss": 0.6036, "step": 1975 }, { "epoch": 0.5930372148859544, "grad_norm": 0.18297113478183746, "learning_rate": 0.00017740270607633377, "loss": 0.5048, "step": 1976 }, { "epoch": 0.5933373349339736, "grad_norm": 0.22285380959510803, "learning_rate": 0.00017736953261805386, "loss": 0.4995, "step": 1977 }, { "epoch": 0.5936374549819928, "grad_norm": 0.1846853494644165, "learning_rate": 0.00017733633793432985, "loss": 0.4931, "step": 1978 }, { "epoch": 0.593937575030012, "grad_norm": 0.19081871211528778, "learning_rate": 0.0001773031220342683, "loss": 0.5039, "step": 1979 }, { "epoch": 0.5942376950780313, "grad_norm": 0.214079812169075, "learning_rate": 0.0001772698849269816, "loss": 0.5024, "step": 1980 }, { "epoch": 0.5945378151260504, "grad_norm": 0.23798619210720062, "learning_rate": 0.00017723662662158803, "loss": 0.5301, "step": 1981 }, { "epoch": 0.5948379351740696, "grad_norm": 0.2054298371076584, "learning_rate": 0.00017720334712721157, "loss": 0.5638, "step": 1982 }, { "epoch": 0.5951380552220888, "grad_norm": 0.18289883434772491, "learning_rate": 0.00017717004645298207, "loss": 0.5326, "step": 1983 }, { "epoch": 0.595438175270108, "grad_norm": 0.17352937161922455, "learning_rate": 0.00017713672460803524, "loss": 0.4728, "step": 1984 }, { "epoch": 0.5957382953181273, "grad_norm": 0.17404381930828094, "learning_rate": 0.00017710338160151248, "loss": 0.5195, "step": 1985 }, { "epoch": 0.5960384153661464, "grad_norm": 0.20349542796611786, "learning_rate": 0.00017707001744256108, "loss": 0.5368, "step": 1986 }, { "epoch": 0.5963385354141657, "grad_norm": 0.18232762813568115, "learning_rate": 0.00017703663214033415, "loss": 0.4905, "step": 1987 }, { "epoch": 0.5966386554621849, "grad_norm": 0.19482576847076416, "learning_rate": 0.00017700322570399056, "loss": 0.5443, "step": 1988 }, { "epoch": 0.5969387755102041, "grad_norm": 0.1861189305782318, "learning_rate": 0.00017696979814269489, "loss": 0.4927, "step": 1989 }, { "epoch": 0.5972388955582233, "grad_norm": 0.15059329569339752, "learning_rate": 0.00017693634946561775, "loss": 0.4527, "step": 1990 }, { "epoch": 0.5975390156062425, "grad_norm": 0.1789100617170334, "learning_rate": 0.00017690287968193528, "loss": 0.5073, "step": 1991 }, { "epoch": 0.5978391356542617, "grad_norm": 0.18303994834423065, "learning_rate": 0.00017686938880082963, "loss": 0.4518, "step": 1992 }, { "epoch": 0.5981392557022809, "grad_norm": 0.1630188226699829, "learning_rate": 0.00017683587683148857, "loss": 0.5175, "step": 1993 }, { "epoch": 0.5984393757503002, "grad_norm": 0.15870651602745056, "learning_rate": 0.00017680234378310573, "loss": 0.4886, "step": 1994 }, { "epoch": 0.5987394957983193, "grad_norm": 0.16810081899166107, "learning_rate": 0.00017676878966488055, "loss": 0.5112, "step": 1995 }, { "epoch": 0.5990396158463386, "grad_norm": 0.1768048256635666, "learning_rate": 0.0001767352144860182, "loss": 0.4502, "step": 1996 }, { "epoch": 0.5993397358943577, "grad_norm": 0.16407325863838196, "learning_rate": 0.00017670161825572965, "loss": 0.4885, "step": 1997 }, { "epoch": 0.5996398559423769, "grad_norm": 0.16511856019496918, "learning_rate": 0.00017666800098323164, "loss": 0.4894, "step": 1998 }, { "epoch": 0.5999399759903962, "grad_norm": 0.18376819789409637, "learning_rate": 0.0001766343626777467, "loss": 0.5458, "step": 1999 }, { "epoch": 0.6002400960384153, "grad_norm": 0.1894349306821823, "learning_rate": 0.00017660070334850304, "loss": 0.4612, "step": 2000 }, { "epoch": 0.6005402160864346, "grad_norm": 0.15564632415771484, "learning_rate": 0.00017656702300473481, "loss": 0.4722, "step": 2001 }, { "epoch": 0.6008403361344538, "grad_norm": 0.188431978225708, "learning_rate": 0.00017653332165568178, "loss": 0.5229, "step": 2002 }, { "epoch": 0.601140456182473, "grad_norm": 0.25116777420043945, "learning_rate": 0.0001764995993105895, "loss": 0.5073, "step": 2003 }, { "epoch": 0.6014405762304922, "grad_norm": 0.15402956306934357, "learning_rate": 0.00017646585597870935, "loss": 0.4539, "step": 2004 }, { "epoch": 0.6017406962785115, "grad_norm": 0.2883153259754181, "learning_rate": 0.00017643209166929845, "loss": 0.492, "step": 2005 }, { "epoch": 0.6020408163265306, "grad_norm": 0.14544089138507843, "learning_rate": 0.00017639830639161957, "loss": 0.4453, "step": 2006 }, { "epoch": 0.6023409363745498, "grad_norm": 0.16686096787452698, "learning_rate": 0.00017636450015494137, "loss": 0.4639, "step": 2007 }, { "epoch": 0.602641056422569, "grad_norm": 0.18141674995422363, "learning_rate": 0.0001763306729685382, "loss": 0.4564, "step": 2008 }, { "epoch": 0.6029411764705882, "grad_norm": 0.16426688432693481, "learning_rate": 0.00017629682484169014, "loss": 0.5075, "step": 2009 }, { "epoch": 0.6032412965186075, "grad_norm": 0.18985973298549652, "learning_rate": 0.00017626295578368305, "loss": 0.4788, "step": 2010 }, { "epoch": 0.6035414165666266, "grad_norm": 0.15430620312690735, "learning_rate": 0.00017622906580380852, "loss": 0.46, "step": 2011 }, { "epoch": 0.6038415366146459, "grad_norm": 0.15269403159618378, "learning_rate": 0.0001761951549113639, "loss": 0.5045, "step": 2012 }, { "epoch": 0.6041416566626651, "grad_norm": 0.15193504095077515, "learning_rate": 0.00017616122311565218, "loss": 0.4332, "step": 2013 }, { "epoch": 0.6044417767106842, "grad_norm": 0.1438152939081192, "learning_rate": 0.00017612727042598224, "loss": 0.451, "step": 2014 }, { "epoch": 0.6047418967587035, "grad_norm": 0.16883939504623413, "learning_rate": 0.0001760932968516686, "loss": 0.518, "step": 2015 }, { "epoch": 0.6050420168067226, "grad_norm": 0.1587616354227066, "learning_rate": 0.00017605930240203145, "loss": 0.464, "step": 2016 }, { "epoch": 0.6053421368547419, "grad_norm": 0.18665722012519836, "learning_rate": 0.00017602528708639685, "loss": 0.561, "step": 2017 }, { "epoch": 0.6056422569027611, "grad_norm": 0.14915643632411957, "learning_rate": 0.00017599125091409648, "loss": 0.4374, "step": 2018 }, { "epoch": 0.6059423769507803, "grad_norm": 0.15475080907344818, "learning_rate": 0.0001759571938944678, "loss": 0.5175, "step": 2019 }, { "epoch": 0.6062424969987995, "grad_norm": 0.1503390371799469, "learning_rate": 0.00017592311603685393, "loss": 0.4692, "step": 2020 }, { "epoch": 0.6065426170468188, "grad_norm": 0.15767855942249298, "learning_rate": 0.00017588901735060376, "loss": 0.51, "step": 2021 }, { "epoch": 0.6068427370948379, "grad_norm": 0.15096405148506165, "learning_rate": 0.00017585489784507186, "loss": 0.4601, "step": 2022 }, { "epoch": 0.6071428571428571, "grad_norm": 0.15504375100135803, "learning_rate": 0.00017582075752961855, "loss": 0.5359, "step": 2023 }, { "epoch": 0.6074429771908764, "grad_norm": 0.1470356434583664, "learning_rate": 0.00017578659641360978, "loss": 0.4396, "step": 2024 }, { "epoch": 0.6077430972388955, "grad_norm": 0.18372634053230286, "learning_rate": 0.00017575241450641736, "loss": 0.5157, "step": 2025 }, { "epoch": 0.6080432172869148, "grad_norm": 0.16779500246047974, "learning_rate": 0.00017571821181741859, "loss": 0.5137, "step": 2026 }, { "epoch": 0.608343337334934, "grad_norm": 0.31798943877220154, "learning_rate": 0.00017568398835599667, "loss": 0.4779, "step": 2027 }, { "epoch": 0.6086434573829532, "grad_norm": 0.1485154628753662, "learning_rate": 0.00017564974413154035, "loss": 0.4492, "step": 2028 }, { "epoch": 0.6089435774309724, "grad_norm": 0.15988373756408691, "learning_rate": 0.0001756154791534442, "loss": 0.5101, "step": 2029 }, { "epoch": 0.6092436974789915, "grad_norm": 0.1446724385023117, "learning_rate": 0.00017558119343110838, "loss": 0.4409, "step": 2030 }, { "epoch": 0.6095438175270108, "grad_norm": 0.16256844997406006, "learning_rate": 0.0001755468869739388, "loss": 0.4911, "step": 2031 }, { "epoch": 0.60984393757503, "grad_norm": 0.16651900112628937, "learning_rate": 0.00017551255979134705, "loss": 0.488, "step": 2032 }, { "epoch": 0.6101440576230492, "grad_norm": 0.14109483361244202, "learning_rate": 0.00017547821189275038, "loss": 0.4293, "step": 2033 }, { "epoch": 0.6104441776710684, "grad_norm": 0.15952004492282867, "learning_rate": 0.00017544384328757175, "loss": 0.4497, "step": 2034 }, { "epoch": 0.6107442977190877, "grad_norm": 0.14462940394878387, "learning_rate": 0.00017540945398523982, "loss": 0.4239, "step": 2035 }, { "epoch": 0.6110444177671068, "grad_norm": 0.14844803512096405, "learning_rate": 0.00017537504399518886, "loss": 0.4745, "step": 2036 }, { "epoch": 0.6113445378151261, "grad_norm": 0.14788131415843964, "learning_rate": 0.00017534061332685888, "loss": 0.457, "step": 2037 }, { "epoch": 0.6116446578631453, "grad_norm": 0.14577676355838776, "learning_rate": 0.00017530616198969555, "loss": 0.4824, "step": 2038 }, { "epoch": 0.6119447779111644, "grad_norm": 0.14970937371253967, "learning_rate": 0.00017527168999315016, "loss": 0.4017, "step": 2039 }, { "epoch": 0.6122448979591837, "grad_norm": 0.15392345190048218, "learning_rate": 0.00017523719734667973, "loss": 0.4667, "step": 2040 }, { "epoch": 0.6125450180072028, "grad_norm": 0.1536334753036499, "learning_rate": 0.00017520268405974692, "loss": 0.4893, "step": 2041 }, { "epoch": 0.6128451380552221, "grad_norm": 0.14010770618915558, "learning_rate": 0.00017516815014182008, "loss": 0.4642, "step": 2042 }, { "epoch": 0.6131452581032413, "grad_norm": 0.17600683867931366, "learning_rate": 0.00017513359560237314, "loss": 0.475, "step": 2043 }, { "epoch": 0.6134453781512605, "grad_norm": 0.1615845113992691, "learning_rate": 0.00017509902045088582, "loss": 0.5072, "step": 2044 }, { "epoch": 0.6137454981992797, "grad_norm": 0.16537730395793915, "learning_rate": 0.00017506442469684332, "loss": 0.5221, "step": 2045 }, { "epoch": 0.614045618247299, "grad_norm": 0.14268003404140472, "learning_rate": 0.00017502980834973667, "loss": 0.4465, "step": 2046 }, { "epoch": 0.6143457382953181, "grad_norm": 0.1623399555683136, "learning_rate": 0.0001749951714190624, "loss": 0.465, "step": 2047 }, { "epoch": 0.6146458583433373, "grad_norm": 0.20555807650089264, "learning_rate": 0.00017496051391432282, "loss": 0.445, "step": 2048 }, { "epoch": 0.6149459783913566, "grad_norm": 0.15159904956817627, "learning_rate": 0.00017492583584502577, "loss": 0.4622, "step": 2049 }, { "epoch": 0.6152460984393757, "grad_norm": 0.1567160189151764, "learning_rate": 0.0001748911372206848, "loss": 0.4422, "step": 2050 }, { "epoch": 0.615546218487395, "grad_norm": 0.16784349083900452, "learning_rate": 0.0001748564180508191, "loss": 0.4832, "step": 2051 }, { "epoch": 0.6158463385354142, "grad_norm": 0.1470254361629486, "learning_rate": 0.00017482167834495345, "loss": 0.4916, "step": 2052 }, { "epoch": 0.6161464585834334, "grad_norm": 0.1490541696548462, "learning_rate": 0.00017478691811261828, "loss": 0.4816, "step": 2053 }, { "epoch": 0.6164465786314526, "grad_norm": 0.1885465681552887, "learning_rate": 0.0001747521373633497, "loss": 0.5393, "step": 2054 }, { "epoch": 0.6167466986794717, "grad_norm": 0.16754573583602905, "learning_rate": 0.00017471733610668934, "loss": 0.5322, "step": 2055 }, { "epoch": 0.617046818727491, "grad_norm": 0.21554969251155853, "learning_rate": 0.0001746825143521846, "loss": 0.4694, "step": 2056 }, { "epoch": 0.6173469387755102, "grad_norm": 0.1596878319978714, "learning_rate": 0.0001746476721093884, "loss": 0.4782, "step": 2057 }, { "epoch": 0.6176470588235294, "grad_norm": 0.24440227448940277, "learning_rate": 0.00017461280938785932, "loss": 0.4607, "step": 2058 }, { "epoch": 0.6179471788715486, "grad_norm": 0.1455274522304535, "learning_rate": 0.00017457792619716153, "loss": 0.4751, "step": 2059 }, { "epoch": 0.6182472989195679, "grad_norm": 0.1382697969675064, "learning_rate": 0.00017454302254686486, "loss": 0.4552, "step": 2060 }, { "epoch": 0.618547418967587, "grad_norm": 0.1711137741804123, "learning_rate": 0.00017450809844654468, "loss": 0.4363, "step": 2061 }, { "epoch": 0.6188475390156063, "grad_norm": 0.1538340300321579, "learning_rate": 0.00017447315390578207, "loss": 0.4643, "step": 2062 }, { "epoch": 0.6191476590636255, "grad_norm": 0.14743641018867493, "learning_rate": 0.00017443818893416363, "loss": 0.4323, "step": 2063 }, { "epoch": 0.6194477791116446, "grad_norm": 0.4574187397956848, "learning_rate": 0.00017440320354128162, "loss": 0.5133, "step": 2064 }, { "epoch": 0.6197478991596639, "grad_norm": 0.14564795792102814, "learning_rate": 0.00017436819773673383, "loss": 0.4476, "step": 2065 }, { "epoch": 0.620048019207683, "grad_norm": 0.5146244168281555, "learning_rate": 0.00017433317153012375, "loss": 0.4766, "step": 2066 }, { "epoch": 0.6203481392557023, "grad_norm": 0.20717193186283112, "learning_rate": 0.00017429812493106043, "loss": 0.5036, "step": 2067 }, { "epoch": 0.6206482593037215, "grad_norm": 0.1570674180984497, "learning_rate": 0.00017426305794915846, "loss": 0.4884, "step": 2068 }, { "epoch": 0.6209483793517407, "grad_norm": 0.28760769963264465, "learning_rate": 0.00017422797059403814, "loss": 0.5122, "step": 2069 }, { "epoch": 0.6212484993997599, "grad_norm": 0.15187154710292816, "learning_rate": 0.00017419286287532516, "loss": 0.4608, "step": 2070 }, { "epoch": 0.6215486194477791, "grad_norm": 0.15756598114967346, "learning_rate": 0.00017415773480265102, "loss": 0.4853, "step": 2071 }, { "epoch": 0.6218487394957983, "grad_norm": 0.1937892585992813, "learning_rate": 0.00017412258638565268, "loss": 0.5064, "step": 2072 }, { "epoch": 0.6221488595438175, "grad_norm": 0.17976678907871246, "learning_rate": 0.00017408741763397267, "loss": 0.4349, "step": 2073 }, { "epoch": 0.6224489795918368, "grad_norm": 0.18714259564876556, "learning_rate": 0.00017405222855725917, "loss": 0.5572, "step": 2074 }, { "epoch": 0.6227490996398559, "grad_norm": 0.21988919377326965, "learning_rate": 0.0001740170191651659, "loss": 0.4587, "step": 2075 }, { "epoch": 0.6230492196878752, "grad_norm": 0.17325930297374725, "learning_rate": 0.00017398178946735214, "loss": 0.4815, "step": 2076 }, { "epoch": 0.6233493397358943, "grad_norm": 0.15329764783382416, "learning_rate": 0.00017394653947348278, "loss": 0.4946, "step": 2077 }, { "epoch": 0.6236494597839136, "grad_norm": 0.13897527754306793, "learning_rate": 0.0001739112691932282, "loss": 0.4417, "step": 2078 }, { "epoch": 0.6239495798319328, "grad_norm": 0.17680160701274872, "learning_rate": 0.00017387597863626446, "loss": 0.485, "step": 2079 }, { "epoch": 0.6242496998799519, "grad_norm": 0.2085428088903427, "learning_rate": 0.00017384066781227307, "loss": 0.4756, "step": 2080 }, { "epoch": 0.6245498199279712, "grad_norm": 0.15245382487773895, "learning_rate": 0.00017380533673094117, "loss": 0.4285, "step": 2081 }, { "epoch": 0.6248499399759904, "grad_norm": 0.14067943394184113, "learning_rate": 0.0001737699854019614, "loss": 0.453, "step": 2082 }, { "epoch": 0.6251500600240096, "grad_norm": 0.14680393040180206, "learning_rate": 0.00017373461383503208, "loss": 0.4824, "step": 2083 }, { "epoch": 0.6254501800720288, "grad_norm": 0.14030559360980988, "learning_rate": 0.00017369922203985688, "loss": 0.4266, "step": 2084 }, { "epoch": 0.6257503001200481, "grad_norm": 0.14773330092430115, "learning_rate": 0.00017366381002614523, "loss": 0.4897, "step": 2085 }, { "epoch": 0.6260504201680672, "grad_norm": 0.2262420505285263, "learning_rate": 0.00017362837780361196, "loss": 0.5152, "step": 2086 }, { "epoch": 0.6263505402160864, "grad_norm": 0.1481279730796814, "learning_rate": 0.0001735929253819775, "loss": 0.4594, "step": 2087 }, { "epoch": 0.6266506602641057, "grad_norm": 0.14530371129512787, "learning_rate": 0.00017355745277096785, "loss": 0.4572, "step": 2088 }, { "epoch": 0.6269507803121248, "grad_norm": 0.1739528626203537, "learning_rate": 0.00017352195998031447, "loss": 0.5099, "step": 2089 }, { "epoch": 0.6272509003601441, "grad_norm": 0.1571219265460968, "learning_rate": 0.0001734864470197544, "loss": 0.5085, "step": 2090 }, { "epoch": 0.6275510204081632, "grad_norm": 0.35751938819885254, "learning_rate": 0.00017345091389903025, "loss": 0.4412, "step": 2091 }, { "epoch": 0.6278511404561825, "grad_norm": 0.14211386442184448, "learning_rate": 0.00017341536062789008, "loss": 0.4501, "step": 2092 }, { "epoch": 0.6281512605042017, "grad_norm": 0.14581553637981415, "learning_rate": 0.00017337978721608757, "loss": 0.4616, "step": 2093 }, { "epoch": 0.6284513805522209, "grad_norm": 0.14807532727718353, "learning_rate": 0.00017334419367338183, "loss": 0.4759, "step": 2094 }, { "epoch": 0.6287515006002401, "grad_norm": 0.16016747057437897, "learning_rate": 0.00017330858000953758, "loss": 0.4912, "step": 2095 }, { "epoch": 0.6290516206482593, "grad_norm": 0.1585829108953476, "learning_rate": 0.000173272946234325, "loss": 0.4647, "step": 2096 }, { "epoch": 0.6293517406962785, "grad_norm": 0.14752192795276642, "learning_rate": 0.0001732372923575198, "loss": 0.4384, "step": 2097 }, { "epoch": 0.6296518607442977, "grad_norm": 0.2791168987751007, "learning_rate": 0.00017320161838890323, "loss": 0.4225, "step": 2098 }, { "epoch": 0.629951980792317, "grad_norm": 0.16510701179504395, "learning_rate": 0.00017316592433826202, "loss": 0.4936, "step": 2099 }, { "epoch": 0.6302521008403361, "grad_norm": 0.15952514111995697, "learning_rate": 0.00017313021021538844, "loss": 0.4983, "step": 2100 }, { "epoch": 0.6305522208883554, "grad_norm": 0.1469140350818634, "learning_rate": 0.00017309447603008026, "loss": 0.4273, "step": 2101 }, { "epoch": 0.6308523409363745, "grad_norm": 0.16945834457874298, "learning_rate": 0.0001730587217921407, "loss": 0.5163, "step": 2102 }, { "epoch": 0.6311524609843937, "grad_norm": 0.15591193735599518, "learning_rate": 0.00017302294751137855, "loss": 0.4628, "step": 2103 }, { "epoch": 0.631452581032413, "grad_norm": 0.15113277733325958, "learning_rate": 0.00017298715319760807, "loss": 0.456, "step": 2104 }, { "epoch": 0.6317527010804321, "grad_norm": 0.16945365071296692, "learning_rate": 0.00017295133886064906, "loss": 0.491, "step": 2105 }, { "epoch": 0.6320528211284514, "grad_norm": 0.14711284637451172, "learning_rate": 0.0001729155045103267, "loss": 0.4822, "step": 2106 }, { "epoch": 0.6323529411764706, "grad_norm": 0.14515145123004913, "learning_rate": 0.0001728796501564718, "loss": 0.4862, "step": 2107 }, { "epoch": 0.6326530612244898, "grad_norm": 0.15135280787944794, "learning_rate": 0.00017284377580892052, "loss": 0.4615, "step": 2108 }, { "epoch": 0.632953181272509, "grad_norm": 0.1585320681333542, "learning_rate": 0.00017280788147751462, "loss": 0.4345, "step": 2109 }, { "epoch": 0.6332533013205283, "grad_norm": 0.14420172572135925, "learning_rate": 0.0001727719671721013, "loss": 0.4516, "step": 2110 }, { "epoch": 0.6335534213685474, "grad_norm": 0.13808774948120117, "learning_rate": 0.00017273603290253326, "loss": 0.431, "step": 2111 }, { "epoch": 0.6338535414165666, "grad_norm": 0.14348426461219788, "learning_rate": 0.0001727000786786686, "loss": 0.4624, "step": 2112 }, { "epoch": 0.6341536614645858, "grad_norm": 0.19821301102638245, "learning_rate": 0.000172664104510371, "loss": 0.4774, "step": 2113 }, { "epoch": 0.634453781512605, "grad_norm": 0.15625211596488953, "learning_rate": 0.00017262811040750953, "loss": 0.5078, "step": 2114 }, { "epoch": 0.6347539015606243, "grad_norm": 0.13860677182674408, "learning_rate": 0.00017259209637995876, "loss": 0.4708, "step": 2115 }, { "epoch": 0.6350540216086434, "grad_norm": 0.1412367820739746, "learning_rate": 0.00017255606243759872, "loss": 0.4723, "step": 2116 }, { "epoch": 0.6353541416566627, "grad_norm": 0.14818702638149261, "learning_rate": 0.00017252000859031494, "loss": 0.5005, "step": 2117 }, { "epoch": 0.6356542617046819, "grad_norm": 0.15439516305923462, "learning_rate": 0.00017248393484799835, "loss": 0.3705, "step": 2118 }, { "epoch": 0.635954381752701, "grad_norm": 0.1573125123977661, "learning_rate": 0.00017244784122054537, "loss": 0.4828, "step": 2119 }, { "epoch": 0.6362545018007203, "grad_norm": 0.15407924354076385, "learning_rate": 0.0001724117277178579, "loss": 0.5148, "step": 2120 }, { "epoch": 0.6365546218487395, "grad_norm": 0.15772031247615814, "learning_rate": 0.00017237559434984324, "loss": 0.5087, "step": 2121 }, { "epoch": 0.6368547418967587, "grad_norm": 0.14631694555282593, "learning_rate": 0.00017233944112641418, "loss": 0.4673, "step": 2122 }, { "epoch": 0.6371548619447779, "grad_norm": 0.1371794044971466, "learning_rate": 0.00017230326805748892, "loss": 0.4077, "step": 2123 }, { "epoch": 0.6374549819927972, "grad_norm": 0.1333874613046646, "learning_rate": 0.00017226707515299115, "loss": 0.4053, "step": 2124 }, { "epoch": 0.6377551020408163, "grad_norm": 0.14271295070648193, "learning_rate": 0.00017223086242284995, "loss": 0.4764, "step": 2125 }, { "epoch": 0.6380552220888356, "grad_norm": 0.14769414067268372, "learning_rate": 0.0001721946298769999, "loss": 0.4937, "step": 2126 }, { "epoch": 0.6383553421368547, "grad_norm": 0.16356337070465088, "learning_rate": 0.00017215837752538097, "loss": 0.445, "step": 2127 }, { "epoch": 0.6386554621848739, "grad_norm": 0.17607265710830688, "learning_rate": 0.00017212210537793856, "loss": 0.446, "step": 2128 }, { "epoch": 0.6389555822328932, "grad_norm": 0.15226007997989655, "learning_rate": 0.00017208581344462353, "loss": 0.4741, "step": 2129 }, { "epoch": 0.6392557022809123, "grad_norm": 0.15901261568069458, "learning_rate": 0.0001720495017353922, "loss": 0.4863, "step": 2130 }, { "epoch": 0.6395558223289316, "grad_norm": 0.15675465762615204, "learning_rate": 0.00017201317026020618, "loss": 0.5179, "step": 2131 }, { "epoch": 0.6398559423769508, "grad_norm": 0.13890616595745087, "learning_rate": 0.0001719768190290327, "loss": 0.3986, "step": 2132 }, { "epoch": 0.64015606242497, "grad_norm": 0.13822025060653687, "learning_rate": 0.0001719404480518442, "loss": 0.4212, "step": 2133 }, { "epoch": 0.6404561824729892, "grad_norm": 0.1566293090581894, "learning_rate": 0.00017190405733861873, "loss": 0.5104, "step": 2134 }, { "epoch": 0.6407563025210085, "grad_norm": 0.13361778855323792, "learning_rate": 0.00017186764689933957, "loss": 0.443, "step": 2135 }, { "epoch": 0.6410564225690276, "grad_norm": 0.16091105341911316, "learning_rate": 0.00017183121674399558, "loss": 0.5039, "step": 2136 }, { "epoch": 0.6413565426170468, "grad_norm": 0.14156047999858856, "learning_rate": 0.00017179476688258097, "loss": 0.4789, "step": 2137 }, { "epoch": 0.641656662665066, "grad_norm": 0.20020362734794617, "learning_rate": 0.00017175829732509525, "loss": 0.5266, "step": 2138 }, { "epoch": 0.6419567827130852, "grad_norm": 0.173885315656662, "learning_rate": 0.00017172180808154352, "loss": 0.5213, "step": 2139 }, { "epoch": 0.6422569027611045, "grad_norm": 0.15453475713729858, "learning_rate": 0.00017168529916193614, "loss": 0.4017, "step": 2140 }, { "epoch": 0.6425570228091236, "grad_norm": 0.14919741451740265, "learning_rate": 0.00017164877057628892, "loss": 0.4756, "step": 2141 }, { "epoch": 0.6428571428571429, "grad_norm": 0.16573822498321533, "learning_rate": 0.00017161222233462307, "loss": 0.5327, "step": 2142 }, { "epoch": 0.6431572629051621, "grad_norm": 0.18085473775863647, "learning_rate": 0.00017157565444696516, "loss": 0.5019, "step": 2143 }, { "epoch": 0.6434573829531812, "grad_norm": 0.37763917446136475, "learning_rate": 0.00017153906692334717, "loss": 0.5499, "step": 2144 }, { "epoch": 0.6437575030012005, "grad_norm": 0.2172531634569168, "learning_rate": 0.0001715024597738065, "loss": 0.4735, "step": 2145 }, { "epoch": 0.6440576230492197, "grad_norm": 0.17138592898845673, "learning_rate": 0.00017146583300838586, "loss": 0.5055, "step": 2146 }, { "epoch": 0.6443577430972389, "grad_norm": 0.16673091053962708, "learning_rate": 0.00017142918663713342, "loss": 0.5186, "step": 2147 }, { "epoch": 0.6446578631452581, "grad_norm": 0.19096100330352783, "learning_rate": 0.00017139252067010268, "loss": 0.4336, "step": 2148 }, { "epoch": 0.6449579831932774, "grad_norm": 0.1490613967180252, "learning_rate": 0.00017135583511735248, "loss": 0.4933, "step": 2149 }, { "epoch": 0.6452581032412965, "grad_norm": 0.14809657633304596, "learning_rate": 0.00017131912998894717, "loss": 0.486, "step": 2150 }, { "epoch": 0.6455582232893158, "grad_norm": 0.1331087201833725, "learning_rate": 0.00017128240529495632, "loss": 0.4362, "step": 2151 }, { "epoch": 0.6458583433373349, "grad_norm": 0.15874071419239044, "learning_rate": 0.0001712456610454549, "loss": 0.4792, "step": 2152 }, { "epoch": 0.6461584633853541, "grad_norm": 0.24738092720508575, "learning_rate": 0.00017120889725052337, "loss": 0.4663, "step": 2153 }, { "epoch": 0.6464585834333734, "grad_norm": 0.15679825842380524, "learning_rate": 0.0001711721139202474, "loss": 0.4808, "step": 2154 }, { "epoch": 0.6467587034813925, "grad_norm": 0.29751163721084595, "learning_rate": 0.00017113531106471809, "loss": 0.4908, "step": 2155 }, { "epoch": 0.6470588235294118, "grad_norm": 0.17562802135944366, "learning_rate": 0.0001710984886940318, "loss": 0.5049, "step": 2156 }, { "epoch": 0.647358943577431, "grad_norm": 0.14633187651634216, "learning_rate": 0.00017106164681829046, "loss": 0.4547, "step": 2157 }, { "epoch": 0.6476590636254502, "grad_norm": 0.1577942818403244, "learning_rate": 0.00017102478544760112, "loss": 0.4259, "step": 2158 }, { "epoch": 0.6479591836734694, "grad_norm": 0.1356159895658493, "learning_rate": 0.00017098790459207632, "loss": 0.4123, "step": 2159 }, { "epoch": 0.6482593037214885, "grad_norm": 0.15663506090641022, "learning_rate": 0.0001709510042618339, "loss": 0.4634, "step": 2160 }, { "epoch": 0.6485594237695078, "grad_norm": 0.1408921331167221, "learning_rate": 0.00017091408446699697, "loss": 0.4169, "step": 2161 }, { "epoch": 0.648859543817527, "grad_norm": 0.14489974081516266, "learning_rate": 0.00017087714521769415, "loss": 0.4642, "step": 2162 }, { "epoch": 0.6491596638655462, "grad_norm": 0.13579650223255157, "learning_rate": 0.00017084018652405923, "loss": 0.404, "step": 2163 }, { "epoch": 0.6494597839135654, "grad_norm": 0.13602407276630402, "learning_rate": 0.00017080320839623148, "loss": 0.4439, "step": 2164 }, { "epoch": 0.6497599039615847, "grad_norm": 0.14799612760543823, "learning_rate": 0.00017076621084435533, "loss": 0.463, "step": 2165 }, { "epoch": 0.6500600240096038, "grad_norm": 0.16625499725341797, "learning_rate": 0.00017072919387858072, "loss": 0.4916, "step": 2166 }, { "epoch": 0.6503601440576231, "grad_norm": 0.15468667447566986, "learning_rate": 0.0001706921575090628, "loss": 0.4974, "step": 2167 }, { "epoch": 0.6506602641056423, "grad_norm": 0.1400432586669922, "learning_rate": 0.00017065510174596202, "loss": 0.4565, "step": 2168 }, { "epoch": 0.6509603841536614, "grad_norm": 0.13010244071483612, "learning_rate": 0.00017061802659944428, "loss": 0.3803, "step": 2169 }, { "epoch": 0.6512605042016807, "grad_norm": 0.1378718614578247, "learning_rate": 0.00017058093207968067, "loss": 0.4357, "step": 2170 }, { "epoch": 0.6515606242496998, "grad_norm": 0.15193061530590057, "learning_rate": 0.00017054381819684767, "loss": 0.4692, "step": 2171 }, { "epoch": 0.6518607442977191, "grad_norm": 0.1514962762594223, "learning_rate": 0.00017050668496112705, "loss": 0.5058, "step": 2172 }, { "epoch": 0.6521608643457383, "grad_norm": 0.15576845407485962, "learning_rate": 0.00017046953238270587, "loss": 0.5009, "step": 2173 }, { "epoch": 0.6524609843937575, "grad_norm": 0.143166646361351, "learning_rate": 0.00017043236047177654, "loss": 0.4734, "step": 2174 }, { "epoch": 0.6527611044417767, "grad_norm": 0.1380712240934372, "learning_rate": 0.00017039516923853673, "loss": 0.4174, "step": 2175 }, { "epoch": 0.6530612244897959, "grad_norm": 0.17489846050739288, "learning_rate": 0.00017035795869318942, "loss": 0.51, "step": 2176 }, { "epoch": 0.6533613445378151, "grad_norm": 0.16089336574077606, "learning_rate": 0.00017032072884594292, "loss": 0.5001, "step": 2177 }, { "epoch": 0.6536614645858343, "grad_norm": 0.13599681854248047, "learning_rate": 0.00017028347970701085, "loss": 0.4047, "step": 2178 }, { "epoch": 0.6539615846338536, "grad_norm": 0.14149728417396545, "learning_rate": 0.00017024621128661197, "loss": 0.4526, "step": 2179 }, { "epoch": 0.6542617046818727, "grad_norm": 0.14691421389579773, "learning_rate": 0.0001702089235949705, "loss": 0.4341, "step": 2180 }, { "epoch": 0.654561824729892, "grad_norm": 0.14323483407497406, "learning_rate": 0.00017017161664231593, "loss": 0.4982, "step": 2181 }, { "epoch": 0.6548619447779112, "grad_norm": 0.13906525075435638, "learning_rate": 0.00017013429043888297, "loss": 0.4486, "step": 2182 }, { "epoch": 0.6551620648259304, "grad_norm": 0.6826688051223755, "learning_rate": 0.0001700969449949116, "loss": 0.4892, "step": 2183 }, { "epoch": 0.6554621848739496, "grad_norm": 1.933110237121582, "learning_rate": 0.00017005958032064716, "loss": 0.5188, "step": 2184 }, { "epoch": 0.6557623049219687, "grad_norm": 0.13388942182064056, "learning_rate": 0.00017002219642634015, "loss": 0.42, "step": 2185 }, { "epoch": 0.656062424969988, "grad_norm": 0.39405903220176697, "learning_rate": 0.00016998479332224655, "loss": 0.4892, "step": 2186 }, { "epoch": 0.6563625450180072, "grad_norm": 0.20359550416469574, "learning_rate": 0.00016994737101862735, "loss": 0.4792, "step": 2187 }, { "epoch": 0.6566626650660264, "grad_norm": 0.1704937219619751, "learning_rate": 0.00016990992952574895, "loss": 0.5432, "step": 2188 }, { "epoch": 0.6569627851140456, "grad_norm": 0.15303505957126617, "learning_rate": 0.00016987246885388305, "loss": 0.4766, "step": 2189 }, { "epoch": 0.6572629051620649, "grad_norm": 0.15986792743206024, "learning_rate": 0.0001698349890133065, "loss": 0.4634, "step": 2190 }, { "epoch": 0.657563025210084, "grad_norm": 0.1516830176115036, "learning_rate": 0.0001697974900143015, "loss": 0.4386, "step": 2191 }, { "epoch": 0.6578631452581032, "grad_norm": 0.17122162878513336, "learning_rate": 0.00016975997186715542, "loss": 0.4553, "step": 2192 }, { "epoch": 0.6581632653061225, "grad_norm": 0.16046889126300812, "learning_rate": 0.000169722434582161, "loss": 0.4854, "step": 2193 }, { "epoch": 0.6584633853541416, "grad_norm": 0.18766555190086365, "learning_rate": 0.00016968487816961612, "loss": 0.4352, "step": 2194 }, { "epoch": 0.6587635054021609, "grad_norm": 0.24515730142593384, "learning_rate": 0.00016964730263982394, "loss": 0.532, "step": 2195 }, { "epoch": 0.65906362545018, "grad_norm": 0.18200074136257172, "learning_rate": 0.00016960970800309293, "loss": 0.517, "step": 2196 }, { "epoch": 0.6593637454981993, "grad_norm": 0.16878826916217804, "learning_rate": 0.0001695720942697367, "loss": 0.4512, "step": 2197 }, { "epoch": 0.6596638655462185, "grad_norm": 0.16377224028110504, "learning_rate": 0.00016953446145007416, "loss": 0.4876, "step": 2198 }, { "epoch": 0.6599639855942377, "grad_norm": 0.15861105918884277, "learning_rate": 0.00016949680955442945, "loss": 0.491, "step": 2199 }, { "epoch": 0.6602641056422569, "grad_norm": 0.14829066395759583, "learning_rate": 0.0001694591385931319, "loss": 0.4204, "step": 2200 }, { "epoch": 0.6605642256902761, "grad_norm": 0.1616215854883194, "learning_rate": 0.0001694214485765162, "loss": 0.4864, "step": 2201 }, { "epoch": 0.6608643457382953, "grad_norm": 0.1581946164369583, "learning_rate": 0.00016938373951492206, "loss": 0.5292, "step": 2202 }, { "epoch": 0.6611644657863145, "grad_norm": 0.15371695160865784, "learning_rate": 0.00016934601141869456, "loss": 0.4524, "step": 2203 }, { "epoch": 0.6614645858343338, "grad_norm": 0.15436139702796936, "learning_rate": 0.00016930826429818405, "loss": 0.4704, "step": 2204 }, { "epoch": 0.6617647058823529, "grad_norm": 0.1531285047531128, "learning_rate": 0.00016927049816374594, "loss": 0.4567, "step": 2205 }, { "epoch": 0.6620648259303722, "grad_norm": 0.15854240953922272, "learning_rate": 0.00016923271302574094, "loss": 0.4497, "step": 2206 }, { "epoch": 0.6623649459783914, "grad_norm": 0.15710663795471191, "learning_rate": 0.000169194908894535, "loss": 0.5279, "step": 2207 }, { "epoch": 0.6626650660264105, "grad_norm": 0.15883848071098328, "learning_rate": 0.0001691570857804992, "loss": 0.4974, "step": 2208 }, { "epoch": 0.6629651860744298, "grad_norm": 0.21286913752555847, "learning_rate": 0.00016911924369400992, "loss": 0.4326, "step": 2209 }, { "epoch": 0.6632653061224489, "grad_norm": 0.15384641289710999, "learning_rate": 0.00016908138264544874, "loss": 0.5119, "step": 2210 }, { "epoch": 0.6635654261704682, "grad_norm": 0.16249476373195648, "learning_rate": 0.00016904350264520233, "loss": 0.5084, "step": 2211 }, { "epoch": 0.6638655462184874, "grad_norm": 0.1680646389722824, "learning_rate": 0.00016900560370366265, "loss": 0.5169, "step": 2212 }, { "epoch": 0.6641656662665066, "grad_norm": 0.1526922881603241, "learning_rate": 0.0001689676858312269, "loss": 0.4992, "step": 2213 }, { "epoch": 0.6644657863145258, "grad_norm": 0.4710622727870941, "learning_rate": 0.0001689297490382973, "loss": 0.464, "step": 2214 }, { "epoch": 0.6647659063625451, "grad_norm": 0.1634788066148758, "learning_rate": 0.00016889179333528148, "loss": 0.4725, "step": 2215 }, { "epoch": 0.6650660264105642, "grad_norm": 0.15752537548542023, "learning_rate": 0.0001688538187325921, "loss": 0.4526, "step": 2216 }, { "epoch": 0.6653661464585834, "grad_norm": 0.15170423686504364, "learning_rate": 0.00016881582524064706, "loss": 0.4586, "step": 2217 }, { "epoch": 0.6656662665066027, "grad_norm": 0.1426185667514801, "learning_rate": 0.00016877781286986948, "loss": 0.4159, "step": 2218 }, { "epoch": 0.6659663865546218, "grad_norm": 0.15381565690040588, "learning_rate": 0.00016873978163068753, "loss": 0.4793, "step": 2219 }, { "epoch": 0.6662665066026411, "grad_norm": 0.1260295808315277, "learning_rate": 0.00016870173153353478, "loss": 0.3845, "step": 2220 }, { "epoch": 0.6665666266506602, "grad_norm": 0.1446010321378708, "learning_rate": 0.00016866366258884967, "loss": 0.448, "step": 2221 }, { "epoch": 0.6668667466986795, "grad_norm": 0.18416649103164673, "learning_rate": 0.00016862557480707612, "loss": 0.4565, "step": 2222 }, { "epoch": 0.6671668667466987, "grad_norm": 0.15086296200752258, "learning_rate": 0.00016858746819866302, "loss": 0.4724, "step": 2223 }, { "epoch": 0.6674669867947179, "grad_norm": 0.15301254391670227, "learning_rate": 0.00016854934277406446, "loss": 0.5115, "step": 2224 }, { "epoch": 0.6677671068427371, "grad_norm": 0.24658562242984772, "learning_rate": 0.00016851119854373976, "loss": 0.5038, "step": 2225 }, { "epoch": 0.6680672268907563, "grad_norm": 0.15066738426685333, "learning_rate": 0.00016847303551815332, "loss": 0.453, "step": 2226 }, { "epoch": 0.6683673469387755, "grad_norm": 0.17413273453712463, "learning_rate": 0.00016843485370777474, "loss": 0.4851, "step": 2227 }, { "epoch": 0.6686674669867947, "grad_norm": 0.14235296845436096, "learning_rate": 0.00016839665312307878, "loss": 0.4635, "step": 2228 }, { "epoch": 0.668967587034814, "grad_norm": 0.15093177556991577, "learning_rate": 0.00016835843377454527, "loss": 0.48, "step": 2229 }, { "epoch": 0.6692677070828331, "grad_norm": 0.1457367092370987, "learning_rate": 0.0001683201956726593, "loss": 0.4722, "step": 2230 }, { "epoch": 0.6695678271308524, "grad_norm": 0.17020396888256073, "learning_rate": 0.00016828193882791108, "loss": 0.4627, "step": 2231 }, { "epoch": 0.6698679471788715, "grad_norm": 0.15534807741641998, "learning_rate": 0.0001682436632507959, "loss": 0.5646, "step": 2232 }, { "epoch": 0.6701680672268907, "grad_norm": 0.14641976356506348, "learning_rate": 0.0001682053689518142, "loss": 0.5024, "step": 2233 }, { "epoch": 0.67046818727491, "grad_norm": 0.15433022379875183, "learning_rate": 0.0001681670559414716, "loss": 0.494, "step": 2234 }, { "epoch": 0.6707683073229291, "grad_norm": 0.1448221504688263, "learning_rate": 0.00016812872423027888, "loss": 0.4797, "step": 2235 }, { "epoch": 0.6710684273709484, "grad_norm": 0.13265001773834229, "learning_rate": 0.00016809037382875186, "loss": 0.4194, "step": 2236 }, { "epoch": 0.6713685474189676, "grad_norm": 0.24117717146873474, "learning_rate": 0.00016805200474741155, "loss": 0.5215, "step": 2237 }, { "epoch": 0.6716686674669868, "grad_norm": 0.17228154838085175, "learning_rate": 0.00016801361699678405, "loss": 0.4976, "step": 2238 }, { "epoch": 0.671968787515006, "grad_norm": 0.13660216331481934, "learning_rate": 0.00016797521058740063, "loss": 0.4128, "step": 2239 }, { "epoch": 0.6722689075630253, "grad_norm": 0.21866540610790253, "learning_rate": 0.0001679367855297976, "loss": 0.4777, "step": 2240 }, { "epoch": 0.6725690276110444, "grad_norm": 0.14396953582763672, "learning_rate": 0.00016789834183451646, "loss": 0.4473, "step": 2241 }, { "epoch": 0.6728691476590636, "grad_norm": 0.2414744794368744, "learning_rate": 0.00016785987951210385, "loss": 0.4548, "step": 2242 }, { "epoch": 0.6731692677070829, "grad_norm": 0.1470125913619995, "learning_rate": 0.00016782139857311136, "loss": 0.4366, "step": 2243 }, { "epoch": 0.673469387755102, "grad_norm": 0.13129663467407227, "learning_rate": 0.0001677828990280959, "loss": 0.4144, "step": 2244 }, { "epoch": 0.6737695078031213, "grad_norm": 0.14611317217350006, "learning_rate": 0.00016774438088761927, "loss": 0.4635, "step": 2245 }, { "epoch": 0.6740696278511404, "grad_norm": 0.13981489837169647, "learning_rate": 0.0001677058441622486, "loss": 0.467, "step": 2246 }, { "epoch": 0.6743697478991597, "grad_norm": 0.13604478538036346, "learning_rate": 0.00016766728886255592, "loss": 0.4421, "step": 2247 }, { "epoch": 0.6746698679471789, "grad_norm": 0.1460788995027542, "learning_rate": 0.00016762871499911844, "loss": 0.476, "step": 2248 }, { "epoch": 0.674969987995198, "grad_norm": 0.13810686767101288, "learning_rate": 0.0001675901225825185, "loss": 0.4452, "step": 2249 }, { "epoch": 0.6752701080432173, "grad_norm": 0.15245585143566132, "learning_rate": 0.0001675515116233434, "loss": 0.4814, "step": 2250 }, { "epoch": 0.6755702280912365, "grad_norm": 0.13995683193206787, "learning_rate": 0.00016751288213218572, "loss": 0.4371, "step": 2251 }, { "epoch": 0.6758703481392557, "grad_norm": 0.14143361151218414, "learning_rate": 0.00016747423411964295, "loss": 0.469, "step": 2252 }, { "epoch": 0.6761704681872749, "grad_norm": 0.13966970145702362, "learning_rate": 0.00016743556759631778, "loss": 0.4406, "step": 2253 }, { "epoch": 0.6764705882352942, "grad_norm": 0.13671116530895233, "learning_rate": 0.00016739688257281788, "loss": 0.4137, "step": 2254 }, { "epoch": 0.6767707082833133, "grad_norm": 0.13782845437526703, "learning_rate": 0.00016735817905975603, "loss": 0.4323, "step": 2255 }, { "epoch": 0.6770708283313326, "grad_norm": 0.15411385893821716, "learning_rate": 0.00016731945706775013, "loss": 0.482, "step": 2256 }, { "epoch": 0.6773709483793517, "grad_norm": 0.15254433453083038, "learning_rate": 0.00016728071660742315, "loss": 0.4975, "step": 2257 }, { "epoch": 0.6776710684273709, "grad_norm": 0.14145848155021667, "learning_rate": 0.000167241957689403, "loss": 0.4492, "step": 2258 }, { "epoch": 0.6779711884753902, "grad_norm": 0.14561478793621063, "learning_rate": 0.0001672031803243228, "loss": 0.4613, "step": 2259 }, { "epoch": 0.6782713085234093, "grad_norm": 0.14410771429538727, "learning_rate": 0.0001671643845228207, "loss": 0.4447, "step": 2260 }, { "epoch": 0.6785714285714286, "grad_norm": 0.14435681700706482, "learning_rate": 0.00016712557029553983, "loss": 0.4498, "step": 2261 }, { "epoch": 0.6788715486194478, "grad_norm": 0.1489761620759964, "learning_rate": 0.00016708673765312849, "loss": 0.4406, "step": 2262 }, { "epoch": 0.679171668667467, "grad_norm": 0.14680323004722595, "learning_rate": 0.00016704788660623987, "loss": 0.4957, "step": 2263 }, { "epoch": 0.6794717887154862, "grad_norm": 0.14683204889297485, "learning_rate": 0.0001670090171655324, "loss": 0.4304, "step": 2264 }, { "epoch": 0.6797719087635054, "grad_norm": 0.14627540111541748, "learning_rate": 0.00016697012934166944, "loss": 0.4955, "step": 2265 }, { "epoch": 0.6800720288115246, "grad_norm": 0.16079705953598022, "learning_rate": 0.0001669312231453194, "loss": 0.4412, "step": 2266 }, { "epoch": 0.6803721488595438, "grad_norm": 0.14432165026664734, "learning_rate": 0.00016689229858715576, "loss": 0.4334, "step": 2267 }, { "epoch": 0.680672268907563, "grad_norm": 0.14839954674243927, "learning_rate": 0.00016685335567785705, "loss": 0.4511, "step": 2268 }, { "epoch": 0.6809723889555822, "grad_norm": 0.14657177031040192, "learning_rate": 0.00016681439442810679, "loss": 0.4664, "step": 2269 }, { "epoch": 0.6812725090036015, "grad_norm": 0.14277929067611694, "learning_rate": 0.00016677541484859352, "loss": 0.4895, "step": 2270 }, { "epoch": 0.6815726290516206, "grad_norm": 0.14854009449481964, "learning_rate": 0.00016673641695001088, "loss": 0.4802, "step": 2271 }, { "epoch": 0.6818727490996399, "grad_norm": 0.21721404790878296, "learning_rate": 0.00016669740074305752, "loss": 0.4721, "step": 2272 }, { "epoch": 0.6821728691476591, "grad_norm": 0.14389510452747345, "learning_rate": 0.00016665836623843701, "loss": 0.4618, "step": 2273 }, { "epoch": 0.6824729891956782, "grad_norm": 0.15691958367824554, "learning_rate": 0.0001666193134468581, "loss": 0.4687, "step": 2274 }, { "epoch": 0.6827731092436975, "grad_norm": 0.14597022533416748, "learning_rate": 0.00016658024237903445, "loss": 0.4747, "step": 2275 }, { "epoch": 0.6830732292917167, "grad_norm": 0.15653330087661743, "learning_rate": 0.00016654115304568472, "loss": 0.4877, "step": 2276 }, { "epoch": 0.6833733493397359, "grad_norm": 0.13929207623004913, "learning_rate": 0.00016650204545753266, "loss": 0.4856, "step": 2277 }, { "epoch": 0.6836734693877551, "grad_norm": 0.3877395987510681, "learning_rate": 0.000166462919625307, "loss": 0.4308, "step": 2278 }, { "epoch": 0.6839735894357744, "grad_norm": 0.1269235759973526, "learning_rate": 0.00016642377555974142, "loss": 0.4114, "step": 2279 }, { "epoch": 0.6842737094837935, "grad_norm": 0.1468934714794159, "learning_rate": 0.0001663846132715747, "loss": 0.5039, "step": 2280 }, { "epoch": 0.6845738295318127, "grad_norm": 0.14282235503196716, "learning_rate": 0.0001663454327715505, "loss": 0.4331, "step": 2281 }, { "epoch": 0.6848739495798319, "grad_norm": 0.15750561654567719, "learning_rate": 0.00016630623407041758, "loss": 0.4619, "step": 2282 }, { "epoch": 0.6851740696278511, "grad_norm": 0.18510620296001434, "learning_rate": 0.0001662670171789297, "loss": 0.4194, "step": 2283 }, { "epoch": 0.6854741896758704, "grad_norm": 0.15529020130634308, "learning_rate": 0.00016622778210784547, "loss": 0.4733, "step": 2284 }, { "epoch": 0.6857743097238895, "grad_norm": 0.16339364647865295, "learning_rate": 0.00016618852886792862, "loss": 0.4461, "step": 2285 }, { "epoch": 0.6860744297719088, "grad_norm": 0.14887449145317078, "learning_rate": 0.00016614925746994783, "loss": 0.4122, "step": 2286 }, { "epoch": 0.686374549819928, "grad_norm": 0.17690427601337433, "learning_rate": 0.00016610996792467677, "loss": 0.4675, "step": 2287 }, { "epoch": 0.6866746698679472, "grad_norm": 0.14621487259864807, "learning_rate": 0.00016607066024289404, "loss": 0.5069, "step": 2288 }, { "epoch": 0.6869747899159664, "grad_norm": 0.14996911585330963, "learning_rate": 0.0001660313344353833, "loss": 0.4516, "step": 2289 }, { "epoch": 0.6872749099639855, "grad_norm": 0.1662423014640808, "learning_rate": 0.00016599199051293314, "loss": 0.459, "step": 2290 }, { "epoch": 0.6875750300120048, "grad_norm": 0.19129732251167297, "learning_rate": 0.00016595262848633703, "loss": 0.4695, "step": 2291 }, { "epoch": 0.687875150060024, "grad_norm": 0.14279800653457642, "learning_rate": 0.00016591324836639357, "loss": 0.4337, "step": 2292 }, { "epoch": 0.6881752701080432, "grad_norm": 0.15134789049625397, "learning_rate": 0.00016587385016390624, "loss": 0.4685, "step": 2293 }, { "epoch": 0.6884753901560624, "grad_norm": 0.20491233468055725, "learning_rate": 0.00016583443388968344, "loss": 0.4921, "step": 2294 }, { "epoch": 0.6887755102040817, "grad_norm": 0.1463315188884735, "learning_rate": 0.00016579499955453865, "loss": 0.469, "step": 2295 }, { "epoch": 0.6890756302521008, "grad_norm": 0.2550693452358246, "learning_rate": 0.00016575554716929012, "loss": 0.4493, "step": 2296 }, { "epoch": 0.68937575030012, "grad_norm": 0.26226985454559326, "learning_rate": 0.0001657160767447612, "loss": 0.4992, "step": 2297 }, { "epoch": 0.6896758703481393, "grad_norm": 0.1389845758676529, "learning_rate": 0.00016567658829178022, "loss": 0.4366, "step": 2298 }, { "epoch": 0.6899759903961584, "grad_norm": 0.1418905258178711, "learning_rate": 0.0001656370818211803, "loss": 0.4513, "step": 2299 }, { "epoch": 0.6902761104441777, "grad_norm": 0.13269588351249695, "learning_rate": 0.0001655975573437996, "loss": 0.4409, "step": 2300 }, { "epoch": 0.6905762304921969, "grad_norm": 0.17201949656009674, "learning_rate": 0.00016555801487048126, "loss": 0.5256, "step": 2301 }, { "epoch": 0.6908763505402161, "grad_norm": 0.1453656107187271, "learning_rate": 0.00016551845441207326, "loss": 0.474, "step": 2302 }, { "epoch": 0.6911764705882353, "grad_norm": 0.1528858244419098, "learning_rate": 0.00016547887597942855, "loss": 0.4822, "step": 2303 }, { "epoch": 0.6914765906362546, "grad_norm": 0.14635047316551208, "learning_rate": 0.00016543927958340504, "loss": 0.4534, "step": 2304 }, { "epoch": 0.6917767106842737, "grad_norm": 0.1443641185760498, "learning_rate": 0.00016539966523486553, "loss": 0.4873, "step": 2305 }, { "epoch": 0.6920768307322929, "grad_norm": 0.24002930521965027, "learning_rate": 0.00016536003294467778, "loss": 0.4502, "step": 2306 }, { "epoch": 0.6923769507803121, "grad_norm": 0.2631763219833374, "learning_rate": 0.00016532038272371445, "loss": 0.5258, "step": 2307 }, { "epoch": 0.6926770708283313, "grad_norm": 0.15520347654819489, "learning_rate": 0.0001652807145828531, "loss": 0.5068, "step": 2308 }, { "epoch": 0.6929771908763506, "grad_norm": 0.13766346871852875, "learning_rate": 0.0001652410285329763, "loss": 0.4309, "step": 2309 }, { "epoch": 0.6932773109243697, "grad_norm": 0.14334671199321747, "learning_rate": 0.0001652013245849714, "loss": 0.4534, "step": 2310 }, { "epoch": 0.693577430972389, "grad_norm": 0.1372639685869217, "learning_rate": 0.00016516160274973073, "loss": 0.4453, "step": 2311 }, { "epoch": 0.6938775510204082, "grad_norm": 0.15254907310009003, "learning_rate": 0.0001651218630381515, "loss": 0.4572, "step": 2312 }, { "epoch": 0.6941776710684273, "grad_norm": 3.143188238143921, "learning_rate": 0.0001650821054611359, "loss": 0.504, "step": 2313 }, { "epoch": 0.6944777911164466, "grad_norm": 0.15842002630233765, "learning_rate": 0.00016504233002959093, "loss": 0.5412, "step": 2314 }, { "epoch": 0.6947779111644657, "grad_norm": 0.9213683605194092, "learning_rate": 0.0001650025367544285, "loss": 0.413, "step": 2315 }, { "epoch": 0.695078031212485, "grad_norm": 0.13423404097557068, "learning_rate": 0.0001649627256465655, "loss": 0.4424, "step": 2316 }, { "epoch": 0.6953781512605042, "grad_norm": 0.14566202461719513, "learning_rate": 0.00016492289671692366, "loss": 0.4464, "step": 2317 }, { "epoch": 0.6956782713085234, "grad_norm": 0.1668601930141449, "learning_rate": 0.0001648830499764295, "loss": 0.4897, "step": 2318 }, { "epoch": 0.6959783913565426, "grad_norm": 0.16586680710315704, "learning_rate": 0.00016484318543601455, "loss": 0.481, "step": 2319 }, { "epoch": 0.6962785114045619, "grad_norm": 0.16026067733764648, "learning_rate": 0.00016480330310661523, "loss": 0.4539, "step": 2320 }, { "epoch": 0.696578631452581, "grad_norm": 0.1537049561738968, "learning_rate": 0.00016476340299917278, "loss": 0.4323, "step": 2321 }, { "epoch": 0.6968787515006002, "grad_norm": 0.1631222367286682, "learning_rate": 0.0001647234851246333, "loss": 0.4406, "step": 2322 }, { "epoch": 0.6971788715486195, "grad_norm": 0.18382146954536438, "learning_rate": 0.00016468354949394787, "loss": 0.4854, "step": 2323 }, { "epoch": 0.6974789915966386, "grad_norm": 0.1590641289949417, "learning_rate": 0.00016464359611807235, "loss": 0.413, "step": 2324 }, { "epoch": 0.6977791116446579, "grad_norm": 0.15298865735530853, "learning_rate": 0.00016460362500796746, "loss": 0.4587, "step": 2325 }, { "epoch": 0.698079231692677, "grad_norm": 0.20010384917259216, "learning_rate": 0.0001645636361745988, "loss": 0.4405, "step": 2326 }, { "epoch": 0.6983793517406963, "grad_norm": 0.1749500185251236, "learning_rate": 0.0001645236296289369, "loss": 0.5469, "step": 2327 }, { "epoch": 0.6986794717887155, "grad_norm": 0.1599074900150299, "learning_rate": 0.0001644836053819571, "loss": 0.453, "step": 2328 }, { "epoch": 0.6989795918367347, "grad_norm": 0.14947155117988586, "learning_rate": 0.00016444356344463953, "loss": 0.3807, "step": 2329 }, { "epoch": 0.6992797118847539, "grad_norm": 0.23378188908100128, "learning_rate": 0.00016440350382796929, "loss": 0.4932, "step": 2330 }, { "epoch": 0.6995798319327731, "grad_norm": 0.18306280672550201, "learning_rate": 0.00016436342654293625, "loss": 0.4734, "step": 2331 }, { "epoch": 0.6998799519807923, "grad_norm": 0.240435391664505, "learning_rate": 0.00016432333160053522, "loss": 0.4744, "step": 2332 }, { "epoch": 0.7001800720288115, "grad_norm": 0.17868775129318237, "learning_rate": 0.0001642832190117657, "loss": 0.4662, "step": 2333 }, { "epoch": 0.7004801920768308, "grad_norm": 0.14438675343990326, "learning_rate": 0.00016424308878763215, "loss": 0.4399, "step": 2334 }, { "epoch": 0.7007803121248499, "grad_norm": 0.1518152356147766, "learning_rate": 0.00016420294093914386, "loss": 0.5056, "step": 2335 }, { "epoch": 0.7010804321728692, "grad_norm": 0.1992632895708084, "learning_rate": 0.00016416277547731488, "loss": 0.4596, "step": 2336 }, { "epoch": 0.7013805522208884, "grad_norm": 0.15660887956619263, "learning_rate": 0.00016412259241316418, "loss": 0.4479, "step": 2337 }, { "epoch": 0.7016806722689075, "grad_norm": 0.14185617864131927, "learning_rate": 0.00016408239175771553, "loss": 0.4052, "step": 2338 }, { "epoch": 0.7019807923169268, "grad_norm": 0.14662165939807892, "learning_rate": 0.0001640421735219975, "loss": 0.4156, "step": 2339 }, { "epoch": 0.7022809123649459, "grad_norm": 0.16649018228054047, "learning_rate": 0.00016400193771704354, "loss": 0.546, "step": 2340 }, { "epoch": 0.7025810324129652, "grad_norm": 0.16269463300704956, "learning_rate": 0.00016396168435389184, "loss": 0.4766, "step": 2341 }, { "epoch": 0.7028811524609844, "grad_norm": 0.18485970795154572, "learning_rate": 0.00016392141344358544, "loss": 0.4581, "step": 2342 }, { "epoch": 0.7031812725090036, "grad_norm": 0.16991198062896729, "learning_rate": 0.00016388112499717225, "loss": 0.4065, "step": 2343 }, { "epoch": 0.7034813925570228, "grad_norm": 0.14863085746765137, "learning_rate": 0.0001638408190257049, "loss": 0.4582, "step": 2344 }, { "epoch": 0.7037815126050421, "grad_norm": 0.14156857132911682, "learning_rate": 0.00016380049554024086, "loss": 0.4515, "step": 2345 }, { "epoch": 0.7040816326530612, "grad_norm": 0.15150651335716248, "learning_rate": 0.00016376015455184245, "loss": 0.4547, "step": 2346 }, { "epoch": 0.7043817527010804, "grad_norm": 0.17717133462429047, "learning_rate": 0.0001637197960715768, "loss": 0.553, "step": 2347 }, { "epoch": 0.7046818727490997, "grad_norm": 0.1485554426908493, "learning_rate": 0.0001636794201105157, "loss": 0.4529, "step": 2348 }, { "epoch": 0.7049819927971188, "grad_norm": 0.1734079122543335, "learning_rate": 0.0001636390266797359, "loss": 0.4458, "step": 2349 }, { "epoch": 0.7052821128451381, "grad_norm": 0.1593354195356369, "learning_rate": 0.00016359861579031884, "loss": 0.4446, "step": 2350 }, { "epoch": 0.7055822328931572, "grad_norm": 0.16919587552547455, "learning_rate": 0.00016355818745335078, "loss": 0.5258, "step": 2351 }, { "epoch": 0.7058823529411765, "grad_norm": 0.1594027727842331, "learning_rate": 0.0001635177416799228, "loss": 0.4679, "step": 2352 }, { "epoch": 0.7061824729891957, "grad_norm": 0.14303286373615265, "learning_rate": 0.00016347727848113074, "loss": 0.4526, "step": 2353 }, { "epoch": 0.7064825930372148, "grad_norm": 0.1363154947757721, "learning_rate": 0.00016343679786807517, "loss": 0.4458, "step": 2354 }, { "epoch": 0.7067827130852341, "grad_norm": 0.14714987576007843, "learning_rate": 0.00016339629985186153, "loss": 0.4277, "step": 2355 }, { "epoch": 0.7070828331332533, "grad_norm": 0.15142428874969482, "learning_rate": 0.00016335578444359996, "loss": 0.5097, "step": 2356 }, { "epoch": 0.7073829531812725, "grad_norm": 0.15368203818798065, "learning_rate": 0.0001633152516544054, "loss": 0.4766, "step": 2357 }, { "epoch": 0.7076830732292917, "grad_norm": 0.14941073954105377, "learning_rate": 0.00016327470149539756, "loss": 0.4105, "step": 2358 }, { "epoch": 0.707983193277311, "grad_norm": 0.16012686491012573, "learning_rate": 0.0001632341339777009, "loss": 0.5073, "step": 2359 }, { "epoch": 0.7082833133253301, "grad_norm": 0.13656701147556305, "learning_rate": 0.00016319354911244468, "loss": 0.4195, "step": 2360 }, { "epoch": 0.7085834333733494, "grad_norm": 0.1558884233236313, "learning_rate": 0.0001631529469107629, "loss": 0.4813, "step": 2361 }, { "epoch": 0.7088835534213686, "grad_norm": 0.19369877874851227, "learning_rate": 0.00016311232738379423, "loss": 0.4347, "step": 2362 }, { "epoch": 0.7091836734693877, "grad_norm": 0.16903385519981384, "learning_rate": 0.00016307169054268226, "loss": 0.4733, "step": 2363 }, { "epoch": 0.709483793517407, "grad_norm": 0.14185801148414612, "learning_rate": 0.00016303103639857519, "loss": 0.4464, "step": 2364 }, { "epoch": 0.7097839135654261, "grad_norm": 0.16600801050662994, "learning_rate": 0.000162990364962626, "loss": 0.5555, "step": 2365 }, { "epoch": 0.7100840336134454, "grad_norm": 0.14495305716991425, "learning_rate": 0.00016294967624599254, "loss": 0.4612, "step": 2366 }, { "epoch": 0.7103841536614646, "grad_norm": 0.18455393612384796, "learning_rate": 0.00016290897025983715, "loss": 0.539, "step": 2367 }, { "epoch": 0.7106842737094838, "grad_norm": 0.14817331731319427, "learning_rate": 0.00016286824701532718, "loss": 0.476, "step": 2368 }, { "epoch": 0.710984393757503, "grad_norm": 0.15020863711833954, "learning_rate": 0.00016282750652363447, "loss": 0.464, "step": 2369 }, { "epoch": 0.7112845138055222, "grad_norm": 0.16190224885940552, "learning_rate": 0.00016278674879593582, "loss": 0.4306, "step": 2370 }, { "epoch": 0.7115846338535414, "grad_norm": 0.15611281991004944, "learning_rate": 0.00016274597384341254, "loss": 0.4695, "step": 2371 }, { "epoch": 0.7118847539015606, "grad_norm": 0.1466979831457138, "learning_rate": 0.00016270518167725085, "loss": 0.4928, "step": 2372 }, { "epoch": 0.7121848739495799, "grad_norm": 0.14179770648479462, "learning_rate": 0.00016266437230864157, "loss": 0.4402, "step": 2373 }, { "epoch": 0.712484993997599, "grad_norm": 0.14757367968559265, "learning_rate": 0.0001626235457487803, "loss": 0.4456, "step": 2374 }, { "epoch": 0.7127851140456183, "grad_norm": 0.1571052223443985, "learning_rate": 0.0001625827020088673, "loss": 0.4685, "step": 2375 }, { "epoch": 0.7130852340936374, "grad_norm": 0.22218023240566254, "learning_rate": 0.00016254184110010765, "loss": 0.5559, "step": 2376 }, { "epoch": 0.7133853541416567, "grad_norm": 0.14846636354923248, "learning_rate": 0.00016250096303371104, "loss": 0.4649, "step": 2377 }, { "epoch": 0.7136854741896759, "grad_norm": 0.1457940936088562, "learning_rate": 0.00016246006782089187, "loss": 0.4535, "step": 2378 }, { "epoch": 0.713985594237695, "grad_norm": 0.1568482220172882, "learning_rate": 0.0001624191554728693, "loss": 0.5334, "step": 2379 }, { "epoch": 0.7142857142857143, "grad_norm": 0.14761097729206085, "learning_rate": 0.00016237822600086716, "loss": 0.4504, "step": 2380 }, { "epoch": 0.7145858343337335, "grad_norm": 0.17113368213176727, "learning_rate": 0.000162337279416114, "loss": 0.5158, "step": 2381 }, { "epoch": 0.7148859543817527, "grad_norm": 0.14863641560077667, "learning_rate": 0.00016229631572984302, "loss": 0.4488, "step": 2382 }, { "epoch": 0.7151860744297719, "grad_norm": 0.14860329031944275, "learning_rate": 0.00016225533495329214, "loss": 0.4718, "step": 2383 }, { "epoch": 0.7154861944777912, "grad_norm": 1.8711261749267578, "learning_rate": 0.00016221433709770396, "loss": 0.5577, "step": 2384 }, { "epoch": 0.7157863145258103, "grad_norm": 0.18025538325309753, "learning_rate": 0.0001621733221743258, "loss": 0.4975, "step": 2385 }, { "epoch": 0.7160864345738295, "grad_norm": 28.5136775970459, "learning_rate": 0.0001621322901944096, "loss": 2.9849, "step": 2386 }, { "epoch": 0.7163865546218487, "grad_norm": 1.3544706106185913, "learning_rate": 0.00016209124116921207, "loss": 0.5188, "step": 2387 }, { "epoch": 0.7166866746698679, "grad_norm": 0.16111868619918823, "learning_rate": 0.00016205017510999447, "loss": 0.4555, "step": 2388 }, { "epoch": 0.7169867947178872, "grad_norm": 0.16572299599647522, "learning_rate": 0.00016200909202802283, "loss": 0.4513, "step": 2389 }, { "epoch": 0.7172869147659063, "grad_norm": 0.1773393750190735, "learning_rate": 0.00016196799193456785, "loss": 0.4331, "step": 2390 }, { "epoch": 0.7175870348139256, "grad_norm": 0.4284209907054901, "learning_rate": 0.0001619268748409048, "loss": 0.4839, "step": 2391 }, { "epoch": 0.7178871548619448, "grad_norm": 0.1890149861574173, "learning_rate": 0.00016188574075831378, "loss": 0.5225, "step": 2392 }, { "epoch": 0.718187274909964, "grad_norm": 0.16966569423675537, "learning_rate": 0.00016184458969807937, "loss": 0.4281, "step": 2393 }, { "epoch": 0.7184873949579832, "grad_norm": 0.178705632686615, "learning_rate": 0.0001618034216714909, "loss": 0.5232, "step": 2394 }, { "epoch": 0.7187875150060024, "grad_norm": 0.16386286914348602, "learning_rate": 0.0001617622366898424, "loss": 0.4477, "step": 2395 }, { "epoch": 0.7190876350540216, "grad_norm": 0.17845234274864197, "learning_rate": 0.00016172103476443247, "loss": 0.514, "step": 2396 }, { "epoch": 0.7193877551020408, "grad_norm": 0.16308018565177917, "learning_rate": 0.00016167981590656437, "loss": 0.4479, "step": 2397 }, { "epoch": 0.71968787515006, "grad_norm": 0.15091225504875183, "learning_rate": 0.00016163858012754604, "loss": 0.4255, "step": 2398 }, { "epoch": 0.7199879951980792, "grad_norm": 0.17604851722717285, "learning_rate": 0.00016159732743869002, "loss": 0.4797, "step": 2399 }, { "epoch": 0.7202881152460985, "grad_norm": 0.15736879408359528, "learning_rate": 0.00016155605785131357, "loss": 0.4726, "step": 2400 }, { "epoch": 0.7205882352941176, "grad_norm": 0.16581600904464722, "learning_rate": 0.00016151477137673842, "loss": 0.478, "step": 2401 }, { "epoch": 0.7208883553421368, "grad_norm": 0.19278988242149353, "learning_rate": 0.00016147346802629115, "loss": 0.4809, "step": 2402 }, { "epoch": 0.7211884753901561, "grad_norm": 0.2418685406446457, "learning_rate": 0.00016143214781130284, "loss": 0.5177, "step": 2403 }, { "epoch": 0.7214885954381752, "grad_norm": 0.16705535352230072, "learning_rate": 0.00016139081074310915, "loss": 0.4942, "step": 2404 }, { "epoch": 0.7217887154861945, "grad_norm": 0.13845057785511017, "learning_rate": 0.00016134945683305048, "loss": 0.4137, "step": 2405 }, { "epoch": 0.7220888355342137, "grad_norm": 0.15267415344715118, "learning_rate": 0.0001613080860924718, "loss": 0.4332, "step": 2406 }, { "epoch": 0.7223889555822329, "grad_norm": 0.1513591855764389, "learning_rate": 0.00016126669853272274, "loss": 0.487, "step": 2407 }, { "epoch": 0.7226890756302521, "grad_norm": 0.2217532843351364, "learning_rate": 0.0001612252941651574, "loss": 0.4184, "step": 2408 }, { "epoch": 0.7229891956782714, "grad_norm": 0.2727909982204437, "learning_rate": 0.00016118387300113467, "loss": 0.4827, "step": 2409 }, { "epoch": 0.7232893157262905, "grad_norm": 0.14815635979175568, "learning_rate": 0.00016114243505201795, "loss": 0.4442, "step": 2410 }, { "epoch": 0.7235894357743097, "grad_norm": 0.15210844576358795, "learning_rate": 0.0001611009803291753, "loss": 0.4809, "step": 2411 }, { "epoch": 0.723889555822329, "grad_norm": 0.14424121379852295, "learning_rate": 0.00016105950884397926, "loss": 0.4478, "step": 2412 }, { "epoch": 0.7241896758703481, "grad_norm": 0.14106842875480652, "learning_rate": 0.00016101802060780712, "loss": 0.4512, "step": 2413 }, { "epoch": 0.7244897959183674, "grad_norm": 0.14604397118091583, "learning_rate": 0.0001609765156320407, "loss": 0.444, "step": 2414 }, { "epoch": 0.7247899159663865, "grad_norm": 0.1434173882007599, "learning_rate": 0.00016093499392806648, "loss": 0.4616, "step": 2415 }, { "epoch": 0.7250900360144058, "grad_norm": 0.15155497193336487, "learning_rate": 0.00016089345550727532, "loss": 0.5038, "step": 2416 }, { "epoch": 0.725390156062425, "grad_norm": 0.1462622880935669, "learning_rate": 0.00016085190038106292, "loss": 0.4598, "step": 2417 }, { "epoch": 0.7256902761104442, "grad_norm": 0.20126402378082275, "learning_rate": 0.00016081032856082937, "loss": 0.4401, "step": 2418 }, { "epoch": 0.7259903961584634, "grad_norm": 0.15513941645622253, "learning_rate": 0.00016076874005797951, "loss": 0.5003, "step": 2419 }, { "epoch": 0.7262905162064826, "grad_norm": 0.9181942939758301, "learning_rate": 0.0001607271348839226, "loss": 0.5313, "step": 2420 }, { "epoch": 0.7265906362545018, "grad_norm": 0.1356831192970276, "learning_rate": 0.00016068551305007254, "loss": 0.4011, "step": 2421 }, { "epoch": 0.726890756302521, "grad_norm": 0.13881300389766693, "learning_rate": 0.00016064387456784788, "loss": 0.4275, "step": 2422 }, { "epoch": 0.7271908763505402, "grad_norm": 0.15785065293312073, "learning_rate": 0.0001606022194486716, "loss": 0.4802, "step": 2423 }, { "epoch": 0.7274909963985594, "grad_norm": 0.17977090179920197, "learning_rate": 0.00016056054770397128, "loss": 0.472, "step": 2424 }, { "epoch": 0.7277911164465787, "grad_norm": 0.17413394153118134, "learning_rate": 0.00016051885934517917, "loss": 0.4735, "step": 2425 }, { "epoch": 0.7280912364945978, "grad_norm": 0.1621166616678238, "learning_rate": 0.00016047715438373192, "loss": 0.5128, "step": 2426 }, { "epoch": 0.728391356542617, "grad_norm": 0.16929030418395996, "learning_rate": 0.00016043543283107082, "loss": 0.5162, "step": 2427 }, { "epoch": 0.7286914765906363, "grad_norm": 0.15607744455337524, "learning_rate": 0.00016039369469864173, "loss": 0.4823, "step": 2428 }, { "epoch": 0.7289915966386554, "grad_norm": 0.1620553880929947, "learning_rate": 0.000160351939997895, "loss": 0.5065, "step": 2429 }, { "epoch": 0.7292917166866747, "grad_norm": 0.1509367674589157, "learning_rate": 0.00016031016874028557, "loss": 0.4166, "step": 2430 }, { "epoch": 0.7295918367346939, "grad_norm": 0.16851627826690674, "learning_rate": 0.0001602683809372729, "loss": 0.5493, "step": 2431 }, { "epoch": 0.7298919567827131, "grad_norm": 0.2047121524810791, "learning_rate": 0.00016022657660032098, "loss": 0.5213, "step": 2432 }, { "epoch": 0.7301920768307323, "grad_norm": 0.18215420842170715, "learning_rate": 0.00016018475574089837, "loss": 0.4492, "step": 2433 }, { "epoch": 0.7304921968787516, "grad_norm": 0.15045644342899323, "learning_rate": 0.00016014291837047813, "loss": 0.4588, "step": 2434 }, { "epoch": 0.7307923169267707, "grad_norm": 0.14964286983013153, "learning_rate": 0.00016010106450053786, "loss": 0.4541, "step": 2435 }, { "epoch": 0.7310924369747899, "grad_norm": 0.17790651321411133, "learning_rate": 0.00016005919414255972, "loss": 0.4132, "step": 2436 }, { "epoch": 0.7313925570228091, "grad_norm": 0.320291131734848, "learning_rate": 0.00016001730730803035, "loss": 0.536, "step": 2437 }, { "epoch": 0.7316926770708283, "grad_norm": 0.15852206945419312, "learning_rate": 0.00015997540400844088, "loss": 0.4806, "step": 2438 }, { "epoch": 0.7319927971188476, "grad_norm": 0.14510732889175415, "learning_rate": 0.00015993348425528704, "loss": 0.4662, "step": 2439 }, { "epoch": 0.7322929171668667, "grad_norm": 0.1684138923883438, "learning_rate": 0.00015989154806006904, "loss": 0.5278, "step": 2440 }, { "epoch": 0.732593037214886, "grad_norm": 0.1442684829235077, "learning_rate": 0.0001598495954342916, "loss": 0.4483, "step": 2441 }, { "epoch": 0.7328931572629052, "grad_norm": 0.14993314445018768, "learning_rate": 0.00015980762638946388, "loss": 0.4991, "step": 2442 }, { "epoch": 0.7331932773109243, "grad_norm": 0.14737346768379211, "learning_rate": 0.00015976564093709967, "loss": 0.447, "step": 2443 }, { "epoch": 0.7334933973589436, "grad_norm": 0.5534051060676575, "learning_rate": 0.00015972363908871716, "loss": 0.4782, "step": 2444 }, { "epoch": 0.7337935174069627, "grad_norm": 0.3656732439994812, "learning_rate": 0.00015968162085583913, "loss": 0.505, "step": 2445 }, { "epoch": 0.734093637454982, "grad_norm": 0.17494964599609375, "learning_rate": 0.00015963958624999275, "loss": 0.4748, "step": 2446 }, { "epoch": 0.7343937575030012, "grad_norm": 0.15481603145599365, "learning_rate": 0.00015959753528270971, "loss": 0.4708, "step": 2447 }, { "epoch": 0.7346938775510204, "grad_norm": 0.1475801169872284, "learning_rate": 0.00015955546796552627, "loss": 0.4227, "step": 2448 }, { "epoch": 0.7349939975990396, "grad_norm": 0.2968604564666748, "learning_rate": 0.0001595133843099831, "loss": 0.5031, "step": 2449 }, { "epoch": 0.7352941176470589, "grad_norm": 0.15884873270988464, "learning_rate": 0.00015947128432762536, "loss": 0.4467, "step": 2450 }, { "epoch": 0.735594237695078, "grad_norm": 0.1618734747171402, "learning_rate": 0.00015942916803000267, "loss": 0.4693, "step": 2451 }, { "epoch": 0.7358943577430972, "grad_norm": 0.1525658667087555, "learning_rate": 0.00015938703542866923, "loss": 0.4293, "step": 2452 }, { "epoch": 0.7361944777911165, "grad_norm": 0.1299402117729187, "learning_rate": 0.00015934488653518355, "loss": 0.3741, "step": 2453 }, { "epoch": 0.7364945978391356, "grad_norm": 0.16883011162281036, "learning_rate": 0.00015930272136110873, "loss": 0.4614, "step": 2454 }, { "epoch": 0.7367947178871549, "grad_norm": 0.39602482318878174, "learning_rate": 0.0001592605399180123, "loss": 0.4385, "step": 2455 }, { "epoch": 0.737094837935174, "grad_norm": 0.16625580191612244, "learning_rate": 0.0001592183422174663, "loss": 0.4745, "step": 2456 }, { "epoch": 0.7373949579831933, "grad_norm": 0.14766505360603333, "learning_rate": 0.0001591761282710471, "loss": 0.4177, "step": 2457 }, { "epoch": 0.7376950780312125, "grad_norm": 0.32073864340782166, "learning_rate": 0.00015913389809033568, "loss": 0.4803, "step": 2458 }, { "epoch": 0.7379951980792316, "grad_norm": 0.16001272201538086, "learning_rate": 0.0001590916516869174, "loss": 0.5038, "step": 2459 }, { "epoch": 0.7382953181272509, "grad_norm": 0.14552772045135498, "learning_rate": 0.00015904938907238206, "loss": 0.4837, "step": 2460 }, { "epoch": 0.7385954381752701, "grad_norm": 0.1509009599685669, "learning_rate": 0.0001590071102583239, "loss": 0.4703, "step": 2461 }, { "epoch": 0.7388955582232893, "grad_norm": 0.16939735412597656, "learning_rate": 0.00015896481525634162, "loss": 0.4524, "step": 2462 }, { "epoch": 0.7391956782713085, "grad_norm": 0.15040256083011627, "learning_rate": 0.00015892250407803843, "loss": 0.4833, "step": 2463 }, { "epoch": 0.7394957983193278, "grad_norm": 0.154825821518898, "learning_rate": 0.0001588801767350219, "loss": 0.4759, "step": 2464 }, { "epoch": 0.7397959183673469, "grad_norm": 0.6256540417671204, "learning_rate": 0.00015883783323890403, "loss": 0.4295, "step": 2465 }, { "epoch": 0.7400960384153662, "grad_norm": 1.0148793458938599, "learning_rate": 0.00015879547360130128, "loss": 0.5452, "step": 2466 }, { "epoch": 0.7403961584633854, "grad_norm": 0.14528141915798187, "learning_rate": 0.00015875309783383452, "loss": 0.4459, "step": 2467 }, { "epoch": 0.7406962785114045, "grad_norm": 0.14045490324497223, "learning_rate": 0.00015871070594812906, "loss": 0.4247, "step": 2468 }, { "epoch": 0.7409963985594238, "grad_norm": 0.14894379675388336, "learning_rate": 0.00015866829795581464, "loss": 0.4313, "step": 2469 }, { "epoch": 0.741296518607443, "grad_norm": 0.1819421797990799, "learning_rate": 0.00015862587386852541, "loss": 0.5641, "step": 2470 }, { "epoch": 0.7415966386554622, "grad_norm": 0.1688489019870758, "learning_rate": 0.00015858343369789992, "loss": 0.4838, "step": 2471 }, { "epoch": 0.7418967587034814, "grad_norm": 0.16753920912742615, "learning_rate": 0.00015854097745558114, "loss": 0.4528, "step": 2472 }, { "epoch": 0.7421968787515006, "grad_norm": 0.16875241696834564, "learning_rate": 0.00015849850515321648, "loss": 0.5313, "step": 2473 }, { "epoch": 0.7424969987995198, "grad_norm": 0.18420499563217163, "learning_rate": 0.00015845601680245766, "loss": 0.512, "step": 2474 }, { "epoch": 0.742797118847539, "grad_norm": 0.21641017496585846, "learning_rate": 0.000158413512414961, "loss": 0.4748, "step": 2475 }, { "epoch": 0.7430972388955582, "grad_norm": 0.15973563492298126, "learning_rate": 0.00015837099200238696, "loss": 0.4729, "step": 2476 }, { "epoch": 0.7433973589435774, "grad_norm": 0.3653205633163452, "learning_rate": 0.00015832845557640058, "loss": 0.459, "step": 2477 }, { "epoch": 0.7436974789915967, "grad_norm": 0.16915689408779144, "learning_rate": 0.00015828590314867125, "loss": 0.5008, "step": 2478 }, { "epoch": 0.7439975990396158, "grad_norm": 0.15779462456703186, "learning_rate": 0.00015824333473087276, "loss": 0.4087, "step": 2479 }, { "epoch": 0.7442977190876351, "grad_norm": 0.1613835096359253, "learning_rate": 0.0001582007503346832, "loss": 0.4711, "step": 2480 }, { "epoch": 0.7445978391356542, "grad_norm": 0.18008920550346375, "learning_rate": 0.00015815814997178514, "loss": 0.5015, "step": 2481 }, { "epoch": 0.7448979591836735, "grad_norm": 0.152873694896698, "learning_rate": 0.00015811553365386555, "loss": 0.4183, "step": 2482 }, { "epoch": 0.7451980792316927, "grad_norm": 0.16232550144195557, "learning_rate": 0.00015807290139261567, "loss": 0.5007, "step": 2483 }, { "epoch": 0.7454981992797118, "grad_norm": 0.19482393562793732, "learning_rate": 0.0001580302531997312, "loss": 0.5102, "step": 2484 }, { "epoch": 0.7457983193277311, "grad_norm": 0.15328121185302734, "learning_rate": 0.00015798758908691215, "loss": 0.4548, "step": 2485 }, { "epoch": 0.7460984393757503, "grad_norm": 0.1667821705341339, "learning_rate": 0.00015794490906586298, "loss": 0.5252, "step": 2486 }, { "epoch": 0.7463985594237695, "grad_norm": 0.16482432186603546, "learning_rate": 0.00015790221314829244, "loss": 0.487, "step": 2487 }, { "epoch": 0.7466986794717887, "grad_norm": 0.14885227382183075, "learning_rate": 0.00015785950134591364, "loss": 0.4354, "step": 2488 }, { "epoch": 0.746998799519808, "grad_norm": 0.1504472941160202, "learning_rate": 0.0001578167736704441, "loss": 0.4777, "step": 2489 }, { "epoch": 0.7472989195678271, "grad_norm": 0.17454247176647186, "learning_rate": 0.0001577740301336057, "loss": 0.5177, "step": 2490 }, { "epoch": 0.7475990396158463, "grad_norm": 0.16132484376430511, "learning_rate": 0.00015773127074712457, "loss": 0.4218, "step": 2491 }, { "epoch": 0.7478991596638656, "grad_norm": 0.1640845090150833, "learning_rate": 0.00015768849552273129, "loss": 0.4685, "step": 2492 }, { "epoch": 0.7481992797118847, "grad_norm": 0.15111713111400604, "learning_rate": 0.00015764570447216074, "loss": 0.4401, "step": 2493 }, { "epoch": 0.748499399759904, "grad_norm": 0.16630788147449493, "learning_rate": 0.0001576028976071522, "loss": 0.4573, "step": 2494 }, { "epoch": 0.7487995198079231, "grad_norm": 0.15859933197498322, "learning_rate": 0.0001575600749394492, "loss": 0.4777, "step": 2495 }, { "epoch": 0.7490996398559424, "grad_norm": 0.14920127391815186, "learning_rate": 0.00015751723648079965, "loss": 0.4593, "step": 2496 }, { "epoch": 0.7493997599039616, "grad_norm": 0.16223812103271484, "learning_rate": 0.0001574743822429558, "loss": 0.4911, "step": 2497 }, { "epoch": 0.7496998799519808, "grad_norm": 0.15160585939884186, "learning_rate": 0.00015743151223767424, "loss": 0.4731, "step": 2498 }, { "epoch": 0.75, "grad_norm": 0.16430693864822388, "learning_rate": 0.0001573886264767158, "loss": 0.4952, "step": 2499 }, { "epoch": 0.7503001200480192, "grad_norm": 0.17609624564647675, "learning_rate": 0.00015734572497184577, "loss": 0.546, "step": 2500 }, { "epoch": 0.7506002400960384, "grad_norm": 0.3264397084712982, "learning_rate": 0.00015730280773483367, "loss": 0.4297, "step": 2501 }, { "epoch": 0.7509003601440576, "grad_norm": 0.16511526703834534, "learning_rate": 0.00015725987477745328, "loss": 0.5182, "step": 2502 }, { "epoch": 0.7512004801920769, "grad_norm": 0.17369025945663452, "learning_rate": 0.00015721692611148284, "loss": 0.4407, "step": 2503 }, { "epoch": 0.751500600240096, "grad_norm": 0.13990244269371033, "learning_rate": 0.00015717396174870483, "loss": 0.4365, "step": 2504 }, { "epoch": 0.7518007202881153, "grad_norm": 0.13970094919204712, "learning_rate": 0.00015713098170090599, "loss": 0.4663, "step": 2505 }, { "epoch": 0.7521008403361344, "grad_norm": 0.16043907403945923, "learning_rate": 0.00015708798597987742, "loss": 0.5133, "step": 2506 }, { "epoch": 0.7524009603841537, "grad_norm": 0.1567663550376892, "learning_rate": 0.00015704497459741447, "loss": 0.498, "step": 2507 }, { "epoch": 0.7527010804321729, "grad_norm": 0.21229980885982513, "learning_rate": 0.00015700194756531688, "loss": 0.4826, "step": 2508 }, { "epoch": 0.753001200480192, "grad_norm": 0.14606361091136932, "learning_rate": 0.0001569589048953886, "loss": 0.4145, "step": 2509 }, { "epoch": 0.7533013205282113, "grad_norm": 0.15010032057762146, "learning_rate": 0.00015691584659943786, "loss": 0.4501, "step": 2510 }, { "epoch": 0.7536014405762305, "grad_norm": 0.3521146774291992, "learning_rate": 0.00015687277268927724, "loss": 0.4969, "step": 2511 }, { "epoch": 0.7539015606242497, "grad_norm": 0.16045944392681122, "learning_rate": 0.0001568296831767236, "loss": 0.4624, "step": 2512 }, { "epoch": 0.7542016806722689, "grad_norm": 0.14147594571113586, "learning_rate": 0.000156786578073598, "loss": 0.4703, "step": 2513 }, { "epoch": 0.7545018007202882, "grad_norm": 0.15041673183441162, "learning_rate": 0.00015674345739172585, "loss": 0.4737, "step": 2514 }, { "epoch": 0.7548019207683073, "grad_norm": 0.15191854536533356, "learning_rate": 0.00015670032114293683, "loss": 0.5047, "step": 2515 }, { "epoch": 0.7551020408163265, "grad_norm": 0.18009275197982788, "learning_rate": 0.00015665716933906487, "loss": 0.4611, "step": 2516 }, { "epoch": 0.7554021608643458, "grad_norm": 0.17272666096687317, "learning_rate": 0.00015661400199194813, "loss": 0.4397, "step": 2517 }, { "epoch": 0.7557022809123649, "grad_norm": 0.1610168069601059, "learning_rate": 0.00015657081911342916, "loss": 0.4273, "step": 2518 }, { "epoch": 0.7560024009603842, "grad_norm": 0.1512984335422516, "learning_rate": 0.00015652762071535458, "loss": 0.4762, "step": 2519 }, { "epoch": 0.7563025210084033, "grad_norm": 0.15497024357318878, "learning_rate": 0.0001564844068095755, "loss": 0.5085, "step": 2520 }, { "epoch": 0.7566026410564226, "grad_norm": 0.16191035509109497, "learning_rate": 0.00015644117740794706, "loss": 0.4851, "step": 2521 }, { "epoch": 0.7569027611044418, "grad_norm": 0.18235939741134644, "learning_rate": 0.0001563979325223288, "loss": 0.4637, "step": 2522 }, { "epoch": 0.757202881152461, "grad_norm": 0.1426496058702469, "learning_rate": 0.00015635467216458445, "loss": 0.4371, "step": 2523 }, { "epoch": 0.7575030012004802, "grad_norm": 0.14401482045650482, "learning_rate": 0.00015631139634658195, "loss": 0.4089, "step": 2524 }, { "epoch": 0.7578031212484994, "grad_norm": 0.15074434876441956, "learning_rate": 0.0001562681050801936, "loss": 0.458, "step": 2525 }, { "epoch": 0.7581032412965186, "grad_norm": 0.14815548062324524, "learning_rate": 0.0001562247983772958, "loss": 0.47, "step": 2526 }, { "epoch": 0.7584033613445378, "grad_norm": 0.1517464518547058, "learning_rate": 0.0001561814762497693, "loss": 0.4807, "step": 2527 }, { "epoch": 0.758703481392557, "grad_norm": 0.14972248673439026, "learning_rate": 0.000156138138709499, "loss": 0.4332, "step": 2528 }, { "epoch": 0.7590036014405762, "grad_norm": 0.15079128742218018, "learning_rate": 0.00015609478576837402, "loss": 0.4871, "step": 2529 }, { "epoch": 0.7593037214885955, "grad_norm": 0.5011845231056213, "learning_rate": 0.0001560514174382878, "loss": 0.5338, "step": 2530 }, { "epoch": 0.7596038415366146, "grad_norm": 0.1483888030052185, "learning_rate": 0.00015600803373113796, "loss": 0.4487, "step": 2531 }, { "epoch": 0.7599039615846338, "grad_norm": 0.1570620834827423, "learning_rate": 0.00015596463465882622, "loss": 0.4682, "step": 2532 }, { "epoch": 0.7602040816326531, "grad_norm": 0.15329718589782715, "learning_rate": 0.0001559212202332587, "loss": 0.5032, "step": 2533 }, { "epoch": 0.7605042016806722, "grad_norm": 0.15335699915885925, "learning_rate": 0.00015587779046634568, "loss": 0.415, "step": 2534 }, { "epoch": 0.7608043217286915, "grad_norm": 0.15885575115680695, "learning_rate": 0.00015583434537000154, "loss": 0.465, "step": 2535 }, { "epoch": 0.7611044417767107, "grad_norm": 0.13733403384685516, "learning_rate": 0.00015579088495614498, "loss": 0.4428, "step": 2536 }, { "epoch": 0.7614045618247299, "grad_norm": 0.13970106840133667, "learning_rate": 0.00015574740923669886, "loss": 0.4248, "step": 2537 }, { "epoch": 0.7617046818727491, "grad_norm": 0.14858634769916534, "learning_rate": 0.00015570391822359024, "loss": 0.463, "step": 2538 }, { "epoch": 0.7620048019207684, "grad_norm": 0.15809981524944305, "learning_rate": 0.00015566041192875043, "loss": 0.4542, "step": 2539 }, { "epoch": 0.7623049219687875, "grad_norm": 0.1487644910812378, "learning_rate": 0.0001556168903641148, "loss": 0.4856, "step": 2540 }, { "epoch": 0.7626050420168067, "grad_norm": 0.14323757588863373, "learning_rate": 0.00015557335354162306, "loss": 0.4678, "step": 2541 }, { "epoch": 0.762905162064826, "grad_norm": 0.14858075976371765, "learning_rate": 0.00015552980147321902, "loss": 0.3994, "step": 2542 }, { "epoch": 0.7632052821128451, "grad_norm": 0.2091810554265976, "learning_rate": 0.00015548623417085063, "loss": 0.4711, "step": 2543 }, { "epoch": 0.7635054021608644, "grad_norm": 0.13712801039218903, "learning_rate": 0.00015544265164647018, "loss": 0.4273, "step": 2544 }, { "epoch": 0.7638055222088835, "grad_norm": 0.14469502866268158, "learning_rate": 0.00015539905391203398, "loss": 0.4167, "step": 2545 }, { "epoch": 0.7641056422569028, "grad_norm": 0.1433638483285904, "learning_rate": 0.00015535544097950257, "loss": 0.4609, "step": 2546 }, { "epoch": 0.764405762304922, "grad_norm": 0.4555927813053131, "learning_rate": 0.00015531181286084067, "loss": 0.502, "step": 2547 }, { "epoch": 0.7647058823529411, "grad_norm": 0.1536446213722229, "learning_rate": 0.00015526816956801714, "loss": 0.46, "step": 2548 }, { "epoch": 0.7650060024009604, "grad_norm": 0.15496781468391418, "learning_rate": 0.00015522451111300503, "loss": 0.5006, "step": 2549 }, { "epoch": 0.7653061224489796, "grad_norm": 0.18594437837600708, "learning_rate": 0.00015518083750778157, "loss": 0.4723, "step": 2550 }, { "epoch": 0.7656062424969988, "grad_norm": 0.16826999187469482, "learning_rate": 0.00015513714876432802, "loss": 0.5019, "step": 2551 }, { "epoch": 0.765906362545018, "grad_norm": 0.15920387208461761, "learning_rate": 0.00015509344489462995, "loss": 0.4593, "step": 2552 }, { "epoch": 0.7662064825930373, "grad_norm": 0.16303475201129913, "learning_rate": 0.00015504972591067704, "loss": 0.4888, "step": 2553 }, { "epoch": 0.7665066026410564, "grad_norm": 0.20474757254123688, "learning_rate": 0.00015500599182446305, "loss": 0.4948, "step": 2554 }, { "epoch": 0.7668067226890757, "grad_norm": 0.14326965808868408, "learning_rate": 0.0001549622426479859, "loss": 0.443, "step": 2555 }, { "epoch": 0.7671068427370948, "grad_norm": 0.16902682185173035, "learning_rate": 0.00015491847839324777, "loss": 0.4591, "step": 2556 }, { "epoch": 0.767406962785114, "grad_norm": 0.15285806357860565, "learning_rate": 0.00015487469907225475, "loss": 0.4428, "step": 2557 }, { "epoch": 0.7677070828331333, "grad_norm": 0.18961991369724274, "learning_rate": 0.0001548309046970173, "loss": 0.5089, "step": 2558 }, { "epoch": 0.7680072028811524, "grad_norm": 0.14731541275978088, "learning_rate": 0.00015478709527954986, "loss": 0.4718, "step": 2559 }, { "epoch": 0.7683073229291717, "grad_norm": 0.13845667243003845, "learning_rate": 0.00015474327083187105, "loss": 0.4403, "step": 2560 }, { "epoch": 0.7686074429771909, "grad_norm": 0.1612999141216278, "learning_rate": 0.00015469943136600366, "loss": 0.5089, "step": 2561 }, { "epoch": 0.7689075630252101, "grad_norm": 0.1368875503540039, "learning_rate": 0.00015465557689397442, "loss": 0.4383, "step": 2562 }, { "epoch": 0.7692076830732293, "grad_norm": 0.16148759424686432, "learning_rate": 0.00015461170742781438, "loss": 0.5166, "step": 2563 }, { "epoch": 0.7695078031212484, "grad_norm": 0.18068450689315796, "learning_rate": 0.00015456782297955865, "loss": 0.4748, "step": 2564 }, { "epoch": 0.7698079231692677, "grad_norm": 0.1479911208152771, "learning_rate": 0.00015452392356124638, "loss": 0.4648, "step": 2565 }, { "epoch": 0.7701080432172869, "grad_norm": 0.1680109202861786, "learning_rate": 0.00015448000918492086, "loss": 0.4356, "step": 2566 }, { "epoch": 0.7704081632653061, "grad_norm": 0.14404886960983276, "learning_rate": 0.00015443607986262957, "loss": 0.4426, "step": 2567 }, { "epoch": 0.7707082833133253, "grad_norm": 0.14656315743923187, "learning_rate": 0.00015439213560642393, "loss": 0.4328, "step": 2568 }, { "epoch": 0.7710084033613446, "grad_norm": 0.14843294024467468, "learning_rate": 0.00015434817642835959, "loss": 0.4031, "step": 2569 }, { "epoch": 0.7713085234093637, "grad_norm": 0.13767988979816437, "learning_rate": 0.00015430420234049624, "loss": 0.4166, "step": 2570 }, { "epoch": 0.771608643457383, "grad_norm": 0.16981904208660126, "learning_rate": 0.00015426021335489764, "loss": 0.4666, "step": 2571 }, { "epoch": 0.7719087635054022, "grad_norm": 0.15011709928512573, "learning_rate": 0.00015421620948363168, "loss": 0.4682, "step": 2572 }, { "epoch": 0.7722088835534213, "grad_norm": 0.1532842367887497, "learning_rate": 0.00015417219073877037, "loss": 0.4622, "step": 2573 }, { "epoch": 0.7725090036014406, "grad_norm": 0.5074936151504517, "learning_rate": 0.00015412815713238963, "loss": 0.4313, "step": 2574 }, { "epoch": 0.7728091236494598, "grad_norm": 0.21822358667850494, "learning_rate": 0.0001540841086765697, "loss": 0.4781, "step": 2575 }, { "epoch": 0.773109243697479, "grad_norm": 0.1980976164340973, "learning_rate": 0.00015404004538339472, "loss": 0.4466, "step": 2576 }, { "epoch": 0.7734093637454982, "grad_norm": 0.17648696899414062, "learning_rate": 0.0001539959672649529, "loss": 0.4553, "step": 2577 }, { "epoch": 0.7737094837935174, "grad_norm": 0.3987429141998291, "learning_rate": 0.00015395187433333665, "loss": 0.4719, "step": 2578 }, { "epoch": 0.7740096038415366, "grad_norm": 0.15765416622161865, "learning_rate": 0.0001539077666006423, "loss": 0.4828, "step": 2579 }, { "epoch": 0.7743097238895558, "grad_norm": 0.1482846885919571, "learning_rate": 0.00015386364407897035, "loss": 0.4396, "step": 2580 }, { "epoch": 0.774609843937575, "grad_norm": 0.15453636646270752, "learning_rate": 0.00015381950678042524, "loss": 0.4964, "step": 2581 }, { "epoch": 0.7749099639855942, "grad_norm": 0.17158925533294678, "learning_rate": 0.00015377535471711558, "loss": 0.515, "step": 2582 }, { "epoch": 0.7752100840336135, "grad_norm": 0.1416906863451004, "learning_rate": 0.000153731187901154, "loss": 0.3999, "step": 2583 }, { "epoch": 0.7755102040816326, "grad_norm": 0.15163570642471313, "learning_rate": 0.0001536870063446571, "loss": 0.4658, "step": 2584 }, { "epoch": 0.7758103241296519, "grad_norm": 0.16137585043907166, "learning_rate": 0.0001536428100597456, "loss": 0.5056, "step": 2585 }, { "epoch": 0.776110444177671, "grad_norm": 0.15285837650299072, "learning_rate": 0.0001535985990585443, "loss": 0.4718, "step": 2586 }, { "epoch": 0.7764105642256903, "grad_norm": 0.15122708678245544, "learning_rate": 0.00015355437335318195, "loss": 0.4593, "step": 2587 }, { "epoch": 0.7767106842737095, "grad_norm": 0.3090333640575409, "learning_rate": 0.0001535101329557913, "loss": 0.4388, "step": 2588 }, { "epoch": 0.7770108043217286, "grad_norm": 0.20557376742362976, "learning_rate": 0.00015346587787850932, "loss": 0.5051, "step": 2589 }, { "epoch": 0.7773109243697479, "grad_norm": 0.15038730204105377, "learning_rate": 0.00015342160813347676, "loss": 0.3762, "step": 2590 }, { "epoch": 0.7776110444177671, "grad_norm": 0.15132176876068115, "learning_rate": 0.0001533773237328386, "loss": 0.4181, "step": 2591 }, { "epoch": 0.7779111644657863, "grad_norm": 0.15697380900382996, "learning_rate": 0.00015333302468874374, "loss": 0.3925, "step": 2592 }, { "epoch": 0.7782112845138055, "grad_norm": 0.20264506340026855, "learning_rate": 0.0001532887110133451, "loss": 0.5233, "step": 2593 }, { "epoch": 0.7785114045618248, "grad_norm": 0.27464234828948975, "learning_rate": 0.00015324438271879963, "loss": 0.4927, "step": 2594 }, { "epoch": 0.7788115246098439, "grad_norm": 0.16195513308048248, "learning_rate": 0.00015320003981726828, "loss": 0.4794, "step": 2595 }, { "epoch": 0.7791116446578632, "grad_norm": 0.1584070920944214, "learning_rate": 0.00015315568232091603, "loss": 0.4716, "step": 2596 }, { "epoch": 0.7794117647058824, "grad_norm": 0.142884761095047, "learning_rate": 0.00015311131024191182, "loss": 0.4211, "step": 2597 }, { "epoch": 0.7797118847539015, "grad_norm": 0.1578892320394516, "learning_rate": 0.00015306692359242868, "loss": 0.4892, "step": 2598 }, { "epoch": 0.7800120048019208, "grad_norm": 0.15338781476020813, "learning_rate": 0.0001530225223846436, "loss": 0.4763, "step": 2599 }, { "epoch": 0.78031212484994, "grad_norm": 0.15820817649364471, "learning_rate": 0.00015297810663073743, "loss": 0.4925, "step": 2600 }, { "epoch": 0.7806122448979592, "grad_norm": 0.15015913546085358, "learning_rate": 0.0001529336763428952, "loss": 0.4847, "step": 2601 }, { "epoch": 0.7809123649459784, "grad_norm": 0.15379305183887482, "learning_rate": 0.00015288923153330584, "loss": 0.4217, "step": 2602 }, { "epoch": 0.7812124849939976, "grad_norm": 0.14863692224025726, "learning_rate": 0.00015284477221416226, "loss": 0.4772, "step": 2603 }, { "epoch": 0.7815126050420168, "grad_norm": 0.17082728445529938, "learning_rate": 0.00015280029839766134, "loss": 0.5133, "step": 2604 }, { "epoch": 0.781812725090036, "grad_norm": 0.1583660989999771, "learning_rate": 0.00015275581009600403, "loss": 0.4573, "step": 2605 }, { "epoch": 0.7821128451380552, "grad_norm": 0.17564094066619873, "learning_rate": 0.00015271130732139515, "loss": 0.443, "step": 2606 }, { "epoch": 0.7824129651860744, "grad_norm": 0.16535411775112152, "learning_rate": 0.0001526667900860435, "loss": 0.4876, "step": 2607 }, { "epoch": 0.7827130852340937, "grad_norm": 0.2890619933605194, "learning_rate": 0.00015262225840216194, "loss": 0.4977, "step": 2608 }, { "epoch": 0.7830132052821128, "grad_norm": 0.13368655741214752, "learning_rate": 0.00015257771228196715, "loss": 0.4076, "step": 2609 }, { "epoch": 0.7833133253301321, "grad_norm": 0.14766785502433777, "learning_rate": 0.00015253315173767993, "loss": 0.4406, "step": 2610 }, { "epoch": 0.7836134453781513, "grad_norm": 0.15561902523040771, "learning_rate": 0.00015248857678152485, "loss": 0.5003, "step": 2611 }, { "epoch": 0.7839135654261705, "grad_norm": 0.20068630576133728, "learning_rate": 0.0001524439874257306, "loss": 0.4634, "step": 2612 }, { "epoch": 0.7842136854741897, "grad_norm": 0.1610409915447235, "learning_rate": 0.00015239938368252976, "loss": 0.471, "step": 2613 }, { "epoch": 0.7845138055222088, "grad_norm": 0.16158722341060638, "learning_rate": 0.00015235476556415889, "loss": 0.4585, "step": 2614 }, { "epoch": 0.7848139255702281, "grad_norm": 0.15008698403835297, "learning_rate": 0.00015231013308285838, "loss": 0.4735, "step": 2615 }, { "epoch": 0.7851140456182473, "grad_norm": 0.18932443857192993, "learning_rate": 0.00015226548625087266, "loss": 0.5461, "step": 2616 }, { "epoch": 0.7854141656662665, "grad_norm": 0.15541422367095947, "learning_rate": 0.00015222082508045012, "loss": 0.4787, "step": 2617 }, { "epoch": 0.7857142857142857, "grad_norm": 0.15387295186519623, "learning_rate": 0.00015217614958384302, "loss": 0.499, "step": 2618 }, { "epoch": 0.786014405762305, "grad_norm": 0.15941022336483002, "learning_rate": 0.00015213145977330757, "loss": 0.4226, "step": 2619 }, { "epoch": 0.7863145258103241, "grad_norm": 0.15059997141361237, "learning_rate": 0.00015208675566110387, "loss": 0.5033, "step": 2620 }, { "epoch": 0.7866146458583433, "grad_norm": 0.1610608547925949, "learning_rate": 0.00015204203725949604, "loss": 0.4464, "step": 2621 }, { "epoch": 0.7869147659063626, "grad_norm": 0.13089437782764435, "learning_rate": 0.00015199730458075202, "loss": 0.3898, "step": 2622 }, { "epoch": 0.7872148859543817, "grad_norm": 0.1501428335905075, "learning_rate": 0.00015195255763714372, "loss": 0.468, "step": 2623 }, { "epoch": 0.787515006002401, "grad_norm": 0.15691035985946655, "learning_rate": 0.00015190779644094695, "loss": 0.4842, "step": 2624 }, { "epoch": 0.7878151260504201, "grad_norm": 0.16759933531284332, "learning_rate": 0.00015186302100444142, "loss": 0.4867, "step": 2625 }, { "epoch": 0.7881152460984394, "grad_norm": 0.15055400133132935, "learning_rate": 0.0001518182313399108, "loss": 0.428, "step": 2626 }, { "epoch": 0.7884153661464586, "grad_norm": 0.152202770113945, "learning_rate": 0.00015177342745964256, "loss": 0.4762, "step": 2627 }, { "epoch": 0.7887154861944778, "grad_norm": 0.1485380083322525, "learning_rate": 0.00015172860937592817, "loss": 0.5, "step": 2628 }, { "epoch": 0.789015606242497, "grad_norm": 0.17636558413505554, "learning_rate": 0.00015168377710106297, "loss": 0.4601, "step": 2629 }, { "epoch": 0.7893157262905162, "grad_norm": 0.1553865373134613, "learning_rate": 0.0001516389306473461, "loss": 0.475, "step": 2630 }, { "epoch": 0.7896158463385354, "grad_norm": 0.16150932013988495, "learning_rate": 0.0001515940700270808, "loss": 0.5349, "step": 2631 }, { "epoch": 0.7899159663865546, "grad_norm": 0.15071119368076324, "learning_rate": 0.00015154919525257396, "loss": 0.4644, "step": 2632 }, { "epoch": 0.7902160864345739, "grad_norm": 0.13300053775310516, "learning_rate": 0.0001515043063361365, "loss": 0.4493, "step": 2633 }, { "epoch": 0.790516206482593, "grad_norm": 0.15277032554149628, "learning_rate": 0.00015145940329008316, "loss": 0.4793, "step": 2634 }, { "epoch": 0.7908163265306123, "grad_norm": 0.13567908108234406, "learning_rate": 0.00015141448612673262, "loss": 0.4319, "step": 2635 }, { "epoch": 0.7911164465786314, "grad_norm": 0.18850558996200562, "learning_rate": 0.00015136955485840737, "loss": 0.4799, "step": 2636 }, { "epoch": 0.7914165666266506, "grad_norm": 0.1616249531507492, "learning_rate": 0.00015132460949743374, "loss": 0.4601, "step": 2637 }, { "epoch": 0.7917166866746699, "grad_norm": 0.14946779608726501, "learning_rate": 0.000151279650056142, "loss": 0.4606, "step": 2638 }, { "epoch": 0.792016806722689, "grad_norm": 0.14779013395309448, "learning_rate": 0.0001512346765468663, "loss": 0.4524, "step": 2639 }, { "epoch": 0.7923169267707083, "grad_norm": 0.14556260406970978, "learning_rate": 0.00015118968898194458, "loss": 0.4458, "step": 2640 }, { "epoch": 0.7926170468187275, "grad_norm": 0.13448557257652283, "learning_rate": 0.00015114468737371866, "loss": 0.4144, "step": 2641 }, { "epoch": 0.7929171668667467, "grad_norm": 0.14961867034435272, "learning_rate": 0.0001510996717345342, "loss": 0.4782, "step": 2642 }, { "epoch": 0.7932172869147659, "grad_norm": 0.1369171291589737, "learning_rate": 0.00015105464207674073, "loss": 0.4393, "step": 2643 }, { "epoch": 0.7935174069627852, "grad_norm": 0.1466313898563385, "learning_rate": 0.00015100959841269168, "loss": 0.4486, "step": 2644 }, { "epoch": 0.7938175270108043, "grad_norm": 0.17291010916233063, "learning_rate": 0.00015096454075474416, "loss": 0.4699, "step": 2645 }, { "epoch": 0.7941176470588235, "grad_norm": 0.1477946639060974, "learning_rate": 0.0001509194691152593, "loss": 0.4767, "step": 2646 }, { "epoch": 0.7944177671068428, "grad_norm": 0.14116594195365906, "learning_rate": 0.00015087438350660194, "loss": 0.439, "step": 2647 }, { "epoch": 0.7947178871548619, "grad_norm": 0.1355549544095993, "learning_rate": 0.0001508292839411408, "loss": 0.4093, "step": 2648 }, { "epoch": 0.7950180072028812, "grad_norm": 0.15723030269145966, "learning_rate": 0.00015078417043124849, "loss": 0.5582, "step": 2649 }, { "epoch": 0.7953181272509003, "grad_norm": 0.2367934137582779, "learning_rate": 0.00015073904298930132, "loss": 0.4499, "step": 2650 }, { "epoch": 0.7956182472989196, "grad_norm": 0.16094963252544403, "learning_rate": 0.00015069390162767953, "loss": 0.4269, "step": 2651 }, { "epoch": 0.7959183673469388, "grad_norm": 0.13712561130523682, "learning_rate": 0.0001506487463587671, "loss": 0.4672, "step": 2652 }, { "epoch": 0.7962184873949579, "grad_norm": 0.17767317593097687, "learning_rate": 0.00015060357719495188, "loss": 0.4718, "step": 2653 }, { "epoch": 0.7965186074429772, "grad_norm": 0.14278756082057953, "learning_rate": 0.0001505583941486255, "loss": 0.4752, "step": 2654 }, { "epoch": 0.7968187274909964, "grad_norm": 0.14380548894405365, "learning_rate": 0.00015051319723218343, "loss": 0.4651, "step": 2655 }, { "epoch": 0.7971188475390156, "grad_norm": 0.13241620361804962, "learning_rate": 0.0001504679864580249, "loss": 0.4127, "step": 2656 }, { "epoch": 0.7974189675870348, "grad_norm": 0.14332759380340576, "learning_rate": 0.000150422761838553, "loss": 0.4809, "step": 2657 }, { "epoch": 0.7977190876350541, "grad_norm": 0.19808930158615112, "learning_rate": 0.00015037752338617456, "loss": 0.3853, "step": 2658 }, { "epoch": 0.7980192076830732, "grad_norm": 0.14704017341136932, "learning_rate": 0.00015033227111330022, "loss": 0.4682, "step": 2659 }, { "epoch": 0.7983193277310925, "grad_norm": 0.12785203754901886, "learning_rate": 0.00015028700503234447, "loss": 0.4025, "step": 2660 }, { "epoch": 0.7986194477791116, "grad_norm": 0.13784854114055634, "learning_rate": 0.0001502417251557255, "loss": 0.4385, "step": 2661 }, { "epoch": 0.7989195678271308, "grad_norm": 0.14797067642211914, "learning_rate": 0.0001501964314958653, "loss": 0.4798, "step": 2662 }, { "epoch": 0.7992196878751501, "grad_norm": 0.14484858512878418, "learning_rate": 0.00015015112406518974, "loss": 0.4767, "step": 2663 }, { "epoch": 0.7995198079231692, "grad_norm": 0.16087014973163605, "learning_rate": 0.00015010580287612833, "loss": 0.5004, "step": 2664 }, { "epoch": 0.7998199279711885, "grad_norm": 0.1416206955909729, "learning_rate": 0.00015006046794111447, "loss": 0.477, "step": 2665 }, { "epoch": 0.8001200480192077, "grad_norm": 0.13713401556015015, "learning_rate": 0.00015001511927258522, "loss": 0.4467, "step": 2666 }, { "epoch": 0.8004201680672269, "grad_norm": 0.14090123772621155, "learning_rate": 0.00014996975688298152, "loss": 0.4527, "step": 2667 }, { "epoch": 0.8007202881152461, "grad_norm": 0.1869259923696518, "learning_rate": 0.000149924380784748, "loss": 0.5236, "step": 2668 }, { "epoch": 0.8010204081632653, "grad_norm": 0.14781752228736877, "learning_rate": 0.00014987899099033308, "loss": 0.5191, "step": 2669 }, { "epoch": 0.8013205282112845, "grad_norm": 0.17033900320529938, "learning_rate": 0.00014983358751218892, "loss": 0.4907, "step": 2670 }, { "epoch": 0.8016206482593037, "grad_norm": 0.1332671046257019, "learning_rate": 0.00014978817036277145, "loss": 0.4062, "step": 2671 }, { "epoch": 0.801920768307323, "grad_norm": 0.14560440182685852, "learning_rate": 0.0001497427395545403, "loss": 0.4512, "step": 2672 }, { "epoch": 0.8022208883553421, "grad_norm": 0.15215948224067688, "learning_rate": 0.00014969729509995897, "loss": 0.5081, "step": 2673 }, { "epoch": 0.8025210084033614, "grad_norm": 0.16118304431438446, "learning_rate": 0.0001496518370114946, "loss": 0.4843, "step": 2674 }, { "epoch": 0.8028211284513805, "grad_norm": 0.12681308388710022, "learning_rate": 0.00014960636530161807, "loss": 0.4165, "step": 2675 }, { "epoch": 0.8031212484993998, "grad_norm": 0.14559060335159302, "learning_rate": 0.00014956087998280402, "loss": 0.4896, "step": 2676 }, { "epoch": 0.803421368547419, "grad_norm": 0.14230754971504211, "learning_rate": 0.00014951538106753088, "loss": 0.4769, "step": 2677 }, { "epoch": 0.8037214885954381, "grad_norm": 0.13430102169513702, "learning_rate": 0.00014946986856828067, "loss": 0.3842, "step": 2678 }, { "epoch": 0.8040216086434574, "grad_norm": 0.1401878446340561, "learning_rate": 0.0001494243424975393, "loss": 0.4326, "step": 2679 }, { "epoch": 0.8043217286914766, "grad_norm": 0.14194200932979584, "learning_rate": 0.00014937880286779629, "loss": 0.4497, "step": 2680 }, { "epoch": 0.8046218487394958, "grad_norm": 0.14997471868991852, "learning_rate": 0.0001493332496915449, "loss": 0.4537, "step": 2681 }, { "epoch": 0.804921968787515, "grad_norm": 0.1459532380104065, "learning_rate": 0.00014928768298128216, "loss": 0.4386, "step": 2682 }, { "epoch": 0.8052220888355343, "grad_norm": 0.14103145897388458, "learning_rate": 0.00014924210274950875, "loss": 0.4325, "step": 2683 }, { "epoch": 0.8055222088835534, "grad_norm": 0.14490856230258942, "learning_rate": 0.00014919650900872909, "loss": 0.456, "step": 2684 }, { "epoch": 0.8058223289315727, "grad_norm": 0.15141165256500244, "learning_rate": 0.00014915090177145131, "loss": 0.4898, "step": 2685 }, { "epoch": 0.8061224489795918, "grad_norm": 0.1545751988887787, "learning_rate": 0.0001491052810501872, "loss": 0.4709, "step": 2686 }, { "epoch": 0.806422569027611, "grad_norm": 0.14902053773403168, "learning_rate": 0.0001490596468574523, "loss": 0.4529, "step": 2687 }, { "epoch": 0.8067226890756303, "grad_norm": 0.14312995970249176, "learning_rate": 0.00014901399920576585, "loss": 0.4168, "step": 2688 }, { "epoch": 0.8070228091236494, "grad_norm": 0.14048358798027039, "learning_rate": 0.00014896833810765074, "loss": 0.4396, "step": 2689 }, { "epoch": 0.8073229291716687, "grad_norm": 0.12291613966226578, "learning_rate": 0.00014892266357563358, "loss": 0.3882, "step": 2690 }, { "epoch": 0.8076230492196879, "grad_norm": 0.1517232358455658, "learning_rate": 0.0001488769756222446, "loss": 0.4675, "step": 2691 }, { "epoch": 0.8079231692677071, "grad_norm": 0.14395961165428162, "learning_rate": 0.0001488312742600179, "loss": 0.3948, "step": 2692 }, { "epoch": 0.8082232893157263, "grad_norm": 0.15397968888282776, "learning_rate": 0.00014878555950149095, "loss": 0.4186, "step": 2693 }, { "epoch": 0.8085234093637454, "grad_norm": 0.14338110387325287, "learning_rate": 0.00014873983135920517, "loss": 0.4721, "step": 2694 }, { "epoch": 0.8088235294117647, "grad_norm": 0.14677682518959045, "learning_rate": 0.00014869408984570552, "loss": 0.4791, "step": 2695 }, { "epoch": 0.8091236494597839, "grad_norm": 0.17025801539421082, "learning_rate": 0.00014864833497354074, "loss": 0.4945, "step": 2696 }, { "epoch": 0.8094237695078031, "grad_norm": 0.14594808220863342, "learning_rate": 0.00014860256675526304, "loss": 0.4342, "step": 2697 }, { "epoch": 0.8097238895558223, "grad_norm": 0.14775292575359344, "learning_rate": 0.00014855678520342847, "loss": 0.4861, "step": 2698 }, { "epoch": 0.8100240096038416, "grad_norm": 0.17930883169174194, "learning_rate": 0.00014851099033059666, "loss": 0.4628, "step": 2699 }, { "epoch": 0.8103241296518607, "grad_norm": 0.14827530086040497, "learning_rate": 0.0001484651821493309, "loss": 0.4498, "step": 2700 }, { "epoch": 0.81062424969988, "grad_norm": 0.1567845195531845, "learning_rate": 0.00014841936067219814, "loss": 0.4385, "step": 2701 }, { "epoch": 0.8109243697478992, "grad_norm": 0.16922719776630402, "learning_rate": 0.000148373525911769, "loss": 0.4984, "step": 2702 }, { "epoch": 0.8112244897959183, "grad_norm": 0.14715611934661865, "learning_rate": 0.00014832767788061773, "loss": 0.4684, "step": 2703 }, { "epoch": 0.8115246098439376, "grad_norm": 0.1670290231704712, "learning_rate": 0.00014828181659132215, "loss": 0.4803, "step": 2704 }, { "epoch": 0.8118247298919568, "grad_norm": 0.1431550681591034, "learning_rate": 0.00014823594205646385, "loss": 0.452, "step": 2705 }, { "epoch": 0.812124849939976, "grad_norm": 0.1359039843082428, "learning_rate": 0.00014819005428862788, "loss": 0.4049, "step": 2706 }, { "epoch": 0.8124249699879952, "grad_norm": 0.1706361174583435, "learning_rate": 0.0001481441533004032, "loss": 0.4852, "step": 2707 }, { "epoch": 0.8127250900360145, "grad_norm": 0.14390254020690918, "learning_rate": 0.000148098239104382, "loss": 0.4607, "step": 2708 }, { "epoch": 0.8130252100840336, "grad_norm": 0.14577381312847137, "learning_rate": 0.00014805231171316046, "loss": 0.5027, "step": 2709 }, { "epoch": 0.8133253301320528, "grad_norm": 0.14503417909145355, "learning_rate": 0.0001480063711393382, "loss": 0.4609, "step": 2710 }, { "epoch": 0.813625450180072, "grad_norm": 0.15241298079490662, "learning_rate": 0.00014796041739551852, "loss": 0.4392, "step": 2711 }, { "epoch": 0.8139255702280912, "grad_norm": 0.15596158802509308, "learning_rate": 0.00014791445049430825, "loss": 0.4462, "step": 2712 }, { "epoch": 0.8142256902761105, "grad_norm": 0.1656380444765091, "learning_rate": 0.00014786847044831792, "loss": 0.516, "step": 2713 }, { "epoch": 0.8145258103241296, "grad_norm": 0.1357385665178299, "learning_rate": 0.00014782247727016155, "loss": 0.4161, "step": 2714 }, { "epoch": 0.8148259303721489, "grad_norm": 0.23623062670230865, "learning_rate": 0.00014777647097245696, "loss": 0.5041, "step": 2715 }, { "epoch": 0.8151260504201681, "grad_norm": 0.13372428715229034, "learning_rate": 0.00014773045156782537, "loss": 0.4487, "step": 2716 }, { "epoch": 0.8154261704681873, "grad_norm": 0.13498467206954956, "learning_rate": 0.0001476844190688917, "loss": 0.4164, "step": 2717 }, { "epoch": 0.8157262905162065, "grad_norm": 0.13854321837425232, "learning_rate": 0.00014763837348828442, "loss": 0.4769, "step": 2718 }, { "epoch": 0.8160264105642256, "grad_norm": 0.14869745075702667, "learning_rate": 0.00014759231483863568, "loss": 0.4603, "step": 2719 }, { "epoch": 0.8163265306122449, "grad_norm": 0.1487181931734085, "learning_rate": 0.00014754624313258102, "loss": 0.5091, "step": 2720 }, { "epoch": 0.8166266506602641, "grad_norm": 0.14438748359680176, "learning_rate": 0.00014750015838275977, "loss": 0.4928, "step": 2721 }, { "epoch": 0.8169267707082833, "grad_norm": 0.1376090794801712, "learning_rate": 0.00014745406060181476, "loss": 0.4515, "step": 2722 }, { "epoch": 0.8172268907563025, "grad_norm": 0.2375035583972931, "learning_rate": 0.00014740794980239231, "loss": 0.4949, "step": 2723 }, { "epoch": 0.8175270108043218, "grad_norm": 0.1473246067762375, "learning_rate": 0.00014736182599714247, "loss": 0.4718, "step": 2724 }, { "epoch": 0.8178271308523409, "grad_norm": 0.22862257063388824, "learning_rate": 0.00014731568919871873, "loss": 0.4534, "step": 2725 }, { "epoch": 0.8181272509003601, "grad_norm": 0.1518033742904663, "learning_rate": 0.00014726953941977824, "loss": 0.4403, "step": 2726 }, { "epoch": 0.8184273709483794, "grad_norm": 0.13525591790676117, "learning_rate": 0.0001472233766729816, "loss": 0.4208, "step": 2727 }, { "epoch": 0.8187274909963985, "grad_norm": 0.16344895958900452, "learning_rate": 0.00014717720097099307, "loss": 0.5802, "step": 2728 }, { "epoch": 0.8190276110444178, "grad_norm": 0.13552914559841156, "learning_rate": 0.0001471310123264804, "loss": 0.436, "step": 2729 }, { "epoch": 0.819327731092437, "grad_norm": 0.1389494091272354, "learning_rate": 0.00014708481075211498, "loss": 0.4497, "step": 2730 }, { "epoch": 0.8196278511404562, "grad_norm": 0.13997599482536316, "learning_rate": 0.00014703859626057157, "loss": 0.4735, "step": 2731 }, { "epoch": 0.8199279711884754, "grad_norm": 0.21648424863815308, "learning_rate": 0.00014699236886452866, "loss": 0.5035, "step": 2732 }, { "epoch": 0.8202280912364946, "grad_norm": 0.14384159445762634, "learning_rate": 0.00014694612857666823, "loss": 0.4853, "step": 2733 }, { "epoch": 0.8205282112845138, "grad_norm": 0.14302153885364532, "learning_rate": 0.00014689987540967574, "loss": 0.4665, "step": 2734 }, { "epoch": 0.820828331332533, "grad_norm": 0.13343827426433563, "learning_rate": 0.00014685360937624018, "loss": 0.405, "step": 2735 }, { "epoch": 0.8211284513805522, "grad_norm": 0.1506120264530182, "learning_rate": 0.00014680733048905414, "loss": 0.4566, "step": 2736 }, { "epoch": 0.8214285714285714, "grad_norm": 0.1620592176914215, "learning_rate": 0.00014676103876081368, "loss": 0.467, "step": 2737 }, { "epoch": 0.8217286914765907, "grad_norm": 0.14073067903518677, "learning_rate": 0.0001467147342042185, "loss": 0.4669, "step": 2738 }, { "epoch": 0.8220288115246098, "grad_norm": 0.13470537960529327, "learning_rate": 0.00014666841683197155, "loss": 0.4334, "step": 2739 }, { "epoch": 0.8223289315726291, "grad_norm": 0.14084503054618835, "learning_rate": 0.00014662208665677966, "loss": 0.4202, "step": 2740 }, { "epoch": 0.8226290516206483, "grad_norm": 0.15769736468791962, "learning_rate": 0.00014657574369135286, "loss": 0.4982, "step": 2741 }, { "epoch": 0.8229291716686674, "grad_norm": 0.1900629848241806, "learning_rate": 0.00014652938794840483, "loss": 0.4747, "step": 2742 }, { "epoch": 0.8232292917166867, "grad_norm": 0.13935410976409912, "learning_rate": 0.00014648301944065277, "loss": 0.4029, "step": 2743 }, { "epoch": 0.8235294117647058, "grad_norm": 0.14022275805473328, "learning_rate": 0.0001464366381808173, "loss": 0.4784, "step": 2744 }, { "epoch": 0.8238295318127251, "grad_norm": 0.1625329852104187, "learning_rate": 0.00014639024418162263, "loss": 0.4318, "step": 2745 }, { "epoch": 0.8241296518607443, "grad_norm": 0.15212495625019073, "learning_rate": 0.00014634383745579642, "loss": 0.4666, "step": 2746 }, { "epoch": 0.8244297719087635, "grad_norm": 0.13738644123077393, "learning_rate": 0.0001462974180160698, "loss": 0.4315, "step": 2747 }, { "epoch": 0.8247298919567827, "grad_norm": 0.14229655265808105, "learning_rate": 0.00014625098587517737, "loss": 0.4678, "step": 2748 }, { "epoch": 0.825030012004802, "grad_norm": 0.14815521240234375, "learning_rate": 0.00014620454104585738, "loss": 0.4424, "step": 2749 }, { "epoch": 0.8253301320528211, "grad_norm": 0.14716976881027222, "learning_rate": 0.0001461580835408513, "loss": 0.4756, "step": 2750 }, { "epoch": 0.8256302521008403, "grad_norm": 0.14700216054916382, "learning_rate": 0.00014611161337290435, "loss": 0.4693, "step": 2751 }, { "epoch": 0.8259303721488596, "grad_norm": 0.13168157637119293, "learning_rate": 0.00014606513055476496, "loss": 0.4187, "step": 2752 }, { "epoch": 0.8262304921968787, "grad_norm": 0.1525358408689499, "learning_rate": 0.00014601863509918525, "loss": 0.4878, "step": 2753 }, { "epoch": 0.826530612244898, "grad_norm": 0.14710712432861328, "learning_rate": 0.00014597212701892065, "loss": 0.4692, "step": 2754 }, { "epoch": 0.8268307322929171, "grad_norm": 0.14061634242534637, "learning_rate": 0.00014592560632673015, "loss": 0.4021, "step": 2755 }, { "epoch": 0.8271308523409364, "grad_norm": 0.14602655172348022, "learning_rate": 0.00014587907303537616, "loss": 0.4828, "step": 2756 }, { "epoch": 0.8274309723889556, "grad_norm": 0.1381726861000061, "learning_rate": 0.00014583252715762455, "loss": 0.4516, "step": 2757 }, { "epoch": 0.8277310924369747, "grad_norm": 0.18605683743953705, "learning_rate": 0.00014578596870624467, "loss": 0.4707, "step": 2758 }, { "epoch": 0.828031212484994, "grad_norm": 0.1509706825017929, "learning_rate": 0.0001457393976940093, "loss": 0.4889, "step": 2759 }, { "epoch": 0.8283313325330132, "grad_norm": 0.14689093828201294, "learning_rate": 0.00014569281413369462, "loss": 0.4191, "step": 2760 }, { "epoch": 0.8286314525810324, "grad_norm": 0.14853395521640778, "learning_rate": 0.00014564621803808033, "loss": 0.4641, "step": 2761 }, { "epoch": 0.8289315726290516, "grad_norm": 0.7913419604301453, "learning_rate": 0.00014559960941994954, "loss": 0.4879, "step": 2762 }, { "epoch": 0.8292316926770709, "grad_norm": 0.1619863659143448, "learning_rate": 0.00014555298829208876, "loss": 0.4998, "step": 2763 }, { "epoch": 0.82953181272509, "grad_norm": 0.14276006817817688, "learning_rate": 0.00014550635466728802, "loss": 0.457, "step": 2764 }, { "epoch": 0.8298319327731093, "grad_norm": 0.13974639773368835, "learning_rate": 0.00014545970855834063, "loss": 0.4358, "step": 2765 }, { "epoch": 0.8301320528211285, "grad_norm": 0.14814308285713196, "learning_rate": 0.0001454130499780435, "loss": 0.4754, "step": 2766 }, { "epoch": 0.8304321728691476, "grad_norm": 0.14188772439956665, "learning_rate": 0.0001453663789391968, "loss": 0.4499, "step": 2767 }, { "epoch": 0.8307322929171669, "grad_norm": 0.20982439815998077, "learning_rate": 0.0001453196954546043, "loss": 0.4604, "step": 2768 }, { "epoch": 0.831032412965186, "grad_norm": 0.1626424789428711, "learning_rate": 0.00014527299953707296, "loss": 0.47, "step": 2769 }, { "epoch": 0.8313325330132053, "grad_norm": 0.2350388914346695, "learning_rate": 0.00014522629119941333, "loss": 0.4388, "step": 2770 }, { "epoch": 0.8316326530612245, "grad_norm": 0.14766353368759155, "learning_rate": 0.00014517957045443933, "loss": 0.4693, "step": 2771 }, { "epoch": 0.8319327731092437, "grad_norm": 0.25449541211128235, "learning_rate": 0.0001451328373149682, "loss": 0.5037, "step": 2772 }, { "epoch": 0.8322328931572629, "grad_norm": 0.14723801612854004, "learning_rate": 0.00014508609179382067, "loss": 0.4968, "step": 2773 }, { "epoch": 0.8325330132052821, "grad_norm": 0.1688634157180786, "learning_rate": 0.00014503933390382084, "loss": 0.4964, "step": 2774 }, { "epoch": 0.8328331332533013, "grad_norm": 0.1697065532207489, "learning_rate": 0.00014499256365779623, "loss": 0.4478, "step": 2775 }, { "epoch": 0.8331332533013205, "grad_norm": 0.1368568390607834, "learning_rate": 0.00014494578106857768, "loss": 0.4438, "step": 2776 }, { "epoch": 0.8334333733493398, "grad_norm": 2.361243724822998, "learning_rate": 0.00014489898614899945, "loss": 0.4527, "step": 2777 }, { "epoch": 0.8337334933973589, "grad_norm": 0.14673811197280884, "learning_rate": 0.0001448521789118992, "loss": 0.4258, "step": 2778 }, { "epoch": 0.8340336134453782, "grad_norm": 0.14182722568511963, "learning_rate": 0.00014480535937011802, "loss": 0.4238, "step": 2779 }, { "epoch": 0.8343337334933973, "grad_norm": 14.291014671325684, "learning_rate": 0.00014475852753650023, "loss": 1.1137, "step": 2780 }, { "epoch": 0.8346338535414166, "grad_norm": 0.15664032101631165, "learning_rate": 0.00014471168342389363, "loss": 0.4579, "step": 2781 }, { "epoch": 0.8349339735894358, "grad_norm": 0.18944329023361206, "learning_rate": 0.00014466482704514938, "loss": 0.4783, "step": 2782 }, { "epoch": 0.8352340936374549, "grad_norm": 0.20190639793872833, "learning_rate": 0.000144617958413122, "loss": 0.4592, "step": 2783 }, { "epoch": 0.8355342136854742, "grad_norm": 0.19092018902301788, "learning_rate": 0.00014457107754066933, "loss": 0.4655, "step": 2784 }, { "epoch": 0.8358343337334934, "grad_norm": 0.1797829419374466, "learning_rate": 0.0001445241844406526, "loss": 0.4773, "step": 2785 }, { "epoch": 0.8361344537815126, "grad_norm": 0.16416847705841064, "learning_rate": 0.00014447727912593643, "loss": 0.4721, "step": 2786 }, { "epoch": 0.8364345738295318, "grad_norm": 0.15211446583271027, "learning_rate": 0.00014443036160938872, "loss": 0.4339, "step": 2787 }, { "epoch": 0.8367346938775511, "grad_norm": 0.14469176530838013, "learning_rate": 0.00014438343190388076, "loss": 0.4558, "step": 2788 }, { "epoch": 0.8370348139255702, "grad_norm": 0.16924558579921722, "learning_rate": 0.00014433649002228721, "loss": 0.4976, "step": 2789 }, { "epoch": 0.8373349339735895, "grad_norm": 0.18245111405849457, "learning_rate": 0.000144289535977486, "loss": 0.4133, "step": 2790 }, { "epoch": 0.8376350540216086, "grad_norm": 0.4985876679420471, "learning_rate": 0.00014424256978235847, "loss": 0.4933, "step": 2791 }, { "epoch": 0.8379351740696278, "grad_norm": 0.16431467235088348, "learning_rate": 0.0001441955914497892, "loss": 0.4111, "step": 2792 }, { "epoch": 0.8382352941176471, "grad_norm": 0.24334603548049927, "learning_rate": 0.00014414860099266617, "loss": 0.4487, "step": 2793 }, { "epoch": 0.8385354141656662, "grad_norm": 0.15816469490528107, "learning_rate": 0.0001441015984238807, "loss": 0.475, "step": 2794 }, { "epoch": 0.8388355342136855, "grad_norm": 0.16716502606868744, "learning_rate": 0.00014405458375632744, "loss": 0.5115, "step": 2795 }, { "epoch": 0.8391356542617047, "grad_norm": 0.14759080111980438, "learning_rate": 0.00014400755700290423, "loss": 0.4338, "step": 2796 }, { "epoch": 0.8394357743097239, "grad_norm": 0.21350345015525818, "learning_rate": 0.00014396051817651238, "loss": 0.4318, "step": 2797 }, { "epoch": 0.8397358943577431, "grad_norm": 0.6919642090797424, "learning_rate": 0.0001439134672900565, "loss": 0.5268, "step": 2798 }, { "epoch": 0.8400360144057623, "grad_norm": 0.3596741259098053, "learning_rate": 0.00014386640435644436, "loss": 0.4604, "step": 2799 }, { "epoch": 0.8403361344537815, "grad_norm": 0.1968657374382019, "learning_rate": 0.00014381932938858718, "loss": 0.4816, "step": 2800 }, { "epoch": 0.8406362545018007, "grad_norm": 0.14838272333145142, "learning_rate": 0.00014377224239939945, "loss": 0.4707, "step": 2801 }, { "epoch": 0.84093637454982, "grad_norm": 0.15526807308197021, "learning_rate": 0.00014372514340179894, "loss": 0.4758, "step": 2802 }, { "epoch": 0.8412364945978391, "grad_norm": 0.18176786601543427, "learning_rate": 0.00014367803240870672, "loss": 0.4563, "step": 2803 }, { "epoch": 0.8415366146458584, "grad_norm": 0.14367863535881042, "learning_rate": 0.00014363090943304713, "loss": 0.4246, "step": 2804 }, { "epoch": 0.8418367346938775, "grad_norm": 0.16701172292232513, "learning_rate": 0.00014358377448774786, "loss": 0.4744, "step": 2805 }, { "epoch": 0.8421368547418968, "grad_norm": 0.14857949316501617, "learning_rate": 0.00014353662758573977, "loss": 0.4361, "step": 2806 }, { "epoch": 0.842436974789916, "grad_norm": 0.15754273533821106, "learning_rate": 0.00014348946873995718, "loss": 0.4611, "step": 2807 }, { "epoch": 0.8427370948379351, "grad_norm": 0.2516443431377411, "learning_rate": 0.00014344229796333746, "loss": 0.5025, "step": 2808 }, { "epoch": 0.8430372148859544, "grad_norm": 0.16553343832492828, "learning_rate": 0.00014339511526882144, "loss": 0.4528, "step": 2809 }, { "epoch": 0.8433373349339736, "grad_norm": 0.1507650464773178, "learning_rate": 0.0001433479206693532, "loss": 0.4749, "step": 2810 }, { "epoch": 0.8436374549819928, "grad_norm": 0.1556183397769928, "learning_rate": 0.00014330071417787988, "loss": 0.4826, "step": 2811 }, { "epoch": 0.843937575030012, "grad_norm": 0.15120598673820496, "learning_rate": 0.0001432534958073522, "loss": 0.4768, "step": 2812 }, { "epoch": 0.8442376950780313, "grad_norm": 0.1328650414943695, "learning_rate": 0.0001432062655707239, "loss": 0.4204, "step": 2813 }, { "epoch": 0.8445378151260504, "grad_norm": 0.14978034794330597, "learning_rate": 0.0001431590234809521, "loss": 0.4661, "step": 2814 }, { "epoch": 0.8448379351740696, "grad_norm": 0.15140162408351898, "learning_rate": 0.00014311176955099706, "loss": 0.441, "step": 2815 }, { "epoch": 0.8451380552220888, "grad_norm": 0.15080350637435913, "learning_rate": 0.0001430645037938224, "loss": 0.475, "step": 2816 }, { "epoch": 0.845438175270108, "grad_norm": 0.17098544538021088, "learning_rate": 0.0001430172262223949, "loss": 0.4928, "step": 2817 }, { "epoch": 0.8457382953181273, "grad_norm": 0.1530095487833023, "learning_rate": 0.00014296993684968467, "loss": 0.4843, "step": 2818 }, { "epoch": 0.8460384153661464, "grad_norm": 0.14818990230560303, "learning_rate": 0.00014292263568866498, "loss": 0.4748, "step": 2819 }, { "epoch": 0.8463385354141657, "grad_norm": 0.1625467985868454, "learning_rate": 0.0001428753227523124, "loss": 0.4293, "step": 2820 }, { "epoch": 0.8466386554621849, "grad_norm": 0.1551417112350464, "learning_rate": 0.0001428279980536066, "loss": 0.4932, "step": 2821 }, { "epoch": 0.8469387755102041, "grad_norm": 0.14325112104415894, "learning_rate": 0.00014278066160553065, "loss": 0.4272, "step": 2822 }, { "epoch": 0.8472388955582233, "grad_norm": 0.2540292739868164, "learning_rate": 0.00014273331342107073, "loss": 0.4257, "step": 2823 }, { "epoch": 0.8475390156062425, "grad_norm": 0.15317946672439575, "learning_rate": 0.0001426859535132163, "loss": 0.4679, "step": 2824 }, { "epoch": 0.8478391356542617, "grad_norm": 0.13195572793483734, "learning_rate": 0.00014263858189495995, "loss": 0.4019, "step": 2825 }, { "epoch": 0.8481392557022809, "grad_norm": 0.166551411151886, "learning_rate": 0.00014259119857929758, "loss": 0.5337, "step": 2826 }, { "epoch": 0.8484393757503002, "grad_norm": 0.13715970516204834, "learning_rate": 0.0001425438035792283, "loss": 0.4264, "step": 2827 }, { "epoch": 0.8487394957983193, "grad_norm": 0.12979838252067566, "learning_rate": 0.00014249639690775432, "loss": 0.3659, "step": 2828 }, { "epoch": 0.8490396158463386, "grad_norm": 0.14503979682922363, "learning_rate": 0.00014244897857788112, "loss": 0.4509, "step": 2829 }, { "epoch": 0.8493397358943577, "grad_norm": 0.14053593575954437, "learning_rate": 0.0001424015486026174, "loss": 0.4479, "step": 2830 }, { "epoch": 0.8496398559423769, "grad_norm": 0.22120803594589233, "learning_rate": 0.00014235410699497503, "loss": 0.5086, "step": 2831 }, { "epoch": 0.8499399759903962, "grad_norm": 0.13641591370105743, "learning_rate": 0.00014230665376796904, "loss": 0.4187, "step": 2832 }, { "epoch": 0.8502400960384153, "grad_norm": 0.14362581074237823, "learning_rate": 0.00014225918893461773, "loss": 0.4234, "step": 2833 }, { "epoch": 0.8505402160864346, "grad_norm": 0.18435567617416382, "learning_rate": 0.00014221171250794248, "loss": 0.4224, "step": 2834 }, { "epoch": 0.8508403361344538, "grad_norm": 0.13645349442958832, "learning_rate": 0.00014216422450096796, "loss": 0.4355, "step": 2835 }, { "epoch": 0.851140456182473, "grad_norm": 0.14655910432338715, "learning_rate": 0.0001421167249267219, "loss": 0.4681, "step": 2836 }, { "epoch": 0.8514405762304922, "grad_norm": 0.14065751433372498, "learning_rate": 0.00014206921379823528, "loss": 0.4638, "step": 2837 }, { "epoch": 0.8517406962785115, "grad_norm": 0.16344814002513885, "learning_rate": 0.00014202169112854224, "loss": 0.4572, "step": 2838 }, { "epoch": 0.8520408163265306, "grad_norm": 0.13833872973918915, "learning_rate": 0.00014197415693068015, "loss": 0.4055, "step": 2839 }, { "epoch": 0.8523409363745498, "grad_norm": 0.146205335855484, "learning_rate": 0.00014192661121768932, "loss": 0.4423, "step": 2840 }, { "epoch": 0.852641056422569, "grad_norm": 0.15868382155895233, "learning_rate": 0.00014187905400261353, "loss": 0.493, "step": 2841 }, { "epoch": 0.8529411764705882, "grad_norm": 0.15348432958126068, "learning_rate": 0.00014183148529849944, "loss": 0.4495, "step": 2842 }, { "epoch": 0.8532412965186075, "grad_norm": 0.1486390084028244, "learning_rate": 0.00014178390511839704, "loss": 0.4924, "step": 2843 }, { "epoch": 0.8535414165666266, "grad_norm": 0.14600692689418793, "learning_rate": 0.00014173631347535939, "loss": 0.4744, "step": 2844 }, { "epoch": 0.8538415366146459, "grad_norm": 0.13798721134662628, "learning_rate": 0.00014168871038244268, "loss": 0.4309, "step": 2845 }, { "epoch": 0.8541416566626651, "grad_norm": 0.1473832130432129, "learning_rate": 0.00014164109585270635, "loss": 0.4477, "step": 2846 }, { "epoch": 0.8544417767106842, "grad_norm": 0.5650750398635864, "learning_rate": 0.00014159346989921285, "loss": 0.398, "step": 2847 }, { "epoch": 0.8547418967587035, "grad_norm": 0.1595097780227661, "learning_rate": 0.0001415458325350278, "loss": 0.4217, "step": 2848 }, { "epoch": 0.8550420168067226, "grad_norm": 0.1487688422203064, "learning_rate": 0.00014149818377321997, "loss": 0.4468, "step": 2849 }, { "epoch": 0.8553421368547419, "grad_norm": 0.16221052408218384, "learning_rate": 0.0001414505236268613, "loss": 0.4624, "step": 2850 }, { "epoch": 0.8556422569027611, "grad_norm": 0.1519615799188614, "learning_rate": 0.00014140285210902676, "loss": 0.4465, "step": 2851 }, { "epoch": 0.8559423769507803, "grad_norm": 0.19140374660491943, "learning_rate": 0.0001413551692327945, "loss": 0.4556, "step": 2852 }, { "epoch": 0.8562424969987995, "grad_norm": 0.14043140411376953, "learning_rate": 0.00014130747501124572, "loss": 0.4532, "step": 2853 }, { "epoch": 0.8565426170468188, "grad_norm": 0.15206171572208405, "learning_rate": 0.00014125976945746486, "loss": 0.4832, "step": 2854 }, { "epoch": 0.8568427370948379, "grad_norm": 0.15809527039527893, "learning_rate": 0.00014121205258453933, "loss": 0.4899, "step": 2855 }, { "epoch": 0.8571428571428571, "grad_norm": 0.13327966630458832, "learning_rate": 0.00014116432440555978, "loss": 0.3969, "step": 2856 }, { "epoch": 0.8574429771908764, "grad_norm": 0.14684216678142548, "learning_rate": 0.0001411165849336198, "loss": 0.472, "step": 2857 }, { "epoch": 0.8577430972388955, "grad_norm": 0.17298445105552673, "learning_rate": 0.00014106883418181623, "loss": 0.4668, "step": 2858 }, { "epoch": 0.8580432172869148, "grad_norm": 0.16335934400558472, "learning_rate": 0.00014102107216324892, "loss": 0.4664, "step": 2859 }, { "epoch": 0.858343337334934, "grad_norm": 0.14026081562042236, "learning_rate": 0.00014097329889102084, "loss": 0.4154, "step": 2860 }, { "epoch": 0.8586434573829532, "grad_norm": 0.1268925815820694, "learning_rate": 0.00014092551437823803, "loss": 0.3778, "step": 2861 }, { "epoch": 0.8589435774309724, "grad_norm": 0.14247116446495056, "learning_rate": 0.00014087771863800964, "loss": 0.3996, "step": 2862 }, { "epoch": 0.8592436974789915, "grad_norm": 0.13496030867099762, "learning_rate": 0.00014082991168344784, "loss": 0.4283, "step": 2863 }, { "epoch": 0.8595438175270108, "grad_norm": 0.19215568900108337, "learning_rate": 0.00014078209352766795, "loss": 0.4934, "step": 2864 }, { "epoch": 0.85984393757503, "grad_norm": 0.1477733999490738, "learning_rate": 0.00014073426418378837, "loss": 0.4555, "step": 2865 }, { "epoch": 0.8601440576230492, "grad_norm": 0.14388087391853333, "learning_rate": 0.00014068642366493048, "loss": 0.461, "step": 2866 }, { "epoch": 0.8604441776710684, "grad_norm": 0.1384228765964508, "learning_rate": 0.00014063857198421876, "loss": 0.4369, "step": 2867 }, { "epoch": 0.8607442977190877, "grad_norm": 0.20080068707466125, "learning_rate": 0.0001405907091547808, "loss": 0.4404, "step": 2868 }, { "epoch": 0.8610444177671068, "grad_norm": 0.21571896970272064, "learning_rate": 0.00014054283518974726, "loss": 0.4584, "step": 2869 }, { "epoch": 0.8613445378151261, "grad_norm": 0.16102316975593567, "learning_rate": 0.00014049495010225174, "loss": 0.4849, "step": 2870 }, { "epoch": 0.8616446578631453, "grad_norm": 0.14411479234695435, "learning_rate": 0.00014044705390543104, "loss": 0.489, "step": 2871 }, { "epoch": 0.8619447779111644, "grad_norm": 0.1374804824590683, "learning_rate": 0.00014039914661242485, "loss": 0.411, "step": 2872 }, { "epoch": 0.8622448979591837, "grad_norm": 0.14753234386444092, "learning_rate": 0.00014035122823637607, "loss": 0.4614, "step": 2873 }, { "epoch": 0.8625450180072028, "grad_norm": 0.1469254344701767, "learning_rate": 0.00014030329879043048, "loss": 0.4451, "step": 2874 }, { "epoch": 0.8628451380552221, "grad_norm": 0.1511831283569336, "learning_rate": 0.000140255358287737, "loss": 0.4819, "step": 2875 }, { "epoch": 0.8631452581032413, "grad_norm": 0.1463584452867508, "learning_rate": 0.00014020740674144762, "loss": 0.4872, "step": 2876 }, { "epoch": 0.8634453781512605, "grad_norm": 0.13619887828826904, "learning_rate": 0.0001401594441647172, "loss": 0.4203, "step": 2877 }, { "epoch": 0.8637454981992797, "grad_norm": 0.19682137668132782, "learning_rate": 0.00014011147057070374, "loss": 0.4392, "step": 2878 }, { "epoch": 0.864045618247299, "grad_norm": 0.15196283161640167, "learning_rate": 0.00014006348597256832, "loss": 0.386, "step": 2879 }, { "epoch": 0.8643457382953181, "grad_norm": 0.15986262261867523, "learning_rate": 0.00014001549038347488, "loss": 0.4645, "step": 2880 }, { "epoch": 0.8646458583433373, "grad_norm": 0.13178189098834991, "learning_rate": 0.00013996748381659053, "loss": 0.3843, "step": 2881 }, { "epoch": 0.8649459783913566, "grad_norm": 0.151777982711792, "learning_rate": 0.00013991946628508524, "loss": 0.4774, "step": 2882 }, { "epoch": 0.8652460984393757, "grad_norm": 0.16395652294158936, "learning_rate": 0.00013987143780213216, "loss": 0.437, "step": 2883 }, { "epoch": 0.865546218487395, "grad_norm": 0.1446828693151474, "learning_rate": 0.00013982339838090728, "loss": 0.4669, "step": 2884 }, { "epoch": 0.8658463385354142, "grad_norm": 0.13735127449035645, "learning_rate": 0.0001397753480345897, "loss": 0.4115, "step": 2885 }, { "epoch": 0.8661464585834334, "grad_norm": 0.1632857322692871, "learning_rate": 0.00013972728677636145, "loss": 0.4536, "step": 2886 }, { "epoch": 0.8664465786314526, "grad_norm": 0.16722363233566284, "learning_rate": 0.00013967921461940762, "loss": 0.4238, "step": 2887 }, { "epoch": 0.8667466986794717, "grad_norm": 0.13403132557868958, "learning_rate": 0.00013963113157691628, "loss": 0.4369, "step": 2888 }, { "epoch": 0.867046818727491, "grad_norm": 0.14564616978168488, "learning_rate": 0.0001395830376620784, "loss": 0.4043, "step": 2889 }, { "epoch": 0.8673469387755102, "grad_norm": 0.1612880378961563, "learning_rate": 0.00013953493288808804, "loss": 0.4563, "step": 2890 }, { "epoch": 0.8676470588235294, "grad_norm": 0.15668343007564545, "learning_rate": 0.00013948681726814216, "loss": 0.5096, "step": 2891 }, { "epoch": 0.8679471788715486, "grad_norm": 0.1354898363351822, "learning_rate": 0.00013943869081544072, "loss": 0.4098, "step": 2892 }, { "epoch": 0.8682472989195679, "grad_norm": 0.1431705355644226, "learning_rate": 0.0001393905535431867, "loss": 0.4267, "step": 2893 }, { "epoch": 0.868547418967587, "grad_norm": 0.1419495940208435, "learning_rate": 0.000139342405464586, "loss": 0.4498, "step": 2894 }, { "epoch": 0.8688475390156063, "grad_norm": 0.12562808394432068, "learning_rate": 0.00013929424659284747, "loss": 0.3835, "step": 2895 }, { "epoch": 0.8691476590636255, "grad_norm": 0.14655275642871857, "learning_rate": 0.00013924607694118296, "loss": 0.4974, "step": 2896 }, { "epoch": 0.8694477791116446, "grad_norm": 0.1477247029542923, "learning_rate": 0.00013919789652280725, "loss": 0.4252, "step": 2897 }, { "epoch": 0.8697478991596639, "grad_norm": 0.14200358092784882, "learning_rate": 0.0001391497053509381, "loss": 0.4469, "step": 2898 }, { "epoch": 0.870048019207683, "grad_norm": 0.14584213495254517, "learning_rate": 0.00013910150343879622, "loss": 0.456, "step": 2899 }, { "epoch": 0.8703481392557023, "grad_norm": 0.13909918069839478, "learning_rate": 0.00013905329079960522, "loss": 0.4648, "step": 2900 }, { "epoch": 0.8706482593037215, "grad_norm": 0.146524578332901, "learning_rate": 0.00013900506744659168, "loss": 0.466, "step": 2901 }, { "epoch": 0.8709483793517407, "grad_norm": 0.12789376080036163, "learning_rate": 0.00013895683339298516, "loss": 0.3878, "step": 2902 }, { "epoch": 0.8712484993997599, "grad_norm": 0.13936883211135864, "learning_rate": 0.0001389085886520181, "loss": 0.4519, "step": 2903 }, { "epoch": 0.8715486194477791, "grad_norm": 0.13382497429847717, "learning_rate": 0.00013886033323692588, "loss": 0.4173, "step": 2904 }, { "epoch": 0.8718487394957983, "grad_norm": 0.13834607601165771, "learning_rate": 0.00013881206716094685, "loss": 0.4493, "step": 2905 }, { "epoch": 0.8721488595438175, "grad_norm": 0.13608871400356293, "learning_rate": 0.00013876379043732223, "loss": 0.4277, "step": 2906 }, { "epoch": 0.8724489795918368, "grad_norm": 0.15459437668323517, "learning_rate": 0.0001387155030792962, "loss": 0.4549, "step": 2907 }, { "epoch": 0.8727490996398559, "grad_norm": 0.1430646777153015, "learning_rate": 0.00013866720510011582, "loss": 0.4418, "step": 2908 }, { "epoch": 0.8730492196878752, "grad_norm": 0.17030729353427887, "learning_rate": 0.0001386188965130311, "loss": 0.4434, "step": 2909 }, { "epoch": 0.8733493397358943, "grad_norm": 0.1302502453327179, "learning_rate": 0.00013857057733129494, "loss": 0.3983, "step": 2910 }, { "epoch": 0.8736494597839136, "grad_norm": 0.15272179245948792, "learning_rate": 0.00013852224756816315, "loss": 0.5, "step": 2911 }, { "epoch": 0.8739495798319328, "grad_norm": 0.15235954523086548, "learning_rate": 0.00013847390723689454, "loss": 0.5164, "step": 2912 }, { "epoch": 0.8742496998799519, "grad_norm": 0.17023412883281708, "learning_rate": 0.00013842555635075056, "loss": 0.4858, "step": 2913 }, { "epoch": 0.8745498199279712, "grad_norm": 0.14470688998699188, "learning_rate": 0.00013837719492299587, "loss": 0.4179, "step": 2914 }, { "epoch": 0.8748499399759904, "grad_norm": 0.14784374833106995, "learning_rate": 0.00013832882296689776, "loss": 0.4445, "step": 2915 }, { "epoch": 0.8751500600240096, "grad_norm": 0.1371723860502243, "learning_rate": 0.00013828044049572658, "loss": 0.4134, "step": 2916 }, { "epoch": 0.8754501800720288, "grad_norm": 0.13509762287139893, "learning_rate": 0.00013823204752275547, "loss": 0.4147, "step": 2917 }, { "epoch": 0.8757503001200481, "grad_norm": 0.16309741139411926, "learning_rate": 0.00013818364406126054, "loss": 0.49, "step": 2918 }, { "epoch": 0.8760504201680672, "grad_norm": 0.2603866159915924, "learning_rate": 0.0001381352301245207, "loss": 0.437, "step": 2919 }, { "epoch": 0.8763505402160864, "grad_norm": 0.15486130118370056, "learning_rate": 0.00013808680572581776, "loss": 0.4294, "step": 2920 }, { "epoch": 0.8766506602641057, "grad_norm": 0.18155358731746674, "learning_rate": 0.0001380383708784364, "loss": 0.4407, "step": 2921 }, { "epoch": 0.8769507803121248, "grad_norm": 0.1427716165781021, "learning_rate": 0.00013798992559566414, "loss": 0.4445, "step": 2922 }, { "epoch": 0.8772509003601441, "grad_norm": 0.16872714459896088, "learning_rate": 0.0001379414698907914, "loss": 0.5084, "step": 2923 }, { "epoch": 0.8775510204081632, "grad_norm": 0.14170365035533905, "learning_rate": 0.00013789300377711148, "loss": 0.4457, "step": 2924 }, { "epoch": 0.8778511404561825, "grad_norm": 0.14954358339309692, "learning_rate": 0.0001378445272679205, "loss": 0.4339, "step": 2925 }, { "epoch": 0.8781512605042017, "grad_norm": 0.1393103152513504, "learning_rate": 0.00013779604037651738, "loss": 0.4561, "step": 2926 }, { "epoch": 0.8784513805522209, "grad_norm": 0.2205260545015335, "learning_rate": 0.00013774754311620394, "loss": 0.4564, "step": 2927 }, { "epoch": 0.8787515006002401, "grad_norm": 0.13299301266670227, "learning_rate": 0.00013769903550028492, "loss": 0.4332, "step": 2928 }, { "epoch": 0.8790516206482593, "grad_norm": 0.13900135457515717, "learning_rate": 0.0001376505175420678, "loss": 0.4416, "step": 2929 }, { "epoch": 0.8793517406962785, "grad_norm": 0.14976787567138672, "learning_rate": 0.0001376019892548629, "loss": 0.5004, "step": 2930 }, { "epoch": 0.8796518607442977, "grad_norm": 0.14274270832538605, "learning_rate": 0.00013755345065198342, "loss": 0.4604, "step": 2931 }, { "epoch": 0.879951980792317, "grad_norm": 0.14141829311847687, "learning_rate": 0.00013750490174674535, "loss": 0.4291, "step": 2932 }, { "epoch": 0.8802521008403361, "grad_norm": 0.15518976747989655, "learning_rate": 0.00013745634255246756, "loss": 0.5244, "step": 2933 }, { "epoch": 0.8805522208883554, "grad_norm": 0.13994963467121124, "learning_rate": 0.0001374077730824717, "loss": 0.442, "step": 2934 }, { "epoch": 0.8808523409363745, "grad_norm": 0.18987542390823364, "learning_rate": 0.00013735919335008216, "loss": 0.5006, "step": 2935 }, { "epoch": 0.8811524609843937, "grad_norm": 0.1415923684835434, "learning_rate": 0.00013731060336862635, "loss": 0.4828, "step": 2936 }, { "epoch": 0.881452581032413, "grad_norm": 0.350637823343277, "learning_rate": 0.00013726200315143436, "loss": 0.452, "step": 2937 }, { "epoch": 0.8817527010804321, "grad_norm": 0.14160653948783875, "learning_rate": 0.00013721339271183906, "loss": 0.4292, "step": 2938 }, { "epoch": 0.8820528211284514, "grad_norm": 0.15507519245147705, "learning_rate": 0.00013716477206317614, "loss": 0.411, "step": 2939 }, { "epoch": 0.8823529411764706, "grad_norm": 0.1486891359090805, "learning_rate": 0.00013711614121878423, "loss": 0.4525, "step": 2940 }, { "epoch": 0.8826530612244898, "grad_norm": 0.14755134284496307, "learning_rate": 0.00013706750019200448, "loss": 0.4768, "step": 2941 }, { "epoch": 0.882953181272509, "grad_norm": 0.15835875272750854, "learning_rate": 0.00013701884899618117, "loss": 0.4316, "step": 2942 }, { "epoch": 0.8832533013205283, "grad_norm": 0.1589801013469696, "learning_rate": 0.00013697018764466108, "loss": 0.514, "step": 2943 }, { "epoch": 0.8835534213685474, "grad_norm": 0.14008241891860962, "learning_rate": 0.00013692151615079395, "loss": 0.4373, "step": 2944 }, { "epoch": 0.8838535414165666, "grad_norm": 0.15842609107494354, "learning_rate": 0.0001368728345279322, "loss": 0.488, "step": 2945 }, { "epoch": 0.8841536614645858, "grad_norm": 0.15005439519882202, "learning_rate": 0.00013682414278943115, "loss": 0.458, "step": 2946 }, { "epoch": 0.884453781512605, "grad_norm": 0.22050262987613678, "learning_rate": 0.00013677544094864872, "loss": 0.4706, "step": 2947 }, { "epoch": 0.8847539015606243, "grad_norm": 0.14626656472682953, "learning_rate": 0.0001367267290189458, "loss": 0.4766, "step": 2948 }, { "epoch": 0.8850540216086434, "grad_norm": 0.14207623898983002, "learning_rate": 0.00013667800701368585, "loss": 0.4309, "step": 2949 }, { "epoch": 0.8853541416566627, "grad_norm": 0.14198783040046692, "learning_rate": 0.00013662927494623528, "loss": 0.4418, "step": 2950 }, { "epoch": 0.8856542617046819, "grad_norm": 0.13123789429664612, "learning_rate": 0.00013658053282996315, "loss": 0.4302, "step": 2951 }, { "epoch": 0.885954381752701, "grad_norm": 0.1482761651277542, "learning_rate": 0.00013653178067824127, "loss": 0.4908, "step": 2952 }, { "epoch": 0.8862545018007203, "grad_norm": 0.14108116924762726, "learning_rate": 0.0001364830185044443, "loss": 0.4474, "step": 2953 }, { "epoch": 0.8865546218487395, "grad_norm": 0.1418759524822235, "learning_rate": 0.00013643424632194947, "loss": 0.4435, "step": 2954 }, { "epoch": 0.8868547418967587, "grad_norm": 0.16263645887374878, "learning_rate": 0.000136385464144137, "loss": 0.4342, "step": 2955 }, { "epoch": 0.8871548619447779, "grad_norm": 0.1338297724723816, "learning_rate": 0.00013633667198438963, "loss": 0.4227, "step": 2956 }, { "epoch": 0.8874549819927972, "grad_norm": 0.1473718285560608, "learning_rate": 0.00013628786985609296, "loss": 0.4202, "step": 2957 }, { "epoch": 0.8877551020408163, "grad_norm": 0.15010789036750793, "learning_rate": 0.00013623905777263528, "loss": 0.4522, "step": 2958 }, { "epoch": 0.8880552220888356, "grad_norm": 0.12539362907409668, "learning_rate": 0.0001361902357474077, "loss": 0.394, "step": 2959 }, { "epoch": 0.8883553421368547, "grad_norm": 0.14094272255897522, "learning_rate": 0.00013614140379380384, "loss": 0.4374, "step": 2960 }, { "epoch": 0.8886554621848739, "grad_norm": 0.16918066143989563, "learning_rate": 0.00013609256192522031, "loss": 0.4613, "step": 2961 }, { "epoch": 0.8889555822328932, "grad_norm": 0.2934674322605133, "learning_rate": 0.00013604371015505623, "loss": 0.4518, "step": 2962 }, { "epoch": 0.8892557022809123, "grad_norm": 0.17096976935863495, "learning_rate": 0.00013599484849671366, "loss": 0.4799, "step": 2963 }, { "epoch": 0.8895558223289316, "grad_norm": 0.1428793966770172, "learning_rate": 0.00013594597696359704, "loss": 0.4616, "step": 2964 }, { "epoch": 0.8898559423769508, "grad_norm": 0.14048418402671814, "learning_rate": 0.00013589709556911388, "loss": 0.4437, "step": 2965 }, { "epoch": 0.89015606242497, "grad_norm": 0.13983307778835297, "learning_rate": 0.00013584820432667414, "loss": 0.4463, "step": 2966 }, { "epoch": 0.8904561824729892, "grad_norm": 0.1343764066696167, "learning_rate": 0.00013579930324969064, "loss": 0.4309, "step": 2967 }, { "epoch": 0.8907563025210085, "grad_norm": 0.20858599245548248, "learning_rate": 0.00013575039235157874, "loss": 0.4996, "step": 2968 }, { "epoch": 0.8910564225690276, "grad_norm": 0.14643438160419464, "learning_rate": 0.00013570147164575666, "loss": 0.5124, "step": 2969 }, { "epoch": 0.8913565426170468, "grad_norm": 0.16413362324237823, "learning_rate": 0.00013565254114564522, "loss": 0.4909, "step": 2970 }, { "epoch": 0.891656662665066, "grad_norm": 0.1786835789680481, "learning_rate": 0.0001356036008646679, "loss": 0.494, "step": 2971 }, { "epoch": 0.8919567827130852, "grad_norm": 0.1439962089061737, "learning_rate": 0.00013555465081625093, "loss": 0.3975, "step": 2972 }, { "epoch": 0.8922569027611045, "grad_norm": 0.13219578564167023, "learning_rate": 0.0001355056910138232, "loss": 0.4205, "step": 2973 }, { "epoch": 0.8925570228091236, "grad_norm": 0.14857596158981323, "learning_rate": 0.00013545672147081633, "loss": 0.4722, "step": 2974 }, { "epoch": 0.8928571428571429, "grad_norm": 0.13910064101219177, "learning_rate": 0.00013540774220066444, "loss": 0.455, "step": 2975 }, { "epoch": 0.8931572629051621, "grad_norm": 0.14829768240451813, "learning_rate": 0.0001353587532168045, "loss": 0.4569, "step": 2976 }, { "epoch": 0.8934573829531812, "grad_norm": 0.1404755413532257, "learning_rate": 0.00013530975453267602, "loss": 0.4638, "step": 2977 }, { "epoch": 0.8937575030012005, "grad_norm": 0.1413242369890213, "learning_rate": 0.0001352607461617213, "loss": 0.4149, "step": 2978 }, { "epoch": 0.8940576230492197, "grad_norm": 0.1247020959854126, "learning_rate": 0.0001352117281173852, "loss": 0.3795, "step": 2979 }, { "epoch": 0.8943577430972389, "grad_norm": 0.15720205008983612, "learning_rate": 0.00013516270041311523, "loss": 0.4755, "step": 2980 }, { "epoch": 0.8946578631452581, "grad_norm": 0.2672662138938904, "learning_rate": 0.0001351136630623616, "loss": 0.428, "step": 2981 }, { "epoch": 0.8949579831932774, "grad_norm": 0.13200098276138306, "learning_rate": 0.00013506461607857716, "loss": 0.3911, "step": 2982 }, { "epoch": 0.8952581032412965, "grad_norm": 0.19307288527488708, "learning_rate": 0.00013501555947521734, "loss": 0.5166, "step": 2983 }, { "epoch": 0.8955582232893158, "grad_norm": 0.13294395804405212, "learning_rate": 0.0001349664932657403, "loss": 0.3987, "step": 2984 }, { "epoch": 0.8958583433373349, "grad_norm": 0.1404949575662613, "learning_rate": 0.0001349174174636068, "loss": 0.4471, "step": 2985 }, { "epoch": 0.8961584633853541, "grad_norm": 0.15460175275802612, "learning_rate": 0.00013486833208228017, "loss": 0.4506, "step": 2986 }, { "epoch": 0.8964585834333734, "grad_norm": 0.14664404094219208, "learning_rate": 0.00013481923713522646, "loss": 0.4175, "step": 2987 }, { "epoch": 0.8967587034813925, "grad_norm": 0.15789993107318878, "learning_rate": 0.0001347701326359143, "loss": 0.4846, "step": 2988 }, { "epoch": 0.8970588235294118, "grad_norm": 0.1591576188802719, "learning_rate": 0.00013472101859781497, "loss": 0.4632, "step": 2989 }, { "epoch": 0.897358943577431, "grad_norm": 0.16807615756988525, "learning_rate": 0.0001346718950344023, "loss": 0.4484, "step": 2990 }, { "epoch": 0.8976590636254502, "grad_norm": 0.312802791595459, "learning_rate": 0.00013462276195915277, "loss": 0.4931, "step": 2991 }, { "epoch": 0.8979591836734694, "grad_norm": 0.7276992201805115, "learning_rate": 0.00013457361938554557, "loss": 0.4803, "step": 2992 }, { "epoch": 0.8982593037214885, "grad_norm": 0.18568727374076843, "learning_rate": 0.00013452446732706226, "loss": 0.4683, "step": 2993 }, { "epoch": 0.8985594237695078, "grad_norm": 0.13656973838806152, "learning_rate": 0.00013447530579718727, "loss": 0.4395, "step": 2994 }, { "epoch": 0.898859543817527, "grad_norm": 0.20830455422401428, "learning_rate": 0.00013442613480940744, "loss": 0.4739, "step": 2995 }, { "epoch": 0.8991596638655462, "grad_norm": 0.1854456216096878, "learning_rate": 0.00013437695437721226, "loss": 0.4528, "step": 2996 }, { "epoch": 0.8994597839135654, "grad_norm": 0.17800886929035187, "learning_rate": 0.00013432776451409383, "loss": 0.4785, "step": 2997 }, { "epoch": 0.8997599039615847, "grad_norm": 0.14185847342014313, "learning_rate": 0.00013427856523354686, "loss": 0.4384, "step": 2998 }, { "epoch": 0.9000600240096038, "grad_norm": 0.14251980185508728, "learning_rate": 0.00013422935654906851, "loss": 0.4509, "step": 2999 }, { "epoch": 0.9003601440576231, "grad_norm": 0.13576701283454895, "learning_rate": 0.00013418013847415875, "loss": 0.4266, "step": 3000 }, { "epoch": 0.9006602641056423, "grad_norm": 0.1396171748638153, "learning_rate": 0.0001341309110223199, "loss": 0.4369, "step": 3001 }, { "epoch": 0.9009603841536614, "grad_norm": 0.1498328447341919, "learning_rate": 0.00013408167420705697, "loss": 0.4939, "step": 3002 }, { "epoch": 0.9012605042016807, "grad_norm": 0.1731022447347641, "learning_rate": 0.00013403242804187754, "loss": 0.4979, "step": 3003 }, { "epoch": 0.9015606242496998, "grad_norm": 0.6752466559410095, "learning_rate": 0.00013398317254029171, "loss": 0.4421, "step": 3004 }, { "epoch": 0.9018607442977191, "grad_norm": 0.17650246620178223, "learning_rate": 0.00013393390771581216, "loss": 0.4403, "step": 3005 }, { "epoch": 0.9021608643457383, "grad_norm": 0.25184255838394165, "learning_rate": 0.00013388463358195412, "loss": 0.4699, "step": 3006 }, { "epoch": 0.9024609843937575, "grad_norm": 0.20988434553146362, "learning_rate": 0.0001338353501522354, "loss": 0.4254, "step": 3007 }, { "epoch": 0.9027611044417767, "grad_norm": 3.6127822399139404, "learning_rate": 0.0001337860574401764, "loss": 0.4764, "step": 3008 }, { "epoch": 0.9030612244897959, "grad_norm": 0.16843774914741516, "learning_rate": 0.0001337367554592999, "loss": 0.5213, "step": 3009 }, { "epoch": 0.9033613445378151, "grad_norm": 0.1471468061208725, "learning_rate": 0.00013368744422313135, "loss": 0.4574, "step": 3010 }, { "epoch": 0.9036614645858343, "grad_norm": 0.13623493909835815, "learning_rate": 0.00013363812374519882, "loss": 0.3986, "step": 3011 }, { "epoch": 0.9039615846338536, "grad_norm": 0.15088467299938202, "learning_rate": 0.00013358879403903274, "loss": 0.4087, "step": 3012 }, { "epoch": 0.9042617046818727, "grad_norm": 0.1518915593624115, "learning_rate": 0.00013353945511816615, "loss": 0.4246, "step": 3013 }, { "epoch": 0.904561824729892, "grad_norm": 0.14289763569831848, "learning_rate": 0.00013349010699613463, "loss": 0.4194, "step": 3014 }, { "epoch": 0.9048619447779112, "grad_norm": 0.16750043630599976, "learning_rate": 0.00013344074968647628, "loss": 0.5288, "step": 3015 }, { "epoch": 0.9051620648259304, "grad_norm": 0.164357528090477, "learning_rate": 0.0001333913832027317, "loss": 0.4477, "step": 3016 }, { "epoch": 0.9054621848739496, "grad_norm": 0.2598751485347748, "learning_rate": 0.00013334200755844405, "loss": 0.4604, "step": 3017 }, { "epoch": 0.9057623049219687, "grad_norm": 0.15973882377147675, "learning_rate": 0.00013329262276715894, "loss": 0.4256, "step": 3018 }, { "epoch": 0.906062424969988, "grad_norm": 0.16015294194221497, "learning_rate": 0.0001332432288424245, "loss": 0.4479, "step": 3019 }, { "epoch": 0.9063625450180072, "grad_norm": 0.1549282670021057, "learning_rate": 0.00013319382579779143, "loss": 0.4861, "step": 3020 }, { "epoch": 0.9066626650660264, "grad_norm": 0.19171394407749176, "learning_rate": 0.00013314441364681285, "loss": 0.4759, "step": 3021 }, { "epoch": 0.9069627851140456, "grad_norm": 0.14883162081241608, "learning_rate": 0.00013309499240304445, "loss": 0.4646, "step": 3022 }, { "epoch": 0.9072629051620649, "grad_norm": 0.17286548018455505, "learning_rate": 0.00013304556208004438, "loss": 0.4056, "step": 3023 }, { "epoch": 0.907563025210084, "grad_norm": 0.16968637704849243, "learning_rate": 0.00013299612269137328, "loss": 0.4726, "step": 3024 }, { "epoch": 0.9078631452581032, "grad_norm": 0.145048126578331, "learning_rate": 0.00013294667425059426, "loss": 0.4549, "step": 3025 }, { "epoch": 0.9081632653061225, "grad_norm": 0.15538866817951202, "learning_rate": 0.00013289721677127293, "loss": 0.4922, "step": 3026 }, { "epoch": 0.9084633853541416, "grad_norm": 0.16231182217597961, "learning_rate": 0.00013284775026697743, "loss": 0.4381, "step": 3027 }, { "epoch": 0.9087635054021609, "grad_norm": 0.14964693784713745, "learning_rate": 0.00013279827475127829, "loss": 0.4085, "step": 3028 }, { "epoch": 0.90906362545018, "grad_norm": 0.17131774127483368, "learning_rate": 0.00013274879023774853, "loss": 0.4953, "step": 3029 }, { "epoch": 0.9093637454981993, "grad_norm": 0.3522649109363556, "learning_rate": 0.00013269929673996372, "loss": 0.4912, "step": 3030 }, { "epoch": 0.9096638655462185, "grad_norm": 0.18055719137191772, "learning_rate": 0.00013264979427150177, "loss": 0.44, "step": 3031 }, { "epoch": 0.9099639855942377, "grad_norm": 0.14456139504909515, "learning_rate": 0.00013260028284594317, "loss": 0.4186, "step": 3032 }, { "epoch": 0.9102641056422569, "grad_norm": 0.1525500863790512, "learning_rate": 0.00013255076247687075, "loss": 0.4163, "step": 3033 }, { "epoch": 0.9105642256902761, "grad_norm": 0.1520788073539734, "learning_rate": 0.00013250123317786993, "loss": 0.4502, "step": 3034 }, { "epoch": 0.9108643457382953, "grad_norm": 0.17995305359363556, "learning_rate": 0.00013245169496252843, "loss": 0.4875, "step": 3035 }, { "epoch": 0.9111644657863145, "grad_norm": 0.15856656432151794, "learning_rate": 0.00013240214784443653, "loss": 0.4148, "step": 3036 }, { "epoch": 0.9114645858343338, "grad_norm": 0.16556061804294586, "learning_rate": 0.00013235259183718695, "loss": 0.4308, "step": 3037 }, { "epoch": 0.9117647058823529, "grad_norm": 0.174869105219841, "learning_rate": 0.00013230302695437475, "loss": 0.4469, "step": 3038 }, { "epoch": 0.9120648259303722, "grad_norm": 0.1541697084903717, "learning_rate": 0.0001322534532095975, "loss": 0.4533, "step": 3039 }, { "epoch": 0.9123649459783914, "grad_norm": 0.22935479879379272, "learning_rate": 0.00013220387061645518, "loss": 0.4675, "step": 3040 }, { "epoch": 0.9126650660264105, "grad_norm": 0.15487846732139587, "learning_rate": 0.00013215427918855028, "loss": 0.4637, "step": 3041 }, { "epoch": 0.9129651860744298, "grad_norm": 0.1683686524629593, "learning_rate": 0.00013210467893948755, "loss": 0.491, "step": 3042 }, { "epoch": 0.9132653061224489, "grad_norm": 0.14868466556072235, "learning_rate": 0.00013205506988287432, "loss": 0.5013, "step": 3043 }, { "epoch": 0.9135654261704682, "grad_norm": 0.1356276571750641, "learning_rate": 0.00013200545203232017, "loss": 0.408, "step": 3044 }, { "epoch": 0.9138655462184874, "grad_norm": 0.13295316696166992, "learning_rate": 0.00013195582540143728, "loss": 0.3908, "step": 3045 }, { "epoch": 0.9141656662665066, "grad_norm": 0.14142464101314545, "learning_rate": 0.00013190619000384013, "loss": 0.4476, "step": 3046 }, { "epoch": 0.9144657863145258, "grad_norm": 0.16184107959270477, "learning_rate": 0.00013185654585314558, "loss": 0.4602, "step": 3047 }, { "epoch": 0.9147659063625451, "grad_norm": 0.1475936621427536, "learning_rate": 0.00013180689296297295, "loss": 0.4466, "step": 3048 }, { "epoch": 0.9150660264105642, "grad_norm": 0.15406182408332825, "learning_rate": 0.000131757231346944, "loss": 0.4617, "step": 3049 }, { "epoch": 0.9153661464585834, "grad_norm": 0.15649984776973724, "learning_rate": 0.00013170756101868274, "loss": 0.4445, "step": 3050 }, { "epoch": 0.9156662665066027, "grad_norm": 0.14102435111999512, "learning_rate": 0.0001316578819918157, "loss": 0.4178, "step": 3051 }, { "epoch": 0.9159663865546218, "grad_norm": 0.1508234590291977, "learning_rate": 0.00013160819427997174, "loss": 0.4607, "step": 3052 }, { "epoch": 0.9162665066026411, "grad_norm": 0.14864003658294678, "learning_rate": 0.00013155849789678214, "loss": 0.454, "step": 3053 }, { "epoch": 0.9165666266506602, "grad_norm": 0.1495371162891388, "learning_rate": 0.00013150879285588047, "loss": 0.4444, "step": 3054 }, { "epoch": 0.9168667466986795, "grad_norm": 0.14183548092842102, "learning_rate": 0.0001314590791709028, "loss": 0.4621, "step": 3055 }, { "epoch": 0.9171668667466987, "grad_norm": 0.15212209522724152, "learning_rate": 0.0001314093568554875, "loss": 0.4733, "step": 3056 }, { "epoch": 0.9174669867947179, "grad_norm": 0.16908378899097443, "learning_rate": 0.00013135962592327531, "loss": 0.5045, "step": 3057 }, { "epoch": 0.9177671068427371, "grad_norm": 0.14085766673088074, "learning_rate": 0.00013130988638790933, "loss": 0.4682, "step": 3058 }, { "epoch": 0.9180672268907563, "grad_norm": 0.14654438197612762, "learning_rate": 0.00013126013826303501, "loss": 0.4488, "step": 3059 }, { "epoch": 0.9183673469387755, "grad_norm": 0.15144743025302887, "learning_rate": 0.00013121038156230021, "loss": 0.4794, "step": 3060 }, { "epoch": 0.9186674669867947, "grad_norm": 0.1591755449771881, "learning_rate": 0.00013116061629935515, "loss": 0.4484, "step": 3061 }, { "epoch": 0.918967587034814, "grad_norm": 0.1443132907152176, "learning_rate": 0.00013111084248785225, "loss": 0.4609, "step": 3062 }, { "epoch": 0.9192677070828331, "grad_norm": 0.1388029009103775, "learning_rate": 0.00013106106014144648, "loss": 0.417, "step": 3063 }, { "epoch": 0.9195678271308524, "grad_norm": 0.12858764827251434, "learning_rate": 0.00013101126927379504, "loss": 0.3952, "step": 3064 }, { "epoch": 0.9198679471788715, "grad_norm": 0.1703989952802658, "learning_rate": 0.00013096146989855745, "loss": 0.4609, "step": 3065 }, { "epoch": 0.9201680672268907, "grad_norm": 0.135576993227005, "learning_rate": 0.00013091166202939563, "loss": 0.415, "step": 3066 }, { "epoch": 0.92046818727491, "grad_norm": 0.17634700238704681, "learning_rate": 0.00013086184567997374, "loss": 0.5223, "step": 3067 }, { "epoch": 0.9207683073229291, "grad_norm": 0.15296968817710876, "learning_rate": 0.00013081202086395844, "loss": 0.478, "step": 3068 }, { "epoch": 0.9210684273709484, "grad_norm": 0.14750336110591888, "learning_rate": 0.00013076218759501846, "loss": 0.4466, "step": 3069 }, { "epoch": 0.9213685474189676, "grad_norm": 0.15692031383514404, "learning_rate": 0.00013071234588682507, "loss": 0.4698, "step": 3070 }, { "epoch": 0.9216686674669868, "grad_norm": 0.139719620347023, "learning_rate": 0.00013066249575305173, "loss": 0.4203, "step": 3071 }, { "epoch": 0.921968787515006, "grad_norm": 0.14857828617095947, "learning_rate": 0.00013061263720737432, "loss": 0.461, "step": 3072 }, { "epoch": 0.9222689075630253, "grad_norm": 0.18154451251029968, "learning_rate": 0.0001305627702634709, "loss": 0.5074, "step": 3073 }, { "epoch": 0.9225690276110444, "grad_norm": 0.14855815470218658, "learning_rate": 0.00013051289493502192, "loss": 0.4307, "step": 3074 }, { "epoch": 0.9228691476590636, "grad_norm": 0.1603444218635559, "learning_rate": 0.00013046301123571008, "loss": 0.5327, "step": 3075 }, { "epoch": 0.9231692677070829, "grad_norm": 0.15262074768543243, "learning_rate": 0.0001304131191792204, "loss": 0.5154, "step": 3076 }, { "epoch": 0.923469387755102, "grad_norm": 0.13610374927520752, "learning_rate": 0.0001303632187792402, "loss": 0.4173, "step": 3077 }, { "epoch": 0.9237695078031213, "grad_norm": 0.20092245936393738, "learning_rate": 0.00013031331004945913, "loss": 0.4564, "step": 3078 }, { "epoch": 0.9240696278511404, "grad_norm": 0.13647323846817017, "learning_rate": 0.00013026339300356902, "loss": 0.408, "step": 3079 }, { "epoch": 0.9243697478991597, "grad_norm": 0.1391654759645462, "learning_rate": 0.00013021346765526405, "loss": 0.445, "step": 3080 }, { "epoch": 0.9246698679471789, "grad_norm": 0.15535160899162292, "learning_rate": 0.00013016353401824069, "loss": 0.3748, "step": 3081 }, { "epoch": 0.924969987995198, "grad_norm": 0.14320309460163116, "learning_rate": 0.00013011359210619763, "loss": 0.4247, "step": 3082 }, { "epoch": 0.9252701080432173, "grad_norm": 0.19213740527629852, "learning_rate": 0.00013006364193283589, "loss": 0.4484, "step": 3083 }, { "epoch": 0.9255702280912365, "grad_norm": 0.14360110461711884, "learning_rate": 0.0001300136835118587, "loss": 0.4354, "step": 3084 }, { "epoch": 0.9258703481392557, "grad_norm": 0.12578319013118744, "learning_rate": 0.0001299637168569716, "loss": 0.377, "step": 3085 }, { "epoch": 0.9261704681872749, "grad_norm": 0.14346668124198914, "learning_rate": 0.00012991374198188232, "loss": 0.4416, "step": 3086 }, { "epoch": 0.9264705882352942, "grad_norm": 0.14157746732234955, "learning_rate": 0.00012986375890030103, "loss": 0.4369, "step": 3087 }, { "epoch": 0.9267707082833133, "grad_norm": 0.14177608489990234, "learning_rate": 0.00012981376762593983, "loss": 0.4527, "step": 3088 }, { "epoch": 0.9270708283313326, "grad_norm": 0.15607760846614838, "learning_rate": 0.0001297637681725134, "loss": 0.4955, "step": 3089 }, { "epoch": 0.9273709483793517, "grad_norm": 0.141166552901268, "learning_rate": 0.00012971376055373842, "loss": 0.4591, "step": 3090 }, { "epoch": 0.9276710684273709, "grad_norm": 0.1640852391719818, "learning_rate": 0.00012966374478333399, "loss": 0.5217, "step": 3091 }, { "epoch": 0.9279711884753902, "grad_norm": 0.17959006130695343, "learning_rate": 0.00012961372087502126, "loss": 0.4773, "step": 3092 }, { "epoch": 0.9282713085234093, "grad_norm": 0.1554243564605713, "learning_rate": 0.00012956368884252384, "loss": 0.4658, "step": 3093 }, { "epoch": 0.9285714285714286, "grad_norm": 0.1420896202325821, "learning_rate": 0.00012951364869956733, "loss": 0.4294, "step": 3094 }, { "epoch": 0.9288715486194478, "grad_norm": 0.16320428252220154, "learning_rate": 0.00012946360045987975, "loss": 0.4194, "step": 3095 }, { "epoch": 0.929171668667467, "grad_norm": 0.15008452534675598, "learning_rate": 0.0001294135441371912, "loss": 0.4515, "step": 3096 }, { "epoch": 0.9294717887154862, "grad_norm": 0.1435832530260086, "learning_rate": 0.00012936347974523414, "loss": 0.4389, "step": 3097 }, { "epoch": 0.9297719087635054, "grad_norm": 0.15409572422504425, "learning_rate": 0.00012931340729774307, "loss": 0.4622, "step": 3098 }, { "epoch": 0.9300720288115246, "grad_norm": 0.15281450748443604, "learning_rate": 0.00012926332680845488, "loss": 0.459, "step": 3099 }, { "epoch": 0.9303721488595438, "grad_norm": 0.16661444306373596, "learning_rate": 0.0001292132382911085, "loss": 0.4718, "step": 3100 }, { "epoch": 0.930672268907563, "grad_norm": 0.16447588801383972, "learning_rate": 0.00012916314175944515, "loss": 0.4005, "step": 3101 }, { "epoch": 0.9309723889555822, "grad_norm": 0.13854825496673584, "learning_rate": 0.00012911303722720828, "loss": 0.4226, "step": 3102 }, { "epoch": 0.9312725090036015, "grad_norm": 0.13010156154632568, "learning_rate": 0.00012906292470814346, "loss": 0.4053, "step": 3103 }, { "epoch": 0.9315726290516206, "grad_norm": 0.1506943553686142, "learning_rate": 0.00012901280421599853, "loss": 0.4671, "step": 3104 }, { "epoch": 0.9318727490996399, "grad_norm": 0.13658320903778076, "learning_rate": 0.00012896267576452343, "loss": 0.4275, "step": 3105 }, { "epoch": 0.9321728691476591, "grad_norm": 0.15992045402526855, "learning_rate": 0.00012891253936747035, "loss": 0.449, "step": 3106 }, { "epoch": 0.9324729891956782, "grad_norm": 0.14674633741378784, "learning_rate": 0.00012886239503859357, "loss": 0.4654, "step": 3107 }, { "epoch": 0.9327731092436975, "grad_norm": 0.14501380920410156, "learning_rate": 0.0001288122427916497, "loss": 0.4752, "step": 3108 }, { "epoch": 0.9330732292917167, "grad_norm": 0.9473604559898376, "learning_rate": 0.00012876208264039739, "loss": 0.4511, "step": 3109 }, { "epoch": 0.9333733493397359, "grad_norm": 0.13532473146915436, "learning_rate": 0.00012871191459859754, "loss": 0.4061, "step": 3110 }, { "epoch": 0.9336734693877551, "grad_norm": 0.40912723541259766, "learning_rate": 0.0001286617386800131, "loss": 0.4824, "step": 3111 }, { "epoch": 0.9339735894357744, "grad_norm": 0.1879563182592392, "learning_rate": 0.00012861155489840933, "loss": 0.5001, "step": 3112 }, { "epoch": 0.9342737094837935, "grad_norm": 0.1302296668291092, "learning_rate": 0.00012856136326755353, "loss": 0.3693, "step": 3113 }, { "epoch": 0.9345738295318127, "grad_norm": 0.4808141887187958, "learning_rate": 0.00012851116380121526, "loss": 0.4287, "step": 3114 }, { "epoch": 0.9348739495798319, "grad_norm": 0.14809343218803406, "learning_rate": 0.0001284609565131661, "loss": 0.4297, "step": 3115 }, { "epoch": 0.9351740696278511, "grad_norm": 0.23470531404018402, "learning_rate": 0.00012841074141717987, "loss": 0.4535, "step": 3116 }, { "epoch": 0.9354741896758704, "grad_norm": 0.15969856083393097, "learning_rate": 0.0001283605185270325, "loss": 0.4477, "step": 3117 }, { "epoch": 0.9357743097238895, "grad_norm": 0.15086837112903595, "learning_rate": 0.00012831028785650209, "loss": 0.4145, "step": 3118 }, { "epoch": 0.9360744297719088, "grad_norm": 0.2130202353000641, "learning_rate": 0.00012826004941936885, "loss": 0.482, "step": 3119 }, { "epoch": 0.936374549819928, "grad_norm": 0.2209470272064209, "learning_rate": 0.00012820980322941506, "loss": 0.5067, "step": 3120 }, { "epoch": 0.9366746698679472, "grad_norm": 0.16430065035820007, "learning_rate": 0.00012815954930042522, "loss": 0.5202, "step": 3121 }, { "epoch": 0.9369747899159664, "grad_norm": 0.14141945540905, "learning_rate": 0.00012810928764618593, "loss": 0.4575, "step": 3122 }, { "epoch": 0.9372749099639855, "grad_norm": 0.15663444995880127, "learning_rate": 0.0001280590182804859, "loss": 0.5294, "step": 3123 }, { "epoch": 0.9375750300120048, "grad_norm": 0.1487339287996292, "learning_rate": 0.00012800874121711594, "loss": 0.4321, "step": 3124 }, { "epoch": 0.937875150060024, "grad_norm": 0.16639180481433868, "learning_rate": 0.00012795845646986902, "loss": 0.503, "step": 3125 }, { "epoch": 0.9381752701080432, "grad_norm": 0.13797855377197266, "learning_rate": 0.00012790816405254012, "loss": 0.4339, "step": 3126 }, { "epoch": 0.9384753901560624, "grad_norm": 0.14541450142860413, "learning_rate": 0.00012785786397892643, "loss": 0.4296, "step": 3127 }, { "epoch": 0.9387755102040817, "grad_norm": 0.1672312319278717, "learning_rate": 0.00012780755626282721, "loss": 0.4394, "step": 3128 }, { "epoch": 0.9390756302521008, "grad_norm": 0.14389723539352417, "learning_rate": 0.00012775724091804378, "loss": 0.445, "step": 3129 }, { "epoch": 0.93937575030012, "grad_norm": 0.13469307124614716, "learning_rate": 0.00012770691795837956, "loss": 0.4061, "step": 3130 }, { "epoch": 0.9396758703481393, "grad_norm": 0.1481163203716278, "learning_rate": 0.00012765658739764013, "loss": 0.4118, "step": 3131 }, { "epoch": 0.9399759903961584, "grad_norm": 0.15213724970817566, "learning_rate": 0.00012760624924963306, "loss": 0.4611, "step": 3132 }, { "epoch": 0.9402761104441777, "grad_norm": 0.21984997391700745, "learning_rate": 0.00012755590352816806, "loss": 0.3983, "step": 3133 }, { "epoch": 0.9405762304921969, "grad_norm": 0.17315754294395447, "learning_rate": 0.00012750555024705688, "loss": 0.5218, "step": 3134 }, { "epoch": 0.9408763505402161, "grad_norm": 0.148133784532547, "learning_rate": 0.00012745518942011344, "loss": 0.4728, "step": 3135 }, { "epoch": 0.9411764705882353, "grad_norm": 0.16228275001049042, "learning_rate": 0.00012740482106115354, "loss": 0.4677, "step": 3136 }, { "epoch": 0.9414765906362546, "grad_norm": 0.1501329392194748, "learning_rate": 0.00012735444518399526, "loss": 0.4265, "step": 3137 }, { "epoch": 0.9417767106842737, "grad_norm": 0.17041635513305664, "learning_rate": 0.00012730406180245856, "loss": 0.4577, "step": 3138 }, { "epoch": 0.9420768307322929, "grad_norm": 0.1504543125629425, "learning_rate": 0.00012725367093036568, "loss": 0.451, "step": 3139 }, { "epoch": 0.9423769507803121, "grad_norm": 0.1484091579914093, "learning_rate": 0.00012720327258154059, "loss": 0.4865, "step": 3140 }, { "epoch": 0.9426770708283313, "grad_norm": 0.16023018956184387, "learning_rate": 0.00012715286676980963, "loss": 0.5077, "step": 3141 }, { "epoch": 0.9429771908763506, "grad_norm": 0.1468799114227295, "learning_rate": 0.00012710245350900105, "loss": 0.4624, "step": 3142 }, { "epoch": 0.9432773109243697, "grad_norm": 0.1554957628250122, "learning_rate": 0.0001270520328129451, "loss": 0.4606, "step": 3143 }, { "epoch": 0.943577430972389, "grad_norm": 0.1647556573152542, "learning_rate": 0.00012700160469547415, "loss": 0.4102, "step": 3144 }, { "epoch": 0.9438775510204082, "grad_norm": 0.14057038724422455, "learning_rate": 0.00012695116917042255, "loss": 0.4527, "step": 3145 }, { "epoch": 0.9441776710684273, "grad_norm": 0.1454935371875763, "learning_rate": 0.00012690072625162676, "loss": 0.4265, "step": 3146 }, { "epoch": 0.9444777911164466, "grad_norm": 0.1505882441997528, "learning_rate": 0.00012685027595292514, "loss": 0.4554, "step": 3147 }, { "epoch": 0.9447779111644657, "grad_norm": 0.1458381861448288, "learning_rate": 0.0001267998182881582, "loss": 0.4541, "step": 3148 }, { "epoch": 0.945078031212485, "grad_norm": 0.15787646174430847, "learning_rate": 0.00012674935327116842, "loss": 0.466, "step": 3149 }, { "epoch": 0.9453781512605042, "grad_norm": 0.14523647725582123, "learning_rate": 0.00012669888091580033, "loss": 0.4605, "step": 3150 }, { "epoch": 0.9456782713085234, "grad_norm": 0.13974350690841675, "learning_rate": 0.00012664840123590036, "loss": 0.4185, "step": 3151 }, { "epoch": 0.9459783913565426, "grad_norm": 0.14377835392951965, "learning_rate": 0.00012659791424531711, "loss": 0.4824, "step": 3152 }, { "epoch": 0.9462785114045619, "grad_norm": 0.12975840270519257, "learning_rate": 0.00012654741995790102, "loss": 0.4008, "step": 3153 }, { "epoch": 0.946578631452581, "grad_norm": 0.1532629132270813, "learning_rate": 0.00012649691838750475, "loss": 0.471, "step": 3154 }, { "epoch": 0.9468787515006002, "grad_norm": 0.1365705281496048, "learning_rate": 0.00012644640954798271, "loss": 0.435, "step": 3155 }, { "epoch": 0.9471788715486195, "grad_norm": 0.14507761597633362, "learning_rate": 0.00012639589345319146, "loss": 0.4561, "step": 3156 }, { "epoch": 0.9474789915966386, "grad_norm": 0.16338889300823212, "learning_rate": 0.00012634537011698948, "loss": 0.4901, "step": 3157 }, { "epoch": 0.9477791116446579, "grad_norm": 0.15298539400100708, "learning_rate": 0.00012629483955323736, "loss": 0.4517, "step": 3158 }, { "epoch": 0.948079231692677, "grad_norm": 0.14111188054084778, "learning_rate": 0.00012624430177579749, "loss": 0.375, "step": 3159 }, { "epoch": 0.9483793517406963, "grad_norm": 0.1338079571723938, "learning_rate": 0.00012619375679853435, "loss": 0.3994, "step": 3160 }, { "epoch": 0.9486794717887155, "grad_norm": 0.16571113467216492, "learning_rate": 0.00012614320463531442, "loss": 0.5047, "step": 3161 }, { "epoch": 0.9489795918367347, "grad_norm": 0.1337796449661255, "learning_rate": 0.00012609264530000604, "loss": 0.4401, "step": 3162 }, { "epoch": 0.9492797118847539, "grad_norm": 0.15425826609134674, "learning_rate": 0.00012604207880647964, "loss": 0.4715, "step": 3163 }, { "epoch": 0.9495798319327731, "grad_norm": 0.13787232339382172, "learning_rate": 0.0001259915051686075, "loss": 0.4469, "step": 3164 }, { "epoch": 0.9498799519807923, "grad_norm": 0.1763979196548462, "learning_rate": 0.00012594092440026397, "loss": 0.4671, "step": 3165 }, { "epoch": 0.9501800720288115, "grad_norm": 0.14985892176628113, "learning_rate": 0.0001258903365153253, "loss": 0.5102, "step": 3166 }, { "epoch": 0.9504801920768308, "grad_norm": 0.14893373847007751, "learning_rate": 0.00012583974152766966, "loss": 0.4811, "step": 3167 }, { "epoch": 0.9507803121248499, "grad_norm": 0.14797988533973694, "learning_rate": 0.0001257891394511772, "loss": 0.4312, "step": 3168 }, { "epoch": 0.9510804321728692, "grad_norm": 0.14474913477897644, "learning_rate": 0.0001257385302997301, "loss": 0.453, "step": 3169 }, { "epoch": 0.9513805522208884, "grad_norm": 0.15031698346138, "learning_rate": 0.0001256879140872123, "loss": 0.4736, "step": 3170 }, { "epoch": 0.9516806722689075, "grad_norm": 0.1375945806503296, "learning_rate": 0.00012563729082750986, "loss": 0.4032, "step": 3171 }, { "epoch": 0.9519807923169268, "grad_norm": 0.15056562423706055, "learning_rate": 0.00012558666053451062, "loss": 0.428, "step": 3172 }, { "epoch": 0.9522809123649459, "grad_norm": 0.2670383155345917, "learning_rate": 0.0001255360232221045, "loss": 0.4268, "step": 3173 }, { "epoch": 0.9525810324129652, "grad_norm": 0.1334916651248932, "learning_rate": 0.00012548537890418317, "loss": 0.3939, "step": 3174 }, { "epoch": 0.9528811524609844, "grad_norm": 0.14399544894695282, "learning_rate": 0.0001254347275946404, "loss": 0.4705, "step": 3175 }, { "epoch": 0.9531812725090036, "grad_norm": 0.14479242265224457, "learning_rate": 0.00012538406930737175, "loss": 0.434, "step": 3176 }, { "epoch": 0.9534813925570228, "grad_norm": 0.14071334898471832, "learning_rate": 0.00012533340405627475, "loss": 0.429, "step": 3177 }, { "epoch": 0.9537815126050421, "grad_norm": 0.17476089298725128, "learning_rate": 0.00012528273185524885, "loss": 0.4767, "step": 3178 }, { "epoch": 0.9540816326530612, "grad_norm": 0.13704952597618103, "learning_rate": 0.0001252320527181954, "loss": 0.4029, "step": 3179 }, { "epoch": 0.9543817527010804, "grad_norm": 0.14694024622440338, "learning_rate": 0.00012518136665901755, "loss": 0.4931, "step": 3180 }, { "epoch": 0.9546818727490997, "grad_norm": 0.14101053774356842, "learning_rate": 0.00012513067369162052, "loss": 0.4315, "step": 3181 }, { "epoch": 0.9549819927971188, "grad_norm": 0.16008321940898895, "learning_rate": 0.00012507997382991132, "loss": 0.4494, "step": 3182 }, { "epoch": 0.9552821128451381, "grad_norm": 0.15155091881752014, "learning_rate": 0.00012502926708779892, "loss": 0.4968, "step": 3183 }, { "epoch": 0.9555822328931572, "grad_norm": 0.14365136623382568, "learning_rate": 0.00012497855347919407, "loss": 0.4271, "step": 3184 }, { "epoch": 0.9558823529411765, "grad_norm": 0.16472645103931427, "learning_rate": 0.0001249278330180095, "loss": 0.4742, "step": 3185 }, { "epoch": 0.9561824729891957, "grad_norm": 0.14155378937721252, "learning_rate": 0.00012487710571815975, "loss": 0.4285, "step": 3186 }, { "epoch": 0.9564825930372148, "grad_norm": 0.1432572901248932, "learning_rate": 0.00012482637159356131, "loss": 0.4675, "step": 3187 }, { "epoch": 0.9567827130852341, "grad_norm": 0.15490619838237762, "learning_rate": 0.00012477563065813253, "loss": 0.5118, "step": 3188 }, { "epoch": 0.9570828331332533, "grad_norm": 0.1394316554069519, "learning_rate": 0.00012472488292579353, "loss": 0.437, "step": 3189 }, { "epoch": 0.9573829531812725, "grad_norm": 0.14840246737003326, "learning_rate": 0.00012467412841046644, "loss": 0.4374, "step": 3190 }, { "epoch": 0.9576830732292917, "grad_norm": 0.14005251228809357, "learning_rate": 0.00012462336712607515, "loss": 0.4277, "step": 3191 }, { "epoch": 0.957983193277311, "grad_norm": 0.16837067902088165, "learning_rate": 0.00012457259908654544, "loss": 0.439, "step": 3192 }, { "epoch": 0.9582833133253301, "grad_norm": 0.14483563601970673, "learning_rate": 0.00012452182430580487, "loss": 0.4938, "step": 3193 }, { "epoch": 0.9585834333733494, "grad_norm": 0.14190462231636047, "learning_rate": 0.00012447104279778305, "loss": 0.4116, "step": 3194 }, { "epoch": 0.9588835534213686, "grad_norm": 0.1376616656780243, "learning_rate": 0.00012442025457641123, "loss": 0.4001, "step": 3195 }, { "epoch": 0.9591836734693877, "grad_norm": 0.16073790192604065, "learning_rate": 0.00012436945965562258, "loss": 0.5094, "step": 3196 }, { "epoch": 0.959483793517407, "grad_norm": 0.16511289775371552, "learning_rate": 0.00012431865804935207, "loss": 0.5465, "step": 3197 }, { "epoch": 0.9597839135654261, "grad_norm": 0.23794710636138916, "learning_rate": 0.00012426784977153662, "loss": 0.542, "step": 3198 }, { "epoch": 0.9600840336134454, "grad_norm": 0.2180640995502472, "learning_rate": 0.00012421703483611485, "loss": 0.5002, "step": 3199 }, { "epoch": 0.9603841536614646, "grad_norm": 0.6686155796051025, "learning_rate": 0.00012416621325702723, "loss": 0.4223, "step": 3200 }, { "epoch": 0.9606842737094838, "grad_norm": 0.14647041261196136, "learning_rate": 0.00012411538504821613, "loss": 0.4171, "step": 3201 }, { "epoch": 0.960984393757503, "grad_norm": 0.14124207198619843, "learning_rate": 0.00012406455022362565, "loss": 0.4106, "step": 3202 }, { "epoch": 0.9612845138055222, "grad_norm": 0.14072130620479584, "learning_rate": 0.00012401370879720178, "loss": 0.4297, "step": 3203 }, { "epoch": 0.9615846338535414, "grad_norm": 0.13854216039180756, "learning_rate": 0.00012396286078289226, "loss": 0.4273, "step": 3204 }, { "epoch": 0.9618847539015606, "grad_norm": 0.22632403671741486, "learning_rate": 0.00012391200619464663, "loss": 0.4806, "step": 3205 }, { "epoch": 0.9621848739495799, "grad_norm": 0.15359652042388916, "learning_rate": 0.00012386114504641627, "loss": 0.4526, "step": 3206 }, { "epoch": 0.962484993997599, "grad_norm": 0.17294703423976898, "learning_rate": 0.00012381027735215442, "loss": 0.5399, "step": 3207 }, { "epoch": 0.9627851140456183, "grad_norm": 0.3574408292770386, "learning_rate": 0.00012375940312581596, "loss": 0.4419, "step": 3208 }, { "epoch": 0.9630852340936374, "grad_norm": 0.16574545204639435, "learning_rate": 0.00012370852238135775, "loss": 0.4979, "step": 3209 }, { "epoch": 0.9633853541416567, "grad_norm": 0.15823771059513092, "learning_rate": 0.00012365763513273826, "loss": 0.4644, "step": 3210 }, { "epoch": 0.9636854741896759, "grad_norm": 0.2891182601451874, "learning_rate": 0.0001236067413939178, "loss": 0.5055, "step": 3211 }, { "epoch": 0.963985594237695, "grad_norm": 0.18421053886413574, "learning_rate": 0.00012355584117885855, "loss": 0.4635, "step": 3212 }, { "epoch": 0.9642857142857143, "grad_norm": 0.16126295924186707, "learning_rate": 0.00012350493450152437, "loss": 0.5054, "step": 3213 }, { "epoch": 0.9645858343337335, "grad_norm": 0.14591971039772034, "learning_rate": 0.00012345402137588097, "loss": 0.4113, "step": 3214 }, { "epoch": 0.9648859543817527, "grad_norm": 0.2134801745414734, "learning_rate": 0.0001234031018158957, "loss": 0.4249, "step": 3215 }, { "epoch": 0.9651860744297719, "grad_norm": 0.19483153522014618, "learning_rate": 0.00012335217583553783, "loss": 0.4881, "step": 3216 }, { "epoch": 0.9654861944777912, "grad_norm": 0.15648779273033142, "learning_rate": 0.00012330124344877826, "loss": 0.4707, "step": 3217 }, { "epoch": 0.9657863145258103, "grad_norm": 0.1493963748216629, "learning_rate": 0.00012325030466958976, "loss": 0.4717, "step": 3218 }, { "epoch": 0.9660864345738295, "grad_norm": 0.14706221222877502, "learning_rate": 0.00012319935951194677, "loss": 0.445, "step": 3219 }, { "epoch": 0.9663865546218487, "grad_norm": 0.17727939784526825, "learning_rate": 0.0001231484079898255, "loss": 0.3959, "step": 3220 }, { "epoch": 0.9666866746698679, "grad_norm": 0.17178839445114136, "learning_rate": 0.00012309745011720392, "loss": 0.4025, "step": 3221 }, { "epoch": 0.9669867947178872, "grad_norm": 0.1465953290462494, "learning_rate": 0.0001230464859080618, "loss": 0.454, "step": 3222 }, { "epoch": 0.9672869147659063, "grad_norm": 0.23026442527770996, "learning_rate": 0.0001229955153763805, "loss": 0.4959, "step": 3223 }, { "epoch": 0.9675870348139256, "grad_norm": 0.1530740112066269, "learning_rate": 0.00012294453853614325, "loss": 0.4733, "step": 3224 }, { "epoch": 0.9678871548619448, "grad_norm": 0.1319517195224762, "learning_rate": 0.00012289355540133495, "loss": 0.3903, "step": 3225 }, { "epoch": 0.968187274909964, "grad_norm": 0.14231938123703003, "learning_rate": 0.0001228425659859422, "loss": 0.4649, "step": 3226 }, { "epoch": 0.9684873949579832, "grad_norm": 0.1461937129497528, "learning_rate": 0.00012279157030395345, "loss": 0.4358, "step": 3227 }, { "epoch": 0.9687875150060024, "grad_norm": 0.1504702866077423, "learning_rate": 0.0001227405683693587, "loss": 0.4432, "step": 3228 }, { "epoch": 0.9690876350540216, "grad_norm": 0.19015046954154968, "learning_rate": 0.0001226895601961498, "loss": 0.4691, "step": 3229 }, { "epoch": 0.9693877551020408, "grad_norm": 0.16711188852787018, "learning_rate": 0.00012263854579832022, "loss": 0.5068, "step": 3230 }, { "epoch": 0.96968787515006, "grad_norm": 0.14313435554504395, "learning_rate": 0.00012258752518986516, "loss": 0.4372, "step": 3231 }, { "epoch": 0.9699879951980792, "grad_norm": 0.16503359377384186, "learning_rate": 0.00012253649838478157, "loss": 0.5157, "step": 3232 }, { "epoch": 0.9702881152460985, "grad_norm": 0.15321406722068787, "learning_rate": 0.00012248546539706808, "loss": 0.4769, "step": 3233 }, { "epoch": 0.9705882352941176, "grad_norm": 0.14297358691692352, "learning_rate": 0.00012243442624072502, "loss": 0.4221, "step": 3234 }, { "epoch": 0.9708883553421368, "grad_norm": 0.17347034811973572, "learning_rate": 0.00012238338092975432, "loss": 0.4864, "step": 3235 }, { "epoch": 0.9711884753901561, "grad_norm": 0.13694067299365997, "learning_rate": 0.00012233232947815974, "loss": 0.4032, "step": 3236 }, { "epoch": 0.9714885954381752, "grad_norm": 0.16371691226959229, "learning_rate": 0.00012228127189994664, "loss": 0.4915, "step": 3237 }, { "epoch": 0.9717887154861945, "grad_norm": 0.1552603542804718, "learning_rate": 0.0001222302082091221, "loss": 0.4993, "step": 3238 }, { "epoch": 0.9720888355342137, "grad_norm": 0.14726898074150085, "learning_rate": 0.00012217913841969482, "loss": 0.4674, "step": 3239 }, { "epoch": 0.9723889555822329, "grad_norm": 0.18751665949821472, "learning_rate": 0.00012212806254567526, "loss": 0.4898, "step": 3240 }, { "epoch": 0.9726890756302521, "grad_norm": 0.1408817023038864, "learning_rate": 0.00012207698060107545, "loss": 0.437, "step": 3241 }, { "epoch": 0.9729891956782714, "grad_norm": 0.2426920086145401, "learning_rate": 0.00012202589259990916, "loss": 0.513, "step": 3242 }, { "epoch": 0.9732893157262905, "grad_norm": 0.16936664283275604, "learning_rate": 0.00012197479855619179, "loss": 0.4941, "step": 3243 }, { "epoch": 0.9735894357743097, "grad_norm": 0.14508193731307983, "learning_rate": 0.00012192369848394045, "loss": 0.4254, "step": 3244 }, { "epoch": 0.973889555822329, "grad_norm": 0.14925509691238403, "learning_rate": 0.00012187259239717378, "loss": 0.4389, "step": 3245 }, { "epoch": 0.9741896758703481, "grad_norm": 0.15830858051776886, "learning_rate": 0.0001218214803099122, "loss": 0.4658, "step": 3246 }, { "epoch": 0.9744897959183674, "grad_norm": 0.14749321341514587, "learning_rate": 0.00012177036223617775, "loss": 0.4627, "step": 3247 }, { "epoch": 0.9747899159663865, "grad_norm": 0.15965783596038818, "learning_rate": 0.00012171923818999402, "loss": 0.52, "step": 3248 }, { "epoch": 0.9750900360144058, "grad_norm": 0.16403883695602417, "learning_rate": 0.00012166810818538634, "loss": 0.5091, "step": 3249 }, { "epoch": 0.975390156062425, "grad_norm": 0.1529029756784439, "learning_rate": 0.00012161697223638162, "loss": 0.467, "step": 3250 }, { "epoch": 0.9756902761104442, "grad_norm": 0.1996665745973587, "learning_rate": 0.00012156583035700846, "loss": 0.4269, "step": 3251 }, { "epoch": 0.9759903961584634, "grad_norm": 0.15675577521324158, "learning_rate": 0.00012151468256129704, "loss": 0.4584, "step": 3252 }, { "epoch": 0.9762905162064826, "grad_norm": 0.13851302862167358, "learning_rate": 0.00012146352886327916, "loss": 0.4267, "step": 3253 }, { "epoch": 0.9765906362545018, "grad_norm": 0.15544851124286652, "learning_rate": 0.00012141236927698823, "loss": 0.4324, "step": 3254 }, { "epoch": 0.976890756302521, "grad_norm": 0.16857026517391205, "learning_rate": 0.00012136120381645932, "loss": 0.4889, "step": 3255 }, { "epoch": 0.9771908763505402, "grad_norm": 0.14354467391967773, "learning_rate": 0.00012131003249572908, "loss": 0.4235, "step": 3256 }, { "epoch": 0.9774909963985594, "grad_norm": 0.3445586562156677, "learning_rate": 0.00012125885532883579, "loss": 0.4473, "step": 3257 }, { "epoch": 0.9777911164465787, "grad_norm": 0.14772890508174896, "learning_rate": 0.0001212076723298193, "loss": 0.4987, "step": 3258 }, { "epoch": 0.9780912364945978, "grad_norm": 0.15903234481811523, "learning_rate": 0.0001211564835127211, "loss": 0.5344, "step": 3259 }, { "epoch": 0.978391356542617, "grad_norm": 0.13617999851703644, "learning_rate": 0.00012110528889158421, "loss": 0.3771, "step": 3260 }, { "epoch": 0.9786914765906363, "grad_norm": 0.19996723532676697, "learning_rate": 0.00012105408848045335, "loss": 0.4192, "step": 3261 }, { "epoch": 0.9789915966386554, "grad_norm": 0.14216583967208862, "learning_rate": 0.00012100288229337469, "loss": 0.4467, "step": 3262 }, { "epoch": 0.9792917166866747, "grad_norm": 0.153058722615242, "learning_rate": 0.00012095167034439616, "loss": 0.4744, "step": 3263 }, { "epoch": 0.9795918367346939, "grad_norm": 0.13257387280464172, "learning_rate": 0.00012090045264756709, "loss": 0.388, "step": 3264 }, { "epoch": 0.9798919567827131, "grad_norm": 0.14509105682373047, "learning_rate": 0.00012084922921693849, "loss": 0.4715, "step": 3265 }, { "epoch": 0.9801920768307323, "grad_norm": 0.12990200519561768, "learning_rate": 0.00012079800006656292, "loss": 0.3935, "step": 3266 }, { "epoch": 0.9804921968787516, "grad_norm": 0.14619603753089905, "learning_rate": 0.00012074676521049452, "loss": 0.4264, "step": 3267 }, { "epoch": 0.9807923169267707, "grad_norm": 0.14546331763267517, "learning_rate": 0.000120695524662789, "loss": 0.4438, "step": 3268 }, { "epoch": 0.9810924369747899, "grad_norm": 0.14810235798358917, "learning_rate": 0.00012064427843750357, "loss": 0.4313, "step": 3269 }, { "epoch": 0.9813925570228091, "grad_norm": 0.22097627818584442, "learning_rate": 0.00012059302654869707, "loss": 0.3912, "step": 3270 }, { "epoch": 0.9816926770708283, "grad_norm": 0.14861871302127838, "learning_rate": 0.00012054176901042989, "loss": 0.4472, "step": 3271 }, { "epoch": 0.9819927971188476, "grad_norm": 0.14172734320163727, "learning_rate": 0.0001204905058367639, "loss": 0.4345, "step": 3272 }, { "epoch": 0.9822929171668667, "grad_norm": 0.14876016974449158, "learning_rate": 0.00012043923704176259, "loss": 0.381, "step": 3273 }, { "epoch": 0.982593037214886, "grad_norm": 0.15357892215251923, "learning_rate": 0.00012038796263949099, "loss": 0.4809, "step": 3274 }, { "epoch": 0.9828931572629052, "grad_norm": 0.13547773659229279, "learning_rate": 0.00012033668264401558, "loss": 0.3985, "step": 3275 }, { "epoch": 0.9831932773109243, "grad_norm": 0.15516170859336853, "learning_rate": 0.00012028539706940451, "loss": 0.4789, "step": 3276 }, { "epoch": 0.9834933973589436, "grad_norm": 0.13751676678657532, "learning_rate": 0.00012023410592972735, "loss": 0.4275, "step": 3277 }, { "epoch": 0.9837935174069627, "grad_norm": 0.1439884603023529, "learning_rate": 0.00012018280923905528, "loss": 0.4411, "step": 3278 }, { "epoch": 0.984093637454982, "grad_norm": 0.1395803838968277, "learning_rate": 0.00012013150701146086, "loss": 0.4343, "step": 3279 }, { "epoch": 0.9843937575030012, "grad_norm": 0.15619012713432312, "learning_rate": 0.00012008019926101837, "loss": 0.4666, "step": 3280 }, { "epoch": 0.9846938775510204, "grad_norm": 0.13754403591156006, "learning_rate": 0.00012002888600180341, "loss": 0.4219, "step": 3281 }, { "epoch": 0.9849939975990396, "grad_norm": 0.6203069090843201, "learning_rate": 0.00011997756724789333, "loss": 0.4206, "step": 3282 }, { "epoch": 0.9852941176470589, "grad_norm": 0.13347765803337097, "learning_rate": 0.00011992624301336668, "loss": 0.3895, "step": 3283 }, { "epoch": 0.985594237695078, "grad_norm": 0.14219003915786743, "learning_rate": 0.00011987491331230378, "loss": 0.4413, "step": 3284 }, { "epoch": 0.9858943577430972, "grad_norm": 0.1293162852525711, "learning_rate": 0.00011982357815878629, "loss": 0.3935, "step": 3285 }, { "epoch": 0.9861944777911165, "grad_norm": 0.13473662734031677, "learning_rate": 0.00011977223756689746, "loss": 0.388, "step": 3286 }, { "epoch": 0.9864945978391356, "grad_norm": 0.17074993252754211, "learning_rate": 0.00011972089155072195, "loss": 0.4125, "step": 3287 }, { "epoch": 0.9867947178871549, "grad_norm": 0.14385375380516052, "learning_rate": 0.00011966954012434599, "loss": 0.3789, "step": 3288 }, { "epoch": 0.987094837935174, "grad_norm": 0.20002798736095428, "learning_rate": 0.00011961818330185723, "loss": 0.46, "step": 3289 }, { "epoch": 0.9873949579831933, "grad_norm": 0.15859068930149078, "learning_rate": 0.00011956682109734485, "loss": 0.4149, "step": 3290 }, { "epoch": 0.9876950780312125, "grad_norm": 0.16643311083316803, "learning_rate": 0.00011951545352489948, "loss": 0.5037, "step": 3291 }, { "epoch": 0.9879951980792316, "grad_norm": 0.17133454978466034, "learning_rate": 0.00011946408059861316, "loss": 0.5094, "step": 3292 }, { "epoch": 0.9882953181272509, "grad_norm": 0.1505843847990036, "learning_rate": 0.00011941270233257957, "loss": 0.4371, "step": 3293 }, { "epoch": 0.9885954381752701, "grad_norm": 0.15345150232315063, "learning_rate": 0.00011936131874089365, "loss": 0.4646, "step": 3294 }, { "epoch": 0.9888955582232893, "grad_norm": 0.16357767581939697, "learning_rate": 0.00011930992983765196, "loss": 0.4674, "step": 3295 }, { "epoch": 0.9891956782713085, "grad_norm": 0.14693854749202728, "learning_rate": 0.00011925853563695242, "loss": 0.4626, "step": 3296 }, { "epoch": 0.9894957983193278, "grad_norm": 0.1619652509689331, "learning_rate": 0.0001192071361528945, "loss": 0.4899, "step": 3297 }, { "epoch": 0.9897959183673469, "grad_norm": 0.14258073270320892, "learning_rate": 0.00011915573139957898, "loss": 0.4449, "step": 3298 }, { "epoch": 0.9900960384153662, "grad_norm": 0.1380699872970581, "learning_rate": 0.00011910432139110822, "loss": 0.4112, "step": 3299 }, { "epoch": 0.9903961584633854, "grad_norm": 0.13336259126663208, "learning_rate": 0.0001190529061415859, "loss": 0.4064, "step": 3300 }, { "epoch": 0.9906962785114045, "grad_norm": 0.1317206174135208, "learning_rate": 0.00011900148566511733, "loss": 0.4186, "step": 3301 }, { "epoch": 0.9909963985594238, "grad_norm": 0.16753268241882324, "learning_rate": 0.00011895005997580899, "loss": 0.4872, "step": 3302 }, { "epoch": 0.991296518607443, "grad_norm": 0.15727880597114563, "learning_rate": 0.000118898629087769, "loss": 0.5063, "step": 3303 }, { "epoch": 0.9915966386554622, "grad_norm": 0.14946980774402618, "learning_rate": 0.00011884719301510685, "loss": 0.46, "step": 3304 }, { "epoch": 0.9918967587034814, "grad_norm": 0.13264238834381104, "learning_rate": 0.0001187957517719334, "loss": 0.4333, "step": 3305 }, { "epoch": 0.9921968787515006, "grad_norm": 0.1383105218410492, "learning_rate": 0.00011874430537236094, "loss": 0.4093, "step": 3306 }, { "epoch": 0.9924969987995198, "grad_norm": 0.1599801778793335, "learning_rate": 0.00011869285383050328, "loss": 0.5252, "step": 3307 }, { "epoch": 0.992797118847539, "grad_norm": 0.14416632056236267, "learning_rate": 0.00011864139716047549, "loss": 0.4752, "step": 3308 }, { "epoch": 0.9930972388955582, "grad_norm": 0.16169165074825287, "learning_rate": 0.00011858993537639415, "loss": 0.4927, "step": 3309 }, { "epoch": 0.9933973589435774, "grad_norm": 0.15442326664924622, "learning_rate": 0.0001185384684923772, "loss": 0.462, "step": 3310 }, { "epoch": 0.9936974789915967, "grad_norm": 0.14835773408412933, "learning_rate": 0.00011848699652254398, "loss": 0.467, "step": 3311 }, { "epoch": 0.9939975990396158, "grad_norm": 0.15235137939453125, "learning_rate": 0.00011843551948101525, "loss": 0.4383, "step": 3312 }, { "epoch": 0.9942977190876351, "grad_norm": 0.1400371640920639, "learning_rate": 0.00011838403738191317, "loss": 0.3838, "step": 3313 }, { "epoch": 0.9945978391356542, "grad_norm": 0.14551183581352234, "learning_rate": 0.00011833255023936123, "loss": 0.4479, "step": 3314 }, { "epoch": 0.9948979591836735, "grad_norm": 0.22726750373840332, "learning_rate": 0.00011828105806748431, "loss": 0.4438, "step": 3315 }, { "epoch": 0.9951980792316927, "grad_norm": 0.13362866640090942, "learning_rate": 0.00011822956088040878, "loss": 0.4369, "step": 3316 }, { "epoch": 0.9954981992797118, "grad_norm": 0.16523800790309906, "learning_rate": 0.0001181780586922622, "loss": 0.4867, "step": 3317 }, { "epoch": 0.9957983193277311, "grad_norm": 0.12675073742866516, "learning_rate": 0.00011812655151717369, "loss": 0.4169, "step": 3318 }, { "epoch": 0.9960984393757503, "grad_norm": 0.15196578204631805, "learning_rate": 0.00011807503936927363, "loss": 0.4511, "step": 3319 }, { "epoch": 0.9963985594237695, "grad_norm": 0.13615992665290833, "learning_rate": 0.00011802352226269375, "loss": 0.404, "step": 3320 }, { "epoch": 0.9966986794717887, "grad_norm": 0.1454804539680481, "learning_rate": 0.0001179720002115672, "loss": 0.4985, "step": 3321 }, { "epoch": 0.996998799519808, "grad_norm": 0.14081403613090515, "learning_rate": 0.00011792047323002848, "loss": 0.4311, "step": 3322 }, { "epoch": 0.9972989195678271, "grad_norm": 0.14123167097568512, "learning_rate": 0.00011786894133221345, "loss": 0.4246, "step": 3323 }, { "epoch": 0.9975990396158463, "grad_norm": 0.13770636916160583, "learning_rate": 0.00011781740453225923, "loss": 0.3965, "step": 3324 }, { "epoch": 0.9978991596638656, "grad_norm": 0.1396370232105255, "learning_rate": 0.00011776586284430437, "loss": 0.4565, "step": 3325 }, { "epoch": 0.9981992797118847, "grad_norm": 0.15787461400032043, "learning_rate": 0.00011771431628248877, "loss": 0.4392, "step": 3326 }, { "epoch": 0.998499399759904, "grad_norm": 0.13858865201473236, "learning_rate": 0.00011766276486095362, "loss": 0.4487, "step": 3327 }, { "epoch": 0.9987995198079231, "grad_norm": 0.13797371089458466, "learning_rate": 0.00011761120859384147, "loss": 0.4489, "step": 3328 }, { "epoch": 0.9990996398559424, "grad_norm": 0.2163102775812149, "learning_rate": 0.00011755964749529618, "loss": 0.4431, "step": 3329 }, { "epoch": 0.9993997599039616, "grad_norm": 0.1351035088300705, "learning_rate": 0.00011750808157946291, "loss": 0.4456, "step": 3330 }, { "epoch": 0.9996998799519808, "grad_norm": 0.14229658246040344, "learning_rate": 0.00011745651086048825, "loss": 0.4418, "step": 3331 }, { "epoch": 1.0, "grad_norm": 0.128207266330719, "learning_rate": 0.00011740493535252002, "loss": 0.3952, "step": 3332 }, { "epoch": 1.0, "eval_loss": 0.2915521264076233, "eval_runtime": 3781.9927, "eval_samples_per_second": 12.251, "eval_steps_per_second": 0.766, "step": 3332 }, { "epoch": 1.0003001200480193, "grad_norm": 0.1412869095802307, "learning_rate": 0.00011735335506970733, "loss": 0.4536, "step": 3333 }, { "epoch": 1.0006002400960383, "grad_norm": 0.14577415585517883, "learning_rate": 0.00011730177002620066, "loss": 0.3832, "step": 3334 }, { "epoch": 1.0009003601440576, "grad_norm": 0.15070371329784393, "learning_rate": 0.0001172501802361518, "loss": 0.514, "step": 3335 }, { "epoch": 1.0012004801920769, "grad_norm": 0.13936564326286316, "learning_rate": 0.00011719858571371373, "loss": 0.4466, "step": 3336 }, { "epoch": 1.0015006002400961, "grad_norm": 0.14646700024604797, "learning_rate": 0.00011714698647304094, "loss": 0.4197, "step": 3337 }, { "epoch": 1.0018007202881152, "grad_norm": 0.17612683773040771, "learning_rate": 0.00011709538252828902, "loss": 0.4565, "step": 3338 }, { "epoch": 1.0021008403361344, "grad_norm": 0.14111727476119995, "learning_rate": 0.00011704377389361492, "loss": 0.4107, "step": 3339 }, { "epoch": 1.0024009603841537, "grad_norm": 0.13370364904403687, "learning_rate": 0.00011699216058317686, "loss": 0.4129, "step": 3340 }, { "epoch": 1.0027010804321728, "grad_norm": 0.15525327622890472, "learning_rate": 0.00011694054261113441, "loss": 0.4056, "step": 3341 }, { "epoch": 1.003001200480192, "grad_norm": 0.13654319941997528, "learning_rate": 0.00011688891999164834, "loss": 0.4047, "step": 3342 }, { "epoch": 1.0033013205282113, "grad_norm": 0.15027666091918945, "learning_rate": 0.00011683729273888075, "loss": 0.496, "step": 3343 }, { "epoch": 1.0036014405762306, "grad_norm": 0.13460519909858704, "learning_rate": 0.00011678566086699492, "loss": 0.4255, "step": 3344 }, { "epoch": 1.0039015606242496, "grad_norm": 0.1476062536239624, "learning_rate": 0.00011673402439015556, "loss": 0.4577, "step": 3345 }, { "epoch": 1.004201680672269, "grad_norm": 0.18649092316627502, "learning_rate": 0.00011668238332252846, "loss": 0.3768, "step": 3346 }, { "epoch": 1.0045018007202882, "grad_norm": 0.14275908470153809, "learning_rate": 0.00011663073767828079, "loss": 0.488, "step": 3347 }, { "epoch": 1.0048019207683074, "grad_norm": 0.1560365855693817, "learning_rate": 0.0001165790874715809, "loss": 0.4688, "step": 3348 }, { "epoch": 1.0051020408163265, "grad_norm": 0.1347278654575348, "learning_rate": 0.00011652743271659853, "loss": 0.4214, "step": 3349 }, { "epoch": 1.0054021608643458, "grad_norm": 0.1424272209405899, "learning_rate": 0.00011647577342750447, "loss": 0.4421, "step": 3350 }, { "epoch": 1.005702280912365, "grad_norm": 0.16413843631744385, "learning_rate": 0.0001164241096184709, "loss": 0.4707, "step": 3351 }, { "epoch": 1.006002400960384, "grad_norm": 0.24146349728107452, "learning_rate": 0.00011637244130367118, "loss": 0.4515, "step": 3352 }, { "epoch": 1.0063025210084033, "grad_norm": 0.13402995467185974, "learning_rate": 0.00011632076849727993, "loss": 0.4164, "step": 3353 }, { "epoch": 1.0066026410564226, "grad_norm": 0.14538829028606415, "learning_rate": 0.00011626909121347301, "loss": 0.4664, "step": 3354 }, { "epoch": 1.0069027611044419, "grad_norm": 0.42422211170196533, "learning_rate": 0.0001162174094664274, "loss": 0.4278, "step": 3355 }, { "epoch": 1.007202881152461, "grad_norm": 0.14506077766418457, "learning_rate": 0.00011616572327032152, "loss": 0.3753, "step": 3356 }, { "epoch": 1.0075030012004802, "grad_norm": 0.13329797983169556, "learning_rate": 0.0001161140326393348, "loss": 0.4249, "step": 3357 }, { "epoch": 1.0078031212484995, "grad_norm": 0.14923860132694244, "learning_rate": 0.00011606233758764802, "loss": 0.4146, "step": 3358 }, { "epoch": 1.0081032412965185, "grad_norm": 0.14581775665283203, "learning_rate": 0.00011601063812944308, "loss": 0.4863, "step": 3359 }, { "epoch": 1.0084033613445378, "grad_norm": 0.14314207434654236, "learning_rate": 0.00011595893427890316, "loss": 0.4462, "step": 3360 }, { "epoch": 1.008703481392557, "grad_norm": 0.17089812457561493, "learning_rate": 0.00011590722605021262, "loss": 0.3893, "step": 3361 }, { "epoch": 1.0090036014405763, "grad_norm": 0.43584877252578735, "learning_rate": 0.00011585551345755702, "loss": 0.4452, "step": 3362 }, { "epoch": 1.0093037214885954, "grad_norm": 0.18344175815582275, "learning_rate": 0.0001158037965151231, "loss": 0.4966, "step": 3363 }, { "epoch": 1.0096038415366146, "grad_norm": 0.14152958989143372, "learning_rate": 0.00011575207523709886, "loss": 0.4431, "step": 3364 }, { "epoch": 1.009903961584634, "grad_norm": 0.14963635802268982, "learning_rate": 0.00011570034963767335, "loss": 0.4637, "step": 3365 }, { "epoch": 1.010204081632653, "grad_norm": 0.1447412669658661, "learning_rate": 0.00011564861973103698, "loss": 0.4521, "step": 3366 }, { "epoch": 1.0105042016806722, "grad_norm": 0.14700941741466522, "learning_rate": 0.0001155968855313812, "loss": 0.4774, "step": 3367 }, { "epoch": 1.0108043217286915, "grad_norm": 0.14747017621994019, "learning_rate": 0.00011554514705289874, "loss": 0.4459, "step": 3368 }, { "epoch": 1.0111044417767108, "grad_norm": 0.16696229577064514, "learning_rate": 0.00011549340430978342, "loss": 0.4331, "step": 3369 }, { "epoch": 1.0114045618247298, "grad_norm": 0.15966810286045074, "learning_rate": 0.00011544165731623029, "loss": 0.5101, "step": 3370 }, { "epoch": 1.011704681872749, "grad_norm": 0.1359332799911499, "learning_rate": 0.00011538990608643554, "loss": 0.4032, "step": 3371 }, { "epoch": 1.0120048019207684, "grad_norm": 0.16748102009296417, "learning_rate": 0.00011533815063459652, "loss": 0.4798, "step": 3372 }, { "epoch": 1.0123049219687874, "grad_norm": 0.16173438727855682, "learning_rate": 0.00011528639097491174, "loss": 0.435, "step": 3373 }, { "epoch": 1.0126050420168067, "grad_norm": 0.14960527420043945, "learning_rate": 0.00011523462712158089, "loss": 0.4532, "step": 3374 }, { "epoch": 1.012905162064826, "grad_norm": 0.2027568221092224, "learning_rate": 0.00011518285908880477, "loss": 0.4223, "step": 3375 }, { "epoch": 1.0132052821128452, "grad_norm": 0.1638248711824417, "learning_rate": 0.00011513108689078537, "loss": 0.4469, "step": 3376 }, { "epoch": 1.0135054021608643, "grad_norm": 0.16117016971111298, "learning_rate": 0.00011507931054172578, "loss": 0.4655, "step": 3377 }, { "epoch": 1.0138055222088835, "grad_norm": 0.14418861269950867, "learning_rate": 0.00011502753005583022, "loss": 0.4142, "step": 3378 }, { "epoch": 1.0141056422569028, "grad_norm": 0.1503109484910965, "learning_rate": 0.00011497574544730416, "loss": 0.4762, "step": 3379 }, { "epoch": 1.014405762304922, "grad_norm": 0.1379355490207672, "learning_rate": 0.00011492395673035401, "loss": 0.4089, "step": 3380 }, { "epoch": 1.0147058823529411, "grad_norm": 0.1280781477689743, "learning_rate": 0.00011487216391918749, "loss": 0.4092, "step": 3381 }, { "epoch": 1.0150060024009604, "grad_norm": 0.14146004617214203, "learning_rate": 0.00011482036702801329, "loss": 0.4315, "step": 3382 }, { "epoch": 1.0153061224489797, "grad_norm": 0.14633069932460785, "learning_rate": 0.00011476856607104138, "loss": 0.4224, "step": 3383 }, { "epoch": 1.0156062424969987, "grad_norm": 0.13567544519901276, "learning_rate": 0.00011471676106248268, "loss": 0.4246, "step": 3384 }, { "epoch": 1.015906362545018, "grad_norm": 0.14775624871253967, "learning_rate": 0.00011466495201654936, "loss": 0.4343, "step": 3385 }, { "epoch": 1.0162064825930373, "grad_norm": 0.14980390667915344, "learning_rate": 0.00011461313894745458, "loss": 0.414, "step": 3386 }, { "epoch": 1.0165066026410565, "grad_norm": 0.23978058993816376, "learning_rate": 0.00011456132186941276, "loss": 0.5082, "step": 3387 }, { "epoch": 1.0168067226890756, "grad_norm": 0.13374143838882446, "learning_rate": 0.00011450950079663918, "loss": 0.4122, "step": 3388 }, { "epoch": 1.0171068427370948, "grad_norm": 0.1410181224346161, "learning_rate": 0.00011445767574335044, "loss": 0.4113, "step": 3389 }, { "epoch": 1.017406962785114, "grad_norm": 0.13878248631954193, "learning_rate": 0.00011440584672376418, "loss": 0.4088, "step": 3390 }, { "epoch": 1.0177070828331332, "grad_norm": 0.1408509910106659, "learning_rate": 0.00011435401375209904, "loss": 0.4297, "step": 3391 }, { "epoch": 1.0180072028811524, "grad_norm": 0.18739213049411774, "learning_rate": 0.00011430217684257478, "loss": 0.4468, "step": 3392 }, { "epoch": 1.0183073229291717, "grad_norm": 0.14587976038455963, "learning_rate": 0.00011425033600941235, "loss": 0.4673, "step": 3393 }, { "epoch": 1.018607442977191, "grad_norm": 0.12913000583648682, "learning_rate": 0.00011419849126683362, "loss": 0.4014, "step": 3394 }, { "epoch": 1.01890756302521, "grad_norm": 0.14736846089363098, "learning_rate": 0.00011414664262906163, "loss": 0.4622, "step": 3395 }, { "epoch": 1.0192076830732293, "grad_norm": 0.1973484307527542, "learning_rate": 0.00011409479011032045, "loss": 0.4698, "step": 3396 }, { "epoch": 1.0195078031212486, "grad_norm": 0.14890998601913452, "learning_rate": 0.00011404293372483519, "loss": 0.3946, "step": 3397 }, { "epoch": 1.0198079231692676, "grad_norm": 0.1430133879184723, "learning_rate": 0.00011399107348683214, "loss": 0.4599, "step": 3398 }, { "epoch": 1.0201080432172869, "grad_norm": 0.1424548476934433, "learning_rate": 0.00011393920941053846, "loss": 0.4638, "step": 3399 }, { "epoch": 1.0204081632653061, "grad_norm": 0.16244056820869446, "learning_rate": 0.00011388734151018252, "loss": 0.4923, "step": 3400 }, { "epoch": 1.0207082833133254, "grad_norm": 0.17169933021068573, "learning_rate": 0.00011383546979999369, "loss": 0.4545, "step": 3401 }, { "epoch": 1.0210084033613445, "grad_norm": 0.21742023527622223, "learning_rate": 0.00011378359429420238, "loss": 0.4997, "step": 3402 }, { "epoch": 1.0213085234093637, "grad_norm": 0.136476069688797, "learning_rate": 0.00011373171500704001, "loss": 0.3931, "step": 3403 }, { "epoch": 1.021608643457383, "grad_norm": 0.14849358797073364, "learning_rate": 0.00011367983195273907, "loss": 0.4333, "step": 3404 }, { "epoch": 1.0219087635054023, "grad_norm": 0.168882355093956, "learning_rate": 0.0001136279451455331, "loss": 0.4618, "step": 3405 }, { "epoch": 1.0222088835534213, "grad_norm": 0.1467980146408081, "learning_rate": 0.00011357605459965668, "loss": 0.4228, "step": 3406 }, { "epoch": 1.0225090036014406, "grad_norm": 0.16005730628967285, "learning_rate": 0.00011352416032934529, "loss": 0.4384, "step": 3407 }, { "epoch": 1.0228091236494599, "grad_norm": 0.14726972579956055, "learning_rate": 0.00011347226234883564, "loss": 0.4718, "step": 3408 }, { "epoch": 1.023109243697479, "grad_norm": 0.1600033938884735, "learning_rate": 0.0001134203606723653, "loss": 0.5431, "step": 3409 }, { "epoch": 1.0234093637454982, "grad_norm": 0.14114709198474884, "learning_rate": 0.00011336845531417286, "loss": 0.4359, "step": 3410 }, { "epoch": 1.0237094837935174, "grad_norm": 0.14041128754615784, "learning_rate": 0.000113316546288498, "loss": 0.4486, "step": 3411 }, { "epoch": 1.0240096038415367, "grad_norm": 0.14641427993774414, "learning_rate": 0.00011326463360958137, "loss": 0.5054, "step": 3412 }, { "epoch": 1.0243097238895558, "grad_norm": 0.13535800576210022, "learning_rate": 0.00011321271729166462, "loss": 0.4073, "step": 3413 }, { "epoch": 1.024609843937575, "grad_norm": 0.1422286480665207, "learning_rate": 0.00011316079734899039, "loss": 0.4306, "step": 3414 }, { "epoch": 1.0249099639855943, "grad_norm": 0.1431799978017807, "learning_rate": 0.0001131088737958023, "loss": 0.4733, "step": 3415 }, { "epoch": 1.0252100840336134, "grad_norm": 0.15455827116966248, "learning_rate": 0.00011305694664634498, "loss": 0.4936, "step": 3416 }, { "epoch": 1.0255102040816326, "grad_norm": 0.137307807803154, "learning_rate": 0.00011300501591486409, "loss": 0.4496, "step": 3417 }, { "epoch": 1.025810324129652, "grad_norm": 0.1682182252407074, "learning_rate": 0.00011295308161560623, "loss": 0.4657, "step": 3418 }, { "epoch": 1.0261104441776712, "grad_norm": 0.15309010446071625, "learning_rate": 0.00011290114376281893, "loss": 0.4313, "step": 3419 }, { "epoch": 1.0264105642256902, "grad_norm": 0.1386784464120865, "learning_rate": 0.00011284920237075076, "loss": 0.4212, "step": 3420 }, { "epoch": 1.0267106842737095, "grad_norm": 0.14624737203121185, "learning_rate": 0.00011279725745365128, "loss": 0.4283, "step": 3421 }, { "epoch": 1.0270108043217288, "grad_norm": 0.13789516687393188, "learning_rate": 0.00011274530902577093, "loss": 0.431, "step": 3422 }, { "epoch": 1.0273109243697478, "grad_norm": 0.1470216065645218, "learning_rate": 0.00011269335710136122, "loss": 0.4821, "step": 3423 }, { "epoch": 1.027611044417767, "grad_norm": 0.1370469629764557, "learning_rate": 0.00011264140169467455, "loss": 0.4342, "step": 3424 }, { "epoch": 1.0279111644657863, "grad_norm": 0.1463860720396042, "learning_rate": 0.00011258944281996424, "loss": 0.4563, "step": 3425 }, { "epoch": 1.0282112845138056, "grad_norm": 0.15660445392131805, "learning_rate": 0.00011253748049148466, "loss": 0.4593, "step": 3426 }, { "epoch": 1.0285114045618247, "grad_norm": 0.1462990790605545, "learning_rate": 0.00011248551472349107, "loss": 0.4642, "step": 3427 }, { "epoch": 1.028811524609844, "grad_norm": 0.13824063539505005, "learning_rate": 0.0001124335455302397, "loss": 0.4194, "step": 3428 }, { "epoch": 1.0291116446578632, "grad_norm": 0.16063570976257324, "learning_rate": 0.00011238157292598768, "loss": 0.4513, "step": 3429 }, { "epoch": 1.0294117647058822, "grad_norm": 0.13024139404296875, "learning_rate": 0.00011232959692499308, "loss": 0.4132, "step": 3430 }, { "epoch": 1.0297118847539015, "grad_norm": 0.1586381196975708, "learning_rate": 0.00011227761754151495, "loss": 0.5055, "step": 3431 }, { "epoch": 1.0300120048019208, "grad_norm": 0.13493409752845764, "learning_rate": 0.00011222563478981325, "loss": 0.3771, "step": 3432 }, { "epoch": 1.03031212484994, "grad_norm": 0.2602766752243042, "learning_rate": 0.00011217364868414883, "loss": 0.4896, "step": 3433 }, { "epoch": 1.030612244897959, "grad_norm": 0.14443141222000122, "learning_rate": 0.00011212165923878348, "loss": 0.446, "step": 3434 }, { "epoch": 1.0309123649459784, "grad_norm": 0.1360771805047989, "learning_rate": 0.0001120696664679799, "loss": 0.415, "step": 3435 }, { "epoch": 1.0312124849939976, "grad_norm": 0.1296030730009079, "learning_rate": 0.00011201767038600172, "loss": 0.3967, "step": 3436 }, { "epoch": 1.0315126050420167, "grad_norm": 0.15890447795391083, "learning_rate": 0.00011196567100711348, "loss": 0.427, "step": 3437 }, { "epoch": 1.031812725090036, "grad_norm": 0.13646657764911652, "learning_rate": 0.00011191366834558062, "loss": 0.4204, "step": 3438 }, { "epoch": 1.0321128451380552, "grad_norm": 0.13108712434768677, "learning_rate": 0.00011186166241566944, "loss": 0.4104, "step": 3439 }, { "epoch": 1.0324129651860745, "grad_norm": 0.12920904159545898, "learning_rate": 0.00011180965323164719, "loss": 0.4139, "step": 3440 }, { "epoch": 1.0327130852340936, "grad_norm": 0.137689471244812, "learning_rate": 0.00011175764080778197, "loss": 0.4572, "step": 3441 }, { "epoch": 1.0330132052821128, "grad_norm": 0.14051711559295654, "learning_rate": 0.00011170562515834285, "loss": 0.4512, "step": 3442 }, { "epoch": 1.033313325330132, "grad_norm": 0.13927625119686127, "learning_rate": 0.00011165360629759969, "loss": 0.4003, "step": 3443 }, { "epoch": 1.0336134453781514, "grad_norm": 0.13879626989364624, "learning_rate": 0.00011160158423982326, "loss": 0.4416, "step": 3444 }, { "epoch": 1.0339135654261704, "grad_norm": 0.1270892471075058, "learning_rate": 0.00011154955899928521, "loss": 0.4213, "step": 3445 }, { "epoch": 1.0342136854741897, "grad_norm": 0.14068500697612762, "learning_rate": 0.0001114975305902581, "loss": 0.45, "step": 3446 }, { "epoch": 1.034513805522209, "grad_norm": 0.1559022068977356, "learning_rate": 0.00011144549902701528, "loss": 0.4523, "step": 3447 }, { "epoch": 1.034813925570228, "grad_norm": 0.15158261358737946, "learning_rate": 0.00011139346432383109, "loss": 0.4757, "step": 3448 }, { "epoch": 1.0351140456182473, "grad_norm": 0.13974933326244354, "learning_rate": 0.00011134142649498056, "loss": 0.4278, "step": 3449 }, { "epoch": 1.0354141656662665, "grad_norm": 0.1296132653951645, "learning_rate": 0.00011128938555473976, "loss": 0.395, "step": 3450 }, { "epoch": 1.0357142857142858, "grad_norm": 0.14743031561374664, "learning_rate": 0.00011123734151738548, "loss": 0.4368, "step": 3451 }, { "epoch": 1.0360144057623049, "grad_norm": 0.14494973421096802, "learning_rate": 0.00011118529439719538, "loss": 0.4219, "step": 3452 }, { "epoch": 1.0363145258103241, "grad_norm": 0.16061517596244812, "learning_rate": 0.00011113324420844801, "loss": 0.4366, "step": 3453 }, { "epoch": 1.0366146458583434, "grad_norm": 0.18124105036258698, "learning_rate": 0.00011108119096542283, "loss": 0.4265, "step": 3454 }, { "epoch": 1.0369147659063624, "grad_norm": 0.14375890791416168, "learning_rate": 0.00011102913468239989, "loss": 0.4499, "step": 3455 }, { "epoch": 1.0372148859543817, "grad_norm": 0.2264331877231598, "learning_rate": 0.00011097707537366036, "loss": 0.4447, "step": 3456 }, { "epoch": 1.037515006002401, "grad_norm": 0.18140441179275513, "learning_rate": 0.00011092501305348604, "loss": 0.4455, "step": 3457 }, { "epoch": 1.0378151260504203, "grad_norm": 0.15313535928726196, "learning_rate": 0.00011087294773615968, "loss": 0.4656, "step": 3458 }, { "epoch": 1.0381152460984393, "grad_norm": 0.14016591012477875, "learning_rate": 0.00011082087943596479, "loss": 0.4171, "step": 3459 }, { "epoch": 1.0384153661464586, "grad_norm": 0.1311662644147873, "learning_rate": 0.00011076880816718569, "loss": 0.4179, "step": 3460 }, { "epoch": 1.0387154861944778, "grad_norm": 0.16516633331775665, "learning_rate": 0.00011071673394410756, "loss": 0.4836, "step": 3461 }, { "epoch": 1.039015606242497, "grad_norm": 0.12876063585281372, "learning_rate": 0.00011066465678101637, "loss": 0.389, "step": 3462 }, { "epoch": 1.0393157262905162, "grad_norm": 0.1513228714466095, "learning_rate": 0.00011061257669219884, "loss": 0.4861, "step": 3463 }, { "epoch": 1.0396158463385354, "grad_norm": 0.1448003500699997, "learning_rate": 0.00011056049369194262, "loss": 0.454, "step": 3464 }, { "epoch": 1.0399159663865547, "grad_norm": 0.14984130859375, "learning_rate": 0.00011050840779453602, "loss": 0.4777, "step": 3465 }, { "epoch": 1.0402160864345738, "grad_norm": 0.1330723762512207, "learning_rate": 0.00011045631901426828, "loss": 0.4309, "step": 3466 }, { "epoch": 1.040516206482593, "grad_norm": 0.1514752209186554, "learning_rate": 0.00011040422736542928, "loss": 0.4794, "step": 3467 }, { "epoch": 1.0408163265306123, "grad_norm": 0.15264540910720825, "learning_rate": 0.0001103521328623098, "loss": 0.4107, "step": 3468 }, { "epoch": 1.0411164465786316, "grad_norm": 0.1378517597913742, "learning_rate": 0.0001103000355192014, "loss": 0.4224, "step": 3469 }, { "epoch": 1.0414165666266506, "grad_norm": 0.16188235580921173, "learning_rate": 0.00011024793535039634, "loss": 0.4319, "step": 3470 }, { "epoch": 1.0417166866746699, "grad_norm": 0.13926754891872406, "learning_rate": 0.00011019583237018773, "loss": 0.4563, "step": 3471 }, { "epoch": 1.0420168067226891, "grad_norm": 0.15727829933166504, "learning_rate": 0.00011014372659286943, "loss": 0.5049, "step": 3472 }, { "epoch": 1.0423169267707082, "grad_norm": 0.15464873611927032, "learning_rate": 0.00011009161803273607, "loss": 0.5784, "step": 3473 }, { "epoch": 1.0426170468187275, "grad_norm": 0.14383083581924438, "learning_rate": 0.00011003950670408296, "loss": 0.4512, "step": 3474 }, { "epoch": 1.0429171668667467, "grad_norm": 0.14154858887195587, "learning_rate": 0.00010998739262120634, "loss": 0.4304, "step": 3475 }, { "epoch": 1.043217286914766, "grad_norm": 0.14988425374031067, "learning_rate": 0.00010993527579840309, "loss": 0.4911, "step": 3476 }, { "epoch": 1.043517406962785, "grad_norm": 0.1370452344417572, "learning_rate": 0.00010988315624997083, "loss": 0.4377, "step": 3477 }, { "epoch": 1.0438175270108043, "grad_norm": 0.13611678779125214, "learning_rate": 0.00010983103399020797, "loss": 0.4611, "step": 3478 }, { "epoch": 1.0441176470588236, "grad_norm": 0.28860393166542053, "learning_rate": 0.00010977890903341368, "loss": 0.4169, "step": 3479 }, { "epoch": 1.0444177671068426, "grad_norm": 0.13566024601459503, "learning_rate": 0.00010972678139388784, "loss": 0.4358, "step": 3480 }, { "epoch": 1.044717887154862, "grad_norm": 0.1353350579738617, "learning_rate": 0.00010967465108593104, "loss": 0.4122, "step": 3481 }, { "epoch": 1.0450180072028812, "grad_norm": 0.1298576146364212, "learning_rate": 0.00010962251812384465, "loss": 0.361, "step": 3482 }, { "epoch": 1.0453181272509005, "grad_norm": 0.13776518404483795, "learning_rate": 0.00010957038252193075, "loss": 0.3828, "step": 3483 }, { "epoch": 1.0456182472989195, "grad_norm": 0.14902642369270325, "learning_rate": 0.00010951824429449218, "loss": 0.4707, "step": 3484 }, { "epoch": 1.0459183673469388, "grad_norm": 0.1484086811542511, "learning_rate": 0.00010946610345583237, "loss": 0.4041, "step": 3485 }, { "epoch": 1.046218487394958, "grad_norm": 0.15604808926582336, "learning_rate": 0.00010941396002025565, "loss": 0.4443, "step": 3486 }, { "epoch": 1.046518607442977, "grad_norm": 0.1303928792476654, "learning_rate": 0.00010936181400206694, "loss": 0.3916, "step": 3487 }, { "epoch": 1.0468187274909964, "grad_norm": 0.13681869208812714, "learning_rate": 0.00010930966541557192, "loss": 0.4359, "step": 3488 }, { "epoch": 1.0471188475390156, "grad_norm": 0.15033277869224548, "learning_rate": 0.00010925751427507691, "loss": 0.4903, "step": 3489 }, { "epoch": 1.047418967587035, "grad_norm": 0.12609606981277466, "learning_rate": 0.00010920536059488904, "loss": 0.3909, "step": 3490 }, { "epoch": 1.047719087635054, "grad_norm": 0.13723771274089813, "learning_rate": 0.00010915320438931602, "loss": 0.413, "step": 3491 }, { "epoch": 1.0480192076830732, "grad_norm": 0.1487082540988922, "learning_rate": 0.00010910104567266637, "loss": 0.4592, "step": 3492 }, { "epoch": 1.0483193277310925, "grad_norm": 0.13834303617477417, "learning_rate": 0.00010904888445924917, "loss": 0.4248, "step": 3493 }, { "epoch": 1.0486194477791115, "grad_norm": 0.1379755735397339, "learning_rate": 0.00010899672076337429, "loss": 0.455, "step": 3494 }, { "epoch": 1.0489195678271308, "grad_norm": 0.21014422178268433, "learning_rate": 0.00010894455459935222, "loss": 0.4537, "step": 3495 }, { "epoch": 1.04921968787515, "grad_norm": 0.14388030767440796, "learning_rate": 0.00010889238598149418, "loss": 0.4194, "step": 3496 }, { "epoch": 1.0495198079231693, "grad_norm": 0.12824735045433044, "learning_rate": 0.00010884021492411196, "loss": 0.4114, "step": 3497 }, { "epoch": 1.0498199279711884, "grad_norm": 0.15317986905574799, "learning_rate": 0.0001087880414415182, "loss": 0.4266, "step": 3498 }, { "epoch": 1.0501200480192077, "grad_norm": 0.1715250462293625, "learning_rate": 0.00010873586554802602, "loss": 0.4401, "step": 3499 }, { "epoch": 1.050420168067227, "grad_norm": 0.17429319024085999, "learning_rate": 0.00010868368725794928, "loss": 0.4447, "step": 3500 }, { "epoch": 1.0507202881152462, "grad_norm": 0.14579427242279053, "learning_rate": 0.00010863150658560255, "loss": 0.4341, "step": 3501 }, { "epoch": 1.0510204081632653, "grad_norm": 0.16881610453128815, "learning_rate": 0.00010857932354530092, "loss": 0.4351, "step": 3502 }, { "epoch": 1.0513205282112845, "grad_norm": 0.1375623643398285, "learning_rate": 0.0001085271381513603, "loss": 0.4415, "step": 3503 }, { "epoch": 1.0516206482593038, "grad_norm": 0.24451765418052673, "learning_rate": 0.00010847495041809705, "loss": 0.4476, "step": 3504 }, { "epoch": 1.0519207683073228, "grad_norm": 0.1336793452501297, "learning_rate": 0.00010842276035982836, "loss": 0.437, "step": 3505 }, { "epoch": 1.052220888355342, "grad_norm": 0.1505599468946457, "learning_rate": 0.00010837056799087193, "loss": 0.4668, "step": 3506 }, { "epoch": 1.0525210084033614, "grad_norm": 0.1421106457710266, "learning_rate": 0.00010831837332554619, "loss": 0.4106, "step": 3507 }, { "epoch": 1.0528211284513807, "grad_norm": 0.14475306868553162, "learning_rate": 0.00010826617637817007, "loss": 0.4465, "step": 3508 }, { "epoch": 1.0531212484993997, "grad_norm": 0.1494123488664627, "learning_rate": 0.00010821397716306328, "loss": 0.449, "step": 3509 }, { "epoch": 1.053421368547419, "grad_norm": 0.153250589966774, "learning_rate": 0.000108161775694546, "loss": 0.4954, "step": 3510 }, { "epoch": 1.0537214885954382, "grad_norm": 0.13658291101455688, "learning_rate": 0.00010810957198693921, "loss": 0.4306, "step": 3511 }, { "epoch": 1.0540216086434573, "grad_norm": 0.19465994834899902, "learning_rate": 0.00010805736605456426, "loss": 0.4279, "step": 3512 }, { "epoch": 1.0543217286914766, "grad_norm": 0.14101290702819824, "learning_rate": 0.00010800515791174337, "loss": 0.467, "step": 3513 }, { "epoch": 1.0546218487394958, "grad_norm": 0.14020851254463196, "learning_rate": 0.0001079529475727992, "loss": 0.4168, "step": 3514 }, { "epoch": 1.054921968787515, "grad_norm": 0.17031557857990265, "learning_rate": 0.00010790073505205505, "loss": 0.4145, "step": 3515 }, { "epoch": 1.0552220888355341, "grad_norm": 0.13163962960243225, "learning_rate": 0.00010784852036383481, "loss": 0.3924, "step": 3516 }, { "epoch": 1.0555222088835534, "grad_norm": 0.13243068754673004, "learning_rate": 0.00010779630352246302, "loss": 0.4122, "step": 3517 }, { "epoch": 1.0558223289315727, "grad_norm": 0.16340062022209167, "learning_rate": 0.00010774408454226477, "loss": 0.4847, "step": 3518 }, { "epoch": 1.0561224489795917, "grad_norm": 0.1362360268831253, "learning_rate": 0.00010769186343756572, "loss": 0.3993, "step": 3519 }, { "epoch": 1.056422569027611, "grad_norm": 0.14988327026367188, "learning_rate": 0.00010763964022269213, "loss": 0.4475, "step": 3520 }, { "epoch": 1.0567226890756303, "grad_norm": 0.1353144347667694, "learning_rate": 0.00010758741491197081, "loss": 0.4178, "step": 3521 }, { "epoch": 1.0570228091236495, "grad_norm": 0.13179272413253784, "learning_rate": 0.00010753518751972927, "loss": 0.3937, "step": 3522 }, { "epoch": 1.0573229291716686, "grad_norm": 0.1460052877664566, "learning_rate": 0.0001074829580602954, "loss": 0.403, "step": 3523 }, { "epoch": 1.0576230492196879, "grad_norm": 0.14756572246551514, "learning_rate": 0.0001074307265479978, "loss": 0.4483, "step": 3524 }, { "epoch": 1.0579231692677071, "grad_norm": 0.15488174557685852, "learning_rate": 0.00010737849299716555, "loss": 0.4469, "step": 3525 }, { "epoch": 1.0582232893157264, "grad_norm": 0.14268824458122253, "learning_rate": 0.00010732625742212842, "loss": 0.4326, "step": 3526 }, { "epoch": 1.0585234093637454, "grad_norm": 0.13396058976650238, "learning_rate": 0.00010727401983721652, "loss": 0.4401, "step": 3527 }, { "epoch": 1.0588235294117647, "grad_norm": 0.1576491892337799, "learning_rate": 0.00010722178025676069, "loss": 0.5262, "step": 3528 }, { "epoch": 1.059123649459784, "grad_norm": 0.14710913598537445, "learning_rate": 0.00010716953869509228, "loss": 0.4711, "step": 3529 }, { "epoch": 1.059423769507803, "grad_norm": 0.1549479067325592, "learning_rate": 0.00010711729516654311, "loss": 0.445, "step": 3530 }, { "epoch": 1.0597238895558223, "grad_norm": 0.14734241366386414, "learning_rate": 0.00010706504968544564, "loss": 0.4224, "step": 3531 }, { "epoch": 1.0600240096038416, "grad_norm": 0.13729891180992126, "learning_rate": 0.0001070128022661328, "loss": 0.3868, "step": 3532 }, { "epoch": 1.0603241296518608, "grad_norm": 0.1420051008462906, "learning_rate": 0.00010696055292293805, "loss": 0.3858, "step": 3533 }, { "epoch": 1.06062424969988, "grad_norm": 0.13650977611541748, "learning_rate": 0.00010690830167019546, "loss": 0.434, "step": 3534 }, { "epoch": 1.0609243697478992, "grad_norm": 0.14545617997646332, "learning_rate": 0.00010685604852223947, "loss": 0.4619, "step": 3535 }, { "epoch": 1.0612244897959184, "grad_norm": 0.13201873004436493, "learning_rate": 0.00010680379349340522, "loss": 0.3932, "step": 3536 }, { "epoch": 1.0615246098439375, "grad_norm": 0.15978311002254486, "learning_rate": 0.00010675153659802824, "loss": 0.4167, "step": 3537 }, { "epoch": 1.0618247298919568, "grad_norm": 0.13253796100616455, "learning_rate": 0.0001066992778504446, "loss": 0.4082, "step": 3538 }, { "epoch": 1.062124849939976, "grad_norm": 0.15104475617408752, "learning_rate": 0.00010664701726499091, "loss": 0.4454, "step": 3539 }, { "epoch": 1.0624249699879953, "grad_norm": 0.14183232188224792, "learning_rate": 0.00010659475485600423, "loss": 0.388, "step": 3540 }, { "epoch": 1.0627250900360143, "grad_norm": 0.1418931782245636, "learning_rate": 0.0001065424906378222, "loss": 0.4257, "step": 3541 }, { "epoch": 1.0630252100840336, "grad_norm": 0.24038948118686676, "learning_rate": 0.00010649022462478286, "loss": 0.4164, "step": 3542 }, { "epoch": 1.0633253301320529, "grad_norm": 0.13288667798042297, "learning_rate": 0.00010643795683122485, "loss": 0.3729, "step": 3543 }, { "epoch": 1.063625450180072, "grad_norm": 0.1481071263551712, "learning_rate": 0.00010638568727148716, "loss": 0.457, "step": 3544 }, { "epoch": 1.0639255702280912, "grad_norm": 0.1327173411846161, "learning_rate": 0.00010633341595990945, "loss": 0.4079, "step": 3545 }, { "epoch": 1.0642256902761105, "grad_norm": 0.14498308300971985, "learning_rate": 0.00010628114291083163, "loss": 0.4737, "step": 3546 }, { "epoch": 1.0645258103241297, "grad_norm": 0.13920772075653076, "learning_rate": 0.0001062288681385943, "loss": 0.4698, "step": 3547 }, { "epoch": 1.0648259303721488, "grad_norm": 0.1382630616426468, "learning_rate": 0.00010617659165753844, "loss": 0.4414, "step": 3548 }, { "epoch": 1.065126050420168, "grad_norm": 0.14103002846240997, "learning_rate": 0.00010612431348200547, "loss": 0.4357, "step": 3549 }, { "epoch": 1.0654261704681873, "grad_norm": 0.14149099588394165, "learning_rate": 0.00010607203362633728, "loss": 0.4519, "step": 3550 }, { "epoch": 1.0657262905162064, "grad_norm": 0.13301125168800354, "learning_rate": 0.00010601975210487633, "loss": 0.4205, "step": 3551 }, { "epoch": 1.0660264105642256, "grad_norm": 0.17028430104255676, "learning_rate": 0.00010596746893196543, "loss": 0.4366, "step": 3552 }, { "epoch": 1.066326530612245, "grad_norm": 0.15449634194374084, "learning_rate": 0.00010591518412194784, "loss": 0.4861, "step": 3553 }, { "epoch": 1.0666266506602642, "grad_norm": 0.14159183204174042, "learning_rate": 0.00010586289768916729, "loss": 0.4363, "step": 3554 }, { "epoch": 1.0669267707082832, "grad_norm": 0.2055792510509491, "learning_rate": 0.000105810609647968, "loss": 0.4225, "step": 3555 }, { "epoch": 1.0672268907563025, "grad_norm": 0.14333371818065643, "learning_rate": 0.0001057583200126946, "loss": 0.4095, "step": 3556 }, { "epoch": 1.0675270108043218, "grad_norm": 0.1322149783372879, "learning_rate": 0.00010570602879769213, "loss": 0.4055, "step": 3557 }, { "epoch": 1.0678271308523408, "grad_norm": 0.15440814197063446, "learning_rate": 0.00010565373601730606, "loss": 0.4626, "step": 3558 }, { "epoch": 1.06812725090036, "grad_norm": 0.23785772919654846, "learning_rate": 0.00010560144168588237, "loss": 0.4903, "step": 3559 }, { "epoch": 1.0684273709483794, "grad_norm": 0.15003551542758942, "learning_rate": 0.00010554914581776738, "loss": 0.4466, "step": 3560 }, { "epoch": 1.0687274909963986, "grad_norm": 0.1422703117132187, "learning_rate": 0.00010549684842730787, "loss": 0.388, "step": 3561 }, { "epoch": 1.0690276110444177, "grad_norm": 0.15164245665073395, "learning_rate": 0.00010544454952885101, "loss": 0.4697, "step": 3562 }, { "epoch": 1.069327731092437, "grad_norm": 0.15820443630218506, "learning_rate": 0.00010539224913674442, "loss": 0.4113, "step": 3563 }, { "epoch": 1.0696278511404562, "grad_norm": 0.1455969363451004, "learning_rate": 0.00010533994726533612, "loss": 0.4653, "step": 3564 }, { "epoch": 1.0699279711884755, "grad_norm": 0.1496390551328659, "learning_rate": 0.0001052876439289745, "loss": 0.4244, "step": 3565 }, { "epoch": 1.0702280912364945, "grad_norm": 0.351198673248291, "learning_rate": 0.0001052353391420084, "loss": 0.5013, "step": 3566 }, { "epoch": 1.0705282112845138, "grad_norm": 0.14300662279129028, "learning_rate": 0.00010518303291878707, "loss": 0.3909, "step": 3567 }, { "epoch": 1.070828331332533, "grad_norm": 0.12167440354824066, "learning_rate": 0.00010513072527366006, "loss": 0.3628, "step": 3568 }, { "epoch": 1.0711284513805521, "grad_norm": 0.1389562040567398, "learning_rate": 0.00010507841622097739, "loss": 0.4408, "step": 3569 }, { "epoch": 1.0714285714285714, "grad_norm": 0.15119138360023499, "learning_rate": 0.00010502610577508949, "loss": 0.4429, "step": 3570 }, { "epoch": 1.0717286914765907, "grad_norm": 0.17108459770679474, "learning_rate": 0.0001049737939503471, "loss": 0.4332, "step": 3571 }, { "epoch": 1.07202881152461, "grad_norm": 0.1357976347208023, "learning_rate": 0.00010492148076110136, "loss": 0.3945, "step": 3572 }, { "epoch": 1.072328931572629, "grad_norm": 0.12482724338769913, "learning_rate": 0.0001048691662217038, "loss": 0.3649, "step": 3573 }, { "epoch": 1.0726290516206483, "grad_norm": 0.1527443528175354, "learning_rate": 0.00010481685034650632, "loss": 0.4663, "step": 3574 }, { "epoch": 1.0729291716686675, "grad_norm": 0.14375494420528412, "learning_rate": 0.00010476453314986122, "loss": 0.4593, "step": 3575 }, { "epoch": 1.0732292917166866, "grad_norm": 0.13201279938220978, "learning_rate": 0.00010471221464612104, "loss": 0.4413, "step": 3576 }, { "epoch": 1.0735294117647058, "grad_norm": 0.15472981333732605, "learning_rate": 0.00010465989484963881, "loss": 0.4245, "step": 3577 }, { "epoch": 1.0738295318127251, "grad_norm": 0.1417369395494461, "learning_rate": 0.00010460757377476792, "loss": 0.4461, "step": 3578 }, { "epoch": 1.0741296518607444, "grad_norm": 0.12595735490322113, "learning_rate": 0.00010455525143586191, "loss": 0.3612, "step": 3579 }, { "epoch": 1.0744297719087634, "grad_norm": 0.1501154899597168, "learning_rate": 0.00010450292784727496, "loss": 0.4561, "step": 3580 }, { "epoch": 1.0747298919567827, "grad_norm": 0.1509701907634735, "learning_rate": 0.00010445060302336137, "loss": 0.4871, "step": 3581 }, { "epoch": 1.075030012004802, "grad_norm": 0.14055953919887543, "learning_rate": 0.00010439827697847587, "loss": 0.4506, "step": 3582 }, { "epoch": 1.0753301320528212, "grad_norm": 0.15558786690235138, "learning_rate": 0.00010434594972697352, "loss": 0.457, "step": 3583 }, { "epoch": 1.0756302521008403, "grad_norm": 0.14152899384498596, "learning_rate": 0.00010429362128320968, "loss": 0.4506, "step": 3584 }, { "epoch": 1.0759303721488596, "grad_norm": 0.17807260155677795, "learning_rate": 0.00010424129166154009, "loss": 0.474, "step": 3585 }, { "epoch": 1.0762304921968788, "grad_norm": 0.1473320573568344, "learning_rate": 0.00010418896087632077, "loss": 0.4387, "step": 3586 }, { "epoch": 1.0765306122448979, "grad_norm": 0.156795471906662, "learning_rate": 0.00010413662894190806, "loss": 0.4749, "step": 3587 }, { "epoch": 1.0768307322929171, "grad_norm": 0.2006545066833496, "learning_rate": 0.00010408429587265862, "loss": 0.4646, "step": 3588 }, { "epoch": 1.0771308523409364, "grad_norm": 0.14439736306667328, "learning_rate": 0.00010403196168292945, "loss": 0.4322, "step": 3589 }, { "epoch": 1.0774309723889557, "grad_norm": 0.15489919483661652, "learning_rate": 0.00010397962638707783, "loss": 0.4541, "step": 3590 }, { "epoch": 1.0777310924369747, "grad_norm": 0.13993416726589203, "learning_rate": 0.00010392728999946136, "loss": 0.4179, "step": 3591 }, { "epoch": 1.078031212484994, "grad_norm": 0.13038261234760284, "learning_rate": 0.00010387495253443787, "loss": 0.3982, "step": 3592 }, { "epoch": 1.0783313325330133, "grad_norm": 0.1442558765411377, "learning_rate": 0.00010382261400636563, "loss": 0.4591, "step": 3593 }, { "epoch": 1.0786314525810323, "grad_norm": 0.17982600629329681, "learning_rate": 0.00010377027442960303, "loss": 0.4187, "step": 3594 }, { "epoch": 1.0789315726290516, "grad_norm": 0.1430574506521225, "learning_rate": 0.0001037179338185089, "loss": 0.4561, "step": 3595 }, { "epoch": 1.0792316926770709, "grad_norm": 0.13929946720600128, "learning_rate": 0.00010366559218744224, "loss": 0.4253, "step": 3596 }, { "epoch": 1.0795318127250901, "grad_norm": 0.14808815717697144, "learning_rate": 0.00010361324955076242, "loss": 0.4227, "step": 3597 }, { "epoch": 1.0798319327731092, "grad_norm": 0.1456451565027237, "learning_rate": 0.00010356090592282899, "loss": 0.4738, "step": 3598 }, { "epoch": 1.0801320528211285, "grad_norm": 0.14544235169887543, "learning_rate": 0.00010350856131800186, "loss": 0.4524, "step": 3599 }, { "epoch": 1.0804321728691477, "grad_norm": 0.13172706961631775, "learning_rate": 0.00010345621575064117, "loss": 0.407, "step": 3600 }, { "epoch": 1.0807322929171668, "grad_norm": 0.14873874187469482, "learning_rate": 0.00010340386923510733, "loss": 0.4285, "step": 3601 }, { "epoch": 1.081032412965186, "grad_norm": 0.12801893055438995, "learning_rate": 0.00010335152178576095, "loss": 0.3911, "step": 3602 }, { "epoch": 1.0813325330132053, "grad_norm": 0.14230036735534668, "learning_rate": 0.00010329917341696304, "loss": 0.4634, "step": 3603 }, { "epoch": 1.0816326530612246, "grad_norm": 0.14611704647541046, "learning_rate": 0.00010324682414307471, "loss": 0.4463, "step": 3604 }, { "epoch": 1.0819327731092436, "grad_norm": 0.1437060534954071, "learning_rate": 0.00010319447397845745, "loss": 0.4216, "step": 3605 }, { "epoch": 1.082232893157263, "grad_norm": 0.14681190252304077, "learning_rate": 0.00010314212293747285, "loss": 0.4739, "step": 3606 }, { "epoch": 1.0825330132052822, "grad_norm": 0.15303242206573486, "learning_rate": 0.00010308977103448283, "loss": 0.4153, "step": 3607 }, { "epoch": 1.0828331332533012, "grad_norm": 0.16087490320205688, "learning_rate": 0.00010303741828384961, "loss": 0.4687, "step": 3608 }, { "epoch": 1.0831332533013205, "grad_norm": 0.12915025651454926, "learning_rate": 0.00010298506469993548, "loss": 0.3658, "step": 3609 }, { "epoch": 1.0834333733493398, "grad_norm": 0.12952394783496857, "learning_rate": 0.00010293271029710307, "loss": 0.3763, "step": 3610 }, { "epoch": 1.083733493397359, "grad_norm": 0.13518524169921875, "learning_rate": 0.00010288035508971523, "loss": 0.351, "step": 3611 }, { "epoch": 1.084033613445378, "grad_norm": 0.13607051968574524, "learning_rate": 0.000102827999092135, "loss": 0.4144, "step": 3612 }, { "epoch": 1.0843337334933973, "grad_norm": 0.13719278573989868, "learning_rate": 0.00010277564231872565, "loss": 0.4109, "step": 3613 }, { "epoch": 1.0846338535414166, "grad_norm": 0.14822518825531006, "learning_rate": 0.00010272328478385065, "loss": 0.477, "step": 3614 }, { "epoch": 1.0849339735894357, "grad_norm": 0.14354124665260315, "learning_rate": 0.0001026709265018737, "loss": 0.4505, "step": 3615 }, { "epoch": 1.085234093637455, "grad_norm": 0.15144415199756622, "learning_rate": 0.0001026185674871587, "loss": 0.4718, "step": 3616 }, { "epoch": 1.0855342136854742, "grad_norm": 0.13325156271457672, "learning_rate": 0.0001025662077540697, "loss": 0.406, "step": 3617 }, { "epoch": 1.0858343337334935, "grad_norm": 0.12941822409629822, "learning_rate": 0.00010251384731697106, "loss": 0.3857, "step": 3618 }, { "epoch": 1.0861344537815125, "grad_norm": 0.13992074131965637, "learning_rate": 0.00010246148619022722, "loss": 0.4318, "step": 3619 }, { "epoch": 1.0864345738295318, "grad_norm": 0.17397478222846985, "learning_rate": 0.00010240912438820289, "loss": 0.4922, "step": 3620 }, { "epoch": 1.086734693877551, "grad_norm": 0.14126791059970856, "learning_rate": 0.00010235676192526289, "loss": 0.4282, "step": 3621 }, { "epoch": 1.0870348139255703, "grad_norm": 0.13610020279884338, "learning_rate": 0.00010230439881577229, "loss": 0.3859, "step": 3622 }, { "epoch": 1.0873349339735894, "grad_norm": 0.1584099680185318, "learning_rate": 0.00010225203507409629, "loss": 0.5093, "step": 3623 }, { "epoch": 1.0876350540216086, "grad_norm": 0.15142543613910675, "learning_rate": 0.0001021996707146003, "loss": 0.4195, "step": 3624 }, { "epoch": 1.087935174069628, "grad_norm": 0.15312343835830688, "learning_rate": 0.00010214730575164988, "loss": 0.4334, "step": 3625 }, { "epoch": 1.088235294117647, "grad_norm": 0.13882726430892944, "learning_rate": 0.0001020949401996107, "loss": 0.4498, "step": 3626 }, { "epoch": 1.0885354141656662, "grad_norm": 0.14910578727722168, "learning_rate": 0.00010204257407284874, "loss": 0.4318, "step": 3627 }, { "epoch": 1.0888355342136855, "grad_norm": 0.14569604396820068, "learning_rate": 0.00010199020738573001, "loss": 0.454, "step": 3628 }, { "epoch": 1.0891356542617048, "grad_norm": 0.17067281901836395, "learning_rate": 0.00010193784015262069, "loss": 0.5436, "step": 3629 }, { "epoch": 1.0894357743097238, "grad_norm": 0.14397454261779785, "learning_rate": 0.00010188547238788713, "loss": 0.4489, "step": 3630 }, { "epoch": 1.089735894357743, "grad_norm": 0.13740958273410797, "learning_rate": 0.00010183310410589589, "loss": 0.4277, "step": 3631 }, { "epoch": 1.0900360144057624, "grad_norm": 0.14416514337062836, "learning_rate": 0.00010178073532101352, "loss": 0.4115, "step": 3632 }, { "epoch": 1.0903361344537814, "grad_norm": 0.12540937960147858, "learning_rate": 0.00010172836604760683, "loss": 0.3482, "step": 3633 }, { "epoch": 1.0906362545018007, "grad_norm": 0.13062594830989838, "learning_rate": 0.00010167599630004271, "loss": 0.4033, "step": 3634 }, { "epoch": 1.09093637454982, "grad_norm": 0.14485786855220795, "learning_rate": 0.0001016236260926883, "loss": 0.4511, "step": 3635 }, { "epoch": 1.0912364945978392, "grad_norm": 0.14354106783866882, "learning_rate": 0.00010157125543991062, "loss": 0.4444, "step": 3636 }, { "epoch": 1.0915366146458583, "grad_norm": 0.15071211755275726, "learning_rate": 0.00010151888435607706, "loss": 0.4128, "step": 3637 }, { "epoch": 1.0918367346938775, "grad_norm": 0.15684175491333008, "learning_rate": 0.000101466512855555, "loss": 0.4231, "step": 3638 }, { "epoch": 1.0921368547418968, "grad_norm": 0.18450690805912018, "learning_rate": 0.00010141414095271193, "loss": 0.449, "step": 3639 }, { "epoch": 1.092436974789916, "grad_norm": 0.15630345046520233, "learning_rate": 0.00010136176866191548, "loss": 0.4481, "step": 3640 }, { "epoch": 1.0927370948379351, "grad_norm": 0.3001805245876312, "learning_rate": 0.00010130939599753346, "loss": 0.4483, "step": 3641 }, { "epoch": 1.0930372148859544, "grad_norm": 0.23306016623973846, "learning_rate": 0.00010125702297393366, "loss": 0.4181, "step": 3642 }, { "epoch": 1.0933373349339737, "grad_norm": 0.14699408411979675, "learning_rate": 0.00010120464960548402, "loss": 0.465, "step": 3643 }, { "epoch": 1.0936374549819927, "grad_norm": 0.14448916912078857, "learning_rate": 0.00010115227590655257, "loss": 0.4315, "step": 3644 }, { "epoch": 1.093937575030012, "grad_norm": 0.13666747510433197, "learning_rate": 0.0001010999018915074, "loss": 0.4295, "step": 3645 }, { "epoch": 1.0942376950780313, "grad_norm": 0.21696338057518005, "learning_rate": 0.0001010475275747168, "loss": 0.4799, "step": 3646 }, { "epoch": 1.0945378151260505, "grad_norm": 0.15575353801250458, "learning_rate": 0.00010099515297054902, "loss": 0.4692, "step": 3647 }, { "epoch": 1.0948379351740696, "grad_norm": 0.14021803438663483, "learning_rate": 0.00010094277809337243, "loss": 0.3929, "step": 3648 }, { "epoch": 1.0951380552220888, "grad_norm": 0.14631427824497223, "learning_rate": 0.00010089040295755546, "loss": 0.4501, "step": 3649 }, { "epoch": 1.0954381752701081, "grad_norm": 0.16138429939746857, "learning_rate": 0.00010083802757746668, "loss": 0.4264, "step": 3650 }, { "epoch": 1.0957382953181272, "grad_norm": 0.1426657736301422, "learning_rate": 0.0001007856519674746, "loss": 0.3811, "step": 3651 }, { "epoch": 1.0960384153661464, "grad_norm": 0.15724453330039978, "learning_rate": 0.00010073327614194794, "loss": 0.425, "step": 3652 }, { "epoch": 1.0963385354141657, "grad_norm": 0.3289673626422882, "learning_rate": 0.0001006809001152554, "loss": 0.3751, "step": 3653 }, { "epoch": 1.096638655462185, "grad_norm": 0.14599741995334625, "learning_rate": 0.00010062852390176569, "loss": 0.448, "step": 3654 }, { "epoch": 1.096938775510204, "grad_norm": 0.15674425661563873, "learning_rate": 0.00010057614751584765, "loss": 0.4618, "step": 3655 }, { "epoch": 1.0972388955582233, "grad_norm": 0.13821865618228912, "learning_rate": 0.00010052377097187015, "loss": 0.3746, "step": 3656 }, { "epoch": 1.0975390156062426, "grad_norm": 0.1998661309480667, "learning_rate": 0.00010047139428420211, "loss": 0.4147, "step": 3657 }, { "epoch": 1.0978391356542616, "grad_norm": 0.15244896709918976, "learning_rate": 0.00010041901746721245, "loss": 0.4808, "step": 3658 }, { "epoch": 1.0981392557022809, "grad_norm": 0.16402991116046906, "learning_rate": 0.00010036664053527012, "loss": 0.405, "step": 3659 }, { "epoch": 1.0984393757503002, "grad_norm": 0.13648836314678192, "learning_rate": 0.0001003142635027442, "loss": 0.4282, "step": 3660 }, { "epoch": 1.0987394957983194, "grad_norm": 0.1392647624015808, "learning_rate": 0.00010026188638400367, "loss": 0.4078, "step": 3661 }, { "epoch": 1.0990396158463385, "grad_norm": 0.1392301619052887, "learning_rate": 0.00010020950919341763, "loss": 0.4208, "step": 3662 }, { "epoch": 1.0993397358943577, "grad_norm": 0.13113875687122345, "learning_rate": 0.00010015713194535512, "loss": 0.4064, "step": 3663 }, { "epoch": 1.099639855942377, "grad_norm": 0.13509242236614227, "learning_rate": 0.00010010475465418527, "loss": 0.395, "step": 3664 }, { "epoch": 1.099939975990396, "grad_norm": 0.14285650849342346, "learning_rate": 0.00010005237733427721, "loss": 0.4583, "step": 3665 }, { "epoch": 1.1002400960384153, "grad_norm": 0.14350396394729614, "learning_rate": 0.0001, "loss": 0.393, "step": 3666 }, { "epoch": 1.1005402160864346, "grad_norm": 0.18313319981098175, "learning_rate": 9.994762266572281e-05, "loss": 0.399, "step": 3667 }, { "epoch": 1.1008403361344539, "grad_norm": 0.1572713553905487, "learning_rate": 9.989524534581471e-05, "loss": 0.5052, "step": 3668 }, { "epoch": 1.101140456182473, "grad_norm": 0.13151343166828156, "learning_rate": 9.98428680546449e-05, "loss": 0.3965, "step": 3669 }, { "epoch": 1.1014405762304922, "grad_norm": 0.15630406141281128, "learning_rate": 9.979049080658242e-05, "loss": 0.463, "step": 3670 }, { "epoch": 1.1017406962785115, "grad_norm": 0.13366857171058655, "learning_rate": 9.973811361599636e-05, "loss": 0.3853, "step": 3671 }, { "epoch": 1.1020408163265305, "grad_norm": 0.13806574046611786, "learning_rate": 9.968573649725583e-05, "loss": 0.4184, "step": 3672 }, { "epoch": 1.1023409363745498, "grad_norm": 0.131994366645813, "learning_rate": 9.963335946472989e-05, "loss": 0.3853, "step": 3673 }, { "epoch": 1.102641056422569, "grad_norm": 0.14549876749515533, "learning_rate": 9.958098253278758e-05, "loss": 0.4407, "step": 3674 }, { "epoch": 1.1029411764705883, "grad_norm": 0.1381799727678299, "learning_rate": 9.952860571579794e-05, "loss": 0.3665, "step": 3675 }, { "epoch": 1.1032412965186074, "grad_norm": 0.1353103667497635, "learning_rate": 9.947622902812984e-05, "loss": 0.4143, "step": 3676 }, { "epoch": 1.1035414165666266, "grad_norm": 0.15333561599254608, "learning_rate": 9.942385248415237e-05, "loss": 0.4198, "step": 3677 }, { "epoch": 1.103841536614646, "grad_norm": 0.13435952365398407, "learning_rate": 9.937147609823434e-05, "loss": 0.4037, "step": 3678 }, { "epoch": 1.104141656662665, "grad_norm": 0.14614462852478027, "learning_rate": 9.931909988474464e-05, "loss": 0.4549, "step": 3679 }, { "epoch": 1.1044417767106842, "grad_norm": 0.14689116179943085, "learning_rate": 9.926672385805207e-05, "loss": 0.3681, "step": 3680 }, { "epoch": 1.1047418967587035, "grad_norm": 0.16367031633853912, "learning_rate": 9.92143480325254e-05, "loss": 0.3984, "step": 3681 }, { "epoch": 1.1050420168067228, "grad_norm": 0.1460021585226059, "learning_rate": 9.916197242253335e-05, "loss": 0.4533, "step": 3682 }, { "epoch": 1.1053421368547418, "grad_norm": 0.1886349767446518, "learning_rate": 9.910959704244458e-05, "loss": 0.434, "step": 3683 }, { "epoch": 1.105642256902761, "grad_norm": 0.1475280523300171, "learning_rate": 9.90572219066276e-05, "loss": 0.4006, "step": 3684 }, { "epoch": 1.1059423769507803, "grad_norm": 0.14980600774288177, "learning_rate": 9.9004847029451e-05, "loss": 0.4208, "step": 3685 }, { "epoch": 1.1062424969987996, "grad_norm": 0.14014779031276703, "learning_rate": 9.895247242528323e-05, "loss": 0.4283, "step": 3686 }, { "epoch": 1.1065426170468187, "grad_norm": 0.1508089154958725, "learning_rate": 9.89000981084926e-05, "loss": 0.4432, "step": 3687 }, { "epoch": 1.106842737094838, "grad_norm": 0.14019598066806793, "learning_rate": 9.884772409344746e-05, "loss": 0.4329, "step": 3688 }, { "epoch": 1.1071428571428572, "grad_norm": 0.12213557958602905, "learning_rate": 9.879535039451603e-05, "loss": 0.3514, "step": 3689 }, { "epoch": 1.1074429771908763, "grad_norm": 0.1391676515340805, "learning_rate": 9.874297702606636e-05, "loss": 0.3951, "step": 3690 }, { "epoch": 1.1077430972388955, "grad_norm": 0.1456574946641922, "learning_rate": 9.869060400246656e-05, "loss": 0.4052, "step": 3691 }, { "epoch": 1.1080432172869148, "grad_norm": 0.15275728702545166, "learning_rate": 9.86382313380845e-05, "loss": 0.482, "step": 3692 }, { "epoch": 1.108343337334934, "grad_norm": 0.1298411637544632, "learning_rate": 9.858585904728809e-05, "loss": 0.4111, "step": 3693 }, { "epoch": 1.1086434573829531, "grad_norm": 0.1593908816576004, "learning_rate": 9.853348714444506e-05, "loss": 0.4284, "step": 3694 }, { "epoch": 1.1089435774309724, "grad_norm": 0.13594958186149597, "learning_rate": 9.848111564392294e-05, "loss": 0.4011, "step": 3695 }, { "epoch": 1.1092436974789917, "grad_norm": 0.1442171335220337, "learning_rate": 9.842874456008939e-05, "loss": 0.4511, "step": 3696 }, { "epoch": 1.1095438175270107, "grad_norm": 0.141292005777359, "learning_rate": 9.837637390731175e-05, "loss": 0.4284, "step": 3697 }, { "epoch": 1.10984393757503, "grad_norm": 0.14179612696170807, "learning_rate": 9.832400369995728e-05, "loss": 0.4402, "step": 3698 }, { "epoch": 1.1101440576230492, "grad_norm": 0.12931428849697113, "learning_rate": 9.82716339523932e-05, "loss": 0.3745, "step": 3699 }, { "epoch": 1.1104441776710685, "grad_norm": 0.13897705078125, "learning_rate": 9.821926467898653e-05, "loss": 0.4134, "step": 3700 }, { "epoch": 1.1107442977190876, "grad_norm": 0.17523273825645447, "learning_rate": 9.816689589410412e-05, "loss": 0.4313, "step": 3701 }, { "epoch": 1.1110444177671068, "grad_norm": 0.19277136027812958, "learning_rate": 9.811452761211288e-05, "loss": 0.4088, "step": 3702 }, { "epoch": 1.111344537815126, "grad_norm": 0.18094953894615173, "learning_rate": 9.806215984737932e-05, "loss": 0.4272, "step": 3703 }, { "epoch": 1.1116446578631454, "grad_norm": 0.1389746069908142, "learning_rate": 9.800979261427001e-05, "loss": 0.4336, "step": 3704 }, { "epoch": 1.1119447779111644, "grad_norm": 0.16211512684822083, "learning_rate": 9.795742592715127e-05, "loss": 0.439, "step": 3705 }, { "epoch": 1.1122448979591837, "grad_norm": 0.17074798047542572, "learning_rate": 9.790505980038928e-05, "loss": 0.4038, "step": 3706 }, { "epoch": 1.112545018007203, "grad_norm": 0.14179076254367828, "learning_rate": 9.785269424835016e-05, "loss": 0.4595, "step": 3707 }, { "epoch": 1.112845138055222, "grad_norm": 0.1443812996149063, "learning_rate": 9.780032928539973e-05, "loss": 0.4738, "step": 3708 }, { "epoch": 1.1131452581032413, "grad_norm": 0.19112515449523926, "learning_rate": 9.774796492590372e-05, "loss": 0.4503, "step": 3709 }, { "epoch": 1.1134453781512605, "grad_norm": 0.14020976424217224, "learning_rate": 9.769560118422773e-05, "loss": 0.444, "step": 3710 }, { "epoch": 1.1137454981992798, "grad_norm": 0.15076801180839539, "learning_rate": 9.76432380747371e-05, "loss": 0.3726, "step": 3711 }, { "epoch": 1.1140456182472989, "grad_norm": 0.14267252385616302, "learning_rate": 9.759087561179712e-05, "loss": 0.4061, "step": 3712 }, { "epoch": 1.1143457382953181, "grad_norm": 0.1448672115802765, "learning_rate": 9.75385138097728e-05, "loss": 0.4274, "step": 3713 }, { "epoch": 1.1146458583433374, "grad_norm": 0.15275657176971436, "learning_rate": 9.748615268302893e-05, "loss": 0.4378, "step": 3714 }, { "epoch": 1.1149459783913565, "grad_norm": 0.15933093428611755, "learning_rate": 9.743379224593032e-05, "loss": 0.4597, "step": 3715 }, { "epoch": 1.1152460984393757, "grad_norm": 0.1405712515115738, "learning_rate": 9.738143251284135e-05, "loss": 0.4389, "step": 3716 }, { "epoch": 1.115546218487395, "grad_norm": 0.13883735239505768, "learning_rate": 9.732907349812632e-05, "loss": 0.4352, "step": 3717 }, { "epoch": 1.1158463385354143, "grad_norm": 0.1645667403936386, "learning_rate": 9.727671521614938e-05, "loss": 0.3979, "step": 3718 }, { "epoch": 1.1161464585834333, "grad_norm": 0.13730087876319885, "learning_rate": 9.72243576812744e-05, "loss": 0.4031, "step": 3719 }, { "epoch": 1.1164465786314526, "grad_norm": 0.13023704290390015, "learning_rate": 9.717200090786501e-05, "loss": 0.3587, "step": 3720 }, { "epoch": 1.1167466986794718, "grad_norm": 0.17127841711044312, "learning_rate": 9.71196449102848e-05, "loss": 0.4851, "step": 3721 }, { "epoch": 1.117046818727491, "grad_norm": 0.1397799402475357, "learning_rate": 9.706728970289695e-05, "loss": 0.3994, "step": 3722 }, { "epoch": 1.1173469387755102, "grad_norm": 0.13015244901180267, "learning_rate": 9.701493530006455e-05, "loss": 0.4152, "step": 3723 }, { "epoch": 1.1176470588235294, "grad_norm": 0.1434255987405777, "learning_rate": 9.696258171615043e-05, "loss": 0.4336, "step": 3724 }, { "epoch": 1.1179471788715487, "grad_norm": 0.15915736556053162, "learning_rate": 9.691022896551715e-05, "loss": 0.4547, "step": 3725 }, { "epoch": 1.1182472989195678, "grad_norm": 0.15358136594295502, "learning_rate": 9.685787706252716e-05, "loss": 0.4333, "step": 3726 }, { "epoch": 1.118547418967587, "grad_norm": 0.1452532857656479, "learning_rate": 9.68055260215426e-05, "loss": 0.4303, "step": 3727 }, { "epoch": 1.1188475390156063, "grad_norm": 0.1438799947500229, "learning_rate": 9.67531758569253e-05, "loss": 0.4472, "step": 3728 }, { "epoch": 1.1191476590636253, "grad_norm": 0.324162095785141, "learning_rate": 9.670082658303698e-05, "loss": 0.4478, "step": 3729 }, { "epoch": 1.1194477791116446, "grad_norm": 0.3205750584602356, "learning_rate": 9.664847821423907e-05, "loss": 0.4231, "step": 3730 }, { "epoch": 1.1197478991596639, "grad_norm": 0.14763224124908447, "learning_rate": 9.65961307648927e-05, "loss": 0.4589, "step": 3731 }, { "epoch": 1.1200480192076832, "grad_norm": 0.13289707899093628, "learning_rate": 9.654378424935885e-05, "loss": 0.3955, "step": 3732 }, { "epoch": 1.1203481392557022, "grad_norm": 0.1398414522409439, "learning_rate": 9.649143868199814e-05, "loss": 0.4155, "step": 3733 }, { "epoch": 1.1206482593037215, "grad_norm": 0.13791011273860931, "learning_rate": 9.643909407717104e-05, "loss": 0.4285, "step": 3734 }, { "epoch": 1.1209483793517407, "grad_norm": 0.5082004070281982, "learning_rate": 9.638675044923763e-05, "loss": 0.4751, "step": 3735 }, { "epoch": 1.1212484993997598, "grad_norm": 0.14391469955444336, "learning_rate": 9.633440781255777e-05, "loss": 0.4325, "step": 3736 }, { "epoch": 1.121548619447779, "grad_norm": 0.1421026885509491, "learning_rate": 9.628206618149113e-05, "loss": 0.4431, "step": 3737 }, { "epoch": 1.1218487394957983, "grad_norm": 0.1363701969385147, "learning_rate": 9.622972557039701e-05, "loss": 0.4173, "step": 3738 }, { "epoch": 1.1221488595438176, "grad_norm": 0.15972846746444702, "learning_rate": 9.617738599363438e-05, "loss": 0.4415, "step": 3739 }, { "epoch": 1.1224489795918366, "grad_norm": 0.1399405300617218, "learning_rate": 9.612504746556215e-05, "loss": 0.4091, "step": 3740 }, { "epoch": 1.122749099639856, "grad_norm": 0.19689854979515076, "learning_rate": 9.607271000053865e-05, "loss": 0.4199, "step": 3741 }, { "epoch": 1.1230492196878752, "grad_norm": 0.14337493479251862, "learning_rate": 9.602037361292218e-05, "loss": 0.4093, "step": 3742 }, { "epoch": 1.1233493397358945, "grad_norm": 0.12252549827098846, "learning_rate": 9.596803831707056e-05, "loss": 0.3608, "step": 3743 }, { "epoch": 1.1236494597839135, "grad_norm": 0.13820312917232513, "learning_rate": 9.591570412734137e-05, "loss": 0.4243, "step": 3744 }, { "epoch": 1.1239495798319328, "grad_norm": 0.13797035813331604, "learning_rate": 9.586337105809195e-05, "loss": 0.4505, "step": 3745 }, { "epoch": 1.124249699879952, "grad_norm": 0.1960890293121338, "learning_rate": 9.581103912367928e-05, "loss": 0.3763, "step": 3746 }, { "epoch": 1.124549819927971, "grad_norm": 0.14223526418209076, "learning_rate": 9.575870833845994e-05, "loss": 0.4511, "step": 3747 }, { "epoch": 1.1248499399759904, "grad_norm": 0.13488635420799255, "learning_rate": 9.570637871679034e-05, "loss": 0.4108, "step": 3748 }, { "epoch": 1.1251500600240096, "grad_norm": 0.1482326090335846, "learning_rate": 9.565405027302652e-05, "loss": 0.434, "step": 3749 }, { "epoch": 1.125450180072029, "grad_norm": 0.15240339934825897, "learning_rate": 9.560172302152414e-05, "loss": 0.4656, "step": 3750 }, { "epoch": 1.125750300120048, "grad_norm": 0.13519848883152008, "learning_rate": 9.554939697663866e-05, "loss": 0.4065, "step": 3751 }, { "epoch": 1.1260504201680672, "grad_norm": 0.14572805166244507, "learning_rate": 9.549707215272505e-05, "loss": 0.481, "step": 3752 }, { "epoch": 1.1263505402160865, "grad_norm": 0.3154379427433014, "learning_rate": 9.544474856413811e-05, "loss": 0.4518, "step": 3753 }, { "epoch": 1.1266506602641058, "grad_norm": 0.1319061517715454, "learning_rate": 9.539242622523215e-05, "loss": 0.3781, "step": 3754 }, { "epoch": 1.1269507803121248, "grad_norm": 0.13992980122566223, "learning_rate": 9.534010515036117e-05, "loss": 0.4156, "step": 3755 }, { "epoch": 1.127250900360144, "grad_norm": 0.15185213088989258, "learning_rate": 9.528778535387897e-05, "loss": 0.4115, "step": 3756 }, { "epoch": 1.1275510204081634, "grad_norm": 0.13775889575481415, "learning_rate": 9.523546685013883e-05, "loss": 0.4368, "step": 3757 }, { "epoch": 1.1278511404561824, "grad_norm": 0.13281835615634918, "learning_rate": 9.518314965349366e-05, "loss": 0.3926, "step": 3758 }, { "epoch": 1.1281512605042017, "grad_norm": 0.14120443165302277, "learning_rate": 9.513083377829622e-05, "loss": 0.4279, "step": 3759 }, { "epoch": 1.128451380552221, "grad_norm": 0.14515314996242523, "learning_rate": 9.507851923889868e-05, "loss": 0.4163, "step": 3760 }, { "epoch": 1.1287515006002402, "grad_norm": 0.15551668405532837, "learning_rate": 9.502620604965293e-05, "loss": 0.4462, "step": 3761 }, { "epoch": 1.1290516206482593, "grad_norm": 0.37603843212127686, "learning_rate": 9.497389422491054e-05, "loss": 0.3855, "step": 3762 }, { "epoch": 1.1293517406962785, "grad_norm": 0.14419730007648468, "learning_rate": 9.492158377902262e-05, "loss": 0.4237, "step": 3763 }, { "epoch": 1.1296518607442978, "grad_norm": 0.14433281123638153, "learning_rate": 9.486927472633996e-05, "loss": 0.421, "step": 3764 }, { "epoch": 1.1299519807923168, "grad_norm": 0.1525045782327652, "learning_rate": 9.4816967081213e-05, "loss": 0.424, "step": 3765 }, { "epoch": 1.1302521008403361, "grad_norm": 0.16010697185993195, "learning_rate": 9.476466085799161e-05, "loss": 0.4101, "step": 3766 }, { "epoch": 1.1305522208883554, "grad_norm": 0.15306870639324188, "learning_rate": 9.471235607102553e-05, "loss": 0.4936, "step": 3767 }, { "epoch": 1.1308523409363747, "grad_norm": 0.13024035096168518, "learning_rate": 9.466005273466393e-05, "loss": 0.3764, "step": 3768 }, { "epoch": 1.1311524609843937, "grad_norm": 0.13721759617328644, "learning_rate": 9.460775086325559e-05, "loss": 0.4065, "step": 3769 }, { "epoch": 1.131452581032413, "grad_norm": 0.1310778558254242, "learning_rate": 9.455545047114901e-05, "loss": 0.3956, "step": 3770 }, { "epoch": 1.1317527010804322, "grad_norm": 0.1459777057170868, "learning_rate": 9.450315157269214e-05, "loss": 0.4341, "step": 3771 }, { "epoch": 1.1320528211284513, "grad_norm": 0.149492084980011, "learning_rate": 9.445085418223264e-05, "loss": 0.4543, "step": 3772 }, { "epoch": 1.1323529411764706, "grad_norm": 0.1491083949804306, "learning_rate": 9.439855831411766e-05, "loss": 0.4333, "step": 3773 }, { "epoch": 1.1326530612244898, "grad_norm": 0.13482168316841125, "learning_rate": 9.434626398269393e-05, "loss": 0.3962, "step": 3774 }, { "epoch": 1.132953181272509, "grad_norm": 0.14354385435581207, "learning_rate": 9.429397120230789e-05, "loss": 0.4333, "step": 3775 }, { "epoch": 1.1332533013205282, "grad_norm": 0.1358368843793869, "learning_rate": 9.424167998730542e-05, "loss": 0.3885, "step": 3776 }, { "epoch": 1.1335534213685474, "grad_norm": 0.132878839969635, "learning_rate": 9.418939035203198e-05, "loss": 0.3914, "step": 3777 }, { "epoch": 1.1338535414165667, "grad_norm": 0.1317775398492813, "learning_rate": 9.413710231083272e-05, "loss": 0.3988, "step": 3778 }, { "epoch": 1.1341536614645857, "grad_norm": 0.14132483303546906, "learning_rate": 9.40848158780522e-05, "loss": 0.3987, "step": 3779 }, { "epoch": 1.134453781512605, "grad_norm": 0.12594947218894958, "learning_rate": 9.40325310680346e-05, "loss": 0.3727, "step": 3780 }, { "epoch": 1.1347539015606243, "grad_norm": 0.14186976850032806, "learning_rate": 9.398024789512369e-05, "loss": 0.4442, "step": 3781 }, { "epoch": 1.1350540216086435, "grad_norm": 0.13074573874473572, "learning_rate": 9.392796637366272e-05, "loss": 0.374, "step": 3782 }, { "epoch": 1.1353541416566626, "grad_norm": 0.1421251744031906, "learning_rate": 9.387568651799457e-05, "loss": 0.4349, "step": 3783 }, { "epoch": 1.1356542617046819, "grad_norm": 0.15257734060287476, "learning_rate": 9.382340834246161e-05, "loss": 0.4245, "step": 3784 }, { "epoch": 1.1359543817527011, "grad_norm": 0.13763746619224548, "learning_rate": 9.377113186140569e-05, "loss": 0.4402, "step": 3785 }, { "epoch": 1.1362545018007202, "grad_norm": 0.1399967074394226, "learning_rate": 9.37188570891684e-05, "loss": 0.414, "step": 3786 }, { "epoch": 1.1365546218487395, "grad_norm": 0.15171483159065247, "learning_rate": 9.366658404009062e-05, "loss": 0.4263, "step": 3787 }, { "epoch": 1.1368547418967587, "grad_norm": 0.1498602032661438, "learning_rate": 9.361431272851285e-05, "loss": 0.4179, "step": 3788 }, { "epoch": 1.137154861944778, "grad_norm": 0.1423768401145935, "learning_rate": 9.356204316877518e-05, "loss": 0.4607, "step": 3789 }, { "epoch": 1.137454981992797, "grad_norm": 0.1541401445865631, "learning_rate": 9.350977537521717e-05, "loss": 0.4563, "step": 3790 }, { "epoch": 1.1377551020408163, "grad_norm": 0.13775575160980225, "learning_rate": 9.345750936217783e-05, "loss": 0.4046, "step": 3791 }, { "epoch": 1.1380552220888356, "grad_norm": 0.13523773849010468, "learning_rate": 9.340524514399579e-05, "loss": 0.4038, "step": 3792 }, { "epoch": 1.1383553421368546, "grad_norm": 0.1366727203130722, "learning_rate": 9.33529827350091e-05, "loss": 0.4581, "step": 3793 }, { "epoch": 1.138655462184874, "grad_norm": 0.12473126500844955, "learning_rate": 9.330072214955542e-05, "loss": 0.3717, "step": 3794 }, { "epoch": 1.1389555822328932, "grad_norm": 0.15676021575927734, "learning_rate": 9.324846340197178e-05, "loss": 0.4323, "step": 3795 }, { "epoch": 1.1392557022809124, "grad_norm": 0.15680339932441711, "learning_rate": 9.319620650659479e-05, "loss": 0.433, "step": 3796 }, { "epoch": 1.1395558223289315, "grad_norm": 0.14847628772258759, "learning_rate": 9.314395147776055e-05, "loss": 0.3936, "step": 3797 }, { "epoch": 1.1398559423769508, "grad_norm": 0.14503027498722076, "learning_rate": 9.30916983298046e-05, "loss": 0.4131, "step": 3798 }, { "epoch": 1.14015606242497, "grad_norm": 0.17505504190921783, "learning_rate": 9.303944707706196e-05, "loss": 0.4563, "step": 3799 }, { "epoch": 1.140456182472989, "grad_norm": 0.1426694393157959, "learning_rate": 9.298719773386724e-05, "loss": 0.4162, "step": 3800 }, { "epoch": 1.1407563025210083, "grad_norm": 0.1447415053844452, "learning_rate": 9.293495031455437e-05, "loss": 0.4144, "step": 3801 }, { "epoch": 1.1410564225690276, "grad_norm": 0.13297565281391144, "learning_rate": 9.288270483345691e-05, "loss": 0.3883, "step": 3802 }, { "epoch": 1.1413565426170469, "grad_norm": 0.13733936846256256, "learning_rate": 9.283046130490777e-05, "loss": 0.4019, "step": 3803 }, { "epoch": 1.141656662665066, "grad_norm": 0.1973925530910492, "learning_rate": 9.277821974323931e-05, "loss": 0.4488, "step": 3804 }, { "epoch": 1.1419567827130852, "grad_norm": 0.14548492431640625, "learning_rate": 9.272598016278352e-05, "loss": 0.4546, "step": 3805 }, { "epoch": 1.1422569027611045, "grad_norm": 0.15933561325073242, "learning_rate": 9.267374257787163e-05, "loss": 0.4796, "step": 3806 }, { "epoch": 1.1425570228091237, "grad_norm": 0.14018718898296356, "learning_rate": 9.262150700283444e-05, "loss": 0.4026, "step": 3807 }, { "epoch": 1.1428571428571428, "grad_norm": 0.16493748128414154, "learning_rate": 9.256927345200221e-05, "loss": 0.4453, "step": 3808 }, { "epoch": 1.143157262905162, "grad_norm": 0.14382173120975494, "learning_rate": 9.251704193970463e-05, "loss": 0.4456, "step": 3809 }, { "epoch": 1.1434573829531813, "grad_norm": 0.14117833971977234, "learning_rate": 9.246481248027077e-05, "loss": 0.4647, "step": 3810 }, { "epoch": 1.1437575030012006, "grad_norm": 0.15258300304412842, "learning_rate": 9.24125850880292e-05, "loss": 0.4102, "step": 3811 }, { "epoch": 1.1440576230492197, "grad_norm": 0.136815145611763, "learning_rate": 9.23603597773079e-05, "loss": 0.3884, "step": 3812 }, { "epoch": 1.144357743097239, "grad_norm": 0.16704045236110687, "learning_rate": 9.23081365624343e-05, "loss": 0.4011, "step": 3813 }, { "epoch": 1.1446578631452582, "grad_norm": 0.1532745510339737, "learning_rate": 9.225591545773526e-05, "loss": 0.429, "step": 3814 }, { "epoch": 1.1449579831932772, "grad_norm": 0.1496194303035736, "learning_rate": 9.220369647753698e-05, "loss": 0.4077, "step": 3815 }, { "epoch": 1.1452581032412965, "grad_norm": 0.13573145866394043, "learning_rate": 9.215147963616522e-05, "loss": 0.4027, "step": 3816 }, { "epoch": 1.1455582232893158, "grad_norm": 0.1462734192609787, "learning_rate": 9.2099264947945e-05, "loss": 0.3946, "step": 3817 }, { "epoch": 1.145858343337335, "grad_norm": 0.15299293398857117, "learning_rate": 9.204705242720081e-05, "loss": 0.3987, "step": 3818 }, { "epoch": 1.146158463385354, "grad_norm": 0.12202068418264389, "learning_rate": 9.199484208825664e-05, "loss": 0.3443, "step": 3819 }, { "epoch": 1.1464585834333734, "grad_norm": 0.14336168766021729, "learning_rate": 9.194263394543575e-05, "loss": 0.4018, "step": 3820 }, { "epoch": 1.1467587034813926, "grad_norm": 0.14055746793746948, "learning_rate": 9.189042801306081e-05, "loss": 0.4153, "step": 3821 }, { "epoch": 1.1470588235294117, "grad_norm": 0.13209252059459686, "learning_rate": 9.183822430545401e-05, "loss": 0.4144, "step": 3822 }, { "epoch": 1.147358943577431, "grad_norm": 0.1446332335472107, "learning_rate": 9.178602283693672e-05, "loss": 0.4241, "step": 3823 }, { "epoch": 1.1476590636254502, "grad_norm": 0.13562363386154175, "learning_rate": 9.173382362182994e-05, "loss": 0.4073, "step": 3824 }, { "epoch": 1.1479591836734695, "grad_norm": 0.15286138653755188, "learning_rate": 9.168162667445384e-05, "loss": 0.4395, "step": 3825 }, { "epoch": 1.1482593037214885, "grad_norm": 0.13376116752624512, "learning_rate": 9.162943200912807e-05, "loss": 0.3755, "step": 3826 }, { "epoch": 1.1485594237695078, "grad_norm": 0.1457638442516327, "learning_rate": 9.157723964017165e-05, "loss": 0.4311, "step": 3827 }, { "epoch": 1.148859543817527, "grad_norm": 0.17449675500392914, "learning_rate": 9.152504958190298e-05, "loss": 0.3993, "step": 3828 }, { "epoch": 1.1491596638655461, "grad_norm": 0.13668902218341827, "learning_rate": 9.147286184863972e-05, "loss": 0.4458, "step": 3829 }, { "epoch": 1.1494597839135654, "grad_norm": 0.1385473906993866, "learning_rate": 9.14206764546991e-05, "loss": 0.4185, "step": 3830 }, { "epoch": 1.1497599039615847, "grad_norm": 0.176729217171669, "learning_rate": 9.136849341439747e-05, "loss": 0.334, "step": 3831 }, { "epoch": 1.150060024009604, "grad_norm": 0.16116398572921753, "learning_rate": 9.131631274205073e-05, "loss": 0.4478, "step": 3832 }, { "epoch": 1.150360144057623, "grad_norm": 0.12495972961187363, "learning_rate": 9.126413445197401e-05, "loss": 0.3671, "step": 3833 }, { "epoch": 1.1506602641056423, "grad_norm": 0.14716055989265442, "learning_rate": 9.121195855848181e-05, "loss": 0.4454, "step": 3834 }, { "epoch": 1.1509603841536615, "grad_norm": 0.1455603837966919, "learning_rate": 9.115978507588805e-05, "loss": 0.4099, "step": 3835 }, { "epoch": 1.1512605042016806, "grad_norm": 0.14258424937725067, "learning_rate": 9.110761401850587e-05, "loss": 0.4171, "step": 3836 }, { "epoch": 1.1515606242496998, "grad_norm": 0.1323336362838745, "learning_rate": 9.10554454006478e-05, "loss": 0.3682, "step": 3837 }, { "epoch": 1.1518607442977191, "grad_norm": 0.15299271047115326, "learning_rate": 9.100327923662573e-05, "loss": 0.482, "step": 3838 }, { "epoch": 1.1521608643457384, "grad_norm": 0.1293763816356659, "learning_rate": 9.095111554075085e-05, "loss": 0.3568, "step": 3839 }, { "epoch": 1.1524609843937574, "grad_norm": 0.14785557985305786, "learning_rate": 9.089895432733364e-05, "loss": 0.3778, "step": 3840 }, { "epoch": 1.1527611044417767, "grad_norm": 0.12821650505065918, "learning_rate": 9.0846795610684e-05, "loss": 0.3482, "step": 3841 }, { "epoch": 1.153061224489796, "grad_norm": 0.14406611025333405, "learning_rate": 9.079463940511096e-05, "loss": 0.4383, "step": 3842 }, { "epoch": 1.153361344537815, "grad_norm": 0.13842591643333435, "learning_rate": 9.074248572492311e-05, "loss": 0.3886, "step": 3843 }, { "epoch": 1.1536614645858343, "grad_norm": 0.13257431983947754, "learning_rate": 9.069033458442813e-05, "loss": 0.378, "step": 3844 }, { "epoch": 1.1539615846338536, "grad_norm": 0.1456029862165451, "learning_rate": 9.063818599793307e-05, "loss": 0.4176, "step": 3845 }, { "epoch": 1.1542617046818728, "grad_norm": 0.18930472433567047, "learning_rate": 9.058603997974437e-05, "loss": 0.403, "step": 3846 }, { "epoch": 1.1545618247298919, "grad_norm": 0.13556168973445892, "learning_rate": 9.053389654416768e-05, "loss": 0.4137, "step": 3847 }, { "epoch": 1.1548619447779112, "grad_norm": 0.15322500467300415, "learning_rate": 9.048175570550786e-05, "loss": 0.3904, "step": 3848 }, { "epoch": 1.1551620648259304, "grad_norm": 0.13942261040210724, "learning_rate": 9.042961747806927e-05, "loss": 0.3928, "step": 3849 }, { "epoch": 1.1554621848739495, "grad_norm": 0.16500745713710785, "learning_rate": 9.037748187615538e-05, "loss": 0.4227, "step": 3850 }, { "epoch": 1.1557623049219687, "grad_norm": 0.14914722740650177, "learning_rate": 9.032534891406897e-05, "loss": 0.4351, "step": 3851 }, { "epoch": 1.156062424969988, "grad_norm": 0.16964370012283325, "learning_rate": 9.027321860611218e-05, "loss": 0.4478, "step": 3852 }, { "epoch": 1.1563625450180073, "grad_norm": 0.14644289016723633, "learning_rate": 9.02210909665863e-05, "loss": 0.4449, "step": 3853 }, { "epoch": 1.1566626650660263, "grad_norm": 0.1654396653175354, "learning_rate": 9.016896600979205e-05, "loss": 0.4328, "step": 3854 }, { "epoch": 1.1569627851140456, "grad_norm": 0.17683622241020203, "learning_rate": 9.01168437500292e-05, "loss": 0.4209, "step": 3855 }, { "epoch": 1.1572629051620649, "grad_norm": 0.14778678119182587, "learning_rate": 9.006472420159692e-05, "loss": 0.4671, "step": 3856 }, { "epoch": 1.157563025210084, "grad_norm": 0.13837729394435883, "learning_rate": 9.001260737879367e-05, "loss": 0.4191, "step": 3857 }, { "epoch": 1.1578631452581032, "grad_norm": 0.18409544229507446, "learning_rate": 8.996049329591705e-05, "loss": 0.4062, "step": 3858 }, { "epoch": 1.1581632653061225, "grad_norm": 0.15561728179454803, "learning_rate": 8.990838196726396e-05, "loss": 0.426, "step": 3859 }, { "epoch": 1.1584633853541417, "grad_norm": 0.15675321221351624, "learning_rate": 8.985627340713061e-05, "loss": 0.3808, "step": 3860 }, { "epoch": 1.1587635054021608, "grad_norm": 0.14641733467578888, "learning_rate": 8.980416762981226e-05, "loss": 0.4591, "step": 3861 }, { "epoch": 1.15906362545018, "grad_norm": 0.14044514298439026, "learning_rate": 8.975206464960368e-05, "loss": 0.4438, "step": 3862 }, { "epoch": 1.1593637454981993, "grad_norm": 0.1593010425567627, "learning_rate": 8.969996448079864e-05, "loss": 0.4327, "step": 3863 }, { "epoch": 1.1596638655462184, "grad_norm": 0.1507299542427063, "learning_rate": 8.96478671376902e-05, "loss": 0.4078, "step": 3864 }, { "epoch": 1.1599639855942376, "grad_norm": 0.15690159797668457, "learning_rate": 8.959577263457074e-05, "loss": 0.4326, "step": 3865 }, { "epoch": 1.160264105642257, "grad_norm": 0.14754226803779602, "learning_rate": 8.954368098573179e-05, "loss": 0.4303, "step": 3866 }, { "epoch": 1.1605642256902762, "grad_norm": 0.13434180617332458, "learning_rate": 8.949159220546398e-05, "loss": 0.3831, "step": 3867 }, { "epoch": 1.1608643457382952, "grad_norm": 0.13902609050273895, "learning_rate": 8.943950630805742e-05, "loss": 0.4089, "step": 3868 }, { "epoch": 1.1611644657863145, "grad_norm": 0.14676375687122345, "learning_rate": 8.938742330780118e-05, "loss": 0.4371, "step": 3869 }, { "epoch": 1.1614645858343338, "grad_norm": 0.15333369374275208, "learning_rate": 8.933534321898367e-05, "loss": 0.4572, "step": 3870 }, { "epoch": 1.161764705882353, "grad_norm": 0.13503114879131317, "learning_rate": 8.928326605589246e-05, "loss": 0.3563, "step": 3871 }, { "epoch": 1.162064825930372, "grad_norm": 0.2535243630409241, "learning_rate": 8.923119183281432e-05, "loss": 0.3526, "step": 3872 }, { "epoch": 1.1623649459783914, "grad_norm": 0.14466118812561035, "learning_rate": 8.917912056403522e-05, "loss": 0.442, "step": 3873 }, { "epoch": 1.1626650660264106, "grad_norm": 0.1449826955795288, "learning_rate": 8.912705226384035e-05, "loss": 0.4138, "step": 3874 }, { "epoch": 1.16296518607443, "grad_norm": 0.13851653039455414, "learning_rate": 8.907498694651397e-05, "loss": 0.3779, "step": 3875 }, { "epoch": 1.163265306122449, "grad_norm": 0.14941152930259705, "learning_rate": 8.902292462633968e-05, "loss": 0.4427, "step": 3876 }, { "epoch": 1.1635654261704682, "grad_norm": 0.15614332258701324, "learning_rate": 8.897086531760014e-05, "loss": 0.4358, "step": 3877 }, { "epoch": 1.1638655462184875, "grad_norm": 0.14003868401050568, "learning_rate": 8.891880903457721e-05, "loss": 0.3631, "step": 3878 }, { "epoch": 1.1641656662665065, "grad_norm": 0.14047202467918396, "learning_rate": 8.886675579155201e-05, "loss": 0.3985, "step": 3879 }, { "epoch": 1.1644657863145258, "grad_norm": 0.1737263798713684, "learning_rate": 8.881470560280465e-05, "loss": 0.4331, "step": 3880 }, { "epoch": 1.164765906362545, "grad_norm": 0.15184059739112854, "learning_rate": 8.876265848261456e-05, "loss": 0.4084, "step": 3881 }, { "epoch": 1.1650660264105643, "grad_norm": 0.13204747438430786, "learning_rate": 8.871061444526027e-05, "loss": 0.385, "step": 3882 }, { "epoch": 1.1653661464585834, "grad_norm": 0.15236324071884155, "learning_rate": 8.865857350501944e-05, "loss": 0.4529, "step": 3883 }, { "epoch": 1.1656662665066027, "grad_norm": 0.14037908613681793, "learning_rate": 8.860653567616893e-05, "loss": 0.4244, "step": 3884 }, { "epoch": 1.165966386554622, "grad_norm": 0.14538414776325226, "learning_rate": 8.855450097298474e-05, "loss": 0.4007, "step": 3885 }, { "epoch": 1.166266506602641, "grad_norm": 0.1330530345439911, "learning_rate": 8.850246940974191e-05, "loss": 0.381, "step": 3886 }, { "epoch": 1.1665666266506602, "grad_norm": 0.1374024599790573, "learning_rate": 8.845044100071482e-05, "loss": 0.4161, "step": 3887 }, { "epoch": 1.1668667466986795, "grad_norm": 0.14352889358997345, "learning_rate": 8.839841576017679e-05, "loss": 0.4249, "step": 3888 }, { "epoch": 1.1671668667466988, "grad_norm": 0.14906354248523712, "learning_rate": 8.834639370240035e-05, "loss": 0.4431, "step": 3889 }, { "epoch": 1.1674669867947178, "grad_norm": 0.1526843011379242, "learning_rate": 8.829437484165718e-05, "loss": 0.4105, "step": 3890 }, { "epoch": 1.167767106842737, "grad_norm": 0.1684161275625229, "learning_rate": 8.824235919221803e-05, "loss": 0.3991, "step": 3891 }, { "epoch": 1.1680672268907564, "grad_norm": 0.13714158535003662, "learning_rate": 8.819034676835282e-05, "loss": 0.4156, "step": 3892 }, { "epoch": 1.1683673469387754, "grad_norm": 0.15890493988990784, "learning_rate": 8.813833758433061e-05, "loss": 0.4439, "step": 3893 }, { "epoch": 1.1686674669867947, "grad_norm": 0.213880255818367, "learning_rate": 8.808633165441942e-05, "loss": 0.4688, "step": 3894 }, { "epoch": 1.168967587034814, "grad_norm": 0.14558623731136322, "learning_rate": 8.803432899288654e-05, "loss": 0.4253, "step": 3895 }, { "epoch": 1.1692677070828332, "grad_norm": 0.1360839456319809, "learning_rate": 8.79823296139983e-05, "loss": 0.387, "step": 3896 }, { "epoch": 1.1695678271308523, "grad_norm": 0.14573483169078827, "learning_rate": 8.793033353202011e-05, "loss": 0.3862, "step": 3897 }, { "epoch": 1.1698679471788715, "grad_norm": 0.1445828378200531, "learning_rate": 8.787834076121655e-05, "loss": 0.4294, "step": 3898 }, { "epoch": 1.1701680672268908, "grad_norm": 0.14929187297821045, "learning_rate": 8.782635131585122e-05, "loss": 0.4345, "step": 3899 }, { "epoch": 1.1704681872749099, "grad_norm": 0.13889843225479126, "learning_rate": 8.777436521018676e-05, "loss": 0.3895, "step": 3900 }, { "epoch": 1.1707683073229291, "grad_norm": 0.14753101766109467, "learning_rate": 8.772238245848506e-05, "loss": 0.451, "step": 3901 }, { "epoch": 1.1710684273709484, "grad_norm": 0.15742534399032593, "learning_rate": 8.767040307500692e-05, "loss": 0.4267, "step": 3902 }, { "epoch": 1.1713685474189677, "grad_norm": 0.13733577728271484, "learning_rate": 8.761842707401233e-05, "loss": 0.3828, "step": 3903 }, { "epoch": 1.1716686674669867, "grad_norm": 0.18186461925506592, "learning_rate": 8.756645446976034e-05, "loss": 0.4499, "step": 3904 }, { "epoch": 1.171968787515006, "grad_norm": 0.12226764857769012, "learning_rate": 8.751448527650892e-05, "loss": 0.3218, "step": 3905 }, { "epoch": 1.1722689075630253, "grad_norm": 0.13384030759334564, "learning_rate": 8.746251950851536e-05, "loss": 0.3828, "step": 3906 }, { "epoch": 1.1725690276110443, "grad_norm": 0.13068121671676636, "learning_rate": 8.741055718003578e-05, "loss": 0.3803, "step": 3907 }, { "epoch": 1.1728691476590636, "grad_norm": 0.1547713577747345, "learning_rate": 8.735859830532549e-05, "loss": 0.4431, "step": 3908 }, { "epoch": 1.1731692677070829, "grad_norm": 0.151955708861351, "learning_rate": 8.73066428986388e-05, "loss": 0.3832, "step": 3909 }, { "epoch": 1.1734693877551021, "grad_norm": 0.18591925501823425, "learning_rate": 8.725469097422912e-05, "loss": 0.4592, "step": 3910 }, { "epoch": 1.1737695078031212, "grad_norm": 0.13940228521823883, "learning_rate": 8.720274254634873e-05, "loss": 0.4156, "step": 3911 }, { "epoch": 1.1740696278511404, "grad_norm": 0.15555444359779358, "learning_rate": 8.715079762924927e-05, "loss": 0.4367, "step": 3912 }, { "epoch": 1.1743697478991597, "grad_norm": 0.14583249390125275, "learning_rate": 8.709885623718109e-05, "loss": 0.4288, "step": 3913 }, { "epoch": 1.1746698679471788, "grad_norm": 0.14817452430725098, "learning_rate": 8.704691838439381e-05, "loss": 0.358, "step": 3914 }, { "epoch": 1.174969987995198, "grad_norm": 0.14373117685317993, "learning_rate": 8.699498408513592e-05, "loss": 0.4168, "step": 3915 }, { "epoch": 1.1752701080432173, "grad_norm": 0.1352091282606125, "learning_rate": 8.694305335365501e-05, "loss": 0.3532, "step": 3916 }, { "epoch": 1.1755702280912366, "grad_norm": 0.13123095035552979, "learning_rate": 8.689112620419772e-05, "loss": 0.3772, "step": 3917 }, { "epoch": 1.1758703481392556, "grad_norm": 0.14840468764305115, "learning_rate": 8.683920265100966e-05, "loss": 0.3659, "step": 3918 }, { "epoch": 1.1761704681872749, "grad_norm": 0.1469290405511856, "learning_rate": 8.678728270833539e-05, "loss": 0.3946, "step": 3919 }, { "epoch": 1.1764705882352942, "grad_norm": 0.1446821242570877, "learning_rate": 8.673536639041864e-05, "loss": 0.4272, "step": 3920 }, { "epoch": 1.1767707082833132, "grad_norm": 0.15715628862380981, "learning_rate": 8.6683453711502e-05, "loss": 0.444, "step": 3921 }, { "epoch": 1.1770708283313325, "grad_norm": 0.14631730318069458, "learning_rate": 8.663154468582715e-05, "loss": 0.4167, "step": 3922 }, { "epoch": 1.1773709483793517, "grad_norm": 0.1498534232378006, "learning_rate": 8.657963932763475e-05, "loss": 0.4399, "step": 3923 }, { "epoch": 1.177671068427371, "grad_norm": 0.1498582363128662, "learning_rate": 8.652773765116435e-05, "loss": 0.4093, "step": 3924 }, { "epoch": 1.17797118847539, "grad_norm": 0.13406604528427124, "learning_rate": 8.647583967065472e-05, "loss": 0.3837, "step": 3925 }, { "epoch": 1.1782713085234093, "grad_norm": 0.1385580599308014, "learning_rate": 8.642394540034336e-05, "loss": 0.4273, "step": 3926 }, { "epoch": 1.1785714285714286, "grad_norm": 0.17958009243011475, "learning_rate": 8.637205485446691e-05, "loss": 0.4117, "step": 3927 }, { "epoch": 1.1788715486194479, "grad_norm": 0.1370212733745575, "learning_rate": 8.632016804726095e-05, "loss": 0.4077, "step": 3928 }, { "epoch": 1.179171668667467, "grad_norm": 0.13599421083927155, "learning_rate": 8.626828499296005e-05, "loss": 0.3587, "step": 3929 }, { "epoch": 1.1794717887154862, "grad_norm": 0.1433763951063156, "learning_rate": 8.621640570579764e-05, "loss": 0.4223, "step": 3930 }, { "epoch": 1.1797719087635055, "grad_norm": 0.12710444629192352, "learning_rate": 8.616453020000635e-05, "loss": 0.3452, "step": 3931 }, { "epoch": 1.1800720288115247, "grad_norm": 0.13192912936210632, "learning_rate": 8.611265848981749e-05, "loss": 0.347, "step": 3932 }, { "epoch": 1.1803721488595438, "grad_norm": 0.15339438617229462, "learning_rate": 8.606079058946157e-05, "loss": 0.4744, "step": 3933 }, { "epoch": 1.180672268907563, "grad_norm": 0.13660569489002228, "learning_rate": 8.600892651316791e-05, "loss": 0.3795, "step": 3934 }, { "epoch": 1.1809723889555823, "grad_norm": 0.14108389616012573, "learning_rate": 8.595706627516482e-05, "loss": 0.4166, "step": 3935 }, { "epoch": 1.1812725090036014, "grad_norm": 0.15114083886146545, "learning_rate": 8.590520988967958e-05, "loss": 0.4707, "step": 3936 }, { "epoch": 1.1815726290516206, "grad_norm": 0.1492743194103241, "learning_rate": 8.585335737093842e-05, "loss": 0.4332, "step": 3937 }, { "epoch": 1.18187274909964, "grad_norm": 0.16283780336380005, "learning_rate": 8.580150873316639e-05, "loss": 0.3972, "step": 3938 }, { "epoch": 1.1821728691476592, "grad_norm": 0.1415175348520279, "learning_rate": 8.574966399058767e-05, "loss": 0.3945, "step": 3939 }, { "epoch": 1.1824729891956782, "grad_norm": 0.1329096257686615, "learning_rate": 8.56978231574252e-05, "loss": 0.3587, "step": 3940 }, { "epoch": 1.1827731092436975, "grad_norm": 0.15043918788433075, "learning_rate": 8.564598624790098e-05, "loss": 0.4029, "step": 3941 }, { "epoch": 1.1830732292917168, "grad_norm": 0.1478135883808136, "learning_rate": 8.559415327623584e-05, "loss": 0.424, "step": 3942 }, { "epoch": 1.1833733493397358, "grad_norm": 0.14266376197338104, "learning_rate": 8.554232425664954e-05, "loss": 0.4114, "step": 3943 }, { "epoch": 1.183673469387755, "grad_norm": 0.1355818659067154, "learning_rate": 8.549049920336086e-05, "loss": 0.3965, "step": 3944 }, { "epoch": 1.1839735894357744, "grad_norm": 0.1327577829360962, "learning_rate": 8.54386781305873e-05, "loss": 0.3729, "step": 3945 }, { "epoch": 1.1842737094837936, "grad_norm": 0.15911321341991425, "learning_rate": 8.538686105254541e-05, "loss": 0.3915, "step": 3946 }, { "epoch": 1.1845738295318127, "grad_norm": 0.14487755298614502, "learning_rate": 8.533504798345065e-05, "loss": 0.408, "step": 3947 }, { "epoch": 1.184873949579832, "grad_norm": 0.14488984644412994, "learning_rate": 8.528323893751736e-05, "loss": 0.4323, "step": 3948 }, { "epoch": 1.1851740696278512, "grad_norm": 0.14207231998443604, "learning_rate": 8.523143392895863e-05, "loss": 0.3827, "step": 3949 }, { "epoch": 1.1854741896758703, "grad_norm": 0.17004531621932983, "learning_rate": 8.517963297198672e-05, "loss": 0.3853, "step": 3950 }, { "epoch": 1.1857743097238895, "grad_norm": 0.1584494709968567, "learning_rate": 8.512783608081252e-05, "loss": 0.4096, "step": 3951 }, { "epoch": 1.1860744297719088, "grad_norm": 0.12412303686141968, "learning_rate": 8.507604326964601e-05, "loss": 0.3269, "step": 3952 }, { "epoch": 1.186374549819928, "grad_norm": 0.1437680572271347, "learning_rate": 8.502425455269588e-05, "loss": 0.3892, "step": 3953 }, { "epoch": 1.1866746698679471, "grad_norm": 0.1382899433374405, "learning_rate": 8.497246994416977e-05, "loss": 0.4005, "step": 3954 }, { "epoch": 1.1869747899159664, "grad_norm": 0.1461184024810791, "learning_rate": 8.492068945827425e-05, "loss": 0.4206, "step": 3955 }, { "epoch": 1.1872749099639857, "grad_norm": 0.14180120825767517, "learning_rate": 8.486891310921468e-05, "loss": 0.4101, "step": 3956 }, { "epoch": 1.1875750300120047, "grad_norm": 0.1335573047399521, "learning_rate": 8.481714091119525e-05, "loss": 0.3683, "step": 3957 }, { "epoch": 1.187875150060024, "grad_norm": 0.1252361536026001, "learning_rate": 8.476537287841915e-05, "loss": 0.3476, "step": 3958 }, { "epoch": 1.1881752701080432, "grad_norm": 0.13983231782913208, "learning_rate": 8.47136090250883e-05, "loss": 0.4322, "step": 3959 }, { "epoch": 1.1884753901560625, "grad_norm": 0.14104896783828735, "learning_rate": 8.466184936540351e-05, "loss": 0.3907, "step": 3960 }, { "epoch": 1.1887755102040816, "grad_norm": 0.14977477490901947, "learning_rate": 8.46100939135645e-05, "loss": 0.4129, "step": 3961 }, { "epoch": 1.1890756302521008, "grad_norm": 0.1299585998058319, "learning_rate": 8.455834268376972e-05, "loss": 0.3244, "step": 3962 }, { "epoch": 1.18937575030012, "grad_norm": 0.13615921139717102, "learning_rate": 8.450659569021662e-05, "loss": 0.3732, "step": 3963 }, { "epoch": 1.1896758703481392, "grad_norm": 0.1456853151321411, "learning_rate": 8.445485294710131e-05, "loss": 0.4079, "step": 3964 }, { "epoch": 1.1899759903961584, "grad_norm": 0.15386579930782318, "learning_rate": 8.440311446861881e-05, "loss": 0.4494, "step": 3965 }, { "epoch": 1.1902761104441777, "grad_norm": 0.15661481022834778, "learning_rate": 8.435138026896305e-05, "loss": 0.4654, "step": 3966 }, { "epoch": 1.190576230492197, "grad_norm": 0.14809736609458923, "learning_rate": 8.429965036232668e-05, "loss": 0.3916, "step": 3967 }, { "epoch": 1.190876350540216, "grad_norm": 0.1559191644191742, "learning_rate": 8.424792476290117e-05, "loss": 0.3504, "step": 3968 }, { "epoch": 1.1911764705882353, "grad_norm": 0.15862765908241272, "learning_rate": 8.419620348487692e-05, "loss": 0.4435, "step": 3969 }, { "epoch": 1.1914765906362546, "grad_norm": 0.14491339027881622, "learning_rate": 8.414448654244297e-05, "loss": 0.3922, "step": 3970 }, { "epoch": 1.1917767106842736, "grad_norm": 0.14805670082569122, "learning_rate": 8.409277394978739e-05, "loss": 0.3863, "step": 3971 }, { "epoch": 1.1920768307322929, "grad_norm": 0.16750305891036987, "learning_rate": 8.404106572109686e-05, "loss": 0.3847, "step": 3972 }, { "epoch": 1.1923769507803121, "grad_norm": 0.14482684433460236, "learning_rate": 8.398936187055693e-05, "loss": 0.376, "step": 3973 }, { "epoch": 1.1926770708283314, "grad_norm": 0.15524138510227203, "learning_rate": 8.3937662412352e-05, "loss": 0.4175, "step": 3974 }, { "epoch": 1.1929771908763505, "grad_norm": 0.16882078349590302, "learning_rate": 8.388596736066523e-05, "loss": 0.4294, "step": 3975 }, { "epoch": 1.1932773109243697, "grad_norm": 0.1448148787021637, "learning_rate": 8.38342767296785e-05, "loss": 0.4201, "step": 3976 }, { "epoch": 1.193577430972389, "grad_norm": 0.1521812230348587, "learning_rate": 8.378259053357261e-05, "loss": 0.4441, "step": 3977 }, { "epoch": 1.193877551020408, "grad_norm": 0.13205227255821228, "learning_rate": 8.373090878652706e-05, "loss": 0.3791, "step": 3978 }, { "epoch": 1.1941776710684273, "grad_norm": 0.13769741356372833, "learning_rate": 8.367923150272008e-05, "loss": 0.3589, "step": 3979 }, { "epoch": 1.1944777911164466, "grad_norm": 0.1407497078180313, "learning_rate": 8.362755869632883e-05, "loss": 0.3977, "step": 3980 }, { "epoch": 1.1947779111644659, "grad_norm": 0.2244221419095993, "learning_rate": 8.35758903815291e-05, "loss": 0.3696, "step": 3981 }, { "epoch": 1.195078031212485, "grad_norm": 0.1452600508928299, "learning_rate": 8.352422657249556e-05, "loss": 0.399, "step": 3982 }, { "epoch": 1.1953781512605042, "grad_norm": 0.1496151238679886, "learning_rate": 8.347256728340152e-05, "loss": 0.4202, "step": 3983 }, { "epoch": 1.1956782713085234, "grad_norm": 0.16077955067157745, "learning_rate": 8.342091252841909e-05, "loss": 0.451, "step": 3984 }, { "epoch": 1.1959783913565427, "grad_norm": 0.1421745866537094, "learning_rate": 8.336926232171925e-05, "loss": 0.3981, "step": 3985 }, { "epoch": 1.1962785114045618, "grad_norm": 0.13930204510688782, "learning_rate": 8.331761667747158e-05, "loss": 0.3909, "step": 3986 }, { "epoch": 1.196578631452581, "grad_norm": 0.13473330438137054, "learning_rate": 8.326597560984445e-05, "loss": 0.3925, "step": 3987 }, { "epoch": 1.1968787515006003, "grad_norm": 0.1469656080007553, "learning_rate": 8.321433913300509e-05, "loss": 0.4158, "step": 3988 }, { "epoch": 1.1971788715486196, "grad_norm": 0.14228259027004242, "learning_rate": 8.31627072611193e-05, "loss": 0.3434, "step": 3989 }, { "epoch": 1.1974789915966386, "grad_norm": 0.15231838822364807, "learning_rate": 8.311108000835167e-05, "loss": 0.4376, "step": 3990 }, { "epoch": 1.197779111644658, "grad_norm": 0.15814101696014404, "learning_rate": 8.30594573888656e-05, "loss": 0.4136, "step": 3991 }, { "epoch": 1.1980792316926772, "grad_norm": 0.1386740654706955, "learning_rate": 8.300783941682315e-05, "loss": 0.3935, "step": 3992 }, { "epoch": 1.1983793517406962, "grad_norm": 0.15326248109340668, "learning_rate": 8.29562261063851e-05, "loss": 0.4072, "step": 3993 }, { "epoch": 1.1986794717887155, "grad_norm": 0.12558022141456604, "learning_rate": 8.290461747171103e-05, "loss": 0.3537, "step": 3994 }, { "epoch": 1.1989795918367347, "grad_norm": 0.1449139565229416, "learning_rate": 8.285301352695905e-05, "loss": 0.3984, "step": 3995 }, { "epoch": 1.199279711884754, "grad_norm": 0.2730596661567688, "learning_rate": 8.280141428628628e-05, "loss": 0.4265, "step": 3996 }, { "epoch": 1.199579831932773, "grad_norm": 0.14818298816680908, "learning_rate": 8.274981976384825e-05, "loss": 0.3876, "step": 3997 }, { "epoch": 1.1998799519807923, "grad_norm": 0.1385853886604309, "learning_rate": 8.269822997379935e-05, "loss": 0.4075, "step": 3998 }, { "epoch": 1.2001800720288116, "grad_norm": 0.14927978813648224, "learning_rate": 8.264664493029268e-05, "loss": 0.3505, "step": 3999 }, { "epoch": 1.2004801920768307, "grad_norm": 0.14248241484165192, "learning_rate": 8.259506464747999e-05, "loss": 0.3868, "step": 4000 }, { "epoch": 1.20078031212485, "grad_norm": 0.16264191269874573, "learning_rate": 8.254348913951176e-05, "loss": 0.441, "step": 4001 }, { "epoch": 1.2010804321728692, "grad_norm": 0.14608046412467957, "learning_rate": 8.24919184205371e-05, "loss": 0.4003, "step": 4002 }, { "epoch": 1.2013805522208885, "grad_norm": 0.1493375152349472, "learning_rate": 8.244035250470384e-05, "loss": 0.4105, "step": 4003 }, { "epoch": 1.2016806722689075, "grad_norm": 0.1431288868188858, "learning_rate": 8.238879140615855e-05, "loss": 0.406, "step": 4004 }, { "epoch": 1.2019807923169268, "grad_norm": 0.15224696695804596, "learning_rate": 8.23372351390464e-05, "loss": 0.4334, "step": 4005 }, { "epoch": 1.202280912364946, "grad_norm": 0.13419242203235626, "learning_rate": 8.228568371751123e-05, "loss": 0.3413, "step": 4006 }, { "epoch": 1.202581032412965, "grad_norm": 0.14649564027786255, "learning_rate": 8.223413715569565e-05, "loss": 0.4004, "step": 4007 }, { "epoch": 1.2028811524609844, "grad_norm": 0.15351301431655884, "learning_rate": 8.218259546774081e-05, "loss": 0.4088, "step": 4008 }, { "epoch": 1.2031812725090036, "grad_norm": 0.13952364027500153, "learning_rate": 8.213105866778659e-05, "loss": 0.3773, "step": 4009 }, { "epoch": 1.203481392557023, "grad_norm": 0.13849298655986786, "learning_rate": 8.207952676997153e-05, "loss": 0.398, "step": 4010 }, { "epoch": 1.203781512605042, "grad_norm": 0.14556635916233063, "learning_rate": 8.20279997884328e-05, "loss": 0.423, "step": 4011 }, { "epoch": 1.2040816326530612, "grad_norm": 0.14165206253528595, "learning_rate": 8.197647773730627e-05, "loss": 0.3917, "step": 4012 }, { "epoch": 1.2043817527010805, "grad_norm": 0.15320447087287903, "learning_rate": 8.192496063072644e-05, "loss": 0.398, "step": 4013 }, { "epoch": 1.2046818727490995, "grad_norm": 0.1356806606054306, "learning_rate": 8.187344848282631e-05, "loss": 0.3826, "step": 4014 }, { "epoch": 1.2049819927971188, "grad_norm": 0.13317741453647614, "learning_rate": 8.182194130773783e-05, "loss": 0.3748, "step": 4015 }, { "epoch": 1.205282112845138, "grad_norm": 0.13315704464912415, "learning_rate": 8.177043911959127e-05, "loss": 0.3736, "step": 4016 }, { "epoch": 1.2055822328931574, "grad_norm": 0.1495973914861679, "learning_rate": 8.17189419325157e-05, "loss": 0.4167, "step": 4017 }, { "epoch": 1.2058823529411764, "grad_norm": 0.15506240725517273, "learning_rate": 8.166744976063881e-05, "loss": 0.4152, "step": 4018 }, { "epoch": 1.2061824729891957, "grad_norm": 0.1537008285522461, "learning_rate": 8.161596261808687e-05, "loss": 0.4253, "step": 4019 }, { "epoch": 1.206482593037215, "grad_norm": 0.13522090017795563, "learning_rate": 8.156448051898476e-05, "loss": 0.3744, "step": 4020 }, { "epoch": 1.206782713085234, "grad_norm": 0.16604888439178467, "learning_rate": 8.151300347745604e-05, "loss": 0.4121, "step": 4021 }, { "epoch": 1.2070828331332533, "grad_norm": 0.13932572305202484, "learning_rate": 8.146153150762281e-05, "loss": 0.369, "step": 4022 }, { "epoch": 1.2073829531812725, "grad_norm": 0.18557420372962952, "learning_rate": 8.141006462360587e-05, "loss": 0.3774, "step": 4023 }, { "epoch": 1.2076830732292918, "grad_norm": 0.16589613258838654, "learning_rate": 8.135860283952453e-05, "loss": 0.3496, "step": 4024 }, { "epoch": 1.2079831932773109, "grad_norm": 0.1459798514842987, "learning_rate": 8.130714616949673e-05, "loss": 0.412, "step": 4025 }, { "epoch": 1.2082833133253301, "grad_norm": 0.14163541793823242, "learning_rate": 8.125569462763907e-05, "loss": 0.4032, "step": 4026 }, { "epoch": 1.2085834333733494, "grad_norm": 0.1819797307252884, "learning_rate": 8.120424822806665e-05, "loss": 0.4417, "step": 4027 }, { "epoch": 1.2088835534213684, "grad_norm": 0.19398806989192963, "learning_rate": 8.115280698489317e-05, "loss": 0.3794, "step": 4028 }, { "epoch": 1.2091836734693877, "grad_norm": 0.1349167376756668, "learning_rate": 8.1101370912231e-05, "loss": 0.3868, "step": 4029 }, { "epoch": 1.209483793517407, "grad_norm": 0.1430043876171112, "learning_rate": 8.1049940024191e-05, "loss": 0.3705, "step": 4030 }, { "epoch": 1.2097839135654262, "grad_norm": 0.14163738489151, "learning_rate": 8.09985143348827e-05, "loss": 0.3974, "step": 4031 }, { "epoch": 1.2100840336134453, "grad_norm": 0.13367514312267303, "learning_rate": 8.09470938584141e-05, "loss": 0.3634, "step": 4032 }, { "epoch": 1.2103841536614646, "grad_norm": 0.30375921726226807, "learning_rate": 8.08956786088918e-05, "loss": 0.4015, "step": 4033 }, { "epoch": 1.2106842737094838, "grad_norm": 0.13942891359329224, "learning_rate": 8.084426860042105e-05, "loss": 0.3996, "step": 4034 }, { "epoch": 1.2109843937575029, "grad_norm": 0.1532803177833557, "learning_rate": 8.079286384710554e-05, "loss": 0.4384, "step": 4035 }, { "epoch": 1.2112845138055222, "grad_norm": 0.15898968279361725, "learning_rate": 8.074146436304757e-05, "loss": 0.4394, "step": 4036 }, { "epoch": 1.2115846338535414, "grad_norm": 0.1539100557565689, "learning_rate": 8.069007016234806e-05, "loss": 0.3809, "step": 4037 }, { "epoch": 1.2118847539015607, "grad_norm": 0.14513103663921356, "learning_rate": 8.063868125910639e-05, "loss": 0.3703, "step": 4038 }, { "epoch": 1.2121848739495797, "grad_norm": 0.1389445811510086, "learning_rate": 8.058729766742045e-05, "loss": 0.4107, "step": 4039 }, { "epoch": 1.212484993997599, "grad_norm": 0.24129579961299896, "learning_rate": 8.053591940138686e-05, "loss": 0.3628, "step": 4040 }, { "epoch": 1.2127851140456183, "grad_norm": 0.13357827067375183, "learning_rate": 8.048454647510055e-05, "loss": 0.3598, "step": 4041 }, { "epoch": 1.2130852340936373, "grad_norm": 0.12578484416007996, "learning_rate": 8.043317890265516e-05, "loss": 0.3379, "step": 4042 }, { "epoch": 1.2133853541416566, "grad_norm": 0.2575438618659973, "learning_rate": 8.038181669814278e-05, "loss": 0.3604, "step": 4043 }, { "epoch": 1.2136854741896759, "grad_norm": 0.14429044723510742, "learning_rate": 8.033045987565401e-05, "loss": 0.3945, "step": 4044 }, { "epoch": 1.2139855942376951, "grad_norm": 0.2189117670059204, "learning_rate": 8.027910844927808e-05, "loss": 0.3891, "step": 4045 }, { "epoch": 1.2142857142857142, "grad_norm": 0.1560732126235962, "learning_rate": 8.022776243310258e-05, "loss": 0.4577, "step": 4046 }, { "epoch": 1.2145858343337335, "grad_norm": 0.15747374296188354, "learning_rate": 8.017642184121372e-05, "loss": 0.4084, "step": 4047 }, { "epoch": 1.2148859543817527, "grad_norm": 0.1571548581123352, "learning_rate": 8.012508668769624e-05, "loss": 0.4563, "step": 4048 }, { "epoch": 1.215186074429772, "grad_norm": 0.1482471078634262, "learning_rate": 8.007375698663335e-05, "loss": 0.4163, "step": 4049 }, { "epoch": 1.215486194477791, "grad_norm": 0.16825692355632782, "learning_rate": 8.002243275210669e-05, "loss": 0.4144, "step": 4050 }, { "epoch": 1.2157863145258103, "grad_norm": 0.1516883671283722, "learning_rate": 7.99711139981966e-05, "loss": 0.4205, "step": 4051 }, { "epoch": 1.2160864345738296, "grad_norm": 0.15317898988723755, "learning_rate": 7.991980073898164e-05, "loss": 0.4549, "step": 4052 }, { "epoch": 1.2163865546218489, "grad_norm": 0.1662512868642807, "learning_rate": 7.986849298853917e-05, "loss": 0.4926, "step": 4053 }, { "epoch": 1.216686674669868, "grad_norm": 0.1314711719751358, "learning_rate": 7.981719076094479e-05, "loss": 0.3536, "step": 4054 }, { "epoch": 1.2169867947178872, "grad_norm": 0.22032858431339264, "learning_rate": 7.976589407027266e-05, "loss": 0.418, "step": 4055 }, { "epoch": 1.2172869147659064, "grad_norm": 0.1417960524559021, "learning_rate": 7.971460293059551e-05, "loss": 0.396, "step": 4056 }, { "epoch": 1.2175870348139255, "grad_norm": 0.1375245749950409, "learning_rate": 7.966331735598445e-05, "loss": 0.363, "step": 4057 }, { "epoch": 1.2178871548619448, "grad_norm": 0.1464424729347229, "learning_rate": 7.961203736050904e-05, "loss": 0.3651, "step": 4058 }, { "epoch": 1.218187274909964, "grad_norm": 0.13893769681453705, "learning_rate": 7.956076295823744e-05, "loss": 0.3665, "step": 4059 }, { "epoch": 1.2184873949579833, "grad_norm": 0.13774627447128296, "learning_rate": 7.950949416323612e-05, "loss": 0.3287, "step": 4060 }, { "epoch": 1.2187875150060024, "grad_norm": 0.14033019542694092, "learning_rate": 7.945823098957015e-05, "loss": 0.4057, "step": 4061 }, { "epoch": 1.2190876350540216, "grad_norm": 0.14156651496887207, "learning_rate": 7.940697345130296e-05, "loss": 0.3939, "step": 4062 }, { "epoch": 1.219387755102041, "grad_norm": 0.14618803560733795, "learning_rate": 7.935572156249644e-05, "loss": 0.3754, "step": 4063 }, { "epoch": 1.21968787515006, "grad_norm": 0.15165431797504425, "learning_rate": 7.930447533721102e-05, "loss": 0.3706, "step": 4064 }, { "epoch": 1.2199879951980792, "grad_norm": 0.16142675280570984, "learning_rate": 7.925323478950551e-05, "loss": 0.4285, "step": 4065 }, { "epoch": 1.2202881152460985, "grad_norm": 0.14201250672340393, "learning_rate": 7.920199993343709e-05, "loss": 0.3671, "step": 4066 }, { "epoch": 1.2205882352941178, "grad_norm": 0.1458989679813385, "learning_rate": 7.915077078306154e-05, "loss": 0.4101, "step": 4067 }, { "epoch": 1.2208883553421368, "grad_norm": 0.17755059897899628, "learning_rate": 7.909954735243295e-05, "loss": 0.4466, "step": 4068 }, { "epoch": 1.221188475390156, "grad_norm": 0.15208066999912262, "learning_rate": 7.904832965560385e-05, "loss": 0.3984, "step": 4069 }, { "epoch": 1.2214885954381753, "grad_norm": 0.15624454617500305, "learning_rate": 7.899711770662532e-05, "loss": 0.4145, "step": 4070 }, { "epoch": 1.2217887154861944, "grad_norm": 0.1603109985589981, "learning_rate": 7.894591151954666e-05, "loss": 0.3609, "step": 4071 }, { "epoch": 1.2220888355342137, "grad_norm": 0.1522332727909088, "learning_rate": 7.889471110841581e-05, "loss": 0.3777, "step": 4072 }, { "epoch": 1.222388955582233, "grad_norm": 0.15223602950572968, "learning_rate": 7.884351648727895e-05, "loss": 0.4138, "step": 4073 }, { "epoch": 1.2226890756302522, "grad_norm": 0.13721860945224762, "learning_rate": 7.879232767018072e-05, "loss": 0.3867, "step": 4074 }, { "epoch": 1.2229891956782712, "grad_norm": 0.14906872808933258, "learning_rate": 7.874114467116422e-05, "loss": 0.3974, "step": 4075 }, { "epoch": 1.2232893157262905, "grad_norm": 0.14093011617660522, "learning_rate": 7.868996750427096e-05, "loss": 0.3888, "step": 4076 }, { "epoch": 1.2235894357743098, "grad_norm": 0.1495698243379593, "learning_rate": 7.863879618354069e-05, "loss": 0.4007, "step": 4077 }, { "epoch": 1.2238895558223288, "grad_norm": 0.1426883190870285, "learning_rate": 7.858763072301181e-05, "loss": 0.4609, "step": 4078 }, { "epoch": 1.224189675870348, "grad_norm": 0.12629884481430054, "learning_rate": 7.853647113672089e-05, "loss": 0.3626, "step": 4079 }, { "epoch": 1.2244897959183674, "grad_norm": 0.1275751143693924, "learning_rate": 7.848531743870297e-05, "loss": 0.356, "step": 4080 }, { "epoch": 1.2247899159663866, "grad_norm": 0.14233806729316711, "learning_rate": 7.843416964299155e-05, "loss": 0.4043, "step": 4081 }, { "epoch": 1.2250900360144057, "grad_norm": 0.14755657315254211, "learning_rate": 7.838302776361837e-05, "loss": 0.4217, "step": 4082 }, { "epoch": 1.225390156062425, "grad_norm": 0.13869772851467133, "learning_rate": 7.833189181461367e-05, "loss": 0.3906, "step": 4083 }, { "epoch": 1.2256902761104442, "grad_norm": 0.15427730977535248, "learning_rate": 7.828076181000603e-05, "loss": 0.3837, "step": 4084 }, { "epoch": 1.2259903961584633, "grad_norm": 0.14841613173484802, "learning_rate": 7.822963776382229e-05, "loss": 0.4219, "step": 4085 }, { "epoch": 1.2262905162064826, "grad_norm": 0.13470859825611115, "learning_rate": 7.817851969008782e-05, "loss": 0.3649, "step": 4086 }, { "epoch": 1.2265906362545018, "grad_norm": 0.13717618584632874, "learning_rate": 7.812740760282624e-05, "loss": 0.375, "step": 4087 }, { "epoch": 1.226890756302521, "grad_norm": 0.161216139793396, "learning_rate": 7.807630151605957e-05, "loss": 0.4067, "step": 4088 }, { "epoch": 1.2271908763505401, "grad_norm": 0.1632506400346756, "learning_rate": 7.802520144380823e-05, "loss": 0.383, "step": 4089 }, { "epoch": 1.2274909963985594, "grad_norm": 0.20266014337539673, "learning_rate": 7.797410740009084e-05, "loss": 0.3881, "step": 4090 }, { "epoch": 1.2277911164465787, "grad_norm": 0.14472752809524536, "learning_rate": 7.792301939892458e-05, "loss": 0.3934, "step": 4091 }, { "epoch": 1.2280912364945977, "grad_norm": 0.40670523047447205, "learning_rate": 7.787193745432478e-05, "loss": 0.3612, "step": 4092 }, { "epoch": 1.228391356542617, "grad_norm": 0.14954715967178345, "learning_rate": 7.78208615803052e-05, "loss": 0.4437, "step": 4093 }, { "epoch": 1.2286914765906363, "grad_norm": 0.16645628213882446, "learning_rate": 7.776979179087793e-05, "loss": 0.4545, "step": 4094 }, { "epoch": 1.2289915966386555, "grad_norm": 0.1424858123064041, "learning_rate": 7.771872810005341e-05, "loss": 0.389, "step": 4095 }, { "epoch": 1.2292917166866746, "grad_norm": 0.1527540236711502, "learning_rate": 7.766767052184027e-05, "loss": 0.3866, "step": 4096 }, { "epoch": 1.2295918367346939, "grad_norm": 0.14466696977615356, "learning_rate": 7.76166190702457e-05, "loss": 0.4092, "step": 4097 }, { "epoch": 1.2298919567827131, "grad_norm": 0.1450721174478531, "learning_rate": 7.756557375927503e-05, "loss": 0.3895, "step": 4098 }, { "epoch": 1.2301920768307322, "grad_norm": 0.1380809098482132, "learning_rate": 7.751453460293193e-05, "loss": 0.3663, "step": 4099 }, { "epoch": 1.2304921968787514, "grad_norm": 0.15767262876033783, "learning_rate": 7.746350161521845e-05, "loss": 0.3765, "step": 4100 }, { "epoch": 1.2307923169267707, "grad_norm": 0.14586131274700165, "learning_rate": 7.741247481013485e-05, "loss": 0.4031, "step": 4101 }, { "epoch": 1.23109243697479, "grad_norm": 0.19582970440387726, "learning_rate": 7.736145420167981e-05, "loss": 0.3862, "step": 4102 }, { "epoch": 1.231392557022809, "grad_norm": 0.14220896363258362, "learning_rate": 7.731043980385026e-05, "loss": 0.3616, "step": 4103 }, { "epoch": 1.2316926770708283, "grad_norm": 0.1587248295545578, "learning_rate": 7.72594316306413e-05, "loss": 0.4277, "step": 4104 }, { "epoch": 1.2319927971188476, "grad_norm": 0.1498514711856842, "learning_rate": 7.720842969604658e-05, "loss": 0.441, "step": 4105 }, { "epoch": 1.2322929171668668, "grad_norm": 0.1516525149345398, "learning_rate": 7.71574340140578e-05, "loss": 0.3958, "step": 4106 }, { "epoch": 1.232593037214886, "grad_norm": 0.13693274557590485, "learning_rate": 7.710644459866507e-05, "loss": 0.3975, "step": 4107 }, { "epoch": 1.2328931572629052, "grad_norm": 0.15817703306674957, "learning_rate": 7.705546146385676e-05, "loss": 0.4548, "step": 4108 }, { "epoch": 1.2331932773109244, "grad_norm": 0.1288352757692337, "learning_rate": 7.700448462361954e-05, "loss": 0.3288, "step": 4109 }, { "epoch": 1.2334933973589437, "grad_norm": 0.14892712235450745, "learning_rate": 7.695351409193823e-05, "loss": 0.4075, "step": 4110 }, { "epoch": 1.2337935174069627, "grad_norm": 0.1700790822505951, "learning_rate": 7.690254988279608e-05, "loss": 0.4417, "step": 4111 }, { "epoch": 1.234093637454982, "grad_norm": 0.1455063372850418, "learning_rate": 7.685159201017451e-05, "loss": 0.3726, "step": 4112 }, { "epoch": 1.2343937575030013, "grad_norm": 0.14195141196250916, "learning_rate": 7.680064048805326e-05, "loss": 0.3905, "step": 4113 }, { "epoch": 1.2346938775510203, "grad_norm": 0.13561753928661346, "learning_rate": 7.674969533041028e-05, "loss": 0.373, "step": 4114 }, { "epoch": 1.2349939975990396, "grad_norm": 0.13975995779037476, "learning_rate": 7.669875655122174e-05, "loss": 0.3559, "step": 4115 }, { "epoch": 1.2352941176470589, "grad_norm": 0.1478702872991562, "learning_rate": 7.664782416446221e-05, "loss": 0.3822, "step": 4116 }, { "epoch": 1.2355942376950781, "grad_norm": 0.12930183112621307, "learning_rate": 7.659689818410433e-05, "loss": 0.3364, "step": 4117 }, { "epoch": 1.2358943577430972, "grad_norm": 0.15074846148490906, "learning_rate": 7.654597862411906e-05, "loss": 0.4286, "step": 4118 }, { "epoch": 1.2361944777911165, "grad_norm": 0.14158159494400024, "learning_rate": 7.649506549847564e-05, "loss": 0.3773, "step": 4119 }, { "epoch": 1.2364945978391357, "grad_norm": 0.1416250467300415, "learning_rate": 7.644415882114145e-05, "loss": 0.3834, "step": 4120 }, { "epoch": 1.2367947178871548, "grad_norm": 0.14171995222568512, "learning_rate": 7.639325860608221e-05, "loss": 0.3819, "step": 4121 }, { "epoch": 1.237094837935174, "grad_norm": 0.14769646525382996, "learning_rate": 7.63423648672618e-05, "loss": 0.4328, "step": 4122 }, { "epoch": 1.2373949579831933, "grad_norm": 0.2000008076429367, "learning_rate": 7.629147761864229e-05, "loss": 0.4315, "step": 4123 }, { "epoch": 1.2376950780312126, "grad_norm": 0.13998661935329437, "learning_rate": 7.624059687418403e-05, "loss": 0.3934, "step": 4124 }, { "epoch": 1.2379951980792316, "grad_norm": 0.15924957394599915, "learning_rate": 7.61897226478456e-05, "loss": 0.3787, "step": 4125 }, { "epoch": 1.238295318127251, "grad_norm": 0.13868294656276703, "learning_rate": 7.613885495358371e-05, "loss": 0.3768, "step": 4126 }, { "epoch": 1.2385954381752702, "grad_norm": 0.17898708581924438, "learning_rate": 7.608799380535339e-05, "loss": 0.3926, "step": 4127 }, { "epoch": 1.2388955582232892, "grad_norm": 0.14534014463424683, "learning_rate": 7.603713921710779e-05, "loss": 0.4193, "step": 4128 }, { "epoch": 1.2391956782713085, "grad_norm": 0.13152074813842773, "learning_rate": 7.598629120279823e-05, "loss": 0.3697, "step": 4129 }, { "epoch": 1.2394957983193278, "grad_norm": 0.1317710429430008, "learning_rate": 7.593544977637436e-05, "loss": 0.3629, "step": 4130 }, { "epoch": 1.239795918367347, "grad_norm": 0.1301366239786148, "learning_rate": 7.588461495178388e-05, "loss": 0.357, "step": 4131 }, { "epoch": 1.240096038415366, "grad_norm": 0.1713320016860962, "learning_rate": 7.583378674297276e-05, "loss": 0.3781, "step": 4132 }, { "epoch": 1.2403961584633854, "grad_norm": 0.1516532003879547, "learning_rate": 7.57829651638852e-05, "loss": 0.4168, "step": 4133 }, { "epoch": 1.2406962785114046, "grad_norm": 0.13926361501216888, "learning_rate": 7.573215022846339e-05, "loss": 0.3986, "step": 4134 }, { "epoch": 1.2409963985594237, "grad_norm": 0.22416581213474274, "learning_rate": 7.568134195064794e-05, "loss": 0.4877, "step": 4135 }, { "epoch": 1.241296518607443, "grad_norm": 0.14472880959510803, "learning_rate": 7.563054034437747e-05, "loss": 0.34, "step": 4136 }, { "epoch": 1.2415966386554622, "grad_norm": 0.13605383038520813, "learning_rate": 7.557974542358878e-05, "loss": 0.3713, "step": 4137 }, { "epoch": 1.2418967587034815, "grad_norm": 0.15049508213996887, "learning_rate": 7.552895720221697e-05, "loss": 0.4158, "step": 4138 }, { "epoch": 1.2421968787515005, "grad_norm": 0.1582769900560379, "learning_rate": 7.547817569419515e-05, "loss": 0.4324, "step": 4139 }, { "epoch": 1.2424969987995198, "grad_norm": 0.1359313726425171, "learning_rate": 7.54274009134546e-05, "loss": 0.3952, "step": 4140 }, { "epoch": 1.242797118847539, "grad_norm": 0.1425129920244217, "learning_rate": 7.537663287392489e-05, "loss": 0.3736, "step": 4141 }, { "epoch": 1.2430972388955581, "grad_norm": 0.13735489547252655, "learning_rate": 7.532587158953357e-05, "loss": 0.3828, "step": 4142 }, { "epoch": 1.2433973589435774, "grad_norm": 0.14814046025276184, "learning_rate": 7.527511707420646e-05, "loss": 0.4005, "step": 4143 }, { "epoch": 1.2436974789915967, "grad_norm": 0.13848213851451874, "learning_rate": 7.52243693418675e-05, "loss": 0.3814, "step": 4144 }, { "epoch": 1.243997599039616, "grad_norm": 0.1393675059080124, "learning_rate": 7.517362840643868e-05, "loss": 0.385, "step": 4145 }, { "epoch": 1.244297719087635, "grad_norm": 0.13988974690437317, "learning_rate": 7.512289428184025e-05, "loss": 0.4183, "step": 4146 }, { "epoch": 1.2445978391356542, "grad_norm": 0.13378490507602692, "learning_rate": 7.507216698199056e-05, "loss": 0.4053, "step": 4147 }, { "epoch": 1.2448979591836735, "grad_norm": 0.14503192901611328, "learning_rate": 7.502144652080597e-05, "loss": 0.398, "step": 4148 }, { "epoch": 1.2451980792316926, "grad_norm": 0.1375621259212494, "learning_rate": 7.497073291220111e-05, "loss": 0.3977, "step": 4149 }, { "epoch": 1.2454981992797118, "grad_norm": 0.14542661607265472, "learning_rate": 7.492002617008866e-05, "loss": 0.3893, "step": 4150 }, { "epoch": 1.245798319327731, "grad_norm": 0.14464782178401947, "learning_rate": 7.486932630837948e-05, "loss": 0.3877, "step": 4151 }, { "epoch": 1.2460984393757504, "grad_norm": 0.13716407120227814, "learning_rate": 7.481863334098247e-05, "loss": 0.3596, "step": 4152 }, { "epoch": 1.2463985594237694, "grad_norm": 0.15377911925315857, "learning_rate": 7.476794728180463e-05, "loss": 0.4502, "step": 4153 }, { "epoch": 1.2466986794717887, "grad_norm": 0.15197394788265228, "learning_rate": 7.471726814475118e-05, "loss": 0.4417, "step": 4154 }, { "epoch": 1.246998799519808, "grad_norm": 0.14850930869579315, "learning_rate": 7.466659594372527e-05, "loss": 0.4103, "step": 4155 }, { "epoch": 1.247298919567827, "grad_norm": 0.1434469372034073, "learning_rate": 7.461593069262826e-05, "loss": 0.3743, "step": 4156 }, { "epoch": 1.2475990396158463, "grad_norm": 0.13946932554244995, "learning_rate": 7.456527240535962e-05, "loss": 0.3852, "step": 4157 }, { "epoch": 1.2478991596638656, "grad_norm": 0.13940726220607758, "learning_rate": 7.451462109581687e-05, "loss": 0.4089, "step": 4158 }, { "epoch": 1.2481992797118848, "grad_norm": 0.14511612057685852, "learning_rate": 7.446397677789551e-05, "loss": 0.4035, "step": 4159 }, { "epoch": 1.2484993997599039, "grad_norm": 0.13636872172355652, "learning_rate": 7.441333946548939e-05, "loss": 0.3923, "step": 4160 }, { "epoch": 1.2487995198079231, "grad_norm": 0.1461837887763977, "learning_rate": 7.436270917249013e-05, "loss": 0.3883, "step": 4161 }, { "epoch": 1.2490996398559424, "grad_norm": 0.12561574578285217, "learning_rate": 7.431208591278771e-05, "loss": 0.3404, "step": 4162 }, { "epoch": 1.2493997599039615, "grad_norm": 0.14430075883865356, "learning_rate": 7.426146970026993e-05, "loss": 0.3803, "step": 4163 }, { "epoch": 1.2496998799519807, "grad_norm": 0.13891753554344177, "learning_rate": 7.421086054882278e-05, "loss": 0.3907, "step": 4164 }, { "epoch": 1.25, "grad_norm": 0.14693672955036163, "learning_rate": 7.416025847233037e-05, "loss": 0.4115, "step": 4165 }, { "epoch": 1.2503001200480193, "grad_norm": 0.12822362780570984, "learning_rate": 7.410966348467476e-05, "loss": 0.345, "step": 4166 }, { "epoch": 1.2506002400960385, "grad_norm": 0.1424485146999359, "learning_rate": 7.405907559973606e-05, "loss": 0.3831, "step": 4167 }, { "epoch": 1.2509003601440576, "grad_norm": 0.14128592610359192, "learning_rate": 7.400849483139252e-05, "loss": 0.3899, "step": 4168 }, { "epoch": 1.2512004801920769, "grad_norm": 0.14493753015995026, "learning_rate": 7.395792119352041e-05, "loss": 0.3927, "step": 4169 }, { "epoch": 1.251500600240096, "grad_norm": 0.14592917263507843, "learning_rate": 7.390735469999398e-05, "loss": 0.4198, "step": 4170 }, { "epoch": 1.2518007202881152, "grad_norm": 0.14330831170082092, "learning_rate": 7.385679536468562e-05, "loss": 0.3662, "step": 4171 }, { "epoch": 1.2521008403361344, "grad_norm": 0.1558278650045395, "learning_rate": 7.380624320146566e-05, "loss": 0.45, "step": 4172 }, { "epoch": 1.2524009603841537, "grad_norm": 0.16887469589710236, "learning_rate": 7.375569822420254e-05, "loss": 0.4014, "step": 4173 }, { "epoch": 1.252701080432173, "grad_norm": 0.15975980460643768, "learning_rate": 7.370516044676267e-05, "loss": 0.3844, "step": 4174 }, { "epoch": 1.253001200480192, "grad_norm": 0.1546444296836853, "learning_rate": 7.365462988301052e-05, "loss": 0.4198, "step": 4175 }, { "epoch": 1.2533013205282113, "grad_norm": 0.12470715492963791, "learning_rate": 7.360410654680858e-05, "loss": 0.3186, "step": 4176 }, { "epoch": 1.2536014405762306, "grad_norm": 0.14100395143032074, "learning_rate": 7.355359045201734e-05, "loss": 0.393, "step": 4177 }, { "epoch": 1.2539015606242496, "grad_norm": 0.14127913117408752, "learning_rate": 7.350308161249528e-05, "loss": 0.4104, "step": 4178 }, { "epoch": 1.254201680672269, "grad_norm": 0.1390364170074463, "learning_rate": 7.345258004209899e-05, "loss": 0.3893, "step": 4179 }, { "epoch": 1.2545018007202882, "grad_norm": 0.14757046103477478, "learning_rate": 7.340208575468291e-05, "loss": 0.4398, "step": 4180 }, { "epoch": 1.2548019207683074, "grad_norm": 0.143103688955307, "learning_rate": 7.335159876409966e-05, "loss": 0.4398, "step": 4181 }, { "epoch": 1.2551020408163265, "grad_norm": 0.13156215846538544, "learning_rate": 7.33011190841997e-05, "loss": 0.3688, "step": 4182 }, { "epoch": 1.2554021608643458, "grad_norm": 0.16004370152950287, "learning_rate": 7.325064672883157e-05, "loss": 0.3895, "step": 4183 }, { "epoch": 1.255702280912365, "grad_norm": 0.14629091322422028, "learning_rate": 7.32001817118418e-05, "loss": 0.423, "step": 4184 }, { "epoch": 1.256002400960384, "grad_norm": 0.15015138685703278, "learning_rate": 7.31497240470749e-05, "loss": 0.4306, "step": 4185 }, { "epoch": 1.2563025210084033, "grad_norm": 0.19345088303089142, "learning_rate": 7.30992737483733e-05, "loss": 0.4566, "step": 4186 }, { "epoch": 1.2566026410564226, "grad_norm": 0.15372827649116516, "learning_rate": 7.304883082957747e-05, "loss": 0.3913, "step": 4187 }, { "epoch": 1.2569027611044419, "grad_norm": 0.14005455374717712, "learning_rate": 7.29983953045259e-05, "loss": 0.3894, "step": 4188 }, { "epoch": 1.257202881152461, "grad_norm": 0.12181244045495987, "learning_rate": 7.294796718705492e-05, "loss": 0.3323, "step": 4189 }, { "epoch": 1.2575030012004802, "grad_norm": 0.16262075304985046, "learning_rate": 7.289754649099897e-05, "loss": 0.446, "step": 4190 }, { "epoch": 1.2578031212484995, "grad_norm": 0.15365153551101685, "learning_rate": 7.284713323019035e-05, "loss": 0.4019, "step": 4191 }, { "epoch": 1.2581032412965185, "grad_norm": 0.14961516857147217, "learning_rate": 7.279672741845942e-05, "loss": 0.3561, "step": 4192 }, { "epoch": 1.2584033613445378, "grad_norm": 0.12542888522148132, "learning_rate": 7.274632906963437e-05, "loss": 0.3271, "step": 4193 }, { "epoch": 1.258703481392557, "grad_norm": 0.12507741153240204, "learning_rate": 7.269593819754142e-05, "loss": 0.3316, "step": 4194 }, { "epoch": 1.2590036014405763, "grad_norm": 0.13399529457092285, "learning_rate": 7.264555481600476e-05, "loss": 0.3296, "step": 4195 }, { "epoch": 1.2593037214885954, "grad_norm": 0.13027550280094147, "learning_rate": 7.259517893884647e-05, "loss": 0.3587, "step": 4196 }, { "epoch": 1.2596038415366146, "grad_norm": 0.14164437353610992, "learning_rate": 7.254481057988658e-05, "loss": 0.3872, "step": 4197 }, { "epoch": 1.259903961584634, "grad_norm": 0.14004860818386078, "learning_rate": 7.249444975294313e-05, "loss": 0.3901, "step": 4198 }, { "epoch": 1.260204081632653, "grad_norm": 0.13881249725818634, "learning_rate": 7.244409647183197e-05, "loss": 0.3914, "step": 4199 }, { "epoch": 1.2605042016806722, "grad_norm": 0.14471213519573212, "learning_rate": 7.239375075036697e-05, "loss": 0.4063, "step": 4200 }, { "epoch": 1.2608043217286915, "grad_norm": 0.16577906906604767, "learning_rate": 7.23434126023599e-05, "loss": 0.3837, "step": 4201 }, { "epoch": 1.2611044417767108, "grad_norm": 0.14864365756511688, "learning_rate": 7.229308204162044e-05, "loss": 0.421, "step": 4202 }, { "epoch": 1.2614045618247298, "grad_norm": 0.13306841254234314, "learning_rate": 7.224275908195626e-05, "loss": 0.3264, "step": 4203 }, { "epoch": 1.261704681872749, "grad_norm": 0.13763290643692017, "learning_rate": 7.219244373717285e-05, "loss": 0.3785, "step": 4204 }, { "epoch": 1.2620048019207684, "grad_norm": 0.14042170345783234, "learning_rate": 7.214213602107357e-05, "loss": 0.3886, "step": 4205 }, { "epoch": 1.2623049219687874, "grad_norm": 0.17335915565490723, "learning_rate": 7.20918359474599e-05, "loss": 0.4584, "step": 4206 }, { "epoch": 1.2626050420168067, "grad_norm": 0.1289275735616684, "learning_rate": 7.204154353013102e-05, "loss": 0.3179, "step": 4207 }, { "epoch": 1.262905162064826, "grad_norm": 0.13672125339508057, "learning_rate": 7.199125878288406e-05, "loss": 0.3748, "step": 4208 }, { "epoch": 1.2632052821128452, "grad_norm": 0.14343659579753876, "learning_rate": 7.19409817195141e-05, "loss": 0.3674, "step": 4209 }, { "epoch": 1.2635054021608643, "grad_norm": 0.13902506232261658, "learning_rate": 7.189071235381406e-05, "loss": 0.3879, "step": 4210 }, { "epoch": 1.2638055222088835, "grad_norm": 0.12468855828046799, "learning_rate": 7.184045069957482e-05, "loss": 0.3322, "step": 4211 }, { "epoch": 1.2641056422569028, "grad_norm": 0.14416514337062836, "learning_rate": 7.179019677058499e-05, "loss": 0.4047, "step": 4212 }, { "epoch": 1.2644057623049219, "grad_norm": 0.13998650014400482, "learning_rate": 7.173995058063119e-05, "loss": 0.3885, "step": 4213 }, { "epoch": 1.2647058823529411, "grad_norm": 0.14294609427452087, "learning_rate": 7.168971214349792e-05, "loss": 0.3861, "step": 4214 }, { "epoch": 1.2650060024009604, "grad_norm": 0.13009285926818848, "learning_rate": 7.16394814729675e-05, "loss": 0.3611, "step": 4215 }, { "epoch": 1.2653061224489797, "grad_norm": 0.15736983716487885, "learning_rate": 7.158925858282012e-05, "loss": 0.431, "step": 4216 }, { "epoch": 1.265606242496999, "grad_norm": 0.14100012183189392, "learning_rate": 7.153904348683393e-05, "loss": 0.3805, "step": 4217 }, { "epoch": 1.265906362545018, "grad_norm": 0.14148086309432983, "learning_rate": 7.148883619878478e-05, "loss": 0.395, "step": 4218 }, { "epoch": 1.2662064825930373, "grad_norm": 0.13538607954978943, "learning_rate": 7.143863673244648e-05, "loss": 0.3745, "step": 4219 }, { "epoch": 1.2665066026410563, "grad_norm": 0.14992834627628326, "learning_rate": 7.138844510159069e-05, "loss": 0.4278, "step": 4220 }, { "epoch": 1.2668067226890756, "grad_norm": 0.1743413656949997, "learning_rate": 7.13382613199869e-05, "loss": 0.4418, "step": 4221 }, { "epoch": 1.2671068427370948, "grad_norm": 0.14814399182796478, "learning_rate": 7.128808540140249e-05, "loss": 0.4424, "step": 4222 }, { "epoch": 1.267406962785114, "grad_norm": 0.1443677395582199, "learning_rate": 7.123791735960265e-05, "loss": 0.4125, "step": 4223 }, { "epoch": 1.2677070828331334, "grad_norm": 0.18257102370262146, "learning_rate": 7.11877572083503e-05, "loss": 0.3816, "step": 4224 }, { "epoch": 1.2680072028811524, "grad_norm": 0.1487230807542801, "learning_rate": 7.113760496140644e-05, "loss": 0.4117, "step": 4225 }, { "epoch": 1.2683073229291717, "grad_norm": 0.1354549527168274, "learning_rate": 7.108746063252971e-05, "loss": 0.3603, "step": 4226 }, { "epoch": 1.2686074429771907, "grad_norm": 0.14360687136650085, "learning_rate": 7.103732423547659e-05, "loss": 0.3853, "step": 4227 }, { "epoch": 1.26890756302521, "grad_norm": 0.12622900307178497, "learning_rate": 7.098719578400148e-05, "loss": 0.3271, "step": 4228 }, { "epoch": 1.2692076830732293, "grad_norm": 0.1468481868505478, "learning_rate": 7.093707529185652e-05, "loss": 0.3709, "step": 4229 }, { "epoch": 1.2695078031212486, "grad_norm": 0.14633765816688538, "learning_rate": 7.088696277279175e-05, "loss": 0.3869, "step": 4230 }, { "epoch": 1.2698079231692678, "grad_norm": 0.1497291624546051, "learning_rate": 7.083685824055489e-05, "loss": 0.4076, "step": 4231 }, { "epoch": 1.2701080432172869, "grad_norm": 0.14628471434116364, "learning_rate": 7.078676170889153e-05, "loss": 0.4004, "step": 4232 }, { "epoch": 1.2704081632653061, "grad_norm": 0.1478734165430069, "learning_rate": 7.073667319154516e-05, "loss": 0.4367, "step": 4233 }, { "epoch": 1.2707082833133252, "grad_norm": 0.14400777220726013, "learning_rate": 7.068659270225692e-05, "loss": 0.3671, "step": 4234 }, { "epoch": 1.2710084033613445, "grad_norm": 0.13357645273208618, "learning_rate": 7.063652025476586e-05, "loss": 0.3576, "step": 4235 }, { "epoch": 1.2713085234093637, "grad_norm": 0.1438428908586502, "learning_rate": 7.05864558628088e-05, "loss": 0.3985, "step": 4236 }, { "epoch": 1.271608643457383, "grad_norm": 0.14982417225837708, "learning_rate": 7.053639954012028e-05, "loss": 0.3937, "step": 4237 }, { "epoch": 1.2719087635054023, "grad_norm": 0.1599334180355072, "learning_rate": 7.048635130043268e-05, "loss": 0.4364, "step": 4238 }, { "epoch": 1.2722088835534213, "grad_norm": 0.15178163349628448, "learning_rate": 7.04363111574762e-05, "loss": 0.3944, "step": 4239 }, { "epoch": 1.2725090036014406, "grad_norm": 0.13461163640022278, "learning_rate": 7.038627912497873e-05, "loss": 0.3928, "step": 4240 }, { "epoch": 1.2728091236494599, "grad_norm": 0.15435433387756348, "learning_rate": 7.033625521666605e-05, "loss": 0.3937, "step": 4241 }, { "epoch": 1.273109243697479, "grad_norm": 0.1563456654548645, "learning_rate": 7.028623944626162e-05, "loss": 0.4307, "step": 4242 }, { "epoch": 1.2734093637454982, "grad_norm": 0.1376456320285797, "learning_rate": 7.023623182748662e-05, "loss": 0.3446, "step": 4243 }, { "epoch": 1.2737094837935174, "grad_norm": 0.13631781935691833, "learning_rate": 7.018623237406019e-05, "loss": 0.3721, "step": 4244 }, { "epoch": 1.2740096038415367, "grad_norm": 0.14110277593135834, "learning_rate": 7.013624109969902e-05, "loss": 0.387, "step": 4245 }, { "epoch": 1.2743097238895558, "grad_norm": 0.13850592076778412, "learning_rate": 7.008625801811767e-05, "loss": 0.366, "step": 4246 }, { "epoch": 1.274609843937575, "grad_norm": 0.15983489155769348, "learning_rate": 7.003628314302844e-05, "loss": 0.3785, "step": 4247 }, { "epoch": 1.2749099639855943, "grad_norm": 0.1383303701877594, "learning_rate": 6.998631648814136e-05, "loss": 0.4, "step": 4248 }, { "epoch": 1.2752100840336134, "grad_norm": 0.17634017765522003, "learning_rate": 6.993635806716412e-05, "loss": 0.4315, "step": 4249 }, { "epoch": 1.2755102040816326, "grad_norm": 0.1412407010793686, "learning_rate": 6.988640789380241e-05, "loss": 0.3821, "step": 4250 }, { "epoch": 1.275810324129652, "grad_norm": 0.14472277462482452, "learning_rate": 6.983646598175932e-05, "loss": 0.3821, "step": 4251 }, { "epoch": 1.2761104441776712, "grad_norm": 0.13960790634155273, "learning_rate": 6.978653234473596e-05, "loss": 0.3933, "step": 4252 }, { "epoch": 1.2764105642256902, "grad_norm": 0.13559602200984955, "learning_rate": 6.973660699643101e-05, "loss": 0.3692, "step": 4253 }, { "epoch": 1.2767106842737095, "grad_norm": 0.1367492526769638, "learning_rate": 6.968668995054087e-05, "loss": 0.3742, "step": 4254 }, { "epoch": 1.2770108043217288, "grad_norm": 0.13935504853725433, "learning_rate": 6.96367812207598e-05, "loss": 0.3747, "step": 4255 }, { "epoch": 1.2773109243697478, "grad_norm": 0.13473719358444214, "learning_rate": 6.958688082077963e-05, "loss": 0.3574, "step": 4256 }, { "epoch": 1.277611044417767, "grad_norm": 0.14837577939033508, "learning_rate": 6.953698876428995e-05, "loss": 0.3637, "step": 4257 }, { "epoch": 1.2779111644657863, "grad_norm": 0.13354167342185974, "learning_rate": 6.948710506497811e-05, "loss": 0.3746, "step": 4258 }, { "epoch": 1.2782112845138056, "grad_norm": 0.1450507640838623, "learning_rate": 6.94372297365291e-05, "loss": 0.4138, "step": 4259 }, { "epoch": 1.2785114045618247, "grad_norm": 0.1256970465183258, "learning_rate": 6.938736279262567e-05, "loss": 0.3429, "step": 4260 }, { "epoch": 1.278811524609844, "grad_norm": 0.13689365983009338, "learning_rate": 6.933750424694828e-05, "loss": 0.3369, "step": 4261 }, { "epoch": 1.2791116446578632, "grad_norm": 0.1337195336818695, "learning_rate": 6.928765411317492e-05, "loss": 0.3688, "step": 4262 }, { "epoch": 1.2794117647058822, "grad_norm": 0.14053617417812347, "learning_rate": 6.923781240498156e-05, "loss": 0.3618, "step": 4263 }, { "epoch": 1.2797118847539015, "grad_norm": 0.1282060593366623, "learning_rate": 6.91879791360416e-05, "loss": 0.3366, "step": 4264 }, { "epoch": 1.2800120048019208, "grad_norm": 0.1514139473438263, "learning_rate": 6.913815432002625e-05, "loss": 0.3489, "step": 4265 }, { "epoch": 1.28031212484994, "grad_norm": 0.14885970950126648, "learning_rate": 6.90883379706044e-05, "loss": 0.412, "step": 4266 }, { "epoch": 1.280612244897959, "grad_norm": 0.1294536143541336, "learning_rate": 6.903853010144259e-05, "loss": 0.3594, "step": 4267 }, { "epoch": 1.2809123649459784, "grad_norm": 0.13592691719532013, "learning_rate": 6.898873072620498e-05, "loss": 0.3405, "step": 4268 }, { "epoch": 1.2812124849939976, "grad_norm": 0.1535409539937973, "learning_rate": 6.893893985855353e-05, "loss": 0.4069, "step": 4269 }, { "epoch": 1.2815126050420167, "grad_norm": 0.14617381989955902, "learning_rate": 6.888915751214774e-05, "loss": 0.413, "step": 4270 }, { "epoch": 1.281812725090036, "grad_norm": 0.21435479819774628, "learning_rate": 6.883938370064489e-05, "loss": 0.378, "step": 4271 }, { "epoch": 1.2821128451380552, "grad_norm": 0.14236077666282654, "learning_rate": 6.87896184376998e-05, "loss": 0.3811, "step": 4272 }, { "epoch": 1.2824129651860745, "grad_norm": 0.14209921658039093, "learning_rate": 6.8739861736965e-05, "loss": 0.3519, "step": 4273 }, { "epoch": 1.2827130852340938, "grad_norm": 0.13236385583877563, "learning_rate": 6.86901136120907e-05, "loss": 0.3702, "step": 4274 }, { "epoch": 1.2830132052821128, "grad_norm": 0.1324281394481659, "learning_rate": 6.864037407672474e-05, "loss": 0.3698, "step": 4275 }, { "epoch": 1.283313325330132, "grad_norm": 0.14125031232833862, "learning_rate": 6.85906431445125e-05, "loss": 0.4042, "step": 4276 }, { "epoch": 1.2836134453781511, "grad_norm": 0.1484401971101761, "learning_rate": 6.85409208290972e-05, "loss": 0.4272, "step": 4277 }, { "epoch": 1.2839135654261704, "grad_norm": 0.13767188787460327, "learning_rate": 6.849120714411954e-05, "loss": 0.3574, "step": 4278 }, { "epoch": 1.2842136854741897, "grad_norm": 0.13342134654521942, "learning_rate": 6.844150210321788e-05, "loss": 0.3515, "step": 4279 }, { "epoch": 1.284513805522209, "grad_norm": 0.15663060545921326, "learning_rate": 6.83918057200283e-05, "loss": 0.3796, "step": 4280 }, { "epoch": 1.2848139255702282, "grad_norm": 0.14326073229312897, "learning_rate": 6.83421180081843e-05, "loss": 0.3945, "step": 4281 }, { "epoch": 1.2851140456182473, "grad_norm": 0.15463420748710632, "learning_rate": 6.829243898131728e-05, "loss": 0.4049, "step": 4282 }, { "epoch": 1.2854141656662665, "grad_norm": 0.14835111796855927, "learning_rate": 6.824276865305604e-05, "loss": 0.3844, "step": 4283 }, { "epoch": 1.2857142857142856, "grad_norm": 0.12640972435474396, "learning_rate": 6.819310703702704e-05, "loss": 0.3507, "step": 4284 }, { "epoch": 1.2860144057623049, "grad_norm": 0.18915210664272308, "learning_rate": 6.814345414685444e-05, "loss": 0.4182, "step": 4285 }, { "epoch": 1.2863145258103241, "grad_norm": 0.13177894055843353, "learning_rate": 6.809380999615993e-05, "loss": 0.3386, "step": 4286 }, { "epoch": 1.2866146458583434, "grad_norm": 0.14102855324745178, "learning_rate": 6.804417459856273e-05, "loss": 0.3785, "step": 4287 }, { "epoch": 1.2869147659063627, "grad_norm": 0.1460040956735611, "learning_rate": 6.799454796767986e-05, "loss": 0.3954, "step": 4288 }, { "epoch": 1.2872148859543817, "grad_norm": 0.15009137988090515, "learning_rate": 6.794493011712573e-05, "loss": 0.3971, "step": 4289 }, { "epoch": 1.287515006002401, "grad_norm": 0.15093019604682922, "learning_rate": 6.789532106051246e-05, "loss": 0.3468, "step": 4290 }, { "epoch": 1.28781512605042, "grad_norm": 0.15437495708465576, "learning_rate": 6.784572081144975e-05, "loss": 0.3975, "step": 4291 }, { "epoch": 1.2881152460984393, "grad_norm": 0.15075330436229706, "learning_rate": 6.77961293835448e-05, "loss": 0.4207, "step": 4292 }, { "epoch": 1.2884153661464586, "grad_norm": 0.13437460362911224, "learning_rate": 6.77465467904025e-05, "loss": 0.3654, "step": 4293 }, { "epoch": 1.2887154861944778, "grad_norm": 0.14857985079288483, "learning_rate": 6.76969730456253e-05, "loss": 0.3991, "step": 4294 }, { "epoch": 1.2890156062424971, "grad_norm": 0.14068998396396637, "learning_rate": 6.764740816281308e-05, "loss": 0.3888, "step": 4295 }, { "epoch": 1.2893157262905162, "grad_norm": 0.1284976452589035, "learning_rate": 6.759785215556348e-05, "loss": 0.35, "step": 4296 }, { "epoch": 1.2896158463385354, "grad_norm": 0.13528770208358765, "learning_rate": 6.75483050374716e-05, "loss": 0.3762, "step": 4297 }, { "epoch": 1.2899159663865547, "grad_norm": 0.1504359394311905, "learning_rate": 6.74987668221301e-05, "loss": 0.3541, "step": 4298 }, { "epoch": 1.2902160864345738, "grad_norm": 0.13591429591178894, "learning_rate": 6.744923752312928e-05, "loss": 0.3461, "step": 4299 }, { "epoch": 1.290516206482593, "grad_norm": 0.13461627066135406, "learning_rate": 6.739971715405684e-05, "loss": 0.3765, "step": 4300 }, { "epoch": 1.2908163265306123, "grad_norm": 0.12580035626888275, "learning_rate": 6.735020572849827e-05, "loss": 0.3379, "step": 4301 }, { "epoch": 1.2911164465786316, "grad_norm": 0.13868200778961182, "learning_rate": 6.730070326003633e-05, "loss": 0.4032, "step": 4302 }, { "epoch": 1.2914165666266506, "grad_norm": 0.14685222506523132, "learning_rate": 6.725120976225148e-05, "loss": 0.3878, "step": 4303 }, { "epoch": 1.2917166866746699, "grad_norm": 0.15233665704727173, "learning_rate": 6.720172524872174e-05, "loss": 0.4384, "step": 4304 }, { "epoch": 1.2920168067226891, "grad_norm": 0.1587858945131302, "learning_rate": 6.715224973302262e-05, "loss": 0.4009, "step": 4305 }, { "epoch": 1.2923169267707082, "grad_norm": 0.13998299837112427, "learning_rate": 6.710278322872706e-05, "loss": 0.3761, "step": 4306 }, { "epoch": 1.2926170468187275, "grad_norm": 0.1423964649438858, "learning_rate": 6.705332574940577e-05, "loss": 0.4, "step": 4307 }, { "epoch": 1.2929171668667467, "grad_norm": 0.13591860234737396, "learning_rate": 6.700387730862676e-05, "loss": 0.3785, "step": 4308 }, { "epoch": 1.293217286914766, "grad_norm": 0.145105242729187, "learning_rate": 6.695443791995564e-05, "loss": 0.3881, "step": 4309 }, { "epoch": 1.293517406962785, "grad_norm": 0.14852149784564972, "learning_rate": 6.690500759695557e-05, "loss": 0.3549, "step": 4310 }, { "epoch": 1.2938175270108043, "grad_norm": 0.13705085217952728, "learning_rate": 6.685558635318716e-05, "loss": 0.3725, "step": 4311 }, { "epoch": 1.2941176470588236, "grad_norm": 0.14072710275650024, "learning_rate": 6.68061742022086e-05, "loss": 0.381, "step": 4312 }, { "epoch": 1.2944177671068426, "grad_norm": 0.1349777728319168, "learning_rate": 6.675677115757555e-05, "loss": 0.3241, "step": 4313 }, { "epoch": 1.294717887154862, "grad_norm": 0.1425703912973404, "learning_rate": 6.670737723284111e-05, "loss": 0.3685, "step": 4314 }, { "epoch": 1.2950180072028812, "grad_norm": 0.15010738372802734, "learning_rate": 6.665799244155599e-05, "loss": 0.4021, "step": 4315 }, { "epoch": 1.2953181272509005, "grad_norm": 0.14332538843154907, "learning_rate": 6.660861679726831e-05, "loss": 0.3895, "step": 4316 }, { "epoch": 1.2956182472989195, "grad_norm": 0.1347683072090149, "learning_rate": 6.655925031352373e-05, "loss": 0.3727, "step": 4317 }, { "epoch": 1.2959183673469388, "grad_norm": 0.13436463475227356, "learning_rate": 6.650989300386539e-05, "loss": 0.3522, "step": 4318 }, { "epoch": 1.296218487394958, "grad_norm": 0.16632862389087677, "learning_rate": 6.646054488183385e-05, "loss": 0.3476, "step": 4319 }, { "epoch": 1.296518607442977, "grad_norm": 0.13216231763362885, "learning_rate": 6.641120596096729e-05, "loss": 0.3535, "step": 4320 }, { "epoch": 1.2968187274909964, "grad_norm": 0.14814947545528412, "learning_rate": 6.636187625480122e-05, "loss": 0.4084, "step": 4321 }, { "epoch": 1.2971188475390156, "grad_norm": 0.13599202036857605, "learning_rate": 6.631255577686863e-05, "loss": 0.3794, "step": 4322 }, { "epoch": 1.297418967587035, "grad_norm": 0.13081775605678558, "learning_rate": 6.626324454070015e-05, "loss": 0.3527, "step": 4323 }, { "epoch": 1.297719087635054, "grad_norm": 0.1295832246541977, "learning_rate": 6.621394255982367e-05, "loss": 0.3602, "step": 4324 }, { "epoch": 1.2980192076830732, "grad_norm": 0.1495177000761032, "learning_rate": 6.616464984776459e-05, "loss": 0.4013, "step": 4325 }, { "epoch": 1.2983193277310925, "grad_norm": 0.1319665014743805, "learning_rate": 6.61153664180459e-05, "loss": 0.3559, "step": 4326 }, { "epoch": 1.2986194477791115, "grad_norm": 0.14115822315216064, "learning_rate": 6.606609228418787e-05, "loss": 0.4127, "step": 4327 }, { "epoch": 1.2989195678271308, "grad_norm": 0.14079435169696808, "learning_rate": 6.601682745970831e-05, "loss": 0.3385, "step": 4328 }, { "epoch": 1.29921968787515, "grad_norm": 0.140974760055542, "learning_rate": 6.596757195812249e-05, "loss": 0.4101, "step": 4329 }, { "epoch": 1.2995198079231693, "grad_norm": 0.13408349454402924, "learning_rate": 6.591832579294303e-05, "loss": 0.3618, "step": 4330 }, { "epoch": 1.2998199279711884, "grad_norm": 0.22065842151641846, "learning_rate": 6.586908897768011e-05, "loss": 0.4117, "step": 4331 }, { "epoch": 1.3001200480192077, "grad_norm": 0.14712445437908173, "learning_rate": 6.58198615258413e-05, "loss": 0.4249, "step": 4332 }, { "epoch": 1.300420168067227, "grad_norm": 0.1465093344449997, "learning_rate": 6.57706434509315e-05, "loss": 0.3969, "step": 4333 }, { "epoch": 1.300720288115246, "grad_norm": 0.14196144044399261, "learning_rate": 6.572143476645319e-05, "loss": 0.3926, "step": 4334 }, { "epoch": 1.3010204081632653, "grad_norm": 0.14304772019386292, "learning_rate": 6.56722354859062e-05, "loss": 0.4159, "step": 4335 }, { "epoch": 1.3013205282112845, "grad_norm": 0.12894688546657562, "learning_rate": 6.562304562278777e-05, "loss": 0.3409, "step": 4336 }, { "epoch": 1.3016206482593038, "grad_norm": 0.15281268954277039, "learning_rate": 6.557386519059258e-05, "loss": 0.4035, "step": 4337 }, { "epoch": 1.301920768307323, "grad_norm": 0.13902124762535095, "learning_rate": 6.552469420281277e-05, "loss": 0.3834, "step": 4338 }, { "epoch": 1.302220888355342, "grad_norm": 0.18085238337516785, "learning_rate": 6.547553267293773e-05, "loss": 0.3709, "step": 4339 }, { "epoch": 1.3025210084033614, "grad_norm": 0.130579873919487, "learning_rate": 6.542638061445447e-05, "loss": 0.3443, "step": 4340 }, { "epoch": 1.3028211284513804, "grad_norm": 0.14980711042881012, "learning_rate": 6.537723804084721e-05, "loss": 0.4067, "step": 4341 }, { "epoch": 1.3031212484993997, "grad_norm": 0.1403573751449585, "learning_rate": 6.532810496559772e-05, "loss": 0.3608, "step": 4342 }, { "epoch": 1.303421368547419, "grad_norm": 0.27642616629600525, "learning_rate": 6.527898140218507e-05, "loss": 0.3434, "step": 4343 }, { "epoch": 1.3037214885954382, "grad_norm": 0.1325029730796814, "learning_rate": 6.52298673640857e-05, "loss": 0.3457, "step": 4344 }, { "epoch": 1.3040216086434575, "grad_norm": 0.12115392833948135, "learning_rate": 6.518076286477357e-05, "loss": 0.3199, "step": 4345 }, { "epoch": 1.3043217286914766, "grad_norm": 0.15301509201526642, "learning_rate": 6.513166791771987e-05, "loss": 0.3954, "step": 4346 }, { "epoch": 1.3046218487394958, "grad_norm": 0.13748885691165924, "learning_rate": 6.508258253639324e-05, "loss": 0.3633, "step": 4347 }, { "epoch": 1.3049219687875149, "grad_norm": 0.12777800858020782, "learning_rate": 6.503350673425972e-05, "loss": 0.3416, "step": 4348 }, { "epoch": 1.3052220888355341, "grad_norm": 0.12664203345775604, "learning_rate": 6.498444052478268e-05, "loss": 0.3366, "step": 4349 }, { "epoch": 1.3055222088835534, "grad_norm": 0.15768717229366302, "learning_rate": 6.493538392142287e-05, "loss": 0.3392, "step": 4350 }, { "epoch": 1.3058223289315727, "grad_norm": 0.14314647018909454, "learning_rate": 6.488633693763844e-05, "loss": 0.3946, "step": 4351 }, { "epoch": 1.306122448979592, "grad_norm": 0.22907589375972748, "learning_rate": 6.48372995868848e-05, "loss": 0.3377, "step": 4352 }, { "epoch": 1.306422569027611, "grad_norm": 0.1399524062871933, "learning_rate": 6.478827188261484e-05, "loss": 0.3965, "step": 4353 }, { "epoch": 1.3067226890756303, "grad_norm": 0.14998413622379303, "learning_rate": 6.473925383827873e-05, "loss": 0.3794, "step": 4354 }, { "epoch": 1.3070228091236495, "grad_norm": 0.14053849875926971, "learning_rate": 6.469024546732399e-05, "loss": 0.3852, "step": 4355 }, { "epoch": 1.3073229291716686, "grad_norm": 0.13630418479442596, "learning_rate": 6.464124678319554e-05, "loss": 0.3831, "step": 4356 }, { "epoch": 1.3076230492196879, "grad_norm": 0.14305394887924194, "learning_rate": 6.459225779933562e-05, "loss": 0.379, "step": 4357 }, { "epoch": 1.3079231692677071, "grad_norm": 0.1302955448627472, "learning_rate": 6.454327852918372e-05, "loss": 0.3586, "step": 4358 }, { "epoch": 1.3082232893157264, "grad_norm": 0.12461972236633301, "learning_rate": 6.449430898617681e-05, "loss": 0.3082, "step": 4359 }, { "epoch": 1.3085234093637454, "grad_norm": 0.15870194137096405, "learning_rate": 6.444534918374906e-05, "loss": 0.3826, "step": 4360 }, { "epoch": 1.3088235294117647, "grad_norm": 0.1367909014225006, "learning_rate": 6.439639913533212e-05, "loss": 0.3797, "step": 4361 }, { "epoch": 1.309123649459784, "grad_norm": 0.12880021333694458, "learning_rate": 6.434745885435482e-05, "loss": 0.3103, "step": 4362 }, { "epoch": 1.309423769507803, "grad_norm": 0.14461173117160797, "learning_rate": 6.429852835424335e-05, "loss": 0.3728, "step": 4363 }, { "epoch": 1.3097238895558223, "grad_norm": 0.13231562077999115, "learning_rate": 6.424960764842129e-05, "loss": 0.3521, "step": 4364 }, { "epoch": 1.3100240096038416, "grad_norm": 0.14568877220153809, "learning_rate": 6.420069675030941e-05, "loss": 0.3915, "step": 4365 }, { "epoch": 1.3103241296518608, "grad_norm": 0.14014829695224762, "learning_rate": 6.415179567332587e-05, "loss": 0.3451, "step": 4366 }, { "epoch": 1.31062424969988, "grad_norm": 0.13011689484119415, "learning_rate": 6.410290443088613e-05, "loss": 0.3352, "step": 4367 }, { "epoch": 1.3109243697478992, "grad_norm": 0.13787321746349335, "learning_rate": 6.405402303640299e-05, "loss": 0.3268, "step": 4368 }, { "epoch": 1.3112244897959184, "grad_norm": 0.17194852232933044, "learning_rate": 6.400515150328639e-05, "loss": 0.4184, "step": 4369 }, { "epoch": 1.3115246098439375, "grad_norm": 0.1402439922094345, "learning_rate": 6.395628984494378e-05, "loss": 0.3786, "step": 4370 }, { "epoch": 1.3118247298919568, "grad_norm": 0.14670024812221527, "learning_rate": 6.39074380747797e-05, "loss": 0.3761, "step": 4371 }, { "epoch": 1.312124849939976, "grad_norm": 0.1530454009771347, "learning_rate": 6.385859620619619e-05, "loss": 0.4419, "step": 4372 }, { "epoch": 1.3124249699879953, "grad_norm": 0.14788678288459778, "learning_rate": 6.380976425259236e-05, "loss": 0.3608, "step": 4373 }, { "epoch": 1.3127250900360143, "grad_norm": 0.1467645764350891, "learning_rate": 6.376094222736473e-05, "loss": 0.4164, "step": 4374 }, { "epoch": 1.3130252100840336, "grad_norm": 0.14573605358600616, "learning_rate": 6.371213014390706e-05, "loss": 0.378, "step": 4375 }, { "epoch": 1.3133253301320529, "grad_norm": 0.1537141352891922, "learning_rate": 6.366332801561042e-05, "loss": 0.415, "step": 4376 }, { "epoch": 1.313625450180072, "grad_norm": 0.13787482678890228, "learning_rate": 6.361453585586304e-05, "loss": 0.3391, "step": 4377 }, { "epoch": 1.3139255702280912, "grad_norm": 0.24630169570446014, "learning_rate": 6.356575367805054e-05, "loss": 0.3776, "step": 4378 }, { "epoch": 1.3142256902761105, "grad_norm": 0.14193442463874817, "learning_rate": 6.351698149555573e-05, "loss": 0.401, "step": 4379 }, { "epoch": 1.3145258103241297, "grad_norm": 0.13105009496212006, "learning_rate": 6.346821932175873e-05, "loss": 0.3651, "step": 4380 }, { "epoch": 1.3148259303721488, "grad_norm": 0.12521019577980042, "learning_rate": 6.341946717003688e-05, "loss": 0.3149, "step": 4381 }, { "epoch": 1.315126050420168, "grad_norm": 0.1309904307126999, "learning_rate": 6.33707250537647e-05, "loss": 0.3532, "step": 4382 }, { "epoch": 1.3154261704681873, "grad_norm": 0.15112322568893433, "learning_rate": 6.332199298631416e-05, "loss": 0.4302, "step": 4383 }, { "epoch": 1.3157262905162064, "grad_norm": 0.1337057501077652, "learning_rate": 6.327327098105426e-05, "loss": 0.3649, "step": 4384 }, { "epoch": 1.3160264105642256, "grad_norm": 0.1362207680940628, "learning_rate": 6.322455905135129e-05, "loss": 0.3599, "step": 4385 }, { "epoch": 1.316326530612245, "grad_norm": 0.14857180416584015, "learning_rate": 6.317585721056889e-05, "loss": 0.4208, "step": 4386 }, { "epoch": 1.3166266506602642, "grad_norm": 0.1363288164138794, "learning_rate": 6.312716547206782e-05, "loss": 0.3787, "step": 4387 }, { "epoch": 1.3169267707082832, "grad_norm": 0.13450193405151367, "learning_rate": 6.307848384920607e-05, "loss": 0.3591, "step": 4388 }, { "epoch": 1.3172268907563025, "grad_norm": 0.13663101196289062, "learning_rate": 6.302981235533896e-05, "loss": 0.3525, "step": 4389 }, { "epoch": 1.3175270108043218, "grad_norm": 0.14981935918331146, "learning_rate": 6.298115100381882e-05, "loss": 0.4148, "step": 4390 }, { "epoch": 1.3178271308523408, "grad_norm": 0.14152216911315918, "learning_rate": 6.293249980799551e-05, "loss": 0.3772, "step": 4391 }, { "epoch": 1.31812725090036, "grad_norm": 0.1505315750837326, "learning_rate": 6.288385878121582e-05, "loss": 0.3849, "step": 4392 }, { "epoch": 1.3184273709483794, "grad_norm": 0.12864822149276733, "learning_rate": 6.283522793682387e-05, "loss": 0.321, "step": 4393 }, { "epoch": 1.3187274909963986, "grad_norm": 0.14504168927669525, "learning_rate": 6.278660728816097e-05, "loss": 0.3738, "step": 4394 }, { "epoch": 1.319027611044418, "grad_norm": 0.1445467323064804, "learning_rate": 6.273799684856568e-05, "loss": 0.3945, "step": 4395 }, { "epoch": 1.319327731092437, "grad_norm": 0.14859087765216827, "learning_rate": 6.268939663137366e-05, "loss": 0.4002, "step": 4396 }, { "epoch": 1.3196278511404562, "grad_norm": 0.13341623544692993, "learning_rate": 6.264080664991785e-05, "loss": 0.3408, "step": 4397 }, { "epoch": 1.3199279711884753, "grad_norm": 0.13407118618488312, "learning_rate": 6.259222691752837e-05, "loss": 0.3403, "step": 4398 }, { "epoch": 1.3202280912364945, "grad_norm": 0.1552957147359848, "learning_rate": 6.254365744753246e-05, "loss": 0.4055, "step": 4399 }, { "epoch": 1.3205282112845138, "grad_norm": 0.17686431109905243, "learning_rate": 6.249509825325467e-05, "loss": 0.3901, "step": 4400 }, { "epoch": 1.320828331332533, "grad_norm": 0.7685422897338867, "learning_rate": 6.24465493480166e-05, "loss": 0.3732, "step": 4401 }, { "epoch": 1.3211284513805523, "grad_norm": 0.14606846868991852, "learning_rate": 6.239801074513714e-05, "loss": 0.4273, "step": 4402 }, { "epoch": 1.3214285714285714, "grad_norm": 0.14612948894500732, "learning_rate": 6.234948245793224e-05, "loss": 0.3866, "step": 4403 }, { "epoch": 1.3217286914765907, "grad_norm": 0.1363631933927536, "learning_rate": 6.230096449971509e-05, "loss": 0.3642, "step": 4404 }, { "epoch": 1.3220288115246097, "grad_norm": 0.13283447921276093, "learning_rate": 6.225245688379607e-05, "loss": 0.3577, "step": 4405 }, { "epoch": 1.322328931572629, "grad_norm": 0.13503074645996094, "learning_rate": 6.220395962348266e-05, "loss": 0.3462, "step": 4406 }, { "epoch": 1.3226290516206483, "grad_norm": 0.2346821129322052, "learning_rate": 6.215547273207953e-05, "loss": 0.3796, "step": 4407 }, { "epoch": 1.3229291716686675, "grad_norm": 0.1385582536458969, "learning_rate": 6.210699622288853e-05, "loss": 0.3531, "step": 4408 }, { "epoch": 1.3232292917166868, "grad_norm": 0.15664862096309662, "learning_rate": 6.205853010920857e-05, "loss": 0.4227, "step": 4409 }, { "epoch": 1.3235294117647058, "grad_norm": 0.151127889752388, "learning_rate": 6.201007440433588e-05, "loss": 0.368, "step": 4410 }, { "epoch": 1.3238295318127251, "grad_norm": 0.14864672720432281, "learning_rate": 6.196162912156363e-05, "loss": 0.3827, "step": 4411 }, { "epoch": 1.3241296518607442, "grad_norm": 0.1425596922636032, "learning_rate": 6.191319427418225e-05, "loss": 0.368, "step": 4412 }, { "epoch": 1.3244297719087634, "grad_norm": 0.14225400984287262, "learning_rate": 6.18647698754793e-05, "loss": 0.3683, "step": 4413 }, { "epoch": 1.3247298919567827, "grad_norm": 0.14955322444438934, "learning_rate": 6.18163559387395e-05, "loss": 0.3813, "step": 4414 }, { "epoch": 1.325030012004802, "grad_norm": 0.15107311308383942, "learning_rate": 6.176795247724452e-05, "loss": 0.3703, "step": 4415 }, { "epoch": 1.3253301320528212, "grad_norm": 0.1335255354642868, "learning_rate": 6.171955950427346e-05, "loss": 0.3483, "step": 4416 }, { "epoch": 1.3256302521008403, "grad_norm": 0.1327674239873886, "learning_rate": 6.167117703310229e-05, "loss": 0.3497, "step": 4417 }, { "epoch": 1.3259303721488596, "grad_norm": 0.14339998364448547, "learning_rate": 6.162280507700418e-05, "loss": 0.3877, "step": 4418 }, { "epoch": 1.3262304921968788, "grad_norm": 0.1410233974456787, "learning_rate": 6.157444364924945e-05, "loss": 0.3739, "step": 4419 }, { "epoch": 1.3265306122448979, "grad_norm": 0.1621217131614685, "learning_rate": 6.152609276310549e-05, "loss": 0.3469, "step": 4420 }, { "epoch": 1.3268307322929171, "grad_norm": 0.13499078154563904, "learning_rate": 6.147775243183684e-05, "loss": 0.3543, "step": 4421 }, { "epoch": 1.3271308523409364, "grad_norm": 0.1349930465221405, "learning_rate": 6.142942266870509e-05, "loss": 0.3512, "step": 4422 }, { "epoch": 1.3274309723889557, "grad_norm": 0.1554274559020996, "learning_rate": 6.138110348696893e-05, "loss": 0.3829, "step": 4423 }, { "epoch": 1.3277310924369747, "grad_norm": 0.1325419694185257, "learning_rate": 6.133279489988421e-05, "loss": 0.3456, "step": 4424 }, { "epoch": 1.328031212484994, "grad_norm": 0.1462012231349945, "learning_rate": 6.128449692070384e-05, "loss": 0.3833, "step": 4425 }, { "epoch": 1.3283313325330133, "grad_norm": 0.15544404089450836, "learning_rate": 6.123620956267778e-05, "loss": 0.4109, "step": 4426 }, { "epoch": 1.3286314525810323, "grad_norm": 0.1198524534702301, "learning_rate": 6.118793283905319e-05, "loss": 0.3189, "step": 4427 }, { "epoch": 1.3289315726290516, "grad_norm": 0.13243626058101654, "learning_rate": 6.113966676307414e-05, "loss": 0.3354, "step": 4428 }, { "epoch": 1.3292316926770709, "grad_norm": 0.14919179677963257, "learning_rate": 6.109141134798194e-05, "loss": 0.4128, "step": 4429 }, { "epoch": 1.3295318127250901, "grad_norm": 0.1565873920917511, "learning_rate": 6.104316660701485e-05, "loss": 0.3819, "step": 4430 }, { "epoch": 1.3298319327731092, "grad_norm": 0.16061201691627502, "learning_rate": 6.099493255340832e-05, "loss": 0.406, "step": 4431 }, { "epoch": 1.3301320528211285, "grad_norm": 0.14444883167743683, "learning_rate": 6.0946709200394804e-05, "loss": 0.3822, "step": 4432 }, { "epoch": 1.3304321728691477, "grad_norm": 0.1577412635087967, "learning_rate": 6.089849656120383e-05, "loss": 0.3619, "step": 4433 }, { "epoch": 1.3307322929171668, "grad_norm": 0.16566117107868195, "learning_rate": 6.085029464906189e-05, "loss": 0.3786, "step": 4434 }, { "epoch": 1.331032412965186, "grad_norm": 0.14696945250034332, "learning_rate": 6.0802103477192775e-05, "loss": 0.4221, "step": 4435 }, { "epoch": 1.3313325330132053, "grad_norm": 0.14580310881137848, "learning_rate": 6.0753923058817084e-05, "loss": 0.3926, "step": 4436 }, { "epoch": 1.3316326530612246, "grad_norm": 0.1414540857076645, "learning_rate": 6.0705753407152565e-05, "loss": 0.3649, "step": 4437 }, { "epoch": 1.3319327731092436, "grad_norm": 0.1575811207294464, "learning_rate": 6.065759453541404e-05, "loss": 0.3583, "step": 4438 }, { "epoch": 1.332232893157263, "grad_norm": 0.1364908516407013, "learning_rate": 6.06094464568133e-05, "loss": 0.3741, "step": 4439 }, { "epoch": 1.3325330132052822, "grad_norm": 0.14941635727882385, "learning_rate": 6.056130918455929e-05, "loss": 0.4276, "step": 4440 }, { "epoch": 1.3328331332533012, "grad_norm": 0.1328786164522171, "learning_rate": 6.0513182731857886e-05, "loss": 0.3223, "step": 4441 }, { "epoch": 1.3331332533013205, "grad_norm": 0.14873063564300537, "learning_rate": 6.0465067111912e-05, "loss": 0.3706, "step": 4442 }, { "epoch": 1.3334333733493398, "grad_norm": 0.14439992606639862, "learning_rate": 6.041696233792162e-05, "loss": 0.3943, "step": 4443 }, { "epoch": 1.333733493397359, "grad_norm": 0.2462586611509323, "learning_rate": 6.0368868423083745e-05, "loss": 0.3878, "step": 4444 }, { "epoch": 1.334033613445378, "grad_norm": 0.15387386083602905, "learning_rate": 6.032078538059236e-05, "loss": 0.4222, "step": 4445 }, { "epoch": 1.3343337334933973, "grad_norm": 0.1518174707889557, "learning_rate": 6.0272713223638564e-05, "loss": 0.4005, "step": 4446 }, { "epoch": 1.3346338535414166, "grad_norm": 0.15900781750679016, "learning_rate": 6.022465196541035e-05, "loss": 0.4209, "step": 4447 }, { "epoch": 1.3349339735894357, "grad_norm": 0.14064718782901764, "learning_rate": 6.0176601619092754e-05, "loss": 0.3659, "step": 4448 }, { "epoch": 1.335234093637455, "grad_norm": 0.14249639213085175, "learning_rate": 6.012856219786789e-05, "loss": 0.3847, "step": 4449 }, { "epoch": 1.3355342136854742, "grad_norm": 0.2270103543996811, "learning_rate": 6.0080533714914766e-05, "loss": 0.4128, "step": 4450 }, { "epoch": 1.3358343337334935, "grad_norm": 0.15802806615829468, "learning_rate": 6.00325161834095e-05, "loss": 0.4154, "step": 4451 }, { "epoch": 1.3361344537815127, "grad_norm": 0.13083720207214355, "learning_rate": 5.9984509616525154e-05, "loss": 0.3481, "step": 4452 }, { "epoch": 1.3364345738295318, "grad_norm": 0.1440601795911789, "learning_rate": 5.99365140274317e-05, "loss": 0.3868, "step": 4453 }, { "epoch": 1.336734693877551, "grad_norm": 0.1382712870836258, "learning_rate": 5.988852942929628e-05, "loss": 0.3479, "step": 4454 }, { "epoch": 1.33703481392557, "grad_norm": 0.1510000079870224, "learning_rate": 5.984055583528285e-05, "loss": 0.3897, "step": 4455 }, { "epoch": 1.3373349339735894, "grad_norm": 0.14380814135074615, "learning_rate": 5.979259325855242e-05, "loss": 0.4107, "step": 4456 }, { "epoch": 1.3376350540216086, "grad_norm": 0.13862638175487518, "learning_rate": 5.974464171226301e-05, "loss": 0.3668, "step": 4457 }, { "epoch": 1.337935174069628, "grad_norm": 0.1261635273694992, "learning_rate": 5.969670120956956e-05, "loss": 0.3117, "step": 4458 }, { "epoch": 1.3382352941176472, "grad_norm": 0.16036823391914368, "learning_rate": 5.9648771763623944e-05, "loss": 0.3357, "step": 4459 }, { "epoch": 1.3385354141656662, "grad_norm": 0.1365584433078766, "learning_rate": 5.9600853387575163e-05, "loss": 0.326, "step": 4460 }, { "epoch": 1.3388355342136855, "grad_norm": 0.1372981071472168, "learning_rate": 5.9552946094568975e-05, "loss": 0.3527, "step": 4461 }, { "epoch": 1.3391356542617046, "grad_norm": 0.13922113180160522, "learning_rate": 5.950504989774825e-05, "loss": 0.3296, "step": 4462 }, { "epoch": 1.3394357743097238, "grad_norm": 0.1392875760793686, "learning_rate": 5.945716481025275e-05, "loss": 0.3425, "step": 4463 }, { "epoch": 1.339735894357743, "grad_norm": 0.1342422068119049, "learning_rate": 5.940929084521918e-05, "loss": 0.3487, "step": 4464 }, { "epoch": 1.3400360144057624, "grad_norm": 0.13472548127174377, "learning_rate": 5.9361428015781275e-05, "loss": 0.3389, "step": 4465 }, { "epoch": 1.3403361344537816, "grad_norm": 0.1356363594532013, "learning_rate": 5.931357633506957e-05, "loss": 0.3432, "step": 4466 }, { "epoch": 1.3406362545018007, "grad_norm": 0.13877129554748535, "learning_rate": 5.926573581621167e-05, "loss": 0.3675, "step": 4467 }, { "epoch": 1.34093637454982, "grad_norm": 0.1482432633638382, "learning_rate": 5.921790647233205e-05, "loss": 0.3665, "step": 4468 }, { "epoch": 1.341236494597839, "grad_norm": 0.13379403948783875, "learning_rate": 5.9170088316552176e-05, "loss": 0.363, "step": 4469 }, { "epoch": 1.3415366146458583, "grad_norm": 0.14877451956272125, "learning_rate": 5.912228136199038e-05, "loss": 0.3882, "step": 4470 }, { "epoch": 1.3418367346938775, "grad_norm": 0.1336432844400406, "learning_rate": 5.907448562176201e-05, "loss": 0.3507, "step": 4471 }, { "epoch": 1.3421368547418968, "grad_norm": 0.3251681327819824, "learning_rate": 5.902670110897917e-05, "loss": 0.3442, "step": 4472 }, { "epoch": 1.342436974789916, "grad_norm": 0.13489368557929993, "learning_rate": 5.89789278367511e-05, "loss": 0.3294, "step": 4473 }, { "epoch": 1.3427370948379351, "grad_norm": 0.1663886308670044, "learning_rate": 5.8931165818183784e-05, "loss": 0.3603, "step": 4474 }, { "epoch": 1.3430372148859544, "grad_norm": 0.14255166053771973, "learning_rate": 5.888341506638021e-05, "loss": 0.371, "step": 4475 }, { "epoch": 1.3433373349339737, "grad_norm": 0.13316510617733002, "learning_rate": 5.8835675594440256e-05, "loss": 0.3443, "step": 4476 }, { "epoch": 1.3436374549819927, "grad_norm": 0.1348668932914734, "learning_rate": 5.87879474154607e-05, "loss": 0.3486, "step": 4477 }, { "epoch": 1.343937575030012, "grad_norm": 0.1510106921195984, "learning_rate": 5.874023054253516e-05, "loss": 0.41, "step": 4478 }, { "epoch": 1.3442376950780313, "grad_norm": 0.1550380438566208, "learning_rate": 5.869252498875432e-05, "loss": 0.4148, "step": 4479 }, { "epoch": 1.3445378151260505, "grad_norm": 0.1395397037267685, "learning_rate": 5.864483076720555e-05, "loss": 0.37, "step": 4480 }, { "epoch": 1.3448379351740696, "grad_norm": 0.15722878277301788, "learning_rate": 5.859714789097328e-05, "loss": 0.4301, "step": 4481 }, { "epoch": 1.3451380552220888, "grad_norm": 0.14296914637088776, "learning_rate": 5.854947637313872e-05, "loss": 0.367, "step": 4482 }, { "epoch": 1.3454381752701081, "grad_norm": 0.14038380980491638, "learning_rate": 5.8501816226780014e-05, "loss": 0.3523, "step": 4483 }, { "epoch": 1.3457382953181272, "grad_norm": 0.16099123656749725, "learning_rate": 5.845416746497221e-05, "loss": 0.3836, "step": 4484 }, { "epoch": 1.3460384153661464, "grad_norm": 0.16637754440307617, "learning_rate": 5.8406530100787196e-05, "loss": 0.4044, "step": 4485 }, { "epoch": 1.3463385354141657, "grad_norm": 0.1434050351381302, "learning_rate": 5.835890414729366e-05, "loss": 0.3565, "step": 4486 }, { "epoch": 1.346638655462185, "grad_norm": 0.13820092380046844, "learning_rate": 5.831128961755734e-05, "loss": 0.3659, "step": 4487 }, { "epoch": 1.346938775510204, "grad_norm": 0.13020813465118408, "learning_rate": 5.8263686524640604e-05, "loss": 0.3321, "step": 4488 }, { "epoch": 1.3472388955582233, "grad_norm": 0.13537994027137756, "learning_rate": 5.821609488160298e-05, "loss": 0.3601, "step": 4489 }, { "epoch": 1.3475390156062426, "grad_norm": 0.1363266408443451, "learning_rate": 5.81685147015006e-05, "loss": 0.351, "step": 4490 }, { "epoch": 1.3478391356542616, "grad_norm": 0.1465909630060196, "learning_rate": 5.81209459973865e-05, "loss": 0.3895, "step": 4491 }, { "epoch": 1.3481392557022809, "grad_norm": 0.1680755615234375, "learning_rate": 5.8073388782310664e-05, "loss": 0.3673, "step": 4492 }, { "epoch": 1.3484393757503002, "grad_norm": 0.1574047952890396, "learning_rate": 5.802584306931991e-05, "loss": 0.3667, "step": 4493 }, { "epoch": 1.3487394957983194, "grad_norm": 0.14274361729621887, "learning_rate": 5.7978308871457754e-05, "loss": 0.3709, "step": 4494 }, { "epoch": 1.3490396158463385, "grad_norm": 0.14271649718284607, "learning_rate": 5.793078620176475e-05, "loss": 0.3577, "step": 4495 }, { "epoch": 1.3493397358943577, "grad_norm": 0.14356018602848053, "learning_rate": 5.788327507327814e-05, "loss": 0.3825, "step": 4496 }, { "epoch": 1.349639855942377, "grad_norm": 0.16204307973384857, "learning_rate": 5.7835775499032074e-05, "loss": 0.4131, "step": 4497 }, { "epoch": 1.349939975990396, "grad_norm": 0.156327024102211, "learning_rate": 5.778828749205756e-05, "loss": 0.4101, "step": 4498 }, { "epoch": 1.3502400960384153, "grad_norm": 0.137571319937706, "learning_rate": 5.7740811065382295e-05, "loss": 0.3685, "step": 4499 }, { "epoch": 1.3505402160864346, "grad_norm": 0.12720349431037903, "learning_rate": 5.769334623203095e-05, "loss": 0.3303, "step": 4500 }, { "epoch": 1.3508403361344539, "grad_norm": 0.15204890072345734, "learning_rate": 5.764589300502501e-05, "loss": 0.3668, "step": 4501 }, { "epoch": 1.351140456182473, "grad_norm": 0.22256812453269958, "learning_rate": 5.7598451397382614e-05, "loss": 0.3871, "step": 4502 }, { "epoch": 1.3514405762304922, "grad_norm": 0.15941122174263, "learning_rate": 5.755102142211892e-05, "loss": 0.4042, "step": 4503 }, { "epoch": 1.3517406962785115, "grad_norm": 0.1835334300994873, "learning_rate": 5.7503603092245714e-05, "loss": 0.3371, "step": 4504 }, { "epoch": 1.3520408163265305, "grad_norm": 0.14911781251430511, "learning_rate": 5.745619642077171e-05, "loss": 0.3893, "step": 4505 }, { "epoch": 1.3523409363745498, "grad_norm": 0.20111630856990814, "learning_rate": 5.740880142070242e-05, "loss": 0.4041, "step": 4506 }, { "epoch": 1.352641056422569, "grad_norm": 0.14431169629096985, "learning_rate": 5.736141810504009e-05, "loss": 0.3991, "step": 4507 }, { "epoch": 1.3529411764705883, "grad_norm": 0.13495999574661255, "learning_rate": 5.731404648678374e-05, "loss": 0.3401, "step": 4508 }, { "epoch": 1.3532412965186074, "grad_norm": 0.175273135304451, "learning_rate": 5.7266686578929286e-05, "loss": 0.3792, "step": 4509 }, { "epoch": 1.3535414165666266, "grad_norm": 0.14945167303085327, "learning_rate": 5.7219338394469356e-05, "loss": 0.3717, "step": 4510 }, { "epoch": 1.353841536614646, "grad_norm": 0.157761350274086, "learning_rate": 5.7172001946393426e-05, "loss": 0.3739, "step": 4511 }, { "epoch": 1.354141656662665, "grad_norm": 0.13628393411636353, "learning_rate": 5.712467724768766e-05, "loss": 0.3557, "step": 4512 }, { "epoch": 1.3544417767106842, "grad_norm": 0.16030484437942505, "learning_rate": 5.7077364311335e-05, "loss": 0.4011, "step": 4513 }, { "epoch": 1.3547418967587035, "grad_norm": 0.17973542213439941, "learning_rate": 5.703006315031534e-05, "loss": 0.3889, "step": 4514 }, { "epoch": 1.3550420168067228, "grad_norm": 0.13913069665431976, "learning_rate": 5.6982773777605125e-05, "loss": 0.3828, "step": 4515 }, { "epoch": 1.355342136854742, "grad_norm": 0.1738550066947937, "learning_rate": 5.693549620617764e-05, "loss": 0.3889, "step": 4516 }, { "epoch": 1.355642256902761, "grad_norm": 0.14184638857841492, "learning_rate": 5.6888230449002954e-05, "loss": 0.3882, "step": 4517 }, { "epoch": 1.3559423769507803, "grad_norm": 0.14063705503940582, "learning_rate": 5.684097651904791e-05, "loss": 0.3561, "step": 4518 }, { "epoch": 1.3562424969987994, "grad_norm": 0.15086719393730164, "learning_rate": 5.67937344292761e-05, "loss": 0.3944, "step": 4519 }, { "epoch": 1.3565426170468187, "grad_norm": 0.13202974200248718, "learning_rate": 5.674650419264782e-05, "loss": 0.3302, "step": 4520 }, { "epoch": 1.356842737094838, "grad_norm": 0.14360421895980835, "learning_rate": 5.6699285822120116e-05, "loss": 0.3883, "step": 4521 }, { "epoch": 1.3571428571428572, "grad_norm": 0.15063901245594025, "learning_rate": 5.6652079330646834e-05, "loss": 0.3897, "step": 4522 }, { "epoch": 1.3574429771908765, "grad_norm": 0.14402541518211365, "learning_rate": 5.660488473117857e-05, "loss": 0.3493, "step": 4523 }, { "epoch": 1.3577430972388955, "grad_norm": 0.14424912631511688, "learning_rate": 5.6557702036662555e-05, "loss": 0.3862, "step": 4524 }, { "epoch": 1.3580432172869148, "grad_norm": 0.14334195852279663, "learning_rate": 5.651053126004284e-05, "loss": 0.3907, "step": 4525 }, { "epoch": 1.3583433373349338, "grad_norm": 0.13699807226657867, "learning_rate": 5.646337241426024e-05, "loss": 0.3263, "step": 4526 }, { "epoch": 1.3586434573829531, "grad_norm": 0.13770411908626556, "learning_rate": 5.6416225512252166e-05, "loss": 0.3814, "step": 4527 }, { "epoch": 1.3589435774309724, "grad_norm": 0.16407433152198792, "learning_rate": 5.63690905669529e-05, "loss": 0.3604, "step": 4528 }, { "epoch": 1.3592436974789917, "grad_norm": 0.13977479934692383, "learning_rate": 5.6321967591293314e-05, "loss": 0.3662, "step": 4529 }, { "epoch": 1.359543817527011, "grad_norm": 0.14484171569347382, "learning_rate": 5.6274856598201066e-05, "loss": 0.3747, "step": 4530 }, { "epoch": 1.35984393757503, "grad_norm": 0.14661836624145508, "learning_rate": 5.622775760060057e-05, "loss": 0.3541, "step": 4531 }, { "epoch": 1.3601440576230492, "grad_norm": 0.1281379908323288, "learning_rate": 5.618067061141283e-05, "loss": 0.2959, "step": 4532 }, { "epoch": 1.3604441776710683, "grad_norm": 0.13950206339359283, "learning_rate": 5.613359564355569e-05, "loss": 0.3515, "step": 4533 }, { "epoch": 1.3607442977190876, "grad_norm": 0.1432461142539978, "learning_rate": 5.608653270994353e-05, "loss": 0.3415, "step": 4534 }, { "epoch": 1.3610444177671068, "grad_norm": 0.14423154294490814, "learning_rate": 5.6039481823487606e-05, "loss": 0.3798, "step": 4535 }, { "epoch": 1.361344537815126, "grad_norm": 0.1409810185432434, "learning_rate": 5.599244299709578e-05, "loss": 0.36, "step": 4536 }, { "epoch": 1.3616446578631454, "grad_norm": 0.13379138708114624, "learning_rate": 5.594541624367262e-05, "loss": 0.3241, "step": 4537 }, { "epoch": 1.3619447779111644, "grad_norm": 0.13153435289859772, "learning_rate": 5.589840157611929e-05, "loss": 0.3737, "step": 4538 }, { "epoch": 1.3622448979591837, "grad_norm": 0.1384473741054535, "learning_rate": 5.585139900733385e-05, "loss": 0.3801, "step": 4539 }, { "epoch": 1.362545018007203, "grad_norm": 0.14458291232585907, "learning_rate": 5.580440855021083e-05, "loss": 0.3984, "step": 4540 }, { "epoch": 1.362845138055222, "grad_norm": 0.14624524116516113, "learning_rate": 5.575743021764159e-05, "loss": 0.3836, "step": 4541 }, { "epoch": 1.3631452581032413, "grad_norm": 0.14253798127174377, "learning_rate": 5.571046402251401e-05, "loss": 0.375, "step": 4542 }, { "epoch": 1.3634453781512605, "grad_norm": 0.13562585413455963, "learning_rate": 5.566350997771279e-05, "loss": 0.3354, "step": 4543 }, { "epoch": 1.3637454981992798, "grad_norm": 0.19224855303764343, "learning_rate": 5.561656809611925e-05, "loss": 0.3896, "step": 4544 }, { "epoch": 1.3640456182472989, "grad_norm": 0.15661108493804932, "learning_rate": 5.556963839061133e-05, "loss": 0.3606, "step": 4545 }, { "epoch": 1.3643457382953181, "grad_norm": 0.1251692771911621, "learning_rate": 5.55227208740636e-05, "loss": 0.3294, "step": 4546 }, { "epoch": 1.3646458583433374, "grad_norm": 0.22994621098041534, "learning_rate": 5.547581555934742e-05, "loss": 0.3472, "step": 4547 }, { "epoch": 1.3649459783913565, "grad_norm": 0.1264163851737976, "learning_rate": 5.542892245933069e-05, "loss": 0.3172, "step": 4548 }, { "epoch": 1.3652460984393757, "grad_norm": 0.12400945276021957, "learning_rate": 5.538204158687803e-05, "loss": 0.3014, "step": 4549 }, { "epoch": 1.365546218487395, "grad_norm": 0.1770714521408081, "learning_rate": 5.533517295485062e-05, "loss": 0.3442, "step": 4550 }, { "epoch": 1.3658463385354143, "grad_norm": 0.14922089874744415, "learning_rate": 5.5288316576106357e-05, "loss": 0.3843, "step": 4551 }, { "epoch": 1.3661464585834333, "grad_norm": 0.14821475744247437, "learning_rate": 5.524147246349979e-05, "loss": 0.3758, "step": 4552 }, { "epoch": 1.3664465786314526, "grad_norm": 0.137704998254776, "learning_rate": 5.519464062988202e-05, "loss": 0.3639, "step": 4553 }, { "epoch": 1.3667466986794718, "grad_norm": 0.15100188553333282, "learning_rate": 5.514782108810079e-05, "loss": 0.3729, "step": 4554 }, { "epoch": 1.367046818727491, "grad_norm": 0.14446976780891418, "learning_rate": 5.5101013851000547e-05, "loss": 0.3599, "step": 4555 }, { "epoch": 1.3673469387755102, "grad_norm": 0.1340705007314682, "learning_rate": 5.505421893142235e-05, "loss": 0.3555, "step": 4556 }, { "epoch": 1.3676470588235294, "grad_norm": 0.14943592250347137, "learning_rate": 5.500743634220379e-05, "loss": 0.3818, "step": 4557 }, { "epoch": 1.3679471788715487, "grad_norm": 0.13905028998851776, "learning_rate": 5.496066609617918e-05, "loss": 0.3572, "step": 4558 }, { "epoch": 1.3682472989195678, "grad_norm": 0.14025895297527313, "learning_rate": 5.4913908206179323e-05, "loss": 0.3827, "step": 4559 }, { "epoch": 1.368547418967587, "grad_norm": 0.14170074462890625, "learning_rate": 5.486716268503182e-05, "loss": 0.3596, "step": 4560 }, { "epoch": 1.3688475390156063, "grad_norm": 0.15285390615463257, "learning_rate": 5.482042954556073e-05, "loss": 0.3543, "step": 4561 }, { "epoch": 1.3691476590636253, "grad_norm": 0.172708198428154, "learning_rate": 5.4773708800586684e-05, "loss": 0.3804, "step": 4562 }, { "epoch": 1.3694477791116446, "grad_norm": 0.13485442101955414, "learning_rate": 5.4727000462927046e-05, "loss": 0.3591, "step": 4563 }, { "epoch": 1.3697478991596639, "grad_norm": 0.16145791113376617, "learning_rate": 5.468030454539574e-05, "loss": 0.3517, "step": 4564 }, { "epoch": 1.3700480192076832, "grad_norm": 0.1492297351360321, "learning_rate": 5.4633621060803185e-05, "loss": 0.3921, "step": 4565 }, { "epoch": 1.3703481392557022, "grad_norm": 0.14852483570575714, "learning_rate": 5.458695002195655e-05, "loss": 0.3309, "step": 4566 }, { "epoch": 1.3706482593037215, "grad_norm": 0.14032091200351715, "learning_rate": 5.4540291441659376e-05, "loss": 0.3533, "step": 4567 }, { "epoch": 1.3709483793517407, "grad_norm": 0.1701730489730835, "learning_rate": 5.449364533271199e-05, "loss": 0.3812, "step": 4568 }, { "epoch": 1.3712484993997598, "grad_norm": 0.1463678479194641, "learning_rate": 5.444701170791125e-05, "loss": 0.4203, "step": 4569 }, { "epoch": 1.371548619447779, "grad_norm": 0.1414550095796585, "learning_rate": 5.440039058005047e-05, "loss": 0.397, "step": 4570 }, { "epoch": 1.3718487394957983, "grad_norm": 0.1401282399892807, "learning_rate": 5.4353781961919694e-05, "loss": 0.3497, "step": 4571 }, { "epoch": 1.3721488595438176, "grad_norm": 0.14443713426589966, "learning_rate": 5.4307185866305386e-05, "loss": 0.3754, "step": 4572 }, { "epoch": 1.3724489795918369, "grad_norm": 0.13361041247844696, "learning_rate": 5.4260602305990705e-05, "loss": 0.3557, "step": 4573 }, { "epoch": 1.372749099639856, "grad_norm": 0.1520872563123703, "learning_rate": 5.4214031293755354e-05, "loss": 0.3785, "step": 4574 }, { "epoch": 1.3730492196878752, "grad_norm": 0.12615153193473816, "learning_rate": 5.416747284237544e-05, "loss": 0.2981, "step": 4575 }, { "epoch": 1.3733493397358942, "grad_norm": 0.17696449160575867, "learning_rate": 5.412092696462383e-05, "loss": 0.4158, "step": 4576 }, { "epoch": 1.3736494597839135, "grad_norm": 0.1244591623544693, "learning_rate": 5.407439367326988e-05, "loss": 0.2827, "step": 4577 }, { "epoch": 1.3739495798319328, "grad_norm": 0.13495191931724548, "learning_rate": 5.402787298107936e-05, "loss": 0.361, "step": 4578 }, { "epoch": 1.374249699879952, "grad_norm": 0.15213558077812195, "learning_rate": 5.398136490081479e-05, "loss": 0.3548, "step": 4579 }, { "epoch": 1.3745498199279713, "grad_norm": 0.14480598270893097, "learning_rate": 5.393486944523505e-05, "loss": 0.3635, "step": 4580 }, { "epoch": 1.3748499399759904, "grad_norm": 0.14267009496688843, "learning_rate": 5.388838662709566e-05, "loss": 0.3674, "step": 4581 }, { "epoch": 1.3751500600240096, "grad_norm": 0.1517316848039627, "learning_rate": 5.384191645914869e-05, "loss": 0.41, "step": 4582 }, { "epoch": 1.3754501800720287, "grad_norm": 0.14075906574726105, "learning_rate": 5.3795458954142664e-05, "loss": 0.3578, "step": 4583 }, { "epoch": 1.375750300120048, "grad_norm": 0.13696734607219696, "learning_rate": 5.3749014124822626e-05, "loss": 0.3366, "step": 4584 }, { "epoch": 1.3760504201680672, "grad_norm": 0.15023215115070343, "learning_rate": 5.3702581983930234e-05, "loss": 0.362, "step": 4585 }, { "epoch": 1.3763505402160865, "grad_norm": 0.1412237584590912, "learning_rate": 5.365616254420364e-05, "loss": 0.3591, "step": 4586 }, { "epoch": 1.3766506602641058, "grad_norm": 0.14088405668735504, "learning_rate": 5.3609755818377396e-05, "loss": 0.3458, "step": 4587 }, { "epoch": 1.3769507803121248, "grad_norm": 0.13995389640331268, "learning_rate": 5.356336181918271e-05, "loss": 0.366, "step": 4588 }, { "epoch": 1.377250900360144, "grad_norm": 0.13612928986549377, "learning_rate": 5.351698055934724e-05, "loss": 0.3704, "step": 4589 }, { "epoch": 1.3775510204081631, "grad_norm": 0.13480474054813385, "learning_rate": 5.347061205159519e-05, "loss": 0.372, "step": 4590 }, { "epoch": 1.3778511404561824, "grad_norm": 0.15353074669837952, "learning_rate": 5.3424256308647194e-05, "loss": 0.3824, "step": 4591 }, { "epoch": 1.3781512605042017, "grad_norm": 0.14670152962207794, "learning_rate": 5.337791334322038e-05, "loss": 0.3703, "step": 4592 }, { "epoch": 1.378451380552221, "grad_norm": 0.16588912904262543, "learning_rate": 5.333158316802842e-05, "loss": 0.4031, "step": 4593 }, { "epoch": 1.3787515006002402, "grad_norm": 0.14609313011169434, "learning_rate": 5.328526579578156e-05, "loss": 0.3278, "step": 4594 }, { "epoch": 1.3790516206482593, "grad_norm": 0.14630454778671265, "learning_rate": 5.323896123918631e-05, "loss": 0.3869, "step": 4595 }, { "epoch": 1.3793517406962785, "grad_norm": 0.23194457590579987, "learning_rate": 5.3192669510945905e-05, "loss": 0.4115, "step": 4596 }, { "epoch": 1.3796518607442978, "grad_norm": 0.12587039172649384, "learning_rate": 5.3146390623759856e-05, "loss": 0.3213, "step": 4597 }, { "epoch": 1.3799519807923168, "grad_norm": 0.14738459885120392, "learning_rate": 5.3100124590324294e-05, "loss": 0.3602, "step": 4598 }, { "epoch": 1.3802521008403361, "grad_norm": 0.14517807960510254, "learning_rate": 5.3053871423331805e-05, "loss": 0.4087, "step": 4599 }, { "epoch": 1.3805522208883554, "grad_norm": 0.19121530652046204, "learning_rate": 5.3007631135471334e-05, "loss": 0.3497, "step": 4600 }, { "epoch": 1.3808523409363747, "grad_norm": 0.13413242995738983, "learning_rate": 5.2961403739428415e-05, "loss": 0.3384, "step": 4601 }, { "epoch": 1.3811524609843937, "grad_norm": 0.13237817585468292, "learning_rate": 5.291518924788507e-05, "loss": 0.3211, "step": 4602 }, { "epoch": 1.381452581032413, "grad_norm": 0.142702117562294, "learning_rate": 5.28689876735196e-05, "loss": 0.3559, "step": 4603 }, { "epoch": 1.3817527010804322, "grad_norm": 0.1560351699590683, "learning_rate": 5.2822799029006964e-05, "loss": 0.3976, "step": 4604 }, { "epoch": 1.3820528211284513, "grad_norm": 0.14791634678840637, "learning_rate": 5.277662332701842e-05, "loss": 0.3725, "step": 4605 }, { "epoch": 1.3823529411764706, "grad_norm": 0.13751675188541412, "learning_rate": 5.2730460580221774e-05, "loss": 0.344, "step": 4606 }, { "epoch": 1.3826530612244898, "grad_norm": 0.142083540558815, "learning_rate": 5.268431080128129e-05, "loss": 0.3921, "step": 4607 }, { "epoch": 1.382953181272509, "grad_norm": 0.1370341032743454, "learning_rate": 5.2638174002857546e-05, "loss": 0.3543, "step": 4608 }, { "epoch": 1.3832533013205282, "grad_norm": 0.141767218708992, "learning_rate": 5.259205019760772e-05, "loss": 0.3679, "step": 4609 }, { "epoch": 1.3835534213685474, "grad_norm": 0.6375341415405273, "learning_rate": 5.2545939398185284e-05, "loss": 0.3338, "step": 4610 }, { "epoch": 1.3838535414165667, "grad_norm": 0.1471729576587677, "learning_rate": 5.249984161724023e-05, "loss": 0.4011, "step": 4611 }, { "epoch": 1.3841536614645857, "grad_norm": 0.14878086745738983, "learning_rate": 5.2453756867419e-05, "loss": 0.3518, "step": 4612 }, { "epoch": 1.384453781512605, "grad_norm": 0.1476067155599594, "learning_rate": 5.240768516136436e-05, "loss": 0.3717, "step": 4613 }, { "epoch": 1.3847539015606243, "grad_norm": 0.14189663529396057, "learning_rate": 5.236162651171557e-05, "loss": 0.3301, "step": 4614 }, { "epoch": 1.3850540216086435, "grad_norm": 0.14110246300697327, "learning_rate": 5.231558093110832e-05, "loss": 0.3703, "step": 4615 }, { "epoch": 1.3853541416566626, "grad_norm": 0.13561809062957764, "learning_rate": 5.226954843217468e-05, "loss": 0.3044, "step": 4616 }, { "epoch": 1.3856542617046819, "grad_norm": 0.1426294595003128, "learning_rate": 5.222352902754307e-05, "loss": 0.3423, "step": 4617 }, { "epoch": 1.3859543817527011, "grad_norm": 0.1319276988506317, "learning_rate": 5.2177522729838444e-05, "loss": 0.3246, "step": 4618 }, { "epoch": 1.3862545018007202, "grad_norm": 0.1312565952539444, "learning_rate": 5.21315295516821e-05, "loss": 0.3289, "step": 4619 }, { "epoch": 1.3865546218487395, "grad_norm": 0.14471866190433502, "learning_rate": 5.208554950569178e-05, "loss": 0.3788, "step": 4620 }, { "epoch": 1.3868547418967587, "grad_norm": 0.14428383111953735, "learning_rate": 5.203958260448152e-05, "loss": 0.3806, "step": 4621 }, { "epoch": 1.387154861944778, "grad_norm": 0.14408062398433685, "learning_rate": 5.199362886066177e-05, "loss": 0.3829, "step": 4622 }, { "epoch": 1.387454981992797, "grad_norm": 0.1601879894733429, "learning_rate": 5.194768828683953e-05, "loss": 0.3568, "step": 4623 }, { "epoch": 1.3877551020408163, "grad_norm": 0.12715758383274078, "learning_rate": 5.190176089561802e-05, "loss": 0.3262, "step": 4624 }, { "epoch": 1.3880552220888356, "grad_norm": 0.1314728856086731, "learning_rate": 5.1855846699596866e-05, "loss": 0.3452, "step": 4625 }, { "epoch": 1.3883553421368546, "grad_norm": 0.14861977100372314, "learning_rate": 5.18099457113721e-05, "loss": 0.3532, "step": 4626 }, { "epoch": 1.388655462184874, "grad_norm": 0.19684113562107086, "learning_rate": 5.17640579435362e-05, "loss": 0.3177, "step": 4627 }, { "epoch": 1.3889555822328932, "grad_norm": 0.1377023309469223, "learning_rate": 5.171818340867787e-05, "loss": 0.3373, "step": 4628 }, { "epoch": 1.3892557022809124, "grad_norm": 0.13617639243602753, "learning_rate": 5.1672322119382325e-05, "loss": 0.3193, "step": 4629 }, { "epoch": 1.3895558223289317, "grad_norm": 0.14320117235183716, "learning_rate": 5.1626474088231004e-05, "loss": 0.3603, "step": 4630 }, { "epoch": 1.3898559423769508, "grad_norm": 0.15924130380153656, "learning_rate": 5.158063932780185e-05, "loss": 0.3828, "step": 4631 }, { "epoch": 1.39015606242497, "grad_norm": 0.14906635880470276, "learning_rate": 5.153481785066914e-05, "loss": 0.3798, "step": 4632 }, { "epoch": 1.390456182472989, "grad_norm": 0.15486960113048553, "learning_rate": 5.1489009669403354e-05, "loss": 0.4329, "step": 4633 }, { "epoch": 1.3907563025210083, "grad_norm": 0.1481010913848877, "learning_rate": 5.144321479657157e-05, "loss": 0.3531, "step": 4634 }, { "epoch": 1.3910564225690276, "grad_norm": 0.1424623429775238, "learning_rate": 5.1397433244736984e-05, "loss": 0.3399, "step": 4635 }, { "epoch": 1.3913565426170469, "grad_norm": 0.14340506494045258, "learning_rate": 5.1351665026459286e-05, "loss": 0.3841, "step": 4636 }, { "epoch": 1.3916566626650662, "grad_norm": 0.13578680157661438, "learning_rate": 5.130591015429449e-05, "loss": 0.3528, "step": 4637 }, { "epoch": 1.3919567827130852, "grad_norm": 0.13246610760688782, "learning_rate": 5.1260168640794845e-05, "loss": 0.3341, "step": 4638 }, { "epoch": 1.3922569027611045, "grad_norm": 0.14007440209388733, "learning_rate": 5.121444049850906e-05, "loss": 0.3392, "step": 4639 }, { "epoch": 1.3925570228091235, "grad_norm": 0.136866956949234, "learning_rate": 5.116872573998217e-05, "loss": 0.3413, "step": 4640 }, { "epoch": 1.3928571428571428, "grad_norm": 0.17135818302631378, "learning_rate": 5.1123024377755394e-05, "loss": 0.3666, "step": 4641 }, { "epoch": 1.393157262905162, "grad_norm": 0.13718274235725403, "learning_rate": 5.107733642436646e-05, "loss": 0.3593, "step": 4642 }, { "epoch": 1.3934573829531813, "grad_norm": 0.13347193598747253, "learning_rate": 5.103166189234927e-05, "loss": 0.3328, "step": 4643 }, { "epoch": 1.3937575030012006, "grad_norm": 0.21490272879600525, "learning_rate": 5.098600079423415e-05, "loss": 0.3258, "step": 4644 }, { "epoch": 1.3940576230492197, "grad_norm": 0.1582237035036087, "learning_rate": 5.0940353142547726e-05, "loss": 0.376, "step": 4645 }, { "epoch": 1.394357743097239, "grad_norm": 0.13393276929855347, "learning_rate": 5.0894718949812855e-05, "loss": 0.3301, "step": 4646 }, { "epoch": 1.394657863145258, "grad_norm": 0.15065492689609528, "learning_rate": 5.084909822854871e-05, "loss": 0.3984, "step": 4647 }, { "epoch": 1.3949579831932772, "grad_norm": 0.14716662466526031, "learning_rate": 5.080349099127093e-05, "loss": 0.3829, "step": 4648 }, { "epoch": 1.3952581032412965, "grad_norm": 0.1335858702659607, "learning_rate": 5.075789725049126e-05, "loss": 0.3281, "step": 4649 }, { "epoch": 1.3955582232893158, "grad_norm": 0.14929762482643127, "learning_rate": 5.071231701871787e-05, "loss": 0.2957, "step": 4650 }, { "epoch": 1.395858343337335, "grad_norm": 0.12802687287330627, "learning_rate": 5.0666750308455116e-05, "loss": 0.3058, "step": 4651 }, { "epoch": 1.396158463385354, "grad_norm": 0.1500602811574936, "learning_rate": 5.0621197132203724e-05, "loss": 0.3377, "step": 4652 }, { "epoch": 1.3964585834333734, "grad_norm": 0.13277117908000946, "learning_rate": 5.057565750246073e-05, "loss": 0.33, "step": 4653 }, { "epoch": 1.3967587034813926, "grad_norm": 0.14500407874584198, "learning_rate": 5.053013143171936e-05, "loss": 0.3534, "step": 4654 }, { "epoch": 1.3970588235294117, "grad_norm": 0.14183609187602997, "learning_rate": 5.0484618932469166e-05, "loss": 0.3698, "step": 4655 }, { "epoch": 1.397358943577431, "grad_norm": 0.19110167026519775, "learning_rate": 5.0439120017195986e-05, "loss": 0.4062, "step": 4656 }, { "epoch": 1.3976590636254502, "grad_norm": 0.2165842056274414, "learning_rate": 5.039363469838196e-05, "loss": 0.4034, "step": 4657 }, { "epoch": 1.3979591836734695, "grad_norm": 0.1206594929099083, "learning_rate": 5.034816298850542e-05, "loss": 0.3111, "step": 4658 }, { "epoch": 1.3982593037214885, "grad_norm": 0.14055152237415314, "learning_rate": 5.0302704900041055e-05, "loss": 0.3593, "step": 4659 }, { "epoch": 1.3985594237695078, "grad_norm": 0.13259822130203247, "learning_rate": 5.025726044545968e-05, "loss": 0.3557, "step": 4660 }, { "epoch": 1.398859543817527, "grad_norm": 0.13649876415729523, "learning_rate": 5.021182963722859e-05, "loss": 0.3581, "step": 4661 }, { "epoch": 1.3991596638655461, "grad_norm": 0.14338348805904388, "learning_rate": 5.0166412487811134e-05, "loss": 0.3426, "step": 4662 }, { "epoch": 1.3994597839135654, "grad_norm": 0.12635748088359833, "learning_rate": 5.012100900966695e-05, "loss": 0.3296, "step": 4663 }, { "epoch": 1.3997599039615847, "grad_norm": 0.14264103770256042, "learning_rate": 5.0075619215252015e-05, "loss": 0.3673, "step": 4664 }, { "epoch": 1.400060024009604, "grad_norm": 0.18206572532653809, "learning_rate": 5.0030243117018515e-05, "loss": 0.3736, "step": 4665 }, { "epoch": 1.400360144057623, "grad_norm": 0.13971807062625885, "learning_rate": 4.9984880727414794e-05, "loss": 0.3583, "step": 4666 }, { "epoch": 1.4006602641056423, "grad_norm": 0.16418235003948212, "learning_rate": 4.993953205888559e-05, "loss": 0.3846, "step": 4667 }, { "epoch": 1.4009603841536615, "grad_norm": 0.14172151684761047, "learning_rate": 4.989419712387169e-05, "loss": 0.3775, "step": 4668 }, { "epoch": 1.4012605042016806, "grad_norm": 0.14084260165691376, "learning_rate": 4.984887593481028e-05, "loss": 0.3742, "step": 4669 }, { "epoch": 1.4015606242496998, "grad_norm": 0.16080965101718903, "learning_rate": 4.980356850413472e-05, "loss": 0.391, "step": 4670 }, { "epoch": 1.4018607442977191, "grad_norm": 0.12837891280651093, "learning_rate": 4.975827484427453e-05, "loss": 0.3346, "step": 4671 }, { "epoch": 1.4021608643457384, "grad_norm": 0.1495605856180191, "learning_rate": 4.971299496765555e-05, "loss": 0.3961, "step": 4672 }, { "epoch": 1.4024609843937574, "grad_norm": 0.13998694717884064, "learning_rate": 4.9667728886699794e-05, "loss": 0.3839, "step": 4673 }, { "epoch": 1.4027611044417767, "grad_norm": 0.1444583386182785, "learning_rate": 4.962247661382545e-05, "loss": 0.3967, "step": 4674 }, { "epoch": 1.403061224489796, "grad_norm": 0.14261461794376373, "learning_rate": 4.957723816144703e-05, "loss": 0.3635, "step": 4675 }, { "epoch": 1.403361344537815, "grad_norm": 0.1437271535396576, "learning_rate": 4.95320135419751e-05, "loss": 0.3691, "step": 4676 }, { "epoch": 1.4036614645858343, "grad_norm": 0.1375724822282791, "learning_rate": 4.948680276781656e-05, "loss": 0.361, "step": 4677 }, { "epoch": 1.4039615846338536, "grad_norm": 0.1406862735748291, "learning_rate": 4.9441605851374504e-05, "loss": 0.3841, "step": 4678 }, { "epoch": 1.4042617046818728, "grad_norm": 0.13326990604400635, "learning_rate": 4.9396422805048127e-05, "loss": 0.3409, "step": 4679 }, { "epoch": 1.4045618247298919, "grad_norm": 0.1702723652124405, "learning_rate": 4.935125364123292e-05, "loss": 0.3567, "step": 4680 }, { "epoch": 1.4048619447779112, "grad_norm": 0.14896176755428314, "learning_rate": 4.930609837232049e-05, "loss": 0.383, "step": 4681 }, { "epoch": 1.4051620648259304, "grad_norm": 0.13286007940769196, "learning_rate": 4.9260957010698674e-05, "loss": 0.347, "step": 4682 }, { "epoch": 1.4054621848739495, "grad_norm": 0.14665399491786957, "learning_rate": 4.921582956875154e-05, "loss": 0.3629, "step": 4683 }, { "epoch": 1.4057623049219687, "grad_norm": 0.14434906840324402, "learning_rate": 4.917071605885923e-05, "loss": 0.3899, "step": 4684 }, { "epoch": 1.406062424969988, "grad_norm": 0.1460450440645218, "learning_rate": 4.912561649339806e-05, "loss": 0.3806, "step": 4685 }, { "epoch": 1.4063625450180073, "grad_norm": 0.13991695642471313, "learning_rate": 4.908053088474074e-05, "loss": 0.3856, "step": 4686 }, { "epoch": 1.4066626650660263, "grad_norm": 0.13680557906627655, "learning_rate": 4.9035459245255886e-05, "loss": 0.3398, "step": 4687 }, { "epoch": 1.4069627851140456, "grad_norm": 0.13648124039173126, "learning_rate": 4.899040158730837e-05, "loss": 0.3639, "step": 4688 }, { "epoch": 1.4072629051620649, "grad_norm": 0.14287413656711578, "learning_rate": 4.894535792325926e-05, "loss": 0.3517, "step": 4689 }, { "epoch": 1.407563025210084, "grad_norm": 0.1395912617444992, "learning_rate": 4.89003282654658e-05, "loss": 0.373, "step": 4690 }, { "epoch": 1.4078631452581032, "grad_norm": 0.14574593305587769, "learning_rate": 4.885531262628137e-05, "loss": 0.3862, "step": 4691 }, { "epoch": 1.4081632653061225, "grad_norm": 0.13947996497154236, "learning_rate": 4.8810311018055455e-05, "loss": 0.343, "step": 4692 }, { "epoch": 1.4084633853541417, "grad_norm": 0.1489896923303604, "learning_rate": 4.8765323453133714e-05, "loss": 0.3926, "step": 4693 }, { "epoch": 1.408763505402161, "grad_norm": 0.13951550424098969, "learning_rate": 4.8720349943858004e-05, "loss": 0.3528, "step": 4694 }, { "epoch": 1.40906362545018, "grad_norm": 0.15845239162445068, "learning_rate": 4.867539050256631e-05, "loss": 0.433, "step": 4695 }, { "epoch": 1.4093637454981993, "grad_norm": 0.13138680160045624, "learning_rate": 4.8630445141592674e-05, "loss": 0.3104, "step": 4696 }, { "epoch": 1.4096638655462184, "grad_norm": 0.15351898968219757, "learning_rate": 4.858551387326743e-05, "loss": 0.3692, "step": 4697 }, { "epoch": 1.4099639855942376, "grad_norm": 0.27054664492607117, "learning_rate": 4.854059670991682e-05, "loss": 0.4393, "step": 4698 }, { "epoch": 1.410264105642257, "grad_norm": 0.12757544219493866, "learning_rate": 4.849569366386352e-05, "loss": 0.3188, "step": 4699 }, { "epoch": 1.4105642256902762, "grad_norm": 0.13178516924381256, "learning_rate": 4.845080474742608e-05, "loss": 0.3307, "step": 4700 }, { "epoch": 1.4108643457382954, "grad_norm": 0.16457417607307434, "learning_rate": 4.840592997291923e-05, "loss": 0.3728, "step": 4701 }, { "epoch": 1.4111644657863145, "grad_norm": 0.14082013070583344, "learning_rate": 4.836106935265389e-05, "loss": 0.3321, "step": 4702 }, { "epoch": 1.4114645858343338, "grad_norm": 0.15481553971767426, "learning_rate": 4.831622289893708e-05, "loss": 0.399, "step": 4703 }, { "epoch": 1.4117647058823528, "grad_norm": 0.16300667822360992, "learning_rate": 4.8271390624071845e-05, "loss": 0.389, "step": 4704 }, { "epoch": 1.412064825930372, "grad_norm": 0.1538439244031906, "learning_rate": 4.822657254035747e-05, "loss": 0.3816, "step": 4705 }, { "epoch": 1.4123649459783914, "grad_norm": 0.3736082911491394, "learning_rate": 4.818176866008923e-05, "loss": 0.4044, "step": 4706 }, { "epoch": 1.4126650660264106, "grad_norm": 0.13170059025287628, "learning_rate": 4.813697899555858e-05, "loss": 0.3458, "step": 4707 }, { "epoch": 1.41296518607443, "grad_norm": 0.1586686223745346, "learning_rate": 4.8092203559053084e-05, "loss": 0.397, "step": 4708 }, { "epoch": 1.413265306122449, "grad_norm": 0.13637465238571167, "learning_rate": 4.8047442362856296e-05, "loss": 0.2804, "step": 4709 }, { "epoch": 1.4135654261704682, "grad_norm": 0.13989731669425964, "learning_rate": 4.800269541924799e-05, "loss": 0.3598, "step": 4710 }, { "epoch": 1.4138655462184873, "grad_norm": 0.13504591584205627, "learning_rate": 4.795796274050399e-05, "loss": 0.3374, "step": 4711 }, { "epoch": 1.4141656662665065, "grad_norm": 0.14345870912075043, "learning_rate": 4.7913244338896135e-05, "loss": 0.354, "step": 4712 }, { "epoch": 1.4144657863145258, "grad_norm": 0.13402247428894043, "learning_rate": 4.786854022669247e-05, "loss": 0.3387, "step": 4713 }, { "epoch": 1.414765906362545, "grad_norm": 0.13939963281154633, "learning_rate": 4.782385041615699e-05, "loss": 0.3561, "step": 4714 }, { "epoch": 1.4150660264105643, "grad_norm": 0.14345000684261322, "learning_rate": 4.7779174919549864e-05, "loss": 0.3241, "step": 4715 }, { "epoch": 1.4153661464585834, "grad_norm": 0.15040355920791626, "learning_rate": 4.7734513749127354e-05, "loss": 0.402, "step": 4716 }, { "epoch": 1.4156662665066027, "grad_norm": 0.1621185541152954, "learning_rate": 4.768986691714168e-05, "loss": 0.4094, "step": 4717 }, { "epoch": 1.415966386554622, "grad_norm": 0.13609668612480164, "learning_rate": 4.764523443584116e-05, "loss": 0.3525, "step": 4718 }, { "epoch": 1.416266506602641, "grad_norm": 0.13255177438259125, "learning_rate": 4.7600616317470236e-05, "loss": 0.3326, "step": 4719 }, { "epoch": 1.4165666266506602, "grad_norm": 0.1421128660440445, "learning_rate": 4.7556012574269395e-05, "loss": 0.3404, "step": 4720 }, { "epoch": 1.4168667466986795, "grad_norm": 0.14517702162265778, "learning_rate": 4.7511423218475184e-05, "loss": 0.3906, "step": 4721 }, { "epoch": 1.4171668667466988, "grad_norm": 0.16044922173023224, "learning_rate": 4.746684826232015e-05, "loss": 0.4182, "step": 4722 }, { "epoch": 1.4174669867947178, "grad_norm": 0.1331530660390854, "learning_rate": 4.7422287718032844e-05, "loss": 0.3412, "step": 4723 }, { "epoch": 1.417767106842737, "grad_norm": 0.14331965148448944, "learning_rate": 4.737774159783809e-05, "loss": 0.3207, "step": 4724 }, { "epoch": 1.4180672268907564, "grad_norm": 0.1528742015361786, "learning_rate": 4.733320991395652e-05, "loss": 0.3849, "step": 4725 }, { "epoch": 1.4183673469387754, "grad_norm": 0.1370069980621338, "learning_rate": 4.7288692678604876e-05, "loss": 0.3382, "step": 4726 }, { "epoch": 1.4186674669867947, "grad_norm": 0.13793693482875824, "learning_rate": 4.724418990399598e-05, "loss": 0.3557, "step": 4727 }, { "epoch": 1.418967587034814, "grad_norm": 0.13225845992565155, "learning_rate": 4.719970160233865e-05, "loss": 0.3476, "step": 4728 }, { "epoch": 1.4192677070828332, "grad_norm": 0.13750863075256348, "learning_rate": 4.7155227785837784e-05, "loss": 0.4046, "step": 4729 }, { "epoch": 1.4195678271308523, "grad_norm": 0.14539740979671478, "learning_rate": 4.7110768466694224e-05, "loss": 0.3704, "step": 4730 }, { "epoch": 1.4198679471788715, "grad_norm": 0.15951357781887054, "learning_rate": 4.706632365710484e-05, "loss": 0.4196, "step": 4731 }, { "epoch": 1.4201680672268908, "grad_norm": 0.18090002238750458, "learning_rate": 4.7021893369262596e-05, "loss": 0.3847, "step": 4732 }, { "epoch": 1.4204681872749099, "grad_norm": 0.13013353943824768, "learning_rate": 4.697747761535646e-05, "loss": 0.3451, "step": 4733 }, { "epoch": 1.4207683073229291, "grad_norm": 0.18931058049201965, "learning_rate": 4.6933076407571316e-05, "loss": 0.3749, "step": 4734 }, { "epoch": 1.4210684273709484, "grad_norm": 0.12983962893486023, "learning_rate": 4.6888689758088166e-05, "loss": 0.3344, "step": 4735 }, { "epoch": 1.4213685474189677, "grad_norm": 0.15677180886268616, "learning_rate": 4.6844317679084015e-05, "loss": 0.3454, "step": 4736 }, { "epoch": 1.4216686674669867, "grad_norm": 0.15239223837852478, "learning_rate": 4.679996018273175e-05, "loss": 0.3705, "step": 4737 }, { "epoch": 1.421968787515006, "grad_norm": 0.14624455571174622, "learning_rate": 4.675561728120043e-05, "loss": 0.3943, "step": 4738 }, { "epoch": 1.4222689075630253, "grad_norm": 0.15051227807998657, "learning_rate": 4.671128898665493e-05, "loss": 0.3681, "step": 4739 }, { "epoch": 1.4225690276110443, "grad_norm": 0.13271403312683105, "learning_rate": 4.666697531125627e-05, "loss": 0.3202, "step": 4740 }, { "epoch": 1.4228691476590636, "grad_norm": 0.13313762843608856, "learning_rate": 4.662267626716141e-05, "loss": 0.3587, "step": 4741 }, { "epoch": 1.4231692677070829, "grad_norm": 0.13600504398345947, "learning_rate": 4.657839186652324e-05, "loss": 0.3624, "step": 4742 }, { "epoch": 1.4234693877551021, "grad_norm": 0.1591995656490326, "learning_rate": 4.653412212149072e-05, "loss": 0.3628, "step": 4743 }, { "epoch": 1.4237695078031212, "grad_norm": 0.14780530333518982, "learning_rate": 4.64898670442087e-05, "loss": 0.3125, "step": 4744 }, { "epoch": 1.4240696278511404, "grad_norm": 0.15058188140392303, "learning_rate": 4.644562664681806e-05, "loss": 0.396, "step": 4745 }, { "epoch": 1.4243697478991597, "grad_norm": 0.1523490697145462, "learning_rate": 4.640140094145572e-05, "loss": 0.3463, "step": 4746 }, { "epoch": 1.4246698679471788, "grad_norm": 0.14041709899902344, "learning_rate": 4.635718994025443e-05, "loss": 0.3843, "step": 4747 }, { "epoch": 1.424969987995198, "grad_norm": 0.15815354883670807, "learning_rate": 4.631299365534291e-05, "loss": 0.3847, "step": 4748 }, { "epoch": 1.4252701080432173, "grad_norm": 0.1315331608057022, "learning_rate": 4.6268812098846034e-05, "loss": 0.3182, "step": 4749 }, { "epoch": 1.4255702280912366, "grad_norm": 0.13551288843154907, "learning_rate": 4.622464528288443e-05, "loss": 0.3487, "step": 4750 }, { "epoch": 1.4258703481392558, "grad_norm": 0.14589761197566986, "learning_rate": 4.6180493219574796e-05, "loss": 0.3573, "step": 4751 }, { "epoch": 1.4261704681872749, "grad_norm": 0.13148462772369385, "learning_rate": 4.613635592102968e-05, "loss": 0.3306, "step": 4752 }, { "epoch": 1.4264705882352942, "grad_norm": 0.12648631632328033, "learning_rate": 4.60922333993577e-05, "loss": 0.3242, "step": 4753 }, { "epoch": 1.4267707082833132, "grad_norm": 0.15199293196201324, "learning_rate": 4.604812566666338e-05, "loss": 0.3988, "step": 4754 }, { "epoch": 1.4270708283313325, "grad_norm": 0.14493729174137115, "learning_rate": 4.600403273504713e-05, "loss": 0.3856, "step": 4755 }, { "epoch": 1.4273709483793517, "grad_norm": 0.14682787656784058, "learning_rate": 4.5959954616605326e-05, "loss": 0.3585, "step": 4756 }, { "epoch": 1.427671068427371, "grad_norm": 0.13172248005867004, "learning_rate": 4.5915891323430316e-05, "loss": 0.321, "step": 4757 }, { "epoch": 1.4279711884753903, "grad_norm": 0.14919964969158173, "learning_rate": 4.587184286761035e-05, "loss": 0.3612, "step": 4758 }, { "epoch": 1.4282713085234093, "grad_norm": 0.13788238167762756, "learning_rate": 4.582780926122967e-05, "loss": 0.3721, "step": 4759 }, { "epoch": 1.4285714285714286, "grad_norm": 0.144940584897995, "learning_rate": 4.578379051636832e-05, "loss": 0.3752, "step": 4760 }, { "epoch": 1.4288715486194477, "grad_norm": 0.13624529540538788, "learning_rate": 4.5739786645102367e-05, "loss": 0.3111, "step": 4761 }, { "epoch": 1.429171668667467, "grad_norm": 0.14726722240447998, "learning_rate": 4.569579765950379e-05, "loss": 0.3926, "step": 4762 }, { "epoch": 1.4294717887154862, "grad_norm": 0.13372276723384857, "learning_rate": 4.5651823571640464e-05, "loss": 0.3421, "step": 4763 }, { "epoch": 1.4297719087635055, "grad_norm": 0.1457625925540924, "learning_rate": 4.560786439357609e-05, "loss": 0.3598, "step": 4764 }, { "epoch": 1.4300720288115247, "grad_norm": 0.14241823554039001, "learning_rate": 4.5563920137370456e-05, "loss": 0.3546, "step": 4765 }, { "epoch": 1.4303721488595438, "grad_norm": 0.1325092315673828, "learning_rate": 4.551999081507915e-05, "loss": 0.3146, "step": 4766 }, { "epoch": 1.430672268907563, "grad_norm": 0.1474294513463974, "learning_rate": 4.547607643875363e-05, "loss": 0.3759, "step": 4767 }, { "epoch": 1.430972388955582, "grad_norm": 0.1436256319284439, "learning_rate": 4.543217702044139e-05, "loss": 0.355, "step": 4768 }, { "epoch": 1.4312725090036014, "grad_norm": 0.14325234293937683, "learning_rate": 4.538829257218559e-05, "loss": 0.3472, "step": 4769 }, { "epoch": 1.4315726290516206, "grad_norm": 0.1733434945344925, "learning_rate": 4.534442310602559e-05, "loss": 0.4102, "step": 4770 }, { "epoch": 1.43187274909964, "grad_norm": 0.1364019364118576, "learning_rate": 4.53005686339964e-05, "loss": 0.3467, "step": 4771 }, { "epoch": 1.4321728691476592, "grad_norm": 0.1269693672657013, "learning_rate": 4.525672916812894e-05, "loss": 0.3075, "step": 4772 }, { "epoch": 1.4324729891956782, "grad_norm": 0.16181963682174683, "learning_rate": 4.5212904720450134e-05, "loss": 0.3781, "step": 4773 }, { "epoch": 1.4327731092436975, "grad_norm": 0.15238305926322937, "learning_rate": 4.5169095302982724e-05, "loss": 0.3979, "step": 4774 }, { "epoch": 1.4330732292917168, "grad_norm": 0.21794739365577698, "learning_rate": 4.512530092774525e-05, "loss": 0.333, "step": 4775 }, { "epoch": 1.4333733493397358, "grad_norm": 0.14577952027320862, "learning_rate": 4.508152160675229e-05, "loss": 0.375, "step": 4776 }, { "epoch": 1.433673469387755, "grad_norm": 0.14348258078098297, "learning_rate": 4.5037757352014106e-05, "loss": 0.3577, "step": 4777 }, { "epoch": 1.4339735894357744, "grad_norm": 0.14008821547031403, "learning_rate": 4.499400817553696e-05, "loss": 0.3519, "step": 4778 }, { "epoch": 1.4342737094837936, "grad_norm": 0.1430148482322693, "learning_rate": 4.495027408932298e-05, "loss": 0.3549, "step": 4779 }, { "epoch": 1.4345738295318127, "grad_norm": 0.13885506987571716, "learning_rate": 4.490655510537004e-05, "loss": 0.3413, "step": 4780 }, { "epoch": 1.434873949579832, "grad_norm": 0.14540061354637146, "learning_rate": 4.486285123567201e-05, "loss": 0.3672, "step": 4781 }, { "epoch": 1.4351740696278512, "grad_norm": 0.1269986927509308, "learning_rate": 4.481916249221847e-05, "loss": 0.3177, "step": 4782 }, { "epoch": 1.4354741896758703, "grad_norm": 0.12497656792402267, "learning_rate": 4.4775488886994965e-05, "loss": 0.3093, "step": 4783 }, { "epoch": 1.4357743097238895, "grad_norm": 0.1531170904636383, "learning_rate": 4.473183043198288e-05, "loss": 0.364, "step": 4784 }, { "epoch": 1.4360744297719088, "grad_norm": 0.1496521383523941, "learning_rate": 4.468818713915934e-05, "loss": 0.391, "step": 4785 }, { "epoch": 1.436374549819928, "grad_norm": 0.13336549699306488, "learning_rate": 4.4644559020497436e-05, "loss": 0.3365, "step": 4786 }, { "epoch": 1.4366746698679471, "grad_norm": 0.15357303619384766, "learning_rate": 4.4600946087966046e-05, "loss": 0.3706, "step": 4787 }, { "epoch": 1.4369747899159664, "grad_norm": 0.14057567715644836, "learning_rate": 4.4557348353529827e-05, "loss": 0.3582, "step": 4788 }, { "epoch": 1.4372749099639857, "grad_norm": 0.14349892735481262, "learning_rate": 4.451376582914939e-05, "loss": 0.3767, "step": 4789 }, { "epoch": 1.4375750300120047, "grad_norm": 0.13298708200454712, "learning_rate": 4.447019852678101e-05, "loss": 0.3307, "step": 4790 }, { "epoch": 1.437875150060024, "grad_norm": 0.15382781624794006, "learning_rate": 4.442664645837694e-05, "loss": 0.3984, "step": 4791 }, { "epoch": 1.4381752701080432, "grad_norm": 0.14238616824150085, "learning_rate": 4.438310963588522e-05, "loss": 0.3324, "step": 4792 }, { "epoch": 1.4384753901560625, "grad_norm": 0.14080406725406647, "learning_rate": 4.4339588071249625e-05, "loss": 0.3518, "step": 4793 }, { "epoch": 1.4387755102040816, "grad_norm": 0.13965073227882385, "learning_rate": 4.429608177640977e-05, "loss": 0.3336, "step": 4794 }, { "epoch": 1.4390756302521008, "grad_norm": 0.15151721239089966, "learning_rate": 4.425259076330115e-05, "loss": 0.369, "step": 4795 }, { "epoch": 1.43937575030012, "grad_norm": 1.4650521278381348, "learning_rate": 4.420911504385507e-05, "loss": 0.3401, "step": 4796 }, { "epoch": 1.4396758703481392, "grad_norm": 0.14364506304264069, "learning_rate": 4.4165654629998485e-05, "loss": 0.3819, "step": 4797 }, { "epoch": 1.4399759903961584, "grad_norm": 0.13930128514766693, "learning_rate": 4.412220953365433e-05, "loss": 0.354, "step": 4798 }, { "epoch": 1.4402761104441777, "grad_norm": 0.13983362913131714, "learning_rate": 4.4078779766741276e-05, "loss": 0.3461, "step": 4799 }, { "epoch": 1.440576230492197, "grad_norm": 0.17801937460899353, "learning_rate": 4.40353653411738e-05, "loss": 0.3453, "step": 4800 }, { "epoch": 1.440876350540216, "grad_norm": 0.1360776573419571, "learning_rate": 4.399196626886212e-05, "loss": 0.3572, "step": 4801 }, { "epoch": 1.4411764705882353, "grad_norm": 0.13738538324832916, "learning_rate": 4.394858256171223e-05, "loss": 0.3464, "step": 4802 }, { "epoch": 1.4414765906362546, "grad_norm": 0.12927304208278656, "learning_rate": 4.390521423162599e-05, "loss": 0.3278, "step": 4803 }, { "epoch": 1.4417767106842736, "grad_norm": 0.13406582176685333, "learning_rate": 4.386186129050105e-05, "loss": 0.3337, "step": 4804 }, { "epoch": 1.4420768307322929, "grad_norm": 0.1290862262248993, "learning_rate": 4.381852375023072e-05, "loss": 0.3472, "step": 4805 }, { "epoch": 1.4423769507803121, "grad_norm": 0.12430037558078766, "learning_rate": 4.377520162270423e-05, "loss": 0.3343, "step": 4806 }, { "epoch": 1.4426770708283314, "grad_norm": 0.15715405344963074, "learning_rate": 4.373189491980639e-05, "loss": 0.4192, "step": 4807 }, { "epoch": 1.4429771908763505, "grad_norm": 0.13872407376766205, "learning_rate": 4.368860365341805e-05, "loss": 0.3531, "step": 4808 }, { "epoch": 1.4432773109243697, "grad_norm": 0.14024512469768524, "learning_rate": 4.364532783541559e-05, "loss": 0.372, "step": 4809 }, { "epoch": 1.443577430972389, "grad_norm": 0.1702309250831604, "learning_rate": 4.360206747767122e-05, "loss": 0.335, "step": 4810 }, { "epoch": 1.443877551020408, "grad_norm": 0.13013462722301483, "learning_rate": 4.355882259205294e-05, "loss": 0.3233, "step": 4811 }, { "epoch": 1.4441776710684273, "grad_norm": 0.15084701776504517, "learning_rate": 4.351559319042453e-05, "loss": 0.3926, "step": 4812 }, { "epoch": 1.4444777911164466, "grad_norm": 0.13886122405529022, "learning_rate": 4.3472379284645405e-05, "loss": 0.3669, "step": 4813 }, { "epoch": 1.4447779111644659, "grad_norm": 0.1385250985622406, "learning_rate": 4.3429180886570886e-05, "loss": 0.3678, "step": 4814 }, { "epoch": 1.4450780312124851, "grad_norm": 0.14431382715702057, "learning_rate": 4.3385998008051884e-05, "loss": 0.3676, "step": 4815 }, { "epoch": 1.4453781512605042, "grad_norm": 0.1262168139219284, "learning_rate": 4.334283066093515e-05, "loss": 0.2928, "step": 4816 }, { "epoch": 1.4456782713085234, "grad_norm": 0.13885599374771118, "learning_rate": 4.3299678857063194e-05, "loss": 0.3694, "step": 4817 }, { "epoch": 1.4459783913565425, "grad_norm": 0.12713685631752014, "learning_rate": 4.325654260827416e-05, "loss": 0.309, "step": 4818 }, { "epoch": 1.4462785114045618, "grad_norm": 0.1348283588886261, "learning_rate": 4.321342192640204e-05, "loss": 0.3524, "step": 4819 }, { "epoch": 1.446578631452581, "grad_norm": 0.14019560813903809, "learning_rate": 4.3170316823276424e-05, "loss": 0.3647, "step": 4820 }, { "epoch": 1.4468787515006003, "grad_norm": 0.13907159864902496, "learning_rate": 4.312722731072275e-05, "loss": 0.3614, "step": 4821 }, { "epoch": 1.4471788715486196, "grad_norm": 0.1358475685119629, "learning_rate": 4.308415340056217e-05, "loss": 0.3806, "step": 4822 }, { "epoch": 1.4474789915966386, "grad_norm": 0.1443357616662979, "learning_rate": 4.304109510461143e-05, "loss": 0.3861, "step": 4823 }, { "epoch": 1.447779111644658, "grad_norm": 0.1358397752046585, "learning_rate": 4.2998052434683125e-05, "loss": 0.3661, "step": 4824 }, { "epoch": 1.448079231692677, "grad_norm": 0.12247933447360992, "learning_rate": 4.2955025402585544e-05, "loss": 0.2983, "step": 4825 }, { "epoch": 1.4483793517406962, "grad_norm": 0.13556553423404694, "learning_rate": 4.291201402012265e-05, "loss": 0.3371, "step": 4826 }, { "epoch": 1.4486794717887155, "grad_norm": 0.1382128894329071, "learning_rate": 4.286901829909406e-05, "loss": 0.3597, "step": 4827 }, { "epoch": 1.4489795918367347, "grad_norm": 0.1367807388305664, "learning_rate": 4.28260382512952e-05, "loss": 0.3291, "step": 4828 }, { "epoch": 1.449279711884754, "grad_norm": 0.15236921608448029, "learning_rate": 4.278307388851716e-05, "loss": 0.3163, "step": 4829 }, { "epoch": 1.449579831932773, "grad_norm": 0.12621168792247772, "learning_rate": 4.274012522254674e-05, "loss": 0.3256, "step": 4830 }, { "epoch": 1.4498799519807923, "grad_norm": 0.14435584843158722, "learning_rate": 4.269719226516641e-05, "loss": 0.3828, "step": 4831 }, { "epoch": 1.4501800720288116, "grad_norm": 0.12524443864822388, "learning_rate": 4.2654275028154224e-05, "loss": 0.2974, "step": 4832 }, { "epoch": 1.4504801920768307, "grad_norm": 0.15154621005058289, "learning_rate": 4.2611373523284205e-05, "loss": 0.3877, "step": 4833 }, { "epoch": 1.45078031212485, "grad_norm": 0.14805643260478973, "learning_rate": 4.2568487762325806e-05, "loss": 0.3613, "step": 4834 }, { "epoch": 1.4510804321728692, "grad_norm": 0.14266574382781982, "learning_rate": 4.252561775704421e-05, "loss": 0.3802, "step": 4835 }, { "epoch": 1.4513805522208885, "grad_norm": 0.15151824057102203, "learning_rate": 4.2482763519200356e-05, "loss": 0.3503, "step": 4836 }, { "epoch": 1.4516806722689075, "grad_norm": 0.1423339694738388, "learning_rate": 4.24399250605508e-05, "loss": 0.3557, "step": 4837 }, { "epoch": 1.4519807923169268, "grad_norm": 0.14264728128910065, "learning_rate": 4.239710239284781e-05, "loss": 0.3807, "step": 4838 }, { "epoch": 1.452280912364946, "grad_norm": 0.1482369750738144, "learning_rate": 4.235429552783928e-05, "loss": 0.4035, "step": 4839 }, { "epoch": 1.452581032412965, "grad_norm": 0.13236463069915771, "learning_rate": 4.231150447726874e-05, "loss": 0.3464, "step": 4840 }, { "epoch": 1.4528811524609844, "grad_norm": 0.14207333326339722, "learning_rate": 4.226872925287545e-05, "loss": 0.3694, "step": 4841 }, { "epoch": 1.4531812725090036, "grad_norm": 0.2207833230495453, "learning_rate": 4.222596986639435e-05, "loss": 0.3523, "step": 4842 }, { "epoch": 1.453481392557023, "grad_norm": 0.14225220680236816, "learning_rate": 4.2183226329555906e-05, "loss": 0.348, "step": 4843 }, { "epoch": 1.453781512605042, "grad_norm": 0.14505499601364136, "learning_rate": 4.214049865408639e-05, "loss": 0.3859, "step": 4844 }, { "epoch": 1.4540816326530612, "grad_norm": 0.12653787434101105, "learning_rate": 4.209778685170759e-05, "loss": 0.3332, "step": 4845 }, { "epoch": 1.4543817527010805, "grad_norm": 0.16374477744102478, "learning_rate": 4.205509093413702e-05, "loss": 0.3976, "step": 4846 }, { "epoch": 1.4546818727490995, "grad_norm": 0.15292547643184662, "learning_rate": 4.201241091308786e-05, "loss": 0.4021, "step": 4847 }, { "epoch": 1.4549819927971188, "grad_norm": 0.13936926424503326, "learning_rate": 4.196974680026882e-05, "loss": 0.3697, "step": 4848 }, { "epoch": 1.455282112845138, "grad_norm": 0.13519351184368134, "learning_rate": 4.192709860738433e-05, "loss": 0.3469, "step": 4849 }, { "epoch": 1.4555822328931574, "grad_norm": 0.1383168250322342, "learning_rate": 4.1884466346134466e-05, "loss": 0.3847, "step": 4850 }, { "epoch": 1.4558823529411764, "grad_norm": 0.17078574001789093, "learning_rate": 4.1841850028214844e-05, "loss": 0.3696, "step": 4851 }, { "epoch": 1.4561824729891957, "grad_norm": 0.14079110324382782, "learning_rate": 4.179924966531683e-05, "loss": 0.3394, "step": 4852 }, { "epoch": 1.456482593037215, "grad_norm": 0.13478030264377594, "learning_rate": 4.1756665269127274e-05, "loss": 0.337, "step": 4853 }, { "epoch": 1.456782713085234, "grad_norm": 0.13460038602352142, "learning_rate": 4.171409685132873e-05, "loss": 0.3396, "step": 4854 }, { "epoch": 1.4570828331332533, "grad_norm": 0.1408572494983673, "learning_rate": 4.167154442359943e-05, "loss": 0.3464, "step": 4855 }, { "epoch": 1.4573829531812725, "grad_norm": 0.1432645469903946, "learning_rate": 4.162900799761308e-05, "loss": 0.3597, "step": 4856 }, { "epoch": 1.4576830732292918, "grad_norm": 0.1405627578496933, "learning_rate": 4.1586487585039e-05, "loss": 0.37, "step": 4857 }, { "epoch": 1.4579831932773109, "grad_norm": 0.13922898471355438, "learning_rate": 4.154398319754232e-05, "loss": 0.359, "step": 4858 }, { "epoch": 1.4582833133253301, "grad_norm": 0.1451897919178009, "learning_rate": 4.1501494846783526e-05, "loss": 0.369, "step": 4859 }, { "epoch": 1.4585834333733494, "grad_norm": 0.13877728581428528, "learning_rate": 4.145902254441888e-05, "loss": 0.3481, "step": 4860 }, { "epoch": 1.4588835534213684, "grad_norm": 0.15443190932273865, "learning_rate": 4.1416566302100094e-05, "loss": 0.4045, "step": 4861 }, { "epoch": 1.4591836734693877, "grad_norm": 0.1353960931301117, "learning_rate": 4.137412613147459e-05, "loss": 0.3363, "step": 4862 }, { "epoch": 1.459483793517407, "grad_norm": 0.14713869988918304, "learning_rate": 4.1331702044185374e-05, "loss": 0.3736, "step": 4863 }, { "epoch": 1.4597839135654262, "grad_norm": 0.14519952237606049, "learning_rate": 4.1289294051870985e-05, "loss": 0.3613, "step": 4864 }, { "epoch": 1.4600840336134453, "grad_norm": 0.1449516862630844, "learning_rate": 4.124690216616552e-05, "loss": 0.3976, "step": 4865 }, { "epoch": 1.4603841536614646, "grad_norm": 0.1280471831560135, "learning_rate": 4.120452639869875e-05, "loss": 0.3169, "step": 4866 }, { "epoch": 1.4606842737094838, "grad_norm": 0.14638231694698334, "learning_rate": 4.116216676109598e-05, "loss": 0.331, "step": 4867 }, { "epoch": 1.4609843937575029, "grad_norm": 0.14368273317813873, "learning_rate": 4.111982326497813e-05, "loss": 0.3421, "step": 4868 }, { "epoch": 1.4612845138055222, "grad_norm": 0.14355066418647766, "learning_rate": 4.1077495921961604e-05, "loss": 0.3683, "step": 4869 }, { "epoch": 1.4615846338535414, "grad_norm": 0.11699586361646652, "learning_rate": 4.1035184743658376e-05, "loss": 0.3025, "step": 4870 }, { "epoch": 1.4618847539015607, "grad_norm": 0.13622942566871643, "learning_rate": 4.0992889741676145e-05, "loss": 0.3254, "step": 4871 }, { "epoch": 1.46218487394958, "grad_norm": 0.1358841061592102, "learning_rate": 4.0950610927618e-05, "loss": 0.3134, "step": 4872 }, { "epoch": 1.462484993997599, "grad_norm": 0.1376754343509674, "learning_rate": 4.090834831308262e-05, "loss": 0.3594, "step": 4873 }, { "epoch": 1.4627851140456183, "grad_norm": 0.13249529898166656, "learning_rate": 4.086610190966431e-05, "loss": 0.3302, "step": 4874 }, { "epoch": 1.4630852340936373, "grad_norm": 0.13735318183898926, "learning_rate": 4.082387172895291e-05, "loss": 0.3618, "step": 4875 }, { "epoch": 1.4633853541416566, "grad_norm": 0.13755595684051514, "learning_rate": 4.078165778253371e-05, "loss": 0.3545, "step": 4876 }, { "epoch": 1.4636854741896759, "grad_norm": 0.14065246284008026, "learning_rate": 4.073946008198771e-05, "loss": 0.3828, "step": 4877 }, { "epoch": 1.4639855942376951, "grad_norm": 0.16443657875061035, "learning_rate": 4.069727863889128e-05, "loss": 0.391, "step": 4878 }, { "epoch": 1.4642857142857144, "grad_norm": 0.1418590396642685, "learning_rate": 4.065511346481645e-05, "loss": 0.3804, "step": 4879 }, { "epoch": 1.4645858343337335, "grad_norm": 0.15260443091392517, "learning_rate": 4.0612964571330805e-05, "loss": 0.3628, "step": 4880 }, { "epoch": 1.4648859543817527, "grad_norm": 0.1524861603975296, "learning_rate": 4.057083196999732e-05, "loss": 0.366, "step": 4881 }, { "epoch": 1.4651860744297718, "grad_norm": 0.13246721029281616, "learning_rate": 4.0528715672374636e-05, "loss": 0.3532, "step": 4882 }, { "epoch": 1.465486194477791, "grad_norm": 0.13852067291736603, "learning_rate": 4.048661569001692e-05, "loss": 0.3478, "step": 4883 }, { "epoch": 1.4657863145258103, "grad_norm": 0.1355675756931305, "learning_rate": 4.044453203447372e-05, "loss": 0.3339, "step": 4884 }, { "epoch": 1.4660864345738296, "grad_norm": 0.1346714198589325, "learning_rate": 4.04024647172903e-05, "loss": 0.3571, "step": 4885 }, { "epoch": 1.4663865546218489, "grad_norm": 0.14984160661697388, "learning_rate": 4.036041375000728e-05, "loss": 0.3666, "step": 4886 }, { "epoch": 1.466686674669868, "grad_norm": 0.1460392028093338, "learning_rate": 4.031837914416088e-05, "loss": 0.3305, "step": 4887 }, { "epoch": 1.4669867947178872, "grad_norm": 0.1454431265592575, "learning_rate": 4.027636091128284e-05, "loss": 0.3499, "step": 4888 }, { "epoch": 1.4672869147659062, "grad_norm": 0.15010550618171692, "learning_rate": 4.023435906290034e-05, "loss": 0.3345, "step": 4889 }, { "epoch": 1.4675870348139255, "grad_norm": 0.13426147401332855, "learning_rate": 4.019237361053615e-05, "loss": 0.34, "step": 4890 }, { "epoch": 1.4678871548619448, "grad_norm": 0.13417157530784607, "learning_rate": 4.0150404565708435e-05, "loss": 0.3414, "step": 4891 }, { "epoch": 1.468187274909964, "grad_norm": 0.21425917744636536, "learning_rate": 4.010845193993096e-05, "loss": 0.3959, "step": 4892 }, { "epoch": 1.4684873949579833, "grad_norm": 0.14582909643650055, "learning_rate": 4.0066515744712974e-05, "loss": 0.3999, "step": 4893 }, { "epoch": 1.4687875150060024, "grad_norm": 0.12927165627479553, "learning_rate": 4.0024595991559166e-05, "loss": 0.3333, "step": 4894 }, { "epoch": 1.4690876350540216, "grad_norm": 0.1228029727935791, "learning_rate": 3.998269269196966e-05, "loss": 0.295, "step": 4895 }, { "epoch": 1.469387755102041, "grad_norm": 0.13898111879825592, "learning_rate": 3.99408058574403e-05, "loss": 0.3661, "step": 4896 }, { "epoch": 1.46968787515006, "grad_norm": 0.20389845967292786, "learning_rate": 3.989893549946213e-05, "loss": 0.3409, "step": 4897 }, { "epoch": 1.4699879951980792, "grad_norm": 0.12807156145572662, "learning_rate": 3.9857081629521896e-05, "loss": 0.3217, "step": 4898 }, { "epoch": 1.4702881152460985, "grad_norm": 0.13886474072933197, "learning_rate": 3.9815244259101644e-05, "loss": 0.3526, "step": 4899 }, { "epoch": 1.4705882352941178, "grad_norm": 0.14439184963703156, "learning_rate": 3.977342339967902e-05, "loss": 0.3428, "step": 4900 }, { "epoch": 1.4708883553421368, "grad_norm": 0.1395224630832672, "learning_rate": 3.973161906272712e-05, "loss": 0.4465, "step": 4901 }, { "epoch": 1.471188475390156, "grad_norm": 0.1354299634695053, "learning_rate": 3.968983125971447e-05, "loss": 0.3385, "step": 4902 }, { "epoch": 1.4714885954381753, "grad_norm": 0.14423540234565735, "learning_rate": 3.964806000210503e-05, "loss": 0.3571, "step": 4903 }, { "epoch": 1.4717887154861944, "grad_norm": 0.14226119220256805, "learning_rate": 3.9606305301358284e-05, "loss": 0.3239, "step": 4904 }, { "epoch": 1.4720888355342137, "grad_norm": 0.14099536836147308, "learning_rate": 3.95645671689292e-05, "loss": 0.3768, "step": 4905 }, { "epoch": 1.472388955582233, "grad_norm": 0.13216210901737213, "learning_rate": 3.95228456162681e-05, "loss": 0.3315, "step": 4906 }, { "epoch": 1.4726890756302522, "grad_norm": 0.14150847494602203, "learning_rate": 3.948114065482087e-05, "loss": 0.3574, "step": 4907 }, { "epoch": 1.4729891956782712, "grad_norm": 0.12970447540283203, "learning_rate": 3.943945229602869e-05, "loss": 0.3268, "step": 4908 }, { "epoch": 1.4732893157262905, "grad_norm": 0.16438636183738708, "learning_rate": 3.939778055132842e-05, "loss": 0.3805, "step": 4909 }, { "epoch": 1.4735894357743098, "grad_norm": 0.14561450481414795, "learning_rate": 3.935612543215216e-05, "loss": 0.3685, "step": 4910 }, { "epoch": 1.4738895558223288, "grad_norm": 0.14267125725746155, "learning_rate": 3.9314486949927467e-05, "loss": 0.3541, "step": 4911 }, { "epoch": 1.474189675870348, "grad_norm": 0.140004962682724, "learning_rate": 3.9272865116077414e-05, "loss": 0.3572, "step": 4912 }, { "epoch": 1.4744897959183674, "grad_norm": 0.15184147655963898, "learning_rate": 3.9231259942020536e-05, "loss": 0.3903, "step": 4913 }, { "epoch": 1.4747899159663866, "grad_norm": 0.13791409134864807, "learning_rate": 3.918967143917064e-05, "loss": 0.3658, "step": 4914 }, { "epoch": 1.4750900360144057, "grad_norm": 0.13405513763427734, "learning_rate": 3.914809961893714e-05, "loss": 0.3497, "step": 4915 }, { "epoch": 1.475390156062425, "grad_norm": 0.13332565128803253, "learning_rate": 3.910654449272469e-05, "loss": 0.3307, "step": 4916 }, { "epoch": 1.4756902761104442, "grad_norm": 0.1424759030342102, "learning_rate": 3.9065006071933544e-05, "loss": 0.3695, "step": 4917 }, { "epoch": 1.4759903961584633, "grad_norm": 0.13335555791854858, "learning_rate": 3.90234843679593e-05, "loss": 0.3548, "step": 4918 }, { "epoch": 1.4762905162064826, "grad_norm": 0.137489914894104, "learning_rate": 3.8981979392192866e-05, "loss": 0.3147, "step": 4919 }, { "epoch": 1.4765906362545018, "grad_norm": 0.14004887640476227, "learning_rate": 3.8940491156020744e-05, "loss": 0.3701, "step": 4920 }, { "epoch": 1.476890756302521, "grad_norm": 0.15263986587524414, "learning_rate": 3.889901967082476e-05, "loss": 0.4014, "step": 4921 }, { "epoch": 1.4771908763505401, "grad_norm": 0.2184661626815796, "learning_rate": 3.885756494798206e-05, "loss": 0.3517, "step": 4922 }, { "epoch": 1.4774909963985594, "grad_norm": 0.13904131948947906, "learning_rate": 3.8816126998865365e-05, "loss": 0.3309, "step": 4923 }, { "epoch": 1.4777911164465787, "grad_norm": 0.13836504518985748, "learning_rate": 3.877470583484262e-05, "loss": 0.344, "step": 4924 }, { "epoch": 1.4780912364945977, "grad_norm": 0.2074955701828003, "learning_rate": 3.873330146727729e-05, "loss": 0.3074, "step": 4925 }, { "epoch": 1.478391356542617, "grad_norm": 0.14033573865890503, "learning_rate": 3.869191390752821e-05, "loss": 0.3486, "step": 4926 }, { "epoch": 1.4786914765906363, "grad_norm": 0.15356256067752838, "learning_rate": 3.8650543166949526e-05, "loss": 0.3575, "step": 4927 }, { "epoch": 1.4789915966386555, "grad_norm": 0.16172119975090027, "learning_rate": 3.860918925689089e-05, "loss": 0.3891, "step": 4928 }, { "epoch": 1.4792917166866748, "grad_norm": 0.1274283528327942, "learning_rate": 3.8567852188697205e-05, "loss": 0.3287, "step": 4929 }, { "epoch": 1.4795918367346939, "grad_norm": 0.14106817543506622, "learning_rate": 3.852653197370885e-05, "loss": 0.3704, "step": 4930 }, { "epoch": 1.4798919567827131, "grad_norm": 0.1411307454109192, "learning_rate": 3.848522862326159e-05, "loss": 0.3736, "step": 4931 }, { "epoch": 1.4801920768307322, "grad_norm": 0.15969686210155487, "learning_rate": 3.8443942148686505e-05, "loss": 0.358, "step": 4932 }, { "epoch": 1.4804921968787514, "grad_norm": 0.32184886932373047, "learning_rate": 3.840267256130997e-05, "loss": 0.3139, "step": 4933 }, { "epoch": 1.4807923169267707, "grad_norm": 0.3197161853313446, "learning_rate": 3.8361419872453985e-05, "loss": 0.38, "step": 4934 }, { "epoch": 1.48109243697479, "grad_norm": 0.13927941024303436, "learning_rate": 3.832018409343567e-05, "loss": 0.3543, "step": 4935 }, { "epoch": 1.4813925570228093, "grad_norm": 0.14764556288719177, "learning_rate": 3.827896523556757e-05, "loss": 0.371, "step": 4936 }, { "epoch": 1.4816926770708283, "grad_norm": 0.14382779598236084, "learning_rate": 3.8237763310157614e-05, "loss": 0.3562, "step": 4937 }, { "epoch": 1.4819927971188476, "grad_norm": 0.15075556933879852, "learning_rate": 3.819657832850909e-05, "loss": 0.3548, "step": 4938 }, { "epoch": 1.4822929171668666, "grad_norm": 0.12999118864536285, "learning_rate": 3.815541030192067e-05, "loss": 0.3283, "step": 4939 }, { "epoch": 1.482593037214886, "grad_norm": 0.14742016792297363, "learning_rate": 3.811425924168628e-05, "loss": 0.3818, "step": 4940 }, { "epoch": 1.4828931572629052, "grad_norm": 0.12545278668403625, "learning_rate": 3.8073125159095225e-05, "loss": 0.2996, "step": 4941 }, { "epoch": 1.4831932773109244, "grad_norm": 0.1290358453989029, "learning_rate": 3.803200806543218e-05, "loss": 0.3388, "step": 4942 }, { "epoch": 1.4834933973589437, "grad_norm": 0.15177536010742188, "learning_rate": 3.799090797197721e-05, "loss": 0.4223, "step": 4943 }, { "epoch": 1.4837935174069627, "grad_norm": 0.14963765442371368, "learning_rate": 3.794982489000556e-05, "loss": 0.3861, "step": 4944 }, { "epoch": 1.484093637454982, "grad_norm": 0.12542229890823364, "learning_rate": 3.790875883078795e-05, "loss": 0.3119, "step": 4945 }, { "epoch": 1.484393757503001, "grad_norm": 0.14355400204658508, "learning_rate": 3.786770980559041e-05, "loss": 0.4041, "step": 4946 }, { "epoch": 1.4846938775510203, "grad_norm": 0.13719186186790466, "learning_rate": 3.7826677825674204e-05, "loss": 0.345, "step": 4947 }, { "epoch": 1.4849939975990396, "grad_norm": 0.15719708800315857, "learning_rate": 3.778566290229607e-05, "loss": 0.3721, "step": 4948 }, { "epoch": 1.4852941176470589, "grad_norm": 0.14396868646144867, "learning_rate": 3.7744665046707886e-05, "loss": 0.362, "step": 4949 }, { "epoch": 1.4855942376950781, "grad_norm": 0.13629618287086487, "learning_rate": 3.770368427015699e-05, "loss": 0.3377, "step": 4950 }, { "epoch": 1.4858943577430972, "grad_norm": 0.13559213280677795, "learning_rate": 3.766272058388604e-05, "loss": 0.3181, "step": 4951 }, { "epoch": 1.4861944777911165, "grad_norm": 0.15401612222194672, "learning_rate": 3.762177399913285e-05, "loss": 0.4098, "step": 4952 }, { "epoch": 1.4864945978391357, "grad_norm": 0.13623987138271332, "learning_rate": 3.758084452713073e-05, "loss": 0.3505, "step": 4953 }, { "epoch": 1.4867947178871548, "grad_norm": 0.13094072043895721, "learning_rate": 3.753993217910815e-05, "loss": 0.3286, "step": 4954 }, { "epoch": 1.487094837935174, "grad_norm": 0.1474945843219757, "learning_rate": 3.749903696628898e-05, "loss": 0.3716, "step": 4955 }, { "epoch": 1.4873949579831933, "grad_norm": 0.1438131332397461, "learning_rate": 3.745815889989237e-05, "loss": 0.3337, "step": 4956 }, { "epoch": 1.4876950780312126, "grad_norm": 0.13847722113132477, "learning_rate": 3.7417297991132696e-05, "loss": 0.3528, "step": 4957 }, { "epoch": 1.4879951980792316, "grad_norm": 0.12775957584381104, "learning_rate": 3.7376454251219704e-05, "loss": 0.3065, "step": 4958 }, { "epoch": 1.488295318127251, "grad_norm": 0.19468966126441956, "learning_rate": 3.733562769135845e-05, "loss": 0.3522, "step": 4959 }, { "epoch": 1.4885954381752702, "grad_norm": 0.14810489118099213, "learning_rate": 3.729481832274916e-05, "loss": 0.3835, "step": 4960 }, { "epoch": 1.4888955582232892, "grad_norm": 0.14498479664325714, "learning_rate": 3.7254026156587475e-05, "loss": 0.37, "step": 4961 }, { "epoch": 1.4891956782713085, "grad_norm": 0.13816475868225098, "learning_rate": 3.72132512040642e-05, "loss": 0.3304, "step": 4962 }, { "epoch": 1.4894957983193278, "grad_norm": 0.14384940266609192, "learning_rate": 3.717249347636551e-05, "loss": 0.3834, "step": 4963 }, { "epoch": 1.489795918367347, "grad_norm": 0.1332027018070221, "learning_rate": 3.713175298467285e-05, "loss": 0.3327, "step": 4964 }, { "epoch": 1.490096038415366, "grad_norm": 0.14304400980472565, "learning_rate": 3.7091029740162875e-05, "loss": 0.3589, "step": 4965 }, { "epoch": 1.4903961584633854, "grad_norm": 0.14571350812911987, "learning_rate": 3.705032375400751e-05, "loss": 0.3768, "step": 4966 }, { "epoch": 1.4906962785114046, "grad_norm": 0.13593170046806335, "learning_rate": 3.700963503737399e-05, "loss": 0.3525, "step": 4967 }, { "epoch": 1.4909963985594237, "grad_norm": 0.13215714693069458, "learning_rate": 3.696896360142483e-05, "loss": 0.3438, "step": 4968 }, { "epoch": 1.491296518607443, "grad_norm": 0.1519247442483902, "learning_rate": 3.692830945731778e-05, "loss": 0.3527, "step": 4969 }, { "epoch": 1.4915966386554622, "grad_norm": 0.13658547401428223, "learning_rate": 3.688767261620578e-05, "loss": 0.3478, "step": 4970 }, { "epoch": 1.4918967587034815, "grad_norm": 0.13602906465530396, "learning_rate": 3.6847053089237116e-05, "loss": 0.3562, "step": 4971 }, { "epoch": 1.4921968787515005, "grad_norm": 0.15309105813503265, "learning_rate": 3.680645088755533e-05, "loss": 0.3967, "step": 4972 }, { "epoch": 1.4924969987995198, "grad_norm": 0.13403762876987457, "learning_rate": 3.6765866022299125e-05, "loss": 0.3157, "step": 4973 }, { "epoch": 1.492797118847539, "grad_norm": 0.1302029937505722, "learning_rate": 3.672529850460246e-05, "loss": 0.3342, "step": 4974 }, { "epoch": 1.4930972388955581, "grad_norm": 0.14181581139564514, "learning_rate": 3.66847483455946e-05, "loss": 0.3477, "step": 4975 }, { "epoch": 1.4933973589435774, "grad_norm": 0.1388651430606842, "learning_rate": 3.6644215556400065e-05, "loss": 0.3307, "step": 4976 }, { "epoch": 1.4936974789915967, "grad_norm": 0.15407897531986237, "learning_rate": 3.6603700148138476e-05, "loss": 0.3553, "step": 4977 }, { "epoch": 1.493997599039616, "grad_norm": 0.12832655012607574, "learning_rate": 3.6563202131924854e-05, "loss": 0.3082, "step": 4978 }, { "epoch": 1.494297719087635, "grad_norm": 0.16573424637317657, "learning_rate": 3.652272151886925e-05, "loss": 0.3633, "step": 4979 }, { "epoch": 1.4945978391356542, "grad_norm": 0.1342608481645584, "learning_rate": 3.64822583200772e-05, "loss": 0.3504, "step": 4980 }, { "epoch": 1.4948979591836735, "grad_norm": 0.1758069545030594, "learning_rate": 3.644181254664925e-05, "loss": 0.3236, "step": 4981 }, { "epoch": 1.4951980792316926, "grad_norm": 0.14578987658023834, "learning_rate": 3.6401384209681186e-05, "loss": 0.3612, "step": 4982 }, { "epoch": 1.4954981992797118, "grad_norm": 0.12802115082740784, "learning_rate": 3.6360973320264125e-05, "loss": 0.3038, "step": 4983 }, { "epoch": 1.495798319327731, "grad_norm": 0.1445339173078537, "learning_rate": 3.632057988948433e-05, "loss": 0.339, "step": 4984 }, { "epoch": 1.4960984393757504, "grad_norm": 0.14311954379081726, "learning_rate": 3.6280203928423225e-05, "loss": 0.3449, "step": 4985 }, { "epoch": 1.4963985594237694, "grad_norm": 0.1445217877626419, "learning_rate": 3.623984544815756e-05, "loss": 0.3677, "step": 4986 }, { "epoch": 1.4966986794717887, "grad_norm": 0.1266975849866867, "learning_rate": 3.619950445975916e-05, "loss": 0.3318, "step": 4987 }, { "epoch": 1.496998799519808, "grad_norm": 0.1584240347146988, "learning_rate": 3.6159180974295124e-05, "loss": 0.3361, "step": 4988 }, { "epoch": 1.497298919567827, "grad_norm": 0.1305302083492279, "learning_rate": 3.611887500282779e-05, "loss": 0.3173, "step": 4989 }, { "epoch": 1.4975990396158463, "grad_norm": 0.1282181441783905, "learning_rate": 3.607858655641457e-05, "loss": 0.318, "step": 4990 }, { "epoch": 1.4978991596638656, "grad_norm": 0.14413386583328247, "learning_rate": 3.60383156461082e-05, "loss": 0.3473, "step": 4991 }, { "epoch": 1.4981992797118848, "grad_norm": 0.1319807767868042, "learning_rate": 3.599806228295647e-05, "loss": 0.3275, "step": 4992 }, { "epoch": 1.498499399759904, "grad_norm": 0.12965364754199982, "learning_rate": 3.595782647800248e-05, "loss": 0.314, "step": 4993 }, { "epoch": 1.4987995198079231, "grad_norm": 0.15261231362819672, "learning_rate": 3.5917608242284476e-05, "loss": 0.3604, "step": 4994 }, { "epoch": 1.4990996398559424, "grad_norm": 0.1458638608455658, "learning_rate": 3.587740758683581e-05, "loss": 0.3317, "step": 4995 }, { "epoch": 1.4993997599039615, "grad_norm": 0.13909973204135895, "learning_rate": 3.583722452268511e-05, "loss": 0.3365, "step": 4996 }, { "epoch": 1.4996998799519807, "grad_norm": 0.1375804841518402, "learning_rate": 3.579705906085618e-05, "loss": 0.3128, "step": 4997 }, { "epoch": 1.5, "grad_norm": 0.1468920111656189, "learning_rate": 3.575691121236785e-05, "loss": 0.3497, "step": 4998 }, { "epoch": 1.5003001200480193, "grad_norm": 0.13905927538871765, "learning_rate": 3.5716780988234324e-05, "loss": 0.3517, "step": 4999 }, { "epoch": 1.5006002400960385, "grad_norm": 0.17258362472057343, "learning_rate": 3.56766683994648e-05, "loss": 0.3557, "step": 5000 }, { "epoch": 1.5009003601440576, "grad_norm": 0.12960495054721832, "learning_rate": 3.563657345706372e-05, "loss": 0.3206, "step": 5001 }, { "epoch": 1.5012004801920769, "grad_norm": 0.1402052789926529, "learning_rate": 3.5596496172030724e-05, "loss": 0.3591, "step": 5002 }, { "epoch": 1.501500600240096, "grad_norm": 0.1431601196527481, "learning_rate": 3.555643655536051e-05, "loss": 0.3474, "step": 5003 }, { "epoch": 1.5018007202881152, "grad_norm": 0.3191090524196625, "learning_rate": 3.5516394618042944e-05, "loss": 0.3813, "step": 5004 }, { "epoch": 1.5021008403361344, "grad_norm": 0.13000676035881042, "learning_rate": 3.5476370371063114e-05, "loss": 0.3076, "step": 5005 }, { "epoch": 1.5024009603841537, "grad_norm": 0.1422010362148285, "learning_rate": 3.5436363825401234e-05, "loss": 0.3459, "step": 5006 }, { "epoch": 1.502701080432173, "grad_norm": 0.14779047667980194, "learning_rate": 3.539637499203259e-05, "loss": 0.3802, "step": 5007 }, { "epoch": 1.503001200480192, "grad_norm": 0.13392239809036255, "learning_rate": 3.535640388192767e-05, "loss": 0.3367, "step": 5008 }, { "epoch": 1.5033013205282113, "grad_norm": 0.15039633214473724, "learning_rate": 3.531645050605211e-05, "loss": 0.3627, "step": 5009 }, { "epoch": 1.5036014405762304, "grad_norm": 0.14044032990932465, "learning_rate": 3.527651487536669e-05, "loss": 0.3698, "step": 5010 }, { "epoch": 1.5039015606242496, "grad_norm": 0.13334433734416962, "learning_rate": 3.5236597000827266e-05, "loss": 0.3202, "step": 5011 }, { "epoch": 1.504201680672269, "grad_norm": 0.1374587118625641, "learning_rate": 3.519669689338478e-05, "loss": 0.3147, "step": 5012 }, { "epoch": 1.5045018007202882, "grad_norm": 0.1453571766614914, "learning_rate": 3.515681456398545e-05, "loss": 0.3571, "step": 5013 }, { "epoch": 1.5048019207683074, "grad_norm": 0.16557316482067108, "learning_rate": 3.511695002357055e-05, "loss": 0.3688, "step": 5014 }, { "epoch": 1.5051020408163265, "grad_norm": 0.14268682897090912, "learning_rate": 3.507710328307638e-05, "loss": 0.3739, "step": 5015 }, { "epoch": 1.5054021608643458, "grad_norm": 0.1268330067396164, "learning_rate": 3.503727435343451e-05, "loss": 0.3199, "step": 5016 }, { "epoch": 1.5057022809123648, "grad_norm": 0.1584567278623581, "learning_rate": 3.499746324557147e-05, "loss": 0.3647, "step": 5017 }, { "epoch": 1.506002400960384, "grad_norm": 0.14097891747951508, "learning_rate": 3.495766997040909e-05, "loss": 0.3678, "step": 5018 }, { "epoch": 1.5063025210084033, "grad_norm": 0.13286574184894562, "learning_rate": 3.4917894538864136e-05, "loss": 0.3404, "step": 5019 }, { "epoch": 1.5066026410564226, "grad_norm": 0.14076447486877441, "learning_rate": 3.487813696184852e-05, "loss": 0.3693, "step": 5020 }, { "epoch": 1.5069027611044419, "grad_norm": 0.12194446474313736, "learning_rate": 3.4838397250269295e-05, "loss": 0.2822, "step": 5021 }, { "epoch": 1.5072028811524611, "grad_norm": 0.13897386193275452, "learning_rate": 3.4798675415028635e-05, "loss": 0.3423, "step": 5022 }, { "epoch": 1.5075030012004802, "grad_norm": 0.13166168332099915, "learning_rate": 3.4758971467023716e-05, "loss": 0.3215, "step": 5023 }, { "epoch": 1.5078031212484992, "grad_norm": 0.13450632989406586, "learning_rate": 3.4719285417146905e-05, "loss": 0.3254, "step": 5024 }, { "epoch": 1.5081032412965185, "grad_norm": 0.14347293972969055, "learning_rate": 3.467961727628557e-05, "loss": 0.3639, "step": 5025 }, { "epoch": 1.5084033613445378, "grad_norm": 0.14105167984962463, "learning_rate": 3.463996705532222e-05, "loss": 0.3437, "step": 5026 }, { "epoch": 1.508703481392557, "grad_norm": 0.14335931837558746, "learning_rate": 3.46003347651345e-05, "loss": 0.3381, "step": 5027 }, { "epoch": 1.5090036014405763, "grad_norm": 0.1352183073759079, "learning_rate": 3.4560720416594985e-05, "loss": 0.3409, "step": 5028 }, { "epoch": 1.5093037214885956, "grad_norm": 0.16700023412704468, "learning_rate": 3.452112402057149e-05, "loss": 0.3631, "step": 5029 }, { "epoch": 1.5096038415366146, "grad_norm": 0.13665127754211426, "learning_rate": 3.448154558792677e-05, "loss": 0.3541, "step": 5030 }, { "epoch": 1.5099039615846337, "grad_norm": 0.135504350066185, "learning_rate": 3.444198512951875e-05, "loss": 0.3407, "step": 5031 }, { "epoch": 1.510204081632653, "grad_norm": 0.20623868703842163, "learning_rate": 3.4402442656200405e-05, "loss": 0.3433, "step": 5032 }, { "epoch": 1.5105042016806722, "grad_norm": 0.1356758177280426, "learning_rate": 3.436291817881971e-05, "loss": 0.3423, "step": 5033 }, { "epoch": 1.5108043217286915, "grad_norm": 0.13682779669761658, "learning_rate": 3.4323411708219786e-05, "loss": 0.3654, "step": 5034 }, { "epoch": 1.5111044417767108, "grad_norm": 0.13643723726272583, "learning_rate": 3.4283923255238805e-05, "loss": 0.3394, "step": 5035 }, { "epoch": 1.51140456182473, "grad_norm": 0.17409205436706543, "learning_rate": 3.424445283070989e-05, "loss": 0.3956, "step": 5036 }, { "epoch": 1.511704681872749, "grad_norm": 0.13246287405490875, "learning_rate": 3.42050004454614e-05, "loss": 0.3444, "step": 5037 }, { "epoch": 1.5120048019207684, "grad_norm": 0.15758417546749115, "learning_rate": 3.416556611031656e-05, "loss": 0.367, "step": 5038 }, { "epoch": 1.5123049219687874, "grad_norm": 0.13775351643562317, "learning_rate": 3.412614983609376e-05, "loss": 0.3288, "step": 5039 }, { "epoch": 1.5126050420168067, "grad_norm": 0.14162656664848328, "learning_rate": 3.408675163360643e-05, "loss": 0.375, "step": 5040 }, { "epoch": 1.512905162064826, "grad_norm": 0.13742078840732574, "learning_rate": 3.4047371513662995e-05, "loss": 0.3426, "step": 5041 }, { "epoch": 1.5132052821128452, "grad_norm": 0.1556188017129898, "learning_rate": 3.400800948706687e-05, "loss": 0.3621, "step": 5042 }, { "epoch": 1.5135054021608645, "grad_norm": 0.15120911598205566, "learning_rate": 3.3968665564616696e-05, "loss": 0.3516, "step": 5043 }, { "epoch": 1.5138055222088835, "grad_norm": 0.1464402675628662, "learning_rate": 3.392933975710598e-05, "loss": 0.3689, "step": 5044 }, { "epoch": 1.5141056422569028, "grad_norm": 0.16086047887802124, "learning_rate": 3.389003207532326e-05, "loss": 0.3695, "step": 5045 }, { "epoch": 1.5144057623049219, "grad_norm": 0.1495581418275833, "learning_rate": 3.385074253005219e-05, "loss": 0.2683, "step": 5046 }, { "epoch": 1.5147058823529411, "grad_norm": 0.153389573097229, "learning_rate": 3.381147113207139e-05, "loss": 0.3959, "step": 5047 }, { "epoch": 1.5150060024009604, "grad_norm": 0.15461201965808868, "learning_rate": 3.377221789215457e-05, "loss": 0.3639, "step": 5048 }, { "epoch": 1.5153061224489797, "grad_norm": 0.15333007276058197, "learning_rate": 3.373298282107036e-05, "loss": 0.3764, "step": 5049 }, { "epoch": 1.515606242496999, "grad_norm": 0.5077924132347107, "learning_rate": 3.369376592958243e-05, "loss": 0.3872, "step": 5050 }, { "epoch": 1.515906362545018, "grad_norm": 0.13519461452960968, "learning_rate": 3.3654567228449507e-05, "loss": 0.3458, "step": 5051 }, { "epoch": 1.5162064825930373, "grad_norm": 0.13457411527633667, "learning_rate": 3.3615386728425334e-05, "loss": 0.3426, "step": 5052 }, { "epoch": 1.5165066026410563, "grad_norm": 0.14452552795410156, "learning_rate": 3.3576224440258586e-05, "loss": 0.3459, "step": 5053 }, { "epoch": 1.5168067226890756, "grad_norm": 0.1387055516242981, "learning_rate": 3.353708037469304e-05, "loss": 0.3158, "step": 5054 }, { "epoch": 1.5171068427370948, "grad_norm": 0.13338612020015717, "learning_rate": 3.349795454246736e-05, "loss": 0.3177, "step": 5055 }, { "epoch": 1.517406962785114, "grad_norm": 0.1499544382095337, "learning_rate": 3.345884695431529e-05, "loss": 0.3984, "step": 5056 }, { "epoch": 1.5177070828331334, "grad_norm": 0.13665451109409332, "learning_rate": 3.34197576209656e-05, "loss": 0.3412, "step": 5057 }, { "epoch": 1.5180072028811524, "grad_norm": 0.13638487458229065, "learning_rate": 3.3380686553141916e-05, "loss": 0.3392, "step": 5058 }, { "epoch": 1.5183073229291717, "grad_norm": 0.13395284116268158, "learning_rate": 3.334163376156298e-05, "loss": 0.3449, "step": 5059 }, { "epoch": 1.5186074429771907, "grad_norm": 0.1375882923603058, "learning_rate": 3.3302599256942524e-05, "loss": 0.348, "step": 5060 }, { "epoch": 1.51890756302521, "grad_norm": 0.1323576122522354, "learning_rate": 3.326358304998913e-05, "loss": 0.3393, "step": 5061 }, { "epoch": 1.5192076830732293, "grad_norm": 0.1526079922914505, "learning_rate": 3.3224585151406515e-05, "loss": 0.3878, "step": 5062 }, { "epoch": 1.5195078031212486, "grad_norm": 0.12548959255218506, "learning_rate": 3.318560557189325e-05, "loss": 0.3211, "step": 5063 }, { "epoch": 1.5198079231692678, "grad_norm": 0.17658188939094543, "learning_rate": 3.314664432214297e-05, "loss": 0.3357, "step": 5064 }, { "epoch": 1.5201080432172869, "grad_norm": 0.1468879133462906, "learning_rate": 3.310770141284426e-05, "loss": 0.3722, "step": 5065 }, { "epoch": 1.5204081632653061, "grad_norm": 0.16095523536205292, "learning_rate": 3.3068776854680617e-05, "loss": 0.3582, "step": 5066 }, { "epoch": 1.5207082833133252, "grad_norm": 0.1672348976135254, "learning_rate": 3.302987065833057e-05, "loss": 0.3277, "step": 5067 }, { "epoch": 1.5210084033613445, "grad_norm": 0.1487703174352646, "learning_rate": 3.299098283446762e-05, "loss": 0.3194, "step": 5068 }, { "epoch": 1.5213085234093637, "grad_norm": 0.1495758444070816, "learning_rate": 3.295211339376014e-05, "loss": 0.3806, "step": 5069 }, { "epoch": 1.521608643457383, "grad_norm": 0.1635085493326187, "learning_rate": 3.2913262346871564e-05, "loss": 0.3439, "step": 5070 }, { "epoch": 1.5219087635054023, "grad_norm": 0.14415377378463745, "learning_rate": 3.2874429704460176e-05, "loss": 0.3567, "step": 5071 }, { "epoch": 1.5222088835534213, "grad_norm": 0.14679555594921112, "learning_rate": 3.283561547717929e-05, "loss": 0.3822, "step": 5072 }, { "epoch": 1.5225090036014406, "grad_norm": 0.12641476094722748, "learning_rate": 3.27968196756772e-05, "loss": 0.3094, "step": 5073 }, { "epoch": 1.5228091236494596, "grad_norm": 0.14666663110256195, "learning_rate": 3.2758042310597036e-05, "loss": 0.3933, "step": 5074 }, { "epoch": 1.523109243697479, "grad_norm": 0.13911989331245422, "learning_rate": 3.271928339257689e-05, "loss": 0.3321, "step": 5075 }, { "epoch": 1.5234093637454982, "grad_norm": 0.12876641750335693, "learning_rate": 3.268054293224987e-05, "loss": 0.3133, "step": 5076 }, { "epoch": 1.5237094837935174, "grad_norm": 0.12591290473937988, "learning_rate": 3.2641820940243974e-05, "loss": 0.2988, "step": 5077 }, { "epoch": 1.5240096038415367, "grad_norm": 0.12686516344547272, "learning_rate": 3.260311742718216e-05, "loss": 0.308, "step": 5078 }, { "epoch": 1.5243097238895558, "grad_norm": 0.12560750544071198, "learning_rate": 3.2564432403682266e-05, "loss": 0.3145, "step": 5079 }, { "epoch": 1.524609843937575, "grad_norm": 0.1369137167930603, "learning_rate": 3.252576588035703e-05, "loss": 0.3474, "step": 5080 }, { "epoch": 1.524909963985594, "grad_norm": 0.13097818195819855, "learning_rate": 3.2487117867814287e-05, "loss": 0.3274, "step": 5081 }, { "epoch": 1.5252100840336134, "grad_norm": 0.14597076177597046, "learning_rate": 3.244848837665662e-05, "loss": 0.3392, "step": 5082 }, { "epoch": 1.5255102040816326, "grad_norm": 0.14146724343299866, "learning_rate": 3.240987741748154e-05, "loss": 0.3407, "step": 5083 }, { "epoch": 1.525810324129652, "grad_norm": 0.14190976321697235, "learning_rate": 3.237128500088157e-05, "loss": 0.3641, "step": 5084 }, { "epoch": 1.5261104441776712, "grad_norm": 0.13037167489528656, "learning_rate": 3.233271113744412e-05, "loss": 0.3424, "step": 5085 }, { "epoch": 1.5264105642256904, "grad_norm": 0.14053994417190552, "learning_rate": 3.2294155837751414e-05, "loss": 0.3489, "step": 5086 }, { "epoch": 1.5267106842737095, "grad_norm": 0.14578457176685333, "learning_rate": 3.225561911238074e-05, "loss": 0.3868, "step": 5087 }, { "epoch": 1.5270108043217285, "grad_norm": 0.19754479825496674, "learning_rate": 3.221710097190414e-05, "loss": 0.3485, "step": 5088 }, { "epoch": 1.5273109243697478, "grad_norm": 0.16041827201843262, "learning_rate": 3.217860142688864e-05, "loss": 0.3714, "step": 5089 }, { "epoch": 1.527611044417767, "grad_norm": 0.13646718859672546, "learning_rate": 3.21401204878962e-05, "loss": 0.3332, "step": 5090 }, { "epoch": 1.5279111644657863, "grad_norm": 0.13508738577365875, "learning_rate": 3.2101658165483536e-05, "loss": 0.3278, "step": 5091 }, { "epoch": 1.5282112845138056, "grad_norm": 0.1406700760126114, "learning_rate": 3.206321447020241e-05, "loss": 0.3305, "step": 5092 }, { "epoch": 1.5285114045618249, "grad_norm": 0.14580851793289185, "learning_rate": 3.202478941259941e-05, "loss": 0.3549, "step": 5093 }, { "epoch": 1.528811524609844, "grad_norm": 0.12813225388526917, "learning_rate": 3.1986383003215956e-05, "loss": 0.3258, "step": 5094 }, { "epoch": 1.5291116446578632, "grad_norm": 0.1420821100473404, "learning_rate": 3.194799525258849e-05, "loss": 0.3582, "step": 5095 }, { "epoch": 1.5294117647058822, "grad_norm": 0.14169463515281677, "learning_rate": 3.190962617124816e-05, "loss": 0.3594, "step": 5096 }, { "epoch": 1.5297118847539015, "grad_norm": 0.1366046667098999, "learning_rate": 3.187127576972112e-05, "loss": 0.341, "step": 5097 }, { "epoch": 1.5300120048019208, "grad_norm": 0.15085168182849884, "learning_rate": 3.1832944058528417e-05, "loss": 0.3941, "step": 5098 }, { "epoch": 1.53031212484994, "grad_norm": 0.14959454536437988, "learning_rate": 3.179463104818582e-05, "loss": 0.3454, "step": 5099 }, { "epoch": 1.5306122448979593, "grad_norm": 0.1488467901945114, "learning_rate": 3.175633674920415e-05, "loss": 0.3976, "step": 5100 }, { "epoch": 1.5309123649459784, "grad_norm": 0.13554778695106506, "learning_rate": 3.171806117208894e-05, "loss": 0.3371, "step": 5101 }, { "epoch": 1.5312124849939976, "grad_norm": 0.13067273795604706, "learning_rate": 3.167980432734069e-05, "loss": 0.3399, "step": 5102 }, { "epoch": 1.5315126050420167, "grad_norm": 0.14046433568000793, "learning_rate": 3.164156622545475e-05, "loss": 0.3634, "step": 5103 }, { "epoch": 1.531812725090036, "grad_norm": 0.13948778808116913, "learning_rate": 3.160334687692128e-05, "loss": 0.3373, "step": 5104 }, { "epoch": 1.5321128451380552, "grad_norm": 0.1371791511774063, "learning_rate": 3.1565146292225255e-05, "loss": 0.3558, "step": 5105 }, { "epoch": 1.5324129651860745, "grad_norm": 0.16795873641967773, "learning_rate": 3.1526964481846686e-05, "loss": 0.3789, "step": 5106 }, { "epoch": 1.5327130852340938, "grad_norm": 0.13602060079574585, "learning_rate": 3.1488801456260245e-05, "loss": 0.3624, "step": 5107 }, { "epoch": 1.5330132052821128, "grad_norm": 0.1465499848127365, "learning_rate": 3.145065722593555e-05, "loss": 0.3648, "step": 5108 }, { "epoch": 1.533313325330132, "grad_norm": 0.13361485302448273, "learning_rate": 3.1412531801337e-05, "loss": 0.3288, "step": 5109 }, { "epoch": 1.5336134453781511, "grad_norm": 0.1314423680305481, "learning_rate": 3.1374425192923874e-05, "loss": 0.3078, "step": 5110 }, { "epoch": 1.5339135654261704, "grad_norm": 0.13410235941410065, "learning_rate": 3.133633741115034e-05, "loss": 0.3042, "step": 5111 }, { "epoch": 1.5342136854741897, "grad_norm": 0.1429615616798401, "learning_rate": 3.129826846646528e-05, "loss": 0.3567, "step": 5112 }, { "epoch": 1.534513805522209, "grad_norm": 0.1350201666355133, "learning_rate": 3.1260218369312476e-05, "loss": 0.3678, "step": 5113 }, { "epoch": 1.5348139255702282, "grad_norm": 0.14673860371112823, "learning_rate": 3.122218713013055e-05, "loss": 0.3624, "step": 5114 }, { "epoch": 1.5351140456182473, "grad_norm": 0.1545906364917755, "learning_rate": 3.118417475935297e-05, "loss": 0.3363, "step": 5115 }, { "epoch": 1.5354141656662665, "grad_norm": 0.157174751162529, "learning_rate": 3.114618126740793e-05, "loss": 0.3788, "step": 5116 }, { "epoch": 1.5357142857142856, "grad_norm": 0.15201139450073242, "learning_rate": 3.1108206664718576e-05, "loss": 0.3788, "step": 5117 }, { "epoch": 1.5360144057623049, "grad_norm": 0.14599697291851044, "learning_rate": 3.10702509617027e-05, "loss": 0.3891, "step": 5118 }, { "epoch": 1.5363145258103241, "grad_norm": 0.14983749389648438, "learning_rate": 3.103231416877315e-05, "loss": 0.374, "step": 5119 }, { "epoch": 1.5366146458583434, "grad_norm": 0.14620278775691986, "learning_rate": 3.099439629633738e-05, "loss": 0.351, "step": 5120 }, { "epoch": 1.5369147659063627, "grad_norm": 0.13003404438495636, "learning_rate": 3.09564973547977e-05, "loss": 0.3184, "step": 5121 }, { "epoch": 1.5372148859543817, "grad_norm": 0.13253653049468994, "learning_rate": 3.0918617354551274e-05, "loss": 0.3432, "step": 5122 }, { "epoch": 1.537515006002401, "grad_norm": 0.14117096364498138, "learning_rate": 3.088075630599008e-05, "loss": 0.3601, "step": 5123 }, { "epoch": 1.53781512605042, "grad_norm": 0.18316718935966492, "learning_rate": 3.084291421950081e-05, "loss": 0.3356, "step": 5124 }, { "epoch": 1.5381152460984393, "grad_norm": 0.14761757850646973, "learning_rate": 3.0805091105465044e-05, "loss": 0.3648, "step": 5125 }, { "epoch": 1.5384153661464586, "grad_norm": 0.13970275223255157, "learning_rate": 3.076728697425908e-05, "loss": 0.383, "step": 5126 }, { "epoch": 1.5387154861944778, "grad_norm": 0.22657494246959686, "learning_rate": 3.0729501836254074e-05, "loss": 0.3171, "step": 5127 }, { "epoch": 1.5390156062424971, "grad_norm": 0.13654956221580505, "learning_rate": 3.069173570181597e-05, "loss": 0.3436, "step": 5128 }, { "epoch": 1.5393157262905162, "grad_norm": 0.13815683126449585, "learning_rate": 3.0653988581305426e-05, "loss": 0.3628, "step": 5129 }, { "epoch": 1.5396158463385354, "grad_norm": 0.1333167552947998, "learning_rate": 3.061626048507794e-05, "loss": 0.3195, "step": 5130 }, { "epoch": 1.5399159663865545, "grad_norm": 0.14363545179367065, "learning_rate": 3.057855142348384e-05, "loss": 0.384, "step": 5131 }, { "epoch": 1.5402160864345738, "grad_norm": 0.13697920739650726, "learning_rate": 3.054086140686808e-05, "loss": 0.3477, "step": 5132 }, { "epoch": 1.540516206482593, "grad_norm": 0.14586204290390015, "learning_rate": 3.0503190445570585e-05, "loss": 0.3556, "step": 5133 }, { "epoch": 1.5408163265306123, "grad_norm": 0.1277673840522766, "learning_rate": 3.0465538549925854e-05, "loss": 0.3254, "step": 5134 }, { "epoch": 1.5411164465786316, "grad_norm": 0.1379811316728592, "learning_rate": 3.0427905730263307e-05, "loss": 0.3568, "step": 5135 }, { "epoch": 1.5414165666266506, "grad_norm": 0.13037370145320892, "learning_rate": 3.0390291996907094e-05, "loss": 0.3191, "step": 5136 }, { "epoch": 1.5417166866746699, "grad_norm": 0.14184845983982086, "learning_rate": 3.0352697360176065e-05, "loss": 0.3401, "step": 5137 }, { "epoch": 1.542016806722689, "grad_norm": 0.14727821946144104, "learning_rate": 3.031512183038392e-05, "loss": 0.3251, "step": 5138 }, { "epoch": 1.5423169267707082, "grad_norm": 0.24418002367019653, "learning_rate": 3.0277565417839026e-05, "loss": 0.4121, "step": 5139 }, { "epoch": 1.5426170468187275, "grad_norm": 0.12005669623613358, "learning_rate": 3.0240028132844577e-05, "loss": 0.2846, "step": 5140 }, { "epoch": 1.5429171668667467, "grad_norm": 0.1330462247133255, "learning_rate": 3.0202509985698535e-05, "loss": 0.3317, "step": 5141 }, { "epoch": 1.543217286914766, "grad_norm": 0.1424010545015335, "learning_rate": 3.0165010986693543e-05, "loss": 0.3527, "step": 5142 }, { "epoch": 1.5435174069627853, "grad_norm": 0.14978627860546112, "learning_rate": 3.0127531146116948e-05, "loss": 0.3544, "step": 5143 }, { "epoch": 1.5438175270108043, "grad_norm": 0.1406516581773758, "learning_rate": 3.0090070474251053e-05, "loss": 0.3504, "step": 5144 }, { "epoch": 1.5441176470588234, "grad_norm": 0.2728155553340912, "learning_rate": 3.005262898137269e-05, "loss": 0.3539, "step": 5145 }, { "epoch": 1.5444177671068426, "grad_norm": 0.14986300468444824, "learning_rate": 3.0015206677753484e-05, "loss": 0.3957, "step": 5146 }, { "epoch": 1.544717887154862, "grad_norm": 0.13398940861225128, "learning_rate": 2.9977803573659834e-05, "loss": 0.3314, "step": 5147 }, { "epoch": 1.5450180072028812, "grad_norm": 0.13351598381996155, "learning_rate": 2.994041967935286e-05, "loss": 0.3343, "step": 5148 }, { "epoch": 1.5453181272509005, "grad_norm": 0.13882090151309967, "learning_rate": 2.990305500508843e-05, "loss": 0.3053, "step": 5149 }, { "epoch": 1.5456182472989197, "grad_norm": 0.16126897931098938, "learning_rate": 2.9865709561117093e-05, "loss": 0.3439, "step": 5150 }, { "epoch": 1.5459183673469388, "grad_norm": 0.13146162033081055, "learning_rate": 2.9828383357684098e-05, "loss": 0.3413, "step": 5151 }, { "epoch": 1.5462184873949578, "grad_norm": 0.1378421038389206, "learning_rate": 2.9791076405029506e-05, "loss": 0.3522, "step": 5152 }, { "epoch": 1.546518607442977, "grad_norm": 0.13887012004852295, "learning_rate": 2.9753788713388075e-05, "loss": 0.346, "step": 5153 }, { "epoch": 1.5468187274909964, "grad_norm": 0.14214332401752472, "learning_rate": 2.9716520292989202e-05, "loss": 0.3624, "step": 5154 }, { "epoch": 1.5471188475390156, "grad_norm": 0.1410626918077469, "learning_rate": 2.9679271154057065e-05, "loss": 0.3348, "step": 5155 }, { "epoch": 1.547418967587035, "grad_norm": 0.14501497149467468, "learning_rate": 2.9642041306810554e-05, "loss": 0.3366, "step": 5156 }, { "epoch": 1.5477190876350542, "grad_norm": 0.14532721042633057, "learning_rate": 2.9604830761463277e-05, "loss": 0.366, "step": 5157 }, { "epoch": 1.5480192076830732, "grad_norm": 0.13929975032806396, "learning_rate": 2.956763952822349e-05, "loss": 0.3473, "step": 5158 }, { "epoch": 1.5483193277310925, "grad_norm": 0.13484100997447968, "learning_rate": 2.9530467617294143e-05, "loss": 0.3353, "step": 5159 }, { "epoch": 1.5486194477791115, "grad_norm": 0.13206981122493744, "learning_rate": 2.949331503887296e-05, "loss": 0.3306, "step": 5160 }, { "epoch": 1.5489195678271308, "grad_norm": 0.1376311480998993, "learning_rate": 2.945618180315236e-05, "loss": 0.3328, "step": 5161 }, { "epoch": 1.54921968787515, "grad_norm": 0.14766620099544525, "learning_rate": 2.9419067920319343e-05, "loss": 0.36, "step": 5162 }, { "epoch": 1.5495198079231693, "grad_norm": 0.1360243707895279, "learning_rate": 2.9381973400555762e-05, "loss": 0.3476, "step": 5163 }, { "epoch": 1.5498199279711886, "grad_norm": 0.12841099500656128, "learning_rate": 2.9344898254038e-05, "loss": 0.2991, "step": 5164 }, { "epoch": 1.5501200480192077, "grad_norm": 0.14261536300182343, "learning_rate": 2.9307842490937232e-05, "loss": 0.3875, "step": 5165 }, { "epoch": 1.550420168067227, "grad_norm": 0.14344365894794464, "learning_rate": 2.9270806121419304e-05, "loss": 0.3349, "step": 5166 }, { "epoch": 1.550720288115246, "grad_norm": 0.1426357924938202, "learning_rate": 2.9233789155644663e-05, "loss": 0.3446, "step": 5167 }, { "epoch": 1.5510204081632653, "grad_norm": 0.13309688866138458, "learning_rate": 2.9196791603768514e-05, "loss": 0.3017, "step": 5168 }, { "epoch": 1.5513205282112845, "grad_norm": 0.1456434279680252, "learning_rate": 2.9159813475940756e-05, "loss": 0.3558, "step": 5169 }, { "epoch": 1.5516206482593038, "grad_norm": 0.14691193401813507, "learning_rate": 2.9122854782305853e-05, "loss": 0.3781, "step": 5170 }, { "epoch": 1.551920768307323, "grad_norm": 0.13335365056991577, "learning_rate": 2.9085915533003037e-05, "loss": 0.3391, "step": 5171 }, { "epoch": 1.552220888355342, "grad_norm": 0.125112846493721, "learning_rate": 2.904899573816613e-05, "loss": 0.2738, "step": 5172 }, { "epoch": 1.5525210084033614, "grad_norm": 0.1315487027168274, "learning_rate": 2.9012095407923677e-05, "loss": 0.3344, "step": 5173 }, { "epoch": 1.5528211284513804, "grad_norm": 0.13965719938278198, "learning_rate": 2.8975214552398888e-05, "loss": 0.3801, "step": 5174 }, { "epoch": 1.5531212484993997, "grad_norm": 0.16344398260116577, "learning_rate": 2.8938353181709576e-05, "loss": 0.3921, "step": 5175 }, { "epoch": 1.553421368547419, "grad_norm": 0.12906277179718018, "learning_rate": 2.89015113059682e-05, "loss": 0.3302, "step": 5176 }, { "epoch": 1.5537214885954382, "grad_norm": 0.12793534994125366, "learning_rate": 2.8864688935281948e-05, "loss": 0.3145, "step": 5177 }, { "epoch": 1.5540216086434575, "grad_norm": 0.13471034169197083, "learning_rate": 2.8827886079752598e-05, "loss": 0.3614, "step": 5178 }, { "epoch": 1.5543217286914766, "grad_norm": 0.14348182082176208, "learning_rate": 2.879110274947664e-05, "loss": 0.3566, "step": 5179 }, { "epoch": 1.5546218487394958, "grad_norm": 0.1379540115594864, "learning_rate": 2.8754338954545078e-05, "loss": 0.3603, "step": 5180 }, { "epoch": 1.5549219687875149, "grad_norm": 0.1349738985300064, "learning_rate": 2.8717594705043694e-05, "loss": 0.3629, "step": 5181 }, { "epoch": 1.5552220888355341, "grad_norm": 0.14187569916248322, "learning_rate": 2.868087001105285e-05, "loss": 0.3665, "step": 5182 }, { "epoch": 1.5555222088835534, "grad_norm": 0.1274200677871704, "learning_rate": 2.864416488264755e-05, "loss": 0.308, "step": 5183 }, { "epoch": 1.5558223289315727, "grad_norm": 0.1288219839334488, "learning_rate": 2.8607479329897367e-05, "loss": 0.3393, "step": 5184 }, { "epoch": 1.556122448979592, "grad_norm": 0.1553051769733429, "learning_rate": 2.85708133628666e-05, "loss": 0.3391, "step": 5185 }, { "epoch": 1.556422569027611, "grad_norm": 0.13538624346256256, "learning_rate": 2.8534166991614142e-05, "loss": 0.3562, "step": 5186 }, { "epoch": 1.5567226890756303, "grad_norm": 0.14043527841567993, "learning_rate": 2.849754022619352e-05, "loss": 0.3662, "step": 5187 }, { "epoch": 1.5570228091236493, "grad_norm": 0.13456833362579346, "learning_rate": 2.8460933076652864e-05, "loss": 0.3468, "step": 5188 }, { "epoch": 1.5573229291716686, "grad_norm": 0.1411183923482895, "learning_rate": 2.8424345553034836e-05, "loss": 0.3789, "step": 5189 }, { "epoch": 1.5576230492196879, "grad_norm": 0.1420077085494995, "learning_rate": 2.8387777665376947e-05, "loss": 0.3526, "step": 5190 }, { "epoch": 1.5579231692677071, "grad_norm": 0.14961381256580353, "learning_rate": 2.83512294237111e-05, "loss": 0.3393, "step": 5191 }, { "epoch": 1.5582232893157264, "grad_norm": 0.14596201479434967, "learning_rate": 2.8314700838063866e-05, "loss": 0.303, "step": 5192 }, { "epoch": 1.5585234093637454, "grad_norm": 0.14556051790714264, "learning_rate": 2.8278191918456475e-05, "loss": 0.3635, "step": 5193 }, { "epoch": 1.5588235294117647, "grad_norm": 0.1302669495344162, "learning_rate": 2.8241702674904756e-05, "loss": 0.3203, "step": 5194 }, { "epoch": 1.5591236494597838, "grad_norm": 0.13536348938941956, "learning_rate": 2.820523311741906e-05, "loss": 0.3282, "step": 5195 }, { "epoch": 1.559423769507803, "grad_norm": 0.16835986077785492, "learning_rate": 2.816878325600444e-05, "loss": 0.4177, "step": 5196 }, { "epoch": 1.5597238895558223, "grad_norm": 0.14837545156478882, "learning_rate": 2.8132353100660447e-05, "loss": 0.3293, "step": 5197 }, { "epoch": 1.5600240096038416, "grad_norm": 0.14597195386886597, "learning_rate": 2.8095942661381304e-05, "loss": 0.337, "step": 5198 }, { "epoch": 1.5603241296518608, "grad_norm": 0.13168051838874817, "learning_rate": 2.8059551948155827e-05, "loss": 0.336, "step": 5199 }, { "epoch": 1.5606242496998801, "grad_norm": 0.12910909950733185, "learning_rate": 2.8023180970967333e-05, "loss": 0.3251, "step": 5200 }, { "epoch": 1.5609243697478992, "grad_norm": 0.12473164498806, "learning_rate": 2.798682973979384e-05, "loss": 0.2888, "step": 5201 }, { "epoch": 1.5612244897959182, "grad_norm": 0.1480257660150528, "learning_rate": 2.7950498264607828e-05, "loss": 0.3411, "step": 5202 }, { "epoch": 1.5615246098439375, "grad_norm": 0.13709281384944916, "learning_rate": 2.7914186555376464e-05, "loss": 0.3476, "step": 5203 }, { "epoch": 1.5618247298919568, "grad_norm": 0.1475631445646286, "learning_rate": 2.7877894622061474e-05, "loss": 0.3519, "step": 5204 }, { "epoch": 1.562124849939976, "grad_norm": 0.15101012587547302, "learning_rate": 2.7841622474619057e-05, "loss": 0.3564, "step": 5205 }, { "epoch": 1.5624249699879953, "grad_norm": 0.11792006343603134, "learning_rate": 2.780537012300011e-05, "loss": 0.2761, "step": 5206 }, { "epoch": 1.5627250900360146, "grad_norm": 0.12644313275814056, "learning_rate": 2.7769137577150072e-05, "loss": 0.3149, "step": 5207 }, { "epoch": 1.5630252100840336, "grad_norm": 0.14322860538959503, "learning_rate": 2.7732924847008867e-05, "loss": 0.3762, "step": 5208 }, { "epoch": 1.5633253301320527, "grad_norm": 0.13497188687324524, "learning_rate": 2.769673194251111e-05, "loss": 0.3269, "step": 5209 }, { "epoch": 1.563625450180072, "grad_norm": 0.13116684556007385, "learning_rate": 2.766055887358584e-05, "loss": 0.3188, "step": 5210 }, { "epoch": 1.5639255702280912, "grad_norm": 0.1373387724161148, "learning_rate": 2.762440565015676e-05, "loss": 0.3245, "step": 5211 }, { "epoch": 1.5642256902761105, "grad_norm": 0.13331609964370728, "learning_rate": 2.7588272282142112e-05, "loss": 0.3355, "step": 5212 }, { "epoch": 1.5645258103241297, "grad_norm": 0.1577932983636856, "learning_rate": 2.755215877945465e-05, "loss": 0.391, "step": 5213 }, { "epoch": 1.564825930372149, "grad_norm": 0.13589797914028168, "learning_rate": 2.7516065152001634e-05, "loss": 0.3428, "step": 5214 }, { "epoch": 1.565126050420168, "grad_norm": 0.1299768090248108, "learning_rate": 2.747999140968507e-05, "loss": 0.3207, "step": 5215 }, { "epoch": 1.5654261704681873, "grad_norm": 0.13584931194782257, "learning_rate": 2.744393756240127e-05, "loss": 0.3261, "step": 5216 }, { "epoch": 1.5657262905162064, "grad_norm": 0.13659065961837769, "learning_rate": 2.7407903620041264e-05, "loss": 0.3554, "step": 5217 }, { "epoch": 1.5660264105642256, "grad_norm": 0.14530958235263824, "learning_rate": 2.7371889592490485e-05, "loss": 0.3719, "step": 5218 }, { "epoch": 1.566326530612245, "grad_norm": 0.14446115493774414, "learning_rate": 2.7335895489628994e-05, "loss": 0.3721, "step": 5219 }, { "epoch": 1.5666266506602642, "grad_norm": 0.1385437399148941, "learning_rate": 2.72999213213314e-05, "loss": 0.3531, "step": 5220 }, { "epoch": 1.5669267707082835, "grad_norm": 0.14599959552288055, "learning_rate": 2.7263967097466768e-05, "loss": 0.3797, "step": 5221 }, { "epoch": 1.5672268907563025, "grad_norm": 0.14255771040916443, "learning_rate": 2.72280328278987e-05, "loss": 0.3566, "step": 5222 }, { "epoch": 1.5675270108043218, "grad_norm": 0.40354663133621216, "learning_rate": 2.7192118522485378e-05, "loss": 0.3534, "step": 5223 }, { "epoch": 1.5678271308523408, "grad_norm": 0.12925659120082855, "learning_rate": 2.7156224191079515e-05, "loss": 0.3132, "step": 5224 }, { "epoch": 1.56812725090036, "grad_norm": 0.12465454638004303, "learning_rate": 2.712034984352825e-05, "loss": 0.3152, "step": 5225 }, { "epoch": 1.5684273709483794, "grad_norm": 0.1358371078968048, "learning_rate": 2.7084495489673346e-05, "loss": 0.3388, "step": 5226 }, { "epoch": 1.5687274909963986, "grad_norm": 0.17917902767658234, "learning_rate": 2.704866113935095e-05, "loss": 0.3481, "step": 5227 }, { "epoch": 1.569027611044418, "grad_norm": 0.1871613711118698, "learning_rate": 2.7012846802391935e-05, "loss": 0.342, "step": 5228 }, { "epoch": 1.569327731092437, "grad_norm": 0.16490772366523743, "learning_rate": 2.697705248862149e-05, "loss": 0.3537, "step": 5229 }, { "epoch": 1.5696278511404562, "grad_norm": 0.1259879767894745, "learning_rate": 2.6941278207859333e-05, "loss": 0.313, "step": 5230 }, { "epoch": 1.5699279711884753, "grad_norm": 0.1332831084728241, "learning_rate": 2.6905523969919767e-05, "loss": 0.32, "step": 5231 }, { "epoch": 1.5702280912364945, "grad_norm": 0.1395072638988495, "learning_rate": 2.6869789784611587e-05, "loss": 0.3319, "step": 5232 }, { "epoch": 1.5705282112845138, "grad_norm": 0.15206611156463623, "learning_rate": 2.683407566173799e-05, "loss": 0.3713, "step": 5233 }, { "epoch": 1.570828331332533, "grad_norm": 0.14162662625312805, "learning_rate": 2.679838161109681e-05, "loss": 0.3485, "step": 5234 }, { "epoch": 1.5711284513805523, "grad_norm": 0.1351427137851715, "learning_rate": 2.6762707642480223e-05, "loss": 0.2969, "step": 5235 }, { "epoch": 1.5714285714285714, "grad_norm": 0.13487915694713593, "learning_rate": 2.6727053765675024e-05, "loss": 0.3315, "step": 5236 }, { "epoch": 1.5717286914765907, "grad_norm": 0.1483466923236847, "learning_rate": 2.6691419990462465e-05, "loss": 0.3556, "step": 5237 }, { "epoch": 1.5720288115246097, "grad_norm": 0.12802277505397797, "learning_rate": 2.6655806326618194e-05, "loss": 0.3064, "step": 5238 }, { "epoch": 1.572328931572629, "grad_norm": 0.13978759944438934, "learning_rate": 2.6620212783912478e-05, "loss": 0.3361, "step": 5239 }, { "epoch": 1.5726290516206483, "grad_norm": 0.152753084897995, "learning_rate": 2.6584639372109942e-05, "loss": 0.3462, "step": 5240 }, { "epoch": 1.5729291716686675, "grad_norm": 0.14378800988197327, "learning_rate": 2.6549086100969768e-05, "loss": 0.3703, "step": 5241 }, { "epoch": 1.5732292917166868, "grad_norm": 0.132570281624794, "learning_rate": 2.6513552980245628e-05, "loss": 0.3278, "step": 5242 }, { "epoch": 1.5735294117647058, "grad_norm": 0.13974635303020477, "learning_rate": 2.6478040019685556e-05, "loss": 0.3537, "step": 5243 }, { "epoch": 1.5738295318127251, "grad_norm": 0.14354409277439117, "learning_rate": 2.6442547229032154e-05, "loss": 0.3446, "step": 5244 }, { "epoch": 1.5741296518607442, "grad_norm": 0.15387193858623505, "learning_rate": 2.6407074618022508e-05, "loss": 0.3818, "step": 5245 }, { "epoch": 1.5744297719087634, "grad_norm": 0.14714168012142181, "learning_rate": 2.6371622196388045e-05, "loss": 0.3008, "step": 5246 }, { "epoch": 1.5747298919567827, "grad_norm": 0.1442662477493286, "learning_rate": 2.6336189973854796e-05, "loss": 0.3339, "step": 5247 }, { "epoch": 1.575030012004802, "grad_norm": 0.12938161194324493, "learning_rate": 2.630077796014312e-05, "loss": 0.3124, "step": 5248 }, { "epoch": 1.5753301320528212, "grad_norm": 0.15024563670158386, "learning_rate": 2.6265386164967943e-05, "loss": 0.3416, "step": 5249 }, { "epoch": 1.5756302521008403, "grad_norm": 0.13507284224033356, "learning_rate": 2.623001459803861e-05, "loss": 0.3314, "step": 5250 }, { "epoch": 1.5759303721488596, "grad_norm": 0.14251956343650818, "learning_rate": 2.6194663269058885e-05, "loss": 0.3278, "step": 5251 }, { "epoch": 1.5762304921968786, "grad_norm": 0.1405022144317627, "learning_rate": 2.6159332187726936e-05, "loss": 0.3475, "step": 5252 }, { "epoch": 1.5765306122448979, "grad_norm": 0.13790111243724823, "learning_rate": 2.612402136373555e-05, "loss": 0.3562, "step": 5253 }, { "epoch": 1.5768307322929171, "grad_norm": 0.15381157398223877, "learning_rate": 2.608873080677181e-05, "loss": 0.3815, "step": 5254 }, { "epoch": 1.5771308523409364, "grad_norm": 0.13172820210456848, "learning_rate": 2.6053460526517236e-05, "loss": 0.2976, "step": 5255 }, { "epoch": 1.5774309723889557, "grad_norm": 0.13020232319831848, "learning_rate": 2.6018210532647848e-05, "loss": 0.3058, "step": 5256 }, { "epoch": 1.5777310924369747, "grad_norm": 0.13710160553455353, "learning_rate": 2.598298083483408e-05, "loss": 0.3421, "step": 5257 }, { "epoch": 1.578031212484994, "grad_norm": 0.138636976480484, "learning_rate": 2.594777144274083e-05, "loss": 0.3448, "step": 5258 }, { "epoch": 1.578331332533013, "grad_norm": 0.15129081904888153, "learning_rate": 2.591258236602736e-05, "loss": 0.3643, "step": 5259 }, { "epoch": 1.5786314525810323, "grad_norm": 0.1381225436925888, "learning_rate": 2.5877413614347358e-05, "loss": 0.3301, "step": 5260 }, { "epoch": 1.5789315726290516, "grad_norm": 0.1401417851448059, "learning_rate": 2.5842265197348993e-05, "loss": 0.3597, "step": 5261 }, { "epoch": 1.5792316926770709, "grad_norm": 0.13018085062503815, "learning_rate": 2.5807137124674864e-05, "loss": 0.322, "step": 5262 }, { "epoch": 1.5795318127250901, "grad_norm": 0.13071636855602264, "learning_rate": 2.5772029405961895e-05, "loss": 0.338, "step": 5263 }, { "epoch": 1.5798319327731094, "grad_norm": 0.1339547485113144, "learning_rate": 2.573694205084155e-05, "loss": 0.3647, "step": 5264 }, { "epoch": 1.5801320528211285, "grad_norm": 0.13119250535964966, "learning_rate": 2.5701875068939585e-05, "loss": 0.3234, "step": 5265 }, { "epoch": 1.5804321728691475, "grad_norm": 0.1344720870256424, "learning_rate": 2.5666828469876247e-05, "loss": 0.3265, "step": 5266 }, { "epoch": 1.5807322929171668, "grad_norm": 0.14399132132530212, "learning_rate": 2.563180226326619e-05, "loss": 0.3714, "step": 5267 }, { "epoch": 1.581032412965186, "grad_norm": 0.15395571291446686, "learning_rate": 2.559679645871842e-05, "loss": 0.3698, "step": 5268 }, { "epoch": 1.5813325330132053, "grad_norm": 0.12918394804000854, "learning_rate": 2.5561811065836384e-05, "loss": 0.3321, "step": 5269 }, { "epoch": 1.5816326530612246, "grad_norm": 0.1321277916431427, "learning_rate": 2.5526846094217948e-05, "loss": 0.3496, "step": 5270 }, { "epoch": 1.5819327731092439, "grad_norm": 0.1286695897579193, "learning_rate": 2.549190155345532e-05, "loss": 0.3039, "step": 5271 }, { "epoch": 1.582232893157263, "grad_norm": 0.11904725432395935, "learning_rate": 2.5456977453135167e-05, "loss": 0.2819, "step": 5272 }, { "epoch": 1.582533013205282, "grad_norm": 0.12723879516124725, "learning_rate": 2.5422073802838476e-05, "loss": 0.3384, "step": 5273 }, { "epoch": 1.5828331332533012, "grad_norm": 0.136310875415802, "learning_rate": 2.5387190612140678e-05, "loss": 0.3584, "step": 5274 }, { "epoch": 1.5831332533013205, "grad_norm": 0.20283541083335876, "learning_rate": 2.5352327890611605e-05, "loss": 0.34, "step": 5275 }, { "epoch": 1.5834333733493398, "grad_norm": 0.13946253061294556, "learning_rate": 2.5317485647815398e-05, "loss": 0.3577, "step": 5276 }, { "epoch": 1.583733493397359, "grad_norm": 0.12818807363510132, "learning_rate": 2.5282663893310643e-05, "loss": 0.311, "step": 5277 }, { "epoch": 1.5840336134453783, "grad_norm": 0.13578660786151886, "learning_rate": 2.524786263665033e-05, "loss": 0.3456, "step": 5278 }, { "epoch": 1.5843337334933973, "grad_norm": 0.14931534230709076, "learning_rate": 2.521308188738173e-05, "loss": 0.3522, "step": 5279 }, { "epoch": 1.5846338535414166, "grad_norm": 0.13508422672748566, "learning_rate": 2.5178321655046577e-05, "loss": 0.3234, "step": 5280 }, { "epoch": 1.5849339735894357, "grad_norm": 0.18868158757686615, "learning_rate": 2.5143581949180915e-05, "loss": 0.342, "step": 5281 }, { "epoch": 1.585234093637455, "grad_norm": 0.16816258430480957, "learning_rate": 2.510886277931519e-05, "loss": 0.4059, "step": 5282 }, { "epoch": 1.5855342136854742, "grad_norm": 0.26136964559555054, "learning_rate": 2.5074164154974245e-05, "loss": 0.3438, "step": 5283 }, { "epoch": 1.5858343337334935, "grad_norm": 0.17227911949157715, "learning_rate": 2.503948608567722e-05, "loss": 0.3629, "step": 5284 }, { "epoch": 1.5861344537815127, "grad_norm": 0.1345667839050293, "learning_rate": 2.5004828580937613e-05, "loss": 0.35, "step": 5285 }, { "epoch": 1.5864345738295318, "grad_norm": 0.13711421191692352, "learning_rate": 2.4970191650263354e-05, "loss": 0.33, "step": 5286 }, { "epoch": 1.586734693877551, "grad_norm": 0.14105214178562164, "learning_rate": 2.4935575303156677e-05, "loss": 0.3553, "step": 5287 }, { "epoch": 1.58703481392557, "grad_norm": 0.14076834917068481, "learning_rate": 2.490097954911421e-05, "loss": 0.3633, "step": 5288 }, { "epoch": 1.5873349339735894, "grad_norm": 0.17734204232692719, "learning_rate": 2.4866404397626885e-05, "loss": 0.3428, "step": 5289 }, { "epoch": 1.5876350540216086, "grad_norm": 0.12258309870958328, "learning_rate": 2.4831849858179913e-05, "loss": 0.3083, "step": 5290 }, { "epoch": 1.587935174069628, "grad_norm": 0.13976798951625824, "learning_rate": 2.4797315940253075e-05, "loss": 0.3542, "step": 5291 }, { "epoch": 1.5882352941176472, "grad_norm": 0.141945019364357, "learning_rate": 2.4762802653320295e-05, "loss": 0.3841, "step": 5292 }, { "epoch": 1.5885354141656662, "grad_norm": 0.1637776494026184, "learning_rate": 2.4728310006849863e-05, "loss": 0.3916, "step": 5293 }, { "epoch": 1.5888355342136855, "grad_norm": 0.1532609760761261, "learning_rate": 2.4693838010304472e-05, "loss": 0.3856, "step": 5294 }, { "epoch": 1.5891356542617046, "grad_norm": 0.1620074063539505, "learning_rate": 2.4659386673141137e-05, "loss": 0.4465, "step": 5295 }, { "epoch": 1.5894357743097238, "grad_norm": 0.1464465707540512, "learning_rate": 2.462495600481115e-05, "loss": 0.3593, "step": 5296 }, { "epoch": 1.589735894357743, "grad_norm": 0.19453909993171692, "learning_rate": 2.4590546014760217e-05, "loss": 0.462, "step": 5297 }, { "epoch": 1.5900360144057624, "grad_norm": 0.12688057124614716, "learning_rate": 2.455615671242827e-05, "loss": 0.3361, "step": 5298 }, { "epoch": 1.5903361344537816, "grad_norm": 0.12854287028312683, "learning_rate": 2.452178810724963e-05, "loss": 0.3359, "step": 5299 }, { "epoch": 1.5906362545018007, "grad_norm": 0.15983322262763977, "learning_rate": 2.448744020865299e-05, "loss": 0.3733, "step": 5300 }, { "epoch": 1.59093637454982, "grad_norm": 0.1243366226553917, "learning_rate": 2.4453113026061225e-05, "loss": 0.3484, "step": 5301 }, { "epoch": 1.591236494597839, "grad_norm": 0.12155354768037796, "learning_rate": 2.4418806568891638e-05, "loss": 0.35, "step": 5302 }, { "epoch": 1.5915366146458583, "grad_norm": 0.10961625725030899, "learning_rate": 2.4384520846555835e-05, "loss": 0.2637, "step": 5303 }, { "epoch": 1.5918367346938775, "grad_norm": 0.1370275318622589, "learning_rate": 2.435025586845966e-05, "loss": 0.3819, "step": 5304 }, { "epoch": 1.5921368547418968, "grad_norm": 0.13036561012268066, "learning_rate": 2.4316011644003367e-05, "loss": 0.3608, "step": 5305 }, { "epoch": 1.592436974789916, "grad_norm": 0.12635314464569092, "learning_rate": 2.4281788182581424e-05, "loss": 0.3476, "step": 5306 }, { "epoch": 1.5927370948379351, "grad_norm": 0.14407332241535187, "learning_rate": 2.424758549358266e-05, "loss": 0.4149, "step": 5307 }, { "epoch": 1.5930372148859544, "grad_norm": 0.12928353250026703, "learning_rate": 2.4213403586390216e-05, "loss": 0.347, "step": 5308 }, { "epoch": 1.5933373349339734, "grad_norm": 0.19450713694095612, "learning_rate": 2.4179242470381457e-05, "loss": 0.3485, "step": 5309 }, { "epoch": 1.5936374549819927, "grad_norm": 0.13167737424373627, "learning_rate": 2.4145102154928156e-05, "loss": 0.3479, "step": 5310 }, { "epoch": 1.593937575030012, "grad_norm": 0.14388799667358398, "learning_rate": 2.411098264939625e-05, "loss": 0.3537, "step": 5311 }, { "epoch": 1.5942376950780313, "grad_norm": 0.13148075342178345, "learning_rate": 2.407688396314607e-05, "loss": 0.3536, "step": 5312 }, { "epoch": 1.5945378151260505, "grad_norm": 0.1360936015844345, "learning_rate": 2.4042806105532224e-05, "loss": 0.3769, "step": 5313 }, { "epoch": 1.5948379351740696, "grad_norm": 0.1445259004831314, "learning_rate": 2.4008749085903547e-05, "loss": 0.4002, "step": 5314 }, { "epoch": 1.5951380552220888, "grad_norm": 0.1379500776529312, "learning_rate": 2.3974712913603136e-05, "loss": 0.3729, "step": 5315 }, { "epoch": 1.595438175270108, "grad_norm": 0.13159777224063873, "learning_rate": 2.3940697597968555e-05, "loss": 0.333, "step": 5316 }, { "epoch": 1.5957382953181272, "grad_norm": 0.14502441883087158, "learning_rate": 2.390670314833142e-05, "loss": 0.3638, "step": 5317 }, { "epoch": 1.5960384153661464, "grad_norm": 0.13807806372642517, "learning_rate": 2.387272957401777e-05, "loss": 0.3774, "step": 5318 }, { "epoch": 1.5963385354141657, "grad_norm": 0.1350492238998413, "learning_rate": 2.3838776884347812e-05, "loss": 0.3455, "step": 5319 }, { "epoch": 1.596638655462185, "grad_norm": 0.14351780712604523, "learning_rate": 2.380484508863611e-05, "loss": 0.3827, "step": 5320 }, { "epoch": 1.5969387755102042, "grad_norm": 0.14616893231868744, "learning_rate": 2.3770934196191485e-05, "loss": 0.3513, "step": 5321 }, { "epoch": 1.5972388955582233, "grad_norm": 0.12817350029945374, "learning_rate": 2.3737044216316972e-05, "loss": 0.3259, "step": 5322 }, { "epoch": 1.5975390156062423, "grad_norm": 0.13494904339313507, "learning_rate": 2.3703175158309887e-05, "loss": 0.3648, "step": 5323 }, { "epoch": 1.5978391356542616, "grad_norm": 0.12941431999206543, "learning_rate": 2.366932703146182e-05, "loss": 0.3253, "step": 5324 }, { "epoch": 1.5981392557022809, "grad_norm": 0.1348377913236618, "learning_rate": 2.363549984505864e-05, "loss": 0.3704, "step": 5325 }, { "epoch": 1.5984393757503002, "grad_norm": 0.13204167783260345, "learning_rate": 2.360169360838046e-05, "loss": 0.3487, "step": 5326 }, { "epoch": 1.5987394957983194, "grad_norm": 0.13480515778064728, "learning_rate": 2.3567908330701582e-05, "loss": 0.3632, "step": 5327 }, { "epoch": 1.5990396158463387, "grad_norm": 0.13230575621128082, "learning_rate": 2.353414402129064e-05, "loss": 0.3195, "step": 5328 }, { "epoch": 1.5993397358943577, "grad_norm": 0.13184982538223267, "learning_rate": 2.3500400689410507e-05, "loss": 0.3509, "step": 5329 }, { "epoch": 1.5996398559423768, "grad_norm": 0.13251787424087524, "learning_rate": 2.346667834431826e-05, "loss": 0.3497, "step": 5330 }, { "epoch": 1.599939975990396, "grad_norm": 0.14108306169509888, "learning_rate": 2.343297699526521e-05, "loss": 0.3865, "step": 5331 }, { "epoch": 1.6002400960384153, "grad_norm": 0.13187861442565918, "learning_rate": 2.339929665149695e-05, "loss": 0.3317, "step": 5332 }, { "epoch": 1.6005402160864346, "grad_norm": 0.13160671293735504, "learning_rate": 2.3365637322253343e-05, "loss": 0.339, "step": 5333 }, { "epoch": 1.6008403361344539, "grad_norm": 0.14250072836875916, "learning_rate": 2.333199901676837e-05, "loss": 0.3684, "step": 5334 }, { "epoch": 1.6011404561824731, "grad_norm": 0.1388656049966812, "learning_rate": 2.329838174427037e-05, "loss": 0.3613, "step": 5335 }, { "epoch": 1.6014405762304922, "grad_norm": 0.1267981380224228, "learning_rate": 2.32647855139818e-05, "loss": 0.3245, "step": 5336 }, { "epoch": 1.6017406962785115, "grad_norm": 0.13511812686920166, "learning_rate": 2.3231210335119447e-05, "loss": 0.3543, "step": 5337 }, { "epoch": 1.6020408163265305, "grad_norm": 0.12260711938142776, "learning_rate": 2.319765621689428e-05, "loss": 0.3222, "step": 5338 }, { "epoch": 1.6023409363745498, "grad_norm": 0.12926319241523743, "learning_rate": 2.3164123168511452e-05, "loss": 0.3285, "step": 5339 }, { "epoch": 1.602641056422569, "grad_norm": 0.1304606795310974, "learning_rate": 2.3130611199170384e-05, "loss": 0.3303, "step": 5340 }, { "epoch": 1.6029411764705883, "grad_norm": 0.1373075544834137, "learning_rate": 2.3097120318064725e-05, "loss": 0.3623, "step": 5341 }, { "epoch": 1.6032412965186076, "grad_norm": 0.14596083760261536, "learning_rate": 2.3063650534382265e-05, "loss": 0.3451, "step": 5342 }, { "epoch": 1.6035414165666266, "grad_norm": 0.13206394016742706, "learning_rate": 2.3030201857305124e-05, "loss": 0.3309, "step": 5343 }, { "epoch": 1.603841536614646, "grad_norm": 0.13339190185070038, "learning_rate": 2.2996774296009482e-05, "loss": 0.3646, "step": 5344 }, { "epoch": 1.604141656662665, "grad_norm": 0.12648890912532806, "learning_rate": 2.296336785966585e-05, "loss": 0.3096, "step": 5345 }, { "epoch": 1.6044417767106842, "grad_norm": 0.12503592669963837, "learning_rate": 2.2929982557438935e-05, "loss": 0.3232, "step": 5346 }, { "epoch": 1.6047418967587035, "grad_norm": 0.1373262256383896, "learning_rate": 2.2896618398487534e-05, "loss": 0.373, "step": 5347 }, { "epoch": 1.6050420168067228, "grad_norm": 0.13667701184749603, "learning_rate": 2.28632753919648e-05, "loss": 0.3287, "step": 5348 }, { "epoch": 1.605342136854742, "grad_norm": 0.15100489556789398, "learning_rate": 2.2829953547017945e-05, "loss": 0.4028, "step": 5349 }, { "epoch": 1.605642256902761, "grad_norm": 0.130011647939682, "learning_rate": 2.2796652872788448e-05, "loss": 0.3171, "step": 5350 }, { "epoch": 1.6059423769507803, "grad_norm": 0.13675802946090698, "learning_rate": 2.2763373378412002e-05, "loss": 0.3716, "step": 5351 }, { "epoch": 1.6062424969987994, "grad_norm": 0.13907109200954437, "learning_rate": 2.2730115073018433e-05, "loss": 0.3369, "step": 5352 }, { "epoch": 1.6065426170468187, "grad_norm": 0.138726145029068, "learning_rate": 2.2696877965731723e-05, "loss": 0.3643, "step": 5353 }, { "epoch": 1.606842737094838, "grad_norm": 0.12959453463554382, "learning_rate": 2.2663662065670187e-05, "loss": 0.3278, "step": 5354 }, { "epoch": 1.6071428571428572, "grad_norm": 0.14672960340976715, "learning_rate": 2.2630467381946152e-05, "loss": 0.3871, "step": 5355 }, { "epoch": 1.6074429771908765, "grad_norm": 0.12879501283168793, "learning_rate": 2.2597293923666263e-05, "loss": 0.3165, "step": 5356 }, { "epoch": 1.6077430972388955, "grad_norm": 0.13967733085155487, "learning_rate": 2.2564141699931207e-05, "loss": 0.3689, "step": 5357 }, { "epoch": 1.6080432172869148, "grad_norm": 0.1460617035627365, "learning_rate": 2.2531010719835943e-05, "loss": 0.3635, "step": 5358 }, { "epoch": 1.6083433373349338, "grad_norm": 0.1329815536737442, "learning_rate": 2.2497900992469623e-05, "loss": 0.3449, "step": 5359 }, { "epoch": 1.6086434573829531, "grad_norm": 0.12362521886825562, "learning_rate": 2.246481252691548e-05, "loss": 0.3257, "step": 5360 }, { "epoch": 1.6089435774309724, "grad_norm": 0.14099068939685822, "learning_rate": 2.243174533225092e-05, "loss": 0.3651, "step": 5361 }, { "epoch": 1.6092436974789917, "grad_norm": 0.1269274204969406, "learning_rate": 2.23986994175476e-05, "loss": 0.3161, "step": 5362 }, { "epoch": 1.609543817527011, "grad_norm": 0.16197726130485535, "learning_rate": 2.2365674791871282e-05, "loss": 0.3528, "step": 5363 }, { "epoch": 1.60984393757503, "grad_norm": 0.13220979273319244, "learning_rate": 2.2332671464281863e-05, "loss": 0.3522, "step": 5364 }, { "epoch": 1.6101440576230492, "grad_norm": 0.12421213090419769, "learning_rate": 2.229968944383346e-05, "loss": 0.3048, "step": 5365 }, { "epoch": 1.6104441776710683, "grad_norm": 0.13302136957645416, "learning_rate": 2.2266728739574283e-05, "loss": 0.3266, "step": 5366 }, { "epoch": 1.6107442977190876, "grad_norm": 0.1288372278213501, "learning_rate": 2.2233789360546788e-05, "loss": 0.3034, "step": 5367 }, { "epoch": 1.6110444177671068, "grad_norm": 0.19321192800998688, "learning_rate": 2.2200871315787452e-05, "loss": 0.3439, "step": 5368 }, { "epoch": 1.611344537815126, "grad_norm": 0.15680034458637238, "learning_rate": 2.216797461432696e-05, "loss": 0.3291, "step": 5369 }, { "epoch": 1.6116446578631454, "grad_norm": 0.13714157044887543, "learning_rate": 2.213509926519016e-05, "loss": 0.3425, "step": 5370 }, { "epoch": 1.6119447779111644, "grad_norm": 0.12285466492176056, "learning_rate": 2.2102245277396073e-05, "loss": 0.2918, "step": 5371 }, { "epoch": 1.6122448979591837, "grad_norm": 0.1348283290863037, "learning_rate": 2.2069412659957734e-05, "loss": 0.3331, "step": 5372 }, { "epoch": 1.6125450180072027, "grad_norm": 0.13425099849700928, "learning_rate": 2.2036601421882464e-05, "loss": 0.352, "step": 5373 }, { "epoch": 1.612845138055222, "grad_norm": 0.41682004928588867, "learning_rate": 2.2003811572171594e-05, "loss": 0.3343, "step": 5374 }, { "epoch": 1.6131452581032413, "grad_norm": 0.14027799665927887, "learning_rate": 2.1971043119820665e-05, "loss": 0.3461, "step": 5375 }, { "epoch": 1.6134453781512605, "grad_norm": 0.2078617662191391, "learning_rate": 2.1938296073819354e-05, "loss": 0.3625, "step": 5376 }, { "epoch": 1.6137454981992798, "grad_norm": 0.14351028203964233, "learning_rate": 2.1905570443151402e-05, "loss": 0.3744, "step": 5377 }, { "epoch": 1.614045618247299, "grad_norm": 0.13274571299552917, "learning_rate": 2.187286623679471e-05, "loss": 0.3238, "step": 5378 }, { "epoch": 1.6143457382953181, "grad_norm": 0.13354356586933136, "learning_rate": 2.184018346372134e-05, "loss": 0.3426, "step": 5379 }, { "epoch": 1.6146458583433372, "grad_norm": 0.13134168088436127, "learning_rate": 2.1807522132897383e-05, "loss": 0.3177, "step": 5380 }, { "epoch": 1.6149459783913565, "grad_norm": 0.13090597093105316, "learning_rate": 2.1774882253283168e-05, "loss": 0.338, "step": 5381 }, { "epoch": 1.6152460984393757, "grad_norm": 0.13348695635795593, "learning_rate": 2.1742263833832998e-05, "loss": 0.3206, "step": 5382 }, { "epoch": 1.615546218487395, "grad_norm": 0.17404712736606598, "learning_rate": 2.1709666883495395e-05, "loss": 0.3455, "step": 5383 }, { "epoch": 1.6158463385354143, "grad_norm": 0.1371351182460785, "learning_rate": 2.167709141121298e-05, "loss": 0.3503, "step": 5384 }, { "epoch": 1.6161464585834335, "grad_norm": 0.13850924372673035, "learning_rate": 2.1644537425922427e-05, "loss": 0.3404, "step": 5385 }, { "epoch": 1.6164465786314526, "grad_norm": 0.15199445188045502, "learning_rate": 2.1612004936554575e-05, "loss": 0.3823, "step": 5386 }, { "epoch": 1.6167466986794716, "grad_norm": 0.15341348946094513, "learning_rate": 2.1579493952034312e-05, "loss": 0.3871, "step": 5387 }, { "epoch": 1.617046818727491, "grad_norm": 0.13436566293239594, "learning_rate": 2.154700448128065e-05, "loss": 0.3408, "step": 5388 }, { "epoch": 1.6173469387755102, "grad_norm": 0.13752338290214539, "learning_rate": 2.1514536533206763e-05, "loss": 0.3459, "step": 5389 }, { "epoch": 1.6176470588235294, "grad_norm": 0.19831904768943787, "learning_rate": 2.148209011671979e-05, "loss": 0.3378, "step": 5390 }, { "epoch": 1.6179471788715487, "grad_norm": 0.13402965664863586, "learning_rate": 2.144966524072105e-05, "loss": 0.3408, "step": 5391 }, { "epoch": 1.618247298919568, "grad_norm": 0.12890419363975525, "learning_rate": 2.141726191410599e-05, "loss": 0.3318, "step": 5392 }, { "epoch": 1.618547418967587, "grad_norm": 0.13093030452728271, "learning_rate": 2.138488014576404e-05, "loss": 0.3107, "step": 5393 }, { "epoch": 1.6188475390156063, "grad_norm": 0.13020655512809753, "learning_rate": 2.135251994457874e-05, "loss": 0.3396, "step": 5394 }, { "epoch": 1.6191476590636253, "grad_norm": 0.12640948593616486, "learning_rate": 2.132018131942779e-05, "loss": 0.3196, "step": 5395 }, { "epoch": 1.6194477791116446, "grad_norm": 0.14091448485851288, "learning_rate": 2.128786427918289e-05, "loss": 0.3364, "step": 5396 }, { "epoch": 1.6197478991596639, "grad_norm": 0.13731209933757782, "learning_rate": 2.1255568832709904e-05, "loss": 0.323, "step": 5397 }, { "epoch": 1.6200480192076832, "grad_norm": 0.13610906898975372, "learning_rate": 2.122329498886868e-05, "loss": 0.3347, "step": 5398 }, { "epoch": 1.6203481392557024, "grad_norm": 0.14475463330745697, "learning_rate": 2.1191042756513114e-05, "loss": 0.3649, "step": 5399 }, { "epoch": 1.6206482593037215, "grad_norm": 0.13745839893817902, "learning_rate": 2.1158812144491357e-05, "loss": 0.3576, "step": 5400 }, { "epoch": 1.6209483793517407, "grad_norm": 0.14333461225032806, "learning_rate": 2.1126603161645454e-05, "loss": 0.3593, "step": 5401 }, { "epoch": 1.6212484993997598, "grad_norm": 0.2060752958059311, "learning_rate": 2.109441581681153e-05, "loss": 0.3303, "step": 5402 }, { "epoch": 1.621548619447779, "grad_norm": 0.13641700148582458, "learning_rate": 2.1062250118819847e-05, "loss": 0.3497, "step": 5403 }, { "epoch": 1.6218487394957983, "grad_norm": 0.14730916917324066, "learning_rate": 2.1030106076494726e-05, "loss": 0.3594, "step": 5404 }, { "epoch": 1.6221488595438176, "grad_norm": 0.12196728587150574, "learning_rate": 2.0997983698654467e-05, "loss": 0.3124, "step": 5405 }, { "epoch": 1.6224489795918369, "grad_norm": 0.15658676624298096, "learning_rate": 2.0965882994111517e-05, "loss": 0.3964, "step": 5406 }, { "epoch": 1.622749099639856, "grad_norm": 0.1642051339149475, "learning_rate": 2.0933803971672295e-05, "loss": 0.3354, "step": 5407 }, { "epoch": 1.6230492196878752, "grad_norm": 0.1441899985074997, "learning_rate": 2.0901746640137333e-05, "loss": 0.3449, "step": 5408 }, { "epoch": 1.6233493397358942, "grad_norm": 0.13454824686050415, "learning_rate": 2.086971100830122e-05, "loss": 0.3565, "step": 5409 }, { "epoch": 1.6236494597839135, "grad_norm": 0.13018536567687988, "learning_rate": 2.0837697084952503e-05, "loss": 0.3188, "step": 5410 }, { "epoch": 1.6239495798319328, "grad_norm": 0.13388392329216003, "learning_rate": 2.080570487887391e-05, "loss": 0.3501, "step": 5411 }, { "epoch": 1.624249699879952, "grad_norm": 0.1390429586172104, "learning_rate": 2.077373439884206e-05, "loss": 0.3354, "step": 5412 }, { "epoch": 1.6245498199279713, "grad_norm": 0.12056306004524231, "learning_rate": 2.074178565362772e-05, "loss": 0.3043, "step": 5413 }, { "epoch": 1.6248499399759904, "grad_norm": 0.13007420301437378, "learning_rate": 2.0709858651995695e-05, "loss": 0.3229, "step": 5414 }, { "epoch": 1.6251500600240096, "grad_norm": 0.1374148428440094, "learning_rate": 2.067795340270473e-05, "loss": 0.3537, "step": 5415 }, { "epoch": 1.6254501800720287, "grad_norm": 0.1289047747850418, "learning_rate": 2.0646069914507704e-05, "loss": 0.3079, "step": 5416 }, { "epoch": 1.625750300120048, "grad_norm": 0.14260342717170715, "learning_rate": 2.0614208196151508e-05, "loss": 0.3489, "step": 5417 }, { "epoch": 1.6260504201680672, "grad_norm": 0.14635786414146423, "learning_rate": 2.0582368256376972e-05, "loss": 0.3703, "step": 5418 }, { "epoch": 1.6263505402160865, "grad_norm": 0.13332021236419678, "learning_rate": 2.0550550103919087e-05, "loss": 0.3383, "step": 5419 }, { "epoch": 1.6266506602641058, "grad_norm": 0.130903959274292, "learning_rate": 2.0518753747506748e-05, "loss": 0.3348, "step": 5420 }, { "epoch": 1.6269507803121248, "grad_norm": 0.14831160008907318, "learning_rate": 2.048697919586292e-05, "loss": 0.3663, "step": 5421 }, { "epoch": 1.627250900360144, "grad_norm": 0.14494484663009644, "learning_rate": 2.0455226457704656e-05, "loss": 0.3611, "step": 5422 }, { "epoch": 1.6275510204081631, "grad_norm": 0.13702887296676636, "learning_rate": 2.0423495541742888e-05, "loss": 0.3142, "step": 5423 }, { "epoch": 1.6278511404561824, "grad_norm": 0.12333647906780243, "learning_rate": 2.0391786456682603e-05, "loss": 0.3311, "step": 5424 }, { "epoch": 1.6281512605042017, "grad_norm": 0.13059678673744202, "learning_rate": 2.0360099211222928e-05, "loss": 0.3353, "step": 5425 }, { "epoch": 1.628451380552221, "grad_norm": 0.21004725992679596, "learning_rate": 2.0328433814056803e-05, "loss": 0.343, "step": 5426 }, { "epoch": 1.6287515006002402, "grad_norm": 0.1576651930809021, "learning_rate": 2.0296790273871323e-05, "loss": 0.3474, "step": 5427 }, { "epoch": 1.6290516206482593, "grad_norm": 0.13524000346660614, "learning_rate": 2.0265168599347482e-05, "loss": 0.334, "step": 5428 }, { "epoch": 1.6293517406962785, "grad_norm": 0.1273518055677414, "learning_rate": 2.0233568799160364e-05, "loss": 0.3149, "step": 5429 }, { "epoch": 1.6296518607442976, "grad_norm": 0.126450315117836, "learning_rate": 2.0201990881979006e-05, "loss": 0.3019, "step": 5430 }, { "epoch": 1.6299519807923168, "grad_norm": 0.20898903906345367, "learning_rate": 2.0170434856466447e-05, "loss": 0.3572, "step": 5431 }, { "epoch": 1.6302521008403361, "grad_norm": 0.14011222124099731, "learning_rate": 2.0138900731279686e-05, "loss": 0.3595, "step": 5432 }, { "epoch": 1.6305522208883554, "grad_norm": 0.12649571895599365, "learning_rate": 2.010738851506977e-05, "loss": 0.3063, "step": 5433 }, { "epoch": 1.6308523409363747, "grad_norm": 0.1454174965620041, "learning_rate": 2.0075898216481746e-05, "loss": 0.37, "step": 5434 }, { "epoch": 1.6311524609843937, "grad_norm": 0.1352839320898056, "learning_rate": 2.0044429844154577e-05, "loss": 0.3335, "step": 5435 }, { "epoch": 1.631452581032413, "grad_norm": 0.1313468962907791, "learning_rate": 2.0012983406721274e-05, "loss": 0.3296, "step": 5436 }, { "epoch": 1.631752701080432, "grad_norm": 0.18670260906219482, "learning_rate": 1.9981558912808752e-05, "loss": 0.3488, "step": 5437 }, { "epoch": 1.6320528211284513, "grad_norm": 0.13672375679016113, "learning_rate": 1.9950156371038053e-05, "loss": 0.3515, "step": 5438 }, { "epoch": 1.6323529411764706, "grad_norm": 0.13667839765548706, "learning_rate": 1.9918775790024047e-05, "loss": 0.343, "step": 5439 }, { "epoch": 1.6326530612244898, "grad_norm": 0.14519430696964264, "learning_rate": 1.9887417178375633e-05, "loss": 0.3269, "step": 5440 }, { "epoch": 1.632953181272509, "grad_norm": 0.17133067548274994, "learning_rate": 1.9856080544695687e-05, "loss": 0.3153, "step": 5441 }, { "epoch": 1.6332533013205284, "grad_norm": 0.1435331255197525, "learning_rate": 1.98247658975811e-05, "loss": 0.3279, "step": 5442 }, { "epoch": 1.6335534213685474, "grad_norm": 0.13129855692386627, "learning_rate": 1.9793473245622616e-05, "loss": 0.3123, "step": 5443 }, { "epoch": 1.6338535414165665, "grad_norm": 0.13884615898132324, "learning_rate": 1.9762202597405088e-05, "loss": 0.331, "step": 5444 }, { "epoch": 1.6341536614645857, "grad_norm": 0.13685715198516846, "learning_rate": 1.9730953961507203e-05, "loss": 0.3496, "step": 5445 }, { "epoch": 1.634453781512605, "grad_norm": 0.3771587312221527, "learning_rate": 1.9699727346501674e-05, "loss": 0.3631, "step": 5446 }, { "epoch": 1.6347539015606243, "grad_norm": 0.13653336465358734, "learning_rate": 1.966852276095521e-05, "loss": 0.3424, "step": 5447 }, { "epoch": 1.6350540216086435, "grad_norm": 0.13888002932071686, "learning_rate": 1.9637340213428368e-05, "loss": 0.3464, "step": 5448 }, { "epoch": 1.6353541416566628, "grad_norm": 0.13981866836547852, "learning_rate": 1.960617971247579e-05, "loss": 0.3625, "step": 5449 }, { "epoch": 1.6356542617046819, "grad_norm": 0.38733047246932983, "learning_rate": 1.957504126664593e-05, "loss": 0.2653, "step": 5450 }, { "epoch": 1.635954381752701, "grad_norm": 0.20445388555526733, "learning_rate": 1.95439248844813e-05, "loss": 0.3469, "step": 5451 }, { "epoch": 1.6362545018007202, "grad_norm": 0.1441969871520996, "learning_rate": 1.9512830574518348e-05, "loss": 0.3691, "step": 5452 }, { "epoch": 1.6365546218487395, "grad_norm": 0.139201819896698, "learning_rate": 1.9481758345287383e-05, "loss": 0.3677, "step": 5453 }, { "epoch": 1.6368547418967587, "grad_norm": 0.13921897113323212, "learning_rate": 1.9450708205312762e-05, "loss": 0.3377, "step": 5454 }, { "epoch": 1.637154861944778, "grad_norm": 0.13050687313079834, "learning_rate": 1.941968016311273e-05, "loss": 0.2899, "step": 5455 }, { "epoch": 1.6374549819927973, "grad_norm": 0.12800630927085876, "learning_rate": 1.9388674227199443e-05, "loss": 0.2944, "step": 5456 }, { "epoch": 1.6377551020408163, "grad_norm": 0.14279204607009888, "learning_rate": 1.9357690406079076e-05, "loss": 0.3534, "step": 5457 }, { "epoch": 1.6380552220888356, "grad_norm": 0.1505601704120636, "learning_rate": 1.932672870825162e-05, "loss": 0.3562, "step": 5458 }, { "epoch": 1.6383553421368546, "grad_norm": 0.13510508835315704, "learning_rate": 1.929578914221111e-05, "loss": 0.3237, "step": 5459 }, { "epoch": 1.638655462184874, "grad_norm": 0.1296669989824295, "learning_rate": 1.9264871716445454e-05, "loss": 0.322, "step": 5460 }, { "epoch": 1.6389555822328932, "grad_norm": 0.13488991558551788, "learning_rate": 1.9233976439436495e-05, "loss": 0.3416, "step": 5461 }, { "epoch": 1.6392557022809124, "grad_norm": 0.14997981488704681, "learning_rate": 1.9203103319659942e-05, "loss": 0.3439, "step": 5462 }, { "epoch": 1.6395558223289317, "grad_norm": 0.13843125104904175, "learning_rate": 1.9172252365585574e-05, "loss": 0.3777, "step": 5463 }, { "epoch": 1.6398559423769508, "grad_norm": 0.12544837594032288, "learning_rate": 1.9141423585676953e-05, "loss": 0.2917, "step": 5464 }, { "epoch": 1.64015606242497, "grad_norm": 0.12239256501197815, "learning_rate": 1.9110616988391572e-05, "loss": 0.3061, "step": 5465 }, { "epoch": 1.640456182472989, "grad_norm": 0.15580035746097565, "learning_rate": 1.90798325821809e-05, "loss": 0.371, "step": 5466 }, { "epoch": 1.6407563025210083, "grad_norm": 0.12866289913654327, "learning_rate": 1.9049070375490273e-05, "loss": 0.3229, "step": 5467 }, { "epoch": 1.6410564225690276, "grad_norm": 0.13854321837425232, "learning_rate": 1.9018330376758997e-05, "loss": 0.3643, "step": 5468 }, { "epoch": 1.6413565426170469, "grad_norm": 0.13516482710838318, "learning_rate": 1.898761259442019e-05, "loss": 0.349, "step": 5469 }, { "epoch": 1.6416566626650662, "grad_norm": 0.15773414075374603, "learning_rate": 1.89569170369009e-05, "loss": 0.3718, "step": 5470 }, { "epoch": 1.6419567827130852, "grad_norm": 0.13902094960212708, "learning_rate": 1.892624371262215e-05, "loss": 0.3655, "step": 5471 }, { "epoch": 1.6422569027611045, "grad_norm": 0.12148105353116989, "learning_rate": 1.8895592629998814e-05, "loss": 0.2935, "step": 5472 }, { "epoch": 1.6425570228091235, "grad_norm": 0.1329748034477234, "learning_rate": 1.8864963797439617e-05, "loss": 0.3394, "step": 5473 }, { "epoch": 1.6428571428571428, "grad_norm": 0.15787284076213837, "learning_rate": 1.8834357223347297e-05, "loss": 0.3844, "step": 5474 }, { "epoch": 1.643157262905162, "grad_norm": 0.15091612935066223, "learning_rate": 1.8803772916118324e-05, "loss": 0.3612, "step": 5475 }, { "epoch": 1.6434573829531813, "grad_norm": 0.15188480913639069, "learning_rate": 1.8773210884143255e-05, "loss": 0.3882, "step": 5476 }, { "epoch": 1.6437575030012006, "grad_norm": 0.1658872812986374, "learning_rate": 1.87426711358064e-05, "loss": 0.344, "step": 5477 }, { "epoch": 1.6440576230492197, "grad_norm": 0.13719511032104492, "learning_rate": 1.8712153679485932e-05, "loss": 0.3691, "step": 5478 }, { "epoch": 1.644357743097239, "grad_norm": 0.1480136513710022, "learning_rate": 1.8681658523554025e-05, "loss": 0.3757, "step": 5479 }, { "epoch": 1.644657863145258, "grad_norm": 0.13319872319698334, "learning_rate": 1.865118567637667e-05, "loss": 0.3083, "step": 5480 }, { "epoch": 1.6449579831932772, "grad_norm": 0.13993017375469208, "learning_rate": 1.8620735146313705e-05, "loss": 0.3557, "step": 5481 }, { "epoch": 1.6452581032412965, "grad_norm": 0.14838139712810516, "learning_rate": 1.859030694171895e-05, "loss": 0.3484, "step": 5482 }, { "epoch": 1.6455582232893158, "grad_norm": 0.12882272899150848, "learning_rate": 1.8559901070939956e-05, "loss": 0.3198, "step": 5483 }, { "epoch": 1.645858343337335, "grad_norm": 0.14248614013195038, "learning_rate": 1.8529517542318265e-05, "loss": 0.3543, "step": 5484 }, { "epoch": 1.646158463385354, "grad_norm": 0.1413465291261673, "learning_rate": 1.8499156364189283e-05, "loss": 0.3405, "step": 5485 }, { "epoch": 1.6464585834333734, "grad_norm": 0.1345796436071396, "learning_rate": 1.8468817544882178e-05, "loss": 0.348, "step": 5486 }, { "epoch": 1.6467587034813924, "grad_norm": 0.13932184875011444, "learning_rate": 1.8438501092720105e-05, "loss": 0.3472, "step": 5487 }, { "epoch": 1.6470588235294117, "grad_norm": 0.1598397195339203, "learning_rate": 1.840820701602004e-05, "loss": 0.3611, "step": 5488 }, { "epoch": 1.647358943577431, "grad_norm": 0.14302033185958862, "learning_rate": 1.8377935323092788e-05, "loss": 0.3227, "step": 5489 }, { "epoch": 1.6476590636254502, "grad_norm": 0.12700681388378143, "learning_rate": 1.834768602224307e-05, "loss": 0.3124, "step": 5490 }, { "epoch": 1.6479591836734695, "grad_norm": 0.1239708662033081, "learning_rate": 1.83174591217694e-05, "loss": 0.2994, "step": 5491 }, { "epoch": 1.6482593037214885, "grad_norm": 0.13394364714622498, "learning_rate": 1.828725462996419e-05, "loss": 0.3269, "step": 5492 }, { "epoch": 1.6485594237695078, "grad_norm": 0.12652020156383514, "learning_rate": 1.825707255511374e-05, "loss": 0.3007, "step": 5493 }, { "epoch": 1.6488595438175269, "grad_norm": 0.1336975246667862, "learning_rate": 1.822691290549813e-05, "loss": 0.3347, "step": 5494 }, { "epoch": 1.6491596638655461, "grad_norm": 0.12572890520095825, "learning_rate": 1.8196775689391266e-05, "loss": 0.2905, "step": 5495 }, { "epoch": 1.6494597839135654, "grad_norm": 0.1284392923116684, "learning_rate": 1.8166660915060986e-05, "loss": 0.3219, "step": 5496 }, { "epoch": 1.6497599039615847, "grad_norm": 0.13893640041351318, "learning_rate": 1.8136568590768944e-05, "loss": 0.3341, "step": 5497 }, { "epoch": 1.650060024009604, "grad_norm": 0.1879337877035141, "learning_rate": 1.8106498724770638e-05, "loss": 0.3586, "step": 5498 }, { "epoch": 1.6503601440576232, "grad_norm": 0.14967003464698792, "learning_rate": 1.8076451325315368e-05, "loss": 0.3552, "step": 5499 }, { "epoch": 1.6506602641056423, "grad_norm": 0.14558528363704681, "learning_rate": 1.8046426400646244e-05, "loss": 0.3317, "step": 5500 }, { "epoch": 1.6509603841536613, "grad_norm": 0.12756778299808502, "learning_rate": 1.801642395900036e-05, "loss": 0.2754, "step": 5501 }, { "epoch": 1.6512605042016806, "grad_norm": 0.13232913613319397, "learning_rate": 1.7986444008608496e-05, "loss": 0.3194, "step": 5502 }, { "epoch": 1.6515606242496998, "grad_norm": 0.14271697402000427, "learning_rate": 1.7956486557695263e-05, "loss": 0.3397, "step": 5503 }, { "epoch": 1.6518607442977191, "grad_norm": 0.14435574412345886, "learning_rate": 1.7926551614479192e-05, "loss": 0.3668, "step": 5504 }, { "epoch": 1.6521608643457384, "grad_norm": 0.1720234900712967, "learning_rate": 1.789663918717258e-05, "loss": 0.3604, "step": 5505 }, { "epoch": 1.6524609843937577, "grad_norm": 0.13502709567546844, "learning_rate": 1.78667492839816e-05, "loss": 0.3402, "step": 5506 }, { "epoch": 1.6527611044417767, "grad_norm": 0.13042998313903809, "learning_rate": 1.7836881913106152e-05, "loss": 0.3042, "step": 5507 }, { "epoch": 1.6530612244897958, "grad_norm": 0.1390058696269989, "learning_rate": 1.7807037082739996e-05, "loss": 0.3705, "step": 5508 }, { "epoch": 1.653361344537815, "grad_norm": 0.1370200514793396, "learning_rate": 1.7777214801070752e-05, "loss": 0.3602, "step": 5509 }, { "epoch": 1.6536614645858343, "grad_norm": 0.12913063168525696, "learning_rate": 1.774741507627984e-05, "loss": 0.2908, "step": 5510 }, { "epoch": 1.6539615846338536, "grad_norm": 0.13181883096694946, "learning_rate": 1.7717637916542408e-05, "loss": 0.327, "step": 5511 }, { "epoch": 1.6542617046818728, "grad_norm": 0.13132743537425995, "learning_rate": 1.768788333002752e-05, "loss": 0.317, "step": 5512 }, { "epoch": 1.654561824729892, "grad_norm": 0.140101358294487, "learning_rate": 1.7658151324898033e-05, "loss": 0.3595, "step": 5513 }, { "epoch": 1.6548619447779112, "grad_norm": 0.13309207558631897, "learning_rate": 1.762844190931051e-05, "loss": 0.325, "step": 5514 }, { "epoch": 1.6551620648259304, "grad_norm": 0.14855670928955078, "learning_rate": 1.7598755091415474e-05, "loss": 0.3458, "step": 5515 }, { "epoch": 1.6554621848739495, "grad_norm": 0.15083813667297363, "learning_rate": 1.7569090879357077e-05, "loss": 0.3892, "step": 5516 }, { "epoch": 1.6557623049219687, "grad_norm": 0.12831264734268188, "learning_rate": 1.75394492812734e-05, "loss": 0.3063, "step": 5517 }, { "epoch": 1.656062424969988, "grad_norm": 0.14697401225566864, "learning_rate": 1.7509830305296304e-05, "loss": 0.3457, "step": 5518 }, { "epoch": 1.6563625450180073, "grad_norm": 0.13638198375701904, "learning_rate": 1.748023395955135e-05, "loss": 0.3427, "step": 5519 }, { "epoch": 1.6566626650660266, "grad_norm": 0.14705848693847656, "learning_rate": 1.7450660252158015e-05, "loss": 0.3866, "step": 5520 }, { "epoch": 1.6569627851140456, "grad_norm": 0.14067374169826508, "learning_rate": 1.7421109191229458e-05, "loss": 0.341, "step": 5521 }, { "epoch": 1.6572629051620649, "grad_norm": 0.13154152035713196, "learning_rate": 1.7391580784872696e-05, "loss": 0.3297, "step": 5522 }, { "epoch": 1.657563025210084, "grad_norm": 0.12965817749500275, "learning_rate": 1.736207504118853e-05, "loss": 0.3166, "step": 5523 }, { "epoch": 1.6578631452581032, "grad_norm": 0.14938890933990479, "learning_rate": 1.7332591968271507e-05, "loss": 0.3274, "step": 5524 }, { "epoch": 1.6581632653061225, "grad_norm": 0.14167505502700806, "learning_rate": 1.730313157420992e-05, "loss": 0.3417, "step": 5525 }, { "epoch": 1.6584633853541417, "grad_norm": 0.13157668709754944, "learning_rate": 1.7273693867085972e-05, "loss": 0.3086, "step": 5526 }, { "epoch": 1.658763505402161, "grad_norm": 0.13809318840503693, "learning_rate": 1.7244278854975504e-05, "loss": 0.3837, "step": 5527 }, { "epoch": 1.65906362545018, "grad_norm": 0.14108997583389282, "learning_rate": 1.721488654594824e-05, "loss": 0.3717, "step": 5528 }, { "epoch": 1.6593637454981993, "grad_norm": 0.15386879444122314, "learning_rate": 1.718551694806755e-05, "loss": 0.3282, "step": 5529 }, { "epoch": 1.6596638655462184, "grad_norm": 0.15658845007419586, "learning_rate": 1.71561700693907e-05, "loss": 0.3445, "step": 5530 }, { "epoch": 1.6599639855942376, "grad_norm": 0.1463109701871872, "learning_rate": 1.712684591796867e-05, "loss": 0.3512, "step": 5531 }, { "epoch": 1.660264105642257, "grad_norm": 0.12552201747894287, "learning_rate": 1.7097544501846185e-05, "loss": 0.302, "step": 5532 }, { "epoch": 1.6605642256902762, "grad_norm": 0.16053760051727295, "learning_rate": 1.7068265829061745e-05, "loss": 0.3488, "step": 5533 }, { "epoch": 1.6608643457382954, "grad_norm": 0.14493612945079803, "learning_rate": 1.703900990764763e-05, "loss": 0.3747, "step": 5534 }, { "epoch": 1.6611644657863145, "grad_norm": 0.13223378360271454, "learning_rate": 1.7009776745629858e-05, "loss": 0.3232, "step": 5535 }, { "epoch": 1.6614645858343338, "grad_norm": 0.13315922021865845, "learning_rate": 1.698056635102826e-05, "loss": 0.341, "step": 5536 }, { "epoch": 1.6617647058823528, "grad_norm": 0.13376358151435852, "learning_rate": 1.6951378731856292e-05, "loss": 0.335, "step": 5537 }, { "epoch": 1.662064825930372, "grad_norm": 0.12876003980636597, "learning_rate": 1.6922213896121296e-05, "loss": 0.3229, "step": 5538 }, { "epoch": 1.6623649459783914, "grad_norm": 0.15036362409591675, "learning_rate": 1.6893071851824328e-05, "loss": 0.3764, "step": 5539 }, { "epoch": 1.6626650660264106, "grad_norm": 0.13925230503082275, "learning_rate": 1.6863952606960132e-05, "loss": 0.358, "step": 5540 }, { "epoch": 1.66296518607443, "grad_norm": 0.15111415088176727, "learning_rate": 1.6834856169517232e-05, "loss": 0.3122, "step": 5541 }, { "epoch": 1.663265306122449, "grad_norm": 0.15889416635036469, "learning_rate": 1.6805782547477935e-05, "loss": 0.3673, "step": 5542 }, { "epoch": 1.6635654261704682, "grad_norm": 0.14329048991203308, "learning_rate": 1.6776731748818254e-05, "loss": 0.3623, "step": 5543 }, { "epoch": 1.6638655462184873, "grad_norm": 0.1454945206642151, "learning_rate": 1.6747703781507905e-05, "loss": 0.3686, "step": 5544 }, { "epoch": 1.6641656662665065, "grad_norm": 0.1360265016555786, "learning_rate": 1.6718698653510433e-05, "loss": 0.3642, "step": 5545 }, { "epoch": 1.6644657863145258, "grad_norm": 0.13417474925518036, "learning_rate": 1.6689716372783002e-05, "loss": 0.332, "step": 5546 }, { "epoch": 1.664765906362545, "grad_norm": 0.1391753852367401, "learning_rate": 1.666075694727661e-05, "loss": 0.3443, "step": 5547 }, { "epoch": 1.6650660264105643, "grad_norm": 0.13319963216781616, "learning_rate": 1.663182038493595e-05, "loss": 0.324, "step": 5548 }, { "epoch": 1.6653661464585834, "grad_norm": 0.13567818701267242, "learning_rate": 1.6602906693699394e-05, "loss": 0.3287, "step": 5549 }, { "epoch": 1.6656662665066027, "grad_norm": 0.1341031938791275, "learning_rate": 1.6574015881499106e-05, "loss": 0.3005, "step": 5550 }, { "epoch": 1.6659663865546217, "grad_norm": 0.13578887283802032, "learning_rate": 1.6545147956260987e-05, "loss": 0.3469, "step": 5551 }, { "epoch": 1.666266506602641, "grad_norm": 0.1205897331237793, "learning_rate": 1.6516302925904547e-05, "loss": 0.2774, "step": 5552 }, { "epoch": 1.6665666266506602, "grad_norm": 0.12699086964130402, "learning_rate": 1.648748079834315e-05, "loss": 0.3243, "step": 5553 }, { "epoch": 1.6668667466986795, "grad_norm": 0.1304396241903305, "learning_rate": 1.645868158148377e-05, "loss": 0.3284, "step": 5554 }, { "epoch": 1.6671668667466988, "grad_norm": 0.13411088287830353, "learning_rate": 1.6429905283227164e-05, "loss": 0.3419, "step": 5555 }, { "epoch": 1.667466986794718, "grad_norm": 0.1532057374715805, "learning_rate": 1.6401151911467815e-05, "loss": 0.3682, "step": 5556 }, { "epoch": 1.667767106842737, "grad_norm": 0.14714492857456207, "learning_rate": 1.6372421474093814e-05, "loss": 0.3613, "step": 5557 }, { "epoch": 1.6680672268907561, "grad_norm": 0.14000540971755981, "learning_rate": 1.6343713978987073e-05, "loss": 0.3205, "step": 5558 }, { "epoch": 1.6683673469387754, "grad_norm": 0.13985879719257355, "learning_rate": 1.6315029434023143e-05, "loss": 0.3453, "step": 5559 }, { "epoch": 1.6686674669867947, "grad_norm": 0.13146568834781647, "learning_rate": 1.6286367847071294e-05, "loss": 0.3348, "step": 5560 }, { "epoch": 1.668967587034814, "grad_norm": 0.13760332763195038, "learning_rate": 1.6257729225994544e-05, "loss": 0.345, "step": 5561 }, { "epoch": 1.6692677070828332, "grad_norm": 0.1339607685804367, "learning_rate": 1.6229113578649547e-05, "loss": 0.3365, "step": 5562 }, { "epoch": 1.6695678271308525, "grad_norm": 0.13104887306690216, "learning_rate": 1.6200520912886618e-05, "loss": 0.3335, "step": 5563 }, { "epoch": 1.6698679471788715, "grad_norm": 0.14629746973514557, "learning_rate": 1.6171951236549932e-05, "loss": 0.4062, "step": 5564 }, { "epoch": 1.6701680672268906, "grad_norm": 0.14013619720935822, "learning_rate": 1.6143404557477183e-05, "loss": 0.3638, "step": 5565 }, { "epoch": 1.6704681872749099, "grad_norm": 0.1354597955942154, "learning_rate": 1.6114880883499873e-05, "loss": 0.3543, "step": 5566 }, { "epoch": 1.6707683073229291, "grad_norm": 0.13572631776332855, "learning_rate": 1.6086380222443087e-05, "loss": 0.3399, "step": 5567 }, { "epoch": 1.6710684273709484, "grad_norm": 0.1253531277179718, "learning_rate": 1.6057902582125683e-05, "loss": 0.3035, "step": 5568 }, { "epoch": 1.6713685474189677, "grad_norm": 0.1563360095024109, "learning_rate": 1.60294479703602e-05, "loss": 0.3735, "step": 5569 }, { "epoch": 1.671668667466987, "grad_norm": 0.14220857620239258, "learning_rate": 1.6001016394952817e-05, "loss": 0.3537, "step": 5570 }, { "epoch": 1.671968787515006, "grad_norm": 0.12539853155612946, "learning_rate": 1.597260786370337e-05, "loss": 0.2969, "step": 5571 }, { "epoch": 1.6722689075630253, "grad_norm": 0.13844381272792816, "learning_rate": 1.594422238440546e-05, "loss": 0.3463, "step": 5572 }, { "epoch": 1.6725690276110443, "grad_norm": 0.12984538078308105, "learning_rate": 1.5915859964846325e-05, "loss": 0.3183, "step": 5573 }, { "epoch": 1.6728691476590636, "grad_norm": 0.13644136488437653, "learning_rate": 1.5887520612806817e-05, "loss": 0.3268, "step": 5574 }, { "epoch": 1.6731692677070829, "grad_norm": 0.1338503360748291, "learning_rate": 1.5859204336061562e-05, "loss": 0.315, "step": 5575 }, { "epoch": 1.6734693877551021, "grad_norm": 0.14446158707141876, "learning_rate": 1.583091114237878e-05, "loss": 0.2993, "step": 5576 }, { "epoch": 1.6737695078031214, "grad_norm": 0.1409435272216797, "learning_rate": 1.5802641039520415e-05, "loss": 0.3283, "step": 5577 }, { "epoch": 1.6740696278511404, "grad_norm": 0.13679242134094238, "learning_rate": 1.5774394035242035e-05, "loss": 0.3384, "step": 5578 }, { "epoch": 1.6743697478991597, "grad_norm": 0.12947718799114227, "learning_rate": 1.574617013729285e-05, "loss": 0.3212, "step": 5579 }, { "epoch": 1.6746698679471788, "grad_norm": 0.15234880149364471, "learning_rate": 1.5717969353415772e-05, "loss": 0.3421, "step": 5580 }, { "epoch": 1.674969987995198, "grad_norm": 0.12767057120800018, "learning_rate": 1.56897916913474e-05, "loss": 0.3232, "step": 5581 }, { "epoch": 1.6752701080432173, "grad_norm": 0.1383272111415863, "learning_rate": 1.566163715881791e-05, "loss": 0.3422, "step": 5582 }, { "epoch": 1.6755702280912366, "grad_norm": 0.15944012999534607, "learning_rate": 1.5633505763551205e-05, "loss": 0.3187, "step": 5583 }, { "epoch": 1.6758703481392558, "grad_norm": 0.16072912514209747, "learning_rate": 1.5605397513264764e-05, "loss": 0.3355, "step": 5584 }, { "epoch": 1.6761704681872749, "grad_norm": 0.13186360895633698, "learning_rate": 1.5577312415669842e-05, "loss": 0.3175, "step": 5585 }, { "epoch": 1.6764705882352942, "grad_norm": 0.12840430438518524, "learning_rate": 1.5549250478471213e-05, "loss": 0.2959, "step": 5586 }, { "epoch": 1.6767707082833132, "grad_norm": 0.13190403580665588, "learning_rate": 1.5521211709367335e-05, "loss": 0.3085, "step": 5587 }, { "epoch": 1.6770708283313325, "grad_norm": 0.13660357892513275, "learning_rate": 1.5493196116050336e-05, "loss": 0.3472, "step": 5588 }, { "epoch": 1.6773709483793517, "grad_norm": 0.14554022252559662, "learning_rate": 1.5465203706206e-05, "loss": 0.3534, "step": 5589 }, { "epoch": 1.677671068427371, "grad_norm": 0.1355992555618286, "learning_rate": 1.5437234487513687e-05, "loss": 0.317, "step": 5590 }, { "epoch": 1.6779711884753903, "grad_norm": 0.21639618277549744, "learning_rate": 1.5409288467646465e-05, "loss": 0.333, "step": 5591 }, { "epoch": 1.6782713085234093, "grad_norm": 0.13919563591480255, "learning_rate": 1.538136565427096e-05, "loss": 0.3193, "step": 5592 }, { "epoch": 1.6785714285714286, "grad_norm": 0.15131469070911407, "learning_rate": 1.5353466055047504e-05, "loss": 0.3236, "step": 5593 }, { "epoch": 1.6788715486194477, "grad_norm": 0.13067404925823212, "learning_rate": 1.532558967763005e-05, "loss": 0.3201, "step": 5594 }, { "epoch": 1.679171668667467, "grad_norm": 0.14243102073669434, "learning_rate": 1.5297736529666117e-05, "loss": 0.3494, "step": 5595 }, { "epoch": 1.6794717887154862, "grad_norm": 0.13154637813568115, "learning_rate": 1.526990661879695e-05, "loss": 0.3085, "step": 5596 }, { "epoch": 1.6797719087635055, "grad_norm": 0.16695889830589294, "learning_rate": 1.5242099952657307e-05, "loss": 0.3541, "step": 5597 }, { "epoch": 1.6800720288115247, "grad_norm": 0.13061389327049255, "learning_rate": 1.521431653887566e-05, "loss": 0.3114, "step": 5598 }, { "epoch": 1.6803721488595438, "grad_norm": 0.13496339321136475, "learning_rate": 1.5186556385074103e-05, "loss": 0.3062, "step": 5599 }, { "epoch": 1.680672268907563, "grad_norm": 0.13125289976596832, "learning_rate": 1.5158819498868248e-05, "loss": 0.3207, "step": 5600 }, { "epoch": 1.680972388955582, "grad_norm": 0.14022837579250336, "learning_rate": 1.5131105887867425e-05, "loss": 0.3347, "step": 5601 }, { "epoch": 1.6812725090036014, "grad_norm": 0.13820809125900269, "learning_rate": 1.5103415559674561e-05, "loss": 0.3541, "step": 5602 }, { "epoch": 1.6815726290516206, "grad_norm": 0.14528962969779968, "learning_rate": 1.5075748521886179e-05, "loss": 0.3441, "step": 5603 }, { "epoch": 1.68187274909964, "grad_norm": 0.1386173814535141, "learning_rate": 1.5048104782092364e-05, "loss": 0.3351, "step": 5604 }, { "epoch": 1.6821728691476592, "grad_norm": 0.1344204694032669, "learning_rate": 1.5020484347876895e-05, "loss": 0.3267, "step": 5605 }, { "epoch": 1.6824729891956782, "grad_norm": 0.1447894424200058, "learning_rate": 1.4992887226817132e-05, "loss": 0.332, "step": 5606 }, { "epoch": 1.6827731092436975, "grad_norm": 0.1385362595319748, "learning_rate": 1.496531342648403e-05, "loss": 0.3419, "step": 5607 }, { "epoch": 1.6830732292917165, "grad_norm": 0.14121302962303162, "learning_rate": 1.4937762954442136e-05, "loss": 0.3506, "step": 5608 }, { "epoch": 1.6833733493397358, "grad_norm": 0.1456373929977417, "learning_rate": 1.4910235818249552e-05, "loss": 0.3536, "step": 5609 }, { "epoch": 1.683673469387755, "grad_norm": 0.13301417231559753, "learning_rate": 1.4882732025458124e-05, "loss": 0.3085, "step": 5610 }, { "epoch": 1.6839735894357744, "grad_norm": 0.12184537202119827, "learning_rate": 1.4855251583613172e-05, "loss": 0.2981, "step": 5611 }, { "epoch": 1.6842737094837936, "grad_norm": 0.15292181074619293, "learning_rate": 1.48277945002536e-05, "loss": 0.3656, "step": 5612 }, { "epoch": 1.6845738295318127, "grad_norm": 0.13678281009197235, "learning_rate": 1.480036078291197e-05, "loss": 0.3061, "step": 5613 }, { "epoch": 1.684873949579832, "grad_norm": 0.13338017463684082, "learning_rate": 1.4772950439114408e-05, "loss": 0.3272, "step": 5614 }, { "epoch": 1.685174069627851, "grad_norm": 0.13358749449253082, "learning_rate": 1.4745563476380652e-05, "loss": 0.2983, "step": 5615 }, { "epoch": 1.6854741896758703, "grad_norm": 0.13526228070259094, "learning_rate": 1.4718199902223984e-05, "loss": 0.3391, "step": 5616 }, { "epoch": 1.6857743097238895, "grad_norm": 0.13874565064907074, "learning_rate": 1.4690859724151262e-05, "loss": 0.3166, "step": 5617 }, { "epoch": 1.6860744297719088, "grad_norm": 0.1305008977651596, "learning_rate": 1.4663542949662967e-05, "loss": 0.2987, "step": 5618 }, { "epoch": 1.686374549819928, "grad_norm": 0.13568472862243652, "learning_rate": 1.463624958625317e-05, "loss": 0.3285, "step": 5619 }, { "epoch": 1.6866746698679473, "grad_norm": 0.14798003435134888, "learning_rate": 1.4608979641409448e-05, "loss": 0.364, "step": 5620 }, { "epoch": 1.6869747899159664, "grad_norm": 0.1313888132572174, "learning_rate": 1.4581733122613028e-05, "loss": 0.3244, "step": 5621 }, { "epoch": 1.6872749099639854, "grad_norm": 0.14711931347846985, "learning_rate": 1.4554510037338654e-05, "loss": 0.3295, "step": 5622 }, { "epoch": 1.6875750300120047, "grad_norm": 0.1367296576499939, "learning_rate": 1.4527310393054693e-05, "loss": 0.3393, "step": 5623 }, { "epoch": 1.687875150060024, "grad_norm": 0.13434435427188873, "learning_rate": 1.4500134197223058e-05, "loss": 0.3123, "step": 5624 }, { "epoch": 1.6881752701080432, "grad_norm": 0.1336866021156311, "learning_rate": 1.4472981457299195e-05, "loss": 0.3313, "step": 5625 }, { "epoch": 1.6884753901560625, "grad_norm": 0.14393779635429382, "learning_rate": 1.4445852180732167e-05, "loss": 0.3551, "step": 5626 }, { "epoch": 1.6887755102040818, "grad_norm": 0.14075292646884918, "learning_rate": 1.4418746374964598e-05, "loss": 0.3383, "step": 5627 }, { "epoch": 1.6890756302521008, "grad_norm": 0.13475462794303894, "learning_rate": 1.4391664047432618e-05, "loss": 0.3221, "step": 5628 }, { "epoch": 1.6893757503001199, "grad_norm": 0.1758328527212143, "learning_rate": 1.4364605205565984e-05, "loss": 0.3569, "step": 5629 }, { "epoch": 1.6896758703481392, "grad_norm": 0.1369779109954834, "learning_rate": 1.4337569856787958e-05, "loss": 0.315, "step": 5630 }, { "epoch": 1.6899759903961584, "grad_norm": 0.1335962861776352, "learning_rate": 1.4310558008515373e-05, "loss": 0.3243, "step": 5631 }, { "epoch": 1.6902761104441777, "grad_norm": 0.131963849067688, "learning_rate": 1.428356966815867e-05, "loss": 0.3198, "step": 5632 }, { "epoch": 1.690576230492197, "grad_norm": 0.14859957993030548, "learning_rate": 1.4256604843121735e-05, "loss": 0.3731, "step": 5633 }, { "epoch": 1.6908763505402162, "grad_norm": 0.13696502149105072, "learning_rate": 1.4229663540802052e-05, "loss": 0.3406, "step": 5634 }, { "epoch": 1.6911764705882353, "grad_norm": 0.13967718183994293, "learning_rate": 1.4202745768590719e-05, "loss": 0.3454, "step": 5635 }, { "epoch": 1.6914765906362546, "grad_norm": 0.1300186663866043, "learning_rate": 1.4175851533872253e-05, "loss": 0.3271, "step": 5636 }, { "epoch": 1.6917767106842736, "grad_norm": 0.13645406067371368, "learning_rate": 1.414898084402484e-05, "loss": 0.3508, "step": 5637 }, { "epoch": 1.6920768307322929, "grad_norm": 0.14068691432476044, "learning_rate": 1.4122133706420093e-05, "loss": 0.3228, "step": 5638 }, { "epoch": 1.6923769507803121, "grad_norm": 0.14424894750118256, "learning_rate": 1.4095310128423233e-05, "loss": 0.3767, "step": 5639 }, { "epoch": 1.6926770708283314, "grad_norm": 0.14913184940814972, "learning_rate": 1.406851011739303e-05, "loss": 0.358, "step": 5640 }, { "epoch": 1.6929771908763507, "grad_norm": 0.1272694319486618, "learning_rate": 1.4041733680681734e-05, "loss": 0.3129, "step": 5641 }, { "epoch": 1.6932773109243697, "grad_norm": 0.1338389813899994, "learning_rate": 1.4014980825635137e-05, "loss": 0.3244, "step": 5642 }, { "epoch": 1.693577430972389, "grad_norm": 0.13543696701526642, "learning_rate": 1.3988251559592592e-05, "loss": 0.3224, "step": 5643 }, { "epoch": 1.693877551020408, "grad_norm": 0.1341482698917389, "learning_rate": 1.3961545889886973e-05, "loss": 0.3345, "step": 5644 }, { "epoch": 1.6941776710684273, "grad_norm": 0.17108403146266937, "learning_rate": 1.3934863823844702e-05, "loss": 0.4254, "step": 5645 }, { "epoch": 1.6944777911164466, "grad_norm": 0.14934246242046356, "learning_rate": 1.3908205368785654e-05, "loss": 0.3827, "step": 5646 }, { "epoch": 1.6947779111644659, "grad_norm": 0.12500108778476715, "learning_rate": 1.3881570532023246e-05, "loss": 0.293, "step": 5647 }, { "epoch": 1.6950780312124851, "grad_norm": 0.13049247860908508, "learning_rate": 1.3854959320864513e-05, "loss": 0.3155, "step": 5648 }, { "epoch": 1.6953781512605042, "grad_norm": 0.12995624542236328, "learning_rate": 1.3828371742609914e-05, "loss": 0.3187, "step": 5649 }, { "epoch": 1.6956782713085234, "grad_norm": 0.13610060513019562, "learning_rate": 1.3801807804553401e-05, "loss": 0.3513, "step": 5650 }, { "epoch": 1.6959783913565425, "grad_norm": 0.1441008299589157, "learning_rate": 1.3775267513982526e-05, "loss": 0.3432, "step": 5651 }, { "epoch": 1.6962785114045618, "grad_norm": 0.12790493667125702, "learning_rate": 1.374875087817833e-05, "loss": 0.32, "step": 5652 }, { "epoch": 1.696578631452581, "grad_norm": 0.13147617876529694, "learning_rate": 1.3722257904415292e-05, "loss": 0.3082, "step": 5653 }, { "epoch": 1.6968787515006003, "grad_norm": 0.13431623578071594, "learning_rate": 1.3695788599961513e-05, "loss": 0.3088, "step": 5654 }, { "epoch": 1.6971788715486196, "grad_norm": 0.1450989991426468, "learning_rate": 1.3669342972078491e-05, "loss": 0.3468, "step": 5655 }, { "epoch": 1.6974789915966386, "grad_norm": 0.12794899940490723, "learning_rate": 1.3642921028021305e-05, "loss": 0.295, "step": 5656 }, { "epoch": 1.697779111644658, "grad_norm": 0.1322779357433319, "learning_rate": 1.3616522775038543e-05, "loss": 0.3225, "step": 5657 }, { "epoch": 1.698079231692677, "grad_norm": 0.12647461891174316, "learning_rate": 1.3590148220372211e-05, "loss": 0.3138, "step": 5658 }, { "epoch": 1.6983793517406962, "grad_norm": 0.15429219603538513, "learning_rate": 1.3563797371257914e-05, "loss": 0.3915, "step": 5659 }, { "epoch": 1.6986794717887155, "grad_norm": 0.13390842080116272, "learning_rate": 1.3537470234924642e-05, "loss": 0.3187, "step": 5660 }, { "epoch": 1.6989795918367347, "grad_norm": 0.12282349169254303, "learning_rate": 1.3511166818595001e-05, "loss": 0.2752, "step": 5661 }, { "epoch": 1.699279711884754, "grad_norm": 0.14638248085975647, "learning_rate": 1.3484887129485025e-05, "loss": 0.3508, "step": 5662 }, { "epoch": 1.699579831932773, "grad_norm": 0.13514555990695953, "learning_rate": 1.3458631174804204e-05, "loss": 0.3368, "step": 5663 }, { "epoch": 1.6998799519807923, "grad_norm": 0.14483727514743805, "learning_rate": 1.34323989617556e-05, "loss": 0.3362, "step": 5664 }, { "epoch": 1.7001800720288114, "grad_norm": 0.1352541744709015, "learning_rate": 1.340619049753572e-05, "loss": 0.3323, "step": 5665 }, { "epoch": 1.7004801920768307, "grad_norm": 0.13121896982192993, "learning_rate": 1.3380005789334516e-05, "loss": 0.317, "step": 5666 }, { "epoch": 1.70078031212485, "grad_norm": 0.13836635649204254, "learning_rate": 1.3353844844335516e-05, "loss": 0.3603, "step": 5667 }, { "epoch": 1.7010804321728692, "grad_norm": 0.13154856860637665, "learning_rate": 1.3327707669715616e-05, "loss": 0.3273, "step": 5668 }, { "epoch": 1.7013805522208885, "grad_norm": 0.13235405087471008, "learning_rate": 1.330159427264529e-05, "loss": 0.3131, "step": 5669 }, { "epoch": 1.7016806722689075, "grad_norm": 0.7332162857055664, "learning_rate": 1.3275504660288462e-05, "loss": 0.2848, "step": 5670 }, { "epoch": 1.7019807923169268, "grad_norm": 0.1268726885318756, "learning_rate": 1.3249438839802497e-05, "loss": 0.2979, "step": 5671 }, { "epoch": 1.7022809123649458, "grad_norm": 0.17241127789020538, "learning_rate": 1.3223396818338207e-05, "loss": 0.3856, "step": 5672 }, { "epoch": 1.702581032412965, "grad_norm": 0.13850437104701996, "learning_rate": 1.3197378603040011e-05, "loss": 0.3415, "step": 5673 }, { "epoch": 1.7028811524609844, "grad_norm": 0.13507312536239624, "learning_rate": 1.3171384201045655e-05, "loss": 0.3255, "step": 5674 }, { "epoch": 1.7031812725090036, "grad_norm": 0.12530501186847687, "learning_rate": 1.3145413619486425e-05, "loss": 0.2889, "step": 5675 }, { "epoch": 1.703481392557023, "grad_norm": 0.13460050523281097, "learning_rate": 1.311946686548703e-05, "loss": 0.3323, "step": 5676 }, { "epoch": 1.7037815126050422, "grad_norm": 0.13420239090919495, "learning_rate": 1.3093543946165665e-05, "loss": 0.3217, "step": 5677 }, { "epoch": 1.7040816326530612, "grad_norm": 0.13585264980793, "learning_rate": 1.3067644868634033e-05, "loss": 0.3271, "step": 5678 }, { "epoch": 1.7043817527010803, "grad_norm": 0.14396932721138, "learning_rate": 1.3041769639997203e-05, "loss": 0.397, "step": 5679 }, { "epoch": 1.7046818727490995, "grad_norm": 0.1304837018251419, "learning_rate": 1.3015918267353743e-05, "loss": 0.3261, "step": 5680 }, { "epoch": 1.7049819927971188, "grad_norm": 0.13240686058998108, "learning_rate": 1.2990090757795692e-05, "loss": 0.3205, "step": 5681 }, { "epoch": 1.705282112845138, "grad_norm": 0.13736537098884583, "learning_rate": 1.2964287118408558e-05, "loss": 0.3205, "step": 5682 }, { "epoch": 1.7055822328931574, "grad_norm": 0.15131819248199463, "learning_rate": 1.2938507356271235e-05, "loss": 0.3682, "step": 5683 }, { "epoch": 1.7058823529411766, "grad_norm": 0.13606952130794525, "learning_rate": 1.2912751478456142e-05, "loss": 0.3338, "step": 5684 }, { "epoch": 1.7061824729891957, "grad_norm": 0.13007688522338867, "learning_rate": 1.288701949202904e-05, "loss": 0.3257, "step": 5685 }, { "epoch": 1.7064825930372147, "grad_norm": 0.13017094135284424, "learning_rate": 1.2861311404049292e-05, "loss": 0.3186, "step": 5686 }, { "epoch": 1.706782713085234, "grad_norm": 0.13054896891117096, "learning_rate": 1.2835627221569579e-05, "loss": 0.3057, "step": 5687 }, { "epoch": 1.7070828331332533, "grad_norm": 0.1377072036266327, "learning_rate": 1.2809966951636032e-05, "loss": 0.3688, "step": 5688 }, { "epoch": 1.7073829531812725, "grad_norm": 0.13601499795913696, "learning_rate": 1.2784330601288297e-05, "loss": 0.3423, "step": 5689 }, { "epoch": 1.7076830732292918, "grad_norm": 0.12117687612771988, "learning_rate": 1.2758718177559403e-05, "loss": 0.2921, "step": 5690 }, { "epoch": 1.707983193277311, "grad_norm": 0.14201779663562775, "learning_rate": 1.2733129687475797e-05, "loss": 0.3598, "step": 5691 }, { "epoch": 1.7082833133253301, "grad_norm": 0.13235652446746826, "learning_rate": 1.2707565138057432e-05, "loss": 0.2976, "step": 5692 }, { "epoch": 1.7085834333733494, "grad_norm": 0.1324567198753357, "learning_rate": 1.2682024536317605e-05, "loss": 0.3467, "step": 5693 }, { "epoch": 1.7088835534213684, "grad_norm": 0.1447012722492218, "learning_rate": 1.2656507889263114e-05, "loss": 0.3083, "step": 5694 }, { "epoch": 1.7091836734693877, "grad_norm": 0.1385689079761505, "learning_rate": 1.2631015203894159e-05, "loss": 0.3418, "step": 5695 }, { "epoch": 1.709483793517407, "grad_norm": 0.13922815024852753, "learning_rate": 1.2605546487204345e-05, "loss": 0.3214, "step": 5696 }, { "epoch": 1.7097839135654262, "grad_norm": 0.15285637974739075, "learning_rate": 1.2580101746180738e-05, "loss": 0.393, "step": 5697 }, { "epoch": 1.7100840336134455, "grad_norm": 0.153276264667511, "learning_rate": 1.2554680987803823e-05, "loss": 0.3278, "step": 5698 }, { "epoch": 1.7103841536614646, "grad_norm": 0.14622247219085693, "learning_rate": 1.2529284219047465e-05, "loss": 0.3835, "step": 5699 }, { "epoch": 1.7106842737094838, "grad_norm": 0.1340278536081314, "learning_rate": 1.2503911446879014e-05, "loss": 0.3435, "step": 5700 }, { "epoch": 1.7109843937575029, "grad_norm": 0.1415686458349228, "learning_rate": 1.2478562678259153e-05, "loss": 0.3351, "step": 5701 }, { "epoch": 1.7112845138055222, "grad_norm": 0.13266758620738983, "learning_rate": 1.2453237920142047e-05, "loss": 0.313, "step": 5702 }, { "epoch": 1.7115846338535414, "grad_norm": 0.14347507059574127, "learning_rate": 1.242793717947528e-05, "loss": 0.3406, "step": 5703 }, { "epoch": 1.7118847539015607, "grad_norm": 0.1388009488582611, "learning_rate": 1.2402660463199767e-05, "loss": 0.3525, "step": 5704 }, { "epoch": 1.71218487394958, "grad_norm": 0.1338505744934082, "learning_rate": 1.2377407778249939e-05, "loss": 0.3172, "step": 5705 }, { "epoch": 1.712484993997599, "grad_norm": 0.133287712931633, "learning_rate": 1.2352179131553532e-05, "loss": 0.3187, "step": 5706 }, { "epoch": 1.7127851140456183, "grad_norm": 0.1391024887561798, "learning_rate": 1.2326974530031766e-05, "loss": 0.3381, "step": 5707 }, { "epoch": 1.7130852340936373, "grad_norm": 0.15472294390201569, "learning_rate": 1.230179398059924e-05, "loss": 0.4018, "step": 5708 }, { "epoch": 1.7133853541416566, "grad_norm": 0.13657771050930023, "learning_rate": 1.2276637490163945e-05, "loss": 0.3345, "step": 5709 }, { "epoch": 1.7136854741896759, "grad_norm": 0.13321413099765778, "learning_rate": 1.2251505065627211e-05, "loss": 0.3282, "step": 5710 }, { "epoch": 1.7139855942376951, "grad_norm": 0.14870762825012207, "learning_rate": 1.2226396713883936e-05, "loss": 0.3839, "step": 5711 }, { "epoch": 1.7142857142857144, "grad_norm": 0.1392969787120819, "learning_rate": 1.2201312441822266e-05, "loss": 0.3219, "step": 5712 }, { "epoch": 1.7145858343337335, "grad_norm": 0.14034144580364227, "learning_rate": 1.217625225632375e-05, "loss": 0.3757, "step": 5713 }, { "epoch": 1.7148859543817527, "grad_norm": 0.13876692950725555, "learning_rate": 1.215121616426339e-05, "loss": 0.3196, "step": 5714 }, { "epoch": 1.7151860744297718, "grad_norm": 0.14527247846126556, "learning_rate": 1.2126204172509547e-05, "loss": 0.3407, "step": 5715 }, { "epoch": 1.715486194477791, "grad_norm": 0.17731516063213348, "learning_rate": 1.2101216287924e-05, "loss": 0.4293, "step": 5716 }, { "epoch": 1.7157863145258103, "grad_norm": 0.14165017008781433, "learning_rate": 1.2076252517361863e-05, "loss": 0.3579, "step": 5717 }, { "epoch": 1.7160864345738296, "grad_norm": 0.19433917105197906, "learning_rate": 1.2051312867671637e-05, "loss": 0.4835, "step": 5718 }, { "epoch": 1.7163865546218489, "grad_norm": 0.17780497670173645, "learning_rate": 1.2026397345695261e-05, "loss": 0.3634, "step": 5719 }, { "epoch": 1.716686674669868, "grad_norm": 0.13712306320667267, "learning_rate": 1.2001505958268045e-05, "loss": 0.3226, "step": 5720 }, { "epoch": 1.7169867947178872, "grad_norm": 0.15281295776367188, "learning_rate": 1.1976638712218591e-05, "loss": 0.3207, "step": 5721 }, { "epoch": 1.7172869147659062, "grad_norm": 0.13083398342132568, "learning_rate": 1.1951795614368988e-05, "loss": 0.3064, "step": 5722 }, { "epoch": 1.7175870348139255, "grad_norm": 0.1333472579717636, "learning_rate": 1.1926976671534662e-05, "loss": 0.3406, "step": 5723 }, { "epoch": 1.7178871548619448, "grad_norm": 0.1443636268377304, "learning_rate": 1.1902181890524378e-05, "loss": 0.3642, "step": 5724 }, { "epoch": 1.718187274909964, "grad_norm": 0.12199776619672775, "learning_rate": 1.1877411278140327e-05, "loss": 0.2984, "step": 5725 }, { "epoch": 1.7184873949579833, "grad_norm": 0.13965708017349243, "learning_rate": 1.1852664841177995e-05, "loss": 0.3693, "step": 5726 }, { "epoch": 1.7187875150060024, "grad_norm": 0.12849895656108856, "learning_rate": 1.1827942586426333e-05, "loss": 0.3135, "step": 5727 }, { "epoch": 1.7190876350540216, "grad_norm": 0.1383999139070511, "learning_rate": 1.18032445206676e-05, "loss": 0.3667, "step": 5728 }, { "epoch": 1.7193877551020407, "grad_norm": 0.13418391346931458, "learning_rate": 1.177857065067739e-05, "loss": 0.3105, "step": 5729 }, { "epoch": 1.71968787515006, "grad_norm": 0.12100134044885635, "learning_rate": 1.1753920983224753e-05, "loss": 0.3037, "step": 5730 }, { "epoch": 1.7199879951980792, "grad_norm": 0.1342541128396988, "learning_rate": 1.1729295525071993e-05, "loss": 0.3378, "step": 5731 }, { "epoch": 1.7202881152460985, "grad_norm": 0.14842109382152557, "learning_rate": 1.1704694282974838e-05, "loss": 0.34, "step": 5732 }, { "epoch": 1.7205882352941178, "grad_norm": 0.13383586704730988, "learning_rate": 1.1680117263682388e-05, "loss": 0.3363, "step": 5733 }, { "epoch": 1.7208883553421368, "grad_norm": 0.13726623356342316, "learning_rate": 1.1655564473937008e-05, "loss": 0.3484, "step": 5734 }, { "epoch": 1.721188475390156, "grad_norm": 0.14171043038368225, "learning_rate": 1.163103592047452e-05, "loss": 0.3593, "step": 5735 }, { "epoch": 1.7214885954381751, "grad_norm": 0.14122125506401062, "learning_rate": 1.1606531610024041e-05, "loss": 0.3486, "step": 5736 }, { "epoch": 1.7217887154861944, "grad_norm": 0.12228088825941086, "learning_rate": 1.1582051549308037e-05, "loss": 0.297, "step": 5737 }, { "epoch": 1.7220888355342137, "grad_norm": 0.18381671607494354, "learning_rate": 1.155759574504235e-05, "loss": 0.3073, "step": 5738 }, { "epoch": 1.722388955582233, "grad_norm": 0.13159750401973724, "learning_rate": 1.153316420393612e-05, "loss": 0.3465, "step": 5739 }, { "epoch": 1.7226890756302522, "grad_norm": 0.12137060612440109, "learning_rate": 1.1508756932691878e-05, "loss": 0.2972, "step": 5740 }, { "epoch": 1.7229891956782715, "grad_norm": 0.14963874220848083, "learning_rate": 1.14843739380055e-05, "loss": 0.3397, "step": 5741 }, { "epoch": 1.7232893157262905, "grad_norm": 0.13432660698890686, "learning_rate": 1.1460015226566168e-05, "loss": 0.3184, "step": 5742 }, { "epoch": 1.7235894357743096, "grad_norm": 0.13611102104187012, "learning_rate": 1.143568080505637e-05, "loss": 0.3438, "step": 5743 }, { "epoch": 1.7238895558223288, "grad_norm": 0.1286766082048416, "learning_rate": 1.1411370680152022e-05, "loss": 0.3213, "step": 5744 }, { "epoch": 1.724189675870348, "grad_norm": 0.12776191532611847, "learning_rate": 1.1387084858522323e-05, "loss": 0.3253, "step": 5745 }, { "epoch": 1.7244897959183674, "grad_norm": 0.1348733901977539, "learning_rate": 1.1362823346829821e-05, "loss": 0.3155, "step": 5746 }, { "epoch": 1.7247899159663866, "grad_norm": 0.13117042183876038, "learning_rate": 1.1338586151730345e-05, "loss": 0.3278, "step": 5747 }, { "epoch": 1.725090036014406, "grad_norm": 0.13857294619083405, "learning_rate": 1.1314373279873114e-05, "loss": 0.3598, "step": 5748 }, { "epoch": 1.725390156062425, "grad_norm": 0.1337532252073288, "learning_rate": 1.1290184737900677e-05, "loss": 0.3278, "step": 5749 }, { "epoch": 1.7256902761104442, "grad_norm": 0.12995721399784088, "learning_rate": 1.1266020532448863e-05, "loss": 0.315, "step": 5750 }, { "epoch": 1.7259903961584633, "grad_norm": 0.14016960561275482, "learning_rate": 1.124188067014681e-05, "loss": 0.3571, "step": 5751 }, { "epoch": 1.7262905162064826, "grad_norm": 0.14358165860176086, "learning_rate": 1.1217765157617055e-05, "loss": 0.3646, "step": 5752 }, { "epoch": 1.7265906362545018, "grad_norm": 0.1202707290649414, "learning_rate": 1.1193674001475408e-05, "loss": 0.2831, "step": 5753 }, { "epoch": 1.726890756302521, "grad_norm": 0.12729522585868835, "learning_rate": 1.1169607208330979e-05, "loss": 0.3072, "step": 5754 }, { "epoch": 1.7271908763505404, "grad_norm": 0.140628382563591, "learning_rate": 1.1145564784786245e-05, "loss": 0.3378, "step": 5755 }, { "epoch": 1.7274909963985594, "grad_norm": 0.13112959265708923, "learning_rate": 1.112154673743694e-05, "loss": 0.3419, "step": 5756 }, { "epoch": 1.7277911164465787, "grad_norm": 0.13791124522686005, "learning_rate": 1.1097553072872157e-05, "loss": 0.3393, "step": 5757 }, { "epoch": 1.7280912364945977, "grad_norm": 0.15736477077007294, "learning_rate": 1.1073583797674291e-05, "loss": 0.3615, "step": 5758 }, { "epoch": 1.728391356542617, "grad_norm": 0.14608369767665863, "learning_rate": 1.1049638918419025e-05, "loss": 0.3666, "step": 5759 }, { "epoch": 1.7286914765906363, "grad_norm": 0.13242143392562866, "learning_rate": 1.1025718441675348e-05, "loss": 0.3433, "step": 5760 }, { "epoch": 1.7289915966386555, "grad_norm": 0.1380014419555664, "learning_rate": 1.1001822374005611e-05, "loss": 0.3624, "step": 5761 }, { "epoch": 1.7292917166866748, "grad_norm": 0.12307605892419815, "learning_rate": 1.097795072196538e-05, "loss": 0.3035, "step": 5762 }, { "epoch": 1.7295918367346939, "grad_norm": 0.14597541093826294, "learning_rate": 1.0954103492103619e-05, "loss": 0.3854, "step": 5763 }, { "epoch": 1.7298919567827131, "grad_norm": 0.17674827575683594, "learning_rate": 1.093028069096248e-05, "loss": 0.3735, "step": 5764 }, { "epoch": 1.7301920768307322, "grad_norm": 0.12740208208560944, "learning_rate": 1.0906482325077517e-05, "loss": 0.3211, "step": 5765 }, { "epoch": 1.7304921968787514, "grad_norm": 0.13835439085960388, "learning_rate": 1.0882708400977537e-05, "loss": 0.3217, "step": 5766 }, { "epoch": 1.7307923169267707, "grad_norm": 0.13297055661678314, "learning_rate": 1.0858958925184626e-05, "loss": 0.3256, "step": 5767 }, { "epoch": 1.73109243697479, "grad_norm": 0.13225091993808746, "learning_rate": 1.0835233904214215e-05, "loss": 0.2926, "step": 5768 }, { "epoch": 1.7313925570228093, "grad_norm": 0.14690400660037994, "learning_rate": 1.0811533344574943e-05, "loss": 0.3783, "step": 5769 }, { "epoch": 1.7316926770708283, "grad_norm": 0.15814612805843353, "learning_rate": 1.0787857252768807e-05, "loss": 0.3502, "step": 5770 }, { "epoch": 1.7319927971188476, "grad_norm": 0.13790132105350494, "learning_rate": 1.0764205635291092e-05, "loss": 0.3314, "step": 5771 }, { "epoch": 1.7322929171668666, "grad_norm": 0.14764297008514404, "learning_rate": 1.0740578498630339e-05, "loss": 0.3743, "step": 5772 }, { "epoch": 1.732593037214886, "grad_norm": 0.1338595747947693, "learning_rate": 1.0716975849268329e-05, "loss": 0.3228, "step": 5773 }, { "epoch": 1.7328931572629052, "grad_norm": 0.1401987373828888, "learning_rate": 1.0693397693680263e-05, "loss": 0.3548, "step": 5774 }, { "epoch": 1.7331932773109244, "grad_norm": 0.1693205088376999, "learning_rate": 1.0669844038334476e-05, "loss": 0.3243, "step": 5775 }, { "epoch": 1.7334933973589437, "grad_norm": 0.13423842191696167, "learning_rate": 1.0646314889692688e-05, "loss": 0.34, "step": 5776 }, { "epoch": 1.7337935174069627, "grad_norm": 0.13911528885364532, "learning_rate": 1.0622810254209814e-05, "loss": 0.3614, "step": 5777 }, { "epoch": 1.734093637454982, "grad_norm": 0.15006420016288757, "learning_rate": 1.0599330138334084e-05, "loss": 0.3359, "step": 5778 }, { "epoch": 1.734393757503001, "grad_norm": 0.1311221867799759, "learning_rate": 1.0575874548507036e-05, "loss": 0.3401, "step": 5779 }, { "epoch": 1.7346938775510203, "grad_norm": 0.12324915081262589, "learning_rate": 1.0552443491163422e-05, "loss": 0.3074, "step": 5780 }, { "epoch": 1.7349939975990396, "grad_norm": 0.140946164727211, "learning_rate": 1.0529036972731255e-05, "loss": 0.3578, "step": 5781 }, { "epoch": 1.7352941176470589, "grad_norm": 0.13226085901260376, "learning_rate": 1.0505654999631865e-05, "loss": 0.3147, "step": 5782 }, { "epoch": 1.7355942376950781, "grad_norm": 0.13607388734817505, "learning_rate": 1.0482297578279854e-05, "loss": 0.3367, "step": 5783 }, { "epoch": 1.7358943577430972, "grad_norm": 0.1267491728067398, "learning_rate": 1.045896471508302e-05, "loss": 0.3052, "step": 5784 }, { "epoch": 1.7361944777911165, "grad_norm": 0.11670931428670883, "learning_rate": 1.0435656416442485e-05, "loss": 0.2676, "step": 5785 }, { "epoch": 1.7364945978391355, "grad_norm": 0.13407576084136963, "learning_rate": 1.0412372688752614e-05, "loss": 0.3232, "step": 5786 }, { "epoch": 1.7367947178871548, "grad_norm": 0.12784159183502197, "learning_rate": 1.0389113538401052e-05, "loss": 0.3163, "step": 5787 }, { "epoch": 1.737094837935174, "grad_norm": 0.1374209225177765, "learning_rate": 1.036587897176865e-05, "loss": 0.334, "step": 5788 }, { "epoch": 1.7373949579831933, "grad_norm": 0.1251617819070816, "learning_rate": 1.0342668995229555e-05, "loss": 0.302, "step": 5789 }, { "epoch": 1.7376950780312126, "grad_norm": 0.13529962301254272, "learning_rate": 1.0319483615151137e-05, "loss": 0.3428, "step": 5790 }, { "epoch": 1.7379951980792316, "grad_norm": 0.1410738229751587, "learning_rate": 1.029632283789409e-05, "loss": 0.3559, "step": 5791 }, { "epoch": 1.738295318127251, "grad_norm": 0.1510618031024933, "learning_rate": 1.0273186669812262e-05, "loss": 0.3485, "step": 5792 }, { "epoch": 1.73859543817527, "grad_norm": 0.13779668509960175, "learning_rate": 1.0250075117252821e-05, "loss": 0.333, "step": 5793 }, { "epoch": 1.7388955582232892, "grad_norm": 0.15373767912387848, "learning_rate": 1.022698818655612e-05, "loss": 0.314, "step": 5794 }, { "epoch": 1.7391956782713085, "grad_norm": 0.14197716116905212, "learning_rate": 1.0203925884055853e-05, "loss": 0.3448, "step": 5795 }, { "epoch": 1.7394957983193278, "grad_norm": 0.13314983248710632, "learning_rate": 1.0180888216078865e-05, "loss": 0.3419, "step": 5796 }, { "epoch": 1.739795918367347, "grad_norm": 0.1329575926065445, "learning_rate": 1.0157875188945254e-05, "loss": 0.3018, "step": 5797 }, { "epoch": 1.7400960384153663, "grad_norm": 0.17678740620613098, "learning_rate": 1.0134886808968403e-05, "loss": 0.3794, "step": 5798 }, { "epoch": 1.7403961584633854, "grad_norm": 0.138626828789711, "learning_rate": 1.0111923082454932e-05, "loss": 0.3211, "step": 5799 }, { "epoch": 1.7406962785114044, "grad_norm": 0.12878531217575073, "learning_rate": 1.0088984015704629e-05, "loss": 0.3038, "step": 5800 }, { "epoch": 1.7409963985594237, "grad_norm": 0.13110844790935516, "learning_rate": 1.006606961501061e-05, "loss": 0.3051, "step": 5801 }, { "epoch": 1.741296518607443, "grad_norm": 0.15521807968616486, "learning_rate": 1.0043179886659137e-05, "loss": 0.4017, "step": 5802 }, { "epoch": 1.7415966386554622, "grad_norm": 0.15013937652111053, "learning_rate": 1.0020314836929778e-05, "loss": 0.3396, "step": 5803 }, { "epoch": 1.7418967587034815, "grad_norm": 0.1314486414194107, "learning_rate": 9.997474472095291e-06, "loss": 0.3179, "step": 5804 }, { "epoch": 1.7421968787515008, "grad_norm": 0.14400826394557953, "learning_rate": 9.974658798421643e-06, "loss": 0.3734, "step": 5805 }, { "epoch": 1.7424969987995198, "grad_norm": 0.14358164370059967, "learning_rate": 9.951867822168082e-06, "loss": 0.3622, "step": 5806 }, { "epoch": 1.7427971188475389, "grad_norm": 0.13000699877738953, "learning_rate": 9.929101549587027e-06, "loss": 0.3424, "step": 5807 }, { "epoch": 1.7430972388955581, "grad_norm": 0.13931672275066376, "learning_rate": 9.906359986924164e-06, "loss": 0.3369, "step": 5808 }, { "epoch": 1.7433973589435774, "grad_norm": 0.1330275535583496, "learning_rate": 9.883643140418387e-06, "loss": 0.3279, "step": 5809 }, { "epoch": 1.7436974789915967, "grad_norm": 0.15318936109542847, "learning_rate": 9.860951016301756e-06, "loss": 0.3555, "step": 5810 }, { "epoch": 1.743997599039616, "grad_norm": 0.12491942197084427, "learning_rate": 9.838283620799638e-06, "loss": 0.2907, "step": 5811 }, { "epoch": 1.7442977190876352, "grad_norm": 0.13773776590824127, "learning_rate": 9.81564096013058e-06, "loss": 0.3346, "step": 5812 }, { "epoch": 1.7445978391356542, "grad_norm": 0.14335572719573975, "learning_rate": 9.793023040506322e-06, "loss": 0.351, "step": 5813 }, { "epoch": 1.7448979591836735, "grad_norm": 0.1270056813955307, "learning_rate": 9.770429868131803e-06, "loss": 0.2976, "step": 5814 }, { "epoch": 1.7451980792316926, "grad_norm": 0.14624319970607758, "learning_rate": 9.74786144920522e-06, "loss": 0.3517, "step": 5815 }, { "epoch": 1.7454981992797118, "grad_norm": 0.14200744032859802, "learning_rate": 9.725317789917964e-06, "loss": 0.3661, "step": 5816 }, { "epoch": 1.745798319327731, "grad_norm": 0.13641224801540375, "learning_rate": 9.702798896454658e-06, "loss": 0.324, "step": 5817 }, { "epoch": 1.7460984393757504, "grad_norm": 0.14655904471874237, "learning_rate": 9.680304774993065e-06, "loss": 0.3712, "step": 5818 }, { "epoch": 1.7463985594237696, "grad_norm": 0.1340646892786026, "learning_rate": 9.657835431704165e-06, "loss": 0.3474, "step": 5819 }, { "epoch": 1.7466986794717887, "grad_norm": 0.15028226375579834, "learning_rate": 9.635390872752237e-06, "loss": 0.3052, "step": 5820 }, { "epoch": 1.746998799519808, "grad_norm": 0.1361413598060608, "learning_rate": 9.612971104294655e-06, "loss": 0.3421, "step": 5821 }, { "epoch": 1.747298919567827, "grad_norm": 0.1370229870080948, "learning_rate": 9.590576132481988e-06, "loss": 0.3645, "step": 5822 }, { "epoch": 1.7475990396158463, "grad_norm": 0.12540948390960693, "learning_rate": 9.568205963458076e-06, "loss": 0.3009, "step": 5823 }, { "epoch": 1.7478991596638656, "grad_norm": 0.13558447360992432, "learning_rate": 9.545860603359924e-06, "loss": 0.3289, "step": 5824 }, { "epoch": 1.7481992797118848, "grad_norm": 0.12754158675670624, "learning_rate": 9.523540058317726e-06, "loss": 0.3161, "step": 5825 }, { "epoch": 1.748499399759904, "grad_norm": 0.16585437953472137, "learning_rate": 9.50124433445485e-06, "loss": 0.3261, "step": 5826 }, { "epoch": 1.7487995198079231, "grad_norm": 0.13536782562732697, "learning_rate": 9.478973437887873e-06, "loss": 0.334, "step": 5827 }, { "epoch": 1.7490996398559424, "grad_norm": 0.134988933801651, "learning_rate": 9.456727374726559e-06, "loss": 0.3285, "step": 5828 }, { "epoch": 1.7493997599039615, "grad_norm": 0.15050005912780762, "learning_rate": 9.434506151073885e-06, "loss": 0.3595, "step": 5829 }, { "epoch": 1.7496998799519807, "grad_norm": 0.13214461505413055, "learning_rate": 9.412309773025952e-06, "loss": 0.3347, "step": 5830 }, { "epoch": 1.75, "grad_norm": 0.14154385030269623, "learning_rate": 9.390138246672131e-06, "loss": 0.3509, "step": 5831 }, { "epoch": 1.7503001200480193, "grad_norm": 0.14709118008613586, "learning_rate": 9.36799157809487e-06, "loss": 0.3898, "step": 5832 }, { "epoch": 1.7506002400960385, "grad_norm": 0.13362213969230652, "learning_rate": 9.345869773369875e-06, "loss": 0.3212, "step": 5833 }, { "epoch": 1.7509003601440576, "grad_norm": 0.1537594348192215, "learning_rate": 9.323772838566037e-06, "loss": 0.365, "step": 5834 }, { "epoch": 1.7512004801920769, "grad_norm": 0.13459311425685883, "learning_rate": 9.301700779745359e-06, "loss": 0.3054, "step": 5835 }, { "epoch": 1.751500600240096, "grad_norm": 0.12368548661470413, "learning_rate": 9.279653602963068e-06, "loss": 0.3151, "step": 5836 }, { "epoch": 1.7518007202881152, "grad_norm": 0.13498592376708984, "learning_rate": 9.25763131426758e-06, "loss": 0.3331, "step": 5837 }, { "epoch": 1.7521008403361344, "grad_norm": 0.13990454375743866, "learning_rate": 9.235633919700414e-06, "loss": 0.3613, "step": 5838 }, { "epoch": 1.7524009603841537, "grad_norm": 0.13571259379386902, "learning_rate": 9.213661425296338e-06, "loss": 0.3605, "step": 5839 }, { "epoch": 1.752701080432173, "grad_norm": 0.1423293650150299, "learning_rate": 9.191713837083238e-06, "loss": 0.3442, "step": 5840 }, { "epoch": 1.753001200480192, "grad_norm": 0.12556971609592438, "learning_rate": 9.169791161082175e-06, "loss": 0.297, "step": 5841 }, { "epoch": 1.7533013205282113, "grad_norm": 0.14545780420303345, "learning_rate": 9.147893403307418e-06, "loss": 0.323, "step": 5842 }, { "epoch": 1.7536014405762304, "grad_norm": 0.14192785322666168, "learning_rate": 9.126020569766336e-06, "loss": 0.3551, "step": 5843 }, { "epoch": 1.7539015606242496, "grad_norm": 0.16282089054584503, "learning_rate": 9.104172666459453e-06, "loss": 0.3321, "step": 5844 }, { "epoch": 1.754201680672269, "grad_norm": 0.13829950988292694, "learning_rate": 9.082349699380588e-06, "loss": 0.3415, "step": 5845 }, { "epoch": 1.7545018007202882, "grad_norm": 0.1373262256383896, "learning_rate": 9.060551674516538e-06, "loss": 0.3403, "step": 5846 }, { "epoch": 1.7548019207683074, "grad_norm": 0.14032645523548126, "learning_rate": 9.038778597847398e-06, "loss": 0.3585, "step": 5847 }, { "epoch": 1.7551020408163265, "grad_norm": 0.13157224655151367, "learning_rate": 9.01703047534631e-06, "loss": 0.3272, "step": 5848 }, { "epoch": 1.7554021608643458, "grad_norm": 0.12770244479179382, "learning_rate": 8.99530731297965e-06, "loss": 0.316, "step": 5849 }, { "epoch": 1.7557022809123648, "grad_norm": 0.13309158384799957, "learning_rate": 8.973609116706926e-06, "loss": 0.3057, "step": 5850 }, { "epoch": 1.756002400960384, "grad_norm": 0.14228470623493195, "learning_rate": 8.95193589248079e-06, "loss": 0.3395, "step": 5851 }, { "epoch": 1.7563025210084033, "grad_norm": 0.14081576466560364, "learning_rate": 8.930287646247015e-06, "loss": 0.3626, "step": 5852 }, { "epoch": 1.7566026410564226, "grad_norm": 0.1345006823539734, "learning_rate": 8.908664383944554e-06, "loss": 0.3514, "step": 5853 }, { "epoch": 1.7569027611044419, "grad_norm": 0.13585862517356873, "learning_rate": 8.887066111505515e-06, "loss": 0.3293, "step": 5854 }, { "epoch": 1.7572028811524611, "grad_norm": 0.13082218170166016, "learning_rate": 8.86549283485516e-06, "loss": 0.3116, "step": 5855 }, { "epoch": 1.7575030012004802, "grad_norm": 0.1294088065624237, "learning_rate": 8.843944559911843e-06, "loss": 0.2926, "step": 5856 }, { "epoch": 1.7578031212484992, "grad_norm": 0.1375386118888855, "learning_rate": 8.822421292587047e-06, "loss": 0.3253, "step": 5857 }, { "epoch": 1.7581032412965185, "grad_norm": 0.13382361829280853, "learning_rate": 8.800923038785502e-06, "loss": 0.3276, "step": 5858 }, { "epoch": 1.7584033613445378, "grad_norm": 0.13776427507400513, "learning_rate": 8.779449804404993e-06, "loss": 0.3388, "step": 5859 }, { "epoch": 1.758703481392557, "grad_norm": 0.1295420527458191, "learning_rate": 8.758001595336418e-06, "loss": 0.3082, "step": 5860 }, { "epoch": 1.7590036014405763, "grad_norm": 0.13528567552566528, "learning_rate": 8.73657841746387e-06, "loss": 0.3456, "step": 5861 }, { "epoch": 1.7593037214885956, "grad_norm": 0.14936035871505737, "learning_rate": 8.71518027666457e-06, "loss": 0.3768, "step": 5862 }, { "epoch": 1.7596038415366146, "grad_norm": 0.13647547364234924, "learning_rate": 8.693807178808822e-06, "loss": 0.3217, "step": 5863 }, { "epoch": 1.7599039615846337, "grad_norm": 0.13374289870262146, "learning_rate": 8.672459129760125e-06, "loss": 0.3364, "step": 5864 }, { "epoch": 1.760204081632653, "grad_norm": 0.17200399935245514, "learning_rate": 8.651136135375026e-06, "loss": 0.3566, "step": 5865 }, { "epoch": 1.7605042016806722, "grad_norm": 0.1232210174202919, "learning_rate": 8.62983820150327e-06, "loss": 0.2979, "step": 5866 }, { "epoch": 1.7608043217286915, "grad_norm": 0.13488927483558655, "learning_rate": 8.608565333987717e-06, "loss": 0.3255, "step": 5867 }, { "epoch": 1.7611044417767108, "grad_norm": 0.12796378135681152, "learning_rate": 8.587317538664307e-06, "loss": 0.3192, "step": 5868 }, { "epoch": 1.76140456182473, "grad_norm": 0.14889287948608398, "learning_rate": 8.566094821362148e-06, "loss": 0.3042, "step": 5869 }, { "epoch": 1.761704681872749, "grad_norm": 0.13275815546512604, "learning_rate": 8.544897187903423e-06, "loss": 0.3288, "step": 5870 }, { "epoch": 1.7620048019207684, "grad_norm": 0.12997561693191528, "learning_rate": 8.52372464410348e-06, "loss": 0.3275, "step": 5871 }, { "epoch": 1.7623049219687874, "grad_norm": 0.15465877950191498, "learning_rate": 8.502577195770777e-06, "loss": 0.3424, "step": 5872 }, { "epoch": 1.7626050420168067, "grad_norm": 0.134208545088768, "learning_rate": 8.481454848706838e-06, "loss": 0.3376, "step": 5873 }, { "epoch": 1.762905162064826, "grad_norm": 0.12746387720108032, "learning_rate": 8.46035760870636e-06, "loss": 0.2846, "step": 5874 }, { "epoch": 1.7632052821128452, "grad_norm": 0.13657613098621368, "learning_rate": 8.439285481557136e-06, "loss": 0.3367, "step": 5875 }, { "epoch": 1.7635054021608645, "grad_norm": 0.128277987241745, "learning_rate": 8.418238473040041e-06, "loss": 0.313, "step": 5876 }, { "epoch": 1.7638055222088835, "grad_norm": 0.12578245997428894, "learning_rate": 8.397216588929101e-06, "loss": 0.2953, "step": 5877 }, { "epoch": 1.7641056422569028, "grad_norm": 0.1306450366973877, "learning_rate": 8.376219834991406e-06, "loss": 0.3353, "step": 5878 }, { "epoch": 1.7644057623049219, "grad_norm": 0.15677793323993683, "learning_rate": 8.355248216987189e-06, "loss": 0.3395, "step": 5879 }, { "epoch": 1.7647058823529411, "grad_norm": 0.13563622534275055, "learning_rate": 8.33430174066978e-06, "loss": 0.3219, "step": 5880 }, { "epoch": 1.7650060024009604, "grad_norm": 0.14059104025363922, "learning_rate": 8.313380411785599e-06, "loss": 0.3612, "step": 5881 }, { "epoch": 1.7653061224489797, "grad_norm": 0.14718660712242126, "learning_rate": 8.292484236074139e-06, "loss": 0.3386, "step": 5882 }, { "epoch": 1.765606242496999, "grad_norm": 0.14431437849998474, "learning_rate": 8.271613219268093e-06, "loss": 0.3526, "step": 5883 }, { "epoch": 1.765906362545018, "grad_norm": 0.14376048743724823, "learning_rate": 8.250767367093126e-06, "loss": 0.3241, "step": 5884 }, { "epoch": 1.7662064825930373, "grad_norm": 0.13576029241085052, "learning_rate": 8.229946685268097e-06, "loss": 0.3458, "step": 5885 }, { "epoch": 1.7665066026410563, "grad_norm": 0.1485685408115387, "learning_rate": 8.209151179504893e-06, "loss": 0.3531, "step": 5886 }, { "epoch": 1.7668067226890756, "grad_norm": 0.12961770594120026, "learning_rate": 8.188380855508536e-06, "loss": 0.3223, "step": 5887 }, { "epoch": 1.7671068427370948, "grad_norm": 0.13379228115081787, "learning_rate": 8.16763571897714e-06, "loss": 0.3253, "step": 5888 }, { "epoch": 1.767406962785114, "grad_norm": 0.13768836855888367, "learning_rate": 8.146915775601882e-06, "loss": 0.312, "step": 5889 }, { "epoch": 1.7677070828331334, "grad_norm": 0.14346608519554138, "learning_rate": 8.126221031067027e-06, "loss": 0.3625, "step": 5890 }, { "epoch": 1.7680072028811524, "grad_norm": 0.13501888513565063, "learning_rate": 8.105551491049945e-06, "loss": 0.3347, "step": 5891 }, { "epoch": 1.7683073229291717, "grad_norm": 0.5826078057289124, "learning_rate": 8.084907161221123e-06, "loss": 0.319, "step": 5892 }, { "epoch": 1.7686074429771907, "grad_norm": 0.19021013379096985, "learning_rate": 8.064288047244039e-06, "loss": 0.3612, "step": 5893 }, { "epoch": 1.76890756302521, "grad_norm": 0.13114556670188904, "learning_rate": 8.043694154775372e-06, "loss": 0.3109, "step": 5894 }, { "epoch": 1.7692076830732293, "grad_norm": 0.14585594832897186, "learning_rate": 8.023125489464744e-06, "loss": 0.369, "step": 5895 }, { "epoch": 1.7695078031212486, "grad_norm": 0.13737501204013824, "learning_rate": 8.00258205695501e-06, "loss": 0.3367, "step": 5896 }, { "epoch": 1.7698079231692678, "grad_norm": 0.13545137643814087, "learning_rate": 7.982063862881994e-06, "loss": 0.331, "step": 5897 }, { "epoch": 1.7701080432172869, "grad_norm": 0.12734316289424896, "learning_rate": 7.961570912874617e-06, "loss": 0.3134, "step": 5898 }, { "epoch": 1.7704081632653061, "grad_norm": 0.12725022435188293, "learning_rate": 7.94110321255489e-06, "loss": 0.3137, "step": 5899 }, { "epoch": 1.7707082833133252, "grad_norm": 0.12916243076324463, "learning_rate": 7.920660767537901e-06, "loss": 0.3092, "step": 5900 }, { "epoch": 1.7710084033613445, "grad_norm": 0.12457706034183502, "learning_rate": 7.900243583431788e-06, "loss": 0.2868, "step": 5901 }, { "epoch": 1.7713085234093637, "grad_norm": 0.12678676843643188, "learning_rate": 7.8798516658378e-06, "loss": 0.2941, "step": 5902 }, { "epoch": 1.771608643457383, "grad_norm": 0.13747821748256683, "learning_rate": 7.859485020350177e-06, "loss": 0.3329, "step": 5903 }, { "epoch": 1.7719087635054023, "grad_norm": 0.1366930603981018, "learning_rate": 7.839143652556314e-06, "loss": 0.3282, "step": 5904 }, { "epoch": 1.7722088835534213, "grad_norm": 0.15380358695983887, "learning_rate": 7.818827568036624e-06, "loss": 0.3268, "step": 5905 }, { "epoch": 1.7725090036014406, "grad_norm": 0.13130567967891693, "learning_rate": 7.798536772364572e-06, "loss": 0.3032, "step": 5906 }, { "epoch": 1.7728091236494596, "grad_norm": 0.13877171277999878, "learning_rate": 7.778271271106719e-06, "loss": 0.3384, "step": 5907 }, { "epoch": 1.773109243697479, "grad_norm": 0.13448159396648407, "learning_rate": 7.758031069822702e-06, "loss": 0.3186, "step": 5908 }, { "epoch": 1.7734093637454982, "grad_norm": 0.22665102779865265, "learning_rate": 7.737816174065138e-06, "loss": 0.3249, "step": 5909 }, { "epoch": 1.7737094837935174, "grad_norm": 0.3638037443161011, "learning_rate": 7.717626589379789e-06, "loss": 0.3345, "step": 5910 }, { "epoch": 1.7740096038415367, "grad_norm": 0.1367301195859909, "learning_rate": 7.697462321305404e-06, "loss": 0.3451, "step": 5911 }, { "epoch": 1.7743097238895558, "grad_norm": 0.1390855461359024, "learning_rate": 7.677323375373835e-06, "loss": 0.3084, "step": 5912 }, { "epoch": 1.774609843937575, "grad_norm": 0.13941091299057007, "learning_rate": 7.657209757109995e-06, "loss": 0.3547, "step": 5913 }, { "epoch": 1.774909963985594, "grad_norm": 0.1419765055179596, "learning_rate": 7.637121472031782e-06, "loss": 0.3672, "step": 5914 }, { "epoch": 1.7752100840336134, "grad_norm": 0.14254456758499146, "learning_rate": 7.617058525650223e-06, "loss": 0.2838, "step": 5915 }, { "epoch": 1.7755102040816326, "grad_norm": 0.14249834418296814, "learning_rate": 7.597020923469322e-06, "loss": 0.3307, "step": 5916 }, { "epoch": 1.775810324129652, "grad_norm": 0.14962702989578247, "learning_rate": 7.577008670986185e-06, "loss": 0.3624, "step": 5917 }, { "epoch": 1.7761104441776712, "grad_norm": 0.13569994270801544, "learning_rate": 7.5570217736909535e-06, "loss": 0.3354, "step": 5918 }, { "epoch": 1.7764105642256904, "grad_norm": 0.14091704785823822, "learning_rate": 7.537060237066806e-06, "loss": 0.3263, "step": 5919 }, { "epoch": 1.7767106842737095, "grad_norm": 0.1351580172777176, "learning_rate": 7.517124066589909e-06, "loss": 0.3112, "step": 5920 }, { "epoch": 1.7770108043217285, "grad_norm": 0.1452702134847641, "learning_rate": 7.497213267729586e-06, "loss": 0.357, "step": 5921 }, { "epoch": 1.7773109243697478, "grad_norm": 0.11909019947052002, "learning_rate": 7.4773278459481234e-06, "loss": 0.2723, "step": 5922 }, { "epoch": 1.777611044417767, "grad_norm": 0.13754090666770935, "learning_rate": 7.4574678067008245e-06, "loss": 0.296, "step": 5923 }, { "epoch": 1.7779111644657863, "grad_norm": 0.12423918396234512, "learning_rate": 7.4376331554360964e-06, "loss": 0.2805, "step": 5924 }, { "epoch": 1.7782112845138056, "grad_norm": 0.14556850492954254, "learning_rate": 7.417823897595322e-06, "loss": 0.3754, "step": 5925 }, { "epoch": 1.7785114045618249, "grad_norm": 0.21286626160144806, "learning_rate": 7.398040038612986e-06, "loss": 0.3514, "step": 5926 }, { "epoch": 1.778811524609844, "grad_norm": 0.13760562241077423, "learning_rate": 7.378281583916535e-06, "loss": 0.3351, "step": 5927 }, { "epoch": 1.7791116446578632, "grad_norm": 0.13750424981117249, "learning_rate": 7.358548538926457e-06, "loss": 0.3317, "step": 5928 }, { "epoch": 1.7794117647058822, "grad_norm": 0.1289098858833313, "learning_rate": 7.338840909056311e-06, "loss": 0.3015, "step": 5929 }, { "epoch": 1.7797118847539015, "grad_norm": 0.13978558778762817, "learning_rate": 7.319158699712669e-06, "loss": 0.3486, "step": 5930 }, { "epoch": 1.7800120048019208, "grad_norm": 0.14103145897388458, "learning_rate": 7.299501916295093e-06, "loss": 0.3395, "step": 5931 }, { "epoch": 1.78031212484994, "grad_norm": 0.14018738269805908, "learning_rate": 7.279870564196201e-06, "loss": 0.35, "step": 5932 }, { "epoch": 1.7806122448979593, "grad_norm": 0.13689963519573212, "learning_rate": 7.26026464880164e-06, "loss": 0.348, "step": 5933 }, { "epoch": 1.7809123649459784, "grad_norm": 0.12805680930614471, "learning_rate": 7.240684175490075e-06, "loss": 0.3071, "step": 5934 }, { "epoch": 1.7812124849939976, "grad_norm": 0.13557565212249756, "learning_rate": 7.2211291496331876e-06, "loss": 0.3366, "step": 5935 }, { "epoch": 1.7815126050420167, "grad_norm": 0.14251002669334412, "learning_rate": 7.20159957659563e-06, "loss": 0.3636, "step": 5936 }, { "epoch": 1.781812725090036, "grad_norm": 0.13931401073932648, "learning_rate": 7.182095461735161e-06, "loss": 0.3223, "step": 5937 }, { "epoch": 1.7821128451380552, "grad_norm": 0.13256897032260895, "learning_rate": 7.1626168104025e-06, "loss": 0.3125, "step": 5938 }, { "epoch": 1.7824129651860745, "grad_norm": 0.13756443560123444, "learning_rate": 7.143163627941385e-06, "loss": 0.3507, "step": 5939 }, { "epoch": 1.7827130852340938, "grad_norm": 0.14429163932800293, "learning_rate": 7.12373591968859e-06, "loss": 0.3552, "step": 5940 }, { "epoch": 1.7830132052821128, "grad_norm": 0.12304878979921341, "learning_rate": 7.104333690973852e-06, "loss": 0.2926, "step": 5941 }, { "epoch": 1.783313325330132, "grad_norm": 0.13818055391311646, "learning_rate": 7.084956947119969e-06, "loss": 0.3131, "step": 5942 }, { "epoch": 1.7836134453781511, "grad_norm": 0.14789220690727234, "learning_rate": 7.065605693442745e-06, "loss": 0.3543, "step": 5943 }, { "epoch": 1.7839135654261704, "grad_norm": 0.1315280646085739, "learning_rate": 7.046279935250943e-06, "loss": 0.3336, "step": 5944 }, { "epoch": 1.7842136854741897, "grad_norm": 0.14173337817192078, "learning_rate": 7.0269796778463906e-06, "loss": 0.3321, "step": 5945 }, { "epoch": 1.784513805522209, "grad_norm": 0.14042839407920837, "learning_rate": 7.007704926523884e-06, "loss": 0.322, "step": 5946 }, { "epoch": 1.7848139255702282, "grad_norm": 0.13916705548763275, "learning_rate": 6.988455686571216e-06, "loss": 0.3326, "step": 5947 }, { "epoch": 1.7851140456182473, "grad_norm": 0.15724653005599976, "learning_rate": 6.9692319632692185e-06, "loss": 0.3875, "step": 5948 }, { "epoch": 1.7854141656662665, "grad_norm": 0.14085812866687775, "learning_rate": 6.950033761891672e-06, "loss": 0.3354, "step": 5949 }, { "epoch": 1.7857142857142856, "grad_norm": 0.14589902758598328, "learning_rate": 6.930861087705398e-06, "loss": 0.3516, "step": 5950 }, { "epoch": 1.7860144057623049, "grad_norm": 0.1342475861310959, "learning_rate": 6.9117139459702105e-06, "loss": 0.2982, "step": 5951 }, { "epoch": 1.7863145258103241, "grad_norm": 0.1392049491405487, "learning_rate": 6.892592341938908e-06, "loss": 0.3603, "step": 5952 }, { "epoch": 1.7866146458583434, "grad_norm": 0.13203851878643036, "learning_rate": 6.873496280857239e-06, "loss": 0.3169, "step": 5953 }, { "epoch": 1.7869147659063627, "grad_norm": 0.11781606823205948, "learning_rate": 6.854425767964034e-06, "loss": 0.28, "step": 5954 }, { "epoch": 1.7872148859543817, "grad_norm": 0.13385334610939026, "learning_rate": 6.835380808491065e-06, "loss": 0.3304, "step": 5955 }, { "epoch": 1.787515006002401, "grad_norm": 0.13968412578105927, "learning_rate": 6.816361407663096e-06, "loss": 0.3427, "step": 5956 }, { "epoch": 1.78781512605042, "grad_norm": 0.14403066039085388, "learning_rate": 6.797367570697866e-06, "loss": 0.3407, "step": 5957 }, { "epoch": 1.7881152460984393, "grad_norm": 0.13157077133655548, "learning_rate": 6.778399302806116e-06, "loss": 0.3045, "step": 5958 }, { "epoch": 1.7884153661464586, "grad_norm": 0.13473302125930786, "learning_rate": 6.7594566091916165e-06, "loss": 0.3423, "step": 5959 }, { "epoch": 1.7887154861944778, "grad_norm": 0.14571532607078552, "learning_rate": 6.7405394950510345e-06, "loss": 0.3547, "step": 5960 }, { "epoch": 1.7890156062424971, "grad_norm": 0.13778209686279297, "learning_rate": 6.721647965574063e-06, "loss": 0.3251, "step": 5961 }, { "epoch": 1.7893157262905162, "grad_norm": 0.1374538093805313, "learning_rate": 6.702782025943377e-06, "loss": 0.3319, "step": 5962 }, { "epoch": 1.7896158463385354, "grad_norm": 0.14873814582824707, "learning_rate": 6.68394168133466e-06, "loss": 0.3829, "step": 5963 }, { "epoch": 1.7899159663865545, "grad_norm": 0.14020781219005585, "learning_rate": 6.665126936916532e-06, "loss": 0.3257, "step": 5964 }, { "epoch": 1.7902160864345738, "grad_norm": 0.12964782118797302, "learning_rate": 6.646337797850588e-06, "loss": 0.3252, "step": 5965 }, { "epoch": 1.790516206482593, "grad_norm": 0.14314241707324982, "learning_rate": 6.6275742692914145e-06, "loss": 0.3333, "step": 5966 }, { "epoch": 1.7908163265306123, "grad_norm": 0.13476979732513428, "learning_rate": 6.608836356386583e-06, "loss": 0.3065, "step": 5967 }, { "epoch": 1.7911164465786316, "grad_norm": 0.14589078724384308, "learning_rate": 6.5901240642766256e-06, "loss": 0.3388, "step": 5968 }, { "epoch": 1.7914165666266506, "grad_norm": 0.135845348238945, "learning_rate": 6.571437398095026e-06, "loss": 0.329, "step": 5969 }, { "epoch": 1.7917166866746699, "grad_norm": 0.13261300325393677, "learning_rate": 6.552776362968271e-06, "loss": 0.3308, "step": 5970 }, { "epoch": 1.792016806722689, "grad_norm": 0.1329193115234375, "learning_rate": 6.534140964015822e-06, "loss": 0.3224, "step": 5971 }, { "epoch": 1.7923169267707082, "grad_norm": 0.1382874846458435, "learning_rate": 6.515531206350045e-06, "loss": 0.3134, "step": 5972 }, { "epoch": 1.7926170468187275, "grad_norm": 0.1270531266927719, "learning_rate": 6.496947095076345e-06, "loss": 0.3019, "step": 5973 }, { "epoch": 1.7929171668667467, "grad_norm": 0.13709579408168793, "learning_rate": 6.478388635293031e-06, "loss": 0.3426, "step": 5974 }, { "epoch": 1.793217286914766, "grad_norm": 0.1275123655796051, "learning_rate": 6.459855832091422e-06, "loss": 0.3127, "step": 5975 }, { "epoch": 1.7935174069627853, "grad_norm": 0.13616742193698883, "learning_rate": 6.441348690555804e-06, "loss": 0.3208, "step": 5976 }, { "epoch": 1.7938175270108043, "grad_norm": 0.19298814237117767, "learning_rate": 6.4228672157633505e-06, "loss": 0.333, "step": 5977 }, { "epoch": 1.7941176470588234, "grad_norm": 0.13557209074497223, "learning_rate": 6.404411412784283e-06, "loss": 0.3354, "step": 5978 }, { "epoch": 1.7944177671068426, "grad_norm": 0.13909755647182465, "learning_rate": 6.385981286681708e-06, "loss": 0.3035, "step": 5979 }, { "epoch": 1.794717887154862, "grad_norm": 0.12394416332244873, "learning_rate": 6.367576842511735e-06, "loss": 0.2896, "step": 5980 }, { "epoch": 1.7950180072028812, "grad_norm": 0.16678430140018463, "learning_rate": 6.349198085323427e-06, "loss": 0.3978, "step": 5981 }, { "epoch": 1.7953181272509005, "grad_norm": 0.13517844676971436, "learning_rate": 6.330845020158771e-06, "loss": 0.3176, "step": 5982 }, { "epoch": 1.7956182472989197, "grad_norm": 0.1313657909631729, "learning_rate": 6.312517652052685e-06, "loss": 0.3053, "step": 5983 }, { "epoch": 1.7959183673469388, "grad_norm": 0.13233062624931335, "learning_rate": 6.294215986033136e-06, "loss": 0.3354, "step": 5984 }, { "epoch": 1.7962184873949578, "grad_norm": 0.13155484199523926, "learning_rate": 6.27594002712093e-06, "loss": 0.3378, "step": 5985 }, { "epoch": 1.796518607442977, "grad_norm": 0.13558879494667053, "learning_rate": 6.257689780329901e-06, "loss": 0.3398, "step": 5986 }, { "epoch": 1.7968187274909964, "grad_norm": 0.14417846500873566, "learning_rate": 6.239465250666754e-06, "loss": 0.3284, "step": 5987 }, { "epoch": 1.7971188475390156, "grad_norm": 0.1278696209192276, "learning_rate": 6.221266443131213e-06, "loss": 0.2962, "step": 5988 }, { "epoch": 1.797418967587035, "grad_norm": 0.13779926300048828, "learning_rate": 6.203093362715906e-06, "loss": 0.3422, "step": 5989 }, { "epoch": 1.7977190876350542, "grad_norm": 0.12292470782995224, "learning_rate": 6.184946014406412e-06, "loss": 0.2764, "step": 5990 }, { "epoch": 1.7980192076830732, "grad_norm": 0.13775518536567688, "learning_rate": 6.166824403181226e-06, "loss": 0.3344, "step": 5991 }, { "epoch": 1.7983193277310925, "grad_norm": 0.13000649213790894, "learning_rate": 6.148728534011805e-06, "loss": 0.2886, "step": 5992 }, { "epoch": 1.7986194477791115, "grad_norm": 0.13752397894859314, "learning_rate": 6.130658411862577e-06, "loss": 0.313, "step": 5993 }, { "epoch": 1.7989195678271308, "grad_norm": 0.1430034637451172, "learning_rate": 6.112614041690856e-06, "loss": 0.3374, "step": 5994 }, { "epoch": 1.79921968787515, "grad_norm": 0.15081876516342163, "learning_rate": 6.094595428446892e-06, "loss": 0.3419, "step": 5995 }, { "epoch": 1.7995198079231693, "grad_norm": 0.1434541642665863, "learning_rate": 6.076602577073898e-06, "loss": 0.3438, "step": 5996 }, { "epoch": 1.7998199279711886, "grad_norm": 0.14269985258579254, "learning_rate": 6.058635492508013e-06, "loss": 0.343, "step": 5997 }, { "epoch": 1.8001200480192077, "grad_norm": 0.13220560550689697, "learning_rate": 6.040694179678308e-06, "loss": 0.3192, "step": 5998 }, { "epoch": 1.800420168067227, "grad_norm": 0.14262396097183228, "learning_rate": 6.022778643506743e-06, "loss": 0.3248, "step": 5999 }, { "epoch": 1.800720288115246, "grad_norm": 0.15476013720035553, "learning_rate": 6.004888888908256e-06, "loss": 0.3666, "step": 6000 }, { "epoch": 1.8010204081632653, "grad_norm": 0.14492842555046082, "learning_rate": 5.987024920790718e-06, "loss": 0.3681, "step": 6001 }, { "epoch": 1.8013205282112845, "grad_norm": 0.14077229797840118, "learning_rate": 5.969186744054866e-06, "loss": 0.3446, "step": 6002 }, { "epoch": 1.8016206482593038, "grad_norm": 0.12190459668636322, "learning_rate": 5.9513743635944305e-06, "loss": 0.2874, "step": 6003 }, { "epoch": 1.801920768307323, "grad_norm": 0.1958537995815277, "learning_rate": 5.9335877842960016e-06, "loss": 0.3188, "step": 6004 }, { "epoch": 1.802220888355342, "grad_norm": 0.17165255546569824, "learning_rate": 5.915827011039166e-06, "loss": 0.3645, "step": 6005 }, { "epoch": 1.8025210084033614, "grad_norm": 0.14606942236423492, "learning_rate": 5.898092048696369e-06, "loss": 0.3358, "step": 6006 }, { "epoch": 1.8028211284513804, "grad_norm": 0.12356963008642197, "learning_rate": 5.8803829021329745e-06, "loss": 0.3013, "step": 6007 }, { "epoch": 1.8031212484993997, "grad_norm": 0.1409122347831726, "learning_rate": 5.862699576207298e-06, "loss": 0.341, "step": 6008 }, { "epoch": 1.803421368547419, "grad_norm": 0.1566123217344284, "learning_rate": 5.845042075770579e-06, "loss": 0.3365, "step": 6009 }, { "epoch": 1.8037214885954382, "grad_norm": 0.11835382878780365, "learning_rate": 5.827410405666911e-06, "loss": 0.2722, "step": 6010 }, { "epoch": 1.8040216086434575, "grad_norm": 0.12948180735111237, "learning_rate": 5.809804570733379e-06, "loss": 0.3027, "step": 6011 }, { "epoch": 1.8043217286914766, "grad_norm": 0.13504937291145325, "learning_rate": 5.792224575799909e-06, "loss": 0.3196, "step": 6012 }, { "epoch": 1.8046218487394958, "grad_norm": 0.13487012684345245, "learning_rate": 5.774670425689388e-06, "loss": 0.3207, "step": 6013 }, { "epoch": 1.8049219687875149, "grad_norm": 0.1346614956855774, "learning_rate": 5.757142125217596e-06, "loss": 0.3139, "step": 6014 }, { "epoch": 1.8052220888355341, "grad_norm": 0.14863623678684235, "learning_rate": 5.73963967919321e-06, "loss": 0.3089, "step": 6015 }, { "epoch": 1.8055222088835534, "grad_norm": 0.1441640555858612, "learning_rate": 5.722163092417854e-06, "loss": 0.3358, "step": 6016 }, { "epoch": 1.8058223289315727, "grad_norm": 0.1421898901462555, "learning_rate": 5.704712369685982e-06, "loss": 0.3461, "step": 6017 }, { "epoch": 1.806122448979592, "grad_norm": 0.14115412533283234, "learning_rate": 5.687287515785034e-06, "loss": 0.3346, "step": 6018 }, { "epoch": 1.806422569027611, "grad_norm": 0.13820458948612213, "learning_rate": 5.669888535495327e-06, "loss": 0.3218, "step": 6019 }, { "epoch": 1.8067226890756303, "grad_norm": 0.13921624422073364, "learning_rate": 5.652515433590033e-06, "loss": 0.2962, "step": 6020 }, { "epoch": 1.8070228091236493, "grad_norm": 0.13091017305850983, "learning_rate": 5.6351682148352956e-06, "loss": 0.311, "step": 6021 }, { "epoch": 1.8073229291716686, "grad_norm": 0.12136702984571457, "learning_rate": 5.61784688399013e-06, "loss": 0.283, "step": 6022 }, { "epoch": 1.8076230492196879, "grad_norm": 0.13937890529632568, "learning_rate": 5.600551445806412e-06, "loss": 0.3326, "step": 6023 }, { "epoch": 1.8079231692677071, "grad_norm": 0.12617534399032593, "learning_rate": 5.583281905028981e-06, "loss": 0.2787, "step": 6024 }, { "epoch": 1.8082232893157264, "grad_norm": 0.12522746622562408, "learning_rate": 5.566038266395501e-06, "loss": 0.2995, "step": 6025 }, { "epoch": 1.8085234093637454, "grad_norm": 0.13523294031620026, "learning_rate": 5.548820534636601e-06, "loss": 0.3327, "step": 6026 }, { "epoch": 1.8088235294117647, "grad_norm": 0.13850083947181702, "learning_rate": 5.531628714475756e-06, "loss": 0.3378, "step": 6027 }, { "epoch": 1.8091236494597838, "grad_norm": 0.14354543387889862, "learning_rate": 5.5144628106293504e-06, "loss": 0.3489, "step": 6028 }, { "epoch": 1.809423769507803, "grad_norm": 0.13455809652805328, "learning_rate": 5.4973228278066165e-06, "loss": 0.3046, "step": 6029 }, { "epoch": 1.8097238895558223, "grad_norm": 0.15517891943454742, "learning_rate": 5.480208770709771e-06, "loss": 0.3443, "step": 6030 }, { "epoch": 1.8100240096038416, "grad_norm": 0.13433986902236938, "learning_rate": 5.463120644033826e-06, "loss": 0.3268, "step": 6031 }, { "epoch": 1.8103241296518608, "grad_norm": 0.1414816975593567, "learning_rate": 5.4460584524667066e-06, "loss": 0.3199, "step": 6032 }, { "epoch": 1.8106242496998801, "grad_norm": 0.2488507181406021, "learning_rate": 5.4290222006892376e-06, "loss": 0.3088, "step": 6033 }, { "epoch": 1.8109243697478992, "grad_norm": 0.1471622735261917, "learning_rate": 5.412011893375124e-06, "loss": 0.3491, "step": 6034 }, { "epoch": 1.8112244897959182, "grad_norm": 0.14099839329719543, "learning_rate": 5.395027535190967e-06, "loss": 0.3299, "step": 6035 }, { "epoch": 1.8115246098439375, "grad_norm": 0.16024468839168549, "learning_rate": 5.378069130796193e-06, "loss": 0.3463, "step": 6036 }, { "epoch": 1.8118247298919568, "grad_norm": 0.135604590177536, "learning_rate": 5.36113668484316e-06, "loss": 0.322, "step": 6037 }, { "epoch": 1.812124849939976, "grad_norm": 0.1391889750957489, "learning_rate": 5.344230201977096e-06, "loss": 0.2822, "step": 6038 }, { "epoch": 1.8124249699879953, "grad_norm": 0.15819205343723297, "learning_rate": 5.327349686836103e-06, "loss": 0.337, "step": 6039 }, { "epoch": 1.8127250900360146, "grad_norm": 0.13925877213478088, "learning_rate": 5.310495144051142e-06, "loss": 0.3259, "step": 6040 }, { "epoch": 1.8130252100840336, "grad_norm": 0.14417746663093567, "learning_rate": 5.293666578246081e-06, "loss": 0.3574, "step": 6041 }, { "epoch": 1.8133253301320527, "grad_norm": 0.13997943699359894, "learning_rate": 5.2768639940376285e-06, "loss": 0.3283, "step": 6042 }, { "epoch": 1.813625450180072, "grad_norm": 0.14645910263061523, "learning_rate": 5.260087396035385e-06, "loss": 0.3107, "step": 6043 }, { "epoch": 1.8139255702280912, "grad_norm": 0.138387531042099, "learning_rate": 5.243336788841835e-06, "loss": 0.3173, "step": 6044 }, { "epoch": 1.8142256902761105, "grad_norm": 0.15049739181995392, "learning_rate": 5.226612177052292e-06, "loss": 0.3612, "step": 6045 }, { "epoch": 1.8145258103241297, "grad_norm": 0.12821055948734283, "learning_rate": 5.209913565254964e-06, "loss": 0.2949, "step": 6046 }, { "epoch": 1.814825930372149, "grad_norm": 0.1518714874982834, "learning_rate": 5.193240958030954e-06, "loss": 0.3597, "step": 6047 }, { "epoch": 1.815126050420168, "grad_norm": 0.136074498295784, "learning_rate": 5.1765943599541565e-06, "loss": 0.3219, "step": 6048 }, { "epoch": 1.8154261704681873, "grad_norm": 0.13379958271980286, "learning_rate": 5.159973775591409e-06, "loss": 0.2928, "step": 6049 }, { "epoch": 1.8157262905162064, "grad_norm": 0.141684889793396, "learning_rate": 5.143379209502352e-06, "loss": 0.3436, "step": 6050 }, { "epoch": 1.8160264105642256, "grad_norm": 0.133530855178833, "learning_rate": 5.126810666239523e-06, "loss": 0.3262, "step": 6051 }, { "epoch": 1.816326530612245, "grad_norm": 0.14149607717990875, "learning_rate": 5.1102681503483405e-06, "loss": 0.3661, "step": 6052 }, { "epoch": 1.8166266506602642, "grad_norm": 0.13825611770153046, "learning_rate": 5.093751666367008e-06, "loss": 0.3447, "step": 6053 }, { "epoch": 1.8169267707082835, "grad_norm": 0.13014687597751617, "learning_rate": 5.077261218826657e-06, "loss": 0.3188, "step": 6054 }, { "epoch": 1.8172268907563025, "grad_norm": 0.16165469586849213, "learning_rate": 5.060796812251267e-06, "loss": 0.3487, "step": 6055 }, { "epoch": 1.8175270108043218, "grad_norm": 0.136117085814476, "learning_rate": 5.0443584511576266e-06, "loss": 0.3348, "step": 6056 }, { "epoch": 1.8178271308523408, "grad_norm": 0.1398078203201294, "learning_rate": 5.02794614005544e-06, "loss": 0.3206, "step": 6057 }, { "epoch": 1.81812725090036, "grad_norm": 0.13721197843551636, "learning_rate": 5.011559883447215e-06, "loss": 0.3112, "step": 6058 }, { "epoch": 1.8184273709483794, "grad_norm": 0.13817249238491058, "learning_rate": 4.9951996858283445e-06, "loss": 0.3026, "step": 6059 }, { "epoch": 1.8187274909963986, "grad_norm": 0.15346643328666687, "learning_rate": 4.978865551687062e-06, "loss": 0.4138, "step": 6060 }, { "epoch": 1.819027611044418, "grad_norm": 0.1322508603334427, "learning_rate": 4.96255748550446e-06, "loss": 0.3104, "step": 6061 }, { "epoch": 1.819327731092437, "grad_norm": 0.1391531378030777, "learning_rate": 4.9462754917544375e-06, "loss": 0.3215, "step": 6062 }, { "epoch": 1.8196278511404562, "grad_norm": 0.15099678933620453, "learning_rate": 4.930019574903788e-06, "loss": 0.3407, "step": 6063 }, { "epoch": 1.8199279711884753, "grad_norm": 0.14447703957557678, "learning_rate": 4.913789739412145e-06, "loss": 0.3536, "step": 6064 }, { "epoch": 1.8202280912364945, "grad_norm": 0.14205755293369293, "learning_rate": 4.89758598973199e-06, "loss": 0.3422, "step": 6065 }, { "epoch": 1.8205282112845138, "grad_norm": 0.1352236121892929, "learning_rate": 4.881408330308612e-06, "loss": 0.3327, "step": 6066 }, { "epoch": 1.820828331332533, "grad_norm": 0.12289389967918396, "learning_rate": 4.86525676558015e-06, "loss": 0.2921, "step": 6067 }, { "epoch": 1.8211284513805523, "grad_norm": 0.13680601119995117, "learning_rate": 4.84913129997765e-06, "loss": 0.3273, "step": 6068 }, { "epoch": 1.8214285714285714, "grad_norm": 0.14029449224472046, "learning_rate": 4.8330319379249255e-06, "loss": 0.331, "step": 6069 }, { "epoch": 1.8217286914765907, "grad_norm": 0.14590826630592346, "learning_rate": 4.8169586838386346e-06, "loss": 0.3302, "step": 6070 }, { "epoch": 1.8220288115246097, "grad_norm": 0.13038024306297302, "learning_rate": 4.800911542128295e-06, "loss": 0.3136, "step": 6071 }, { "epoch": 1.822328931572629, "grad_norm": 0.1627214103937149, "learning_rate": 4.784890517196283e-06, "loss": 0.2965, "step": 6072 }, { "epoch": 1.8226290516206483, "grad_norm": 0.14358121156692505, "learning_rate": 4.768895613437763e-06, "loss": 0.3538, "step": 6073 }, { "epoch": 1.8229291716686675, "grad_norm": 0.1441047191619873, "learning_rate": 4.752926835240756e-06, "loss": 0.3311, "step": 6074 }, { "epoch": 1.8232292917166868, "grad_norm": 0.13355863094329834, "learning_rate": 4.7369841869861045e-06, "loss": 0.2886, "step": 6075 }, { "epoch": 1.8235294117647058, "grad_norm": 0.14023597538471222, "learning_rate": 4.721067673047497e-06, "loss": 0.3348, "step": 6076 }, { "epoch": 1.8238295318127251, "grad_norm": 0.13049958646297455, "learning_rate": 4.705177297791463e-06, "loss": 0.3054, "step": 6077 }, { "epoch": 1.8241296518607442, "grad_norm": 0.13980326056480408, "learning_rate": 4.689313065577328e-06, "loss": 0.3282, "step": 6078 }, { "epoch": 1.8244297719087634, "grad_norm": 0.13872842490673065, "learning_rate": 4.673474980757264e-06, "loss": 0.3089, "step": 6079 }, { "epoch": 1.8247298919567827, "grad_norm": 0.13736894726753235, "learning_rate": 4.657663047676264e-06, "loss": 0.3271, "step": 6080 }, { "epoch": 1.825030012004802, "grad_norm": 0.13429389894008636, "learning_rate": 4.6418772706721565e-06, "loss": 0.3148, "step": 6081 }, { "epoch": 1.8253301320528212, "grad_norm": 0.14426174759864807, "learning_rate": 4.6261176540755904e-06, "loss": 0.3375, "step": 6082 }, { "epoch": 1.8256302521008403, "grad_norm": 0.13774125277996063, "learning_rate": 4.610384202210028e-06, "loss": 0.3294, "step": 6083 }, { "epoch": 1.8259303721488596, "grad_norm": 0.16279757022857666, "learning_rate": 4.5946769193917714e-06, "loss": 0.2977, "step": 6084 }, { "epoch": 1.8262304921968786, "grad_norm": 0.1437031626701355, "learning_rate": 4.578995809929931e-06, "loss": 0.3454, "step": 6085 }, { "epoch": 1.8265306122448979, "grad_norm": 0.13721896708011627, "learning_rate": 4.563340878126432e-06, "loss": 0.3293, "step": 6086 }, { "epoch": 1.8268307322929171, "grad_norm": 0.12678766250610352, "learning_rate": 4.547712128276038e-06, "loss": 0.2902, "step": 6087 }, { "epoch": 1.8271308523409364, "grad_norm": 0.1424795538187027, "learning_rate": 4.532109564666298e-06, "loss": 0.3391, "step": 6088 }, { "epoch": 1.8274309723889557, "grad_norm": 0.13385315239429474, "learning_rate": 4.51653319157761e-06, "loss": 0.3196, "step": 6089 }, { "epoch": 1.8277310924369747, "grad_norm": 0.13906684517860413, "learning_rate": 4.500983013283188e-06, "loss": 0.3381, "step": 6090 }, { "epoch": 1.828031212484994, "grad_norm": 0.1481311172246933, "learning_rate": 4.485459034049022e-06, "loss": 0.337, "step": 6091 }, { "epoch": 1.828331332533013, "grad_norm": 0.12925542891025543, "learning_rate": 4.4699612581339255e-06, "loss": 0.3032, "step": 6092 }, { "epoch": 1.8286314525810323, "grad_norm": 0.13815073668956757, "learning_rate": 4.454489689789576e-06, "loss": 0.3315, "step": 6093 }, { "epoch": 1.8289315726290516, "grad_norm": 0.14559106528759003, "learning_rate": 4.439044333260389e-06, "loss": 0.3465, "step": 6094 }, { "epoch": 1.8292316926770709, "grad_norm": 0.14658205211162567, "learning_rate": 4.423625192783643e-06, "loss": 0.3493, "step": 6095 }, { "epoch": 1.8295318127250901, "grad_norm": 0.1342991292476654, "learning_rate": 4.408232272589375e-06, "loss": 0.3209, "step": 6096 }, { "epoch": 1.8298319327731094, "grad_norm": 0.1332305371761322, "learning_rate": 4.3928655769004735e-06, "loss": 0.3115, "step": 6097 }, { "epoch": 1.8301320528211285, "grad_norm": 0.14046160876750946, "learning_rate": 4.3775251099326234e-06, "loss": 0.335, "step": 6098 }, { "epoch": 1.8304321728691475, "grad_norm": 0.14250709116458893, "learning_rate": 4.362210875894302e-06, "loss": 0.3192, "step": 6099 }, { "epoch": 1.8307322929171668, "grad_norm": 0.15121778845787048, "learning_rate": 4.34692287898677e-06, "loss": 0.3292, "step": 6100 }, { "epoch": 1.831032412965186, "grad_norm": 0.13479244709014893, "learning_rate": 4.33166112340413e-06, "loss": 0.3366, "step": 6101 }, { "epoch": 1.8313325330132053, "grad_norm": 0.14662350714206696, "learning_rate": 4.316425613333286e-06, "loss": 0.3178, "step": 6102 }, { "epoch": 1.8316326530612246, "grad_norm": 0.13920946419239044, "learning_rate": 4.301216352953896e-06, "loss": 0.3309, "step": 6103 }, { "epoch": 1.8319327731092439, "grad_norm": 0.14373935759067535, "learning_rate": 4.286033346438478e-06, "loss": 0.3579, "step": 6104 }, { "epoch": 1.832232893157263, "grad_norm": 0.13850827515125275, "learning_rate": 4.270876597952278e-06, "loss": 0.3567, "step": 6105 }, { "epoch": 1.832533013205282, "grad_norm": 0.1590890735387802, "learning_rate": 4.255746111653425e-06, "loss": 0.3535, "step": 6106 }, { "epoch": 1.8328331332533012, "grad_norm": 0.1391279101371765, "learning_rate": 4.240641891692754e-06, "loss": 0.316, "step": 6107 }, { "epoch": 1.8331332533013205, "grad_norm": 0.13280515372753143, "learning_rate": 4.225563942213939e-06, "loss": 0.3132, "step": 6108 }, { "epoch": 1.8334333733493398, "grad_norm": 0.15908294916152954, "learning_rate": 4.21051226735345e-06, "loss": 0.3567, "step": 6109 }, { "epoch": 1.833733493397359, "grad_norm": 0.13278742134571075, "learning_rate": 4.195486871240562e-06, "loss": 0.3045, "step": 6110 }, { "epoch": 1.8340336134453783, "grad_norm": 0.1318618208169937, "learning_rate": 4.180487757997276e-06, "loss": 0.299, "step": 6111 }, { "epoch": 1.8343337334933973, "grad_norm": 0.187179833650589, "learning_rate": 4.165514931738468e-06, "loss": 0.4666, "step": 6112 }, { "epoch": 1.8346338535414166, "grad_norm": 0.13341879844665527, "learning_rate": 4.150568396571741e-06, "loss": 0.327, "step": 6113 }, { "epoch": 1.8349339735894357, "grad_norm": 0.13785672187805176, "learning_rate": 4.135648156597493e-06, "loss": 0.3386, "step": 6114 }, { "epoch": 1.835234093637455, "grad_norm": 0.13748951256275177, "learning_rate": 4.120754215908962e-06, "loss": 0.3249, "step": 6115 }, { "epoch": 1.8355342136854742, "grad_norm": 0.1445334553718567, "learning_rate": 4.105886578592089e-06, "loss": 0.3285, "step": 6116 }, { "epoch": 1.8358343337334935, "grad_norm": 0.13730578124523163, "learning_rate": 4.091045248725645e-06, "loss": 0.3357, "step": 6117 }, { "epoch": 1.8361344537815127, "grad_norm": 0.13931235671043396, "learning_rate": 4.076230230381217e-06, "loss": 0.3319, "step": 6118 }, { "epoch": 1.8364345738295318, "grad_norm": 0.13775765895843506, "learning_rate": 4.061441527623078e-06, "loss": 0.3037, "step": 6119 }, { "epoch": 1.836734693877551, "grad_norm": 0.12979570031166077, "learning_rate": 4.046679144508392e-06, "loss": 0.3191, "step": 6120 }, { "epoch": 1.83703481392557, "grad_norm": 0.14721934497356415, "learning_rate": 4.031943085087009e-06, "loss": 0.3514, "step": 6121 }, { "epoch": 1.8373349339735894, "grad_norm": 0.12928462028503418, "learning_rate": 4.017233353401617e-06, "loss": 0.2874, "step": 6122 }, { "epoch": 1.8376350540216086, "grad_norm": 0.1679869145154953, "learning_rate": 4.002549953487678e-06, "loss": 0.3385, "step": 6123 }, { "epoch": 1.837935174069628, "grad_norm": 0.12665621936321259, "learning_rate": 3.987892889373368e-06, "loss": 0.2923, "step": 6124 }, { "epoch": 1.8382352941176472, "grad_norm": 0.1349416971206665, "learning_rate": 3.973262165079738e-06, "loss": 0.314, "step": 6125 }, { "epoch": 1.8385354141656662, "grad_norm": 0.14436101913452148, "learning_rate": 3.958657784620512e-06, "loss": 0.3367, "step": 6126 }, { "epoch": 1.8388355342136855, "grad_norm": 0.1507471650838852, "learning_rate": 3.944079752002272e-06, "loss": 0.3592, "step": 6127 }, { "epoch": 1.8391356542617046, "grad_norm": 0.13125640153884888, "learning_rate": 3.92952807122432e-06, "loss": 0.3067, "step": 6128 }, { "epoch": 1.8394357743097238, "grad_norm": 0.15047404170036316, "learning_rate": 3.91500274627874e-06, "loss": 0.3058, "step": 6129 }, { "epoch": 1.839735894357743, "grad_norm": 0.16610467433929443, "learning_rate": 3.900503781150366e-06, "loss": 0.3592, "step": 6130 }, { "epoch": 1.8400360144057624, "grad_norm": 0.13654379546642303, "learning_rate": 3.886031179816874e-06, "loss": 0.3265, "step": 6131 }, { "epoch": 1.8403361344537816, "grad_norm": 0.17456600069999695, "learning_rate": 3.871584946248618e-06, "loss": 0.3436, "step": 6132 }, { "epoch": 1.8406362545018007, "grad_norm": 0.13722942769527435, "learning_rate": 3.857165084408776e-06, "loss": 0.3314, "step": 6133 }, { "epoch": 1.84093637454982, "grad_norm": 0.1379007250070572, "learning_rate": 3.842771598253247e-06, "loss": 0.3313, "step": 6134 }, { "epoch": 1.841236494597839, "grad_norm": 0.13639438152313232, "learning_rate": 3.828404491730741e-06, "loss": 0.3197, "step": 6135 }, { "epoch": 1.8415366146458583, "grad_norm": 0.14406685531139374, "learning_rate": 3.8140637687827164e-06, "loss": 0.2994, "step": 6136 }, { "epoch": 1.8418367346938775, "grad_norm": 0.13586309552192688, "learning_rate": 3.7997494333433692e-06, "loss": 0.3434, "step": 6137 }, { "epoch": 1.8421368547418968, "grad_norm": 0.1358279585838318, "learning_rate": 3.785461489339659e-06, "loss": 0.3033, "step": 6138 }, { "epoch": 1.842436974789916, "grad_norm": 0.14901627600193024, "learning_rate": 3.77119994069135e-06, "loss": 0.3284, "step": 6139 }, { "epoch": 1.8427370948379351, "grad_norm": 0.14877700805664062, "learning_rate": 3.7569647913109243e-06, "loss": 0.338, "step": 6140 }, { "epoch": 1.8430372148859544, "grad_norm": 0.13883346319198608, "learning_rate": 3.7427560451036125e-06, "loss": 0.3173, "step": 6141 }, { "epoch": 1.8433373349339734, "grad_norm": 0.13665060698986053, "learning_rate": 3.728573705967442e-06, "loss": 0.334, "step": 6142 }, { "epoch": 1.8436374549819927, "grad_norm": 0.13771137595176697, "learning_rate": 3.7144177777931777e-06, "loss": 0.3363, "step": 6143 }, { "epoch": 1.843937575030012, "grad_norm": 0.26751354336738586, "learning_rate": 3.7002882644643356e-06, "loss": 0.3369, "step": 6144 }, { "epoch": 1.8442376950780313, "grad_norm": 0.1271308958530426, "learning_rate": 3.6861851698571815e-06, "loss": 0.3021, "step": 6145 }, { "epoch": 1.8445378151260505, "grad_norm": 0.19478288292884827, "learning_rate": 3.6721084978407206e-06, "loss": 0.328, "step": 6146 }, { "epoch": 1.8448379351740696, "grad_norm": 0.13766752183437347, "learning_rate": 3.6580582522767417e-06, "loss": 0.3071, "step": 6147 }, { "epoch": 1.8451380552220888, "grad_norm": 0.13980470597743988, "learning_rate": 3.6440344370197834e-06, "loss": 0.3354, "step": 6148 }, { "epoch": 1.845438175270108, "grad_norm": 0.14060752093791962, "learning_rate": 3.6300370559170904e-06, "loss": 0.3525, "step": 6149 }, { "epoch": 1.8457382953181272, "grad_norm": 0.1450841873884201, "learning_rate": 3.6160661128087025e-06, "loss": 0.3382, "step": 6150 }, { "epoch": 1.8460384153661464, "grad_norm": 0.14197222888469696, "learning_rate": 3.6021216115273758e-06, "loss": 0.3325, "step": 6151 }, { "epoch": 1.8463385354141657, "grad_norm": 0.12967342138290405, "learning_rate": 3.5882035558986284e-06, "loss": 0.3034, "step": 6152 }, { "epoch": 1.846638655462185, "grad_norm": 0.1390308290719986, "learning_rate": 3.5743119497407386e-06, "loss": 0.353, "step": 6153 }, { "epoch": 1.8469387755102042, "grad_norm": 0.15909169614315033, "learning_rate": 3.560446796864669e-06, "loss": 0.306, "step": 6154 }, { "epoch": 1.8472388955582233, "grad_norm": 0.12947727739810944, "learning_rate": 3.5466081010742e-06, "loss": 0.3016, "step": 6155 }, { "epoch": 1.8475390156062423, "grad_norm": 0.14105425775051117, "learning_rate": 3.5327958661658058e-06, "loss": 0.3306, "step": 6156 }, { "epoch": 1.8478391356542616, "grad_norm": 0.1362878829240799, "learning_rate": 3.519010095928721e-06, "loss": 0.2877, "step": 6157 }, { "epoch": 1.8481392557022809, "grad_norm": 0.1474706381559372, "learning_rate": 3.5052507941449097e-06, "loss": 0.3723, "step": 6158 }, { "epoch": 1.8484393757503002, "grad_norm": 0.1433713734149933, "learning_rate": 3.491517964589064e-06, "loss": 0.3019, "step": 6159 }, { "epoch": 1.8487394957983194, "grad_norm": 0.1212991252541542, "learning_rate": 3.4778116110286473e-06, "loss": 0.2583, "step": 6160 }, { "epoch": 1.8490396158463387, "grad_norm": 0.1453191190958023, "learning_rate": 3.4641317372238414e-06, "loss": 0.3144, "step": 6161 }, { "epoch": 1.8493397358943577, "grad_norm": 0.1551435887813568, "learning_rate": 3.4504783469275547e-06, "loss": 0.3199, "step": 6162 }, { "epoch": 1.8496398559423768, "grad_norm": 0.147239089012146, "learning_rate": 3.436851443885447e-06, "loss": 0.3611, "step": 6163 }, { "epoch": 1.849939975990396, "grad_norm": 0.13194699585437775, "learning_rate": 3.4232510318358833e-06, "loss": 0.2908, "step": 6164 }, { "epoch": 1.8502400960384153, "grad_norm": 0.1313251256942749, "learning_rate": 3.4096771145099904e-06, "loss": 0.3066, "step": 6165 }, { "epoch": 1.8505402160864346, "grad_norm": 0.13080264627933502, "learning_rate": 3.3961296956316335e-06, "loss": 0.2998, "step": 6166 }, { "epoch": 1.8508403361344539, "grad_norm": 0.12654796242713928, "learning_rate": 3.3826087789173734e-06, "loss": 0.3054, "step": 6167 }, { "epoch": 1.8511404561824731, "grad_norm": 0.13733084499835968, "learning_rate": 3.369114368076509e-06, "loss": 0.3307, "step": 6168 }, { "epoch": 1.8514405762304922, "grad_norm": 0.13465648889541626, "learning_rate": 3.355646466811113e-06, "loss": 0.3306, "step": 6169 }, { "epoch": 1.8517406962785115, "grad_norm": 0.13510553538799286, "learning_rate": 3.34220507881593e-06, "loss": 0.3206, "step": 6170 }, { "epoch": 1.8520408163265305, "grad_norm": 0.13174894452095032, "learning_rate": 3.3287902077784317e-06, "loss": 0.285, "step": 6171 }, { "epoch": 1.8523409363745498, "grad_norm": 0.13492131233215332, "learning_rate": 3.3154018573788528e-06, "loss": 0.3102, "step": 6172 }, { "epoch": 1.852641056422569, "grad_norm": 0.1444476842880249, "learning_rate": 3.3020400312901324e-06, "loss": 0.3472, "step": 6173 }, { "epoch": 1.8529411764705883, "grad_norm": 0.6041731238365173, "learning_rate": 3.28870473317795e-06, "loss": 0.3192, "step": 6174 }, { "epoch": 1.8532412965186076, "grad_norm": 0.14589573442935944, "learning_rate": 3.2753959667006673e-06, "loss": 0.3472, "step": 6175 }, { "epoch": 1.8535414165666266, "grad_norm": 0.13637158274650574, "learning_rate": 3.2621137355093756e-06, "loss": 0.3383, "step": 6176 }, { "epoch": 1.853841536614646, "grad_norm": 0.14120420813560486, "learning_rate": 3.24885804324796e-06, "loss": 0.3095, "step": 6177 }, { "epoch": 1.854141656662665, "grad_norm": 0.13430336117744446, "learning_rate": 3.2356288935529335e-06, "loss": 0.3196, "step": 6178 }, { "epoch": 1.8544417767106842, "grad_norm": 0.12128697335720062, "learning_rate": 3.2224262900535483e-06, "loss": 0.2697, "step": 6179 }, { "epoch": 1.8547418967587035, "grad_norm": 0.12741613388061523, "learning_rate": 3.209250236371797e-06, "loss": 0.2973, "step": 6180 }, { "epoch": 1.8550420168067228, "grad_norm": 0.13828550279140472, "learning_rate": 3.1961007361223983e-06, "loss": 0.3176, "step": 6181 }, { "epoch": 1.855342136854742, "grad_norm": 0.14088614284992218, "learning_rate": 3.1829777929127447e-06, "loss": 0.3294, "step": 6182 }, { "epoch": 1.855642256902761, "grad_norm": 0.13326530158519745, "learning_rate": 3.1698814103429895e-06, "loss": 0.3191, "step": 6183 }, { "epoch": 1.8559423769507803, "grad_norm": 0.13025276362895966, "learning_rate": 3.156811592005937e-06, "loss": 0.3197, "step": 6184 }, { "epoch": 1.8562424969987994, "grad_norm": 0.14010116457939148, "learning_rate": 3.143768341487163e-06, "loss": 0.3246, "step": 6185 }, { "epoch": 1.8565426170468187, "grad_norm": 0.1410944014787674, "learning_rate": 3.13075166236495e-06, "loss": 0.3408, "step": 6186 }, { "epoch": 1.856842737094838, "grad_norm": 0.17456680536270142, "learning_rate": 3.1177615582102528e-06, "loss": 0.3391, "step": 6187 }, { "epoch": 1.8571428571428572, "grad_norm": 0.12378163635730743, "learning_rate": 3.1047980325867643e-06, "loss": 0.2792, "step": 6188 }, { "epoch": 1.8574429771908765, "grad_norm": 0.13946925103664398, "learning_rate": 3.091861089050874e-06, "loss": 0.334, "step": 6189 }, { "epoch": 1.8577430972388955, "grad_norm": 0.14545568823814392, "learning_rate": 3.0789507311516864e-06, "loss": 0.3196, "step": 6190 }, { "epoch": 1.8580432172869148, "grad_norm": 0.13843731582164764, "learning_rate": 3.0660669624310245e-06, "loss": 0.33, "step": 6191 }, { "epoch": 1.8583433373349338, "grad_norm": 0.12731510400772095, "learning_rate": 3.053209786423372e-06, "loss": 0.2945, "step": 6192 }, { "epoch": 1.8586434573829531, "grad_norm": 0.1322941929101944, "learning_rate": 3.0403792066559744e-06, "loss": 0.2708, "step": 6193 }, { "epoch": 1.8589435774309724, "grad_norm": 0.12273052334785461, "learning_rate": 3.027575226648749e-06, "loss": 0.2851, "step": 6194 }, { "epoch": 1.8592436974789917, "grad_norm": 0.13278886675834656, "learning_rate": 3.014797849914319e-06, "loss": 0.304, "step": 6195 }, { "epoch": 1.859543817527011, "grad_norm": 0.1393134891986847, "learning_rate": 3.0020470799580146e-06, "loss": 0.3533, "step": 6196 }, { "epoch": 1.85984393757503, "grad_norm": 0.14386825263500214, "learning_rate": 2.9893229202778374e-06, "loss": 0.3234, "step": 6197 }, { "epoch": 1.8601440576230492, "grad_norm": 0.13732436299324036, "learning_rate": 2.9766253743645502e-06, "loss": 0.3315, "step": 6198 }, { "epoch": 1.8604441776710683, "grad_norm": 0.13331647217273712, "learning_rate": 2.9639544457015666e-06, "loss": 0.3119, "step": 6199 }, { "epoch": 1.8607442977190876, "grad_norm": 0.14015549421310425, "learning_rate": 2.9513101377650175e-06, "loss": 0.3078, "step": 6200 }, { "epoch": 1.8610444177671068, "grad_norm": 0.1331723928451538, "learning_rate": 2.9386924540236948e-06, "loss": 0.3048, "step": 6201 }, { "epoch": 1.861344537815126, "grad_norm": 0.14064151048660278, "learning_rate": 2.9261013979391407e-06, "loss": 0.3428, "step": 6202 }, { "epoch": 1.8616446578631454, "grad_norm": 0.14177489280700684, "learning_rate": 2.9135369729655583e-06, "loss": 0.351, "step": 6203 }, { "epoch": 1.8619447779111644, "grad_norm": 0.1311153918504715, "learning_rate": 2.9009991825498684e-06, "loss": 0.2902, "step": 6204 }, { "epoch": 1.8622448979591837, "grad_norm": 0.137837752699852, "learning_rate": 2.888488030131653e-06, "loss": 0.3241, "step": 6205 }, { "epoch": 1.8625450180072027, "grad_norm": 0.13662120699882507, "learning_rate": 2.8760035191432e-06, "loss": 0.3118, "step": 6206 }, { "epoch": 1.862845138055222, "grad_norm": 0.13921856880187988, "learning_rate": 2.863545653009525e-06, "loss": 0.3412, "step": 6207 }, { "epoch": 1.8631452581032413, "grad_norm": 0.13769812881946564, "learning_rate": 2.851114435148261e-06, "loss": 0.3458, "step": 6208 }, { "epoch": 1.8634453781512605, "grad_norm": 0.12593281269073486, "learning_rate": 2.83870986896978e-06, "loss": 0.2957, "step": 6209 }, { "epoch": 1.8637454981992798, "grad_norm": 0.13339969515800476, "learning_rate": 2.8263319578771485e-06, "loss": 0.307, "step": 6210 }, { "epoch": 1.864045618247299, "grad_norm": 0.11993400007486343, "learning_rate": 2.813980705266095e-06, "loss": 0.2759, "step": 6211 }, { "epoch": 1.8643457382953181, "grad_norm": 0.13781361281871796, "learning_rate": 2.801656114525031e-06, "loss": 0.3271, "step": 6212 }, { "epoch": 1.8646458583433372, "grad_norm": 0.12544366717338562, "learning_rate": 2.789358189035096e-06, "loss": 0.2687, "step": 6213 }, { "epoch": 1.8649459783913565, "grad_norm": 0.14004136621952057, "learning_rate": 2.777086932170048e-06, "loss": 0.3392, "step": 6214 }, { "epoch": 1.8652460984393757, "grad_norm": 0.12872520089149475, "learning_rate": 2.7648423472963927e-06, "loss": 0.318, "step": 6215 }, { "epoch": 1.865546218487395, "grad_norm": 0.13727736473083496, "learning_rate": 2.752624437773299e-06, "loss": 0.331, "step": 6216 }, { "epoch": 1.8658463385354143, "grad_norm": 0.12658123672008514, "learning_rate": 2.740433206952575e-06, "loss": 0.2883, "step": 6217 }, { "epoch": 1.8661464585834335, "grad_norm": 0.13739939033985138, "learning_rate": 2.7282686581787674e-06, "loss": 0.3244, "step": 6218 }, { "epoch": 1.8664465786314526, "grad_norm": 0.1341625303030014, "learning_rate": 2.7161307947890957e-06, "loss": 0.2999, "step": 6219 }, { "epoch": 1.8667466986794716, "grad_norm": 0.12846803665161133, "learning_rate": 2.704019620113407e-06, "loss": 0.311, "step": 6220 }, { "epoch": 1.867046818727491, "grad_norm": 0.1295066922903061, "learning_rate": 2.6919351374743e-06, "loss": 0.2861, "step": 6221 }, { "epoch": 1.8673469387755102, "grad_norm": 0.13613493740558624, "learning_rate": 2.6798773501869878e-06, "loss": 0.3222, "step": 6222 }, { "epoch": 1.8676470588235294, "grad_norm": 0.14230984449386597, "learning_rate": 2.6678462615593925e-06, "loss": 0.3588, "step": 6223 }, { "epoch": 1.8679471788715487, "grad_norm": 0.13602472841739655, "learning_rate": 2.6558418748921177e-06, "loss": 0.2879, "step": 6224 }, { "epoch": 1.868247298919568, "grad_norm": 0.14377835392951965, "learning_rate": 2.643864193478407e-06, "loss": 0.3034, "step": 6225 }, { "epoch": 1.868547418967587, "grad_norm": 0.13924340903759003, "learning_rate": 2.6319132206042206e-06, "loss": 0.3208, "step": 6226 }, { "epoch": 1.8688475390156063, "grad_norm": 0.12001512944698334, "learning_rate": 2.6199889595481584e-06, "loss": 0.2733, "step": 6227 }, { "epoch": 1.8691476590636253, "grad_norm": 0.14791560173034668, "learning_rate": 2.608091413581504e-06, "loss": 0.3565, "step": 6228 }, { "epoch": 1.8694477791116446, "grad_norm": 0.1310638189315796, "learning_rate": 2.5962205859682343e-06, "loss": 0.3036, "step": 6229 }, { "epoch": 1.8697478991596639, "grad_norm": 0.1368735432624817, "learning_rate": 2.584376479964945e-06, "loss": 0.316, "step": 6230 }, { "epoch": 1.8700480192076832, "grad_norm": 0.17357869446277618, "learning_rate": 2.572559098820937e-06, "loss": 0.3233, "step": 6231 }, { "epoch": 1.8703481392557024, "grad_norm": 0.14382725954055786, "learning_rate": 2.5607684457782055e-06, "loss": 0.3314, "step": 6232 }, { "epoch": 1.8706482593037215, "grad_norm": 0.13554109632968903, "learning_rate": 2.54900452407133e-06, "loss": 0.331, "step": 6233 }, { "epoch": 1.8709483793517407, "grad_norm": 0.12987254559993744, "learning_rate": 2.5372673369276514e-06, "loss": 0.275, "step": 6234 }, { "epoch": 1.8712484993997598, "grad_norm": 0.13484419882297516, "learning_rate": 2.5255568875671042e-06, "loss": 0.3219, "step": 6235 }, { "epoch": 1.871548619447779, "grad_norm": 0.1299813985824585, "learning_rate": 2.5138731792023197e-06, "loss": 0.296, "step": 6236 }, { "epoch": 1.8718487394957983, "grad_norm": 0.13492323458194733, "learning_rate": 2.5022162150386107e-06, "loss": 0.3178, "step": 6237 }, { "epoch": 1.8721488595438176, "grad_norm": 0.13738395273685455, "learning_rate": 2.490585998273909e-06, "loss": 0.3024, "step": 6238 }, { "epoch": 1.8724489795918369, "grad_norm": 0.13431774079799652, "learning_rate": 2.478982532098828e-06, "loss": 0.3195, "step": 6239 }, { "epoch": 1.872749099639856, "grad_norm": 0.14140775799751282, "learning_rate": 2.4674058196966663e-06, "loss": 0.3145, "step": 6240 }, { "epoch": 1.8730492196878752, "grad_norm": 0.1380709409713745, "learning_rate": 2.4558558642433615e-06, "loss": 0.3109, "step": 6241 }, { "epoch": 1.8733493397358942, "grad_norm": 0.12591363489627838, "learning_rate": 2.44433266890749e-06, "loss": 0.2855, "step": 6242 }, { "epoch": 1.8736494597839135, "grad_norm": 0.14441455900669098, "learning_rate": 2.432836236850322e-06, "loss": 0.3523, "step": 6243 }, { "epoch": 1.8739495798319328, "grad_norm": 0.1465701162815094, "learning_rate": 2.421366571225769e-06, "loss": 0.3623, "step": 6244 }, { "epoch": 1.874249699879952, "grad_norm": 0.13750213384628296, "learning_rate": 2.409923675180403e-06, "loss": 0.3363, "step": 6245 }, { "epoch": 1.8745498199279713, "grad_norm": 0.12628985941410065, "learning_rate": 2.3985075518534682e-06, "loss": 0.295, "step": 6246 }, { "epoch": 1.8748499399759904, "grad_norm": 0.13401252031326294, "learning_rate": 2.387118204376804e-06, "loss": 0.3156, "step": 6247 }, { "epoch": 1.8751500600240096, "grad_norm": 0.1269846260547638, "learning_rate": 2.375755635874988e-06, "loss": 0.2952, "step": 6248 }, { "epoch": 1.8754501800720287, "grad_norm": 0.13198496401309967, "learning_rate": 2.364419849465205e-06, "loss": 0.296, "step": 6249 }, { "epoch": 1.875750300120048, "grad_norm": 0.13994579017162323, "learning_rate": 2.353110848257267e-06, "loss": 0.3445, "step": 6250 }, { "epoch": 1.8760504201680672, "grad_norm": 0.21288661658763885, "learning_rate": 2.3418286353537154e-06, "loss": 0.3113, "step": 6251 }, { "epoch": 1.8763505402160865, "grad_norm": 0.13985422253608704, "learning_rate": 2.3305732138496404e-06, "loss": 0.3039, "step": 6252 }, { "epoch": 1.8766506602641058, "grad_norm": 0.13241468369960785, "learning_rate": 2.3193445868328944e-06, "loss": 0.3079, "step": 6253 }, { "epoch": 1.8769507803121248, "grad_norm": 0.13896723091602325, "learning_rate": 2.308142757383902e-06, "loss": 0.3081, "step": 6254 }, { "epoch": 1.877250900360144, "grad_norm": 0.14832334220409393, "learning_rate": 2.2969677285757385e-06, "loss": 0.3642, "step": 6255 }, { "epoch": 1.8775510204081631, "grad_norm": 0.1311514973640442, "learning_rate": 2.2858195034741626e-06, "loss": 0.317, "step": 6256 }, { "epoch": 1.8778511404561824, "grad_norm": 0.28102830052375793, "learning_rate": 2.274698085137561e-06, "loss": 0.3163, "step": 6257 }, { "epoch": 1.8781512605042017, "grad_norm": 0.1358410269021988, "learning_rate": 2.26360347661696e-06, "loss": 0.3264, "step": 6258 }, { "epoch": 1.878451380552221, "grad_norm": 0.14667531847953796, "learning_rate": 2.2525356809560472e-06, "loss": 0.3245, "step": 6259 }, { "epoch": 1.8787515006002402, "grad_norm": 0.12747597694396973, "learning_rate": 2.241494701191127e-06, "loss": 0.306, "step": 6260 }, { "epoch": 1.8790516206482593, "grad_norm": 0.13438037037849426, "learning_rate": 2.2304805403511873e-06, "loss": 0.3146, "step": 6261 }, { "epoch": 1.8793517406962785, "grad_norm": 0.14660976827144623, "learning_rate": 2.219493201457834e-06, "loss": 0.3551, "step": 6262 }, { "epoch": 1.8796518607442976, "grad_norm": 0.13732437789440155, "learning_rate": 2.208532687525311e-06, "loss": 0.3265, "step": 6263 }, { "epoch": 1.8799519807923168, "grad_norm": 0.1420273333787918, "learning_rate": 2.197599001560502e-06, "loss": 0.3023, "step": 6264 }, { "epoch": 1.8802521008403361, "grad_norm": 0.15207207202911377, "learning_rate": 2.186692146562963e-06, "loss": 0.3678, "step": 6265 }, { "epoch": 1.8805522208883554, "grad_norm": 0.13141755759716034, "learning_rate": 2.175812125524834e-06, "loss": 0.3135, "step": 6266 }, { "epoch": 1.8808523409363747, "grad_norm": 0.15551365911960602, "learning_rate": 2.16495894143095e-06, "loss": 0.3508, "step": 6267 }, { "epoch": 1.8811524609843937, "grad_norm": 0.1405211091041565, "learning_rate": 2.154132597258729e-06, "loss": 0.3328, "step": 6268 }, { "epoch": 1.881452581032413, "grad_norm": 0.13350637257099152, "learning_rate": 2.143333095978284e-06, "loss": 0.3246, "step": 6269 }, { "epoch": 1.881752701080432, "grad_norm": 0.13670623302459717, "learning_rate": 2.1325604405523334e-06, "loss": 0.3061, "step": 6270 }, { "epoch": 1.8820528211284513, "grad_norm": 0.1253797709941864, "learning_rate": 2.1218146339362143e-06, "loss": 0.2961, "step": 6271 }, { "epoch": 1.8823529411764706, "grad_norm": 0.133013516664505, "learning_rate": 2.1110956790779123e-06, "loss": 0.325, "step": 6272 }, { "epoch": 1.8826530612244898, "grad_norm": 0.13871300220489502, "learning_rate": 2.100403578918053e-06, "loss": 0.3338, "step": 6273 }, { "epoch": 1.882953181272509, "grad_norm": 0.13345804810523987, "learning_rate": 2.0897383363899124e-06, "loss": 0.3085, "step": 6274 }, { "epoch": 1.8832533013205284, "grad_norm": 0.1403323858976364, "learning_rate": 2.079099954419361e-06, "loss": 0.3655, "step": 6275 }, { "epoch": 1.8835534213685474, "grad_norm": 0.13470247387886047, "learning_rate": 2.06848843592492e-06, "loss": 0.3061, "step": 6276 }, { "epoch": 1.8838535414165665, "grad_norm": 0.15145735442638397, "learning_rate": 2.0579037838177164e-06, "loss": 0.3408, "step": 6277 }, { "epoch": 1.8841536614645857, "grad_norm": 0.13776715099811554, "learning_rate": 2.047346001001571e-06, "loss": 0.3192, "step": 6278 }, { "epoch": 1.884453781512605, "grad_norm": 0.14677351713180542, "learning_rate": 2.0368150903728677e-06, "loss": 0.3179, "step": 6279 }, { "epoch": 1.8847539015606243, "grad_norm": 0.14180903136730194, "learning_rate": 2.026311054820629e-06, "loss": 0.3353, "step": 6280 }, { "epoch": 1.8850540216086435, "grad_norm": 0.13480877876281738, "learning_rate": 2.015833897226538e-06, "loss": 0.3031, "step": 6281 }, { "epoch": 1.8853541416566628, "grad_norm": 0.13390487432479858, "learning_rate": 2.0053836204648625e-06, "loss": 0.3205, "step": 6282 }, { "epoch": 1.8856542617046819, "grad_norm": 0.13204741477966309, "learning_rate": 1.9949602274025424e-06, "loss": 0.3097, "step": 6283 }, { "epoch": 1.885954381752701, "grad_norm": 0.1440712958574295, "learning_rate": 1.984563720899091e-06, "loss": 0.35, "step": 6284 }, { "epoch": 1.8862545018007202, "grad_norm": 0.1389112025499344, "learning_rate": 1.9741941038066815e-06, "loss": 0.3161, "step": 6285 }, { "epoch": 1.8865546218487395, "grad_norm": 0.1399288922548294, "learning_rate": 1.963851378970094e-06, "loss": 0.3146, "step": 6286 }, { "epoch": 1.8868547418967587, "grad_norm": 0.13302282989025116, "learning_rate": 1.9535355492267483e-06, "loss": 0.3035, "step": 6287 }, { "epoch": 1.887154861944778, "grad_norm": 0.13178716599941254, "learning_rate": 1.943246617406669e-06, "loss": 0.2989, "step": 6288 }, { "epoch": 1.8874549819927973, "grad_norm": 0.1290881186723709, "learning_rate": 1.932984586332487e-06, "loss": 0.2986, "step": 6289 }, { "epoch": 1.8877551020408163, "grad_norm": 0.13396479189395905, "learning_rate": 1.922749458819506e-06, "loss": 0.322, "step": 6290 }, { "epoch": 1.8880552220888356, "grad_norm": 0.1531609743833542, "learning_rate": 1.9125412376755912e-06, "loss": 0.2845, "step": 6291 }, { "epoch": 1.8883553421368546, "grad_norm": 0.1545073240995407, "learning_rate": 1.9023599257012692e-06, "loss": 0.3098, "step": 6292 }, { "epoch": 1.888655462184874, "grad_norm": 0.15298312902450562, "learning_rate": 1.8922055256896499e-06, "loss": 0.3225, "step": 6293 }, { "epoch": 1.8889555822328932, "grad_norm": 0.13482391834259033, "learning_rate": 1.8820780404264827e-06, "loss": 0.3218, "step": 6294 }, { "epoch": 1.8892557022809124, "grad_norm": 0.1857229620218277, "learning_rate": 1.871977472690134e-06, "loss": 0.3341, "step": 6295 }, { "epoch": 1.8895558223289317, "grad_norm": 0.13722006976604462, "learning_rate": 1.8619038252515653e-06, "loss": 0.3266, "step": 6296 }, { "epoch": 1.8898559423769508, "grad_norm": 0.13641677796840668, "learning_rate": 1.8518571008743769e-06, "loss": 0.3169, "step": 6297 }, { "epoch": 1.89015606242497, "grad_norm": 0.1337769478559494, "learning_rate": 1.8418373023147639e-06, "loss": 0.3164, "step": 6298 }, { "epoch": 1.890456182472989, "grad_norm": 0.14053909480571747, "learning_rate": 1.83184443232155e-06, "loss": 0.3049, "step": 6299 }, { "epoch": 1.8907563025210083, "grad_norm": 0.1474149376153946, "learning_rate": 1.8218784936361644e-06, "loss": 0.3539, "step": 6300 }, { "epoch": 1.8910564225690276, "grad_norm": 0.1520894169807434, "learning_rate": 1.8119394889926532e-06, "loss": 0.3675, "step": 6301 }, { "epoch": 1.8913565426170469, "grad_norm": 0.141342893242836, "learning_rate": 1.8020274211176469e-06, "loss": 0.3524, "step": 6302 }, { "epoch": 1.8916566626650662, "grad_norm": 0.14925958216190338, "learning_rate": 1.7921422927304254e-06, "loss": 0.3447, "step": 6303 }, { "epoch": 1.8919567827130852, "grad_norm": 0.13178478181362152, "learning_rate": 1.782284106542864e-06, "loss": 0.277, "step": 6304 }, { "epoch": 1.8922569027611045, "grad_norm": 0.12817543745040894, "learning_rate": 1.772452865259433e-06, "loss": 0.3033, "step": 6305 }, { "epoch": 1.8925570228091235, "grad_norm": 0.13969522714614868, "learning_rate": 1.762648571577219e-06, "loss": 0.3311, "step": 6306 }, { "epoch": 1.8928571428571428, "grad_norm": 0.13275279104709625, "learning_rate": 1.752871228185926e-06, "loss": 0.3204, "step": 6307 }, { "epoch": 1.893157262905162, "grad_norm": 0.1373925358057022, "learning_rate": 1.7431208377678531e-06, "loss": 0.326, "step": 6308 }, { "epoch": 1.8934573829531813, "grad_norm": 0.1355675309896469, "learning_rate": 1.733397402997916e-06, "loss": 0.3346, "step": 6309 }, { "epoch": 1.8937575030012006, "grad_norm": 0.12994515895843506, "learning_rate": 1.7237009265436032e-06, "loss": 0.2927, "step": 6310 }, { "epoch": 1.8940576230492197, "grad_norm": 0.1230861097574234, "learning_rate": 1.7140314110650535e-06, "loss": 0.2682, "step": 6311 }, { "epoch": 1.894357743097239, "grad_norm": 0.14534206688404083, "learning_rate": 1.704388859214978e-06, "loss": 0.3348, "step": 6312 }, { "epoch": 1.894657863145258, "grad_norm": 0.1521633118391037, "learning_rate": 1.6947732736387168e-06, "loss": 0.3034, "step": 6313 }, { "epoch": 1.8949579831932772, "grad_norm": 0.12484516948461533, "learning_rate": 1.6851846569741813e-06, "loss": 0.2813, "step": 6314 }, { "epoch": 1.8952581032412965, "grad_norm": 0.15173867344856262, "learning_rate": 1.675623011851879e-06, "loss": 0.3582, "step": 6315 }, { "epoch": 1.8955582232893158, "grad_norm": 0.12107089906930923, "learning_rate": 1.6660883408949778e-06, "loss": 0.2877, "step": 6316 }, { "epoch": 1.895858343337335, "grad_norm": 0.1327318549156189, "learning_rate": 1.6565806467191859e-06, "loss": 0.315, "step": 6317 }, { "epoch": 1.896158463385354, "grad_norm": 0.14046049118041992, "learning_rate": 1.6470999319328161e-06, "loss": 0.3211, "step": 6318 }, { "epoch": 1.8964585834333734, "grad_norm": 0.13437533378601074, "learning_rate": 1.6376461991368218e-06, "loss": 0.2932, "step": 6319 }, { "epoch": 1.8967587034813924, "grad_norm": 0.1479692906141281, "learning_rate": 1.6282194509247063e-06, "loss": 0.3447, "step": 6320 }, { "epoch": 1.8970588235294117, "grad_norm": 0.1407313048839569, "learning_rate": 1.6188196898826003e-06, "loss": 0.3301, "step": 6321 }, { "epoch": 1.897358943577431, "grad_norm": 0.1369362771511078, "learning_rate": 1.6094469185892191e-06, "loss": 0.3149, "step": 6322 }, { "epoch": 1.8976590636254502, "grad_norm": 3.8090240955352783, "learning_rate": 1.6001011396158617e-06, "loss": 0.3424, "step": 6323 }, { "epoch": 1.8979591836734695, "grad_norm": 0.14959876239299774, "learning_rate": 1.5907823555264434e-06, "loss": 0.3371, "step": 6324 }, { "epoch": 1.8982593037214885, "grad_norm": 0.13633541762828827, "learning_rate": 1.581490568877475e-06, "loss": 0.325, "step": 6325 }, { "epoch": 1.8985594237695078, "grad_norm": 0.13324010372161865, "learning_rate": 1.572225782218051e-06, "loss": 0.312, "step": 6326 }, { "epoch": 1.8988595438175269, "grad_norm": 0.20001792907714844, "learning_rate": 1.5629879980898376e-06, "loss": 0.3349, "step": 6327 }, { "epoch": 1.8991596638655461, "grad_norm": 0.13812509179115295, "learning_rate": 1.5537772190271416e-06, "loss": 0.3196, "step": 6328 }, { "epoch": 1.8994597839135654, "grad_norm": 0.14415699243545532, "learning_rate": 1.5445934475568192e-06, "loss": 0.3431, "step": 6329 }, { "epoch": 1.8997599039615847, "grad_norm": 0.15070272982120514, "learning_rate": 1.5354366861983438e-06, "loss": 0.3123, "step": 6330 }, { "epoch": 1.900060024009604, "grad_norm": 0.13872075080871582, "learning_rate": 1.5263069374637507e-06, "loss": 0.3242, "step": 6331 }, { "epoch": 1.9003601440576232, "grad_norm": 0.13535228371620178, "learning_rate": 1.5172042038577028e-06, "loss": 0.3043, "step": 6332 }, { "epoch": 1.9006602641056423, "grad_norm": 0.13848936557769775, "learning_rate": 1.5081284878774138e-06, "loss": 0.3087, "step": 6333 }, { "epoch": 1.9009603841536613, "grad_norm": 0.14651234447956085, "learning_rate": 1.4990797920127141e-06, "loss": 0.3517, "step": 6334 }, { "epoch": 1.9012605042016806, "grad_norm": 0.14521829783916473, "learning_rate": 1.4900581187459961e-06, "loss": 0.3502, "step": 6335 }, { "epoch": 1.9015606242496998, "grad_norm": 0.13988415896892548, "learning_rate": 1.4810634705522686e-06, "loss": 0.2946, "step": 6336 }, { "epoch": 1.9018607442977191, "grad_norm": 0.1395883411169052, "learning_rate": 1.472095849899091e-06, "loss": 0.3162, "step": 6337 }, { "epoch": 1.9021608643457384, "grad_norm": 0.13737809658050537, "learning_rate": 1.4631552592466514e-06, "loss": 0.3351, "step": 6338 }, { "epoch": 1.9024609843937577, "grad_norm": 0.13726972043514252, "learning_rate": 1.4542417010476873e-06, "loss": 0.3002, "step": 6339 }, { "epoch": 1.9027611044417767, "grad_norm": 0.17059005796909332, "learning_rate": 1.4453551777475094e-06, "loss": 0.4004, "step": 6340 }, { "epoch": 1.9030612244897958, "grad_norm": 0.14700105786323547, "learning_rate": 1.4364956917840678e-06, "loss": 0.3654, "step": 6341 }, { "epoch": 1.903361344537815, "grad_norm": 0.1453956961631775, "learning_rate": 1.4276632455878403e-06, "loss": 0.3261, "step": 6342 }, { "epoch": 1.9036614645858343, "grad_norm": 0.1321887969970703, "learning_rate": 1.418857841581922e-06, "loss": 0.2817, "step": 6343 }, { "epoch": 1.9039615846338536, "grad_norm": 0.13092279434204102, "learning_rate": 1.4100794821819585e-06, "loss": 0.2898, "step": 6344 }, { "epoch": 1.9042617046818728, "grad_norm": 0.130708247423172, "learning_rate": 1.40132816979619e-06, "loss": 0.3026, "step": 6345 }, { "epoch": 1.904561824729892, "grad_norm": 0.12597669661045074, "learning_rate": 1.3926039068254626e-06, "loss": 0.2967, "step": 6346 }, { "epoch": 1.9048619447779112, "grad_norm": 0.14716726541519165, "learning_rate": 1.383906695663173e-06, "loss": 0.3724, "step": 6347 }, { "epoch": 1.9051620648259304, "grad_norm": 0.13780756294727325, "learning_rate": 1.3752365386952681e-06, "loss": 0.3179, "step": 6348 }, { "epoch": 1.9054621848739495, "grad_norm": 0.1345573216676712, "learning_rate": 1.3665934383003343e-06, "loss": 0.3294, "step": 6349 }, { "epoch": 1.9057623049219687, "grad_norm": 0.13241131603717804, "learning_rate": 1.3579773968495191e-06, "loss": 0.2994, "step": 6350 }, { "epoch": 1.906062424969988, "grad_norm": 0.13944432139396667, "learning_rate": 1.3493884167064986e-06, "loss": 0.3159, "step": 6351 }, { "epoch": 1.9063625450180073, "grad_norm": 0.14155784249305725, "learning_rate": 1.3408265002275877e-06, "loss": 0.3411, "step": 6352 }, { "epoch": 1.9066626650660266, "grad_norm": 0.14534442126750946, "learning_rate": 1.332291649761641e-06, "loss": 0.3306, "step": 6353 }, { "epoch": 1.9069627851140456, "grad_norm": 0.13481180369853973, "learning_rate": 1.323783867650097e-06, "loss": 0.3298, "step": 6354 }, { "epoch": 1.9072629051620649, "grad_norm": 0.12702417373657227, "learning_rate": 1.3153031562269768e-06, "loss": 0.2922, "step": 6355 }, { "epoch": 1.907563025210084, "grad_norm": 0.28436294198036194, "learning_rate": 1.3068495178188533e-06, "loss": 0.3397, "step": 6356 }, { "epoch": 1.9078631452581032, "grad_norm": 0.1368831992149353, "learning_rate": 1.2984229547448935e-06, "loss": 0.3242, "step": 6357 }, { "epoch": 1.9081632653061225, "grad_norm": 0.14487719535827637, "learning_rate": 1.2900234693168255e-06, "loss": 0.3448, "step": 6358 }, { "epoch": 1.9084633853541417, "grad_norm": 0.1391216218471527, "learning_rate": 1.2816510638389512e-06, "loss": 0.3179, "step": 6359 }, { "epoch": 1.908763505402161, "grad_norm": 0.12535832822322845, "learning_rate": 1.2733057406081438e-06, "loss": 0.2929, "step": 6360 }, { "epoch": 1.90906362545018, "grad_norm": 0.1655777245759964, "learning_rate": 1.2649875019138501e-06, "loss": 0.3486, "step": 6361 }, { "epoch": 1.9093637454981993, "grad_norm": 0.14286132156848907, "learning_rate": 1.256696350038078e-06, "loss": 0.3336, "step": 6362 }, { "epoch": 1.9096638655462184, "grad_norm": 0.13462306559085846, "learning_rate": 1.2484322872554299e-06, "loss": 0.314, "step": 6363 }, { "epoch": 1.9099639855942376, "grad_norm": 0.1311066746711731, "learning_rate": 1.2401953158330148e-06, "loss": 0.2993, "step": 6364 }, { "epoch": 1.910264105642257, "grad_norm": 0.15170760452747345, "learning_rate": 1.2319854380305918e-06, "loss": 0.2987, "step": 6365 }, { "epoch": 1.9105642256902762, "grad_norm": 0.13527582585811615, "learning_rate": 1.2238026561004369e-06, "loss": 0.3204, "step": 6366 }, { "epoch": 1.9108643457382954, "grad_norm": 0.14560282230377197, "learning_rate": 1.2156469722873875e-06, "loss": 0.343, "step": 6367 }, { "epoch": 1.9111644657863145, "grad_norm": 0.1309434473514557, "learning_rate": 1.2075183888288767e-06, "loss": 0.292, "step": 6368 }, { "epoch": 1.9114645858343338, "grad_norm": 0.12859490513801575, "learning_rate": 1.1994169079548756e-06, "loss": 0.3092, "step": 6369 }, { "epoch": 1.9117647058823528, "grad_norm": 0.13696551322937012, "learning_rate": 1.1913425318879511e-06, "loss": 0.3166, "step": 6370 }, { "epoch": 1.912064825930372, "grad_norm": 0.13910551369190216, "learning_rate": 1.1832952628431981e-06, "loss": 0.3227, "step": 6371 }, { "epoch": 1.9123649459783914, "grad_norm": 0.13507243990898132, "learning_rate": 1.175275103028306e-06, "loss": 0.3335, "step": 6372 }, { "epoch": 1.9126650660264106, "grad_norm": 0.1377187818288803, "learning_rate": 1.1672820546435038e-06, "loss": 0.3268, "step": 6373 }, { "epoch": 1.91296518607443, "grad_norm": 0.14127525687217712, "learning_rate": 1.1593161198815927e-06, "loss": 0.3476, "step": 6374 }, { "epoch": 1.913265306122449, "grad_norm": 0.14496168494224548, "learning_rate": 1.1513773009279472e-06, "loss": 0.3512, "step": 6375 }, { "epoch": 1.9135654261704682, "grad_norm": 0.13064956665039062, "learning_rate": 1.1434655999604805e-06, "loss": 0.2882, "step": 6376 }, { "epoch": 1.9138655462184873, "grad_norm": 0.12650299072265625, "learning_rate": 1.1355810191496674e-06, "loss": 0.2786, "step": 6377 }, { "epoch": 1.9141656662665065, "grad_norm": 0.13515569269657135, "learning_rate": 1.1277235606585778e-06, "loss": 0.3191, "step": 6378 }, { "epoch": 1.9144657863145258, "grad_norm": 0.14777721464633942, "learning_rate": 1.1198932266427985e-06, "loss": 0.3264, "step": 6379 }, { "epoch": 1.914765906362545, "grad_norm": 0.13458096981048584, "learning_rate": 1.1120900192505e-06, "loss": 0.3169, "step": 6380 }, { "epoch": 1.9150660264105643, "grad_norm": 0.1463477462530136, "learning_rate": 1.1043139406223813e-06, "loss": 0.3233, "step": 6381 }, { "epoch": 1.9153661464585834, "grad_norm": 0.13509635627269745, "learning_rate": 1.0965649928917466e-06, "loss": 0.3077, "step": 6382 }, { "epoch": 1.9156662665066027, "grad_norm": 0.12657250463962555, "learning_rate": 1.088843178184429e-06, "loss": 0.2954, "step": 6383 }, { "epoch": 1.9159663865546217, "grad_norm": 0.13948281109333038, "learning_rate": 1.0811484986188002e-06, "loss": 0.3244, "step": 6384 }, { "epoch": 1.916266506602641, "grad_norm": 0.13280120491981506, "learning_rate": 1.073480956305839e-06, "loss": 0.3264, "step": 6385 }, { "epoch": 1.9165666266506602, "grad_norm": 0.1396474540233612, "learning_rate": 1.0658405533489956e-06, "loss": 0.3166, "step": 6386 }, { "epoch": 1.9168667466986795, "grad_norm": 0.13262183964252472, "learning_rate": 1.0582272918443825e-06, "loss": 0.3328, "step": 6387 }, { "epoch": 1.9171668667466988, "grad_norm": 0.14330746233463287, "learning_rate": 1.0506411738805845e-06, "loss": 0.3334, "step": 6388 }, { "epoch": 1.917466986794718, "grad_norm": 0.14604748785495758, "learning_rate": 1.0430822015387698e-06, "loss": 0.355, "step": 6389 }, { "epoch": 1.917767106842737, "grad_norm": 0.1383308321237564, "learning_rate": 1.0355503768926466e-06, "loss": 0.3327, "step": 6390 }, { "epoch": 1.9180672268907561, "grad_norm": 0.13492268323898315, "learning_rate": 1.0280457020085067e-06, "loss": 0.3161, "step": 6391 }, { "epoch": 1.9183673469387754, "grad_norm": 0.13790442049503326, "learning_rate": 1.020568178945147e-06, "loss": 0.3404, "step": 6392 }, { "epoch": 1.9186674669867947, "grad_norm": 0.13623499870300293, "learning_rate": 1.0131178097539605e-06, "loss": 0.3125, "step": 6393 }, { "epoch": 1.918967587034814, "grad_norm": 0.14850133657455444, "learning_rate": 1.0056945964788566e-06, "loss": 0.3303, "step": 6394 }, { "epoch": 1.9192677070828332, "grad_norm": 0.1354813277721405, "learning_rate": 9.98298541156306e-07, "loss": 0.2941, "step": 6395 }, { "epoch": 1.9195678271308525, "grad_norm": 0.12442360073328018, "learning_rate": 9.909296458153528e-07, "loss": 0.2821, "step": 6396 }, { "epoch": 1.9198679471788715, "grad_norm": 0.13599467277526855, "learning_rate": 9.83587912477546e-07, "loss": 0.3295, "step": 6397 }, { "epoch": 1.9201680672268906, "grad_norm": 0.1269826591014862, "learning_rate": 9.762733431570192e-07, "loss": 0.2964, "step": 6398 }, { "epoch": 1.9204681872749099, "grad_norm": 0.1483106166124344, "learning_rate": 9.68985939860434e-07, "loss": 0.369, "step": 6399 }, { "epoch": 1.9207683073229291, "grad_norm": 0.13698875904083252, "learning_rate": 9.61725704587002e-07, "loss": 0.3383, "step": 6400 }, { "epoch": 1.9210684273709484, "grad_norm": 0.14250008761882782, "learning_rate": 9.54492639328508e-07, "loss": 0.3152, "step": 6401 }, { "epoch": 1.9213685474189677, "grad_norm": 0.19880473613739014, "learning_rate": 9.472867460692314e-07, "loss": 0.3343, "step": 6402 }, { "epoch": 1.921668667466987, "grad_norm": 0.14081600308418274, "learning_rate": 9.401080267860351e-07, "loss": 0.2955, "step": 6403 }, { "epoch": 1.921968787515006, "grad_norm": 0.13255588710308075, "learning_rate": 9.32956483448344e-07, "loss": 0.3268, "step": 6404 }, { "epoch": 1.9222689075630253, "grad_norm": 0.15398523211479187, "learning_rate": 9.258321180180663e-07, "loss": 0.3647, "step": 6405 }, { "epoch": 1.9225690276110443, "grad_norm": 0.13041505217552185, "learning_rate": 9.187349324497052e-07, "loss": 0.3074, "step": 6406 }, { "epoch": 1.9228691476590636, "grad_norm": 0.1473788172006607, "learning_rate": 9.116649286902923e-07, "loss": 0.3741, "step": 6407 }, { "epoch": 1.9231692677070829, "grad_norm": 0.14926253259181976, "learning_rate": 9.046221086793983e-07, "loss": 0.3567, "step": 6408 }, { "epoch": 1.9234693877551021, "grad_norm": 0.12680687010288239, "learning_rate": 8.976064743491552e-07, "loss": 0.2923, "step": 6409 }, { "epoch": 1.9237695078031214, "grad_norm": 0.13940927386283875, "learning_rate": 8.906180276242015e-07, "loss": 0.3216, "step": 6410 }, { "epoch": 1.9240696278511404, "grad_norm": 0.12795254588127136, "learning_rate": 8.83656770421748e-07, "loss": 0.2889, "step": 6411 }, { "epoch": 1.9243697478991597, "grad_norm": 0.12966111302375793, "learning_rate": 8.767227046515225e-07, "loss": 0.3146, "step": 6412 }, { "epoch": 1.9246698679471788, "grad_norm": 0.12147068977355957, "learning_rate": 8.698158322158256e-07, "loss": 0.2657, "step": 6413 }, { "epoch": 1.924969987995198, "grad_norm": 0.13021109998226166, "learning_rate": 8.62936155009475e-07, "loss": 0.2975, "step": 6414 }, { "epoch": 1.9252701080432173, "grad_norm": 0.18556343019008636, "learning_rate": 8.560836749198165e-07, "loss": 0.3116, "step": 6415 }, { "epoch": 1.9255702280912366, "grad_norm": 0.1517120897769928, "learning_rate": 8.492583938267684e-07, "loss": 0.3077, "step": 6416 }, { "epoch": 1.9258703481392558, "grad_norm": 0.14694081246852875, "learning_rate": 8.424603136027553e-07, "loss": 0.2696, "step": 6417 }, { "epoch": 1.9261704681872749, "grad_norm": 0.13473621010780334, "learning_rate": 8.356894361127632e-07, "loss": 0.3118, "step": 6418 }, { "epoch": 1.9264705882352942, "grad_norm": 0.13514980673789978, "learning_rate": 8.28945763214295e-07, "loss": 0.3083, "step": 6419 }, { "epoch": 1.9267707082833132, "grad_norm": 0.13456521928310394, "learning_rate": 8.22229296757393e-07, "loss": 0.3228, "step": 6420 }, { "epoch": 1.9270708283313325, "grad_norm": 0.1491681933403015, "learning_rate": 8.155400385846613e-07, "loss": 0.3463, "step": 6421 }, { "epoch": 1.9273709483793517, "grad_norm": 0.14137840270996094, "learning_rate": 8.088779905312205e-07, "loss": 0.3295, "step": 6422 }, { "epoch": 1.927671068427371, "grad_norm": 0.1492571234703064, "learning_rate": 8.022431544247089e-07, "loss": 0.3616, "step": 6423 }, { "epoch": 1.9279711884753903, "grad_norm": 0.13985922932624817, "learning_rate": 7.95635532085326e-07, "loss": 0.3373, "step": 6424 }, { "epoch": 1.9282713085234093, "grad_norm": 0.14516708254814148, "learning_rate": 7.890551253258105e-07, "loss": 0.3349, "step": 6425 }, { "epoch": 1.9285714285714286, "grad_norm": 0.1339079886674881, "learning_rate": 7.825019359514074e-07, "loss": 0.3046, "step": 6426 }, { "epoch": 1.9288715486194477, "grad_norm": 0.12760967016220093, "learning_rate": 7.759759657599008e-07, "loss": 0.293, "step": 6427 }, { "epoch": 1.929171668667467, "grad_norm": 0.13753746449947357, "learning_rate": 7.694772165416364e-07, "loss": 0.3201, "step": 6428 }, { "epoch": 1.9294717887154862, "grad_norm": 0.1318732351064682, "learning_rate": 7.630056900794658e-07, "loss": 0.3123, "step": 6429 }, { "epoch": 1.9297719087635055, "grad_norm": 0.1316746026277542, "learning_rate": 7.565613881487687e-07, "loss": 0.3313, "step": 6430 }, { "epoch": 1.9300720288115247, "grad_norm": 0.14019489288330078, "learning_rate": 7.501443125174757e-07, "loss": 0.3272, "step": 6431 }, { "epoch": 1.9303721488595438, "grad_norm": 0.13703711330890656, "learning_rate": 7.437544649460337e-07, "loss": 0.3283, "step": 6432 }, { "epoch": 1.930672268907563, "grad_norm": 0.124041348695755, "learning_rate": 7.373918471874186e-07, "loss": 0.2811, "step": 6433 }, { "epoch": 1.930972388955582, "grad_norm": 0.12873201072216034, "learning_rate": 7.310564609871451e-07, "loss": 0.3065, "step": 6434 }, { "epoch": 1.9312725090036014, "grad_norm": 0.12377699464559555, "learning_rate": 7.247483080832562e-07, "loss": 0.2889, "step": 6435 }, { "epoch": 1.9315726290516206, "grad_norm": 0.13439233601093292, "learning_rate": 7.184673902063343e-07, "loss": 0.3289, "step": 6436 }, { "epoch": 1.93187274909964, "grad_norm": 0.12906034290790558, "learning_rate": 7.122137090794457e-07, "loss": 0.2994, "step": 6437 }, { "epoch": 1.9321728691476592, "grad_norm": 0.13723750412464142, "learning_rate": 7.059872664182398e-07, "loss": 0.3202, "step": 6438 }, { "epoch": 1.9324729891956782, "grad_norm": 0.13962934911251068, "learning_rate": 6.99788063930873e-07, "loss": 0.3247, "step": 6439 }, { "epoch": 1.9327731092436975, "grad_norm": 0.1447930932044983, "learning_rate": 6.936161033180066e-07, "loss": 0.3359, "step": 6440 }, { "epoch": 1.9330732292917165, "grad_norm": 0.13536879420280457, "learning_rate": 6.874713862728643e-07, "loss": 0.3049, "step": 6441 }, { "epoch": 1.9333733493397358, "grad_norm": 0.2102050930261612, "learning_rate": 6.813539144811642e-07, "loss": 0.286, "step": 6442 }, { "epoch": 1.933673469387755, "grad_norm": 0.1379159837961197, "learning_rate": 6.75263689621175e-07, "loss": 0.3416, "step": 6443 }, { "epoch": 1.9339735894357744, "grad_norm": 0.14714962244033813, "learning_rate": 6.692007133636824e-07, "loss": 0.3524, "step": 6444 }, { "epoch": 1.9342737094837936, "grad_norm": 0.1256219893693924, "learning_rate": 6.631649873719781e-07, "loss": 0.2596, "step": 6445 }, { "epoch": 1.9345738295318127, "grad_norm": 0.13435864448547363, "learning_rate": 6.571565133019153e-07, "loss": 0.2911, "step": 6446 }, { "epoch": 1.934873949579832, "grad_norm": 0.12946513295173645, "learning_rate": 6.511752928018422e-07, "loss": 0.3092, "step": 6447 }, { "epoch": 1.935174069627851, "grad_norm": 0.2115807682275772, "learning_rate": 6.45221327512624e-07, "loss": 0.3168, "step": 6448 }, { "epoch": 1.9354741896758703, "grad_norm": 0.13688799738883972, "learning_rate": 6.392946190676763e-07, "loss": 0.3171, "step": 6449 }, { "epoch": 1.9357743097238895, "grad_norm": 0.13157397508621216, "learning_rate": 6.333951690929318e-07, "loss": 0.2952, "step": 6450 }, { "epoch": 1.9360744297719088, "grad_norm": 0.148507758975029, "learning_rate": 6.275229792068183e-07, "loss": 0.3397, "step": 6451 }, { "epoch": 1.936374549819928, "grad_norm": 0.14567214250564575, "learning_rate": 6.216780510203136e-07, "loss": 0.3507, "step": 6452 }, { "epoch": 1.9366746698679473, "grad_norm": 0.14839902520179749, "learning_rate": 6.158603861369017e-07, "loss": 0.3664, "step": 6453 }, { "epoch": 1.9369747899159664, "grad_norm": 0.138484388589859, "learning_rate": 6.10069986152595e-07, "loss": 0.3245, "step": 6454 }, { "epoch": 1.9372749099639854, "grad_norm": 0.15200287103652954, "learning_rate": 6.043068526559337e-07, "loss": 0.3764, "step": 6455 }, { "epoch": 1.9375750300120047, "grad_norm": 0.13714081048965454, "learning_rate": 5.985709872279644e-07, "loss": 0.3084, "step": 6456 }, { "epoch": 1.937875150060024, "grad_norm": 0.14301633834838867, "learning_rate": 5.928623914422282e-07, "loss": 0.3569, "step": 6457 }, { "epoch": 1.9381752701080432, "grad_norm": 0.13486430048942566, "learning_rate": 5.871810668648503e-07, "loss": 0.3099, "step": 6458 }, { "epoch": 1.9384753901560625, "grad_norm": 0.14129821956157684, "learning_rate": 5.815270150544172e-07, "loss": 0.3073, "step": 6459 }, { "epoch": 1.9387755102040818, "grad_norm": 0.13441157341003418, "learning_rate": 5.759002375620548e-07, "loss": 0.309, "step": 6460 }, { "epoch": 1.9390756302521008, "grad_norm": 0.1334627866744995, "learning_rate": 5.703007359314172e-07, "loss": 0.3143, "step": 6461 }, { "epoch": 1.9393757503001199, "grad_norm": 0.13053907454013824, "learning_rate": 5.647285116986311e-07, "loss": 0.2893, "step": 6462 }, { "epoch": 1.9396758703481392, "grad_norm": 0.1443127542734146, "learning_rate": 5.591835663924183e-07, "loss": 0.2946, "step": 6463 }, { "epoch": 1.9399759903961584, "grad_norm": 0.14227718114852905, "learning_rate": 5.536659015339396e-07, "loss": 0.323, "step": 6464 }, { "epoch": 1.9402761104441777, "grad_norm": 0.13019327819347382, "learning_rate": 5.481755186369064e-07, "loss": 0.277, "step": 6465 }, { "epoch": 1.940576230492197, "grad_norm": 0.19664429128170013, "learning_rate": 5.427124192075473e-07, "loss": 0.3693, "step": 6466 }, { "epoch": 1.9408763505402162, "grad_norm": 0.13815642893314362, "learning_rate": 5.372766047446076e-07, "loss": 0.3371, "step": 6467 }, { "epoch": 1.9411764705882353, "grad_norm": 0.14420412480831146, "learning_rate": 5.31868076739328e-07, "loss": 0.3298, "step": 6468 }, { "epoch": 1.9414765906362546, "grad_norm": 0.1388360857963562, "learning_rate": 5.264868366754772e-07, "loss": 0.2977, "step": 6469 }, { "epoch": 1.9417767106842736, "grad_norm": 0.13571693003177643, "learning_rate": 5.211328860293519e-07, "loss": 0.3245, "step": 6470 }, { "epoch": 1.9420768307322929, "grad_norm": 0.1365104615688324, "learning_rate": 5.158062262697217e-07, "loss": 0.3233, "step": 6471 }, { "epoch": 1.9423769507803121, "grad_norm": 0.1375596821308136, "learning_rate": 5.105068588579176e-07, "loss": 0.344, "step": 6472 }, { "epoch": 1.9426770708283314, "grad_norm": 0.14423203468322754, "learning_rate": 5.052347852477546e-07, "loss": 0.3578, "step": 6473 }, { "epoch": 1.9429771908763507, "grad_norm": 0.1386280208826065, "learning_rate": 4.999900068855645e-07, "loss": 0.3251, "step": 6474 }, { "epoch": 1.9432773109243697, "grad_norm": 0.13474127650260925, "learning_rate": 4.947725252101854e-07, "loss": 0.3256, "step": 6475 }, { "epoch": 1.943577430972389, "grad_norm": 0.13253982365131378, "learning_rate": 4.895823416529832e-07, "loss": 0.2867, "step": 6476 }, { "epoch": 1.943877551020408, "grad_norm": 0.13384990394115448, "learning_rate": 4.844194576378191e-07, "loss": 0.3187, "step": 6477 }, { "epoch": 1.9441776710684273, "grad_norm": 0.1321321576833725, "learning_rate": 4.792838745810712e-07, "loss": 0.3056, "step": 6478 }, { "epoch": 1.9444777911164466, "grad_norm": 0.13640731573104858, "learning_rate": 4.741755938916237e-07, "loss": 0.3207, "step": 6479 }, { "epoch": 1.9447779111644659, "grad_norm": 0.1355549395084381, "learning_rate": 4.6909461697088874e-07, "loss": 0.3141, "step": 6480 }, { "epoch": 1.9450780312124851, "grad_norm": 0.13593652844429016, "learning_rate": 4.6404094521276256e-07, "loss": 0.3282, "step": 6481 }, { "epoch": 1.9453781512605042, "grad_norm": 0.13409483432769775, "learning_rate": 4.5901458000366937e-07, "loss": 0.3311, "step": 6482 }, { "epoch": 1.9456782713085234, "grad_norm": 0.12935426831245422, "learning_rate": 4.5401552272252847e-07, "loss": 0.2935, "step": 6483 }, { "epoch": 1.9459783913565425, "grad_norm": 0.14159248769283295, "learning_rate": 4.490437747407761e-07, "loss": 0.3422, "step": 6484 }, { "epoch": 1.9462785114045618, "grad_norm": 0.12727589905261993, "learning_rate": 4.4409933742235455e-07, "loss": 0.2879, "step": 6485 }, { "epoch": 1.946578631452581, "grad_norm": 0.14284352958202362, "learning_rate": 4.3918221212371215e-07, "loss": 0.3316, "step": 6486 }, { "epoch": 1.9468787515006003, "grad_norm": 0.12972243130207062, "learning_rate": 4.3429240019380313e-07, "loss": 0.3138, "step": 6487 }, { "epoch": 1.9471788715486196, "grad_norm": 0.13691653311252594, "learning_rate": 4.2942990297411003e-07, "loss": 0.3217, "step": 6488 }, { "epoch": 1.9474789915966386, "grad_norm": 0.13877379894256592, "learning_rate": 4.2459472179857683e-07, "loss": 0.3463, "step": 6489 }, { "epoch": 1.947779111644658, "grad_norm": 0.13706190884113312, "learning_rate": 4.197868579936981e-07, "loss": 0.3185, "step": 6490 }, { "epoch": 1.948079231692677, "grad_norm": 0.11960577964782715, "learning_rate": 4.1500631287844095e-07, "loss": 0.27, "step": 6491 }, { "epoch": 1.9483793517406962, "grad_norm": 0.13107003271579742, "learning_rate": 4.1025308776430074e-07, "loss": 0.2798, "step": 6492 }, { "epoch": 1.9486794717887155, "grad_norm": 0.1464453488588333, "learning_rate": 4.055271839552788e-07, "loss": 0.3588, "step": 6493 }, { "epoch": 1.9489795918367347, "grad_norm": 0.13021965324878693, "learning_rate": 4.0082860274787136e-07, "loss": 0.3166, "step": 6494 }, { "epoch": 1.949279711884754, "grad_norm": 0.13702203333377838, "learning_rate": 3.9615734543106965e-07, "loss": 0.3327, "step": 6495 }, { "epoch": 1.949579831932773, "grad_norm": 0.23359109461307526, "learning_rate": 3.915134132863707e-07, "loss": 0.3223, "step": 6496 }, { "epoch": 1.9498799519807923, "grad_norm": 0.14288662374019623, "learning_rate": 3.8689680758781097e-07, "loss": 0.3269, "step": 6497 }, { "epoch": 1.9501800720288114, "grad_norm": 0.1728636920452118, "learning_rate": 3.8230752960188856e-07, "loss": 0.363, "step": 6498 }, { "epoch": 1.9504801920768307, "grad_norm": 0.14792786538600922, "learning_rate": 3.777455805876184e-07, "loss": 0.3396, "step": 6499 }, { "epoch": 1.95078031212485, "grad_norm": 0.13643766939640045, "learning_rate": 3.732109617965218e-07, "loss": 0.3061, "step": 6500 }, { "epoch": 1.9510804321728692, "grad_norm": 0.13429324328899384, "learning_rate": 3.6870367447262575e-07, "loss": 0.3222, "step": 6501 }, { "epoch": 1.9513805522208885, "grad_norm": 0.13534994423389435, "learning_rate": 3.642237198524412e-07, "loss": 0.3378, "step": 6502 }, { "epoch": 1.9516806722689075, "grad_norm": 0.12566448748111725, "learning_rate": 3.59771099164985e-07, "loss": 0.2892, "step": 6503 }, { "epoch": 1.9519807923169268, "grad_norm": 0.13708549737930298, "learning_rate": 3.553458136318022e-07, "loss": 0.2964, "step": 6504 }, { "epoch": 1.9522809123649458, "grad_norm": 0.12881408631801605, "learning_rate": 3.5094786446692176e-07, "loss": 0.3036, "step": 6505 }, { "epoch": 1.952581032412965, "grad_norm": 0.13288307189941406, "learning_rate": 3.465772528768452e-07, "loss": 0.2775, "step": 6506 }, { "epoch": 1.9528811524609844, "grad_norm": 0.13816514611244202, "learning_rate": 3.422339800606245e-07, "loss": 0.3356, "step": 6507 }, { "epoch": 1.9531812725090036, "grad_norm": 0.15389056503772736, "learning_rate": 3.3791804720977317e-07, "loss": 0.3055, "step": 6508 }, { "epoch": 1.953481392557023, "grad_norm": 0.1330726593732834, "learning_rate": 3.3362945550832193e-07, "loss": 0.3045, "step": 6509 }, { "epoch": 1.9537815126050422, "grad_norm": 0.15567626059055328, "learning_rate": 3.293682061327963e-07, "loss": 0.3367, "step": 6510 }, { "epoch": 1.9540816326530612, "grad_norm": 0.1269209086894989, "learning_rate": 3.251343002522278e-07, "loss": 0.2885, "step": 6511 }, { "epoch": 1.9543817527010803, "grad_norm": 0.14024633169174194, "learning_rate": 3.2092773902812065e-07, "loss": 0.3536, "step": 6512 }, { "epoch": 1.9546818727490995, "grad_norm": 0.13049684464931488, "learning_rate": 3.1674852361451845e-07, "loss": 0.3069, "step": 6513 }, { "epoch": 1.9549819927971188, "grad_norm": 0.1355288028717041, "learning_rate": 3.1259665515793736e-07, "loss": 0.3177, "step": 6514 }, { "epoch": 1.955282112845138, "grad_norm": 0.1447446644306183, "learning_rate": 3.0847213479737736e-07, "loss": 0.3527, "step": 6515 }, { "epoch": 1.9555822328931574, "grad_norm": 0.13086079061031342, "learning_rate": 3.043749636643778e-07, "loss": 0.3017, "step": 6516 }, { "epoch": 1.9558823529411766, "grad_norm": 0.1448763906955719, "learning_rate": 3.0030514288292843e-07, "loss": 0.3334, "step": 6517 }, { "epoch": 1.9561824729891957, "grad_norm": 0.13058732450008392, "learning_rate": 2.962626735695584e-07, "loss": 0.301, "step": 6518 }, { "epoch": 1.9564825930372147, "grad_norm": 0.13642174005508423, "learning_rate": 2.9224755683325835e-07, "loss": 0.3271, "step": 6519 }, { "epoch": 1.956782713085234, "grad_norm": 0.16053105890750885, "learning_rate": 2.882597937755249e-07, "loss": 0.3601, "step": 6520 }, { "epoch": 1.9570828331332533, "grad_norm": 0.13160178065299988, "learning_rate": 2.8429938549037195e-07, "loss": 0.3119, "step": 6521 }, { "epoch": 1.9573829531812725, "grad_norm": 0.13854114711284637, "learning_rate": 2.803663330642747e-07, "loss": 0.3053, "step": 6522 }, { "epoch": 1.9576830732292918, "grad_norm": 0.13475844264030457, "learning_rate": 2.7646063757623684e-07, "loss": 0.3062, "step": 6523 }, { "epoch": 1.957983193277311, "grad_norm": 0.1336299628019333, "learning_rate": 2.7258230009774564e-07, "loss": 0.3118, "step": 6524 }, { "epoch": 1.9582833133253301, "grad_norm": 0.1466909497976303, "learning_rate": 2.6873132169275005e-07, "loss": 0.3514, "step": 6525 }, { "epoch": 1.9585834333733494, "grad_norm": 0.12885448336601257, "learning_rate": 2.649077034177494e-07, "loss": 0.297, "step": 6526 }, { "epoch": 1.9588835534213684, "grad_norm": 0.12889419496059418, "learning_rate": 2.6111144632169347e-07, "loss": 0.2792, "step": 6527 }, { "epoch": 1.9591836734693877, "grad_norm": 0.24885424971580505, "learning_rate": 2.5734255144604923e-07, "loss": 0.3579, "step": 6528 }, { "epoch": 1.959483793517407, "grad_norm": 0.1575038582086563, "learning_rate": 2.5360101982476735e-07, "loss": 0.3783, "step": 6529 }, { "epoch": 1.9597839135654262, "grad_norm": 0.16567088663578033, "learning_rate": 2.498868524843045e-07, "loss": 0.3695, "step": 6530 }, { "epoch": 1.9600840336134455, "grad_norm": 0.14174482226371765, "learning_rate": 2.462000504435791e-07, "loss": 0.3543, "step": 6531 }, { "epoch": 1.9603841536614646, "grad_norm": 0.1361747682094574, "learning_rate": 2.4254061471403745e-07, "loss": 0.2955, "step": 6532 }, { "epoch": 1.9606842737094838, "grad_norm": 0.1308164745569229, "learning_rate": 2.389085462995988e-07, "loss": 0.2976, "step": 6533 }, { "epoch": 1.9609843937575029, "grad_norm": 0.1284637451171875, "learning_rate": 2.3530384619668833e-07, "loss": 0.2951, "step": 6534 }, { "epoch": 1.9612845138055222, "grad_norm": 0.13373473286628723, "learning_rate": 2.3172651539420387e-07, "loss": 0.3109, "step": 6535 }, { "epoch": 1.9615846338535414, "grad_norm": 0.1285080760717392, "learning_rate": 2.2817655487353819e-07, "loss": 0.3022, "step": 6536 }, { "epoch": 1.9618847539015607, "grad_norm": 0.14141826331615448, "learning_rate": 2.246539656086011e-07, "loss": 0.336, "step": 6537 }, { "epoch": 1.96218487394958, "grad_norm": 0.1411728709936142, "learning_rate": 2.2115874856577512e-07, "loss": 0.3233, "step": 6538 }, { "epoch": 1.962484993997599, "grad_norm": 0.1605583131313324, "learning_rate": 2.1769090470391552e-07, "loss": 0.3788, "step": 6539 }, { "epoch": 1.9627851140456183, "grad_norm": 0.13145780563354492, "learning_rate": 2.1425043497439456e-07, "loss": 0.3067, "step": 6540 }, { "epoch": 1.9630852340936373, "grad_norm": 0.39733070135116577, "learning_rate": 2.108373403210573e-07, "loss": 0.3441, "step": 6541 }, { "epoch": 1.9633853541416566, "grad_norm": 0.1398887187242508, "learning_rate": 2.0745162168026576e-07, "loss": 0.3276, "step": 6542 }, { "epoch": 1.9636854741896759, "grad_norm": 0.14685961604118347, "learning_rate": 2.040932799808326e-07, "loss": 0.3502, "step": 6543 }, { "epoch": 1.9639855942376951, "grad_norm": 0.137023463845253, "learning_rate": 2.0076231614409858e-07, "loss": 0.3303, "step": 6544 }, { "epoch": 1.9642857142857144, "grad_norm": 0.1411551833152771, "learning_rate": 1.9745873108385494e-07, "loss": 0.3581, "step": 6545 }, { "epoch": 1.9645858343337335, "grad_norm": 0.13286294043064117, "learning_rate": 1.9418252570642115e-07, "loss": 0.2939, "step": 6546 }, { "epoch": 1.9648859543817527, "grad_norm": 0.13654114305973053, "learning_rate": 1.9093370091057826e-07, "loss": 0.2977, "step": 6547 }, { "epoch": 1.9651860744297718, "grad_norm": 0.14429955184459686, "learning_rate": 1.8771225758761334e-07, "loss": 0.3418, "step": 6548 }, { "epoch": 1.965486194477791, "grad_norm": 0.13695666193962097, "learning_rate": 1.84518196621275e-07, "loss": 0.3337, "step": 6549 }, { "epoch": 1.9657863145258103, "grad_norm": 0.1368245631456375, "learning_rate": 1.8135151888782899e-07, "loss": 0.3357, "step": 6550 }, { "epoch": 1.9660864345738296, "grad_norm": 0.13573360443115234, "learning_rate": 1.7821222525601367e-07, "loss": 0.3194, "step": 6551 }, { "epoch": 1.9663865546218489, "grad_norm": 0.12387190759181976, "learning_rate": 1.751003165870624e-07, "loss": 0.2838, "step": 6552 }, { "epoch": 1.966686674669868, "grad_norm": 0.1596154421567917, "learning_rate": 1.7201579373469222e-07, "loss": 0.2843, "step": 6553 }, { "epoch": 1.9669867947178872, "grad_norm": 0.1379440575838089, "learning_rate": 1.68958657545093e-07, "loss": 0.3187, "step": 6554 }, { "epoch": 1.9672869147659062, "grad_norm": 0.14830918610095978, "learning_rate": 1.6592890885697154e-07, "loss": 0.3553, "step": 6555 }, { "epoch": 1.9675870348139255, "grad_norm": 0.1428672969341278, "learning_rate": 1.6292654850149635e-07, "loss": 0.3343, "step": 6556 }, { "epoch": 1.9678871548619448, "grad_norm": 0.1255159080028534, "learning_rate": 1.5995157730233078e-07, "loss": 0.2831, "step": 6557 }, { "epoch": 1.968187274909964, "grad_norm": 0.13673074543476105, "learning_rate": 1.57003996075622e-07, "loss": 0.3325, "step": 6558 }, { "epoch": 1.9684873949579833, "grad_norm": 0.1297430694103241, "learning_rate": 1.5408380563001201e-07, "loss": 0.3088, "step": 6559 }, { "epoch": 1.9687875150060024, "grad_norm": 0.14209860563278198, "learning_rate": 1.5119100676662667e-07, "loss": 0.3209, "step": 6560 }, { "epoch": 1.9690876350540216, "grad_norm": 0.14692017436027527, "learning_rate": 1.483256002790534e-07, "loss": 0.3308, "step": 6561 }, { "epoch": 1.9693877551020407, "grad_norm": 0.22346758842468262, "learning_rate": 1.454875869533967e-07, "loss": 0.3626, "step": 6562 }, { "epoch": 1.96968787515006, "grad_norm": 0.13532333076000214, "learning_rate": 1.426769675682227e-07, "loss": 0.3124, "step": 6563 }, { "epoch": 1.9699879951980792, "grad_norm": 0.14279678463935852, "learning_rate": 1.3989374289461453e-07, "loss": 0.3689, "step": 6564 }, { "epoch": 1.9702881152460985, "grad_norm": 0.13842518627643585, "learning_rate": 1.3713791369609485e-07, "loss": 0.3364, "step": 6565 }, { "epoch": 1.9705882352941178, "grad_norm": 0.12624448537826538, "learning_rate": 1.344094807287033e-07, "loss": 0.3029, "step": 6566 }, { "epoch": 1.9708883553421368, "grad_norm": 0.14754411578178406, "learning_rate": 1.3170844474095223e-07, "loss": 0.3388, "step": 6567 }, { "epoch": 1.971188475390156, "grad_norm": 0.1261216700077057, "learning_rate": 1.290348064738378e-07, "loss": 0.2859, "step": 6568 }, { "epoch": 1.9714885954381751, "grad_norm": 0.14190185070037842, "learning_rate": 1.2638856666085108e-07, "loss": 0.3544, "step": 6569 }, { "epoch": 1.9717887154861944, "grad_norm": 0.14383383095264435, "learning_rate": 1.2376972602795578e-07, "loss": 0.3575, "step": 6570 }, { "epoch": 1.9720888355342137, "grad_norm": 0.13956892490386963, "learning_rate": 1.2117828529358832e-07, "loss": 0.3361, "step": 6571 }, { "epoch": 1.972388955582233, "grad_norm": 0.14807261526584625, "learning_rate": 1.1861424516869113e-07, "loss": 0.3459, "step": 6572 }, { "epoch": 1.9726890756302522, "grad_norm": 0.13360875844955444, "learning_rate": 1.160776063566793e-07, "loss": 0.315, "step": 6573 }, { "epoch": 1.9729891956782715, "grad_norm": 0.14176610112190247, "learning_rate": 1.1356836955345174e-07, "loss": 0.3671, "step": 6574 }, { "epoch": 1.9732893157262905, "grad_norm": 0.143992617726326, "learning_rate": 1.1108653544738001e-07, "loss": 0.3537, "step": 6575 }, { "epoch": 1.9735894357743096, "grad_norm": 0.1435699164867401, "learning_rate": 1.086321047193306e-07, "loss": 0.2967, "step": 6576 }, { "epoch": 1.9738895558223288, "grad_norm": 0.14607293903827667, "learning_rate": 1.0620507804265378e-07, "loss": 0.3161, "step": 6577 }, { "epoch": 1.974189675870348, "grad_norm": 0.14399291574954987, "learning_rate": 1.0380545608317249e-07, "loss": 0.3256, "step": 6578 }, { "epoch": 1.9744897959183674, "grad_norm": 0.14030712842941284, "learning_rate": 1.0143323949919348e-07, "loss": 0.3303, "step": 6579 }, { "epoch": 1.9747899159663866, "grad_norm": 0.1557692587375641, "learning_rate": 9.908842894151837e-08, "loss": 0.3634, "step": 6580 }, { "epoch": 1.975090036014406, "grad_norm": 0.1479206681251526, "learning_rate": 9.67710250533993e-08, "loss": 0.3574, "step": 6581 }, { "epoch": 1.975390156062425, "grad_norm": 0.14016804099082947, "learning_rate": 9.448102847060548e-08, "loss": 0.3373, "step": 6582 }, { "epoch": 1.9756902761104442, "grad_norm": 0.2394646406173706, "learning_rate": 9.221843982136768e-08, "loss": 0.301, "step": 6583 }, { "epoch": 1.9759903961584633, "grad_norm": 0.1508098840713501, "learning_rate": 8.998325972640053e-08, "loss": 0.3216, "step": 6584 }, { "epoch": 1.9762905162064826, "grad_norm": 0.13166458904743195, "learning_rate": 8.77754887989024e-08, "loss": 0.3017, "step": 6585 }, { "epoch": 1.9765906362545018, "grad_norm": 0.13379202783107758, "learning_rate": 8.559512764454436e-08, "loss": 0.3, "step": 6586 }, { "epoch": 1.976890756302521, "grad_norm": 0.2006409913301468, "learning_rate": 8.344217686148125e-08, "loss": 0.3411, "step": 6587 }, { "epoch": 1.9771908763505404, "grad_norm": 0.1287575513124466, "learning_rate": 8.131663704035176e-08, "loss": 0.3021, "step": 6588 }, { "epoch": 1.9774909963985594, "grad_norm": 0.13200537860393524, "learning_rate": 7.921850876428937e-08, "loss": 0.3174, "step": 6589 }, { "epoch": 1.9777911164465787, "grad_norm": 0.13923829793930054, "learning_rate": 7.714779260886707e-08, "loss": 0.3576, "step": 6590 }, { "epoch": 1.9780912364945977, "grad_norm": 0.15228983759880066, "learning_rate": 7.510448914217483e-08, "loss": 0.3829, "step": 6591 }, { "epoch": 1.978391356542617, "grad_norm": 0.12499164044857025, "learning_rate": 7.308859892477538e-08, "loss": 0.2745, "step": 6592 }, { "epoch": 1.9786914765906363, "grad_norm": 0.12901915609836578, "learning_rate": 7.1100122509693e-08, "loss": 0.2978, "step": 6593 }, { "epoch": 1.9789915966386555, "grad_norm": 0.4345323145389557, "learning_rate": 6.9139060442458e-08, "loss": 0.3215, "step": 6594 }, { "epoch": 1.9792917166866748, "grad_norm": 0.14252708852291107, "learning_rate": 6.720541326105112e-08, "loss": 0.3377, "step": 6595 }, { "epoch": 1.9795918367346939, "grad_norm": 0.13123819231987, "learning_rate": 6.529918149594805e-08, "loss": 0.2767, "step": 6596 }, { "epoch": 1.9798919567827131, "grad_norm": 0.13903144001960754, "learning_rate": 6.342036567009713e-08, "loss": 0.3356, "step": 6597 }, { "epoch": 1.9801920768307322, "grad_norm": 0.12551505863666534, "learning_rate": 6.15689662989527e-08, "loss": 0.282, "step": 6598 }, { "epoch": 1.9804921968787514, "grad_norm": 0.13220173120498657, "learning_rate": 5.97449838903974e-08, "loss": 0.3032, "step": 6599 }, { "epoch": 1.9807923169267707, "grad_norm": 0.13848182559013367, "learning_rate": 5.7948418944842043e-08, "loss": 0.313, "step": 6600 }, { "epoch": 1.98109243697479, "grad_norm": 0.13549165427684784, "learning_rate": 5.617927195513684e-08, "loss": 0.2995, "step": 6601 }, { "epoch": 1.9813925570228093, "grad_norm": 0.12920106947422028, "learning_rate": 5.443754340663798e-08, "loss": 0.2794, "step": 6602 }, { "epoch": 1.9816926770708283, "grad_norm": 0.13664115965366364, "learning_rate": 5.2723233777163264e-08, "loss": 0.3129, "step": 6603 }, { "epoch": 1.9819927971188476, "grad_norm": 0.13307543098926544, "learning_rate": 5.103634353701425e-08, "loss": 0.3056, "step": 6604 }, { "epoch": 1.9822929171668666, "grad_norm": 0.12146459519863129, "learning_rate": 4.937687314897632e-08, "loss": 0.273, "step": 6605 }, { "epoch": 1.982593037214886, "grad_norm": 0.14381779730319977, "learning_rate": 4.774482306829642e-08, "loss": 0.3365, "step": 6606 }, { "epoch": 1.9828931572629052, "grad_norm": 0.13763312995433807, "learning_rate": 4.6140193742716386e-08, "loss": 0.2844, "step": 6607 }, { "epoch": 1.9831932773109244, "grad_norm": 0.14180998504161835, "learning_rate": 4.4562985612439654e-08, "loss": 0.3379, "step": 6608 }, { "epoch": 1.9834933973589437, "grad_norm": 0.13503345847129822, "learning_rate": 4.3013199110164546e-08, "loss": 0.3064, "step": 6609 }, { "epoch": 1.9837935174069627, "grad_norm": 0.1461196392774582, "learning_rate": 4.149083466105097e-08, "loss": 0.3128, "step": 6610 }, { "epoch": 1.984093637454982, "grad_norm": 0.13612636923789978, "learning_rate": 3.999589268274262e-08, "loss": 0.314, "step": 6611 }, { "epoch": 1.984393757503001, "grad_norm": 0.13711991906166077, "learning_rate": 3.852837358535588e-08, "loss": 0.333, "step": 6612 }, { "epoch": 1.9846938775510203, "grad_norm": 0.13318751752376556, "learning_rate": 3.708827777150203e-08, "loss": 0.3011, "step": 6613 }, { "epoch": 1.9849939975990396, "grad_norm": 0.13475048542022705, "learning_rate": 3.5675605636242834e-08, "loss": 0.2911, "step": 6614 }, { "epoch": 1.9852941176470589, "grad_norm": 0.141270250082016, "learning_rate": 3.429035756713494e-08, "loss": 0.2776, "step": 6615 }, { "epoch": 1.9855942376950781, "grad_norm": 0.13348527252674103, "learning_rate": 3.29325339441966e-08, "loss": 0.3176, "step": 6616 }, { "epoch": 1.9858943577430972, "grad_norm": 0.1243927925825119, "learning_rate": 3.160213513994093e-08, "loss": 0.2785, "step": 6617 }, { "epoch": 1.9861944777911165, "grad_norm": 0.12726986408233643, "learning_rate": 3.029916151934264e-08, "loss": 0.2822, "step": 6618 }, { "epoch": 1.9864945978391355, "grad_norm": 0.1278541535139084, "learning_rate": 2.9023613439860264e-08, "loss": 0.2964, "step": 6619 }, { "epoch": 1.9867947178871548, "grad_norm": 0.13514913618564606, "learning_rate": 2.7775491251413877e-08, "loss": 0.2679, "step": 6620 }, { "epoch": 1.987094837935174, "grad_norm": 0.14197710156440735, "learning_rate": 2.655479529642957e-08, "loss": 0.3246, "step": 6621 }, { "epoch": 1.9873949579831933, "grad_norm": 0.1379374861717224, "learning_rate": 2.536152590978391e-08, "loss": 0.2937, "step": 6622 }, { "epoch": 1.9876950780312126, "grad_norm": 0.14387285709381104, "learning_rate": 2.4195683418826166e-08, "loss": 0.3585, "step": 6623 }, { "epoch": 1.9879951980792316, "grad_norm": 0.1456388235092163, "learning_rate": 2.305726814341158e-08, "loss": 0.362, "step": 6624 }, { "epoch": 1.988295318127251, "grad_norm": 0.13793382048606873, "learning_rate": 2.1946280395845896e-08, "loss": 0.3162, "step": 6625 }, { "epoch": 1.98859543817527, "grad_norm": 0.14543136954307556, "learning_rate": 2.0862720480896437e-08, "loss": 0.3259, "step": 6626 }, { "epoch": 1.9888955582232892, "grad_norm": 0.14183880388736725, "learning_rate": 1.9806588695847616e-08, "loss": 0.3312, "step": 6627 }, { "epoch": 1.9891956782713085, "grad_norm": 0.14327046275138855, "learning_rate": 1.8777885330434343e-08, "loss": 0.3273, "step": 6628 }, { "epoch": 1.9894957983193278, "grad_norm": 0.14274021983146667, "learning_rate": 1.77766106668531e-08, "loss": 0.3486, "step": 6629 }, { "epoch": 1.989795918367347, "grad_norm": 0.13794536888599396, "learning_rate": 1.6802764979817474e-08, "loss": 0.3162, "step": 6630 }, { "epoch": 1.9900960384153663, "grad_norm": 0.1309654861688614, "learning_rate": 1.5856348536469335e-08, "loss": 0.2896, "step": 6631 }, { "epoch": 1.9903961584633854, "grad_norm": 0.12529993057250977, "learning_rate": 1.4937361596456533e-08, "loss": 0.2938, "step": 6632 }, { "epoch": 1.9906962785114044, "grad_norm": 0.12475324422121048, "learning_rate": 1.4045804411888519e-08, "loss": 0.3028, "step": 6633 }, { "epoch": 1.9909963985594237, "grad_norm": 0.14064736664295197, "learning_rate": 1.3181677227358524e-08, "loss": 0.3366, "step": 6634 }, { "epoch": 1.991296518607443, "grad_norm": 0.15773440897464752, "learning_rate": 1.2344980279932472e-08, "loss": 0.3596, "step": 6635 }, { "epoch": 1.9915966386554622, "grad_norm": 0.13815179467201233, "learning_rate": 1.153571379913787e-08, "loss": 0.3334, "step": 6636 }, { "epoch": 1.9918967587034815, "grad_norm": 0.12979494035243988, "learning_rate": 1.075387800699712e-08, "loss": 0.3144, "step": 6637 }, { "epoch": 1.9921968787515008, "grad_norm": 0.1313927173614502, "learning_rate": 9.999473117994207e-09, "loss": 0.2959, "step": 6638 }, { "epoch": 1.9924969987995198, "grad_norm": 0.15096567571163177, "learning_rate": 9.272499339096907e-09, "loss": 0.3656, "step": 6639 }, { "epoch": 1.9927971188475389, "grad_norm": 0.13682463765144348, "learning_rate": 8.572956869734583e-09, "loss": 0.3431, "step": 6640 }, { "epoch": 1.9930972388955581, "grad_norm": 0.14152565598487854, "learning_rate": 7.900845901820386e-09, "loss": 0.3461, "step": 6641 }, { "epoch": 1.9933973589435774, "grad_norm": 0.13964256644248962, "learning_rate": 7.256166619729055e-09, "loss": 0.3275, "step": 6642 }, { "epoch": 1.9936974789915967, "grad_norm": 0.13702847063541412, "learning_rate": 6.638919200352423e-09, "loss": 0.3272, "step": 6643 }, { "epoch": 1.993997599039616, "grad_norm": 0.13425695896148682, "learning_rate": 6.049103812988399e-09, "loss": 0.3083, "step": 6644 }, { "epoch": 1.9942977190876352, "grad_norm": 0.1240556463599205, "learning_rate": 5.486720619474195e-09, "loss": 0.2745, "step": 6645 }, { "epoch": 1.9945978391356542, "grad_norm": 0.1433292180299759, "learning_rate": 4.9517697740864014e-09, "loss": 0.3145, "step": 6646 }, { "epoch": 1.9948979591836735, "grad_norm": 0.1353660225868225, "learning_rate": 4.444251423563195e-09, "loss": 0.316, "step": 6647 }, { "epoch": 1.9951980792316926, "grad_norm": 0.1324574053287506, "learning_rate": 3.964165707170953e-09, "loss": 0.3157, "step": 6648 }, { "epoch": 1.9954981992797118, "grad_norm": 0.14322207868099213, "learning_rate": 3.5115127565821248e-09, "loss": 0.3434, "step": 6649 }, { "epoch": 1.995798319327731, "grad_norm": 0.143235981464386, "learning_rate": 3.0862926959973617e-09, "loss": 0.2979, "step": 6650 }, { "epoch": 1.9960984393757504, "grad_norm": 0.14621366560459137, "learning_rate": 2.6885056420677955e-09, "loss": 0.3256, "step": 6651 }, { "epoch": 1.9963985594237696, "grad_norm": 0.12844175100326538, "learning_rate": 2.3181517039061462e-09, "loss": 0.2874, "step": 6652 }, { "epoch": 1.9966986794717887, "grad_norm": 0.14089906215667725, "learning_rate": 1.975230983142229e-09, "loss": 0.3557, "step": 6653 }, { "epoch": 1.996998799519808, "grad_norm": 0.13490818440914154, "learning_rate": 1.6597435738341384e-09, "loss": 0.3061, "step": 6654 }, { "epoch": 1.997298919567827, "grad_norm": 0.13551375269889832, "learning_rate": 1.3716895625348614e-09, "loss": 0.2963, "step": 6655 }, { "epoch": 1.9975990396158463, "grad_norm": 0.14081865549087524, "learning_rate": 1.111069028258971e-09, "loss": 0.2799, "step": 6656 }, { "epoch": 1.9978991596638656, "grad_norm": 0.13580313324928284, "learning_rate": 8.778820425270339e-10, "loss": 0.3239, "step": 6657 }, { "epoch": 1.9981992797118848, "grad_norm": 0.13394388556480408, "learning_rate": 6.721286692989992e-10, "loss": 0.3128, "step": 6658 }, { "epoch": 1.998499399759904, "grad_norm": 0.13376793265342712, "learning_rate": 4.938089650186051e-10, "loss": 0.3188, "step": 6659 }, { "epoch": 1.9987995198079231, "grad_norm": 0.1346546858549118, "learning_rate": 3.429229786133803e-10, "loss": 0.3196, "step": 6660 }, { "epoch": 1.9990996398559424, "grad_norm": 0.13749544322490692, "learning_rate": 2.1947075147243924e-10, "loss": 0.3127, "step": 6661 }, { "epoch": 1.9993997599039615, "grad_norm": 0.17157703638076782, "learning_rate": 1.2345231745758412e-10, "loss": 0.3164, "step": 6662 }, { "epoch": 1.9996998799519807, "grad_norm": 0.13034595549106598, "learning_rate": 5.486770291440735e-11, "loss": 0.3217, "step": 6663 }, { "epoch": 2.0, "grad_norm": 0.12287092208862305, "learning_rate": 1.3716926672291408e-11, "loss": 0.2773, "step": 6664 }, { "epoch": 2.0, "eval_loss": 0.27432090044021606, "eval_runtime": 3791.9201, "eval_samples_per_second": 12.219, "eval_steps_per_second": 0.764, "step": 6664 } ], "logging_steps": 1, "max_steps": 6664, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.481222963259716e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }