{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": 36.86253356933594, "learning_rate": 0.0, "loss": 6.970664024353027, "step": 1 }, { "epoch": 0.0032, "grad_norm": 46.62815475463867, "learning_rate": 1e-05, "loss": 7.119298934936523, "step": 2 }, { "epoch": 0.0048, "grad_norm": 38.75471878051758, "learning_rate": 2e-05, "loss": 6.804569721221924, "step": 3 }, { "epoch": 0.0064, "grad_norm": 23.79888343811035, "learning_rate": 3e-05, "loss": 6.857824325561523, "step": 4 }, { "epoch": 0.008, "grad_norm": 21.74065589904785, "learning_rate": 4e-05, "loss": 6.550006866455078, "step": 5 }, { "epoch": 0.0096, "grad_norm": 18.586503982543945, "learning_rate": 5e-05, "loss": 6.686573028564453, "step": 6 }, { "epoch": 0.0112, "grad_norm": 15.070769309997559, "learning_rate": 6e-05, "loss": 6.578455924987793, "step": 7 }, { "epoch": 0.0128, "grad_norm": 7.648688316345215, "learning_rate": 7.000000000000001e-05, "loss": 6.312182426452637, "step": 8 }, { "epoch": 0.0144, "grad_norm": 11.109657287597656, "learning_rate": 8e-05, "loss": 6.330634593963623, "step": 9 }, { "epoch": 0.016, "grad_norm": 11.878482818603516, "learning_rate": 8.999999999999999e-05, "loss": 6.246118545532227, "step": 10 }, { "epoch": 0.0176, "grad_norm": 15.020891189575195, "learning_rate": 0.0001, "loss": 6.080811500549316, "step": 11 }, { "epoch": 0.0192, "grad_norm": 10.975037574768066, "learning_rate": 0.00011, "loss": 6.293004989624023, "step": 12 }, { "epoch": 0.0208, "grad_norm": 5.8413214683532715, "learning_rate": 0.00012, "loss": 6.052936553955078, "step": 13 }, { "epoch": 0.0224, "grad_norm": 11.264659881591797, "learning_rate": 0.00013000000000000002, "loss": 6.178928852081299, "step": 14 }, { "epoch": 0.024, "grad_norm": 5.662702560424805, "learning_rate": 0.00014000000000000001, "loss": 6.111515998840332, "step": 15 }, { "epoch": 0.0256, "grad_norm": 7.999163627624512, "learning_rate": 0.00015, "loss": 6.4416985511779785, "step": 16 }, { "epoch": 0.0272, "grad_norm": 4.368480205535889, "learning_rate": 0.00016, "loss": 6.070431709289551, "step": 17 }, { "epoch": 0.0288, "grad_norm": 6.195078372955322, "learning_rate": 0.00017, "loss": 6.400940895080566, "step": 18 }, { "epoch": 0.0304, "grad_norm": 4.218802452087402, "learning_rate": 0.00017999999999999998, "loss": 5.938872337341309, "step": 19 }, { "epoch": 0.032, "grad_norm": 8.09906005859375, "learning_rate": 0.00019, "loss": 6.1384053230285645, "step": 20 }, { "epoch": 0.0336, "grad_norm": 5.7899065017700195, "learning_rate": 0.0002, "loss": 6.211584568023682, "step": 21 }, { "epoch": 0.0352, "grad_norm": 3.5106873512268066, "learning_rate": 0.00021, "loss": 6.081808567047119, "step": 22 }, { "epoch": 0.0368, "grad_norm": 5.990793228149414, "learning_rate": 0.00022, "loss": 6.311020374298096, "step": 23 }, { "epoch": 0.0384, "grad_norm": 4.144802570343018, "learning_rate": 0.00023, "loss": 6.124863147735596, "step": 24 }, { "epoch": 0.04, "grad_norm": 4.716309547424316, "learning_rate": 0.00024, "loss": 6.189701557159424, "step": 25 }, { "epoch": 0.0416, "grad_norm": 3.5594468116760254, "learning_rate": 0.00025, "loss": 5.711904048919678, "step": 26 }, { "epoch": 0.0432, "grad_norm": 7.885351657867432, "learning_rate": 0.00026000000000000003, "loss": 6.188915729522705, "step": 27 }, { "epoch": 0.0448, "grad_norm": 4.330770492553711, "learning_rate": 0.00027, "loss": 6.156501293182373, "step": 28 }, { "epoch": 0.0464, "grad_norm": 6.669336318969727, "learning_rate": 0.00028000000000000003, "loss": 6.337223052978516, "step": 29 }, { "epoch": 0.048, "grad_norm": 4.431726932525635, "learning_rate": 0.00029, "loss": 5.854226112365723, "step": 30 }, { "epoch": 0.0496, "grad_norm": 5.652588367462158, "learning_rate": 0.0003, "loss": 6.1465911865234375, "step": 31 }, { "epoch": 0.0512, "grad_norm": 4.4275360107421875, "learning_rate": 0.00031, "loss": 6.568665504455566, "step": 32 }, { "epoch": 0.0528, "grad_norm": 5.78800106048584, "learning_rate": 0.00032, "loss": 5.84707498550415, "step": 33 }, { "epoch": 0.0544, "grad_norm": 5.778809547424316, "learning_rate": 0.00033, "loss": 6.26806640625, "step": 34 }, { "epoch": 0.056, "grad_norm": 3.150599718093872, "learning_rate": 0.00034, "loss": 5.942642688751221, "step": 35 }, { "epoch": 0.0576, "grad_norm": 5.755363464355469, "learning_rate": 0.00035, "loss": 6.048552989959717, "step": 36 }, { "epoch": 0.0592, "grad_norm": 4.171663284301758, "learning_rate": 0.00035999999999999997, "loss": 6.371613025665283, "step": 37 }, { "epoch": 0.0608, "grad_norm": 4.288946628570557, "learning_rate": 0.00037, "loss": 6.1250200271606445, "step": 38 }, { "epoch": 0.0624, "grad_norm": 4.6287713050842285, "learning_rate": 0.00038, "loss": 6.222686767578125, "step": 39 }, { "epoch": 0.064, "grad_norm": 5.058150291442871, "learning_rate": 0.00039000000000000005, "loss": 6.543748378753662, "step": 40 }, { "epoch": 0.0656, "grad_norm": 4.104369640350342, "learning_rate": 0.0004, "loss": 6.065921783447266, "step": 41 }, { "epoch": 0.0672, "grad_norm": 6.011862754821777, "learning_rate": 0.00041, "loss": 5.975309371948242, "step": 42 }, { "epoch": 0.0688, "grad_norm": 3.899702787399292, "learning_rate": 0.00042, "loss": 6.357814788818359, "step": 43 }, { "epoch": 0.0704, "grad_norm": 4.300708770751953, "learning_rate": 0.00043, "loss": 5.761978626251221, "step": 44 }, { "epoch": 0.072, "grad_norm": 5.165529727935791, "learning_rate": 0.00044, "loss": 6.23648738861084, "step": 45 }, { "epoch": 0.0736, "grad_norm": 3.270381212234497, "learning_rate": 0.00045000000000000004, "loss": 6.216146469116211, "step": 46 }, { "epoch": 0.0752, "grad_norm": 3.381625175476074, "learning_rate": 0.00046, "loss": 5.920130729675293, "step": 47 }, { "epoch": 0.0768, "grad_norm": 2.7397425174713135, "learning_rate": 0.00047, "loss": 5.948547840118408, "step": 48 }, { "epoch": 0.0784, "grad_norm": 4.689820289611816, "learning_rate": 0.00048, "loss": 6.4204936027526855, "step": 49 }, { "epoch": 0.08, "grad_norm": 4.353704929351807, "learning_rate": 0.00049, "loss": 5.919530391693115, "step": 50 }, { "epoch": 0.0816, "grad_norm": 3.5159783363342285, "learning_rate": 0.0005, "loss": 6.303610324859619, "step": 51 }, { "epoch": 0.0832, "grad_norm": 3.121208429336548, "learning_rate": 0.000499996268589849, "loss": 5.74945068359375, "step": 52 }, { "epoch": 0.0848, "grad_norm": 3.5212831497192383, "learning_rate": 0.0004999850744707835, "loss": 6.07124662399292, "step": 53 }, { "epoch": 0.0864, "grad_norm": 2.848412275314331, "learning_rate": 0.0004999664179769621, "loss": 6.209238052368164, "step": 54 }, { "epoch": 0.088, "grad_norm": 2.6709463596343994, "learning_rate": 0.0004999402996653051, "loss": 5.881043910980225, "step": 55 }, { "epoch": 0.0896, "grad_norm": 2.7929718494415283, "learning_rate": 0.0004999067203154777, "loss": 6.170549392700195, "step": 56 }, { "epoch": 0.0912, "grad_norm": 2.7909319400787354, "learning_rate": 0.0004998656809298664, "loss": 5.91437292098999, "step": 57 }, { "epoch": 0.0928, "grad_norm": 3.028071880340576, "learning_rate": 0.0004998171827335494, "loss": 5.768723964691162, "step": 58 }, { "epoch": 0.0944, "grad_norm": 3.5717194080352783, "learning_rate": 0.0004997612271742601, "loss": 6.126382827758789, "step": 59 }, { "epoch": 0.096, "grad_norm": 2.5707123279571533, "learning_rate": 0.0004996978159223436, "loss": 6.031285285949707, "step": 60 }, { "epoch": 0.0976, "grad_norm": 2.886106252670288, "learning_rate": 0.000499626950870707, "loss": 5.81216287612915, "step": 61 }, { "epoch": 0.0992, "grad_norm": 3.2320756912231445, "learning_rate": 0.000499548634134763, "loss": 6.256302833557129, "step": 62 }, { "epoch": 0.1008, "grad_norm": 2.3101658821105957, "learning_rate": 0.0004994628680523662, "loss": 6.089540481567383, "step": 63 }, { "epoch": 0.1024, "grad_norm": 2.2067813873291016, "learning_rate": 0.0004993696551837443, "loss": 6.167810440063477, "step": 64 }, { "epoch": 0.104, "grad_norm": 2.936598300933838, "learning_rate": 0.0004992689983114208, "loss": 6.019635200500488, "step": 65 }, { "epoch": 0.1056, "grad_norm": 3.3017938137054443, "learning_rate": 0.0004991609004401324, "loss": 5.883628845214844, "step": 66 }, { "epoch": 0.1072, "grad_norm": 3.359445333480835, "learning_rate": 0.0004990453647967389, "loss": 5.827721118927002, "step": 67 }, { "epoch": 0.1088, "grad_norm": 3.057800769805908, "learning_rate": 0.0004989223948301272, "loss": 5.853091239929199, "step": 68 }, { "epoch": 0.1104, "grad_norm": 3.4538474082946777, "learning_rate": 0.0004987919942111087, "loss": 6.159923553466797, "step": 69 }, { "epoch": 0.112, "grad_norm": 2.778003692626953, "learning_rate": 0.0004986541668323086, "loss": 5.855865478515625, "step": 70 }, { "epoch": 0.1136, "grad_norm": 2.497781753540039, "learning_rate": 0.0004985089168080509, "loss": 6.018093109130859, "step": 71 }, { "epoch": 0.1152, "grad_norm": 2.0816121101379395, "learning_rate": 0.0004983562484742349, "loss": 6.006240367889404, "step": 72 }, { "epoch": 0.1168, "grad_norm": 2.8136582374572754, "learning_rate": 0.000498196166388206, "loss": 5.550631999969482, "step": 73 }, { "epoch": 0.1184, "grad_norm": 2.223203420639038, "learning_rate": 0.0004980286753286195, "loss": 5.823319911956787, "step": 74 }, { "epoch": 0.12, "grad_norm": 2.3398818969726562, "learning_rate": 0.0004978537802952981, "loss": 5.757394790649414, "step": 75 }, { "epoch": 0.1216, "grad_norm": 3.7000091075897217, "learning_rate": 0.0004976714865090827, "loss": 6.139785289764404, "step": 76 }, { "epoch": 0.1232, "grad_norm": 2.992990255355835, "learning_rate": 0.0004974817994116764, "loss": 5.841603755950928, "step": 77 }, { "epoch": 0.1248, "grad_norm": 4.935225963592529, "learning_rate": 0.0004972847246654819, "loss": 5.688216209411621, "step": 78 }, { "epoch": 0.1264, "grad_norm": 2.531768798828125, "learning_rate": 0.0004970802681534331, "loss": 6.026415824890137, "step": 79 }, { "epoch": 0.128, "grad_norm": 3.366121530532837, "learning_rate": 0.0004968684359788187, "loss": 6.1217217445373535, "step": 80 }, { "epoch": 0.1296, "grad_norm": 2.439563035964966, "learning_rate": 0.0004966492344651005, "loss": 5.786462783813477, "step": 81 }, { "epoch": 0.1312, "grad_norm": 2.759390115737915, "learning_rate": 0.0004964226701557246, "loss": 6.397160053253174, "step": 82 }, { "epoch": 0.1328, "grad_norm": 2.6187775135040283, "learning_rate": 0.000496188749813926, "loss": 5.781584739685059, "step": 83 }, { "epoch": 0.1344, "grad_norm": 2.3311808109283447, "learning_rate": 0.0004959474804225263, "loss": 5.623251914978027, "step": 84 }, { "epoch": 0.136, "grad_norm": 1.8278515338897705, "learning_rate": 0.0004956988691837262, "loss": 5.646507263183594, "step": 85 }, { "epoch": 0.1376, "grad_norm": 1.940083622932434, "learning_rate": 0.0004954429235188896, "loss": 5.845520496368408, "step": 86 }, { "epoch": 0.1392, "grad_norm": 1.715268611907959, "learning_rate": 0.0004951796510683226, "loss": 5.86661434173584, "step": 87 }, { "epoch": 0.1408, "grad_norm": 2.3065476417541504, "learning_rate": 0.0004949090596910452, "loss": 6.391292572021484, "step": 88 }, { "epoch": 0.1424, "grad_norm": 2.54691481590271, "learning_rate": 0.0004946311574645565, "loss": 5.941152572631836, "step": 89 }, { "epoch": 0.144, "grad_norm": 2.3436925411224365, "learning_rate": 0.0004943459526845942, "loss": 5.867047309875488, "step": 90 }, { "epoch": 0.1456, "grad_norm": 2.8488574028015137, "learning_rate": 0.0004940534538648862, "loss": 6.295483112335205, "step": 91 }, { "epoch": 0.1472, "grad_norm": 2.0991811752319336, "learning_rate": 0.0004937536697368971, "loss": 6.155615329742432, "step": 92 }, { "epoch": 0.1488, "grad_norm": 2.874187707901001, "learning_rate": 0.0004934466092495673, "loss": 6.002193450927734, "step": 93 }, { "epoch": 0.1504, "grad_norm": 2.6309406757354736, "learning_rate": 0.0004931322815690456, "loss": 6.190125942230225, "step": 94 }, { "epoch": 0.152, "grad_norm": 2.5140063762664795, "learning_rate": 0.0004928106960784163, "loss": 5.832353591918945, "step": 95 }, { "epoch": 0.1536, "grad_norm": 2.2540531158447266, "learning_rate": 0.0004924818623774179, "loss": 5.870430946350098, "step": 96 }, { "epoch": 0.1552, "grad_norm": 2.5736892223358154, "learning_rate": 0.0004921457902821578, "loss": 5.9354658126831055, "step": 97 }, { "epoch": 0.1568, "grad_norm": 2.8597569465637207, "learning_rate": 0.0004918024898248188, "loss": 5.980432987213135, "step": 98 }, { "epoch": 0.1584, "grad_norm": 2.679422616958618, "learning_rate": 0.0004914519712533592, "loss": 5.808017253875732, "step": 99 }, { "epoch": 0.16, "grad_norm": 2.6200029850006104, "learning_rate": 0.0004910942450312075, "loss": 6.042236804962158, "step": 100 }, { "epoch": 0.1616, "grad_norm": 2.3748672008514404, "learning_rate": 0.0004907293218369499, "loss": 5.913302421569824, "step": 101 }, { "epoch": 0.1632, "grad_norm": 2.0950937271118164, "learning_rate": 0.000490357212564011, "loss": 5.478336334228516, "step": 102 }, { "epoch": 0.1648, "grad_norm": 2.222339391708374, "learning_rate": 0.0004899779283203296, "loss": 5.753122329711914, "step": 103 }, { "epoch": 0.1664, "grad_norm": 1.8135013580322266, "learning_rate": 0.0004895914804280262, "loss": 5.8378705978393555, "step": 104 }, { "epoch": 0.168, "grad_norm": 1.834136962890625, "learning_rate": 0.0004891978804230655, "loss": 5.386728286743164, "step": 105 }, { "epoch": 0.1696, "grad_norm": 2.7069461345672607, "learning_rate": 0.000488797140054912, "loss": 5.91385555267334, "step": 106 }, { "epoch": 0.1712, "grad_norm": 2.961819648742676, "learning_rate": 0.0004883892712861791, "loss": 5.622028350830078, "step": 107 }, { "epoch": 0.1728, "grad_norm": 2.6172969341278076, "learning_rate": 0.0004879742862922721, "loss": 5.701954364776611, "step": 108 }, { "epoch": 0.1744, "grad_norm": 2.4764273166656494, "learning_rate": 0.0004875521974610247, "loss": 5.922611236572266, "step": 109 }, { "epoch": 0.176, "grad_norm": 2.321749448776245, "learning_rate": 0.00048712301739232933, "loss": 5.958606719970703, "step": 110 }, { "epoch": 0.1776, "grad_norm": 2.569371461868286, "learning_rate": 0.00048668675889776094, "loss": 5.966418266296387, "step": 111 }, { "epoch": 0.1792, "grad_norm": 2.0367257595062256, "learning_rate": 0.00048624343500019453, "loss": 5.828032970428467, "step": 112 }, { "epoch": 0.1808, "grad_norm": 2.0033013820648193, "learning_rate": 0.0004857930589334164, "loss": 5.9207658767700195, "step": 113 }, { "epoch": 0.1824, "grad_norm": 2.4433813095092773, "learning_rate": 0.00048533564414172915, "loss": 5.987303256988525, "step": 114 }, { "epoch": 0.184, "grad_norm": 1.6759791374206543, "learning_rate": 0.00048487120427955047, "loss": 5.758200168609619, "step": 115 }, { "epoch": 0.1856, "grad_norm": 1.9562362432479858, "learning_rate": 0.0004843997532110051, "loss": 6.076003074645996, "step": 116 }, { "epoch": 0.1872, "grad_norm": 1.9110207557678223, "learning_rate": 0.0004839213050095116, "loss": 5.927783966064453, "step": 117 }, { "epoch": 0.1888, "grad_norm": 1.9068591594696045, "learning_rate": 0.00048343587395736177, "loss": 5.609103202819824, "step": 118 }, { "epoch": 0.1904, "grad_norm": 1.801079273223877, "learning_rate": 0.0004829434745452944, "loss": 6.146678924560547, "step": 119 }, { "epoch": 0.192, "grad_norm": 2.160980224609375, "learning_rate": 0.00048244412147206283, "loss": 5.927748203277588, "step": 120 }, { "epoch": 0.1936, "grad_norm": 2.2416179180145264, "learning_rate": 0.0004819378296439961, "loss": 5.8890509605407715, "step": 121 }, { "epoch": 0.1952, "grad_norm": 2.4966790676116943, "learning_rate": 0.000481424614174554, "loss": 5.660029411315918, "step": 122 }, { "epoch": 0.1968, "grad_norm": 2.2897145748138428, "learning_rate": 0.00048090449038387564, "loss": 5.889649391174316, "step": 123 }, { "epoch": 0.1984, "grad_norm": 2.1526010036468506, "learning_rate": 0.00048037747379832266, "loss": 5.937025547027588, "step": 124 }, { "epoch": 0.2, "grad_norm": 2.3477089405059814, "learning_rate": 0.0004798435801500154, "loss": 5.83440637588501, "step": 125 }, { "epoch": 0.2016, "grad_norm": 1.9011043310165405, "learning_rate": 0.00047930282537636326, "loss": 6.049851417541504, "step": 126 }, { "epoch": 0.2032, "grad_norm": 2.7886276245117188, "learning_rate": 0.00047875522561958907, "loss": 6.053065299987793, "step": 127 }, { "epoch": 0.2048, "grad_norm": 1.8351131677627563, "learning_rate": 0.0004782007972262471, "loss": 5.606479644775391, "step": 128 }, { "epoch": 0.2064, "grad_norm": 1.759033441543579, "learning_rate": 0.0004776395567467353, "loss": 5.892756462097168, "step": 129 }, { "epoch": 0.208, "grad_norm": 1.9948967695236206, "learning_rate": 0.00047707152093480097, "loss": 5.802677631378174, "step": 130 }, { "epoch": 0.2096, "grad_norm": 1.7873433828353882, "learning_rate": 0.0004764967067470409, "loss": 5.694087505340576, "step": 131 }, { "epoch": 0.2112, "grad_norm": 2.129274606704712, "learning_rate": 0.00047591513134239506, "loss": 6.053646087646484, "step": 132 }, { "epoch": 0.2128, "grad_norm": 1.815743327140808, "learning_rate": 0.0004753268120816344, "loss": 5.840423107147217, "step": 133 }, { "epoch": 0.2144, "grad_norm": 1.6211766004562378, "learning_rate": 0.0004747317665268427, "loss": 5.866158962249756, "step": 134 }, { "epoch": 0.216, "grad_norm": 1.5764577388763428, "learning_rate": 0.000474130012440892, "loss": 5.642172813415527, "step": 135 }, { "epoch": 0.2176, "grad_norm": 1.6282553672790527, "learning_rate": 0.0004735215677869128, "loss": 5.813696384429932, "step": 136 }, { "epoch": 0.2192, "grad_norm": 1.587697148323059, "learning_rate": 0.0004729064507277576, "loss": 5.456190586090088, "step": 137 }, { "epoch": 0.2208, "grad_norm": 2.2339489459991455, "learning_rate": 0.0004722846796254586, "loss": 5.826436996459961, "step": 138 }, { "epoch": 0.2224, "grad_norm": 1.6775805950164795, "learning_rate": 0.00047165627304068, "loss": 5.307504653930664, "step": 139 }, { "epoch": 0.224, "grad_norm": 1.7358742952346802, "learning_rate": 0.0004710212497321633, "loss": 5.858373641967773, "step": 140 }, { "epoch": 0.2256, "grad_norm": 1.7377792596817017, "learning_rate": 0.0004703796286561679, "loss": 5.746421813964844, "step": 141 }, { "epoch": 0.2272, "grad_norm": 1.7279226779937744, "learning_rate": 0.00046973142896590504, "loss": 5.818030834197998, "step": 142 }, { "epoch": 0.2288, "grad_norm": 1.896462321281433, "learning_rate": 0.0004690766700109659, "loss": 5.706021308898926, "step": 143 }, { "epoch": 0.2304, "grad_norm": 1.599483609199524, "learning_rate": 0.00046841537133674414, "loss": 5.414737701416016, "step": 144 }, { "epoch": 0.232, "grad_norm": 2.0782713890075684, "learning_rate": 0.00046774755268385253, "loss": 6.040131092071533, "step": 145 }, { "epoch": 0.2336, "grad_norm": 1.5299904346466064, "learning_rate": 0.00046707323398753343, "loss": 5.940986633300781, "step": 146 }, { "epoch": 0.2352, "grad_norm": 1.7263022661209106, "learning_rate": 0.00046639243537706387, "loss": 5.658965587615967, "step": 147 }, { "epoch": 0.2368, "grad_norm": 1.9568145275115967, "learning_rate": 0.0004657051771751546, "loss": 5.630545139312744, "step": 148 }, { "epoch": 0.2384, "grad_norm": 1.7731075286865234, "learning_rate": 0.0004650114798973434, "loss": 5.288701057434082, "step": 149 }, { "epoch": 0.24, "grad_norm": 1.5925266742706299, "learning_rate": 0.000464311364251383, "loss": 5.936962127685547, "step": 150 }, { "epoch": 0.2416, "grad_norm": 1.6020593643188477, "learning_rate": 0.0004636048511366222, "loss": 5.519335746765137, "step": 151 }, { "epoch": 0.2432, "grad_norm": 1.5809364318847656, "learning_rate": 0.0004628919616433827, "loss": 5.557144641876221, "step": 152 }, { "epoch": 0.2448, "grad_norm": 1.8422110080718994, "learning_rate": 0.0004621727170523293, "loss": 5.852574348449707, "step": 153 }, { "epoch": 0.2464, "grad_norm": 1.6175079345703125, "learning_rate": 0.0004614471388338346, "loss": 5.70945405960083, "step": 154 }, { "epoch": 0.248, "grad_norm": 1.7624582052230835, "learning_rate": 0.00046071524864733796, "loss": 5.58186149597168, "step": 155 }, { "epoch": 0.2496, "grad_norm": 1.5593520402908325, "learning_rate": 0.0004599770683406991, "loss": 5.716488361358643, "step": 156 }, { "epoch": 0.2512, "grad_norm": 1.9119805097579956, "learning_rate": 0.0004592326199495461, "loss": 5.6072845458984375, "step": 157 }, { "epoch": 0.2528, "grad_norm": 1.7177708148956299, "learning_rate": 0.0004584819256966171, "loss": 5.845829010009766, "step": 158 }, { "epoch": 0.2544, "grad_norm": 2.197434663772583, "learning_rate": 0.0004577250079910973, "loss": 5.7057013511657715, "step": 159 }, { "epoch": 0.256, "grad_norm": 2.089193344116211, "learning_rate": 0.00045696188942795005, "loss": 5.745038986206055, "step": 160 }, { "epoch": 0.2576, "grad_norm": 2.2623579502105713, "learning_rate": 0.0004561925927872421, "loss": 5.437371253967285, "step": 161 }, { "epoch": 0.2592, "grad_norm": 1.5014855861663818, "learning_rate": 0.000455417141033464, "loss": 5.617335796356201, "step": 162 }, { "epoch": 0.2608, "grad_norm": 1.6091152429580688, "learning_rate": 0.00045463555731484396, "loss": 5.750364303588867, "step": 163 }, { "epoch": 0.2624, "grad_norm": 1.7927204370498657, "learning_rate": 0.0004538478649626574, "loss": 6.134846210479736, "step": 164 }, { "epoch": 0.264, "grad_norm": 1.5488578081130981, "learning_rate": 0.00045305408749053016, "loss": 5.881228923797607, "step": 165 }, { "epoch": 0.2656, "grad_norm": 1.6964894533157349, "learning_rate": 0.0004522542485937369, "loss": 5.726894855499268, "step": 166 }, { "epoch": 0.2672, "grad_norm": 1.640055775642395, "learning_rate": 0.0004514483721484933, "loss": 5.594513893127441, "step": 167 }, { "epoch": 0.2688, "grad_norm": 1.622751235961914, "learning_rate": 0.0004506364822112439, "loss": 5.518566131591797, "step": 168 }, { "epoch": 0.2704, "grad_norm": 1.5396101474761963, "learning_rate": 0.00044981860301794335, "loss": 5.589843273162842, "step": 169 }, { "epoch": 0.272, "grad_norm": 1.4792349338531494, "learning_rate": 0.0004489947589833336, "loss": 5.4407501220703125, "step": 170 }, { "epoch": 0.2736, "grad_norm": 1.678307056427002, "learning_rate": 0.00044816497470021456, "loss": 5.557910919189453, "step": 171 }, { "epoch": 0.2752, "grad_norm": 1.7133512496948242, "learning_rate": 0.0004473292749387102, "loss": 5.618350982666016, "step": 172 }, { "epoch": 0.2768, "grad_norm": 1.4833654165267944, "learning_rate": 0.00044648768464552904, "loss": 5.650544166564941, "step": 173 }, { "epoch": 0.2784, "grad_norm": 1.787833571434021, "learning_rate": 0.00044564022894321966, "loss": 5.516573429107666, "step": 174 }, { "epoch": 0.28, "grad_norm": 2.016937255859375, "learning_rate": 0.00044478693312942054, "loss": 5.867213249206543, "step": 175 }, { "epoch": 0.2816, "grad_norm": 1.6533347368240356, "learning_rate": 0.00044392782267610497, "loss": 5.728193283081055, "step": 176 }, { "epoch": 0.2832, "grad_norm": 1.545316457748413, "learning_rate": 0.00044306292322882063, "loss": 5.591842174530029, "step": 177 }, { "epoch": 0.2848, "grad_norm": 1.8199504613876343, "learning_rate": 0.00044219226060592415, "loss": 5.673701763153076, "step": 178 }, { "epoch": 0.2864, "grad_norm": 1.597760558128357, "learning_rate": 0.0004413158607978104, "loss": 5.541760444641113, "step": 179 }, { "epoch": 0.288, "grad_norm": 1.8495144844055176, "learning_rate": 0.0004404337499661364, "loss": 5.602829456329346, "step": 180 }, { "epoch": 0.2896, "grad_norm": 1.8567280769348145, "learning_rate": 0.00043954595444304067, "loss": 5.71918249130249, "step": 181 }, { "epoch": 0.2912, "grad_norm": 1.8808255195617676, "learning_rate": 0.0004386525007303571, "loss": 5.545975208282471, "step": 182 }, { "epoch": 0.2928, "grad_norm": 1.7914137840270996, "learning_rate": 0.00043775341549882364, "loss": 5.760030269622803, "step": 183 }, { "epoch": 0.2944, "grad_norm": 1.5386247634887695, "learning_rate": 0.00043684872558728637, "loss": 5.41167688369751, "step": 184 }, { "epoch": 0.296, "grad_norm": 1.7406638860702515, "learning_rate": 0.00043593845800189826, "loss": 5.6405463218688965, "step": 185 }, { "epoch": 0.2976, "grad_norm": 1.7136033773422241, "learning_rate": 0.000435022639915313, "loss": 5.921665191650391, "step": 186 }, { "epoch": 0.2992, "grad_norm": 1.6137181520462036, "learning_rate": 0.00043410129866587377, "loss": 5.523682117462158, "step": 187 }, { "epoch": 0.3008, "grad_norm": 1.4593943357467651, "learning_rate": 0.00043317446175679733, "loss": 5.579282283782959, "step": 188 }, { "epoch": 0.3024, "grad_norm": 1.498769760131836, "learning_rate": 0.00043224215685535287, "loss": 5.65568733215332, "step": 189 }, { "epoch": 0.304, "grad_norm": 1.4099656343460083, "learning_rate": 0.00043130441179203626, "loss": 5.450364589691162, "step": 190 }, { "epoch": 0.3056, "grad_norm": 1.762242317199707, "learning_rate": 0.00043036125455973894, "loss": 5.701364517211914, "step": 191 }, { "epoch": 0.3072, "grad_norm": 1.9644355773925781, "learning_rate": 0.00042941271331291275, "loss": 5.515183448791504, "step": 192 }, { "epoch": 0.3088, "grad_norm": 1.9126542806625366, "learning_rate": 0.0004284588163667292, "loss": 5.794773578643799, "step": 193 }, { "epoch": 0.3104, "grad_norm": 1.8638148307800293, "learning_rate": 0.0004274995921962343, "loss": 5.806097030639648, "step": 194 }, { "epoch": 0.312, "grad_norm": 1.701051115989685, "learning_rate": 0.00042653506943549844, "loss": 5.101565361022949, "step": 195 }, { "epoch": 0.3136, "grad_norm": 2.270686626434326, "learning_rate": 0.00042556527687676184, "loss": 5.6310319900512695, "step": 196 }, { "epoch": 0.3152, "grad_norm": 1.8609226942062378, "learning_rate": 0.00042459024346957477, "loss": 5.535915851593018, "step": 197 }, { "epoch": 0.3168, "grad_norm": 2.0503954887390137, "learning_rate": 0.0004236099983199338, "loss": 5.734372138977051, "step": 198 }, { "epoch": 0.3184, "grad_norm": 1.6068768501281738, "learning_rate": 0.00042262457068941247, "loss": 5.578657150268555, "step": 199 }, { "epoch": 0.32, "grad_norm": 1.602341651916504, "learning_rate": 0.000421633989994288, "loss": 5.451129913330078, "step": 200 }, { "epoch": 0.3216, "grad_norm": 1.4740185737609863, "learning_rate": 0.00042063828580466355, "loss": 5.597467422485352, "step": 201 }, { "epoch": 0.3232, "grad_norm": 1.6884571313858032, "learning_rate": 0.0004196374878435846, "loss": 5.773179054260254, "step": 202 }, { "epoch": 0.3248, "grad_norm": 3.2064454555511475, "learning_rate": 0.00041863162598615265, "loss": 5.903354167938232, "step": 203 }, { "epoch": 0.3264, "grad_norm": 2.3717195987701416, "learning_rate": 0.0004176207302586329, "loss": 5.43741512298584, "step": 204 }, { "epoch": 0.328, "grad_norm": 1.7029227018356323, "learning_rate": 0.0004166048308375578, "loss": 5.542079925537109, "step": 205 }, { "epoch": 0.3296, "grad_norm": 1.4132956266403198, "learning_rate": 0.0004155839580488269, "loss": 5.548293590545654, "step": 206 }, { "epoch": 0.3312, "grad_norm": 1.7507219314575195, "learning_rate": 0.0004145581423668008, "loss": 5.625497817993164, "step": 207 }, { "epoch": 0.3328, "grad_norm": 1.7790549993515015, "learning_rate": 0.00041352741441339175, "loss": 5.523196220397949, "step": 208 }, { "epoch": 0.3344, "grad_norm": 1.6135910749435425, "learning_rate": 0.0004124918049571499, "loss": 5.497952461242676, "step": 209 }, { "epoch": 0.336, "grad_norm": 1.700406789779663, "learning_rate": 0.00041145134491234425, "loss": 5.513679027557373, "step": 210 }, { "epoch": 0.3376, "grad_norm": 1.5768215656280518, "learning_rate": 0.00041040606533804025, "loss": 5.65580940246582, "step": 211 }, { "epoch": 0.3392, "grad_norm": 1.5992205142974854, "learning_rate": 0.00040935599743717243, "loss": 5.415986061096191, "step": 212 }, { "epoch": 0.3408, "grad_norm": 2.1629347801208496, "learning_rate": 0.00040830117255561294, "loss": 5.394900321960449, "step": 213 }, { "epoch": 0.3424, "grad_norm": 1.5803372859954834, "learning_rate": 0.000407241622181236, "loss": 5.085600852966309, "step": 214 }, { "epoch": 0.344, "grad_norm": 1.4815354347229004, "learning_rate": 0.0004061773779429776, "loss": 5.647576332092285, "step": 215 }, { "epoch": 0.3456, "grad_norm": 1.5663725137710571, "learning_rate": 0.00040510847160989203, "loss": 5.418036460876465, "step": 216 }, { "epoch": 0.3472, "grad_norm": 1.7371917963027954, "learning_rate": 0.00040403493509020275, "loss": 5.280213356018066, "step": 217 }, { "epoch": 0.3488, "grad_norm": 1.4984663724899292, "learning_rate": 0.0004029568004303501, "loss": 5.509110927581787, "step": 218 }, { "epoch": 0.3504, "grad_norm": 1.5602787733078003, "learning_rate": 0.0004018740998140352, "loss": 5.608109951019287, "step": 219 }, { "epoch": 0.352, "grad_norm": 1.6253869533538818, "learning_rate": 0.0004007868655612586, "loss": 5.198980331420898, "step": 220 }, { "epoch": 0.3536, "grad_norm": 2.013225555419922, "learning_rate": 0.00039969513012735566, "loss": 5.129229545593262, "step": 221 }, { "epoch": 0.3552, "grad_norm": 1.4294469356536865, "learning_rate": 0.00039859892610202786, "loss": 5.616961479187012, "step": 222 }, { "epoch": 0.3568, "grad_norm": 1.7147184610366821, "learning_rate": 0.0003974982862083697, "loss": 5.369600772857666, "step": 223 }, { "epoch": 0.3584, "grad_norm": 1.6554255485534668, "learning_rate": 0.00039639324330189234, "loss": 5.445437431335449, "step": 224 }, { "epoch": 0.36, "grad_norm": 2.799031972885132, "learning_rate": 0.00039528383036954224, "loss": 5.5256500244140625, "step": 225 }, { "epoch": 0.3616, "grad_norm": 1.364023208618164, "learning_rate": 0.00039417008052871684, "loss": 5.256645202636719, "step": 226 }, { "epoch": 0.3632, "grad_norm": 1.6340276002883911, "learning_rate": 0.0003930520270262757, "loss": 5.542902946472168, "step": 227 }, { "epoch": 0.3648, "grad_norm": 1.289225459098816, "learning_rate": 0.0003919297032375485, "loss": 5.363834381103516, "step": 228 }, { "epoch": 0.3664, "grad_norm": 1.7022228240966797, "learning_rate": 0.00039080314266533826, "loss": 5.533950328826904, "step": 229 }, { "epoch": 0.368, "grad_norm": 1.5650995969772339, "learning_rate": 0.00038967237893892134, "loss": 5.173304557800293, "step": 230 }, { "epoch": 0.3696, "grad_norm": 1.7082035541534424, "learning_rate": 0.00038853744581304376, "loss": 5.347742080688477, "step": 231 }, { "epoch": 0.3712, "grad_norm": 1.5300484895706177, "learning_rate": 0.00038739837716691327, "loss": 5.307585716247559, "step": 232 }, { "epoch": 0.3728, "grad_norm": 1.4221162796020508, "learning_rate": 0.0003862552070031886, "loss": 5.390194892883301, "step": 233 }, { "epoch": 0.3744, "grad_norm": 1.5934863090515137, "learning_rate": 0.00038510796944696355, "loss": 5.698745250701904, "step": 234 }, { "epoch": 0.376, "grad_norm": 1.574376106262207, "learning_rate": 0.00038395669874474915, "loss": 5.695178508758545, "step": 235 }, { "epoch": 0.3776, "grad_norm": 1.4545917510986328, "learning_rate": 0.00038280142926345084, "loss": 5.21755313873291, "step": 236 }, { "epoch": 0.3792, "grad_norm": 1.6824661493301392, "learning_rate": 0.0003816421954893428, "loss": 5.816608428955078, "step": 237 }, { "epoch": 0.3808, "grad_norm": 1.943800449371338, "learning_rate": 0.0003804790320270384, "loss": 5.530592441558838, "step": 238 }, { "epoch": 0.3824, "grad_norm": 1.4291504621505737, "learning_rate": 0.00037931197359845713, "loss": 5.4604811668396, "step": 239 }, { "epoch": 0.384, "grad_norm": 1.450872778892517, "learning_rate": 0.00037814105504178853, "loss": 5.420169353485107, "step": 240 }, { "epoch": 0.3856, "grad_norm": 1.431982159614563, "learning_rate": 0.00037696631131045155, "loss": 5.437797546386719, "step": 241 }, { "epoch": 0.3872, "grad_norm": 1.5654010772705078, "learning_rate": 0.00037578777747205173, "loss": 5.542431354522705, "step": 242 }, { "epoch": 0.3888, "grad_norm": 1.4680758714675903, "learning_rate": 0.000374605488707334, "loss": 5.8609299659729, "step": 243 }, { "epoch": 0.3904, "grad_norm": 1.484171748161316, "learning_rate": 0.0003734194803091329, "loss": 5.2261762619018555, "step": 244 }, { "epoch": 0.392, "grad_norm": 1.378163456916809, "learning_rate": 0.00037222978768131857, "loss": 5.523834228515625, "step": 245 }, { "epoch": 0.3936, "grad_norm": 1.8471333980560303, "learning_rate": 0.00037103644633774014, "loss": 5.406384468078613, "step": 246 }, { "epoch": 0.3952, "grad_norm": 1.4139055013656616, "learning_rate": 0.00036983949190116575, "loss": 5.400781631469727, "step": 247 }, { "epoch": 0.3968, "grad_norm": 1.2311971187591553, "learning_rate": 0.0003686389601022188, "loss": 5.407512664794922, "step": 248 }, { "epoch": 0.3984, "grad_norm": 1.7283658981323242, "learning_rate": 0.0003674348867783115, "loss": 5.575046062469482, "step": 249 }, { "epoch": 0.4, "grad_norm": 1.3995170593261719, "learning_rate": 0.0003662273078725754, "loss": 5.523738384246826, "step": 250 }, { "epoch": 0.4016, "grad_norm": 1.3066350221633911, "learning_rate": 0.00036501625943278804, "loss": 5.64078426361084, "step": 251 }, { "epoch": 0.4032, "grad_norm": 1.3789863586425781, "learning_rate": 0.0003638017776102968, "loss": 5.428204536437988, "step": 252 }, { "epoch": 0.4048, "grad_norm": 1.721011757850647, "learning_rate": 0.00036258389865894027, "loss": 5.646852016448975, "step": 253 }, { "epoch": 0.4064, "grad_norm": 1.7198848724365234, "learning_rate": 0.0003613626589339652, "loss": 5.864961624145508, "step": 254 }, { "epoch": 0.408, "grad_norm": 1.8125197887420654, "learning_rate": 0.00036013809489094246, "loss": 5.502827167510986, "step": 255 }, { "epoch": 0.4096, "grad_norm": 1.5398613214492798, "learning_rate": 0.00035891024308467727, "loss": 5.422593116760254, "step": 256 }, { "epoch": 0.4112, "grad_norm": 1.2854444980621338, "learning_rate": 0.0003576791401681194, "loss": 5.769440650939941, "step": 257 }, { "epoch": 0.4128, "grad_norm": 1.302415370941162, "learning_rate": 0.0003564448228912682, "loss": 5.568209171295166, "step": 258 }, { "epoch": 0.4144, "grad_norm": 1.4718657732009888, "learning_rate": 0.00035520732810007566, "loss": 5.543675422668457, "step": 259 }, { "epoch": 0.416, "grad_norm": 1.6336448192596436, "learning_rate": 0.0003539666927353469, "loss": 5.599291801452637, "step": 260 }, { "epoch": 0.4176, "grad_norm": 1.7621365785598755, "learning_rate": 0.00035272295383163713, "loss": 5.4962263107299805, "step": 261 }, { "epoch": 0.4192, "grad_norm": 1.6452198028564453, "learning_rate": 0.00035147614851614587, "loss": 5.347473621368408, "step": 262 }, { "epoch": 0.4208, "grad_norm": 1.3223097324371338, "learning_rate": 0.00035022631400760944, "loss": 5.4395928382873535, "step": 263 }, { "epoch": 0.4224, "grad_norm": 1.178402304649353, "learning_rate": 0.0003489734876151891, "loss": 5.452559471130371, "step": 264 }, { "epoch": 0.424, "grad_norm": 1.493491530418396, "learning_rate": 0.0003477177067373579, "loss": 5.549748420715332, "step": 265 }, { "epoch": 0.4256, "grad_norm": 1.2983075380325317, "learning_rate": 0.0003464590088607839, "loss": 5.593997478485107, "step": 266 }, { "epoch": 0.4272, "grad_norm": 1.4325454235076904, "learning_rate": 0.00034519743155921127, "loss": 5.567399978637695, "step": 267 }, { "epoch": 0.4288, "grad_norm": 1.3392157554626465, "learning_rate": 0.00034393301249233897, "loss": 5.392118453979492, "step": 268 }, { "epoch": 0.4304, "grad_norm": 1.543241262435913, "learning_rate": 0.000342665789404696, "loss": 5.2302565574646, "step": 269 }, { "epoch": 0.432, "grad_norm": 1.5115416049957275, "learning_rate": 0.00034139580012451523, "loss": 5.704424858093262, "step": 270 }, { "epoch": 0.4336, "grad_norm": 1.3637906312942505, "learning_rate": 0.0003401230825626037, "loss": 5.522019863128662, "step": 271 }, { "epoch": 0.4352, "grad_norm": 1.5312447547912598, "learning_rate": 0.00033884767471121125, "loss": 5.600247859954834, "step": 272 }, { "epoch": 0.4368, "grad_norm": 1.467431664466858, "learning_rate": 0.00033756961464289633, "loss": 5.204289436340332, "step": 273 }, { "epoch": 0.4384, "grad_norm": 1.352095603942871, "learning_rate": 0.0003362889405093894, "loss": 5.327722549438477, "step": 274 }, { "epoch": 0.44, "grad_norm": 1.3652808666229248, "learning_rate": 0.0003350056905404543, "loss": 5.118766784667969, "step": 275 }, { "epoch": 0.4416, "grad_norm": 1.6171950101852417, "learning_rate": 0.00033371990304274655, "loss": 5.259974479675293, "step": 276 }, { "epoch": 0.4432, "grad_norm": 1.6351940631866455, "learning_rate": 0.0003324316163986704, "loss": 5.432730197906494, "step": 277 }, { "epoch": 0.4448, "grad_norm": 1.6966768503189087, "learning_rate": 0.00033114086906523265, "loss": 5.381967544555664, "step": 278 }, { "epoch": 0.4464, "grad_norm": 1.3781499862670898, "learning_rate": 0.00032984769957289503, "loss": 5.303073883056641, "step": 279 }, { "epoch": 0.448, "grad_norm": 1.5721884965896606, "learning_rate": 0.0003285521465244237, "loss": 5.291014671325684, "step": 280 }, { "epoch": 0.4496, "grad_norm": 1.1372907161712646, "learning_rate": 0.00032725424859373687, "loss": 5.211060523986816, "step": 281 }, { "epoch": 0.4512, "grad_norm": 1.293617844581604, "learning_rate": 0.00032595404452475085, "loss": 5.443847179412842, "step": 282 }, { "epoch": 0.4528, "grad_norm": 2.1258699893951416, "learning_rate": 0.0003246515731302228, "loss": 5.064897537231445, "step": 283 }, { "epoch": 0.4544, "grad_norm": 1.418958067893982, "learning_rate": 0.00032334687329059264, "loss": 5.420772552490234, "step": 284 }, { "epoch": 0.456, "grad_norm": 1.2100834846496582, "learning_rate": 0.0003220399839528222, "loss": 5.425792217254639, "step": 285 }, { "epoch": 0.4576, "grad_norm": 1.2931607961654663, "learning_rate": 0.0003207309441292325, "loss": 5.330716609954834, "step": 286 }, { "epoch": 0.4592, "grad_norm": 1.4552083015441895, "learning_rate": 0.0003194197928963396, "loss": 5.734864234924316, "step": 287 }, { "epoch": 0.4608, "grad_norm": 1.377821683883667, "learning_rate": 0.00031810656939368744, "loss": 5.4975361824035645, "step": 288 }, { "epoch": 0.4624, "grad_norm": 1.3547130823135376, "learning_rate": 0.0003167913128226803, "loss": 5.421193599700928, "step": 289 }, { "epoch": 0.464, "grad_norm": 1.4445191621780396, "learning_rate": 0.0003154740624454118, "loss": 5.138959884643555, "step": 290 }, { "epoch": 0.4656, "grad_norm": 1.3757892847061157, "learning_rate": 0.00031415485758349345, "loss": 5.1781840324401855, "step": 291 }, { "epoch": 0.4672, "grad_norm": 1.2458899021148682, "learning_rate": 0.0003128337376168805, "loss": 4.89755916595459, "step": 292 }, { "epoch": 0.4688, "grad_norm": 1.581918478012085, "learning_rate": 0.00031151074198269656, "loss": 5.327348709106445, "step": 293 }, { "epoch": 0.4704, "grad_norm": 1.5751845836639404, "learning_rate": 0.00031018591017405644, "loss": 5.386034965515137, "step": 294 }, { "epoch": 0.472, "grad_norm": 1.6921762228012085, "learning_rate": 0.0003088592817388869, "loss": 5.158099174499512, "step": 295 }, { "epoch": 0.4736, "grad_norm": 1.62604820728302, "learning_rate": 0.0003075308962787466, "loss": 5.450359344482422, "step": 296 }, { "epoch": 0.4752, "grad_norm": 1.2735328674316406, "learning_rate": 0.00030620079344764327, "loss": 5.264720439910889, "step": 297 }, { "epoch": 0.4768, "grad_norm": 1.6045722961425781, "learning_rate": 0.00030486901295085066, "loss": 5.421563625335693, "step": 298 }, { "epoch": 0.4784, "grad_norm": 1.4631224870681763, "learning_rate": 0.0003035355945437228, "loss": 5.549293041229248, "step": 299 }, { "epoch": 0.48, "grad_norm": 1.34758460521698, "learning_rate": 0.00030220057803050765, "loss": 5.213095664978027, "step": 300 }, { "epoch": 0.4816, "grad_norm": 1.659041404724121, "learning_rate": 0.0003008640032631585, "loss": 5.40679931640625, "step": 301 }, { "epoch": 0.4832, "grad_norm": 1.3234513998031616, "learning_rate": 0.00029952591014014454, "loss": 5.249087333679199, "step": 302 }, { "epoch": 0.4848, "grad_norm": 1.2783095836639404, "learning_rate": 0.0002981863386052599, "loss": 5.571717262268066, "step": 303 }, { "epoch": 0.4864, "grad_norm": 1.2698612213134766, "learning_rate": 0.0002968453286464312, "loss": 5.460443019866943, "step": 304 }, { "epoch": 0.488, "grad_norm": 1.411340594291687, "learning_rate": 0.00029550292029452375, "loss": 5.521218776702881, "step": 305 }, { "epoch": 0.4896, "grad_norm": 1.2482413053512573, "learning_rate": 0.0002941591536221469, "loss": 5.2962646484375, "step": 306 }, { "epoch": 0.4912, "grad_norm": 1.3746726512908936, "learning_rate": 0.0002928140687424573, "loss": 5.614439964294434, "step": 307 }, { "epoch": 0.4928, "grad_norm": 1.5684117078781128, "learning_rate": 0.00029146770580796205, "loss": 5.34489107131958, "step": 308 }, { "epoch": 0.4944, "grad_norm": 1.8253686428070068, "learning_rate": 0.00029012010500931965, "loss": 5.56744384765625, "step": 309 }, { "epoch": 0.496, "grad_norm": 1.4048644304275513, "learning_rate": 0.00028877130657414054, "loss": 5.361034393310547, "step": 310 }, { "epoch": 0.4976, "grad_norm": 1.3948677778244019, "learning_rate": 0.0002874213507657861, "loss": 5.47017240524292, "step": 311 }, { "epoch": 0.4992, "grad_norm": 1.4963343143463135, "learning_rate": 0.00028607027788216674, "loss": 5.397054672241211, "step": 312 }, { "epoch": 0.5008, "grad_norm": 1.3787459135055542, "learning_rate": 0.00028471812825453914, "loss": 5.223832607269287, "step": 313 }, { "epoch": 0.5024, "grad_norm": 1.6353243589401245, "learning_rate": 0.0002833649422463019, "loss": 5.2796525955200195, "step": 314 }, { "epoch": 0.504, "grad_norm": 1.3684626817703247, "learning_rate": 0.0002820107602517913, "loss": 5.421512126922607, "step": 315 }, { "epoch": 0.5056, "grad_norm": 1.2275117635726929, "learning_rate": 0.0002806556226950746, "loss": 5.282046318054199, "step": 316 }, { "epoch": 0.5072, "grad_norm": 1.5556248426437378, "learning_rate": 0.00027929957002874436, "loss": 5.28046178817749, "step": 317 }, { "epoch": 0.5088, "grad_norm": 1.5862129926681519, "learning_rate": 0.00027794264273270987, "loss": 5.368446350097656, "step": 318 }, { "epoch": 0.5104, "grad_norm": 1.4724379777908325, "learning_rate": 0.00027658488131298946, "loss": 5.535717010498047, "step": 319 }, { "epoch": 0.512, "grad_norm": 1.257763147354126, "learning_rate": 0.00027522632630050116, "loss": 5.145805835723877, "step": 320 }, { "epoch": 0.5136, "grad_norm": 1.2067614793777466, "learning_rate": 0.00027386701824985254, "loss": 5.230715274810791, "step": 321 }, { "epoch": 0.5152, "grad_norm": 1.3885655403137207, "learning_rate": 0.00027250699773813066, "loss": 5.397106170654297, "step": 322 }, { "epoch": 0.5168, "grad_norm": 1.3261369466781616, "learning_rate": 0.00027114630536369, "loss": 5.118717193603516, "step": 323 }, { "epoch": 0.5184, "grad_norm": 1.3677432537078857, "learning_rate": 0.0002697849817449415, "loss": 5.1717400550842285, "step": 324 }, { "epoch": 0.52, "grad_norm": 1.476125955581665, "learning_rate": 0.00026842306751913926, "loss": 5.247461318969727, "step": 325 }, { "epoch": 0.5216, "grad_norm": 1.4229127168655396, "learning_rate": 0.0002670606033411678, "loss": 5.157002925872803, "step": 326 }, { "epoch": 0.5232, "grad_norm": 1.4474886655807495, "learning_rate": 0.0002656976298823284, "loss": 5.441634178161621, "step": 327 }, { "epoch": 0.5248, "grad_norm": 1.4530051946640015, "learning_rate": 0.00026433418782912505, "loss": 5.526297569274902, "step": 328 }, { "epoch": 0.5264, "grad_norm": 1.1628731489181519, "learning_rate": 0.00026297031788205, "loss": 5.242552280426025, "step": 329 }, { "epoch": 0.528, "grad_norm": 1.245635747909546, "learning_rate": 0.00026160606075436844, "loss": 5.074901103973389, "step": 330 }, { "epoch": 0.5296, "grad_norm": 1.2995966672897339, "learning_rate": 0.0002602414571709036, "loss": 5.35468864440918, "step": 331 }, { "epoch": 0.5312, "grad_norm": 1.1434332132339478, "learning_rate": 0.00025887654786682076, "loss": 5.233968257904053, "step": 332 }, { "epoch": 0.5328, "grad_norm": 1.8108292818069458, "learning_rate": 0.0002575113735864114, "loss": 5.389377593994141, "step": 333 }, { "epoch": 0.5344, "grad_norm": 1.7074164152145386, "learning_rate": 0.0002561459750818769, "loss": 5.581827163696289, "step": 334 }, { "epoch": 0.536, "grad_norm": 1.5946106910705566, "learning_rate": 0.0002547803931121119, "loss": 5.279594898223877, "step": 335 }, { "epoch": 0.5376, "grad_norm": 1.6184440851211548, "learning_rate": 0.00025341466844148775, "loss": 5.198509693145752, "step": 336 }, { "epoch": 0.5392, "grad_norm": 1.2537761926651, "learning_rate": 0.0002520488418386358, "loss": 5.231502056121826, "step": 337 }, { "epoch": 0.5408, "grad_norm": 1.4233760833740234, "learning_rate": 0.00025068295407523, "loss": 5.152407646179199, "step": 338 }, { "epoch": 0.5424, "grad_norm": 1.2714813947677612, "learning_rate": 0.00024931704592477, "loss": 5.5605878829956055, "step": 339 }, { "epoch": 0.544, "grad_norm": 1.2656306028366089, "learning_rate": 0.0002479511581613642, "loss": 5.457594394683838, "step": 340 }, { "epoch": 0.5456, "grad_norm": 1.1355462074279785, "learning_rate": 0.00024658533155851227, "loss": 5.645468711853027, "step": 341 }, { "epoch": 0.5472, "grad_norm": 1.564833641052246, "learning_rate": 0.0002452196068878881, "loss": 5.560579299926758, "step": 342 }, { "epoch": 0.5488, "grad_norm": 1.2560124397277832, "learning_rate": 0.00024385402491812317, "loss": 5.102597236633301, "step": 343 }, { "epoch": 0.5504, "grad_norm": 1.47645103931427, "learning_rate": 0.00024248862641358866, "loss": 5.347832679748535, "step": 344 }, { "epoch": 0.552, "grad_norm": 1.634925127029419, "learning_rate": 0.00024112345213317933, "loss": 5.229283332824707, "step": 345 }, { "epoch": 0.5536, "grad_norm": 1.3205620050430298, "learning_rate": 0.00023975854282909641, "loss": 5.406874179840088, "step": 346 }, { "epoch": 0.5552, "grad_norm": 1.5099257230758667, "learning_rate": 0.00023839393924563162, "loss": 5.050958156585693, "step": 347 }, { "epoch": 0.5568, "grad_norm": 1.2842683792114258, "learning_rate": 0.0002370296821179501, "loss": 5.189534664154053, "step": 348 }, { "epoch": 0.5584, "grad_norm": 1.1710087060928345, "learning_rate": 0.00023566581217087493, "loss": 5.227584362030029, "step": 349 }, { "epoch": 0.56, "grad_norm": 1.3577237129211426, "learning_rate": 0.00023430237011767165, "loss": 5.079989433288574, "step": 350 }, { "epoch": 0.5616, "grad_norm": 1.2834707498550415, "learning_rate": 0.00023293939665883229, "loss": 5.309730052947998, "step": 351 }, { "epoch": 0.5632, "grad_norm": 1.4233572483062744, "learning_rate": 0.0002315769324808608, "loss": 5.27959680557251, "step": 352 }, { "epoch": 0.5648, "grad_norm": 1.7741755247116089, "learning_rate": 0.00023021501825505847, "loss": 5.245169162750244, "step": 353 }, { "epoch": 0.5664, "grad_norm": 1.743356466293335, "learning_rate": 0.00022885369463631, "loss": 5.384469985961914, "step": 354 }, { "epoch": 0.568, "grad_norm": 1.3255281448364258, "learning_rate": 0.00022749300226186948, "loss": 5.170154094696045, "step": 355 }, { "epoch": 0.5696, "grad_norm": 1.3135267496109009, "learning_rate": 0.0002261329817501475, "loss": 5.177214622497559, "step": 356 }, { "epoch": 0.5712, "grad_norm": 1.334771990776062, "learning_rate": 0.00022477367369949885, "loss": 5.129632472991943, "step": 357 }, { "epoch": 0.5728, "grad_norm": 1.5144758224487305, "learning_rate": 0.00022341511868701055, "loss": 4.999809741973877, "step": 358 }, { "epoch": 0.5744, "grad_norm": 1.2315837144851685, "learning_rate": 0.0002220573572672902, "loss": 5.348094940185547, "step": 359 }, { "epoch": 0.576, "grad_norm": 1.4335271120071411, "learning_rate": 0.00022070042997125567, "loss": 5.095552444458008, "step": 360 }, { "epoch": 0.5776, "grad_norm": 1.6949377059936523, "learning_rate": 0.00021934437730492543, "loss": 5.0214924812316895, "step": 361 }, { "epoch": 0.5792, "grad_norm": 1.3676100969314575, "learning_rate": 0.00021798923974820884, "loss": 5.584174633026123, "step": 362 }, { "epoch": 0.5808, "grad_norm": 1.3218090534210205, "learning_rate": 0.0002166350577536981, "loss": 5.239519119262695, "step": 363 }, { "epoch": 0.5824, "grad_norm": 1.3069649934768677, "learning_rate": 0.00021528187174546092, "loss": 5.372768402099609, "step": 364 }, { "epoch": 0.584, "grad_norm": 1.3426185846328735, "learning_rate": 0.00021392972211783332, "loss": 5.219846248626709, "step": 365 }, { "epoch": 0.5856, "grad_norm": 1.3411294221878052, "learning_rate": 0.00021257864923421402, "loss": 4.874852180480957, "step": 366 }, { "epoch": 0.5872, "grad_norm": 1.4060436487197876, "learning_rate": 0.00021122869342585948, "loss": 5.2531046867370605, "step": 367 }, { "epoch": 0.5888, "grad_norm": 1.192141056060791, "learning_rate": 0.00020987989499068042, "loss": 5.342706203460693, "step": 368 }, { "epoch": 0.5904, "grad_norm": 1.3001792430877686, "learning_rate": 0.00020853229419203807, "loss": 5.323460578918457, "step": 369 }, { "epoch": 0.592, "grad_norm": 1.4926820993423462, "learning_rate": 0.0002071859312575427, "loss": 5.296498775482178, "step": 370 }, { "epoch": 0.5936, "grad_norm": 1.434384822845459, "learning_rate": 0.00020584084637785316, "loss": 5.084543228149414, "step": 371 }, { "epoch": 0.5952, "grad_norm": 2.288747787475586, "learning_rate": 0.00020449707970547629, "loss": 5.0905585289001465, "step": 372 }, { "epoch": 0.5968, "grad_norm": 1.4251408576965332, "learning_rate": 0.0002031546713535688, "loss": 5.365981101989746, "step": 373 }, { "epoch": 0.5984, "grad_norm": 1.317584753036499, "learning_rate": 0.00020181366139474012, "loss": 5.608163356781006, "step": 374 }, { "epoch": 0.6, "grad_norm": 1.187654733657837, "learning_rate": 0.00020047408985985552, "loss": 4.876247406005859, "step": 375 }, { "epoch": 0.6016, "grad_norm": 2.2563083171844482, "learning_rate": 0.0001991359967368416, "loss": 5.187510013580322, "step": 376 }, { "epoch": 0.6032, "grad_norm": 1.282902479171753, "learning_rate": 0.00019779942196949238, "loss": 5.240813255310059, "step": 377 }, { "epoch": 0.6048, "grad_norm": 1.345765471458435, "learning_rate": 0.00019646440545627723, "loss": 5.2197957038879395, "step": 378 }, { "epoch": 0.6064, "grad_norm": 1.2917436361312866, "learning_rate": 0.0001951309870491494, "loss": 5.324549674987793, "step": 379 }, { "epoch": 0.608, "grad_norm": 1.2900662422180176, "learning_rate": 0.0001937992065523567, "loss": 5.294788360595703, "step": 380 }, { "epoch": 0.6096, "grad_norm": 1.5021880865097046, "learning_rate": 0.00019246910372125342, "loss": 5.409048080444336, "step": 381 }, { "epoch": 0.6112, "grad_norm": 1.6828486919403076, "learning_rate": 0.0001911407182611131, "loss": 5.392390251159668, "step": 382 }, { "epoch": 0.6128, "grad_norm": 1.5416451692581177, "learning_rate": 0.00018981408982594365, "loss": 5.151852130889893, "step": 383 }, { "epoch": 0.6144, "grad_norm": 1.2479101419448853, "learning_rate": 0.00018848925801730342, "loss": 5.193958759307861, "step": 384 }, { "epoch": 0.616, "grad_norm": 1.471063256263733, "learning_rate": 0.00018716626238311958, "loss": 4.912611961364746, "step": 385 }, { "epoch": 0.6176, "grad_norm": 1.285828948020935, "learning_rate": 0.00018584514241650667, "loss": 4.898399829864502, "step": 386 }, { "epoch": 0.6192, "grad_norm": 1.8733534812927246, "learning_rate": 0.0001845259375545882, "loss": 4.954188346862793, "step": 387 }, { "epoch": 0.6208, "grad_norm": 1.55088472366333, "learning_rate": 0.00018320868717731977, "loss": 5.39755916595459, "step": 388 }, { "epoch": 0.6224, "grad_norm": 1.876209020614624, "learning_rate": 0.00018189343060631257, "loss": 5.461378574371338, "step": 389 }, { "epoch": 0.624, "grad_norm": 1.192241907119751, "learning_rate": 0.0001805802071036605, "loss": 4.95612096786499, "step": 390 }, { "epoch": 0.6256, "grad_norm": 1.3008099794387817, "learning_rate": 0.00017926905587076748, "loss": 5.485091209411621, "step": 391 }, { "epoch": 0.6272, "grad_norm": 1.7544057369232178, "learning_rate": 0.00017796001604717787, "loss": 4.80226993560791, "step": 392 }, { "epoch": 0.6288, "grad_norm": 1.2537293434143066, "learning_rate": 0.00017665312670940743, "loss": 5.096302509307861, "step": 393 }, { "epoch": 0.6304, "grad_norm": 1.1589773893356323, "learning_rate": 0.0001753484268697772, "loss": 5.296406269073486, "step": 394 }, { "epoch": 0.632, "grad_norm": 1.2438563108444214, "learning_rate": 0.0001740459554752492, "loss": 5.258586406707764, "step": 395 }, { "epoch": 0.6336, "grad_norm": 1.2174347639083862, "learning_rate": 0.00017274575140626317, "loss": 5.269428253173828, "step": 396 }, { "epoch": 0.6352, "grad_norm": 1.4118070602416992, "learning_rate": 0.00017144785347557643, "loss": 4.895862579345703, "step": 397 }, { "epoch": 0.6368, "grad_norm": 1.2514833211898804, "learning_rate": 0.000170152300427105, "loss": 5.026675701141357, "step": 398 }, { "epoch": 0.6384, "grad_norm": 1.2788375616073608, "learning_rate": 0.0001688591309347674, "loss": 5.225519180297852, "step": 399 }, { "epoch": 0.64, "grad_norm": 2.105532646179199, "learning_rate": 0.00016756838360132968, "loss": 4.846694469451904, "step": 400 }, { "epoch": 0.6416, "grad_norm": 1.5078997611999512, "learning_rate": 0.00016628009695725346, "loss": 5.365673065185547, "step": 401 }, { "epoch": 0.6432, "grad_norm": 1.2744578123092651, "learning_rate": 0.00016499430945954576, "loss": 5.406460285186768, "step": 402 }, { "epoch": 0.6448, "grad_norm": 1.494751214981079, "learning_rate": 0.0001637110594906106, "loss": 5.130960464477539, "step": 403 }, { "epoch": 0.6464, "grad_norm": 1.4280880689620972, "learning_rate": 0.00016243038535710365, "loss": 5.194888114929199, "step": 404 }, { "epoch": 0.648, "grad_norm": 1.2241549491882324, "learning_rate": 0.00016115232528878876, "loss": 4.969592571258545, "step": 405 }, { "epoch": 0.6496, "grad_norm": 1.2147563695907593, "learning_rate": 0.00015987691743739636, "loss": 5.176176071166992, "step": 406 }, { "epoch": 0.6512, "grad_norm": 1.1825661659240723, "learning_rate": 0.00015860419987548486, "loss": 5.012125015258789, "step": 407 }, { "epoch": 0.6528, "grad_norm": 1.3765822649002075, "learning_rate": 0.00015733421059530397, "loss": 5.192166328430176, "step": 408 }, { "epoch": 0.6544, "grad_norm": 1.455336332321167, "learning_rate": 0.00015606698750766107, "loss": 5.153839111328125, "step": 409 }, { "epoch": 0.656, "grad_norm": 1.2426291704177856, "learning_rate": 0.00015480256844078877, "loss": 5.300335884094238, "step": 410 }, { "epoch": 0.6576, "grad_norm": 1.2273467779159546, "learning_rate": 0.00015354099113921613, "loss": 5.370866775512695, "step": 411 }, { "epoch": 0.6592, "grad_norm": 1.3272308111190796, "learning_rate": 0.0001522822932626421, "loss": 5.237664699554443, "step": 412 }, { "epoch": 0.6608, "grad_norm": 1.486881136894226, "learning_rate": 0.00015102651238481092, "loss": 5.199460029602051, "step": 413 }, { "epoch": 0.6624, "grad_norm": 1.225791096687317, "learning_rate": 0.0001497736859923906, "loss": 5.001354217529297, "step": 414 }, { "epoch": 0.664, "grad_norm": 1.1577017307281494, "learning_rate": 0.00014852385148385412, "loss": 4.978085517883301, "step": 415 }, { "epoch": 0.6656, "grad_norm": 1.1296128034591675, "learning_rate": 0.00014727704616836296, "loss": 5.08205509185791, "step": 416 }, { "epoch": 0.6672, "grad_norm": 1.450363278388977, "learning_rate": 0.00014603330726465315, "loss": 5.209231853485107, "step": 417 }, { "epoch": 0.6688, "grad_norm": 1.1756222248077393, "learning_rate": 0.00014479267189992435, "loss": 5.059493064880371, "step": 418 }, { "epoch": 0.6704, "grad_norm": 1.3998825550079346, "learning_rate": 0.00014355517710873183, "loss": 4.99937629699707, "step": 419 }, { "epoch": 0.672, "grad_norm": 1.3438893556594849, "learning_rate": 0.00014232085983188064, "loss": 5.317448616027832, "step": 420 }, { "epoch": 0.6736, "grad_norm": 1.080320119857788, "learning_rate": 0.00014108975691532271, "loss": 5.1715264320373535, "step": 421 }, { "epoch": 0.6752, "grad_norm": 1.2611881494522095, "learning_rate": 0.00013986190510905758, "loss": 4.58638858795166, "step": 422 }, { "epoch": 0.6768, "grad_norm": 1.2457435131072998, "learning_rate": 0.0001386373410660347, "loss": 4.950125217437744, "step": 423 }, { "epoch": 0.6784, "grad_norm": 1.7552827596664429, "learning_rate": 0.00013741610134105983, "loss": 5.444072723388672, "step": 424 }, { "epoch": 0.68, "grad_norm": 1.21152925491333, "learning_rate": 0.0001361982223897032, "loss": 5.073456287384033, "step": 425 }, { "epoch": 0.6816, "grad_norm": 1.5059016942977905, "learning_rate": 0.00013498374056721197, "loss": 5.584665298461914, "step": 426 }, { "epoch": 0.6832, "grad_norm": 1.4177290201187134, "learning_rate": 0.00013377269212742457, "loss": 5.289451599121094, "step": 427 }, { "epoch": 0.6848, "grad_norm": 1.4181674718856812, "learning_rate": 0.0001325651132216886, "loss": 4.7561540603637695, "step": 428 }, { "epoch": 0.6864, "grad_norm": 1.1193443536758423, "learning_rate": 0.00013136103989778137, "loss": 5.055768013000488, "step": 429 }, { "epoch": 0.688, "grad_norm": 1.1662368774414062, "learning_rate": 0.00013016050809883434, "loss": 4.925864219665527, "step": 430 }, { "epoch": 0.6896, "grad_norm": 1.188244104385376, "learning_rate": 0.00012896355366225998, "loss": 4.825364589691162, "step": 431 }, { "epoch": 0.6912, "grad_norm": 1.4330700635910034, "learning_rate": 0.00012777021231868144, "loss": 5.1424055099487305, "step": 432 }, { "epoch": 0.6928, "grad_norm": 1.5289138555526733, "learning_rate": 0.00012658051969086713, "loss": 5.1443772315979, "step": 433 }, { "epoch": 0.6944, "grad_norm": 1.455989122390747, "learning_rate": 0.00012539451129266603, "loss": 4.967620849609375, "step": 434 }, { "epoch": 0.696, "grad_norm": 1.36936354637146, "learning_rate": 0.00012421222252794833, "loss": 5.1624908447265625, "step": 435 }, { "epoch": 0.6976, "grad_norm": 1.3274517059326172, "learning_rate": 0.0001230336886895485, "loss": 5.160506725311279, "step": 436 }, { "epoch": 0.6992, "grad_norm": 1.3301618099212646, "learning_rate": 0.0001218589449582116, "loss": 4.8344645500183105, "step": 437 }, { "epoch": 0.7008, "grad_norm": 1.4845178127288818, "learning_rate": 0.00012068802640154292, "loss": 4.987344264984131, "step": 438 }, { "epoch": 0.7024, "grad_norm": 1.2381513118743896, "learning_rate": 0.00011952096797296167, "loss": 4.904998779296875, "step": 439 }, { "epoch": 0.704, "grad_norm": 1.395328402519226, "learning_rate": 0.00011835780451065722, "loss": 4.8166656494140625, "step": 440 }, { "epoch": 0.7056, "grad_norm": 1.9488160610198975, "learning_rate": 0.00011719857073654922, "loss": 5.329633712768555, "step": 441 }, { "epoch": 0.7072, "grad_norm": 1.4535843133926392, "learning_rate": 0.00011604330125525078, "loss": 4.918258190155029, "step": 442 }, { "epoch": 0.7088, "grad_norm": 1.4393301010131836, "learning_rate": 0.00011489203055303646, "loss": 5.293149471282959, "step": 443 }, { "epoch": 0.7104, "grad_norm": 1.5147560834884644, "learning_rate": 0.00011374479299681142, "loss": 5.193087100982666, "step": 444 }, { "epoch": 0.712, "grad_norm": 1.9008417129516602, "learning_rate": 0.00011260162283308678, "loss": 5.060847282409668, "step": 445 }, { "epoch": 0.7136, "grad_norm": 1.42693030834198, "learning_rate": 0.00011146255418695633, "loss": 5.017470836639404, "step": 446 }, { "epoch": 0.7152, "grad_norm": 1.3155730962753296, "learning_rate": 0.00011032762106107872, "loss": 5.276302337646484, "step": 447 }, { "epoch": 0.7168, "grad_norm": 1.414832592010498, "learning_rate": 0.00010919685733466175, "loss": 5.105321884155273, "step": 448 }, { "epoch": 0.7184, "grad_norm": 1.4331352710723877, "learning_rate": 0.00010807029676245145, "loss": 5.178823471069336, "step": 449 }, { "epoch": 0.72, "grad_norm": 2.958193302154541, "learning_rate": 0.00010694797297372433, "loss": 5.053134918212891, "step": 450 }, { "epoch": 0.7216, "grad_norm": 1.4789056777954102, "learning_rate": 0.00010582991947128323, "loss": 5.253017425537109, "step": 451 }, { "epoch": 0.7232, "grad_norm": 1.4438488483428955, "learning_rate": 0.00010471616963045788, "loss": 4.795893669128418, "step": 452 }, { "epoch": 0.7248, "grad_norm": 1.0840559005737305, "learning_rate": 0.00010360675669810765, "loss": 4.984047889709473, "step": 453 }, { "epoch": 0.7264, "grad_norm": 1.0405324697494507, "learning_rate": 0.00010250171379163034, "loss": 5.2449116706848145, "step": 454 }, { "epoch": 0.728, "grad_norm": 1.5884569883346558, "learning_rate": 0.00010140107389797223, "loss": 4.744875907897949, "step": 455 }, { "epoch": 0.7296, "grad_norm": 1.3832892179489136, "learning_rate": 0.00010030486987264437, "loss": 5.204304218292236, "step": 456 }, { "epoch": 0.7312, "grad_norm": 1.5350919961929321, "learning_rate": 9.921313443874142e-05, "loss": 4.8627400398254395, "step": 457 }, { "epoch": 0.7328, "grad_norm": 1.3951729536056519, "learning_rate": 9.812590018596485e-05, "loss": 4.816617488861084, "step": 458 }, { "epoch": 0.7344, "grad_norm": 1.4187312126159668, "learning_rate": 9.704319956964996e-05, "loss": 5.244232654571533, "step": 459 }, { "epoch": 0.736, "grad_norm": 1.9965143203735352, "learning_rate": 9.596506490979737e-05, "loss": 5.668506145477295, "step": 460 }, { "epoch": 0.7376, "grad_norm": 1.6400834321975708, "learning_rate": 9.489152839010798e-05, "loss": 5.365629196166992, "step": 461 }, { "epoch": 0.7392, "grad_norm": 1.442253828048706, "learning_rate": 9.382262205702247e-05, "loss": 5.322830677032471, "step": 462 }, { "epoch": 0.7408, "grad_norm": 1.1397078037261963, "learning_rate": 9.275837781876404e-05, "loss": 5.002555847167969, "step": 463 }, { "epoch": 0.7424, "grad_norm": 1.4520896673202515, "learning_rate": 9.16988274443871e-05, "loss": 5.138970375061035, "step": 464 }, { "epoch": 0.744, "grad_norm": 1.3373026847839355, "learning_rate": 9.064400256282756e-05, "loss": 5.060115814208984, "step": 465 }, { "epoch": 0.7456, "grad_norm": 1.3698216676712036, "learning_rate": 8.959393466195972e-05, "loss": 5.160407066345215, "step": 466 }, { "epoch": 0.7472, "grad_norm": 1.45284104347229, "learning_rate": 8.854865508765577e-05, "loss": 4.794371604919434, "step": 467 }, { "epoch": 0.7488, "grad_norm": 1.2445486783981323, "learning_rate": 8.750819504285015e-05, "loss": 4.926098823547363, "step": 468 }, { "epoch": 0.7504, "grad_norm": 1.5558010339736938, "learning_rate": 8.647258558660828e-05, "loss": 5.0971245765686035, "step": 469 }, { "epoch": 0.752, "grad_norm": 1.5887895822525024, "learning_rate": 8.544185763319925e-05, "loss": 5.4126152992248535, "step": 470 }, { "epoch": 0.7536, "grad_norm": 1.1927727460861206, "learning_rate": 8.441604195117314e-05, "loss": 4.76765251159668, "step": 471 }, { "epoch": 0.7552, "grad_norm": 1.1783281564712524, "learning_rate": 8.339516916244216e-05, "loss": 5.2575907707214355, "step": 472 }, { "epoch": 0.7568, "grad_norm": 1.4256731271743774, "learning_rate": 8.237926974136715e-05, "loss": 4.811319351196289, "step": 473 }, { "epoch": 0.7584, "grad_norm": 1.1950210332870483, "learning_rate": 8.136837401384733e-05, "loss": 5.229648590087891, "step": 474 }, { "epoch": 0.76, "grad_norm": 1.409590721130371, "learning_rate": 8.036251215641546e-05, "loss": 5.007275104522705, "step": 475 }, { "epoch": 0.7616, "grad_norm": 1.3664684295654297, "learning_rate": 7.936171419533653e-05, "loss": 5.1865339279174805, "step": 476 }, { "epoch": 0.7632, "grad_norm": 1.272782564163208, "learning_rate": 7.836601000571197e-05, "loss": 5.0746636390686035, "step": 477 }, { "epoch": 0.7648, "grad_norm": 1.430291771888733, "learning_rate": 7.737542931058755e-05, "loss": 5.309817790985107, "step": 478 }, { "epoch": 0.7664, "grad_norm": 1.391274094581604, "learning_rate": 7.63900016800663e-05, "loss": 4.913700103759766, "step": 479 }, { "epoch": 0.768, "grad_norm": 1.8367639780044556, "learning_rate": 7.54097565304252e-05, "loss": 4.870950222015381, "step": 480 }, { "epoch": 0.7696, "grad_norm": 1.5375534296035767, "learning_rate": 7.443472312323824e-05, "loss": 5.078888893127441, "step": 481 }, { "epoch": 0.7712, "grad_norm": 1.3212310075759888, "learning_rate": 7.346493056450157e-05, "loss": 4.916213512420654, "step": 482 }, { "epoch": 0.7728, "grad_norm": 1.4506617784500122, "learning_rate": 7.250040780376577e-05, "loss": 4.79956579208374, "step": 483 }, { "epoch": 0.7744, "grad_norm": 1.269956350326538, "learning_rate": 7.154118363327075e-05, "loss": 5.207999229431152, "step": 484 }, { "epoch": 0.776, "grad_norm": 1.386398196220398, "learning_rate": 7.058728668708727e-05, "loss": 4.866647720336914, "step": 485 }, { "epoch": 0.7776, "grad_norm": 1.2891589403152466, "learning_rate": 6.963874544026109e-05, "loss": 5.038686752319336, "step": 486 }, { "epoch": 0.7792, "grad_norm": 1.2647722959518433, "learning_rate": 6.869558820796376e-05, "loss": 5.102810859680176, "step": 487 }, { "epoch": 0.7808, "grad_norm": 1.2693649530410767, "learning_rate": 6.775784314464717e-05, "loss": 4.887539863586426, "step": 488 }, { "epoch": 0.7824, "grad_norm": 1.6362860202789307, "learning_rate": 6.68255382432027e-05, "loss": 4.774933338165283, "step": 489 }, { "epoch": 0.784, "grad_norm": 1.5527857542037964, "learning_rate": 6.589870133412626e-05, "loss": 5.0828680992126465, "step": 490 }, { "epoch": 0.7856, "grad_norm": 1.6107929944992065, "learning_rate": 6.497736008468701e-05, "loss": 4.6461639404296875, "step": 491 }, { "epoch": 0.7872, "grad_norm": 1.12363862991333, "learning_rate": 6.406154199810179e-05, "loss": 5.033900260925293, "step": 492 }, { "epoch": 0.7888, "grad_norm": 1.1499987840652466, "learning_rate": 6.315127441271368e-05, "loss": 4.9476094245910645, "step": 493 }, { "epoch": 0.7904, "grad_norm": 1.5613439083099365, "learning_rate": 6.224658450117637e-05, "loss": 5.146108150482178, "step": 494 }, { "epoch": 0.792, "grad_norm": 1.2324504852294922, "learning_rate": 6.134749926964289e-05, "loss": 4.819706916809082, "step": 495 }, { "epoch": 0.7936, "grad_norm": 1.1125681400299072, "learning_rate": 6.0454045556959356e-05, "loss": 4.930054664611816, "step": 496 }, { "epoch": 0.7952, "grad_norm": 1.6992604732513428, "learning_rate": 5.9566250033863567e-05, "loss": 5.198884963989258, "step": 497 }, { "epoch": 0.7968, "grad_norm": 1.920567512512207, "learning_rate": 5.8684139202189654e-05, "loss": 5.21380615234375, "step": 498 }, { "epoch": 0.7984, "grad_norm": 1.3954874277114868, "learning_rate": 5.780773939407585e-05, "loss": 4.928266525268555, "step": 499 }, { "epoch": 0.8, "grad_norm": 1.4884490966796875, "learning_rate": 5.693707677117943e-05, "loss": 5.14831018447876, "step": 500 }, { "epoch": 0.8016, "grad_norm": 1.7664364576339722, "learning_rate": 5.607217732389502e-05, "loss": 5.231863975524902, "step": 501 }, { "epoch": 0.8032, "grad_norm": 1.1272830963134766, "learning_rate": 5.5213066870579476e-05, "loss": 5.004734039306641, "step": 502 }, { "epoch": 0.8048, "grad_norm": 1.2964353561401367, "learning_rate": 5.4359771056780333e-05, "loss": 4.362703323364258, "step": 503 }, { "epoch": 0.8064, "grad_norm": 1.3352986574172974, "learning_rate": 5.3512315354470956e-05, "loss": 4.99576473236084, "step": 504 }, { "epoch": 0.808, "grad_norm": 1.4980597496032715, "learning_rate": 5.267072506128981e-05, "loss": 5.139542579650879, "step": 505 }, { "epoch": 0.8096, "grad_norm": 1.1959021091461182, "learning_rate": 5.183502529978548e-05, "loss": 5.123270034790039, "step": 506 }, { "epoch": 0.8112, "grad_norm": 1.3239198923110962, "learning_rate": 5.10052410166664e-05, "loss": 5.379024028778076, "step": 507 }, { "epoch": 0.8128, "grad_norm": 1.204946756362915, "learning_rate": 5.018139698205665e-05, "loss": 5.012156963348389, "step": 508 }, { "epoch": 0.8144, "grad_norm": 1.5109254121780396, "learning_rate": 4.9363517788756195e-05, "loss": 4.902032852172852, "step": 509 }, { "epoch": 0.816, "grad_norm": 1.1028631925582886, "learning_rate": 4.855162785150674e-05, "loss": 5.165895938873291, "step": 510 }, { "epoch": 0.8176, "grad_norm": 1.042698860168457, "learning_rate": 4.7745751406263163e-05, "loss": 4.897646427154541, "step": 511 }, { "epoch": 0.8192, "grad_norm": 1.2713276147842407, "learning_rate": 4.694591250946983e-05, "loss": 4.820833206176758, "step": 512 }, { "epoch": 0.8208, "grad_norm": 1.1189286708831787, "learning_rate": 4.615213503734267e-05, "loss": 4.981866836547852, "step": 513 }, { "epoch": 0.8224, "grad_norm": 1.3545044660568237, "learning_rate": 4.536444268515608e-05, "loss": 4.901456832885742, "step": 514 }, { "epoch": 0.824, "grad_norm": 1.3025493621826172, "learning_rate": 4.458285896653602e-05, "loss": 5.010705947875977, "step": 515 }, { "epoch": 0.8256, "grad_norm": 1.5655075311660767, "learning_rate": 4.380740721275786e-05, "loss": 5.438045501708984, "step": 516 }, { "epoch": 0.8272, "grad_norm": 1.4804078340530396, "learning_rate": 4.303811057205007e-05, "loss": 4.864298343658447, "step": 517 }, { "epoch": 0.8288, "grad_norm": 1.3067195415496826, "learning_rate": 4.227499200890275e-05, "loss": 5.399082183837891, "step": 518 }, { "epoch": 0.8304, "grad_norm": 1.3728652000427246, "learning_rate": 4.1518074303383004e-05, "loss": 4.861635684967041, "step": 519 }, { "epoch": 0.832, "grad_norm": 1.0616425275802612, "learning_rate": 4.076738005045394e-05, "loss": 5.093954563140869, "step": 520 }, { "epoch": 0.8336, "grad_norm": 1.2632859945297241, "learning_rate": 4.002293165930088e-05, "loss": 5.069172382354736, "step": 521 }, { "epoch": 0.8352, "grad_norm": 1.54668390750885, "learning_rate": 3.9284751352662045e-05, "loss": 5.132449150085449, "step": 522 }, { "epoch": 0.8368, "grad_norm": 1.4716906547546387, "learning_rate": 3.855286116616541e-05, "loss": 4.952608585357666, "step": 523 }, { "epoch": 0.8384, "grad_norm": 1.315252423286438, "learning_rate": 3.782728294767068e-05, "loss": 4.983213424682617, "step": 524 }, { "epoch": 0.84, "grad_norm": 1.4445892572402954, "learning_rate": 3.7108038356617305e-05, "loss": 5.154409885406494, "step": 525 }, { "epoch": 0.8416, "grad_norm": 1.3014910221099854, "learning_rate": 3.6395148863377855e-05, "loss": 4.867927551269531, "step": 526 }, { "epoch": 0.8432, "grad_norm": 1.1832693815231323, "learning_rate": 3.568863574861708e-05, "loss": 4.7219462394714355, "step": 527 }, { "epoch": 0.8448, "grad_norm": 1.38213312625885, "learning_rate": 3.49885201026566e-05, "loss": 4.771894931793213, "step": 528 }, { "epoch": 0.8464, "grad_norm": 1.2693217992782593, "learning_rate": 3.4294822824845444e-05, "loss": 4.964877128601074, "step": 529 }, { "epoch": 0.848, "grad_norm": 1.170465350151062, "learning_rate": 3.3607564622936207e-05, "loss": 4.916166305541992, "step": 530 }, { "epoch": 0.8496, "grad_norm": 1.267838716506958, "learning_rate": 3.292676601246661e-05, "loss": 5.243579387664795, "step": 531 }, { "epoch": 0.8512, "grad_norm": 1.3622010946273804, "learning_rate": 3.2252447316147456e-05, "loss": 4.598936080932617, "step": 532 }, { "epoch": 0.8528, "grad_norm": 1.5820192098617554, "learning_rate": 3.1584628663255847e-05, "loss": 5.2594170570373535, "step": 533 }, { "epoch": 0.8544, "grad_norm": 1.5312021970748901, "learning_rate": 3.092332998903416e-05, "loss": 5.157290935516357, "step": 534 }, { "epoch": 0.856, "grad_norm": 1.4027749300003052, "learning_rate": 3.0268571034094944e-05, "loss": 5.125532150268555, "step": 535 }, { "epoch": 0.8576, "grad_norm": 1.1611146926879883, "learning_rate": 2.962037134383211e-05, "loss": 5.000718593597412, "step": 536 }, { "epoch": 0.8592, "grad_norm": 1.3523814678192139, "learning_rate": 2.8978750267836752e-05, "loss": 4.671696662902832, "step": 537 }, { "epoch": 0.8608, "grad_norm": 1.2509510517120361, "learning_rate": 2.8343726959320082e-05, "loss": 5.075153350830078, "step": 538 }, { "epoch": 0.8624, "grad_norm": 1.3108588457107544, "learning_rate": 2.7715320374541357e-05, "loss": 4.994152545928955, "step": 539 }, { "epoch": 0.864, "grad_norm": 1.1837953329086304, "learning_rate": 2.7093549272242445e-05, "loss": 5.121654510498047, "step": 540 }, { "epoch": 0.8656, "grad_norm": 1.5410609245300293, "learning_rate": 2.6478432213087213e-05, "loss": 4.955600738525391, "step": 541 }, { "epoch": 0.8672, "grad_norm": 1.0305265188217163, "learning_rate": 2.5869987559107992e-05, "loss": 5.132237911224365, "step": 542 }, { "epoch": 0.8688, "grad_norm": 1.219406247138977, "learning_rate": 2.5268233473157294e-05, "loss": 4.905612468719482, "step": 543 }, { "epoch": 0.8704, "grad_norm": 1.5246868133544922, "learning_rate": 2.467318791836559e-05, "loss": 5.272589206695557, "step": 544 }, { "epoch": 0.872, "grad_norm": 1.2425482273101807, "learning_rate": 2.408486865760495e-05, "loss": 5.108579158782959, "step": 545 }, { "epoch": 0.8736, "grad_norm": 1.1925750970840454, "learning_rate": 2.3503293252959136e-05, "loss": 5.024507522583008, "step": 546 }, { "epoch": 0.8752, "grad_norm": 1.2723841667175293, "learning_rate": 2.2928479065199072e-05, "loss": 5.255931377410889, "step": 547 }, { "epoch": 0.8768, "grad_norm": 1.620451807975769, "learning_rate": 2.2360443253264777e-05, "loss": 5.196926593780518, "step": 548 }, { "epoch": 0.8784, "grad_norm": 1.1335077285766602, "learning_rate": 2.179920277375294e-05, "loss": 4.717995643615723, "step": 549 }, { "epoch": 0.88, "grad_norm": 1.1418888568878174, "learning_rate": 2.1244774380410976e-05, "loss": 5.335053443908691, "step": 550 }, { "epoch": 0.8816, "grad_norm": 1.3852171897888184, "learning_rate": 2.0697174623636794e-05, "loss": 5.047591209411621, "step": 551 }, { "epoch": 0.8832, "grad_norm": 1.2350728511810303, "learning_rate": 2.015641984998459e-05, "loss": 4.715671062469482, "step": 552 }, { "epoch": 0.8848, "grad_norm": 1.115648865699768, "learning_rate": 1.9622526201677344e-05, "loss": 5.0985612869262695, "step": 553 }, { "epoch": 0.8864, "grad_norm": 1.7186869382858276, "learning_rate": 1.9095509616124385e-05, "loss": 4.931835651397705, "step": 554 }, { "epoch": 0.888, "grad_norm": 1.2360730171203613, "learning_rate": 1.85753858254461e-05, "loss": 4.8929924964904785, "step": 555 }, { "epoch": 0.8896, "grad_norm": 1.146570086479187, "learning_rate": 1.8062170356003854e-05, "loss": 5.117987632751465, "step": 556 }, { "epoch": 0.8912, "grad_norm": 1.1873035430908203, "learning_rate": 1.7555878527937163e-05, "loss": 4.8101091384887695, "step": 557 }, { "epoch": 0.8928, "grad_norm": 1.1898494958877563, "learning_rate": 1.7056525454705623e-05, "loss": 5.127380847930908, "step": 558 }, { "epoch": 0.8944, "grad_norm": 1.431149959564209, "learning_rate": 1.656412604263824e-05, "loss": 5.338906764984131, "step": 559 }, { "epoch": 0.896, "grad_norm": 1.1228066682815552, "learning_rate": 1.607869499048839e-05, "loss": 4.9782185554504395, "step": 560 }, { "epoch": 0.8976, "grad_norm": 1.3961535692214966, "learning_rate": 1.5600246788994937e-05, "loss": 4.974421501159668, "step": 561 }, { "epoch": 0.8992, "grad_norm": 1.281671166419983, "learning_rate": 1.5128795720449617e-05, "loss": 4.919782638549805, "step": 562 }, { "epoch": 0.9008, "grad_norm": 1.220367670059204, "learning_rate": 1.4664355858270862e-05, "loss": 4.936645030975342, "step": 563 }, { "epoch": 0.9024, "grad_norm": 1.0977709293365479, "learning_rate": 1.4206941066583629e-05, "loss": 4.759374618530273, "step": 564 }, { "epoch": 0.904, "grad_norm": 1.2086211442947388, "learning_rate": 1.3756564999805515e-05, "loss": 5.17381477355957, "step": 565 }, { "epoch": 0.9056, "grad_norm": 1.172023892402649, "learning_rate": 1.3313241102239054e-05, "loss": 4.950685977935791, "step": 566 }, { "epoch": 0.9072, "grad_norm": 1.210207462310791, "learning_rate": 1.2876982607670674e-05, "loss": 5.04666805267334, "step": 567 }, { "epoch": 0.9088, "grad_norm": 1.0206573009490967, "learning_rate": 1.2447802538975345e-05, "loss": 5.030869483947754, "step": 568 }, { "epoch": 0.9104, "grad_norm": 1.2772059440612793, "learning_rate": 1.2025713707727953e-05, "loss": 5.230049133300781, "step": 569 }, { "epoch": 0.912, "grad_norm": 1.1435636281967163, "learning_rate": 1.1610728713820906e-05, "loss": 5.214902400970459, "step": 570 }, { "epoch": 0.9136, "grad_norm": 1.430433988571167, "learning_rate": 1.120285994508799e-05, "loss": 4.8903584480285645, "step": 571 }, { "epoch": 0.9152, "grad_norm": 1.2580111026763916, "learning_rate": 1.08021195769345e-05, "loss": 5.1730055809021, "step": 572 }, { "epoch": 0.9168, "grad_norm": 1.3038173913955688, "learning_rate": 1.0408519571973806e-05, "loss": 5.069331169128418, "step": 573 }, { "epoch": 0.9184, "grad_norm": 1.4082874059677124, "learning_rate": 1.0022071679670425e-05, "loss": 5.165510177612305, "step": 574 }, { "epoch": 0.92, "grad_norm": 1.3335379362106323, "learning_rate": 9.642787435989008e-06, "loss": 4.859002113342285, "step": 575 }, { "epoch": 0.9216, "grad_norm": 1.1995774507522583, "learning_rate": 9.270678163050216e-06, "loss": 5.164345741271973, "step": 576 }, { "epoch": 0.9232, "grad_norm": 1.0635286569595337, "learning_rate": 8.90575496879248e-06, "loss": 4.728398323059082, "step": 577 }, { "epoch": 0.9248, "grad_norm": 1.1882269382476807, "learning_rate": 8.548028746640846e-06, "loss": 4.7602972984313965, "step": 578 }, { "epoch": 0.9264, "grad_norm": 1.389762282371521, "learning_rate": 8.197510175181277e-06, "loss": 5.069275856018066, "step": 579 }, { "epoch": 0.928, "grad_norm": 1.1334697008132935, "learning_rate": 7.854209717842232e-06, "loss": 5.110383033752441, "step": 580 }, { "epoch": 0.9296, "grad_norm": 1.244832992553711, "learning_rate": 7.518137622582188e-06, "loss": 5.184660911560059, "step": 581 }, { "epoch": 0.9312, "grad_norm": 1.1092815399169922, "learning_rate": 7.1893039215838175e-06, "loss": 4.963058948516846, "step": 582 }, { "epoch": 0.9328, "grad_norm": 1.6420494318008423, "learning_rate": 6.867718430954351e-06, "loss": 4.9267964363098145, "step": 583 }, { "epoch": 0.9344, "grad_norm": 1.49501371383667, "learning_rate": 6.553390750432708e-06, "loss": 4.730033874511719, "step": 584 }, { "epoch": 0.936, "grad_norm": 1.2878178358078003, "learning_rate": 6.246330263102895e-06, "loss": 5.060173034667969, "step": 585 }, { "epoch": 0.9376, "grad_norm": 1.2040040493011475, "learning_rate": 5.9465461351138615e-06, "loss": 5.053962707519531, "step": 586 }, { "epoch": 0.9392, "grad_norm": 1.1503539085388184, "learning_rate": 5.654047315405892e-06, "loss": 4.980835437774658, "step": 587 }, { "epoch": 0.9408, "grad_norm": 0.9667116403579712, "learning_rate": 5.368842535443508e-06, "loss": 5.023655414581299, "step": 588 }, { "epoch": 0.9424, "grad_norm": 1.2056710720062256, "learning_rate": 5.09094030895485e-06, "loss": 4.959043979644775, "step": 589 }, { "epoch": 0.944, "grad_norm": 1.0608792304992676, "learning_rate": 4.8203489316773485e-06, "loss": 5.312167644500732, "step": 590 }, { "epoch": 0.9456, "grad_norm": 1.4500396251678467, "learning_rate": 4.557076481110367e-06, "loss": 4.965682029724121, "step": 591 }, { "epoch": 0.9472, "grad_norm": 1.118233561515808, "learning_rate": 4.301130816273813e-06, "loss": 4.988546848297119, "step": 592 }, { "epoch": 0.9488, "grad_norm": 1.2060961723327637, "learning_rate": 4.05251957747374e-06, "loss": 5.0205888748168945, "step": 593 }, { "epoch": 0.9504, "grad_norm": 1.270868182182312, "learning_rate": 3.811250186074089e-06, "loss": 5.278676509857178, "step": 594 }, { "epoch": 0.952, "grad_norm": 1.4645127058029175, "learning_rate": 3.5773298442753898e-06, "loss": 4.93894100189209, "step": 595 }, { "epoch": 0.9536, "grad_norm": 1.21164870262146, "learning_rate": 3.3507655348995192e-06, "loss": 5.321264266967773, "step": 596 }, { "epoch": 0.9552, "grad_norm": 1.2144756317138672, "learning_rate": 3.131564021181338e-06, "loss": 4.879669666290283, "step": 597 }, { "epoch": 0.9568, "grad_norm": 1.7862255573272705, "learning_rate": 2.9197318465669364e-06, "loss": 5.113965034484863, "step": 598 }, { "epoch": 0.9584, "grad_norm": 1.427722692489624, "learning_rate": 2.7152753345181247e-06, "loss": 4.928999423980713, "step": 599 }, { "epoch": 0.96, "grad_norm": 1.3544409275054932, "learning_rate": 2.518200588323666e-06, "loss": 5.407461166381836, "step": 600 }, { "epoch": 0.9616, "grad_norm": 1.8953897953033447, "learning_rate": 2.328513490917311e-06, "loss": 4.892749309539795, "step": 601 }, { "epoch": 0.9632, "grad_norm": 1.3621735572814941, "learning_rate": 2.1462197047019127e-06, "loss": 5.107844352722168, "step": 602 }, { "epoch": 0.9648, "grad_norm": 1.19562566280365, "learning_rate": 1.9713246713805587e-06, "loss": 5.338631629943848, "step": 603 }, { "epoch": 0.9664, "grad_norm": 1.0211833715438843, "learning_rate": 1.803833611794037e-06, "loss": 4.848773002624512, "step": 604 }, { "epoch": 0.968, "grad_norm": 1.4424593448638916, "learning_rate": 1.643751525765097e-06, "loss": 5.272921562194824, "step": 605 }, { "epoch": 0.9696, "grad_norm": 1.2189918756484985, "learning_rate": 1.4910831919490997e-06, "loss": 4.7630157470703125, "step": 606 }, { "epoch": 0.9712, "grad_norm": 1.1489924192428589, "learning_rate": 1.345833167691407e-06, "loss": 5.053176403045654, "step": 607 }, { "epoch": 0.9728, "grad_norm": 1.102137804031372, "learning_rate": 1.2080057888913253e-06, "loss": 5.1648359298706055, "step": 608 }, { "epoch": 0.9744, "grad_norm": 1.193506121635437, "learning_rate": 1.0776051698727362e-06, "loss": 4.978764533996582, "step": 609 }, { "epoch": 0.976, "grad_norm": 1.3150538206100464, "learning_rate": 9.546352032611395e-07, "loss": 5.2356038093566895, "step": 610 }, { "epoch": 0.9776, "grad_norm": 1.2881925106048584, "learning_rate": 8.390995598676066e-07, "loss": 5.024952411651611, "step": 611 }, { "epoch": 0.9792, "grad_norm": 1.2736302614212036, "learning_rate": 7.310016885791471e-07, "loss": 5.065498352050781, "step": 612 }, { "epoch": 0.9808, "grad_norm": 1.3327683210372925, "learning_rate": 6.303448162556791e-07, "loss": 5.073752403259277, "step": 613 }, { "epoch": 0.9824, "grad_norm": 1.3384580612182617, "learning_rate": 5.371319476338288e-07, "loss": 5.055788993835449, "step": 614 }, { "epoch": 0.984, "grad_norm": 1.3576717376708984, "learning_rate": 4.513658652371133e-07, "loss": 5.128819465637207, "step": 615 }, { "epoch": 0.9856, "grad_norm": 1.2477798461914062, "learning_rate": 3.7304912929300716e-07, "loss": 4.873608112335205, "step": 616 }, { "epoch": 0.9872, "grad_norm": 1.2135578393936157, "learning_rate": 3.0218407765642e-07, "loss": 5.116058349609375, "step": 617 }, { "epoch": 0.9888, "grad_norm": 1.3859200477600098, "learning_rate": 2.387728257399191e-07, "loss": 4.957461357116699, "step": 618 }, { "epoch": 0.9904, "grad_norm": 1.2815113067626953, "learning_rate": 1.8281726645061335e-07, "loss": 4.715893745422363, "step": 619 }, { "epoch": 0.992, "grad_norm": 1.275434136390686, "learning_rate": 1.343190701336705e-07, "loss": 4.888550281524658, "step": 620 }, { "epoch": 0.9936, "grad_norm": 1.0546596050262451, "learning_rate": 9.327968452232938e-08, "loss": 4.995277404785156, "step": 621 }, { "epoch": 0.9952, "grad_norm": 1.248382568359375, "learning_rate": 5.970033469490655e-08, "loss": 5.184177398681641, "step": 622 }, { "epoch": 0.9968, "grad_norm": 1.3588132858276367, "learning_rate": 3.3582023037964645e-08, "loss": 4.7490763664245605, "step": 623 }, { "epoch": 0.9984, "grad_norm": 1.121005654335022, "learning_rate": 1.492552921655843e-08, "loss": 4.857783317565918, "step": 624 }, { "epoch": 1.0, "grad_norm": 1.4019795656204224, "learning_rate": 3.731410150975556e-09, "loss": 5.065018177032471, "step": 625 } ], "logging_steps": 1, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.933484093429535e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }