{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9943851768669285, "eval_steps": 500, "global_step": 333, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008983717012914094, "grad_norm": 5.839109932109842, "learning_rate": 1.663780588235294e-06, "loss": 0.851, "step": 1 }, { "epoch": 0.017967434025828188, "grad_norm": 5.796557129672097, "learning_rate": 3.327561176470588e-06, "loss": 0.8589, "step": 2 }, { "epoch": 0.02695115103874228, "grad_norm": 5.7589149287471075, "learning_rate": 4.991341764705883e-06, "loss": 0.8612, "step": 3 }, { "epoch": 0.035934868051656375, "grad_norm": 4.220444230457979, "learning_rate": 6.655122352941176e-06, "loss": 0.8097, "step": 4 }, { "epoch": 0.044918585064570464, "grad_norm": 2.2279497783130267, "learning_rate": 8.318902941176471e-06, "loss": 0.7672, "step": 5 }, { "epoch": 0.05390230207748456, "grad_norm": 3.814533547249801, "learning_rate": 9.982683529411766e-06, "loss": 0.7561, "step": 6 }, { "epoch": 0.06288601909039865, "grad_norm": 4.0811292825217835, "learning_rate": 1.1646464117647058e-05, "loss": 0.76, "step": 7 }, { "epoch": 0.07186973610331275, "grad_norm": 3.672264516349975, "learning_rate": 1.3310244705882352e-05, "loss": 0.7142, "step": 8 }, { "epoch": 0.08085345311622684, "grad_norm": 3.4788052125571918, "learning_rate": 1.4974025294117648e-05, "loss": 0.7167, "step": 9 }, { "epoch": 0.08983717012914093, "grad_norm": 2.107646195472596, "learning_rate": 1.6637805882352943e-05, "loss": 0.6916, "step": 10 }, { "epoch": 0.09882088714205503, "grad_norm": 2.3107123413459094, "learning_rate": 1.8301586470588235e-05, "loss": 0.6619, "step": 11 }, { "epoch": 0.10780460415496912, "grad_norm": 1.97063494813433, "learning_rate": 1.996536705882353e-05, "loss": 0.651, "step": 12 }, { "epoch": 0.11678832116788321, "grad_norm": 1.172538415212184, "learning_rate": 2.1629147647058824e-05, "loss": 0.6292, "step": 13 }, { "epoch": 0.1257720381807973, "grad_norm": 1.425434087428503, "learning_rate": 2.3292928235294116e-05, "loss": 0.624, "step": 14 }, { "epoch": 0.13475575519371139, "grad_norm": 1.0298093152751933, "learning_rate": 2.4956708823529412e-05, "loss": 0.6177, "step": 15 }, { "epoch": 0.1437394722066255, "grad_norm": 0.9184264462076409, "learning_rate": 2.6620489411764705e-05, "loss": 0.6094, "step": 16 }, { "epoch": 0.1527231892195396, "grad_norm": 0.9876447317203728, "learning_rate": 2.828427e-05, "loss": 0.5913, "step": 17 }, { "epoch": 0.16170690623245368, "grad_norm": 0.9186614069420004, "learning_rate": 2.9948050588235297e-05, "loss": 0.5853, "step": 18 }, { "epoch": 0.17069062324536777, "grad_norm": 0.9898980015818278, "learning_rate": 3.161183117647059e-05, "loss": 0.5837, "step": 19 }, { "epoch": 0.17967434025828186, "grad_norm": 1.0813008236428436, "learning_rate": 3.3275611764705885e-05, "loss": 0.5826, "step": 20 }, { "epoch": 0.18865805727119594, "grad_norm": 1.1477562316475312, "learning_rate": 3.493939235294118e-05, "loss": 0.5722, "step": 21 }, { "epoch": 0.19764177428411006, "grad_norm": 0.8464148409927464, "learning_rate": 3.660317294117647e-05, "loss": 0.5673, "step": 22 }, { "epoch": 0.20662549129702415, "grad_norm": 0.7702840400962941, "learning_rate": 3.826695352941177e-05, "loss": 0.5588, "step": 23 }, { "epoch": 0.21560920830993824, "grad_norm": 1.1144594493299889, "learning_rate": 3.993073411764706e-05, "loss": 0.5584, "step": 24 }, { "epoch": 0.22459292532285233, "grad_norm": 1.1476160682417311, "learning_rate": 4.1594514705882355e-05, "loss": 0.5569, "step": 25 }, { "epoch": 0.23357664233576642, "grad_norm": 1.1535268292624339, "learning_rate": 4.325829529411765e-05, "loss": 0.5553, "step": 26 }, { "epoch": 0.2425603593486805, "grad_norm": 1.5110809161997512, "learning_rate": 4.492207588235294e-05, "loss": 0.554, "step": 27 }, { "epoch": 0.2515440763615946, "grad_norm": 1.1832182446871022, "learning_rate": 4.658585647058823e-05, "loss": 0.5421, "step": 28 }, { "epoch": 0.2605277933745087, "grad_norm": 1.1859526315720494, "learning_rate": 4.824963705882353e-05, "loss": 0.5413, "step": 29 }, { "epoch": 0.26951151038742277, "grad_norm": 1.5427900149672942, "learning_rate": 4.9913417647058825e-05, "loss": 0.5447, "step": 30 }, { "epoch": 0.2784952274003369, "grad_norm": 1.1712836835670162, "learning_rate": 5.157719823529412e-05, "loss": 0.5346, "step": 31 }, { "epoch": 0.287478944413251, "grad_norm": 1.3657285701127917, "learning_rate": 5.324097882352941e-05, "loss": 0.54, "step": 32 }, { "epoch": 0.29646266142616506, "grad_norm": 1.401113516783269, "learning_rate": 5.490475941176471e-05, "loss": 0.5272, "step": 33 }, { "epoch": 0.3054463784390792, "grad_norm": 1.283015659035394, "learning_rate": 5.656854e-05, "loss": 0.5269, "step": 34 }, { "epoch": 0.31443009545199324, "grad_norm": 1.1625204652087118, "learning_rate": 5.65669787647368e-05, "loss": 0.5254, "step": 35 }, { "epoch": 0.32341381246490736, "grad_norm": 1.9389962167077017, "learning_rate": 5.6562295231301345e-05, "loss": 0.5325, "step": 36 }, { "epoch": 0.3323975294778215, "grad_norm": 1.2095439312643286, "learning_rate": 5.655448991673705e-05, "loss": 0.5247, "step": 37 }, { "epoch": 0.34138124649073553, "grad_norm": 2.1581962123054512, "learning_rate": 5.6543563682719456e-05, "loss": 0.5301, "step": 38 }, { "epoch": 0.35036496350364965, "grad_norm": 1.645168282852934, "learning_rate": 5.652951773546123e-05, "loss": 0.5251, "step": 39 }, { "epoch": 0.3593486805165637, "grad_norm": 1.7810864857463629, "learning_rate": 5.651235362557887e-05, "loss": 0.529, "step": 40 }, { "epoch": 0.36833239752947783, "grad_norm": 1.3721422433398012, "learning_rate": 5.649207324792161e-05, "loss": 0.5263, "step": 41 }, { "epoch": 0.3773161145423919, "grad_norm": 1.7315681950487964, "learning_rate": 5.646867884136221e-05, "loss": 0.5233, "step": 42 }, { "epoch": 0.386299831555306, "grad_norm": 1.2328449836692366, "learning_rate": 5.644217298854978e-05, "loss": 0.5121, "step": 43 }, { "epoch": 0.3952835485682201, "grad_norm": 1.688017129833438, "learning_rate": 5.641255861562469e-05, "loss": 0.5076, "step": 44 }, { "epoch": 0.4042672655811342, "grad_norm": 1.3793575135144216, "learning_rate": 5.637983899189553e-05, "loss": 0.5178, "step": 45 }, { "epoch": 0.4132509825940483, "grad_norm": 1.1818050497678425, "learning_rate": 5.634401772947817e-05, "loss": 0.5099, "step": 46 }, { "epoch": 0.42223469960696236, "grad_norm": 1.334664385713327, "learning_rate": 5.630509878289701e-05, "loss": 0.5125, "step": 47 }, { "epoch": 0.4312184166198765, "grad_norm": 1.1824189144503987, "learning_rate": 5.626308644864844e-05, "loss": 0.5035, "step": 48 }, { "epoch": 0.4402021336327906, "grad_norm": 1.8310997335960193, "learning_rate": 5.621798536472649e-05, "loss": 0.5129, "step": 49 }, { "epoch": 0.44918585064570465, "grad_norm": 0.9498119984179156, "learning_rate": 5.616980051011082e-05, "loss": 0.4933, "step": 50 }, { "epoch": 0.45816956765861877, "grad_norm": 1.3738927247826827, "learning_rate": 5.611853720421709e-05, "loss": 0.5114, "step": 51 }, { "epoch": 0.46715328467153283, "grad_norm": 1.3239245607536279, "learning_rate": 5.606420110630966e-05, "loss": 0.5051, "step": 52 }, { "epoch": 0.47613700168444695, "grad_norm": 1.3491292246681823, "learning_rate": 5.6006798214876905e-05, "loss": 0.4984, "step": 53 }, { "epoch": 0.485120718697361, "grad_norm": 1.556526288137193, "learning_rate": 5.5946334866968935e-05, "loss": 0.5057, "step": 54 }, { "epoch": 0.4941044357102751, "grad_norm": 0.8396609232112651, "learning_rate": 5.5882817737498054e-05, "loss": 0.5067, "step": 55 }, { "epoch": 0.5030881527231892, "grad_norm": 1.3086549421559865, "learning_rate": 5.581625383850187e-05, "loss": 0.5016, "step": 56 }, { "epoch": 0.5120718697361033, "grad_norm": 1.6438518323894828, "learning_rate": 5.5746650518369164e-05, "loss": 0.5105, "step": 57 }, { "epoch": 0.5210555867490174, "grad_norm": 0.9704759451995054, "learning_rate": 5.5674015461028693e-05, "loss": 0.501, "step": 58 }, { "epoch": 0.5300393037619315, "grad_norm": 1.520513804956881, "learning_rate": 5.559835668510092e-05, "loss": 0.5055, "step": 59 }, { "epoch": 0.5390230207748455, "grad_norm": 1.4886814176870304, "learning_rate": 5.5519682543012745e-05, "loss": 0.4997, "step": 60 }, { "epoch": 0.5480067377877597, "grad_norm": 0.9983783244132893, "learning_rate": 5.5438001720075464e-05, "loss": 0.4999, "step": 61 }, { "epoch": 0.5569904548006738, "grad_norm": 1.8027514771648614, "learning_rate": 5.535332323352595e-05, "loss": 0.4973, "step": 62 }, { "epoch": 0.5659741718135879, "grad_norm": 1.5234595633592394, "learning_rate": 5.526565643153117e-05, "loss": 0.5027, "step": 63 }, { "epoch": 0.574957888826502, "grad_norm": 1.5395421556206257, "learning_rate": 5.517501099215618e-05, "loss": 0.4927, "step": 64 }, { "epoch": 0.583941605839416, "grad_norm": 1.2783343984201738, "learning_rate": 5.5081396922295734e-05, "loss": 0.4927, "step": 65 }, { "epoch": 0.5929253228523301, "grad_norm": 1.3855491947405612, "learning_rate": 5.498482455656953e-05, "loss": 0.4922, "step": 66 }, { "epoch": 0.6019090398652442, "grad_norm": 1.1582514991567736, "learning_rate": 5.488530455618133e-05, "loss": 0.4911, "step": 67 }, { "epoch": 0.6108927568781584, "grad_norm": 1.4059224870864222, "learning_rate": 5.4782847907741996e-05, "loss": 0.4947, "step": 68 }, { "epoch": 0.6198764738910725, "grad_norm": 1.1973724677392805, "learning_rate": 5.4677465922056604e-05, "loss": 0.4891, "step": 69 }, { "epoch": 0.6288601909039865, "grad_norm": 1.3369194820930967, "learning_rate": 5.456917023287581e-05, "loss": 0.4877, "step": 70 }, { "epoch": 0.6378439079169006, "grad_norm": 1.2691892558171078, "learning_rate": 5.445797279561149e-05, "loss": 0.4895, "step": 71 }, { "epoch": 0.6468276249298147, "grad_norm": 1.0922464334569515, "learning_rate": 5.434388588601693e-05, "loss": 0.4803, "step": 72 }, { "epoch": 0.6558113419427288, "grad_norm": 0.8753013887933451, "learning_rate": 5.422692209883164e-05, "loss": 0.4914, "step": 73 }, { "epoch": 0.664795058955643, "grad_norm": 0.9796697285671012, "learning_rate": 5.4107094346390925e-05, "loss": 0.4869, "step": 74 }, { "epoch": 0.673778775968557, "grad_norm": 1.2844167323369493, "learning_rate": 5.398441585720041e-05, "loss": 0.4867, "step": 75 }, { "epoch": 0.6827624929814711, "grad_norm": 1.4887584341619542, "learning_rate": 5.3858900174475716e-05, "loss": 0.4991, "step": 76 }, { "epoch": 0.6917462099943852, "grad_norm": 0.8921558529768149, "learning_rate": 5.373056115464729e-05, "loss": 0.4867, "step": 77 }, { "epoch": 0.7007299270072993, "grad_norm": 1.2859329068604128, "learning_rate": 5.359941296583069e-05, "loss": 0.4882, "step": 78 }, { "epoch": 0.7097136440202133, "grad_norm": 0.7544133782391679, "learning_rate": 5.346547008626259e-05, "loss": 0.4804, "step": 79 }, { "epoch": 0.7186973610331274, "grad_norm": 1.2158913130951334, "learning_rate": 5.332874730270231e-05, "loss": 0.4965, "step": 80 }, { "epoch": 0.7276810780460415, "grad_norm": 1.5397019534433478, "learning_rate": 5.3189259708799525e-05, "loss": 0.4863, "step": 81 }, { "epoch": 0.7366647950589557, "grad_norm": 1.0368691373073795, "learning_rate": 5.304702270342788e-05, "loss": 0.4709, "step": 82 }, { "epoch": 0.7456485120718698, "grad_norm": 1.6327070593356623, "learning_rate": 5.290205198898512e-05, "loss": 0.4935, "step": 83 }, { "epoch": 0.7546322290847838, "grad_norm": 0.7996850897364002, "learning_rate": 5.275436356965955e-05, "loss": 0.4833, "step": 84 }, { "epoch": 0.7636159460976979, "grad_norm": 1.3138953152120028, "learning_rate": 5.260397374966324e-05, "loss": 0.4831, "step": 85 }, { "epoch": 0.772599663110612, "grad_norm": 1.0413080501496228, "learning_rate": 5.245089913143211e-05, "loss": 0.4857, "step": 86 }, { "epoch": 0.7815833801235261, "grad_norm": 1.2505485373744896, "learning_rate": 5.229515661379311e-05, "loss": 0.4783, "step": 87 }, { "epoch": 0.7905670971364402, "grad_norm": 0.8250922322384497, "learning_rate": 5.213676339009861e-05, "loss": 0.4888, "step": 88 }, { "epoch": 0.7995508141493542, "grad_norm": 0.930860026309023, "learning_rate": 5.197573694632837e-05, "loss": 0.4722, "step": 89 }, { "epoch": 0.8085345311622684, "grad_norm": 1.1999339483974405, "learning_rate": 5.181209505915914e-05, "loss": 0.4844, "step": 90 }, { "epoch": 0.8175182481751825, "grad_norm": 0.9824068299768655, "learning_rate": 5.164585579400215e-05, "loss": 0.4835, "step": 91 }, { "epoch": 0.8265019651880966, "grad_norm": 1.506768206352777, "learning_rate": 5.1477037503008845e-05, "loss": 0.4795, "step": 92 }, { "epoch": 0.8354856822010107, "grad_norm": 0.7471067724218583, "learning_rate": 5.130565882304478e-05, "loss": 0.4752, "step": 93 }, { "epoch": 0.8444693992139247, "grad_norm": 1.231448677263746, "learning_rate": 5.113173867363228e-05, "loss": 0.4789, "step": 94 }, { "epoch": 0.8534531162268388, "grad_norm": 1.3024759491093874, "learning_rate": 5.095529625486171e-05, "loss": 0.4805, "step": 95 }, { "epoch": 0.862436833239753, "grad_norm": 0.9214289630598883, "learning_rate": 5.0776351045271936e-05, "loss": 0.486, "step": 96 }, { "epoch": 0.8714205502526671, "grad_norm": 1.0404626244101485, "learning_rate": 5.0594922799699925e-05, "loss": 0.4845, "step": 97 }, { "epoch": 0.8804042672655812, "grad_norm": 0.849414300185437, "learning_rate": 5.0411031547099916e-05, "loss": 0.467, "step": 98 }, { "epoch": 0.8893879842784952, "grad_norm": 0.6937685475658341, "learning_rate": 5.0224697588332306e-05, "loss": 0.4808, "step": 99 }, { "epoch": 0.8983717012914093, "grad_norm": 0.9214456229044827, "learning_rate": 5.003594149392247e-05, "loss": 0.4686, "step": 100 }, { "epoch": 0.9073554183043234, "grad_norm": 0.8633442974924467, "learning_rate": 4.984478410178992e-05, "loss": 0.481, "step": 101 }, { "epoch": 0.9163391353172375, "grad_norm": 1.0904707694598144, "learning_rate": 4.965124651494785e-05, "loss": 0.4677, "step": 102 }, { "epoch": 0.9253228523301515, "grad_norm": 0.8481076014465915, "learning_rate": 4.9455350099173434e-05, "loss": 0.4802, "step": 103 }, { "epoch": 0.9343065693430657, "grad_norm": 0.6006636997635961, "learning_rate": 4.925711648064916e-05, "loss": 0.4694, "step": 104 }, { "epoch": 0.9432902863559798, "grad_norm": 0.7238908358412791, "learning_rate": 4.9056567543575374e-05, "loss": 0.4809, "step": 105 }, { "epoch": 0.9522740033688939, "grad_norm": 0.719964931081471, "learning_rate": 4.885372542775435e-05, "loss": 0.4677, "step": 106 }, { "epoch": 0.961257720381808, "grad_norm": 0.6462909305498641, "learning_rate": 4.864861252614612e-05, "loss": 0.47, "step": 107 }, { "epoch": 0.970241437394722, "grad_norm": 0.605236737922692, "learning_rate": 4.844125148239645e-05, "loss": 0.4604, "step": 108 }, { "epoch": 0.9792251544076361, "grad_norm": 0.6196855103029525, "learning_rate": 4.823166518833697e-05, "loss": 0.4686, "step": 109 }, { "epoch": 0.9882088714205502, "grad_norm": 0.5910630181346482, "learning_rate": 4.801987678145811e-05, "loss": 0.4771, "step": 110 }, { "epoch": 0.9971925884334644, "grad_norm": 0.5265183055483387, "learning_rate": 4.7805909642354734e-05, "loss": 0.4701, "step": 111 }, { "epoch": 1.0075800112296462, "grad_norm": 0.7416855722204436, "learning_rate": 4.7589787392145085e-05, "loss": 0.8384, "step": 112 }, { "epoch": 1.0165637282425604, "grad_norm": 0.9235671375653838, "learning_rate": 4.737153388986303e-05, "loss": 0.4427, "step": 113 }, { "epoch": 1.0255474452554745, "grad_norm": 1.2985752147785057, "learning_rate": 4.7151173229824185e-05, "loss": 0.4674, "step": 114 }, { "epoch": 1.0345311622683886, "grad_norm": 0.8686104223850358, "learning_rate": 4.6928729738965966e-05, "loss": 0.4585, "step": 115 }, { "epoch": 1.0435148792813027, "grad_norm": 1.4248620564884518, "learning_rate": 4.6704227974162e-05, "loss": 0.4664, "step": 116 }, { "epoch": 1.0524985962942168, "grad_norm": 0.8090345615734776, "learning_rate": 4.647769271951114e-05, "loss": 0.4526, "step": 117 }, { "epoch": 1.0614823133071307, "grad_norm": 1.8942652592210458, "learning_rate": 4.624914898360141e-05, "loss": 0.4616, "step": 118 }, { "epoch": 1.0704660303200448, "grad_norm": 1.176745396077979, "learning_rate": 4.601862199674913e-05, "loss": 0.4517, "step": 119 }, { "epoch": 1.079449747332959, "grad_norm": 1.9084152203109792, "learning_rate": 4.5786137208213634e-05, "loss": 0.4512, "step": 120 }, { "epoch": 1.088433464345873, "grad_norm": 1.837711635191509, "learning_rate": 4.555172028338775e-05, "loss": 0.4642, "step": 121 }, { "epoch": 1.0974171813587872, "grad_norm": 0.8682381358770546, "learning_rate": 4.531539710096439e-05, "loss": 0.4458, "step": 122 }, { "epoch": 1.1064008983717013, "grad_norm": 1.2949713196196633, "learning_rate": 4.507719375007978e-05, "loss": 0.4659, "step": 123 }, { "epoch": 1.1153846153846154, "grad_norm": 0.6820031143942866, "learning_rate": 4.483713652743316e-05, "loss": 0.4422, "step": 124 }, { "epoch": 1.1243683323975295, "grad_norm": 0.9015249547203725, "learning_rate": 4.459525193438388e-05, "loss": 0.4674, "step": 125 }, { "epoch": 1.1333520494104437, "grad_norm": 0.7596159903360272, "learning_rate": 4.4351566674025625e-05, "loss": 0.4397, "step": 126 }, { "epoch": 1.1423357664233578, "grad_norm": 0.8048846249914353, "learning_rate": 4.410610764823863e-05, "loss": 0.4537, "step": 127 }, { "epoch": 1.1513194834362717, "grad_norm": 0.5674553882616571, "learning_rate": 4.3858901954719706e-05, "loss": 0.4431, "step": 128 }, { "epoch": 1.1603032004491858, "grad_norm": 0.7662516848679161, "learning_rate": 4.3609976883990836e-05, "loss": 0.4564, "step": 129 }, { "epoch": 1.1692869174621, "grad_norm": 0.5438455650436745, "learning_rate": 4.335935991638637e-05, "loss": 0.4461, "step": 130 }, { "epoch": 1.178270634475014, "grad_norm": 0.659318129425458, "learning_rate": 4.310707871901933e-05, "loss": 0.4489, "step": 131 }, { "epoch": 1.1872543514879281, "grad_norm": 0.603233928019015, "learning_rate": 4.2853161142727056e-05, "loss": 0.4463, "step": 132 }, { "epoch": 1.1962380685008422, "grad_norm": 0.5437863962201774, "learning_rate": 4.25976352189966e-05, "loss": 0.4476, "step": 133 }, { "epoch": 1.2052217855137564, "grad_norm": 0.44198003824815413, "learning_rate": 4.234052915687014e-05, "loss": 0.4417, "step": 134 }, { "epoch": 1.2142055025266705, "grad_norm": 0.4940269180275878, "learning_rate": 4.208187133983084e-05, "loss": 0.4437, "step": 135 }, { "epoch": 1.2231892195395846, "grad_norm": 0.4676186442195625, "learning_rate": 4.18216903226694e-05, "loss": 0.4429, "step": 136 }, { "epoch": 1.2321729365524985, "grad_norm": 0.557497711331083, "learning_rate": 4.156001482833174e-05, "loss": 0.4464, "step": 137 }, { "epoch": 1.2411566535654126, "grad_norm": 0.47586024576684477, "learning_rate": 4.1296873744748095e-05, "loss": 0.4426, "step": 138 }, { "epoch": 1.2501403705783267, "grad_norm": 0.32603863165308805, "learning_rate": 4.103229612164391e-05, "loss": 0.4416, "step": 139 }, { "epoch": 1.2591240875912408, "grad_norm": 0.4351165089036505, "learning_rate": 4.076631116733286e-05, "loss": 0.4401, "step": 140 }, { "epoch": 1.268107804604155, "grad_norm": 0.4958952788514796, "learning_rate": 4.0498948245492365e-05, "loss": 0.4452, "step": 141 }, { "epoch": 1.277091521617069, "grad_norm": 0.5509848964383834, "learning_rate": 4.023023687192194e-05, "loss": 0.4411, "step": 142 }, { "epoch": 1.2860752386299832, "grad_norm": 0.3808831227241545, "learning_rate": 3.996020671128483e-05, "loss": 0.4443, "step": 143 }, { "epoch": 1.2950589556428973, "grad_norm": 0.36241744805641257, "learning_rate": 3.9688887573833065e-05, "loss": 0.45, "step": 144 }, { "epoch": 1.3040426726558114, "grad_norm": 0.36700641622974883, "learning_rate": 3.941630941211662e-05, "loss": 0.4307, "step": 145 }, { "epoch": 1.3130263896687255, "grad_norm": 0.3136123844114524, "learning_rate": 3.914250231767668e-05, "loss": 0.4428, "step": 146 }, { "epoch": 1.3220101066816397, "grad_norm": 0.30609996444955184, "learning_rate": 3.886749651772372e-05, "loss": 0.4433, "step": 147 }, { "epoch": 1.3309938236945535, "grad_norm": 0.35148333104030377, "learning_rate": 3.8591322371800516e-05, "loss": 0.448, "step": 148 }, { "epoch": 1.3399775407074677, "grad_norm": 0.3406382310678225, "learning_rate": 3.831401036843058e-05, "loss": 0.4356, "step": 149 }, { "epoch": 1.3489612577203818, "grad_norm": 0.32870885630433616, "learning_rate": 3.8035591121752334e-05, "loss": 0.4433, "step": 150 }, { "epoch": 1.357944974733296, "grad_norm": 0.40610208793950014, "learning_rate": 3.7756095368139454e-05, "loss": 0.4419, "step": 151 }, { "epoch": 1.36692869174621, "grad_norm": 0.3410029435891093, "learning_rate": 3.747555396280769e-05, "loss": 0.4284, "step": 152 }, { "epoch": 1.3759124087591241, "grad_norm": 0.30509070420240164, "learning_rate": 3.719399787640854e-05, "loss": 0.4346, "step": 153 }, { "epoch": 1.3848961257720382, "grad_norm": 0.30952788069600484, "learning_rate": 3.691145819161026e-05, "loss": 0.4366, "step": 154 }, { "epoch": 1.3938798427849521, "grad_norm": 0.28009014545926486, "learning_rate": 3.6627966099666466e-05, "loss": 0.4322, "step": 155 }, { "epoch": 1.4028635597978663, "grad_norm": 0.255197994228949, "learning_rate": 3.6343552896972686e-05, "loss": 0.4297, "step": 156 }, { "epoch": 1.4118472768107804, "grad_norm": 0.26775780849454006, "learning_rate": 3.605824998161141e-05, "loss": 0.4423, "step": 157 }, { "epoch": 1.4208309938236945, "grad_norm": 0.2748112146974553, "learning_rate": 3.5772088849885886e-05, "loss": 0.4316, "step": 158 }, { "epoch": 1.4298147108366086, "grad_norm": 0.272040591675239, "learning_rate": 3.548510109284296e-05, "loss": 0.4464, "step": 159 }, { "epoch": 1.4387984278495227, "grad_norm": 0.23246881072840514, "learning_rate": 3.519731839278567e-05, "loss": 0.4409, "step": 160 }, { "epoch": 1.4477821448624368, "grad_norm": 0.24504527703462542, "learning_rate": 3.4908772519775565e-05, "loss": 0.4419, "step": 161 }, { "epoch": 1.456765861875351, "grad_norm": 0.24765234500656855, "learning_rate": 3.461949532812546e-05, "loss": 0.4478, "step": 162 }, { "epoch": 1.465749578888265, "grad_norm": 0.327477057941263, "learning_rate": 3.43295187528828e-05, "loss": 0.4411, "step": 163 }, { "epoch": 1.4747332959011792, "grad_norm": 0.27408220536723477, "learning_rate": 3.403887480630422e-05, "loss": 0.4372, "step": 164 }, { "epoch": 1.4837170129140933, "grad_norm": 0.2479972152670547, "learning_rate": 3.374759557432146e-05, "loss": 0.44, "step": 165 }, { "epoch": 1.4927007299270074, "grad_norm": 0.2576799538776205, "learning_rate": 3.345571321299926e-05, "loss": 0.4345, "step": 166 }, { "epoch": 1.5016844469399215, "grad_norm": 0.2949577781981658, "learning_rate": 3.316325994498539e-05, "loss": 0.4426, "step": 167 }, { "epoch": 1.5106681639528357, "grad_norm": 0.25387337797793086, "learning_rate": 3.28702680559535e-05, "loss": 0.4414, "step": 168 }, { "epoch": 1.5196518809657495, "grad_norm": 0.29898671970481416, "learning_rate": 3.2576769891038794e-05, "loss": 0.4295, "step": 169 }, { "epoch": 1.5286355979786637, "grad_norm": 0.22913788989001677, "learning_rate": 3.2282797851267353e-05, "loss": 0.4402, "step": 170 }, { "epoch": 1.5376193149915778, "grad_norm": 0.3095695481139338, "learning_rate": 3.198838438997912e-05, "loss": 0.4367, "step": 171 }, { "epoch": 1.546603032004492, "grad_norm": 0.2942100195557735, "learning_rate": 3.169356200924522e-05, "loss": 0.441, "step": 172 }, { "epoch": 1.5555867490174058, "grad_norm": 0.2009760877564207, "learning_rate": 3.1398363256279894e-05, "loss": 0.4361, "step": 173 }, { "epoch": 1.56457046603032, "grad_norm": 0.22271313757898464, "learning_rate": 3.110282071984731e-05, "loss": 0.429, "step": 174 }, { "epoch": 1.573554183043234, "grad_norm": 0.23273324004009835, "learning_rate": 3.080696702666401e-05, "loss": 0.444, "step": 175 }, { "epoch": 1.5825379000561481, "grad_norm": 0.23105576612308468, "learning_rate": 3.051083483779696e-05, "loss": 0.4415, "step": 176 }, { "epoch": 1.5915216170690623, "grad_norm": 0.2707194719098357, "learning_rate": 3.0214456845057964e-05, "loss": 0.4337, "step": 177 }, { "epoch": 1.6005053340819764, "grad_norm": 0.19401437496621246, "learning_rate": 2.9917865767394592e-05, "loss": 0.4293, "step": 178 }, { "epoch": 1.6094890510948905, "grad_norm": 0.23447424010049614, "learning_rate": 2.9621094347278115e-05, "loss": 0.4381, "step": 179 }, { "epoch": 1.6184727681078046, "grad_norm": 0.24721825324120247, "learning_rate": 2.9324175347088936e-05, "loss": 0.4347, "step": 180 }, { "epoch": 1.6274564851207187, "grad_norm": 0.18188660812700427, "learning_rate": 2.9027141545499668e-05, "loss": 0.4393, "step": 181 }, { "epoch": 1.6364402021336328, "grad_norm": 0.190718000775263, "learning_rate": 2.873002573385654e-05, "loss": 0.4302, "step": 182 }, { "epoch": 1.645423919146547, "grad_norm": 0.20646822886164018, "learning_rate": 2.84328607125594e-05, "loss": 0.4265, "step": 183 }, { "epoch": 1.654407636159461, "grad_norm": 0.18501972197639874, "learning_rate": 2.813567928744061e-05, "loss": 0.4264, "step": 184 }, { "epoch": 1.6633913531723752, "grad_norm": 0.2131469646078243, "learning_rate": 2.7838514266143464e-05, "loss": 0.4353, "step": 185 }, { "epoch": 1.6723750701852893, "grad_norm": 0.20765635355407858, "learning_rate": 2.754139845450034e-05, "loss": 0.4358, "step": 186 }, { "epoch": 1.6813587871982034, "grad_norm": 0.21408480102506994, "learning_rate": 2.7244364652911073e-05, "loss": 0.436, "step": 187 }, { "epoch": 1.6903425042111173, "grad_norm": 0.20394614955447768, "learning_rate": 2.6947445652721887e-05, "loss": 0.4317, "step": 188 }, { "epoch": 1.6993262212240314, "grad_norm": 0.2462732253149224, "learning_rate": 2.6650674232605416e-05, "loss": 0.4381, "step": 189 }, { "epoch": 1.7083099382369455, "grad_norm": 0.19402664296982586, "learning_rate": 2.635408315494204e-05, "loss": 0.4289, "step": 190 }, { "epoch": 1.7172936552498597, "grad_norm": 0.25323884789633694, "learning_rate": 2.6057705162203045e-05, "loss": 0.4387, "step": 191 }, { "epoch": 1.7262773722627736, "grad_norm": 0.17952016249190222, "learning_rate": 2.5761572973335996e-05, "loss": 0.4392, "step": 192 }, { "epoch": 1.7352610892756877, "grad_norm": 0.21550956031191665, "learning_rate": 2.5465719280152693e-05, "loss": 0.4385, "step": 193 }, { "epoch": 1.7442448062886018, "grad_norm": 0.2260946384930941, "learning_rate": 2.5170176743720114e-05, "loss": 0.4297, "step": 194 }, { "epoch": 1.753228523301516, "grad_norm": 0.189354294580711, "learning_rate": 2.487497799075478e-05, "loss": 0.4402, "step": 195 }, { "epoch": 1.76221224031443, "grad_norm": 0.18347107243203223, "learning_rate": 2.4580155610020893e-05, "loss": 0.4436, "step": 196 }, { "epoch": 1.7711959573273441, "grad_norm": 0.23718361693873785, "learning_rate": 2.4285742148732662e-05, "loss": 0.431, "step": 197 }, { "epoch": 1.7801796743402583, "grad_norm": 0.15875452261920012, "learning_rate": 2.399177010896121e-05, "loss": 0.4227, "step": 198 }, { "epoch": 1.7891633913531724, "grad_norm": 0.20090739499698712, "learning_rate": 2.3698271944046514e-05, "loss": 0.4287, "step": 199 }, { "epoch": 1.7981471083660865, "grad_norm": 0.15078862855529346, "learning_rate": 2.3405280055014613e-05, "loss": 0.4293, "step": 200 }, { "epoch": 1.8071308253790006, "grad_norm": 0.16864941614937132, "learning_rate": 2.3112826787000755e-05, "loss": 0.433, "step": 201 }, { "epoch": 1.8161145423919147, "grad_norm": 0.1640970390638266, "learning_rate": 2.2820944425678543e-05, "loss": 0.4465, "step": 202 }, { "epoch": 1.8250982594048288, "grad_norm": 0.181930840073729, "learning_rate": 2.2529665193695787e-05, "loss": 0.4311, "step": 203 }, { "epoch": 1.834081976417743, "grad_norm": 0.1618511339670659, "learning_rate": 2.2239021247117203e-05, "loss": 0.425, "step": 204 }, { "epoch": 1.843065693430657, "grad_norm": 0.1608105810202212, "learning_rate": 2.1949044671874553e-05, "loss": 0.4372, "step": 205 }, { "epoch": 1.8520494104435712, "grad_norm": 0.17614884968053077, "learning_rate": 2.165976748022444e-05, "loss": 0.4242, "step": 206 }, { "epoch": 1.861033127456485, "grad_norm": 0.18688103883770055, "learning_rate": 2.1371221607214342e-05, "loss": 0.4329, "step": 207 }, { "epoch": 1.8700168444693992, "grad_norm": 0.1888649411522682, "learning_rate": 2.108343890715705e-05, "loss": 0.4421, "step": 208 }, { "epoch": 1.8790005614823133, "grad_norm": 0.19568546233660988, "learning_rate": 2.0796451150114122e-05, "loss": 0.4292, "step": 209 }, { "epoch": 1.8879842784952274, "grad_norm": 0.20130506786491167, "learning_rate": 2.0510290018388582e-05, "loss": 0.4305, "step": 210 }, { "epoch": 1.8969679955081415, "grad_norm": 0.16622134218898874, "learning_rate": 2.0224987103027312e-05, "loss": 0.4304, "step": 211 }, { "epoch": 1.9059517125210554, "grad_norm": 0.1714373409659067, "learning_rate": 1.9940573900333532e-05, "loss": 0.4388, "step": 212 }, { "epoch": 1.9149354295339696, "grad_norm": 0.16165865567699295, "learning_rate": 1.9657081808389732e-05, "loss": 0.4199, "step": 213 }, { "epoch": 1.9239191465468837, "grad_norm": 0.1588217221931874, "learning_rate": 1.9374542123591462e-05, "loss": 0.4391, "step": 214 }, { "epoch": 1.9329028635597978, "grad_norm": 0.16789101350762428, "learning_rate": 1.9092986037192315e-05, "loss": 0.429, "step": 215 }, { "epoch": 1.941886580572712, "grad_norm": 0.14933213528792, "learning_rate": 1.881244463186054e-05, "loss": 0.4327, "step": 216 }, { "epoch": 1.950870297585626, "grad_norm": 0.15634986800174527, "learning_rate": 1.8532948878247664e-05, "loss": 0.4222, "step": 217 }, { "epoch": 1.9598540145985401, "grad_norm": 0.16965600317101812, "learning_rate": 1.825452963156942e-05, "loss": 0.4422, "step": 218 }, { "epoch": 1.9688377316114543, "grad_norm": 0.13903361538978226, "learning_rate": 1.7977217628199486e-05, "loss": 0.4324, "step": 219 }, { "epoch": 1.9778214486243684, "grad_norm": 0.18809988345643205, "learning_rate": 1.770104348227628e-05, "loss": 0.4284, "step": 220 }, { "epoch": 1.9868051656372825, "grad_norm": 0.16274715282926117, "learning_rate": 1.742603768232333e-05, "loss": 0.417, "step": 221 }, { "epoch": 1.9957888826501966, "grad_norm": 0.18587407946726342, "learning_rate": 1.7152230587883387e-05, "loss": 0.4419, "step": 222 }, { "epoch": 2.0061763054463784, "grad_norm": 0.35271858704421305, "learning_rate": 1.6879652426166937e-05, "loss": 0.7668, "step": 223 }, { "epoch": 2.0151600224592925, "grad_norm": 0.18863183688272459, "learning_rate": 1.660833328871518e-05, "loss": 0.3965, "step": 224 }, { "epoch": 2.0241437394722066, "grad_norm": 0.2508766464598683, "learning_rate": 1.6338303128078067e-05, "loss": 0.4199, "step": 225 }, { "epoch": 2.0331274564851207, "grad_norm": 0.1846517351611734, "learning_rate": 1.6069591754507644e-05, "loss": 0.4059, "step": 226 }, { "epoch": 2.042111173498035, "grad_norm": 0.20660957505878264, "learning_rate": 1.5802228832667142e-05, "loss": 0.4115, "step": 227 }, { "epoch": 2.051094890510949, "grad_norm": 0.20768143407603465, "learning_rate": 1.553624387835609e-05, "loss": 0.4033, "step": 228 }, { "epoch": 2.060078607523863, "grad_norm": 0.23623799408278454, "learning_rate": 1.5271666255251907e-05, "loss": 0.419, "step": 229 }, { "epoch": 2.069062324536777, "grad_norm": 0.19328378884342695, "learning_rate": 1.5008525171668266e-05, "loss": 0.4045, "step": 230 }, { "epoch": 2.0780460415496913, "grad_norm": 0.20354843925554086, "learning_rate": 1.474684967733061e-05, "loss": 0.4063, "step": 231 }, { "epoch": 2.0870297585626054, "grad_norm": 0.1962670346347417, "learning_rate": 1.4486668660169169e-05, "loss": 0.4098, "step": 232 }, { "epoch": 2.0960134755755195, "grad_norm": 0.1697063402554285, "learning_rate": 1.4228010843129864e-05, "loss": 0.3947, "step": 233 }, { "epoch": 2.1049971925884337, "grad_norm": 0.21767004866396167, "learning_rate": 1.3970904781003401e-05, "loss": 0.4114, "step": 234 }, { "epoch": 2.1139809096013478, "grad_norm": 0.16628750228694827, "learning_rate": 1.3715378857272944e-05, "loss": 0.4089, "step": 235 }, { "epoch": 2.1229646266142614, "grad_norm": 0.17079039606673918, "learning_rate": 1.3461461280980681e-05, "loss": 0.4047, "step": 236 }, { "epoch": 2.1319483436271756, "grad_norm": 0.1984666308638168, "learning_rate": 1.3209180083613638e-05, "loss": 0.4126, "step": 237 }, { "epoch": 2.1409320606400897, "grad_norm": 0.15844397108776676, "learning_rate": 1.2958563116009172e-05, "loss": 0.401, "step": 238 }, { "epoch": 2.149915777653004, "grad_norm": 0.1682087305867551, "learning_rate": 1.27096380452803e-05, "loss": 0.404, "step": 239 }, { "epoch": 2.158899494665918, "grad_norm": 0.15876022024115735, "learning_rate": 1.2462432351761374e-05, "loss": 0.406, "step": 240 }, { "epoch": 2.167883211678832, "grad_norm": 0.13010884230934192, "learning_rate": 1.2216973325974375e-05, "loss": 0.3955, "step": 241 }, { "epoch": 2.176866928691746, "grad_norm": 0.15194484720161236, "learning_rate": 1.1973288065616129e-05, "loss": 0.4204, "step": 242 }, { "epoch": 2.1858506457046603, "grad_norm": 0.146927657510162, "learning_rate": 1.1731403472566841e-05, "loss": 0.4259, "step": 243 }, { "epoch": 2.1948343627175744, "grad_norm": 0.1357980790994208, "learning_rate": 1.1491346249920226e-05, "loss": 0.3998, "step": 244 }, { "epoch": 2.2038180797304885, "grad_norm": 0.14520203880695473, "learning_rate": 1.1253142899035609e-05, "loss": 0.4038, "step": 245 }, { "epoch": 2.2128017967434026, "grad_norm": 0.12180527212805993, "learning_rate": 1.101681971661226e-05, "loss": 0.3995, "step": 246 }, { "epoch": 2.2217855137563167, "grad_norm": 0.16678443391851008, "learning_rate": 1.0782402791786366e-05, "loss": 0.4093, "step": 247 }, { "epoch": 2.230769230769231, "grad_norm": 0.14169624933871403, "learning_rate": 1.054991800325088e-05, "loss": 0.3945, "step": 248 }, { "epoch": 2.239752947782145, "grad_norm": 0.12385040853111917, "learning_rate": 1.0319391016398607e-05, "loss": 0.4045, "step": 249 }, { "epoch": 2.248736664795059, "grad_norm": 0.13012596569465296, "learning_rate": 1.009084728048887e-05, "loss": 0.4037, "step": 250 }, { "epoch": 2.257720381807973, "grad_norm": 0.12678805353118078, "learning_rate": 9.864312025838009e-06, "loss": 0.4085, "step": 251 }, { "epoch": 2.2667040988208873, "grad_norm": 0.13122317737854305, "learning_rate": 9.63981026103404e-06, "loss": 0.415, "step": 252 }, { "epoch": 2.2756878158338014, "grad_norm": 0.1228277598641854, "learning_rate": 9.417366770175821e-06, "loss": 0.4017, "step": 253 }, { "epoch": 2.2846715328467155, "grad_norm": 0.1285711649465565, "learning_rate": 9.197006110136977e-06, "loss": 0.422, "step": 254 }, { "epoch": 2.293655249859629, "grad_norm": 0.13481029664375577, "learning_rate": 8.978752607854924e-06, "loss": 0.4046, "step": 255 }, { "epoch": 2.3026389668725433, "grad_norm": 0.11543534305483437, "learning_rate": 8.762630357645268e-06, "loss": 0.414, "step": 256 }, { "epoch": 2.3116226838854574, "grad_norm": 0.13900679103109587, "learning_rate": 8.548663218541897e-06, "loss": 0.4125, "step": 257 }, { "epoch": 2.3206064008983716, "grad_norm": 0.1259428788035887, "learning_rate": 8.336874811663032e-06, "loss": 0.3983, "step": 258 }, { "epoch": 2.3295901179112857, "grad_norm": 0.11085457797406689, "learning_rate": 8.127288517603557e-06, "loss": 0.4193, "step": 259 }, { "epoch": 2.3385738349242, "grad_norm": 0.11622214274255102, "learning_rate": 7.919927473853877e-06, "loss": 0.3937, "step": 260 }, { "epoch": 2.347557551937114, "grad_norm": 0.12462100200017505, "learning_rate": 7.714814572245652e-06, "loss": 0.4155, "step": 261 }, { "epoch": 2.356541268950028, "grad_norm": 0.12614101513377815, "learning_rate": 7.511972456424624e-06, "loss": 0.4121, "step": 262 }, { "epoch": 2.365524985962942, "grad_norm": 0.12461680064110435, "learning_rate": 7.31142351935084e-06, "loss": 0.4068, "step": 263 }, { "epoch": 2.3745087029758563, "grad_norm": 0.11950703045498604, "learning_rate": 7.113189900826568e-06, "loss": 0.4117, "step": 264 }, { "epoch": 2.3834924199887704, "grad_norm": 0.13056991708575583, "learning_rate": 6.917293485052153e-06, "loss": 0.4128, "step": 265 }, { "epoch": 2.3924761370016845, "grad_norm": 0.13745171444337193, "learning_rate": 6.723755898210081e-06, "loss": 0.4151, "step": 266 }, { "epoch": 2.4014598540145986, "grad_norm": 0.13033817774915013, "learning_rate": 6.53259850607753e-06, "loss": 0.4016, "step": 267 }, { "epoch": 2.4104435710275127, "grad_norm": 0.10688098668609738, "learning_rate": 6.343842411667697e-06, "loss": 0.4134, "step": 268 }, { "epoch": 2.419427288040427, "grad_norm": 0.1287888983585153, "learning_rate": 6.157508452900079e-06, "loss": 0.4174, "step": 269 }, { "epoch": 2.428411005053341, "grad_norm": 0.1248679388883586, "learning_rate": 5.973617200300082e-06, "loss": 0.3956, "step": 270 }, { "epoch": 2.437394722066255, "grad_norm": 0.11070702147294058, "learning_rate": 5.792188954728074e-06, "loss": 0.3993, "step": 271 }, { "epoch": 2.446378439079169, "grad_norm": 0.12330276650877879, "learning_rate": 5.6132437451382956e-06, "loss": 0.4085, "step": 272 }, { "epoch": 2.4553621560920833, "grad_norm": 0.1119180851449193, "learning_rate": 5.436801326367725e-06, "loss": 0.3998, "step": 273 }, { "epoch": 2.464345873104997, "grad_norm": 0.1328994320544872, "learning_rate": 5.262881176955216e-06, "loss": 0.419, "step": 274 }, { "epoch": 2.473329590117911, "grad_norm": 0.11619373835235447, "learning_rate": 5.09150249699116e-06, "loss": 0.4097, "step": 275 }, { "epoch": 2.482313307130825, "grad_norm": 0.11879193778970075, "learning_rate": 4.92268420599785e-06, "loss": 0.4154, "step": 276 }, { "epoch": 2.4912970241437393, "grad_norm": 0.11150882823633602, "learning_rate": 4.756444940840868e-06, "loss": 0.3938, "step": 277 }, { "epoch": 2.5002807411566534, "grad_norm": 0.10688470139111161, "learning_rate": 4.5928030536716305e-06, "loss": 0.3951, "step": 278 }, { "epoch": 2.5092644581695676, "grad_norm": 0.10966725943546395, "learning_rate": 4.431776609901392e-06, "loss": 0.4067, "step": 279 }, { "epoch": 2.5182481751824817, "grad_norm": 0.10615758090408767, "learning_rate": 4.273383386206893e-06, "loss": 0.4171, "step": 280 }, { "epoch": 2.527231892195396, "grad_norm": 0.10796581041782187, "learning_rate": 4.11764086856789e-06, "loss": 0.3986, "step": 281 }, { "epoch": 2.53621560920831, "grad_norm": 0.10704980127868732, "learning_rate": 3.964566250336768e-06, "loss": 0.4025, "step": 282 }, { "epoch": 2.545199326221224, "grad_norm": 0.11367865379296356, "learning_rate": 3.814176430340453e-06, "loss": 0.4189, "step": 283 }, { "epoch": 2.554183043234138, "grad_norm": 0.10114216928540735, "learning_rate": 3.6664880110148826e-06, "loss": 0.3926, "step": 284 }, { "epoch": 2.5631667602470523, "grad_norm": 0.11361742890474415, "learning_rate": 3.5215172965721247e-06, "loss": 0.4245, "step": 285 }, { "epoch": 2.5721504772599664, "grad_norm": 0.1084518684866817, "learning_rate": 3.3792802912004827e-06, "loss": 0.4107, "step": 286 }, { "epoch": 2.5811341942728805, "grad_norm": 0.10549395673412279, "learning_rate": 3.2397926972976876e-06, "loss": 0.4107, "step": 287 }, { "epoch": 2.5901179112857946, "grad_norm": 0.09969645074776698, "learning_rate": 3.1030699137374146e-06, "loss": 0.3994, "step": 288 }, { "epoch": 2.5991016282987087, "grad_norm": 0.1056666118347554, "learning_rate": 2.969127034169312e-06, "loss": 0.3998, "step": 289 }, { "epoch": 2.608085345311623, "grad_norm": 0.1035766308650711, "learning_rate": 2.837978845352723e-06, "loss": 0.417, "step": 290 }, { "epoch": 2.6170690623245365, "grad_norm": 0.10302107528120776, "learning_rate": 2.709639825524283e-06, "loss": 0.4109, "step": 291 }, { "epoch": 2.626052779337451, "grad_norm": 0.10759853807512873, "learning_rate": 2.5841241427995886e-06, "loss": 0.3955, "step": 292 }, { "epoch": 2.6350364963503647, "grad_norm": 0.10880577647272674, "learning_rate": 2.461445653609079e-06, "loss": 0.4147, "step": 293 }, { "epoch": 2.6440202133632793, "grad_norm": 0.09689142380499235, "learning_rate": 2.3416179011683658e-06, "loss": 0.4106, "step": 294 }, { "epoch": 2.653003930376193, "grad_norm": 0.10685593466563702, "learning_rate": 2.2246541139830715e-06, "loss": 0.4014, "step": 295 }, { "epoch": 2.661987647389107, "grad_norm": 0.0983605426363825, "learning_rate": 2.1105672043885143e-06, "loss": 0.4116, "step": 296 }, { "epoch": 2.670971364402021, "grad_norm": 0.09655008561783156, "learning_rate": 1.9993697671241945e-06, "loss": 0.4081, "step": 297 }, { "epoch": 2.6799550814149353, "grad_norm": 0.11364035199790146, "learning_rate": 1.8910740779434006e-06, "loss": 0.4076, "step": 298 }, { "epoch": 2.6889387984278494, "grad_norm": 0.10418804191385413, "learning_rate": 1.7856920922580113e-06, "loss": 0.4011, "step": 299 }, { "epoch": 2.6979225154407636, "grad_norm": 0.09941508460951572, "learning_rate": 1.683235443818673e-06, "loss": 0.3984, "step": 300 }, { "epoch": 2.7069062324536777, "grad_norm": 0.09866363300713496, "learning_rate": 1.5837154434304704e-06, "loss": 0.4088, "step": 301 }, { "epoch": 2.715889949466592, "grad_norm": 0.10476593966969794, "learning_rate": 1.4871430777042698e-06, "loss": 0.4158, "step": 302 }, { "epoch": 2.724873666479506, "grad_norm": 0.1003488280191521, "learning_rate": 1.3935290078438229e-06, "loss": 0.4023, "step": 303 }, { "epoch": 2.73385738349242, "grad_norm": 0.09404490189909534, "learning_rate": 1.3028835684688378e-06, "loss": 0.3971, "step": 304 }, { "epoch": 2.742841100505334, "grad_norm": 0.10211715235272904, "learning_rate": 1.2152167664740558e-06, "loss": 0.4075, "step": 305 }, { "epoch": 2.7518248175182483, "grad_norm": 0.1049198084000181, "learning_rate": 1.1305382799245437e-06, "loss": 0.3984, "step": 306 }, { "epoch": 2.7608085345311624, "grad_norm": 0.09652931457838564, "learning_rate": 1.0488574569872655e-06, "loss": 0.4098, "step": 307 }, { "epoch": 2.7697922515440765, "grad_norm": 0.08905719728483828, "learning_rate": 9.70183314899084e-07, "loss": 0.3915, "step": 308 }, { "epoch": 2.7787759685569906, "grad_norm": 0.0938697467560363, "learning_rate": 8.945245389713029e-07, "loss": 0.4248, "step": 309 }, { "epoch": 2.7877596855699043, "grad_norm": 0.1012541455507019, "learning_rate": 8.218894816308398e-07, "loss": 0.3914, "step": 310 }, { "epoch": 2.796743402582819, "grad_norm": 0.09264156371198151, "learning_rate": 7.522861614981325e-07, "loss": 0.4122, "step": 311 }, { "epoch": 2.8057271195957325, "grad_norm": 0.0957623449129902, "learning_rate": 6.85722262501941e-07, "loss": 0.4102, "step": 312 }, { "epoch": 2.814710836608647, "grad_norm": 0.09713743297082324, "learning_rate": 6.222051330310619e-07, "loss": 0.4026, "step": 313 }, { "epoch": 2.8236945536215607, "grad_norm": 0.09186827414249998, "learning_rate": 5.617417851230931e-07, "loss": 0.4096, "step": 314 }, { "epoch": 2.832678270634475, "grad_norm": 0.09713737535786383, "learning_rate": 5.043388936903404e-07, "loss": 0.4032, "step": 315 }, { "epoch": 2.841661987647389, "grad_norm": 0.09342384537711217, "learning_rate": 4.500027957829165e-07, "loss": 0.4071, "step": 316 }, { "epoch": 2.850645704660303, "grad_norm": 0.09314756742095372, "learning_rate": 3.987394898891804e-07, "loss": 0.4137, "step": 317 }, { "epoch": 2.859629421673217, "grad_norm": 0.0887933631925566, "learning_rate": 3.50554635273513e-07, "loss": 0.4081, "step": 318 }, { "epoch": 2.8686131386861313, "grad_norm": 0.09565485915542679, "learning_rate": 3.0545355135156165e-07, "loss": 0.4025, "step": 319 }, { "epoch": 2.8775968556990454, "grad_norm": 0.0950394236491987, "learning_rate": 2.634412171029911e-07, "loss": 0.4118, "step": 320 }, { "epoch": 2.8865805727119596, "grad_norm": 0.08929953352976103, "learning_rate": 2.2452227052183503e-07, "loss": 0.4051, "step": 321 }, { "epoch": 2.8955642897248737, "grad_norm": 0.087031212892392, "learning_rate": 1.8870100810447e-07, "loss": 0.4037, "step": 322 }, { "epoch": 2.904548006737788, "grad_norm": 0.09107399363880533, "learning_rate": 1.559813843753062e-07, "loss": 0.4089, "step": 323 }, { "epoch": 2.913531723750702, "grad_norm": 0.09057801956608469, "learning_rate": 1.2636701145022035e-07, "loss": 0.4016, "step": 324 }, { "epoch": 2.922515440763616, "grad_norm": 0.09636376248734924, "learning_rate": 9.986115863779269e-08, "loss": 0.4139, "step": 325 }, { "epoch": 2.93149915777653, "grad_norm": 0.08743246679042953, "learning_rate": 7.646675207839336e-08, "loss": 0.3973, "step": 326 }, { "epoch": 2.9404828747894443, "grad_norm": 0.08981696851089446, "learning_rate": 5.618637442113561e-08, "loss": 0.3952, "step": 327 }, { "epoch": 2.9494665918023584, "grad_norm": 0.09021046410712788, "learning_rate": 3.9022264538775445e-08, "loss": 0.4095, "step": 328 }, { "epoch": 2.958450308815272, "grad_norm": 0.09125597188841794, "learning_rate": 2.4976317280541322e-08, "loss": 0.404, "step": 329 }, { "epoch": 2.9674340258281866, "grad_norm": 0.09031347282702284, "learning_rate": 1.4050083262957056e-08, "loss": 0.4065, "step": 330 }, { "epoch": 2.9764177428411003, "grad_norm": 0.09548041676594968, "learning_rate": 6.24476869865149e-09, "loss": 0.4085, "step": 331 }, { "epoch": 2.985401459854015, "grad_norm": 0.08990746175705872, "learning_rate": 1.5612352631989697e-09, "loss": 0.4029, "step": 332 }, { "epoch": 2.9943851768669285, "grad_norm": 0.09349234362985716, "learning_rate": 0.0, "loss": 0.407, "step": 333 }, { "epoch": 2.9943851768669285, "step": 333, "total_flos": 4607052149424128.0, "train_loss": 0.4632815238949773, "train_runtime": 115748.1353, "train_samples_per_second": 2.954, "train_steps_per_second": 0.003 } ], "logging_steps": 1.0, "max_steps": 333, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4607052149424128.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }