{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7105680018103643, "eval_steps": 393, "global_step": 1570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045259108395564606, "grad_norm": 0.5928090810775757, "learning_rate": 2e-05, "loss": 1.4929, "step": 1 }, { "epoch": 0.00045259108395564606, "eval_loss": 0.36171355843544006, "eval_runtime": 51.6261, "eval_samples_per_second": 18.034, "eval_steps_per_second": 9.026, "step": 1 }, { "epoch": 0.0009051821679112921, "grad_norm": 0.7159351110458374, "learning_rate": 4e-05, "loss": 1.6326, "step": 2 }, { "epoch": 0.0013577732518669382, "grad_norm": 0.847549557685852, "learning_rate": 6e-05, "loss": 1.6612, "step": 3 }, { "epoch": 0.0018103643358225842, "grad_norm": 0.6347976326942444, "learning_rate": 8e-05, "loss": 1.5476, "step": 4 }, { "epoch": 0.0022629554197782305, "grad_norm": 0.8424527645111084, "learning_rate": 0.0001, "loss": 1.3815, "step": 5 }, { "epoch": 0.0027155465037338763, "grad_norm": 0.8723886013031006, "learning_rate": 0.00012, "loss": 1.3837, "step": 6 }, { "epoch": 0.0031681375876895226, "grad_norm": 0.518265426158905, "learning_rate": 0.00014, "loss": 1.4496, "step": 7 }, { "epoch": 0.0036207286716451684, "grad_norm": 0.6353972554206848, "learning_rate": 0.00016, "loss": 1.5068, "step": 8 }, { "epoch": 0.004073319755600814, "grad_norm": 0.6076619625091553, "learning_rate": 0.00018, "loss": 1.449, "step": 9 }, { "epoch": 0.004525910839556461, "grad_norm": 0.636408805847168, "learning_rate": 0.0002, "loss": 1.3747, "step": 10 }, { "epoch": 0.004978501923512107, "grad_norm": 0.5302889347076416, "learning_rate": 0.00019999979722220036, "loss": 1.2439, "step": 11 }, { "epoch": 0.005431093007467753, "grad_norm": 0.5638456344604492, "learning_rate": 0.00019999918888962372, "loss": 1.2716, "step": 12 }, { "epoch": 0.005883684091423399, "grad_norm": 0.4960058629512787, "learning_rate": 0.00019999817500473724, "loss": 1.0349, "step": 13 }, { "epoch": 0.006336275175379045, "grad_norm": 0.6153571605682373, "learning_rate": 0.0001999967555716528, "loss": 1.0655, "step": 14 }, { "epoch": 0.006788866259334691, "grad_norm": 0.6212618350982666, "learning_rate": 0.00019999493059612698, "loss": 1.1561, "step": 15 }, { "epoch": 0.007241457343290337, "grad_norm": 0.6057634949684143, "learning_rate": 0.00019999270008556108, "loss": 1.0444, "step": 16 }, { "epoch": 0.007694048427245984, "grad_norm": 0.5705118179321289, "learning_rate": 0.00019999006404900104, "loss": 1.1472, "step": 17 }, { "epoch": 0.008146639511201629, "grad_norm": 0.5696781873703003, "learning_rate": 0.00019998702249713748, "loss": 1.1899, "step": 18 }, { "epoch": 0.008599230595157275, "grad_norm": 0.5650632381439209, "learning_rate": 0.00019998357544230558, "loss": 0.8681, "step": 19 }, { "epoch": 0.009051821679112922, "grad_norm": 0.5231237411499023, "learning_rate": 0.00019997972289848503, "loss": 1.0433, "step": 20 }, { "epoch": 0.009504412763068567, "grad_norm": 0.6101372838020325, "learning_rate": 0.00019997546488130006, "loss": 1.0171, "step": 21 }, { "epoch": 0.009957003847024214, "grad_norm": 0.5169489979743958, "learning_rate": 0.00019997080140801932, "loss": 0.8874, "step": 22 }, { "epoch": 0.01040959493097986, "grad_norm": 0.5703766942024231, "learning_rate": 0.00019996573249755572, "loss": 0.9659, "step": 23 }, { "epoch": 0.010862186014935505, "grad_norm": 0.514780580997467, "learning_rate": 0.00019996025817046662, "loss": 1.0149, "step": 24 }, { "epoch": 0.011314777098891152, "grad_norm": 0.5197772979736328, "learning_rate": 0.00019995437844895334, "loss": 0.9625, "step": 25 }, { "epoch": 0.011767368182846799, "grad_norm": 0.5655032396316528, "learning_rate": 0.0001999480933568615, "loss": 1.2514, "step": 26 }, { "epoch": 0.012219959266802444, "grad_norm": 0.5189302563667297, "learning_rate": 0.00019994140291968061, "loss": 1.1052, "step": 27 }, { "epoch": 0.01267255035075809, "grad_norm": 0.522919774055481, "learning_rate": 0.00019993430716454413, "loss": 1.0664, "step": 28 }, { "epoch": 0.013125141434713735, "grad_norm": 0.5491878390312195, "learning_rate": 0.00019992680612022928, "loss": 1.0344, "step": 29 }, { "epoch": 0.013577732518669382, "grad_norm": 0.5318437218666077, "learning_rate": 0.00019991889981715698, "loss": 1.1589, "step": 30 }, { "epoch": 0.014030323602625029, "grad_norm": 0.5532176494598389, "learning_rate": 0.00019991058828739165, "loss": 1.0761, "step": 31 }, { "epoch": 0.014482914686580674, "grad_norm": 0.4940855801105499, "learning_rate": 0.0001999018715646412, "loss": 1.011, "step": 32 }, { "epoch": 0.01493550577053632, "grad_norm": 0.4913943409919739, "learning_rate": 0.00019989274968425677, "loss": 0.9709, "step": 33 }, { "epoch": 0.015388096854491967, "grad_norm": 0.5112089514732361, "learning_rate": 0.00019988322268323268, "loss": 1.0093, "step": 34 }, { "epoch": 0.015840687938447614, "grad_norm": 0.512718915939331, "learning_rate": 0.00019987329060020616, "loss": 0.9619, "step": 35 }, { "epoch": 0.016293279022403257, "grad_norm": 1.3937323093414307, "learning_rate": 0.0001998629534754574, "loss": 0.7394, "step": 36 }, { "epoch": 0.016745870106358904, "grad_norm": 0.49547895789146423, "learning_rate": 0.00019985221135090914, "loss": 0.9585, "step": 37 }, { "epoch": 0.01719846119031455, "grad_norm": 0.41780880093574524, "learning_rate": 0.00019984106427012668, "loss": 0.8373, "step": 38 }, { "epoch": 0.017651052274270197, "grad_norm": 0.5070527791976929, "learning_rate": 0.00019982951227831764, "loss": 0.875, "step": 39 }, { "epoch": 0.018103643358225844, "grad_norm": 0.49340716004371643, "learning_rate": 0.00019981755542233177, "loss": 0.951, "step": 40 }, { "epoch": 0.01855623444218149, "grad_norm": 0.5306276082992554, "learning_rate": 0.00019980519375066073, "loss": 1.0026, "step": 41 }, { "epoch": 0.019008825526137134, "grad_norm": 0.4602161645889282, "learning_rate": 0.00019979242731343804, "loss": 0.998, "step": 42 }, { "epoch": 0.01946141661009278, "grad_norm": 0.45144957304000854, "learning_rate": 0.00019977925616243862, "loss": 0.8961, "step": 43 }, { "epoch": 0.019914007694048427, "grad_norm": 0.5363659262657166, "learning_rate": 0.00019976568035107888, "loss": 1.1133, "step": 44 }, { "epoch": 0.020366598778004074, "grad_norm": 0.5454615354537964, "learning_rate": 0.00019975169993441627, "loss": 0.9765, "step": 45 }, { "epoch": 0.02081918986195972, "grad_norm": 0.4762699007987976, "learning_rate": 0.00019973731496914914, "loss": 0.9735, "step": 46 }, { "epoch": 0.021271780945915364, "grad_norm": 0.5286387205123901, "learning_rate": 0.0001997225255136165, "loss": 1.1714, "step": 47 }, { "epoch": 0.02172437202987101, "grad_norm": 0.5487295389175415, "learning_rate": 0.00019970733162779785, "loss": 1.0149, "step": 48 }, { "epoch": 0.022176963113826657, "grad_norm": 0.5506228804588318, "learning_rate": 0.0001996917333733128, "loss": 0.9927, "step": 49 }, { "epoch": 0.022629554197782304, "grad_norm": 0.4660683274269104, "learning_rate": 0.00019967573081342103, "loss": 1.0786, "step": 50 }, { "epoch": 0.02308214528173795, "grad_norm": 0.45142054557800293, "learning_rate": 0.0001996593240130217, "loss": 0.6478, "step": 51 }, { "epoch": 0.023534736365693597, "grad_norm": 0.5232899785041809, "learning_rate": 0.00019964251303865362, "loss": 0.96, "step": 52 }, { "epoch": 0.02398732744964924, "grad_norm": 0.4651690125465393, "learning_rate": 0.0001996252979584946, "loss": 0.8486, "step": 53 }, { "epoch": 0.024439918533604887, "grad_norm": 0.5819869041442871, "learning_rate": 0.0001996076788423613, "loss": 1.2186, "step": 54 }, { "epoch": 0.024892509617560534, "grad_norm": 0.49748942255973816, "learning_rate": 0.00019958965576170908, "loss": 0.8847, "step": 55 }, { "epoch": 0.02534510070151618, "grad_norm": 0.47792863845825195, "learning_rate": 0.0001995712287896316, "loss": 0.9235, "step": 56 }, { "epoch": 0.025797691785471828, "grad_norm": 0.5491011142730713, "learning_rate": 0.00019955239800086043, "loss": 1.1844, "step": 57 }, { "epoch": 0.02625028286942747, "grad_norm": 0.4292598366737366, "learning_rate": 0.00019953316347176488, "loss": 0.7802, "step": 58 }, { "epoch": 0.026702873953383117, "grad_norm": 0.5144155621528625, "learning_rate": 0.00019951352528035164, "loss": 0.9687, "step": 59 }, { "epoch": 0.027155465037338764, "grad_norm": 0.47235575318336487, "learning_rate": 0.00019949348350626456, "loss": 0.9453, "step": 60 }, { "epoch": 0.02760805612129441, "grad_norm": 0.5346776843070984, "learning_rate": 0.00019947303823078416, "loss": 0.9696, "step": 61 }, { "epoch": 0.028060647205250058, "grad_norm": 0.5405656695365906, "learning_rate": 0.00019945218953682734, "loss": 0.8242, "step": 62 }, { "epoch": 0.028513238289205704, "grad_norm": 0.5093278288841248, "learning_rate": 0.0001994309375089472, "loss": 0.9403, "step": 63 }, { "epoch": 0.028965829373161348, "grad_norm": 0.5229548811912537, "learning_rate": 0.00019940928223333252, "loss": 1.0942, "step": 64 }, { "epoch": 0.029418420457116994, "grad_norm": 0.5226697325706482, "learning_rate": 0.00019938722379780747, "loss": 0.9806, "step": 65 }, { "epoch": 0.02987101154107264, "grad_norm": 0.5586569905281067, "learning_rate": 0.00019936476229183133, "loss": 1.3018, "step": 66 }, { "epoch": 0.030323602625028288, "grad_norm": 0.4146330654621124, "learning_rate": 0.0001993418978064979, "loss": 0.8051, "step": 67 }, { "epoch": 0.030776193708983934, "grad_norm": 0.43404343724250793, "learning_rate": 0.0001993186304345354, "loss": 0.8969, "step": 68 }, { "epoch": 0.031228784792939578, "grad_norm": 0.4898476302623749, "learning_rate": 0.00019929496027030604, "loss": 1.132, "step": 69 }, { "epoch": 0.03168137587689523, "grad_norm": 0.47987234592437744, "learning_rate": 0.0001992708874098054, "loss": 0.9463, "step": 70 }, { "epoch": 0.03213396696085087, "grad_norm": 0.5059100985527039, "learning_rate": 0.0001992464119506624, "loss": 1.1106, "step": 71 }, { "epoch": 0.032586558044806514, "grad_norm": 0.4559473693370819, "learning_rate": 0.00019922153399213853, "loss": 1.0079, "step": 72 }, { "epoch": 0.033039149128762164, "grad_norm": 0.4334002435207367, "learning_rate": 0.00019919625363512786, "loss": 0.7356, "step": 73 }, { "epoch": 0.03349174021271781, "grad_norm": 0.4892705976963043, "learning_rate": 0.0001991705709821562, "loss": 0.9491, "step": 74 }, { "epoch": 0.03394433129667346, "grad_norm": 0.44334766268730164, "learning_rate": 0.00019914448613738106, "loss": 0.8698, "step": 75 }, { "epoch": 0.0343969223806291, "grad_norm": 0.4887705445289612, "learning_rate": 0.00019911799920659093, "loss": 1.0981, "step": 76 }, { "epoch": 0.034849513464584744, "grad_norm": 0.4934709072113037, "learning_rate": 0.00019909111029720505, "loss": 1.1072, "step": 77 }, { "epoch": 0.035302104548540394, "grad_norm": 0.47873038053512573, "learning_rate": 0.00019906381951827293, "loss": 1.1416, "step": 78 }, { "epoch": 0.03575469563249604, "grad_norm": 0.5812323093414307, "learning_rate": 0.00019903612698047383, "loss": 1.034, "step": 79 }, { "epoch": 0.03620728671645169, "grad_norm": 0.4874022603034973, "learning_rate": 0.0001990080327961164, "loss": 1.0374, "step": 80 }, { "epoch": 0.03665987780040733, "grad_norm": 0.4300713837146759, "learning_rate": 0.00019897953707913817, "loss": 0.6254, "step": 81 }, { "epoch": 0.03711246888436298, "grad_norm": 0.44634315371513367, "learning_rate": 0.0001989506399451051, "loss": 0.7691, "step": 82 }, { "epoch": 0.037565059968318625, "grad_norm": 0.5015506148338318, "learning_rate": 0.00019892134151121117, "loss": 1.0913, "step": 83 }, { "epoch": 0.03801765105227427, "grad_norm": 0.4348723292350769, "learning_rate": 0.0001988916418962778, "loss": 0.8272, "step": 84 }, { "epoch": 0.03847024213622992, "grad_norm": 0.5083072781562805, "learning_rate": 0.00019886154122075343, "loss": 0.9879, "step": 85 }, { "epoch": 0.03892283322018556, "grad_norm": 0.5124669075012207, "learning_rate": 0.00019883103960671305, "loss": 1.3012, "step": 86 }, { "epoch": 0.03937542430414121, "grad_norm": 0.5468320846557617, "learning_rate": 0.00019880013717785765, "loss": 1.0486, "step": 87 }, { "epoch": 0.039828015388096855, "grad_norm": 0.4499719738960266, "learning_rate": 0.00019876883405951377, "loss": 0.751, "step": 88 }, { "epoch": 0.0402806064720525, "grad_norm": 0.5176135301589966, "learning_rate": 0.00019873713037863298, "loss": 1.0467, "step": 89 }, { "epoch": 0.04073319755600815, "grad_norm": 0.4073967933654785, "learning_rate": 0.00019870502626379127, "loss": 0.7725, "step": 90 }, { "epoch": 0.04118578863996379, "grad_norm": 0.49511098861694336, "learning_rate": 0.00019867252184518878, "loss": 1.0994, "step": 91 }, { "epoch": 0.04163837972391944, "grad_norm": 0.4873201251029968, "learning_rate": 0.0001986396172546489, "loss": 0.8481, "step": 92 }, { "epoch": 0.042090970807875085, "grad_norm": 0.5509189963340759, "learning_rate": 0.0001986063126256181, "loss": 0.9772, "step": 93 }, { "epoch": 0.04254356189183073, "grad_norm": 0.4365142285823822, "learning_rate": 0.0001985726080931651, "loss": 0.6995, "step": 94 }, { "epoch": 0.04299615297578638, "grad_norm": 0.5211268067359924, "learning_rate": 0.0001985385037939806, "loss": 0.9292, "step": 95 }, { "epoch": 0.04344874405974202, "grad_norm": 0.6072234511375427, "learning_rate": 0.00019850399986637643, "loss": 1.3945, "step": 96 }, { "epoch": 0.04390133514369767, "grad_norm": 0.433661550283432, "learning_rate": 0.00019846909645028523, "loss": 0.885, "step": 97 }, { "epoch": 0.044353926227653315, "grad_norm": 0.48475149273872375, "learning_rate": 0.00019843379368725977, "loss": 0.9542, "step": 98 }, { "epoch": 0.04480651731160896, "grad_norm": 0.5443070530891418, "learning_rate": 0.00019839809172047238, "loss": 1.324, "step": 99 }, { "epoch": 0.04525910839556461, "grad_norm": 0.5014687776565552, "learning_rate": 0.00019836199069471437, "loss": 0.7702, "step": 100 }, { "epoch": 0.04571169947952025, "grad_norm": 0.49939578771591187, "learning_rate": 0.0001983254907563955, "loss": 1.2017, "step": 101 }, { "epoch": 0.0461642905634759, "grad_norm": 0.40175965428352356, "learning_rate": 0.00019828859205354323, "loss": 0.7663, "step": 102 }, { "epoch": 0.046616881647431545, "grad_norm": 0.46545523405075073, "learning_rate": 0.0001982512947358024, "loss": 0.9994, "step": 103 }, { "epoch": 0.047069472731387195, "grad_norm": 0.4311598837375641, "learning_rate": 0.00019821359895443434, "loss": 0.7427, "step": 104 }, { "epoch": 0.04752206381534284, "grad_norm": 0.5804029107093811, "learning_rate": 0.00019817550486231643, "loss": 1.046, "step": 105 }, { "epoch": 0.04797465489929848, "grad_norm": 0.5326130986213684, "learning_rate": 0.00019813701261394136, "loss": 1.0632, "step": 106 }, { "epoch": 0.04842724598325413, "grad_norm": 0.4057491719722748, "learning_rate": 0.00019809812236541661, "loss": 0.8145, "step": 107 }, { "epoch": 0.048879837067209775, "grad_norm": 0.4094937741756439, "learning_rate": 0.00019805883427446376, "loss": 0.8497, "step": 108 }, { "epoch": 0.049332428151165425, "grad_norm": 0.48108571767807007, "learning_rate": 0.00019801914850041784, "loss": 0.7832, "step": 109 }, { "epoch": 0.04978501923512107, "grad_norm": 0.4242852032184601, "learning_rate": 0.00019797906520422677, "loss": 0.8913, "step": 110 }, { "epoch": 0.05023761031907671, "grad_norm": 0.5135735869407654, "learning_rate": 0.00019793858454845057, "loss": 0.9478, "step": 111 }, { "epoch": 0.05069020140303236, "grad_norm": 0.4040193557739258, "learning_rate": 0.00019789770669726087, "loss": 0.719, "step": 112 }, { "epoch": 0.051142792486988005, "grad_norm": 0.441881388425827, "learning_rate": 0.00019785643181643997, "loss": 0.7208, "step": 113 }, { "epoch": 0.051595383570943655, "grad_norm": 0.3968985974788666, "learning_rate": 0.00019781476007338058, "loss": 0.6075, "step": 114 }, { "epoch": 0.0520479746548993, "grad_norm": 0.4533964991569519, "learning_rate": 0.00019777269163708468, "loss": 0.8069, "step": 115 }, { "epoch": 0.05250056573885494, "grad_norm": 0.5719001889228821, "learning_rate": 0.00019773022667816327, "loss": 0.9846, "step": 116 }, { "epoch": 0.05295315682281059, "grad_norm": 0.637066125869751, "learning_rate": 0.00019768736536883528, "loss": 1.1128, "step": 117 }, { "epoch": 0.053405747906766235, "grad_norm": 0.5196853876113892, "learning_rate": 0.00019764410788292722, "loss": 0.7845, "step": 118 }, { "epoch": 0.053858338990721885, "grad_norm": 0.48675844073295593, "learning_rate": 0.0001976004543958722, "loss": 0.8867, "step": 119 }, { "epoch": 0.05431093007467753, "grad_norm": 0.5182852149009705, "learning_rate": 0.00019755640508470942, "loss": 1.0401, "step": 120 }, { "epoch": 0.05476352115863317, "grad_norm": 0.6056920886039734, "learning_rate": 0.00019751196012808325, "loss": 1.2113, "step": 121 }, { "epoch": 0.05521611224258882, "grad_norm": 0.49781960248947144, "learning_rate": 0.0001974671197062428, "loss": 0.9309, "step": 122 }, { "epoch": 0.055668703326544465, "grad_norm": 0.6912221908569336, "learning_rate": 0.00019742188400104086, "loss": 1.2537, "step": 123 }, { "epoch": 0.056121294410500115, "grad_norm": 0.41682809591293335, "learning_rate": 0.00019737625319593335, "loss": 0.7506, "step": 124 }, { "epoch": 0.05657388549445576, "grad_norm": 0.5782731771469116, "learning_rate": 0.0001973302274759786, "loss": 1.1436, "step": 125 }, { "epoch": 0.05702647657841141, "grad_norm": 0.4784829020500183, "learning_rate": 0.00019728380702783643, "loss": 0.8599, "step": 126 }, { "epoch": 0.05747906766236705, "grad_norm": 0.46277278661727905, "learning_rate": 0.00019723699203976766, "loss": 0.9085, "step": 127 }, { "epoch": 0.057931658746322695, "grad_norm": 0.45224806666374207, "learning_rate": 0.00019718978270163304, "loss": 0.8367, "step": 128 }, { "epoch": 0.058384249830278345, "grad_norm": 0.660629391670227, "learning_rate": 0.00019714217920489266, "loss": 1.2308, "step": 129 }, { "epoch": 0.05883684091423399, "grad_norm": 0.5309016704559326, "learning_rate": 0.0001970941817426052, "loss": 1.2314, "step": 130 }, { "epoch": 0.05928943199818964, "grad_norm": 0.5790516138076782, "learning_rate": 0.00019704579050942706, "loss": 1.0749, "step": 131 }, { "epoch": 0.05974202308214528, "grad_norm": 0.47186607122421265, "learning_rate": 0.0001969970057016116, "loss": 0.9159, "step": 132 }, { "epoch": 0.060194614166100925, "grad_norm": 0.5415465831756592, "learning_rate": 0.00019694782751700828, "loss": 0.8399, "step": 133 }, { "epoch": 0.060647205250056575, "grad_norm": 0.47656404972076416, "learning_rate": 0.00019689825615506207, "loss": 1.0643, "step": 134 }, { "epoch": 0.06109979633401222, "grad_norm": 0.5224644541740417, "learning_rate": 0.00019684829181681234, "loss": 0.9919, "step": 135 }, { "epoch": 0.06155238741796787, "grad_norm": 0.5177377462387085, "learning_rate": 0.00019679793470489228, "loss": 0.8967, "step": 136 }, { "epoch": 0.06200497850192351, "grad_norm": 0.5282057523727417, "learning_rate": 0.000196747185023528, "loss": 0.986, "step": 137 }, { "epoch": 0.062457569585879155, "grad_norm": 0.4550272524356842, "learning_rate": 0.00019669604297853764, "loss": 1.0715, "step": 138 }, { "epoch": 0.0629101606698348, "grad_norm": 0.4701433479785919, "learning_rate": 0.00019664450877733062, "loss": 0.7913, "step": 139 }, { "epoch": 0.06336275175379046, "grad_norm": 0.48283687233924866, "learning_rate": 0.00019659258262890683, "loss": 0.9137, "step": 140 }, { "epoch": 0.06381534283774609, "grad_norm": 0.5827288031578064, "learning_rate": 0.00019654026474385562, "loss": 1.1277, "step": 141 }, { "epoch": 0.06426793392170174, "grad_norm": 0.46015703678131104, "learning_rate": 0.00019648755533435518, "loss": 0.8968, "step": 142 }, { "epoch": 0.06472052500565739, "grad_norm": 0.4119875133037567, "learning_rate": 0.00019643445461417134, "loss": 0.6121, "step": 143 }, { "epoch": 0.06517311608961303, "grad_norm": 0.40021613240242004, "learning_rate": 0.00019638096279865717, "loss": 0.5665, "step": 144 }, { "epoch": 0.06562570717356868, "grad_norm": 0.4844271242618561, "learning_rate": 0.00019632708010475165, "loss": 0.9002, "step": 145 }, { "epoch": 0.06607829825752433, "grad_norm": 0.49857303500175476, "learning_rate": 0.00019627280675097908, "loss": 1.0462, "step": 146 }, { "epoch": 0.06653088934147998, "grad_norm": 0.46803635358810425, "learning_rate": 0.0001962181429574481, "loss": 0.8032, "step": 147 }, { "epoch": 0.06698348042543562, "grad_norm": 0.48295503854751587, "learning_rate": 0.00019616308894585078, "loss": 0.8879, "step": 148 }, { "epoch": 0.06743607150939127, "grad_norm": 0.47980958223342896, "learning_rate": 0.00019610764493946175, "loss": 0.8207, "step": 149 }, { "epoch": 0.06788866259334692, "grad_norm": 0.41453817486763, "learning_rate": 0.00019605181116313724, "loss": 0.7636, "step": 150 }, { "epoch": 0.06834125367730255, "grad_norm": 0.5246773958206177, "learning_rate": 0.0001959955878433143, "loss": 0.8858, "step": 151 }, { "epoch": 0.0687938447612582, "grad_norm": 0.5314387679100037, "learning_rate": 0.00019593897520800977, "loss": 1.0188, "step": 152 }, { "epoch": 0.06924643584521385, "grad_norm": 0.5274377465248108, "learning_rate": 0.0001958819734868193, "loss": 1.1265, "step": 153 }, { "epoch": 0.06969902692916949, "grad_norm": 0.49274730682373047, "learning_rate": 0.00019582458291091663, "loss": 0.8626, "step": 154 }, { "epoch": 0.07015161801312514, "grad_norm": 0.6286381483078003, "learning_rate": 0.0001957668037130524, "loss": 1.095, "step": 155 }, { "epoch": 0.07060420909708079, "grad_norm": 0.4335956275463104, "learning_rate": 0.00019570863612755344, "loss": 0.6156, "step": 156 }, { "epoch": 0.07105680018103644, "grad_norm": 0.6056137084960938, "learning_rate": 0.00019565008039032158, "loss": 1.2356, "step": 157 }, { "epoch": 0.07150939126499208, "grad_norm": 0.5735076069831848, "learning_rate": 0.0001955911367388329, "loss": 0.9665, "step": 158 }, { "epoch": 0.07196198234894773, "grad_norm": 0.6242491006851196, "learning_rate": 0.00019553180541213673, "loss": 1.205, "step": 159 }, { "epoch": 0.07241457343290338, "grad_norm": 0.5141255259513855, "learning_rate": 0.00019547208665085457, "loss": 0.8392, "step": 160 }, { "epoch": 0.07286716451685901, "grad_norm": 0.47017166018486023, "learning_rate": 0.0001954119806971792, "loss": 0.8133, "step": 161 }, { "epoch": 0.07331975560081466, "grad_norm": 0.5935090184211731, "learning_rate": 0.00019535148779487363, "loss": 0.8971, "step": 162 }, { "epoch": 0.07377234668477031, "grad_norm": 0.5817804932594299, "learning_rate": 0.0001952906081892703, "loss": 0.8039, "step": 163 }, { "epoch": 0.07422493776872596, "grad_norm": 0.5110310912132263, "learning_rate": 0.0001952293421272698, "loss": 0.9442, "step": 164 }, { "epoch": 0.0746775288526816, "grad_norm": 0.7098664045333862, "learning_rate": 0.00019516768985734006, "loss": 1.149, "step": 165 }, { "epoch": 0.07513011993663725, "grad_norm": 0.5433538556098938, "learning_rate": 0.00019510565162951537, "loss": 1.0884, "step": 166 }, { "epoch": 0.0755827110205929, "grad_norm": 0.471447616815567, "learning_rate": 0.0001950432276953952, "loss": 0.8294, "step": 167 }, { "epoch": 0.07603530210454854, "grad_norm": 0.5262457728385925, "learning_rate": 0.0001949804183081433, "loss": 0.9602, "step": 168 }, { "epoch": 0.07648789318850419, "grad_norm": 0.45325416326522827, "learning_rate": 0.0001949172237224867, "loss": 0.9151, "step": 169 }, { "epoch": 0.07694048427245984, "grad_norm": 0.5733582377433777, "learning_rate": 0.00019485364419471454, "loss": 1.089, "step": 170 }, { "epoch": 0.07739307535641547, "grad_norm": 0.4842165410518646, "learning_rate": 0.00019478967998267722, "loss": 1.0392, "step": 171 }, { "epoch": 0.07784566644037112, "grad_norm": 0.43813711404800415, "learning_rate": 0.00019472533134578507, "loss": 0.8412, "step": 172 }, { "epoch": 0.07829825752432677, "grad_norm": 0.41982951760292053, "learning_rate": 0.00019466059854500768, "loss": 0.6114, "step": 173 }, { "epoch": 0.07875084860828242, "grad_norm": 0.4452666640281677, "learning_rate": 0.00019459548184287253, "loss": 0.7847, "step": 174 }, { "epoch": 0.07920343969223806, "grad_norm": 0.6118762493133545, "learning_rate": 0.00019452998150346401, "loss": 0.9116, "step": 175 }, { "epoch": 0.07965603077619371, "grad_norm": 0.45418888330459595, "learning_rate": 0.0001944640977924225, "loss": 0.7162, "step": 176 }, { "epoch": 0.08010862186014936, "grad_norm": 0.5309512615203857, "learning_rate": 0.000194397830976943, "loss": 0.9851, "step": 177 }, { "epoch": 0.080561212944105, "grad_norm": 0.5224008560180664, "learning_rate": 0.0001943311813257743, "loss": 0.8163, "step": 178 }, { "epoch": 0.08101380402806065, "grad_norm": 0.5464060306549072, "learning_rate": 0.00019426414910921787, "loss": 0.9043, "step": 179 }, { "epoch": 0.0814663951120163, "grad_norm": 0.5652410387992859, "learning_rate": 0.0001941967345991265, "loss": 1.049, "step": 180 }, { "epoch": 0.08191898619597195, "grad_norm": 0.4448956251144409, "learning_rate": 0.00019412893806890357, "loss": 0.6375, "step": 181 }, { "epoch": 0.08237157727992758, "grad_norm": 0.45628809928894043, "learning_rate": 0.00019406075979350174, "loss": 0.8139, "step": 182 }, { "epoch": 0.08282416836388323, "grad_norm": 0.4771364629268646, "learning_rate": 0.00019399220004942175, "loss": 0.8003, "step": 183 }, { "epoch": 0.08327675944783888, "grad_norm": 0.5508702397346497, "learning_rate": 0.00019392325911471155, "loss": 0.8694, "step": 184 }, { "epoch": 0.08372935053179452, "grad_norm": 0.5558999180793762, "learning_rate": 0.0001938539372689649, "loss": 0.9957, "step": 185 }, { "epoch": 0.08418194161575017, "grad_norm": 0.6134940385818481, "learning_rate": 0.00019378423479332046, "loss": 0.9309, "step": 186 }, { "epoch": 0.08463453269970582, "grad_norm": 0.41704538464546204, "learning_rate": 0.00019371415197046052, "loss": 0.6967, "step": 187 }, { "epoch": 0.08508712378366146, "grad_norm": 0.637626588344574, "learning_rate": 0.0001936436890846099, "loss": 0.9637, "step": 188 }, { "epoch": 0.0855397148676171, "grad_norm": 0.5710074305534363, "learning_rate": 0.00019357284642153476, "loss": 0.8212, "step": 189 }, { "epoch": 0.08599230595157276, "grad_norm": 0.7479123473167419, "learning_rate": 0.0001935016242685415, "loss": 1.2082, "step": 190 }, { "epoch": 0.0864448970355284, "grad_norm": 0.45262470841407776, "learning_rate": 0.00019343002291447554, "loss": 0.7451, "step": 191 }, { "epoch": 0.08689748811948404, "grad_norm": 0.4303995370864868, "learning_rate": 0.00019335804264972018, "loss": 0.7669, "step": 192 }, { "epoch": 0.08735007920343969, "grad_norm": 0.576163649559021, "learning_rate": 0.00019328568376619543, "loss": 1.1127, "step": 193 }, { "epoch": 0.08780267028739534, "grad_norm": 0.46824729442596436, "learning_rate": 0.0001932129465573568, "loss": 0.7348, "step": 194 }, { "epoch": 0.08825526137135098, "grad_norm": 0.4299353063106537, "learning_rate": 0.00019313983131819407, "loss": 0.6548, "step": 195 }, { "epoch": 0.08870785245530663, "grad_norm": 0.47497665882110596, "learning_rate": 0.00019306633834523024, "loss": 0.9236, "step": 196 }, { "epoch": 0.08916044353926228, "grad_norm": 0.47756174206733704, "learning_rate": 0.00019299246793652014, "loss": 0.8134, "step": 197 }, { "epoch": 0.08961303462321792, "grad_norm": 0.5055831074714661, "learning_rate": 0.00019291822039164933, "loss": 1.0333, "step": 198 }, { "epoch": 0.09006562570717357, "grad_norm": 0.46794986724853516, "learning_rate": 0.00019284359601173294, "loss": 0.8624, "step": 199 }, { "epoch": 0.09051821679112922, "grad_norm": 0.5907809138298035, "learning_rate": 0.0001927685950994143, "loss": 0.9727, "step": 200 }, { "epoch": 0.09097080787508487, "grad_norm": 0.5275614857673645, "learning_rate": 0.00019269321795886381, "loss": 1.0431, "step": 201 }, { "epoch": 0.0914233989590405, "grad_norm": 0.4664234519004822, "learning_rate": 0.00019261746489577765, "loss": 0.9929, "step": 202 }, { "epoch": 0.09187599004299615, "grad_norm": 0.4664883017539978, "learning_rate": 0.00019254133621737668, "loss": 0.865, "step": 203 }, { "epoch": 0.0923285811269518, "grad_norm": 0.5554076433181763, "learning_rate": 0.00019246483223240494, "loss": 1.0111, "step": 204 }, { "epoch": 0.09278117221090744, "grad_norm": 0.6166553497314453, "learning_rate": 0.0001923879532511287, "loss": 1.0876, "step": 205 }, { "epoch": 0.09323376329486309, "grad_norm": 0.5797654986381531, "learning_rate": 0.0001923106995853349, "loss": 1.0707, "step": 206 }, { "epoch": 0.09368635437881874, "grad_norm": 0.5357984304428101, "learning_rate": 0.00019223307154833015, "loss": 0.801, "step": 207 }, { "epoch": 0.09413894546277439, "grad_norm": 0.5104133486747742, "learning_rate": 0.0001921550694549393, "loss": 1.0773, "step": 208 }, { "epoch": 0.09459153654673003, "grad_norm": 0.44878950715065, "learning_rate": 0.0001920766936215042, "loss": 0.866, "step": 209 }, { "epoch": 0.09504412763068568, "grad_norm": 0.4473859369754791, "learning_rate": 0.00019199794436588243, "loss": 0.6928, "step": 210 }, { "epoch": 0.09549671871464133, "grad_norm": 0.485485702753067, "learning_rate": 0.000191918822007446, "loss": 0.9197, "step": 211 }, { "epoch": 0.09594930979859696, "grad_norm": 0.45027539134025574, "learning_rate": 0.00019183932686708008, "loss": 0.5633, "step": 212 }, { "epoch": 0.09640190088255261, "grad_norm": 0.47163695096969604, "learning_rate": 0.00019175945926718166, "loss": 1.1098, "step": 213 }, { "epoch": 0.09685449196650826, "grad_norm": 0.44396013021469116, "learning_rate": 0.00019167921953165825, "loss": 0.8471, "step": 214 }, { "epoch": 0.0973070830504639, "grad_norm": 0.4827806055545807, "learning_rate": 0.00019159860798592662, "loss": 0.7481, "step": 215 }, { "epoch": 0.09775967413441955, "grad_norm": 0.6113815307617188, "learning_rate": 0.00019151762495691135, "loss": 0.8872, "step": 216 }, { "epoch": 0.0982122652183752, "grad_norm": 0.4316766858100891, "learning_rate": 0.0001914362707730437, "loss": 0.7989, "step": 217 }, { "epoch": 0.09866485630233085, "grad_norm": 0.5059136748313904, "learning_rate": 0.0001913545457642601, "loss": 1.0096, "step": 218 }, { "epoch": 0.09911744738628649, "grad_norm": 0.4753996431827545, "learning_rate": 0.00019127245026200092, "loss": 0.7742, "step": 219 }, { "epoch": 0.09957003847024214, "grad_norm": 0.48832663893699646, "learning_rate": 0.00019118998459920902, "loss": 0.9366, "step": 220 }, { "epoch": 0.10002262955419779, "grad_norm": 0.5136207938194275, "learning_rate": 0.00019110714911032853, "loss": 0.7963, "step": 221 }, { "epoch": 0.10047522063815342, "grad_norm": 0.5010231137275696, "learning_rate": 0.00019102394413130346, "loss": 0.8357, "step": 222 }, { "epoch": 0.10092781172210907, "grad_norm": 0.5360503792762756, "learning_rate": 0.00019094036999957624, "loss": 0.9801, "step": 223 }, { "epoch": 0.10138040280606472, "grad_norm": 0.5078533887863159, "learning_rate": 0.00019085642705408637, "loss": 1.0204, "step": 224 }, { "epoch": 0.10183299389002037, "grad_norm": 0.48204243183135986, "learning_rate": 0.00019077211563526926, "loss": 0.874, "step": 225 }, { "epoch": 0.10228558497397601, "grad_norm": 0.44971680641174316, "learning_rate": 0.00019068743608505455, "loss": 0.7917, "step": 226 }, { "epoch": 0.10273817605793166, "grad_norm": 0.49740248918533325, "learning_rate": 0.00019060238874686491, "loss": 0.9008, "step": 227 }, { "epoch": 0.10319076714188731, "grad_norm": 0.4406406879425049, "learning_rate": 0.00019051697396561454, "loss": 0.7824, "step": 228 }, { "epoch": 0.10364335822584295, "grad_norm": 0.5389720797538757, "learning_rate": 0.00019043119208770793, "loss": 1.0637, "step": 229 }, { "epoch": 0.1040959493097986, "grad_norm": 0.4832242429256439, "learning_rate": 0.00019034504346103823, "loss": 0.9178, "step": 230 }, { "epoch": 0.10454854039375425, "grad_norm": 0.5743380784988403, "learning_rate": 0.00019025852843498607, "loss": 1.101, "step": 231 }, { "epoch": 0.10500113147770988, "grad_norm": 0.5157460570335388, "learning_rate": 0.00019017164736041795, "loss": 1.1203, "step": 232 }, { "epoch": 0.10545372256166553, "grad_norm": 0.4559588134288788, "learning_rate": 0.00019008440058968496, "loss": 0.8919, "step": 233 }, { "epoch": 0.10590631364562118, "grad_norm": 0.5415903329849243, "learning_rate": 0.0001899967884766212, "loss": 1.1227, "step": 234 }, { "epoch": 0.10635890472957683, "grad_norm": 0.5378996729850769, "learning_rate": 0.00018990881137654258, "loss": 1.0748, "step": 235 }, { "epoch": 0.10681149581353247, "grad_norm": 0.5181474089622498, "learning_rate": 0.00018982046964624515, "loss": 1.0313, "step": 236 }, { "epoch": 0.10726408689748812, "grad_norm": 0.4330384433269501, "learning_rate": 0.0001897317636440037, "loss": 0.6787, "step": 237 }, { "epoch": 0.10771667798144377, "grad_norm": 0.5662621855735779, "learning_rate": 0.00018964269372957038, "loss": 0.6968, "step": 238 }, { "epoch": 0.1081692690653994, "grad_norm": 0.46083390712738037, "learning_rate": 0.00018955326026417327, "loss": 0.7441, "step": 239 }, { "epoch": 0.10862186014935506, "grad_norm": 0.469539612531662, "learning_rate": 0.00018946346361051474, "loss": 0.7932, "step": 240 }, { "epoch": 0.1090744512333107, "grad_norm": 0.5363450050354004, "learning_rate": 0.0001893733041327702, "loss": 0.9415, "step": 241 }, { "epoch": 0.10952704231726634, "grad_norm": 0.5973456501960754, "learning_rate": 0.00018928278219658643, "loss": 0.9286, "step": 242 }, { "epoch": 0.109979633401222, "grad_norm": 0.4615383744239807, "learning_rate": 0.00018919189816908022, "loss": 0.8515, "step": 243 }, { "epoch": 0.11043222448517764, "grad_norm": 0.5972759127616882, "learning_rate": 0.0001891006524188368, "loss": 0.8229, "step": 244 }, { "epoch": 0.1108848155691333, "grad_norm": 0.4316255450248718, "learning_rate": 0.00018900904531590846, "loss": 0.6559, "step": 245 }, { "epoch": 0.11133740665308893, "grad_norm": 0.5585528016090393, "learning_rate": 0.00018891707723181294, "loss": 0.9146, "step": 246 }, { "epoch": 0.11178999773704458, "grad_norm": 0.4745887219905853, "learning_rate": 0.0001888247485395319, "loss": 0.8527, "step": 247 }, { "epoch": 0.11224258882100023, "grad_norm": 0.6520152688026428, "learning_rate": 0.0001887320596135096, "loss": 0.8711, "step": 248 }, { "epoch": 0.11269517990495587, "grad_norm": 0.5280656218528748, "learning_rate": 0.0001886390108296511, "loss": 0.9714, "step": 249 }, { "epoch": 0.11314777098891152, "grad_norm": 0.48541924357414246, "learning_rate": 0.000188545602565321, "loss": 0.8401, "step": 250 }, { "epoch": 0.11360036207286717, "grad_norm": 0.5795570611953735, "learning_rate": 0.0001884518351993417, "loss": 1.035, "step": 251 }, { "epoch": 0.11405295315682282, "grad_norm": 0.5781338214874268, "learning_rate": 0.00018835770911199205, "loss": 0.9356, "step": 252 }, { "epoch": 0.11450554424077845, "grad_norm": 0.5215576887130737, "learning_rate": 0.00018826322468500566, "loss": 0.9987, "step": 253 }, { "epoch": 0.1149581353247341, "grad_norm": 0.5532097220420837, "learning_rate": 0.00018816838230156942, "loss": 1.2014, "step": 254 }, { "epoch": 0.11541072640868975, "grad_norm": 0.4665840268135071, "learning_rate": 0.00018807318234632185, "loss": 0.8483, "step": 255 }, { "epoch": 0.11586331749264539, "grad_norm": 0.49034935235977173, "learning_rate": 0.00018797762520535177, "loss": 1.046, "step": 256 }, { "epoch": 0.11631590857660104, "grad_norm": 0.5094912648200989, "learning_rate": 0.00018788171126619653, "loss": 0.7983, "step": 257 }, { "epoch": 0.11676849966055669, "grad_norm": 0.46415218710899353, "learning_rate": 0.00018778544091784048, "loss": 0.6571, "step": 258 }, { "epoch": 0.11722109074451233, "grad_norm": 0.5812395811080933, "learning_rate": 0.00018768881455071332, "loss": 0.8487, "step": 259 }, { "epoch": 0.11767368182846798, "grad_norm": 0.48839807510375977, "learning_rate": 0.0001875918325566888, "loss": 0.8423, "step": 260 }, { "epoch": 0.11812627291242363, "grad_norm": 0.5598899126052856, "learning_rate": 0.00018749449532908275, "loss": 1.1157, "step": 261 }, { "epoch": 0.11857886399637928, "grad_norm": 0.38726699352264404, "learning_rate": 0.0001873968032626518, "loss": 0.6387, "step": 262 }, { "epoch": 0.11903145508033491, "grad_norm": 0.48521196842193604, "learning_rate": 0.0001872987567535916, "loss": 0.8264, "step": 263 }, { "epoch": 0.11948404616429056, "grad_norm": 0.5407915711402893, "learning_rate": 0.00018720035619953517, "loss": 0.8444, "step": 264 }, { "epoch": 0.11993663724824621, "grad_norm": 0.4376087486743927, "learning_rate": 0.00018710160199955156, "loss": 0.7221, "step": 265 }, { "epoch": 0.12038922833220185, "grad_norm": 0.45480218529701233, "learning_rate": 0.00018700249455414394, "loss": 0.7882, "step": 266 }, { "epoch": 0.1208418194161575, "grad_norm": 0.443365216255188, "learning_rate": 0.00018690303426524806, "loss": 0.8778, "step": 267 }, { "epoch": 0.12129441050011315, "grad_norm": 0.4929787516593933, "learning_rate": 0.00018680322153623075, "loss": 0.8709, "step": 268 }, { "epoch": 0.12174700158406879, "grad_norm": 0.45391973853111267, "learning_rate": 0.00018670305677188808, "loss": 0.7935, "step": 269 }, { "epoch": 0.12219959266802444, "grad_norm": 0.4776461720466614, "learning_rate": 0.00018660254037844388, "loss": 0.9575, "step": 270 }, { "epoch": 0.12265218375198009, "grad_norm": 0.43372777104377747, "learning_rate": 0.000186501672763548, "loss": 0.718, "step": 271 }, { "epoch": 0.12310477483593574, "grad_norm": 0.45444566011428833, "learning_rate": 0.0001864004543362747, "loss": 0.8217, "step": 272 }, { "epoch": 0.12355736591989137, "grad_norm": 0.5201200842857361, "learning_rate": 0.00018629888550712097, "loss": 1.0717, "step": 273 }, { "epoch": 0.12400995700384702, "grad_norm": 0.4872967302799225, "learning_rate": 0.00018619696668800492, "loss": 0.9816, "step": 274 }, { "epoch": 0.12446254808780267, "grad_norm": 0.5114011764526367, "learning_rate": 0.000186094698292264, "loss": 1.045, "step": 275 }, { "epoch": 0.12491513917175831, "grad_norm": 0.42714235186576843, "learning_rate": 0.00018599208073465345, "loss": 0.7804, "step": 276 }, { "epoch": 0.12536773025571396, "grad_norm": 0.4325241148471832, "learning_rate": 0.00018588911443134448, "loss": 0.8942, "step": 277 }, { "epoch": 0.1258203213396696, "grad_norm": 0.6905471086502075, "learning_rate": 0.00018578579979992266, "loss": 1.0869, "step": 278 }, { "epoch": 0.12627291242362526, "grad_norm": 0.6683794260025024, "learning_rate": 0.00018568213725938634, "loss": 0.8228, "step": 279 }, { "epoch": 0.1267255035075809, "grad_norm": 0.48306554555892944, "learning_rate": 0.00018557812723014476, "loss": 0.853, "step": 280 }, { "epoch": 0.12717809459153653, "grad_norm": 0.5165983438491821, "learning_rate": 0.0001854737701340164, "loss": 0.8805, "step": 281 }, { "epoch": 0.12763068567549218, "grad_norm": 0.43435442447662354, "learning_rate": 0.00018536906639422725, "loss": 0.7154, "step": 282 }, { "epoch": 0.12808327675944783, "grad_norm": 0.579204797744751, "learning_rate": 0.00018526401643540922, "loss": 1.1599, "step": 283 }, { "epoch": 0.12853586784340348, "grad_norm": 0.4634973406791687, "learning_rate": 0.00018515862068359837, "loss": 0.7779, "step": 284 }, { "epoch": 0.12898845892735913, "grad_norm": 0.5537360310554504, "learning_rate": 0.00018505287956623297, "loss": 1.0001, "step": 285 }, { "epoch": 0.12944105001131478, "grad_norm": 0.46273747086524963, "learning_rate": 0.0001849467935121521, "loss": 0.7031, "step": 286 }, { "epoch": 0.12989364109527043, "grad_norm": 0.5336446762084961, "learning_rate": 0.0001848403629515937, "loss": 0.8497, "step": 287 }, { "epoch": 0.13034623217922606, "grad_norm": 0.5585876703262329, "learning_rate": 0.00018473358831619282, "loss": 0.7961, "step": 288 }, { "epoch": 0.1307988232631817, "grad_norm": 0.6317759156227112, "learning_rate": 0.00018462647003898006, "loss": 0.8452, "step": 289 }, { "epoch": 0.13125141434713736, "grad_norm": 0.5151224732398987, "learning_rate": 0.0001845190085543795, "loss": 0.9339, "step": 290 }, { "epoch": 0.131704005431093, "grad_norm": 0.46558281779289246, "learning_rate": 0.00018441120429820722, "loss": 0.8505, "step": 291 }, { "epoch": 0.13215659651504866, "grad_norm": 0.4349653124809265, "learning_rate": 0.00018430305770766948, "loss": 0.768, "step": 292 }, { "epoch": 0.1326091875990043, "grad_norm": 0.43986043334007263, "learning_rate": 0.00018419456922136074, "loss": 0.648, "step": 293 }, { "epoch": 0.13306177868295996, "grad_norm": 0.4826880097389221, "learning_rate": 0.00018408573927926222, "loss": 0.9613, "step": 294 }, { "epoch": 0.13351436976691558, "grad_norm": 0.5607436299324036, "learning_rate": 0.0001839765683227398, "loss": 1.0627, "step": 295 }, { "epoch": 0.13396696085087123, "grad_norm": 0.5668855905532837, "learning_rate": 0.00018386705679454242, "loss": 1.1457, "step": 296 }, { "epoch": 0.13441955193482688, "grad_norm": 0.5762825012207031, "learning_rate": 0.0001837572051388002, "loss": 1.2212, "step": 297 }, { "epoch": 0.13487214301878253, "grad_norm": 0.48467156291007996, "learning_rate": 0.00018364701380102266, "loss": 0.8753, "step": 298 }, { "epoch": 0.13532473410273818, "grad_norm": 0.4921865165233612, "learning_rate": 0.00018353648322809704, "loss": 0.9958, "step": 299 }, { "epoch": 0.13577732518669383, "grad_norm": 0.4598338305950165, "learning_rate": 0.00018342561386828615, "loss": 0.8717, "step": 300 }, { "epoch": 0.13622991627064948, "grad_norm": 0.5907824039459229, "learning_rate": 0.00018331440617122696, "loss": 0.8604, "step": 301 }, { "epoch": 0.1366825073546051, "grad_norm": 0.5254366397857666, "learning_rate": 0.00018320286058792843, "loss": 0.9818, "step": 302 }, { "epoch": 0.13713509843856075, "grad_norm": 0.4536130130290985, "learning_rate": 0.00018309097757077, "loss": 0.7919, "step": 303 }, { "epoch": 0.1375876895225164, "grad_norm": 0.4499724209308624, "learning_rate": 0.00018297875757349952, "loss": 0.8955, "step": 304 }, { "epoch": 0.13804028060647205, "grad_norm": 0.555507481098175, "learning_rate": 0.00018286620105123142, "loss": 0.9697, "step": 305 }, { "epoch": 0.1384928716904277, "grad_norm": 0.4722172021865845, "learning_rate": 0.000182753308460445, "loss": 0.7845, "step": 306 }, { "epoch": 0.13894546277438335, "grad_norm": 0.5722295641899109, "learning_rate": 0.00018264008025898248, "loss": 1.0834, "step": 307 }, { "epoch": 0.13939805385833898, "grad_norm": 0.44553884863853455, "learning_rate": 0.00018252651690604715, "loss": 0.6268, "step": 308 }, { "epoch": 0.13985064494229463, "grad_norm": 0.5120521187782288, "learning_rate": 0.00018241261886220154, "loss": 0.7139, "step": 309 }, { "epoch": 0.14030323602625028, "grad_norm": 0.47789305448532104, "learning_rate": 0.00018229838658936564, "loss": 0.9572, "step": 310 }, { "epoch": 0.14075582711020593, "grad_norm": 0.42972978949546814, "learning_rate": 0.00018218382055081475, "loss": 0.6164, "step": 311 }, { "epoch": 0.14120841819416158, "grad_norm": 0.5786593556404114, "learning_rate": 0.00018206892121117783, "loss": 1.1196, "step": 312 }, { "epoch": 0.14166100927811723, "grad_norm": 0.4262322187423706, "learning_rate": 0.00018195368903643563, "loss": 0.7097, "step": 313 }, { "epoch": 0.14211360036207288, "grad_norm": 0.4580809473991394, "learning_rate": 0.0001818381244939187, "loss": 0.8742, "step": 314 }, { "epoch": 0.1425661914460285, "grad_norm": 0.6090126633644104, "learning_rate": 0.00018172222805230547, "loss": 1.1822, "step": 315 }, { "epoch": 0.14301878252998415, "grad_norm": 0.4505937397480011, "learning_rate": 0.0001816060001816205, "loss": 0.7508, "step": 316 }, { "epoch": 0.1434713736139398, "grad_norm": 0.5005435943603516, "learning_rate": 0.00018148944135323237, "loss": 0.7746, "step": 317 }, { "epoch": 0.14392396469789545, "grad_norm": 0.43439781665802, "learning_rate": 0.00018137255203985197, "loss": 0.6364, "step": 318 }, { "epoch": 0.1443765557818511, "grad_norm": 0.47296142578125, "learning_rate": 0.00018125533271553043, "loss": 0.8561, "step": 319 }, { "epoch": 0.14482914686580675, "grad_norm": 0.543493390083313, "learning_rate": 0.00018113778385565733, "loss": 0.8883, "step": 320 }, { "epoch": 0.1452817379497624, "grad_norm": 0.5074918270111084, "learning_rate": 0.00018101990593695857, "loss": 0.8878, "step": 321 }, { "epoch": 0.14573432903371802, "grad_norm": 0.600610613822937, "learning_rate": 0.00018090169943749476, "loss": 0.9266, "step": 322 }, { "epoch": 0.14618692011767367, "grad_norm": 0.6033854484558105, "learning_rate": 0.00018078316483665887, "loss": 1.3694, "step": 323 }, { "epoch": 0.14663951120162932, "grad_norm": 0.4634937345981598, "learning_rate": 0.0001806643026151747, "loss": 0.9367, "step": 324 }, { "epoch": 0.14709210228558497, "grad_norm": 0.46875283122062683, "learning_rate": 0.0001805451132550946, "loss": 0.7489, "step": 325 }, { "epoch": 0.14754469336954062, "grad_norm": 0.42799684405326843, "learning_rate": 0.0001804255972397977, "loss": 0.5965, "step": 326 }, { "epoch": 0.14799728445349628, "grad_norm": 0.49495095014572144, "learning_rate": 0.0001803057550539879, "loss": 0.9141, "step": 327 }, { "epoch": 0.14844987553745193, "grad_norm": 0.6663426756858826, "learning_rate": 0.00018018558718369186, "loss": 0.9727, "step": 328 }, { "epoch": 0.14890246662140755, "grad_norm": 0.45158758759498596, "learning_rate": 0.00018006509411625716, "loss": 0.6957, "step": 329 }, { "epoch": 0.1493550577053632, "grad_norm": 0.5451595783233643, "learning_rate": 0.00017994427634035015, "loss": 1.154, "step": 330 }, { "epoch": 0.14980764878931885, "grad_norm": 0.45239847898483276, "learning_rate": 0.00017982313434595406, "loss": 0.797, "step": 331 }, { "epoch": 0.1502602398732745, "grad_norm": 0.420845091342926, "learning_rate": 0.00017970166862436707, "loss": 0.867, "step": 332 }, { "epoch": 0.15071283095723015, "grad_norm": 0.5063892006874084, "learning_rate": 0.00017957987966820023, "loss": 0.8467, "step": 333 }, { "epoch": 0.1511654220411858, "grad_norm": 0.4861128628253937, "learning_rate": 0.00017945776797137543, "loss": 0.8838, "step": 334 }, { "epoch": 0.15161801312514142, "grad_norm": 0.4689355194568634, "learning_rate": 0.00017933533402912354, "loss": 0.8019, "step": 335 }, { "epoch": 0.15207060420909707, "grad_norm": 0.49119648337364197, "learning_rate": 0.0001792125783379822, "loss": 0.9141, "step": 336 }, { "epoch": 0.15252319529305272, "grad_norm": 0.5174407362937927, "learning_rate": 0.00017908950139579406, "loss": 0.93, "step": 337 }, { "epoch": 0.15297578637700837, "grad_norm": 0.45945340394973755, "learning_rate": 0.0001789661037017045, "loss": 0.8128, "step": 338 }, { "epoch": 0.15342837746096402, "grad_norm": 0.6184009909629822, "learning_rate": 0.0001788423857561598, "loss": 0.8328, "step": 339 }, { "epoch": 0.15388096854491967, "grad_norm": 0.49892330169677734, "learning_rate": 0.00017871834806090501, "loss": 0.7559, "step": 340 }, { "epoch": 0.15433355962887532, "grad_norm": 0.49974218010902405, "learning_rate": 0.00017859399111898197, "loss": 0.9687, "step": 341 }, { "epoch": 0.15478615071283094, "grad_norm": 0.4635416567325592, "learning_rate": 0.0001784693154347272, "loss": 0.8017, "step": 342 }, { "epoch": 0.1552387417967866, "grad_norm": 0.5158035755157471, "learning_rate": 0.0001783443215137699, "loss": 0.8791, "step": 343 }, { "epoch": 0.15569133288074224, "grad_norm": 0.46447592973709106, "learning_rate": 0.00017821900986302996, "loss": 0.7625, "step": 344 }, { "epoch": 0.1561439239646979, "grad_norm": 0.5464069247245789, "learning_rate": 0.00017809338099071577, "loss": 0.9033, "step": 345 }, { "epoch": 0.15659651504865355, "grad_norm": 0.46802228689193726, "learning_rate": 0.00017796743540632223, "loss": 0.82, "step": 346 }, { "epoch": 0.1570491061326092, "grad_norm": 0.46313050389289856, "learning_rate": 0.00017784117362062879, "loss": 0.8819, "step": 347 }, { "epoch": 0.15750169721656485, "grad_norm": 0.5359646677970886, "learning_rate": 0.0001777145961456971, "loss": 0.8463, "step": 348 }, { "epoch": 0.15795428830052047, "grad_norm": 0.5274454951286316, "learning_rate": 0.00017758770349486923, "loss": 0.9164, "step": 349 }, { "epoch": 0.15840687938447612, "grad_norm": 0.5028342008590698, "learning_rate": 0.00017746049618276545, "loss": 1.0029, "step": 350 }, { "epoch": 0.15885947046843177, "grad_norm": 0.4811888337135315, "learning_rate": 0.00017733297472528214, "loss": 0.791, "step": 351 }, { "epoch": 0.15931206155238742, "grad_norm": 0.4980330169200897, "learning_rate": 0.00017720513963958968, "loss": 0.8853, "step": 352 }, { "epoch": 0.15976465263634307, "grad_norm": 0.4352525770664215, "learning_rate": 0.00017707699144413046, "loss": 0.753, "step": 353 }, { "epoch": 0.16021724372029872, "grad_norm": 0.4622915983200073, "learning_rate": 0.00017694853065861662, "loss": 0.7534, "step": 354 }, { "epoch": 0.16066983480425437, "grad_norm": 0.520939826965332, "learning_rate": 0.00017681975780402807, "loss": 0.8995, "step": 355 }, { "epoch": 0.16112242588821, "grad_norm": 0.5207616090774536, "learning_rate": 0.00017669067340261036, "loss": 0.8877, "step": 356 }, { "epoch": 0.16157501697216564, "grad_norm": 0.5376969575881958, "learning_rate": 0.00017656127797787248, "loss": 1.0664, "step": 357 }, { "epoch": 0.1620276080561213, "grad_norm": 0.5164622664451599, "learning_rate": 0.00017643157205458483, "loss": 0.8304, "step": 358 }, { "epoch": 0.16248019914007694, "grad_norm": 0.5401540398597717, "learning_rate": 0.0001763015561587771, "loss": 0.8678, "step": 359 }, { "epoch": 0.1629327902240326, "grad_norm": 0.7328370213508606, "learning_rate": 0.00017617123081773591, "loss": 1.015, "step": 360 }, { "epoch": 0.16338538130798824, "grad_norm": 0.6089193820953369, "learning_rate": 0.0001760405965600031, "loss": 1.1274, "step": 361 }, { "epoch": 0.1638379723919439, "grad_norm": 0.4811232388019562, "learning_rate": 0.00017590965391537316, "loss": 0.9439, "step": 362 }, { "epoch": 0.16429056347589951, "grad_norm": 0.41662436723709106, "learning_rate": 0.00017577840341489133, "loss": 0.6521, "step": 363 }, { "epoch": 0.16474315455985516, "grad_norm": 0.5151225924491882, "learning_rate": 0.00017564684559085136, "loss": 0.6962, "step": 364 }, { "epoch": 0.16519574564381082, "grad_norm": 0.5826026201248169, "learning_rate": 0.00017551498097679338, "loss": 1.0393, "step": 365 }, { "epoch": 0.16564833672776647, "grad_norm": 0.39569127559661865, "learning_rate": 0.0001753828101075017, "loss": 0.6932, "step": 366 }, { "epoch": 0.16610092781172212, "grad_norm": 0.566507875919342, "learning_rate": 0.00017525033351900268, "loss": 0.7179, "step": 367 }, { "epoch": 0.16655351889567777, "grad_norm": 0.524178683757782, "learning_rate": 0.00017511755174856255, "loss": 1.0949, "step": 368 }, { "epoch": 0.1670061099796334, "grad_norm": 0.5008012652397156, "learning_rate": 0.00017498446533468524, "loss": 0.7463, "step": 369 }, { "epoch": 0.16745870106358904, "grad_norm": 0.5324717164039612, "learning_rate": 0.00017485107481711012, "loss": 1.0179, "step": 370 }, { "epoch": 0.1679112921475447, "grad_norm": 0.5710434913635254, "learning_rate": 0.00017471738073680993, "loss": 0.9511, "step": 371 }, { "epoch": 0.16836388323150034, "grad_norm": 0.5566431283950806, "learning_rate": 0.00017458338363598845, "loss": 0.837, "step": 372 }, { "epoch": 0.168816474315456, "grad_norm": 0.5454690456390381, "learning_rate": 0.00017444908405807845, "loss": 0.9261, "step": 373 }, { "epoch": 0.16926906539941164, "grad_norm": 0.5501024723052979, "learning_rate": 0.00017431448254773944, "loss": 0.9062, "step": 374 }, { "epoch": 0.1697216564833673, "grad_norm": 0.41848915815353394, "learning_rate": 0.0001741795796508553, "loss": 0.613, "step": 375 }, { "epoch": 0.1701742475673229, "grad_norm": 0.6741843819618225, "learning_rate": 0.00017404437591453235, "loss": 1.3163, "step": 376 }, { "epoch": 0.17062683865127856, "grad_norm": 0.5285744667053223, "learning_rate": 0.00017390887188709685, "loss": 1.0035, "step": 377 }, { "epoch": 0.1710794297352342, "grad_norm": 0.4732317328453064, "learning_rate": 0.00017377306811809304, "loss": 0.6205, "step": 378 }, { "epoch": 0.17153202081918986, "grad_norm": 0.5695931911468506, "learning_rate": 0.00017363696515828062, "loss": 0.9944, "step": 379 }, { "epoch": 0.1719846119031455, "grad_norm": 0.46868863701820374, "learning_rate": 0.00017350056355963287, "loss": 0.7082, "step": 380 }, { "epoch": 0.17243720298710116, "grad_norm": 0.5603721737861633, "learning_rate": 0.00017336386387533406, "loss": 1.0011, "step": 381 }, { "epoch": 0.1728897940710568, "grad_norm": 0.5308412313461304, "learning_rate": 0.00017322686665977737, "loss": 1.007, "step": 382 }, { "epoch": 0.17334238515501244, "grad_norm": 0.5501987934112549, "learning_rate": 0.00017308957246856273, "loss": 1.0982, "step": 383 }, { "epoch": 0.17379497623896809, "grad_norm": 0.5032144784927368, "learning_rate": 0.00017295198185849443, "loss": 0.9649, "step": 384 }, { "epoch": 0.17424756732292374, "grad_norm": 0.5478222370147705, "learning_rate": 0.00017281409538757883, "loss": 0.8592, "step": 385 }, { "epoch": 0.17470015840687939, "grad_norm": 0.5468262434005737, "learning_rate": 0.00017267591361502232, "loss": 0.7801, "step": 386 }, { "epoch": 0.17515274949083504, "grad_norm": 0.49443894624710083, "learning_rate": 0.00017253743710122875, "loss": 0.8721, "step": 387 }, { "epoch": 0.17560534057479069, "grad_norm": 0.5474779605865479, "learning_rate": 0.00017239866640779745, "loss": 1.0061, "step": 388 }, { "epoch": 0.17605793165874634, "grad_norm": 0.45006951689720154, "learning_rate": 0.00017225960209752062, "loss": 0.7008, "step": 389 }, { "epoch": 0.17651052274270196, "grad_norm": 0.49803587794303894, "learning_rate": 0.00017212024473438147, "loss": 0.8088, "step": 390 }, { "epoch": 0.1769631138266576, "grad_norm": 0.47258761525154114, "learning_rate": 0.0001719805948835515, "loss": 0.7485, "step": 391 }, { "epoch": 0.17741570491061326, "grad_norm": 0.5001921653747559, "learning_rate": 0.00017184065311138857, "loss": 1.0942, "step": 392 }, { "epoch": 0.1778682959945689, "grad_norm": 0.4275917708873749, "learning_rate": 0.00017170041998543437, "loss": 0.6614, "step": 393 }, { "epoch": 0.1778682959945689, "eval_loss": 0.21127600967884064, "eval_runtime": 51.6077, "eval_samples_per_second": 18.04, "eval_steps_per_second": 9.03, "step": 393 }, { "epoch": 0.17832088707852456, "grad_norm": 0.4007515609264374, "learning_rate": 0.00017155989607441213, "loss": 0.6961, "step": 394 }, { "epoch": 0.1787734781624802, "grad_norm": 0.4367770850658417, "learning_rate": 0.00017141908194822446, "loss": 0.7962, "step": 395 }, { "epoch": 0.17922606924643583, "grad_norm": 0.5173846483230591, "learning_rate": 0.000171277978177951, "loss": 0.837, "step": 396 }, { "epoch": 0.17967866033039148, "grad_norm": 0.5975651741027832, "learning_rate": 0.00017113658533584594, "loss": 0.9096, "step": 397 }, { "epoch": 0.18013125141434713, "grad_norm": 0.512630045413971, "learning_rate": 0.00017099490399533583, "loss": 0.8301, "step": 398 }, { "epoch": 0.18058384249830278, "grad_norm": 0.5926880836486816, "learning_rate": 0.00017085293473101734, "loss": 0.8294, "step": 399 }, { "epoch": 0.18103643358225843, "grad_norm": 0.4605947732925415, "learning_rate": 0.00017071067811865476, "loss": 0.819, "step": 400 }, { "epoch": 0.18148902466621408, "grad_norm": 0.5397257208824158, "learning_rate": 0.00017056813473517775, "loss": 0.958, "step": 401 }, { "epoch": 0.18194161575016973, "grad_norm": 0.5134234428405762, "learning_rate": 0.00017042530515867896, "loss": 0.8052, "step": 402 }, { "epoch": 0.18239420683412536, "grad_norm": 0.48100152611732483, "learning_rate": 0.00017028218996841172, "loss": 0.8442, "step": 403 }, { "epoch": 0.182846797918081, "grad_norm": 0.48078837990760803, "learning_rate": 0.00017013878974478776, "loss": 0.715, "step": 404 }, { "epoch": 0.18329938900203666, "grad_norm": 0.4258125424385071, "learning_rate": 0.00016999510506937466, "loss": 0.7623, "step": 405 }, { "epoch": 0.1837519800859923, "grad_norm": 0.5635977983474731, "learning_rate": 0.00016985113652489374, "loss": 0.8729, "step": 406 }, { "epoch": 0.18420457116994796, "grad_norm": 0.4123055338859558, "learning_rate": 0.00016970688469521736, "loss": 0.6492, "step": 407 }, { "epoch": 0.1846571622539036, "grad_norm": 0.5301768183708191, "learning_rate": 0.000169562350165367, "loss": 0.974, "step": 408 }, { "epoch": 0.18510975333785926, "grad_norm": 0.5563939213752747, "learning_rate": 0.00016941753352151055, "loss": 1.0331, "step": 409 }, { "epoch": 0.18556234442181488, "grad_norm": 0.4835430681705475, "learning_rate": 0.00016927243535095997, "loss": 0.5977, "step": 410 }, { "epoch": 0.18601493550577053, "grad_norm": 0.4503084719181061, "learning_rate": 0.000169127056242169, "loss": 0.7688, "step": 411 }, { "epoch": 0.18646752658972618, "grad_norm": 0.461060106754303, "learning_rate": 0.00016898139678473076, "loss": 0.7793, "step": 412 }, { "epoch": 0.18692011767368183, "grad_norm": 0.49685123562812805, "learning_rate": 0.0001688354575693754, "loss": 0.9484, "step": 413 }, { "epoch": 0.18737270875763748, "grad_norm": 0.48521700501441956, "learning_rate": 0.00016868923918796753, "loss": 0.7861, "step": 414 }, { "epoch": 0.18782529984159313, "grad_norm": 0.4642525017261505, "learning_rate": 0.00016854274223350397, "loss": 0.8415, "step": 415 }, { "epoch": 0.18827789092554878, "grad_norm": 0.5125812292098999, "learning_rate": 0.00016839596730011138, "loss": 0.8479, "step": 416 }, { "epoch": 0.1887304820095044, "grad_norm": 0.6952138543128967, "learning_rate": 0.00016824891498304364, "loss": 1.1171, "step": 417 }, { "epoch": 0.18918307309346005, "grad_norm": 0.5572001934051514, "learning_rate": 0.00016810158587867973, "loss": 0.9817, "step": 418 }, { "epoch": 0.1896356641774157, "grad_norm": 0.47780361771583557, "learning_rate": 0.00016795398058452104, "loss": 0.9416, "step": 419 }, { "epoch": 0.19008825526137135, "grad_norm": 0.4838089048862457, "learning_rate": 0.0001678060996991891, "loss": 0.8315, "step": 420 }, { "epoch": 0.190540846345327, "grad_norm": 0.4793652892112732, "learning_rate": 0.00016765794382242314, "loss": 0.908, "step": 421 }, { "epoch": 0.19099343742928265, "grad_norm": 0.4631733000278473, "learning_rate": 0.00016750951355507763, "loss": 0.689, "step": 422 }, { "epoch": 0.19144602851323828, "grad_norm": 0.5332536101341248, "learning_rate": 0.00016736080949911978, "loss": 0.9532, "step": 423 }, { "epoch": 0.19189861959719393, "grad_norm": 0.5441685914993286, "learning_rate": 0.00016721183225762727, "loss": 0.8802, "step": 424 }, { "epoch": 0.19235121068114958, "grad_norm": 0.687839686870575, "learning_rate": 0.00016706258243478562, "loss": 0.771, "step": 425 }, { "epoch": 0.19280380176510523, "grad_norm": 0.5836572647094727, "learning_rate": 0.00016691306063588583, "loss": 1.0452, "step": 426 }, { "epoch": 0.19325639284906088, "grad_norm": 0.5064947605133057, "learning_rate": 0.00016676326746732195, "loss": 0.93, "step": 427 }, { "epoch": 0.19370898393301653, "grad_norm": 0.4416975975036621, "learning_rate": 0.00016661320353658857, "loss": 0.7413, "step": 428 }, { "epoch": 0.19416157501697218, "grad_norm": 0.6051701903343201, "learning_rate": 0.0001664628694522783, "loss": 0.7676, "step": 429 }, { "epoch": 0.1946141661009278, "grad_norm": 0.6898373961448669, "learning_rate": 0.00016631226582407952, "loss": 1.0243, "step": 430 }, { "epoch": 0.19506675718488345, "grad_norm": 0.4763202369213104, "learning_rate": 0.00016616139326277365, "loss": 0.6903, "step": 431 }, { "epoch": 0.1955193482688391, "grad_norm": 0.5373172163963318, "learning_rate": 0.0001660102523802328, "loss": 0.9375, "step": 432 }, { "epoch": 0.19597193935279475, "grad_norm": 0.4901677072048187, "learning_rate": 0.00016585884378941725, "loss": 0.8726, "step": 433 }, { "epoch": 0.1964245304367504, "grad_norm": 0.440965473651886, "learning_rate": 0.0001657071681043731, "loss": 0.7162, "step": 434 }, { "epoch": 0.19687712152070605, "grad_norm": 0.5141696333885193, "learning_rate": 0.0001655552259402295, "loss": 0.7813, "step": 435 }, { "epoch": 0.1973297126046617, "grad_norm": 0.44578734040260315, "learning_rate": 0.00016540301791319645, "loss": 0.7619, "step": 436 }, { "epoch": 0.19778230368861732, "grad_norm": 0.4049105644226074, "learning_rate": 0.00016525054464056212, "loss": 0.6142, "step": 437 }, { "epoch": 0.19823489477257297, "grad_norm": 0.46392935514450073, "learning_rate": 0.0001650978067406904, "loss": 0.8162, "step": 438 }, { "epoch": 0.19868748585652862, "grad_norm": 0.4067855775356293, "learning_rate": 0.00016494480483301836, "loss": 0.7385, "step": 439 }, { "epoch": 0.19914007694048427, "grad_norm": 0.47872698307037354, "learning_rate": 0.0001647915395380539, "loss": 0.7628, "step": 440 }, { "epoch": 0.19959266802443992, "grad_norm": 0.4723743200302124, "learning_rate": 0.0001646380114773729, "loss": 0.8442, "step": 441 }, { "epoch": 0.20004525910839557, "grad_norm": 0.4618031084537506, "learning_rate": 0.00016448422127361706, "loss": 0.9227, "step": 442 }, { "epoch": 0.20049785019235122, "grad_norm": 0.4881635010242462, "learning_rate": 0.00016433016955049115, "loss": 0.8212, "step": 443 }, { "epoch": 0.20095044127630685, "grad_norm": 0.49687886238098145, "learning_rate": 0.00016417585693276058, "loss": 0.9704, "step": 444 }, { "epoch": 0.2014030323602625, "grad_norm": 0.5278611183166504, "learning_rate": 0.00016402128404624882, "loss": 0.9722, "step": 445 }, { "epoch": 0.20185562344421815, "grad_norm": 0.5232154726982117, "learning_rate": 0.0001638664515178348, "loss": 1.0113, "step": 446 }, { "epoch": 0.2023082145281738, "grad_norm": 0.4725476801395416, "learning_rate": 0.00016371135997545058, "loss": 0.7659, "step": 447 }, { "epoch": 0.20276080561212945, "grad_norm": 0.458344966173172, "learning_rate": 0.00016355601004807856, "loss": 0.7465, "step": 448 }, { "epoch": 0.2032133966960851, "grad_norm": 1.0862559080123901, "learning_rate": 0.00016340040236574902, "loss": 1.1804, "step": 449 }, { "epoch": 0.20366598778004075, "grad_norm": 0.496658593416214, "learning_rate": 0.00016324453755953773, "loss": 0.766, "step": 450 }, { "epoch": 0.20411857886399637, "grad_norm": 0.6810061931610107, "learning_rate": 0.00016308841626156307, "loss": 1.0167, "step": 451 }, { "epoch": 0.20457116994795202, "grad_norm": 0.5212746858596802, "learning_rate": 0.00016293203910498376, "loss": 0.7169, "step": 452 }, { "epoch": 0.20502376103190767, "grad_norm": 0.46362701058387756, "learning_rate": 0.00016277540672399608, "loss": 0.8142, "step": 453 }, { "epoch": 0.20547635211586332, "grad_norm": 0.5136375427246094, "learning_rate": 0.00016261851975383137, "loss": 0.844, "step": 454 }, { "epoch": 0.20592894319981897, "grad_norm": 0.47330260276794434, "learning_rate": 0.00016246137883075363, "loss": 0.7709, "step": 455 }, { "epoch": 0.20638153428377462, "grad_norm": 0.6377627849578857, "learning_rate": 0.0001623039845920566, "loss": 0.7982, "step": 456 }, { "epoch": 0.20683412536773024, "grad_norm": 0.6286625266075134, "learning_rate": 0.00016214633767606143, "loss": 0.8048, "step": 457 }, { "epoch": 0.2072867164516859, "grad_norm": 0.4780018925666809, "learning_rate": 0.00016198843872211404, "loss": 0.8316, "step": 458 }, { "epoch": 0.20773930753564154, "grad_norm": 0.5281711220741272, "learning_rate": 0.00016183028837058247, "loss": 0.9401, "step": 459 }, { "epoch": 0.2081918986195972, "grad_norm": 0.46986955404281616, "learning_rate": 0.00016167188726285434, "loss": 0.728, "step": 460 }, { "epoch": 0.20864448970355284, "grad_norm": 0.4838784635066986, "learning_rate": 0.00016151323604133414, "loss": 0.7384, "step": 461 }, { "epoch": 0.2090970807875085, "grad_norm": 0.441522479057312, "learning_rate": 0.0001613543353494409, "loss": 0.738, "step": 462 }, { "epoch": 0.20954967187146414, "grad_norm": 0.5253183841705322, "learning_rate": 0.0001611951858316052, "loss": 0.8767, "step": 463 }, { "epoch": 0.21000226295541977, "grad_norm": 0.542733371257782, "learning_rate": 0.00016103578813326684, "loss": 0.9233, "step": 464 }, { "epoch": 0.21045485403937542, "grad_norm": 0.548744261264801, "learning_rate": 0.00016087614290087208, "loss": 0.9597, "step": 465 }, { "epoch": 0.21090744512333107, "grad_norm": 0.4585428833961487, "learning_rate": 0.00016071625078187114, "loss": 0.7097, "step": 466 }, { "epoch": 0.21136003620728672, "grad_norm": 0.4707442820072174, "learning_rate": 0.0001605561124247154, "loss": 0.6851, "step": 467 }, { "epoch": 0.21181262729124237, "grad_norm": 0.44789552688598633, "learning_rate": 0.00016039572847885504, "loss": 0.65, "step": 468 }, { "epoch": 0.21226521837519802, "grad_norm": 0.4923850893974304, "learning_rate": 0.00016023509959473605, "loss": 0.9482, "step": 469 }, { "epoch": 0.21271780945915367, "grad_norm": 0.5251060724258423, "learning_rate": 0.0001600742264237979, "loss": 0.865, "step": 470 }, { "epoch": 0.2131704005431093, "grad_norm": 0.43079182505607605, "learning_rate": 0.0001599131096184707, "loss": 0.6448, "step": 471 }, { "epoch": 0.21362299162706494, "grad_norm": 0.4741304814815521, "learning_rate": 0.00015975174983217275, "loss": 0.7992, "step": 472 }, { "epoch": 0.2140755827110206, "grad_norm": 0.5322284698486328, "learning_rate": 0.0001595901477193076, "loss": 0.7397, "step": 473 }, { "epoch": 0.21452817379497624, "grad_norm": 0.490506112575531, "learning_rate": 0.00015942830393526176, "loss": 0.8015, "step": 474 }, { "epoch": 0.2149807648789319, "grad_norm": 0.5263440012931824, "learning_rate": 0.0001592662191364017, "loss": 0.9122, "step": 475 }, { "epoch": 0.21543335596288754, "grad_norm": 0.5945788621902466, "learning_rate": 0.0001591038939800714, "loss": 1.0784, "step": 476 }, { "epoch": 0.2158859470468432, "grad_norm": 0.510407030582428, "learning_rate": 0.00015894132912458968, "loss": 1.0042, "step": 477 }, { "epoch": 0.2163385381307988, "grad_norm": 0.4436274766921997, "learning_rate": 0.00015877852522924732, "loss": 0.7253, "step": 478 }, { "epoch": 0.21679112921475446, "grad_norm": 0.45981746912002563, "learning_rate": 0.0001586154829543047, "loss": 0.8309, "step": 479 }, { "epoch": 0.2172437202987101, "grad_norm": 1.0382171869277954, "learning_rate": 0.0001584522029609889, "loss": 0.9175, "step": 480 }, { "epoch": 0.21769631138266576, "grad_norm": 0.4632301330566406, "learning_rate": 0.00015828868591149104, "loss": 0.771, "step": 481 }, { "epoch": 0.2181489024666214, "grad_norm": 0.49221765995025635, "learning_rate": 0.00015812493246896366, "loss": 0.7167, "step": 482 }, { "epoch": 0.21860149355057706, "grad_norm": 0.5978690385818481, "learning_rate": 0.00015796094329751807, "loss": 0.9947, "step": 483 }, { "epoch": 0.2190540846345327, "grad_norm": 0.48670709133148193, "learning_rate": 0.0001577967190622215, "loss": 0.935, "step": 484 }, { "epoch": 0.21950667571848834, "grad_norm": 0.48809000849723816, "learning_rate": 0.00015763226042909455, "loss": 0.9351, "step": 485 }, { "epoch": 0.219959266802444, "grad_norm": 0.6401401162147522, "learning_rate": 0.00015746756806510838, "loss": 1.2269, "step": 486 }, { "epoch": 0.22041185788639964, "grad_norm": 0.6913933753967285, "learning_rate": 0.00015730264263818212, "loss": 0.972, "step": 487 }, { "epoch": 0.2208644489703553, "grad_norm": 0.44599664211273193, "learning_rate": 0.0001571374848171801, "loss": 0.8677, "step": 488 }, { "epoch": 0.22131704005431094, "grad_norm": 1.1593374013900757, "learning_rate": 0.00015697209527190906, "loss": 0.7672, "step": 489 }, { "epoch": 0.2217696311382666, "grad_norm": 0.47981521487236023, "learning_rate": 0.00015680647467311557, "loss": 0.8651, "step": 490 }, { "epoch": 0.2222222222222222, "grad_norm": 0.44539451599121094, "learning_rate": 0.00015664062369248328, "loss": 0.7102, "step": 491 }, { "epoch": 0.22267481330617786, "grad_norm": 0.6086403131484985, "learning_rate": 0.00015647454300263012, "loss": 0.9978, "step": 492 }, { "epoch": 0.2231274043901335, "grad_norm": 0.512725293636322, "learning_rate": 0.00015630823327710558, "loss": 0.846, "step": 493 }, { "epoch": 0.22357999547408916, "grad_norm": 0.5239939093589783, "learning_rate": 0.0001561416951903881, "loss": 1.01, "step": 494 }, { "epoch": 0.2240325865580448, "grad_norm": 0.4727892279624939, "learning_rate": 0.00015597492941788222, "loss": 0.7412, "step": 495 }, { "epoch": 0.22448517764200046, "grad_norm": 0.576046347618103, "learning_rate": 0.00015580793663591585, "loss": 0.9968, "step": 496 }, { "epoch": 0.2249377687259561, "grad_norm": 0.4874511957168579, "learning_rate": 0.00015564071752173758, "loss": 0.7964, "step": 497 }, { "epoch": 0.22539035980991173, "grad_norm": 0.3970421552658081, "learning_rate": 0.0001554732727535139, "loss": 0.5187, "step": 498 }, { "epoch": 0.22584295089386738, "grad_norm": 0.430368036031723, "learning_rate": 0.0001553056030103264, "loss": 0.6658, "step": 499 }, { "epoch": 0.22629554197782303, "grad_norm": 0.5428101420402527, "learning_rate": 0.00015513770897216918, "loss": 1.1194, "step": 500 }, { "epoch": 0.22674813306177868, "grad_norm": 0.6447327136993408, "learning_rate": 0.00015496959131994586, "loss": 1.0849, "step": 501 }, { "epoch": 0.22720072414573433, "grad_norm": 0.4934568405151367, "learning_rate": 0.00015480125073546704, "loss": 0.8384, "step": 502 }, { "epoch": 0.22765331522968998, "grad_norm": 0.5266446471214294, "learning_rate": 0.0001546326879014473, "loss": 1.0149, "step": 503 }, { "epoch": 0.22810590631364563, "grad_norm": 0.5469086766242981, "learning_rate": 0.00015446390350150273, "loss": 0.9152, "step": 504 }, { "epoch": 0.22855849739760126, "grad_norm": 0.4938475787639618, "learning_rate": 0.0001542948982201479, "loss": 0.8845, "step": 505 }, { "epoch": 0.2290110884815569, "grad_norm": 0.40693241357803345, "learning_rate": 0.00015412567274279316, "loss": 0.6758, "step": 506 }, { "epoch": 0.22946367956551256, "grad_norm": 0.46620872616767883, "learning_rate": 0.00015395622775574193, "loss": 0.7524, "step": 507 }, { "epoch": 0.2299162706494682, "grad_norm": 0.4872758984565735, "learning_rate": 0.00015378656394618787, "loss": 0.8768, "step": 508 }, { "epoch": 0.23036886173342386, "grad_norm": 0.5335472226142883, "learning_rate": 0.00015361668200221203, "loss": 0.9473, "step": 509 }, { "epoch": 0.2308214528173795, "grad_norm": 0.5027713179588318, "learning_rate": 0.0001534465826127801, "loss": 0.9785, "step": 510 }, { "epoch": 0.23127404390133513, "grad_norm": 0.4740453064441681, "learning_rate": 0.00015327626646773976, "loss": 0.8903, "step": 511 }, { "epoch": 0.23172663498529078, "grad_norm": 0.5740635395050049, "learning_rate": 0.00015310573425781767, "loss": 0.9505, "step": 512 }, { "epoch": 0.23217922606924643, "grad_norm": 0.5359417796134949, "learning_rate": 0.0001529349866746167, "loss": 0.7476, "step": 513 }, { "epoch": 0.23263181715320208, "grad_norm": 0.37038734555244446, "learning_rate": 0.0001527640244106133, "loss": 0.5349, "step": 514 }, { "epoch": 0.23308440823715773, "grad_norm": 0.43701088428497314, "learning_rate": 0.0001525928481591544, "loss": 0.4619, "step": 515 }, { "epoch": 0.23353699932111338, "grad_norm": 0.5714133977890015, "learning_rate": 0.00015242145861445498, "loss": 1.015, "step": 516 }, { "epoch": 0.23398959040506903, "grad_norm": 0.4387613832950592, "learning_rate": 0.0001522498564715949, "loss": 0.6386, "step": 517 }, { "epoch": 0.23444218148902465, "grad_norm": 0.5844204425811768, "learning_rate": 0.00015207804242651626, "loss": 0.8942, "step": 518 }, { "epoch": 0.2348947725729803, "grad_norm": 0.5000221133232117, "learning_rate": 0.0001519060171760205, "loss": 0.8325, "step": 519 }, { "epoch": 0.23534736365693595, "grad_norm": 0.5425203442573547, "learning_rate": 0.00015173378141776568, "loss": 1.0931, "step": 520 }, { "epoch": 0.2357999547408916, "grad_norm": 0.5176980495452881, "learning_rate": 0.00015156133585026357, "loss": 0.9856, "step": 521 }, { "epoch": 0.23625254582484725, "grad_norm": 0.42920199036598206, "learning_rate": 0.0001513886811728769, "loss": 0.6077, "step": 522 }, { "epoch": 0.2367051369088029, "grad_norm": 0.4761313498020172, "learning_rate": 0.00015121581808581622, "loss": 0.7548, "step": 523 }, { "epoch": 0.23715772799275855, "grad_norm": 0.5511419773101807, "learning_rate": 0.0001510427472901376, "loss": 0.9048, "step": 524 }, { "epoch": 0.23761031907671418, "grad_norm": 0.5572141408920288, "learning_rate": 0.00015086946948773922, "loss": 0.9789, "step": 525 }, { "epoch": 0.23806291016066983, "grad_norm": 0.4818233847618103, "learning_rate": 0.00015069598538135906, "loss": 0.9283, "step": 526 }, { "epoch": 0.23851550124462548, "grad_norm": 0.5013587474822998, "learning_rate": 0.0001505222956745715, "loss": 0.9243, "step": 527 }, { "epoch": 0.23896809232858113, "grad_norm": 0.4967738389968872, "learning_rate": 0.0001503484010717849, "loss": 0.8075, "step": 528 }, { "epoch": 0.23942068341253678, "grad_norm": 0.48400700092315674, "learning_rate": 0.00015017430227823864, "loss": 0.784, "step": 529 }, { "epoch": 0.23987327449649243, "grad_norm": 0.4794735908508301, "learning_rate": 0.00015000000000000001, "loss": 0.7469, "step": 530 }, { "epoch": 0.24032586558044808, "grad_norm": 0.5944988131523132, "learning_rate": 0.0001498254949439617, "loss": 1.2721, "step": 531 }, { "epoch": 0.2407784566644037, "grad_norm": 0.5433883666992188, "learning_rate": 0.0001496507878178388, "loss": 0.9277, "step": 532 }, { "epoch": 0.24123104774835935, "grad_norm": 0.5507298707962036, "learning_rate": 0.00014947587933016577, "loss": 0.94, "step": 533 }, { "epoch": 0.241683638832315, "grad_norm": 0.6457071900367737, "learning_rate": 0.00014930077019029375, "loss": 1.1681, "step": 534 }, { "epoch": 0.24213622991627065, "grad_norm": 0.5488935112953186, "learning_rate": 0.00014912546110838775, "loss": 0.9509, "step": 535 }, { "epoch": 0.2425888210002263, "grad_norm": 0.5333803296089172, "learning_rate": 0.0001489499527954235, "loss": 0.9315, "step": 536 }, { "epoch": 0.24304141208418195, "grad_norm": 0.4848494231700897, "learning_rate": 0.00014877424596318483, "loss": 0.7865, "step": 537 }, { "epoch": 0.24349400316813757, "grad_norm": 0.5007054209709167, "learning_rate": 0.0001485983413242606, "loss": 0.9231, "step": 538 }, { "epoch": 0.24394659425209322, "grad_norm": 0.5695723295211792, "learning_rate": 0.00014842223959204198, "loss": 0.9934, "step": 539 }, { "epoch": 0.24439918533604887, "grad_norm": 0.4827041029930115, "learning_rate": 0.00014824594148071934, "loss": 0.8522, "step": 540 }, { "epoch": 0.24485177642000452, "grad_norm": 0.5187906622886658, "learning_rate": 0.00014806944770527958, "loss": 0.9672, "step": 541 }, { "epoch": 0.24530436750396017, "grad_norm": 0.712968111038208, "learning_rate": 0.00014789275898150308, "loss": 0.9316, "step": 542 }, { "epoch": 0.24575695858791582, "grad_norm": 0.6967286467552185, "learning_rate": 0.00014771587602596084, "loss": 0.9736, "step": 543 }, { "epoch": 0.24620954967187147, "grad_norm": 0.3656039535999298, "learning_rate": 0.00014753879955601163, "loss": 0.4932, "step": 544 }, { "epoch": 0.2466621407558271, "grad_norm": 0.864976704120636, "learning_rate": 0.00014736153028979893, "loss": 1.0662, "step": 545 }, { "epoch": 0.24711473183978275, "grad_norm": 0.4753805994987488, "learning_rate": 0.0001471840689462482, "loss": 0.8889, "step": 546 }, { "epoch": 0.2475673229237384, "grad_norm": 0.4921295642852783, "learning_rate": 0.00014700641624506392, "loss": 0.8106, "step": 547 }, { "epoch": 0.24801991400769405, "grad_norm": 0.7320112586021423, "learning_rate": 0.00014682857290672648, "loss": 0.845, "step": 548 }, { "epoch": 0.2484725050916497, "grad_norm": 0.4472196698188782, "learning_rate": 0.0001466505396524895, "loss": 0.8076, "step": 549 }, { "epoch": 0.24892509617560535, "grad_norm": 0.43544888496398926, "learning_rate": 0.00014647231720437686, "loss": 0.8158, "step": 550 }, { "epoch": 0.249377687259561, "grad_norm": 0.39511433243751526, "learning_rate": 0.00014629390628517964, "loss": 0.6934, "step": 551 }, { "epoch": 0.24983027834351662, "grad_norm": 0.44961658120155334, "learning_rate": 0.00014611530761845335, "loss": 0.7865, "step": 552 }, { "epoch": 0.2502828694274723, "grad_norm": 0.4339733421802521, "learning_rate": 0.00014593652192851486, "loss": 0.5448, "step": 553 }, { "epoch": 0.2507354605114279, "grad_norm": 0.4500328600406647, "learning_rate": 0.00014575754994043956, "loss": 0.8066, "step": 554 }, { "epoch": 0.25118805159538354, "grad_norm": 0.5119399428367615, "learning_rate": 0.00014557839238005832, "loss": 1.0859, "step": 555 }, { "epoch": 0.2516406426793392, "grad_norm": 0.5100839138031006, "learning_rate": 0.00014539904997395468, "loss": 0.9779, "step": 556 }, { "epoch": 0.25209323376329484, "grad_norm": 0.5187854766845703, "learning_rate": 0.00014521952344946187, "loss": 1.0203, "step": 557 }, { "epoch": 0.2525458248472505, "grad_norm": 0.45579731464385986, "learning_rate": 0.0001450398135346597, "loss": 0.7401, "step": 558 }, { "epoch": 0.25299841593120614, "grad_norm": 0.47840386629104614, "learning_rate": 0.00014485992095837177, "loss": 0.78, "step": 559 }, { "epoch": 0.2534510070151618, "grad_norm": 0.49256467819213867, "learning_rate": 0.00014467984645016258, "loss": 0.9524, "step": 560 }, { "epoch": 0.25390359809911744, "grad_norm": 0.3887495696544647, "learning_rate": 0.00014449959074033434, "loss": 0.6893, "step": 561 }, { "epoch": 0.25435618918307307, "grad_norm": 0.46293124556541443, "learning_rate": 0.00014431915455992414, "loss": 0.7996, "step": 562 }, { "epoch": 0.25480878026702874, "grad_norm": 0.5254295468330383, "learning_rate": 0.00014413853864070103, "loss": 0.9435, "step": 563 }, { "epoch": 0.25526137135098437, "grad_norm": 0.4905354380607605, "learning_rate": 0.00014395774371516305, "loss": 0.7951, "step": 564 }, { "epoch": 0.25571396243494005, "grad_norm": 0.47427988052368164, "learning_rate": 0.00014377677051653404, "loss": 0.7587, "step": 565 }, { "epoch": 0.25616655351889567, "grad_norm": 0.5358110070228577, "learning_rate": 0.00014359561977876102, "loss": 0.9288, "step": 566 }, { "epoch": 0.25661914460285135, "grad_norm": 0.4997360110282898, "learning_rate": 0.00014341429223651092, "loss": 0.9648, "step": 567 }, { "epoch": 0.25707173568680697, "grad_norm": 0.4709185063838959, "learning_rate": 0.00014323278862516775, "loss": 0.8978, "step": 568 }, { "epoch": 0.2575243267707626, "grad_norm": 0.6130642890930176, "learning_rate": 0.00014305110968082952, "loss": 1.2076, "step": 569 }, { "epoch": 0.25797691785471827, "grad_norm": 0.5134180188179016, "learning_rate": 0.00014286925614030542, "loss": 0.8747, "step": 570 }, { "epoch": 0.2584295089386739, "grad_norm": 0.5592136383056641, "learning_rate": 0.00014268722874111265, "loss": 1.0155, "step": 571 }, { "epoch": 0.25888210002262957, "grad_norm": 0.41666361689567566, "learning_rate": 0.00014250502822147354, "loss": 0.7715, "step": 572 }, { "epoch": 0.2593346911065852, "grad_norm": 0.45632311701774597, "learning_rate": 0.00014232265532031243, "loss": 0.7487, "step": 573 }, { "epoch": 0.25978728219054087, "grad_norm": 0.5512887239456177, "learning_rate": 0.00014214011077725292, "loss": 0.9464, "step": 574 }, { "epoch": 0.2602398732744965, "grad_norm": 0.47320300340652466, "learning_rate": 0.00014195739533261458, "loss": 0.9381, "step": 575 }, { "epoch": 0.2606924643584521, "grad_norm": 0.5143305659294128, "learning_rate": 0.0001417745097274101, "loss": 0.7783, "step": 576 }, { "epoch": 0.2611450554424078, "grad_norm": 0.4237568974494934, "learning_rate": 0.00014159145470334235, "loss": 0.5311, "step": 577 }, { "epoch": 0.2615976465263634, "grad_norm": 0.5218071937561035, "learning_rate": 0.0001414082310028012, "loss": 0.9782, "step": 578 }, { "epoch": 0.2620502376103191, "grad_norm": 0.411541223526001, "learning_rate": 0.00014122483936886067, "loss": 0.6654, "step": 579 }, { "epoch": 0.2625028286942747, "grad_norm": 0.508083701133728, "learning_rate": 0.0001410412805452757, "loss": 1.0775, "step": 580 }, { "epoch": 0.2629554197782304, "grad_norm": 0.43752411007881165, "learning_rate": 0.00014085755527647946, "loss": 0.6292, "step": 581 }, { "epoch": 0.263408010862186, "grad_norm": 0.591759443283081, "learning_rate": 0.00014067366430758004, "loss": 1.0097, "step": 582 }, { "epoch": 0.26386060194614164, "grad_norm": 0.411795973777771, "learning_rate": 0.00014048960838435753, "loss": 0.5143, "step": 583 }, { "epoch": 0.2643131930300973, "grad_norm": 0.4778439402580261, "learning_rate": 0.00014030538825326113, "loss": 0.6781, "step": 584 }, { "epoch": 0.26476578411405294, "grad_norm": 0.5669144988059998, "learning_rate": 0.00014012100466140578, "loss": 0.9513, "step": 585 }, { "epoch": 0.2652183751980086, "grad_norm": 0.5837613344192505, "learning_rate": 0.00013993645835656953, "loss": 1.0309, "step": 586 }, { "epoch": 0.26567096628196424, "grad_norm": 0.42293286323547363, "learning_rate": 0.00013975175008719027, "loss": 0.6944, "step": 587 }, { "epoch": 0.2661235573659199, "grad_norm": 0.5234153866767883, "learning_rate": 0.00013956688060236266, "loss": 1.1786, "step": 588 }, { "epoch": 0.26657614844987554, "grad_norm": 0.4235227108001709, "learning_rate": 0.00013938185065183532, "loss": 0.7063, "step": 589 }, { "epoch": 0.26702873953383116, "grad_norm": 0.5143097043037415, "learning_rate": 0.00013919666098600753, "loss": 0.8122, "step": 590 }, { "epoch": 0.26748133061778684, "grad_norm": 0.5661123991012573, "learning_rate": 0.00013901131235592634, "loss": 0.9807, "step": 591 }, { "epoch": 0.26793392170174246, "grad_norm": 0.47588860988616943, "learning_rate": 0.0001388258055132835, "loss": 0.7905, "step": 592 }, { "epoch": 0.26838651278569814, "grad_norm": 0.43809354305267334, "learning_rate": 0.0001386401412104124, "loss": 0.6649, "step": 593 }, { "epoch": 0.26883910386965376, "grad_norm": 0.4953427314758301, "learning_rate": 0.0001384543202002851, "loss": 0.9997, "step": 594 }, { "epoch": 0.26929169495360944, "grad_norm": 0.5393937826156616, "learning_rate": 0.000138268343236509, "loss": 1.0959, "step": 595 }, { "epoch": 0.26974428603756506, "grad_norm": 0.4869440495967865, "learning_rate": 0.0001380822110733241, "loss": 0.7566, "step": 596 }, { "epoch": 0.2701968771215207, "grad_norm": 0.5863755345344543, "learning_rate": 0.00013789592446559988, "loss": 1.073, "step": 597 }, { "epoch": 0.27064946820547636, "grad_norm": 0.46900475025177, "learning_rate": 0.00013770948416883205, "loss": 0.8824, "step": 598 }, { "epoch": 0.271102059289432, "grad_norm": 0.5337939262390137, "learning_rate": 0.0001375228909391397, "loss": 1.1455, "step": 599 }, { "epoch": 0.27155465037338766, "grad_norm": 0.6933276653289795, "learning_rate": 0.00013733614553326212, "loss": 0.9099, "step": 600 }, { "epoch": 0.2720072414573433, "grad_norm": 0.41781508922576904, "learning_rate": 0.00013714924870855571, "loss": 0.569, "step": 601 }, { "epoch": 0.27245983254129896, "grad_norm": 0.47960004210472107, "learning_rate": 0.00013696220122299112, "loss": 0.9258, "step": 602 }, { "epoch": 0.2729124236252546, "grad_norm": 0.4569396674633026, "learning_rate": 0.00013677500383514976, "loss": 0.7346, "step": 603 }, { "epoch": 0.2733650147092102, "grad_norm": 0.48138466477394104, "learning_rate": 0.00013658765730422125, "loss": 0.865, "step": 604 }, { "epoch": 0.2738176057931659, "grad_norm": 0.5245850682258606, "learning_rate": 0.00013640016238999984, "loss": 0.9676, "step": 605 }, { "epoch": 0.2742701968771215, "grad_norm": 0.6183745861053467, "learning_rate": 0.0001362125198528817, "loss": 1.0605, "step": 606 }, { "epoch": 0.2747227879610772, "grad_norm": 0.4640631079673767, "learning_rate": 0.00013602473045386165, "loss": 0.9091, "step": 607 }, { "epoch": 0.2751753790450328, "grad_norm": 0.4996570646762848, "learning_rate": 0.00013583679495453, "loss": 0.8421, "step": 608 }, { "epoch": 0.2756279701289885, "grad_norm": 0.5220595002174377, "learning_rate": 0.0001356487141170699, "loss": 0.5935, "step": 609 }, { "epoch": 0.2760805612129441, "grad_norm": 0.46406427025794983, "learning_rate": 0.00013546048870425356, "loss": 0.9066, "step": 610 }, { "epoch": 0.27653315229689973, "grad_norm": 0.4911267161369324, "learning_rate": 0.00013527211947943974, "loss": 0.8333, "step": 611 }, { "epoch": 0.2769857433808554, "grad_norm": 0.516521692276001, "learning_rate": 0.00013508360720657038, "loss": 0.808, "step": 612 }, { "epoch": 0.27743833446481103, "grad_norm": 0.5458274483680725, "learning_rate": 0.0001348949526501675, "loss": 1.0222, "step": 613 }, { "epoch": 0.2778909255487667, "grad_norm": 0.54526686668396, "learning_rate": 0.0001347061565753303, "loss": 0.7527, "step": 614 }, { "epoch": 0.27834351663272233, "grad_norm": 0.4712691307067871, "learning_rate": 0.0001345172197477318, "loss": 0.7782, "step": 615 }, { "epoch": 0.27879610771667795, "grad_norm": 0.44933852553367615, "learning_rate": 0.00013432814293361584, "loss": 0.7569, "step": 616 }, { "epoch": 0.27924869880063363, "grad_norm": 0.5856960415840149, "learning_rate": 0.00013413892689979407, "loss": 1.2136, "step": 617 }, { "epoch": 0.27970128988458925, "grad_norm": 0.5791141390800476, "learning_rate": 0.00013394957241364273, "loss": 1.1147, "step": 618 }, { "epoch": 0.28015388096854493, "grad_norm": 0.5399816036224365, "learning_rate": 0.00013376008024309948, "loss": 0.9661, "step": 619 }, { "epoch": 0.28060647205250056, "grad_norm": 0.4976983666419983, "learning_rate": 0.0001335704511566605, "loss": 0.8953, "step": 620 }, { "epoch": 0.28105906313645623, "grad_norm": 0.5449846386909485, "learning_rate": 0.0001333806859233771, "loss": 0.9998, "step": 621 }, { "epoch": 0.28151165422041186, "grad_norm": 0.4902433156967163, "learning_rate": 0.00013319078531285285, "loss": 0.8005, "step": 622 }, { "epoch": 0.2819642453043675, "grad_norm": 0.5565148591995239, "learning_rate": 0.0001330007500952403, "loss": 0.8355, "step": 623 }, { "epoch": 0.28241683638832316, "grad_norm": 0.5662787556648254, "learning_rate": 0.00013281058104123793, "loss": 0.9444, "step": 624 }, { "epoch": 0.2828694274722788, "grad_norm": 0.42661720514297485, "learning_rate": 0.00013262027892208694, "loss": 0.6726, "step": 625 }, { "epoch": 0.28332201855623446, "grad_norm": 1.0109385251998901, "learning_rate": 0.00013242984450956828, "loss": 0.882, "step": 626 }, { "epoch": 0.2837746096401901, "grad_norm": 0.4351411759853363, "learning_rate": 0.00013223927857599935, "loss": 0.7719, "step": 627 }, { "epoch": 0.28422720072414576, "grad_norm": 0.5186506509780884, "learning_rate": 0.00013204858189423097, "loss": 0.7782, "step": 628 }, { "epoch": 0.2846797918081014, "grad_norm": 0.7090187668800354, "learning_rate": 0.00013185775523764424, "loss": 0.9125, "step": 629 }, { "epoch": 0.285132382892057, "grad_norm": 0.5328108072280884, "learning_rate": 0.00013166679938014726, "loss": 0.9438, "step": 630 }, { "epoch": 0.2855849739760127, "grad_norm": 0.4744925796985626, "learning_rate": 0.00013147571509617228, "loss": 0.8791, "step": 631 }, { "epoch": 0.2860375650599683, "grad_norm": 0.5229085683822632, "learning_rate": 0.00013128450316067232, "loss": 0.881, "step": 632 }, { "epoch": 0.286490156143924, "grad_norm": 0.5308219790458679, "learning_rate": 0.00013109316434911804, "loss": 0.9303, "step": 633 }, { "epoch": 0.2869427472278796, "grad_norm": 0.5029545426368713, "learning_rate": 0.00013090169943749476, "loss": 0.9326, "step": 634 }, { "epoch": 0.2873953383118353, "grad_norm": 0.5018476843833923, "learning_rate": 0.00013071010920229909, "loss": 0.8697, "step": 635 }, { "epoch": 0.2878479293957909, "grad_norm": 0.5224341154098511, "learning_rate": 0.00013051839442053599, "loss": 1.0156, "step": 636 }, { "epoch": 0.2883005204797465, "grad_norm": 0.3911929726600647, "learning_rate": 0.00013032655586971552, "loss": 0.6028, "step": 637 }, { "epoch": 0.2887531115637022, "grad_norm": 0.6230449676513672, "learning_rate": 0.00013013459432784961, "loss": 1.3624, "step": 638 }, { "epoch": 0.2892057026476578, "grad_norm": 0.482842355966568, "learning_rate": 0.00012994251057344905, "loss": 0.8734, "step": 639 }, { "epoch": 0.2896582937316135, "grad_norm": 0.48907703161239624, "learning_rate": 0.00012975030538552032, "loss": 0.8193, "step": 640 }, { "epoch": 0.2901108848155691, "grad_norm": 0.5242980718612671, "learning_rate": 0.00012955797954356224, "loss": 0.8471, "step": 641 }, { "epoch": 0.2905634758995248, "grad_norm": 0.4441068172454834, "learning_rate": 0.0001293655338275631, "loss": 0.693, "step": 642 }, { "epoch": 0.2910160669834804, "grad_norm": 0.4508345127105713, "learning_rate": 0.0001291729690179972, "loss": 0.6663, "step": 643 }, { "epoch": 0.29146865806743605, "grad_norm": 0.4511308968067169, "learning_rate": 0.00012898028589582202, "loss": 0.8183, "step": 644 }, { "epoch": 0.2919212491513917, "grad_norm": 0.5768519639968872, "learning_rate": 0.00012878748524247462, "loss": 0.7828, "step": 645 }, { "epoch": 0.29237384023534735, "grad_norm": 0.4653044044971466, "learning_rate": 0.00012859456783986893, "loss": 0.6586, "step": 646 }, { "epoch": 0.292826431319303, "grad_norm": 0.5396013855934143, "learning_rate": 0.00012840153447039228, "loss": 0.8975, "step": 647 }, { "epoch": 0.29327902240325865, "grad_norm": 0.4867834746837616, "learning_rate": 0.00012820838591690227, "loss": 0.8606, "step": 648 }, { "epoch": 0.2937316134872143, "grad_norm": 0.4207919239997864, "learning_rate": 0.00012801512296272368, "loss": 0.6362, "step": 649 }, { "epoch": 0.29418420457116995, "grad_norm": 0.6996460556983948, "learning_rate": 0.0001278217463916453, "loss": 0.7404, "step": 650 }, { "epoch": 0.29463679565512557, "grad_norm": 0.5210298299789429, "learning_rate": 0.00012762825698791653, "loss": 0.7207, "step": 651 }, { "epoch": 0.29508938673908125, "grad_norm": 0.440009742975235, "learning_rate": 0.0001274346555362446, "loss": 0.6941, "step": 652 }, { "epoch": 0.29554197782303687, "grad_norm": 0.5195272564888, "learning_rate": 0.00012724094282179094, "loss": 0.7714, "step": 653 }, { "epoch": 0.29599456890699255, "grad_norm": 0.5177645087242126, "learning_rate": 0.0001270471196301684, "loss": 0.8669, "step": 654 }, { "epoch": 0.2964471599909482, "grad_norm": 0.46505850553512573, "learning_rate": 0.0001268531867474377, "loss": 0.914, "step": 655 }, { "epoch": 0.29689975107490385, "grad_norm": 0.578664243221283, "learning_rate": 0.00012665914496010454, "loss": 1.2505, "step": 656 }, { "epoch": 0.2973523421588595, "grad_norm": 0.4832736551761627, "learning_rate": 0.00012646499505511625, "loss": 0.8235, "step": 657 }, { "epoch": 0.2978049332428151, "grad_norm": 0.5135257244110107, "learning_rate": 0.0001262707378198587, "loss": 0.8036, "step": 658 }, { "epoch": 0.2982575243267708, "grad_norm": 0.4872123599052429, "learning_rate": 0.0001260763740421529, "loss": 0.8361, "step": 659 }, { "epoch": 0.2987101154107264, "grad_norm": 0.5487096905708313, "learning_rate": 0.00012588190451025207, "loss": 1.1053, "step": 660 }, { "epoch": 0.2991627064946821, "grad_norm": 0.5271202921867371, "learning_rate": 0.00012568733001283827, "loss": 0.775, "step": 661 }, { "epoch": 0.2996152975786377, "grad_norm": 0.49594709277153015, "learning_rate": 0.00012549265133901934, "loss": 0.747, "step": 662 }, { "epoch": 0.3000678886625934, "grad_norm": 0.5582691431045532, "learning_rate": 0.00012529786927832542, "loss": 0.8814, "step": 663 }, { "epoch": 0.300520479746549, "grad_norm": 0.5153692960739136, "learning_rate": 0.00012510298462070619, "loss": 0.9651, "step": 664 }, { "epoch": 0.3009730708305046, "grad_norm": 0.5268899202346802, "learning_rate": 0.0001249079981565272, "loss": 0.6635, "step": 665 }, { "epoch": 0.3014256619144603, "grad_norm": 0.5442325472831726, "learning_rate": 0.00012471291067656697, "loss": 0.9908, "step": 666 }, { "epoch": 0.3018782529984159, "grad_norm": 0.4943901300430298, "learning_rate": 0.00012451772297201376, "loss": 0.7379, "step": 667 }, { "epoch": 0.3023308440823716, "grad_norm": 0.49773284792900085, "learning_rate": 0.0001243224358344622, "loss": 0.968, "step": 668 }, { "epoch": 0.3027834351663272, "grad_norm": 0.4361281991004944, "learning_rate": 0.00012412705005591024, "loss": 0.8096, "step": 669 }, { "epoch": 0.30323602625028284, "grad_norm": 0.47975584864616394, "learning_rate": 0.0001239315664287558, "loss": 0.7846, "step": 670 }, { "epoch": 0.3036886173342385, "grad_norm": 0.5244975090026855, "learning_rate": 0.00012373598574579367, "loss": 0.9548, "step": 671 }, { "epoch": 0.30414120841819414, "grad_norm": 0.5144295692443848, "learning_rate": 0.00012354030880021234, "loss": 0.7269, "step": 672 }, { "epoch": 0.3045937995021498, "grad_norm": 0.5258602499961853, "learning_rate": 0.00012334453638559057, "loss": 0.9435, "step": 673 }, { "epoch": 0.30504639058610544, "grad_norm": 0.5024265050888062, "learning_rate": 0.00012314866929589432, "loss": 0.8495, "step": 674 }, { "epoch": 0.3054989816700611, "grad_norm": 0.4053838849067688, "learning_rate": 0.00012295270832547356, "loss": 0.4879, "step": 675 }, { "epoch": 0.30595157275401674, "grad_norm": 0.5322363972663879, "learning_rate": 0.000122756654269059, "loss": 0.7895, "step": 676 }, { "epoch": 0.30640416383797237, "grad_norm": 0.5888680815696716, "learning_rate": 0.00012256050792175887, "loss": 1.1324, "step": 677 }, { "epoch": 0.30685675492192804, "grad_norm": 0.5343974232673645, "learning_rate": 0.00012236427007905558, "loss": 0.9867, "step": 678 }, { "epoch": 0.30730934600588367, "grad_norm": 0.4725770950317383, "learning_rate": 0.00012216794153680274, "loss": 0.8123, "step": 679 }, { "epoch": 0.30776193708983934, "grad_norm": 0.4887194335460663, "learning_rate": 0.00012197152309122173, "loss": 0.9044, "step": 680 }, { "epoch": 0.30821452817379497, "grad_norm": 0.4865034818649292, "learning_rate": 0.00012177501553889856, "loss": 0.7338, "step": 681 }, { "epoch": 0.30866711925775064, "grad_norm": 0.472418874502182, "learning_rate": 0.00012157841967678063, "loss": 0.7923, "step": 682 }, { "epoch": 0.30911971034170627, "grad_norm": 0.48899978399276733, "learning_rate": 0.00012138173630217342, "loss": 0.8421, "step": 683 }, { "epoch": 0.3095723014256619, "grad_norm": 0.4492488503456116, "learning_rate": 0.00012118496621273745, "loss": 0.6645, "step": 684 }, { "epoch": 0.31002489250961757, "grad_norm": 0.40700769424438477, "learning_rate": 0.00012098811020648475, "loss": 0.5403, "step": 685 }, { "epoch": 0.3104774835935732, "grad_norm": 0.543043315410614, "learning_rate": 0.00012079116908177593, "loss": 0.9634, "step": 686 }, { "epoch": 0.31093007467752887, "grad_norm": 0.5078130960464478, "learning_rate": 0.00012059414363731674, "loss": 0.9197, "step": 687 }, { "epoch": 0.3113826657614845, "grad_norm": 0.47711634635925293, "learning_rate": 0.00012039703467215488, "loss": 0.8304, "step": 688 }, { "epoch": 0.31183525684544017, "grad_norm": 0.4520992338657379, "learning_rate": 0.00012019984298567682, "loss": 0.5376, "step": 689 }, { "epoch": 0.3122878479293958, "grad_norm": 0.49447259306907654, "learning_rate": 0.00012000256937760445, "loss": 0.8525, "step": 690 }, { "epoch": 0.3127404390133514, "grad_norm": 0.5532546639442444, "learning_rate": 0.00011980521464799198, "loss": 1.0082, "step": 691 }, { "epoch": 0.3131930300973071, "grad_norm": 0.4508833885192871, "learning_rate": 0.00011960777959722253, "loss": 0.8646, "step": 692 }, { "epoch": 0.3136456211812627, "grad_norm": 0.4003430902957916, "learning_rate": 0.000119410265026005, "loss": 0.6842, "step": 693 }, { "epoch": 0.3140982122652184, "grad_norm": 0.5402839183807373, "learning_rate": 0.00011921267173537086, "loss": 0.982, "step": 694 }, { "epoch": 0.314550803349174, "grad_norm": 0.44808655977249146, "learning_rate": 0.00011901500052667068, "loss": 0.8136, "step": 695 }, { "epoch": 0.3150033944331297, "grad_norm": 0.5470143556594849, "learning_rate": 0.00011881725220157113, "loss": 1.2509, "step": 696 }, { "epoch": 0.3154559855170853, "grad_norm": 0.5380640625953674, "learning_rate": 0.00011861942756205169, "loss": 1.0705, "step": 697 }, { "epoch": 0.31590857660104094, "grad_norm": 0.49166131019592285, "learning_rate": 0.00011842152741040116, "loss": 0.7588, "step": 698 }, { "epoch": 0.3163611676849966, "grad_norm": 0.5186418294906616, "learning_rate": 0.00011822355254921478, "loss": 0.9667, "step": 699 }, { "epoch": 0.31681375876895224, "grad_norm": 0.48516249656677246, "learning_rate": 0.0001180255037813906, "loss": 0.8118, "step": 700 }, { "epoch": 0.3172663498529079, "grad_norm": 0.4609658420085907, "learning_rate": 0.00011782738191012652, "loss": 0.9096, "step": 701 }, { "epoch": 0.31771894093686354, "grad_norm": 0.4883543848991394, "learning_rate": 0.00011762918773891691, "loss": 0.8148, "step": 702 }, { "epoch": 0.3181715320208192, "grad_norm": 0.5130921006202698, "learning_rate": 0.00011743092207154929, "loss": 0.8609, "step": 703 }, { "epoch": 0.31862412310477484, "grad_norm": 0.5097033381462097, "learning_rate": 0.00011723258571210121, "loss": 1.0203, "step": 704 }, { "epoch": 0.31907671418873046, "grad_norm": 0.4869794249534607, "learning_rate": 0.00011703417946493683, "loss": 1.0473, "step": 705 }, { "epoch": 0.31952930527268614, "grad_norm": 0.45631757378578186, "learning_rate": 0.00011683570413470383, "loss": 0.706, "step": 706 }, { "epoch": 0.31998189635664176, "grad_norm": 0.47609448432922363, "learning_rate": 0.00011663716052633006, "loss": 0.6871, "step": 707 }, { "epoch": 0.32043448744059744, "grad_norm": 0.4803597629070282, "learning_rate": 0.0001164385494450202, "loss": 0.7175, "step": 708 }, { "epoch": 0.32088707852455306, "grad_norm": 0.42784056067466736, "learning_rate": 0.00011623987169625261, "loss": 0.7301, "step": 709 }, { "epoch": 0.32133966960850874, "grad_norm": 0.5752036571502686, "learning_rate": 0.00011604112808577603, "loss": 0.8522, "step": 710 }, { "epoch": 0.32179226069246436, "grad_norm": 0.5516340732574463, "learning_rate": 0.00011584231941960628, "loss": 0.9913, "step": 711 }, { "epoch": 0.32224485177642, "grad_norm": 0.4553099572658539, "learning_rate": 0.0001156434465040231, "loss": 0.7671, "step": 712 }, { "epoch": 0.32269744286037566, "grad_norm": 0.5408231019973755, "learning_rate": 0.00011544451014556665, "loss": 0.9212, "step": 713 }, { "epoch": 0.3231500339443313, "grad_norm": 0.5186082720756531, "learning_rate": 0.00011524551115103454, "loss": 0.8433, "step": 714 }, { "epoch": 0.32360262502828696, "grad_norm": 0.464286744594574, "learning_rate": 0.00011504645032747832, "loss": 0.8244, "step": 715 }, { "epoch": 0.3240552161122426, "grad_norm": 0.5196670293807983, "learning_rate": 0.00011484732848220026, "loss": 0.9368, "step": 716 }, { "epoch": 0.32450780719619826, "grad_norm": 0.4835793375968933, "learning_rate": 0.00011464814642275022, "loss": 0.7648, "step": 717 }, { "epoch": 0.3249603982801539, "grad_norm": 0.48903408646583557, "learning_rate": 0.00011444890495692213, "loss": 0.9417, "step": 718 }, { "epoch": 0.3254129893641095, "grad_norm": 0.5032278895378113, "learning_rate": 0.00011424960489275098, "loss": 0.8376, "step": 719 }, { "epoch": 0.3258655804480652, "grad_norm": 0.44192057847976685, "learning_rate": 0.00011405024703850929, "loss": 0.6973, "step": 720 }, { "epoch": 0.3263181715320208, "grad_norm": 0.448816180229187, "learning_rate": 0.00011385083220270401, "loss": 0.7599, "step": 721 }, { "epoch": 0.3267707626159765, "grad_norm": 0.44175952672958374, "learning_rate": 0.00011365136119407319, "loss": 0.6627, "step": 722 }, { "epoch": 0.3272233536999321, "grad_norm": 0.5739919543266296, "learning_rate": 0.00011345183482158264, "loss": 0.9778, "step": 723 }, { "epoch": 0.3276759447838878, "grad_norm": 0.5183276534080505, "learning_rate": 0.00011325225389442277, "loss": 0.9271, "step": 724 }, { "epoch": 0.3281285358678434, "grad_norm": 0.40638917684555054, "learning_rate": 0.00011305261922200519, "loss": 0.5595, "step": 725 }, { "epoch": 0.32858112695179903, "grad_norm": 0.5148621797561646, "learning_rate": 0.00011285293161395946, "loss": 0.9125, "step": 726 }, { "epoch": 0.3290337180357547, "grad_norm": 0.49269428849220276, "learning_rate": 0.00011265319188012994, "loss": 0.8463, "step": 727 }, { "epoch": 0.32948630911971033, "grad_norm": 0.5185051560401917, "learning_rate": 0.00011245340083057223, "loss": 0.8708, "step": 728 }, { "epoch": 0.329938900203666, "grad_norm": 0.48547667264938354, "learning_rate": 0.00011225355927555014, "loss": 0.9409, "step": 729 }, { "epoch": 0.33039149128762163, "grad_norm": 0.581288754940033, "learning_rate": 0.0001120536680255323, "loss": 1.1318, "step": 730 }, { "epoch": 0.33084408237157725, "grad_norm": 0.5337197184562683, "learning_rate": 0.00011185372789118887, "loss": 0.9725, "step": 731 }, { "epoch": 0.33129667345553293, "grad_norm": 0.49437853693962097, "learning_rate": 0.00011165373968338824, "loss": 0.9509, "step": 732 }, { "epoch": 0.33174926453948855, "grad_norm": 0.577504575252533, "learning_rate": 0.00011145370421319377, "loss": 1.1211, "step": 733 }, { "epoch": 0.33220185562344423, "grad_norm": 0.6033246517181396, "learning_rate": 0.00011125362229186057, "loss": 1.1196, "step": 734 }, { "epoch": 0.33265444670739985, "grad_norm": 0.46174025535583496, "learning_rate": 0.000111053494730832, "loss": 0.762, "step": 735 }, { "epoch": 0.33310703779135553, "grad_norm": 0.5123987793922424, "learning_rate": 0.00011085332234173664, "loss": 0.9462, "step": 736 }, { "epoch": 0.33355962887531115, "grad_norm": 0.4520655572414398, "learning_rate": 0.00011065310593638483, "loss": 0.8018, "step": 737 }, { "epoch": 0.3340122199592668, "grad_norm": 0.5619468092918396, "learning_rate": 0.00011045284632676536, "loss": 1.0392, "step": 738 }, { "epoch": 0.33446481104322245, "grad_norm": 0.49872851371765137, "learning_rate": 0.00011025254432504233, "loss": 0.7856, "step": 739 }, { "epoch": 0.3349174021271781, "grad_norm": 0.40664106607437134, "learning_rate": 0.00011005220074355171, "loss": 0.5486, "step": 740 }, { "epoch": 0.33536999321113375, "grad_norm": 0.513265073299408, "learning_rate": 0.00010985181639479813, "loss": 0.8292, "step": 741 }, { "epoch": 0.3358225842950894, "grad_norm": 0.3986378014087677, "learning_rate": 0.00010965139209145152, "loss": 0.7516, "step": 742 }, { "epoch": 0.33627517537904505, "grad_norm": 0.4895348846912384, "learning_rate": 0.00010945092864634388, "loss": 0.8381, "step": 743 }, { "epoch": 0.3367277664630007, "grad_norm": 0.44972074031829834, "learning_rate": 0.00010925042687246592, "loss": 0.6788, "step": 744 }, { "epoch": 0.3371803575469563, "grad_norm": 0.5335928201675415, "learning_rate": 0.0001090498875829638, "loss": 0.7777, "step": 745 }, { "epoch": 0.337632948630912, "grad_norm": 0.5425425171852112, "learning_rate": 0.00010884931159113586, "loss": 0.8378, "step": 746 }, { "epoch": 0.3380855397148676, "grad_norm": 0.5339006185531616, "learning_rate": 0.00010864869971042925, "loss": 0.8918, "step": 747 }, { "epoch": 0.3385381307988233, "grad_norm": 0.5580891966819763, "learning_rate": 0.00010844805275443673, "loss": 0.9455, "step": 748 }, { "epoch": 0.3389907218827789, "grad_norm": 0.4360131025314331, "learning_rate": 0.00010824737153689319, "loss": 0.6515, "step": 749 }, { "epoch": 0.3394433129667346, "grad_norm": 0.43018674850463867, "learning_rate": 0.00010804665687167262, "loss": 0.6645, "step": 750 }, { "epoch": 0.3398959040506902, "grad_norm": 0.4255496561527252, "learning_rate": 0.0001078459095727845, "loss": 0.6244, "step": 751 }, { "epoch": 0.3403484951346458, "grad_norm": 0.4636889398097992, "learning_rate": 0.00010764513045437083, "loss": 0.791, "step": 752 }, { "epoch": 0.3408010862186015, "grad_norm": 0.5495937466621399, "learning_rate": 0.00010744432033070251, "loss": 0.9623, "step": 753 }, { "epoch": 0.3412536773025571, "grad_norm": 0.5409247875213623, "learning_rate": 0.00010724348001617625, "loss": 0.925, "step": 754 }, { "epoch": 0.3417062683865128, "grad_norm": 0.4871814548969269, "learning_rate": 0.0001070426103253112, "loss": 0.7726, "step": 755 }, { "epoch": 0.3421588594704684, "grad_norm": 0.5325297713279724, "learning_rate": 0.00010684171207274562, "loss": 1.0311, "step": 756 }, { "epoch": 0.3426114505544241, "grad_norm": 0.6206918358802795, "learning_rate": 0.00010664078607323367, "loss": 1.0326, "step": 757 }, { "epoch": 0.3430640416383797, "grad_norm": 0.45602741837501526, "learning_rate": 0.00010643983314164194, "loss": 0.6741, "step": 758 }, { "epoch": 0.34351663272233535, "grad_norm": 0.4545150399208069, "learning_rate": 0.00010623885409294633, "loss": 0.6257, "step": 759 }, { "epoch": 0.343969223806291, "grad_norm": 0.46721890568733215, "learning_rate": 0.00010603784974222861, "loss": 0.7438, "step": 760 }, { "epoch": 0.34442181489024665, "grad_norm": 0.44289225339889526, "learning_rate": 0.00010583682090467317, "loss": 0.7158, "step": 761 }, { "epoch": 0.3448744059742023, "grad_norm": 0.6556697487831116, "learning_rate": 0.00010563576839556374, "loss": 1.1567, "step": 762 }, { "epoch": 0.34532699705815795, "grad_norm": 0.48243972659111023, "learning_rate": 0.00010543469303028002, "loss": 0.7717, "step": 763 }, { "epoch": 0.3457795881421136, "grad_norm": 0.4793996214866638, "learning_rate": 0.0001052335956242944, "loss": 0.7768, "step": 764 }, { "epoch": 0.34623217922606925, "grad_norm": 0.4908459484577179, "learning_rate": 0.00010503247699316867, "loss": 0.856, "step": 765 }, { "epoch": 0.34668477031002487, "grad_norm": 0.5109564661979675, "learning_rate": 0.00010483133795255071, "loss": 0.9241, "step": 766 }, { "epoch": 0.34713736139398055, "grad_norm": 0.6431382894515991, "learning_rate": 0.00010463017931817118, "loss": 1.1816, "step": 767 }, { "epoch": 0.34758995247793617, "grad_norm": 0.40695667266845703, "learning_rate": 0.00010442900190584015, "loss": 0.6323, "step": 768 }, { "epoch": 0.34804254356189185, "grad_norm": 0.5081280469894409, "learning_rate": 0.0001042278065314439, "loss": 0.9465, "step": 769 }, { "epoch": 0.34849513464584747, "grad_norm": 0.5007328391075134, "learning_rate": 0.00010402659401094152, "loss": 0.8279, "step": 770 }, { "epoch": 0.34894772572980315, "grad_norm": 0.5045701861381531, "learning_rate": 0.00010382536516036168, "loss": 0.9728, "step": 771 }, { "epoch": 0.34940031681375877, "grad_norm": 0.48791611194610596, "learning_rate": 0.00010362412079579924, "loss": 0.8349, "step": 772 }, { "epoch": 0.3498529078977144, "grad_norm": 0.42378130555152893, "learning_rate": 0.00010342286173341198, "loss": 0.5704, "step": 773 }, { "epoch": 0.35030549898167007, "grad_norm": 0.4793219566345215, "learning_rate": 0.00010322158878941732, "loss": 0.8859, "step": 774 }, { "epoch": 0.3507580900656257, "grad_norm": 0.48526403307914734, "learning_rate": 0.0001030203027800889, "loss": 0.7512, "step": 775 }, { "epoch": 0.35121068114958137, "grad_norm": 0.5508863925933838, "learning_rate": 0.00010281900452175346, "loss": 1.1402, "step": 776 }, { "epoch": 0.351663272233537, "grad_norm": 0.5259900689125061, "learning_rate": 0.00010261769483078733, "loss": 0.8289, "step": 777 }, { "epoch": 0.35211586331749267, "grad_norm": 0.511981189250946, "learning_rate": 0.00010241637452361323, "loss": 0.796, "step": 778 }, { "epoch": 0.3525684544014483, "grad_norm": 0.4819547235965729, "learning_rate": 0.00010221504441669699, "loss": 0.8646, "step": 779 }, { "epoch": 0.3530210454854039, "grad_norm": 0.4084272086620331, "learning_rate": 0.00010201370532654404, "loss": 0.5851, "step": 780 }, { "epoch": 0.3534736365693596, "grad_norm": 0.4426387548446655, "learning_rate": 0.0001018123580696964, "loss": 0.7538, "step": 781 }, { "epoch": 0.3539262276533152, "grad_norm": 0.5691589713096619, "learning_rate": 0.00010161100346272914, "loss": 1.1445, "step": 782 }, { "epoch": 0.3543788187372709, "grad_norm": 0.5064798593521118, "learning_rate": 0.00010140964232224713, "loss": 0.6435, "step": 783 }, { "epoch": 0.3548314098212265, "grad_norm": 0.5018054842948914, "learning_rate": 0.00010120827546488174, "loss": 0.9572, "step": 784 }, { "epoch": 0.35528400090518214, "grad_norm": 0.5410506129264832, "learning_rate": 0.00010100690370728755, "loss": 1.003, "step": 785 }, { "epoch": 0.3557365919891378, "grad_norm": 0.510235071182251, "learning_rate": 0.00010080552786613899, "loss": 0.8269, "step": 786 }, { "epoch": 0.3557365919891378, "eval_loss": 0.20480701327323914, "eval_runtime": 51.7143, "eval_samples_per_second": 18.003, "eval_steps_per_second": 9.011, "step": 786 }, { "epoch": 0.35618918307309344, "grad_norm": 0.46580770611763, "learning_rate": 0.00010060414875812709, "loss": 0.7772, "step": 787 }, { "epoch": 0.3566417741570491, "grad_norm": 0.5495899319648743, "learning_rate": 0.00010040276719995605, "loss": 0.846, "step": 788 }, { "epoch": 0.35709436524100474, "grad_norm": 0.555011510848999, "learning_rate": 0.00010020138400834011, "loss": 0.7422, "step": 789 }, { "epoch": 0.3575469563249604, "grad_norm": 0.5138972997665405, "learning_rate": 0.0001, "loss": 0.923, "step": 790 }, { "epoch": 0.35799954740891604, "grad_norm": 0.48092055320739746, "learning_rate": 9.979861599165991e-05, "loss": 0.933, "step": 791 }, { "epoch": 0.35845213849287166, "grad_norm": 0.44474145770072937, "learning_rate": 9.959723280004396e-05, "loss": 0.6501, "step": 792 }, { "epoch": 0.35890472957682734, "grad_norm": 0.49980682134628296, "learning_rate": 9.939585124187292e-05, "loss": 0.7661, "step": 793 }, { "epoch": 0.35935732066078296, "grad_norm": 0.7595360279083252, "learning_rate": 9.919447213386103e-05, "loss": 0.8846, "step": 794 }, { "epoch": 0.35980991174473864, "grad_norm": 0.492086797952652, "learning_rate": 9.899309629271246e-05, "loss": 0.8221, "step": 795 }, { "epoch": 0.36026250282869426, "grad_norm": 0.5320361852645874, "learning_rate": 9.879172453511827e-05, "loss": 0.8432, "step": 796 }, { "epoch": 0.36071509391264994, "grad_norm": 0.4166679084300995, "learning_rate": 9.85903576777529e-05, "loss": 0.7284, "step": 797 }, { "epoch": 0.36116768499660556, "grad_norm": 0.5354281067848206, "learning_rate": 9.838899653727088e-05, "loss": 0.9363, "step": 798 }, { "epoch": 0.3616202760805612, "grad_norm": 0.4781079888343811, "learning_rate": 9.818764193030363e-05, "loss": 0.7322, "step": 799 }, { "epoch": 0.36207286716451687, "grad_norm": 0.4732087552547455, "learning_rate": 9.798629467345599e-05, "loss": 0.7354, "step": 800 }, { "epoch": 0.3625254582484725, "grad_norm": 0.4598158597946167, "learning_rate": 9.778495558330305e-05, "loss": 0.6928, "step": 801 }, { "epoch": 0.36297804933242817, "grad_norm": 0.451435387134552, "learning_rate": 9.75836254763868e-05, "loss": 0.7661, "step": 802 }, { "epoch": 0.3634306404163838, "grad_norm": 0.5164604187011719, "learning_rate": 9.73823051692127e-05, "loss": 0.8549, "step": 803 }, { "epoch": 0.36388323150033947, "grad_norm": 0.44017529487609863, "learning_rate": 9.718099547824658e-05, "loss": 0.7798, "step": 804 }, { "epoch": 0.3643358225842951, "grad_norm": 0.45070794224739075, "learning_rate": 9.697969721991114e-05, "loss": 0.7063, "step": 805 }, { "epoch": 0.3647884136682507, "grad_norm": 0.5448845624923706, "learning_rate": 9.677841121058273e-05, "loss": 1.1638, "step": 806 }, { "epoch": 0.3652410047522064, "grad_norm": 0.5377791523933411, "learning_rate": 9.657713826658806e-05, "loss": 0.9093, "step": 807 }, { "epoch": 0.365693595836162, "grad_norm": 0.5310801863670349, "learning_rate": 9.63758792042008e-05, "loss": 0.9624, "step": 808 }, { "epoch": 0.3661461869201177, "grad_norm": 0.5090137124061584, "learning_rate": 9.617463483963834e-05, "loss": 0.8736, "step": 809 }, { "epoch": 0.3665987780040733, "grad_norm": 0.47381317615509033, "learning_rate": 9.597340598905852e-05, "loss": 0.8111, "step": 810 }, { "epoch": 0.367051369088029, "grad_norm": 0.5369988679885864, "learning_rate": 9.577219346855613e-05, "loss": 0.733, "step": 811 }, { "epoch": 0.3675039601719846, "grad_norm": 0.6559973359107971, "learning_rate": 9.55709980941599e-05, "loss": 0.7097, "step": 812 }, { "epoch": 0.36795655125594023, "grad_norm": 0.5182902216911316, "learning_rate": 9.536982068182887e-05, "loss": 0.8151, "step": 813 }, { "epoch": 0.3684091423398959, "grad_norm": 0.49896934628486633, "learning_rate": 9.516866204744931e-05, "loss": 1.012, "step": 814 }, { "epoch": 0.36886173342385153, "grad_norm": 0.5400479435920715, "learning_rate": 9.496752300683133e-05, "loss": 0.9837, "step": 815 }, { "epoch": 0.3693143245078072, "grad_norm": 0.4777016341686249, "learning_rate": 9.476640437570562e-05, "loss": 0.7982, "step": 816 }, { "epoch": 0.36976691559176283, "grad_norm": 0.4785037040710449, "learning_rate": 9.456530696971999e-05, "loss": 0.701, "step": 817 }, { "epoch": 0.3702195066757185, "grad_norm": 0.4874170422554016, "learning_rate": 9.436423160443625e-05, "loss": 0.8509, "step": 818 }, { "epoch": 0.37067209775967414, "grad_norm": 0.5621728897094727, "learning_rate": 9.416317909532683e-05, "loss": 1.0424, "step": 819 }, { "epoch": 0.37112468884362976, "grad_norm": 0.5205615162849426, "learning_rate": 9.396215025777139e-05, "loss": 0.8152, "step": 820 }, { "epoch": 0.37157727992758544, "grad_norm": 0.47887948155403137, "learning_rate": 9.376114590705368e-05, "loss": 0.7467, "step": 821 }, { "epoch": 0.37202987101154106, "grad_norm": 0.511816680431366, "learning_rate": 9.356016685835806e-05, "loss": 0.8711, "step": 822 }, { "epoch": 0.37248246209549674, "grad_norm": 0.4856937527656555, "learning_rate": 9.335921392676631e-05, "loss": 0.8454, "step": 823 }, { "epoch": 0.37293505317945236, "grad_norm": 0.563502311706543, "learning_rate": 9.315828792725438e-05, "loss": 0.9715, "step": 824 }, { "epoch": 0.37338764426340804, "grad_norm": 0.5120623111724854, "learning_rate": 9.295738967468881e-05, "loss": 0.8401, "step": 825 }, { "epoch": 0.37384023534736366, "grad_norm": 0.43110066652297974, "learning_rate": 9.275651998382377e-05, "loss": 0.7352, "step": 826 }, { "epoch": 0.3742928264313193, "grad_norm": 0.5547953248023987, "learning_rate": 9.255567966929751e-05, "loss": 1.0781, "step": 827 }, { "epoch": 0.37474541751527496, "grad_norm": 0.5874584913253784, "learning_rate": 9.235486954562919e-05, "loss": 0.7856, "step": 828 }, { "epoch": 0.3751980085992306, "grad_norm": 0.4563722610473633, "learning_rate": 9.215409042721552e-05, "loss": 0.737, "step": 829 }, { "epoch": 0.37565059968318626, "grad_norm": 0.5010003447532654, "learning_rate": 9.195334312832742e-05, "loss": 0.8458, "step": 830 }, { "epoch": 0.3761031907671419, "grad_norm": 0.44281336665153503, "learning_rate": 9.175262846310682e-05, "loss": 0.686, "step": 831 }, { "epoch": 0.37655578185109756, "grad_norm": 0.5412707328796387, "learning_rate": 9.155194724556331e-05, "loss": 1.0169, "step": 832 }, { "epoch": 0.3770083729350532, "grad_norm": 0.4889048933982849, "learning_rate": 9.135130028957076e-05, "loss": 0.7573, "step": 833 }, { "epoch": 0.3774609640190088, "grad_norm": 0.5044034123420715, "learning_rate": 9.115068840886417e-05, "loss": 0.7249, "step": 834 }, { "epoch": 0.3779135551029645, "grad_norm": 0.45018038153648376, "learning_rate": 9.095011241703623e-05, "loss": 0.6717, "step": 835 }, { "epoch": 0.3783661461869201, "grad_norm": 0.5517880320549011, "learning_rate": 9.07495731275341e-05, "loss": 1.0841, "step": 836 }, { "epoch": 0.3788187372708758, "grad_norm": 0.41919058561325073, "learning_rate": 9.054907135365615e-05, "loss": 0.6097, "step": 837 }, { "epoch": 0.3792713283548314, "grad_norm": 0.4952158033847809, "learning_rate": 9.034860790854849e-05, "loss": 0.7001, "step": 838 }, { "epoch": 0.3797239194387871, "grad_norm": 0.5002701878547668, "learning_rate": 9.01481836052019e-05, "loss": 0.7987, "step": 839 }, { "epoch": 0.3801765105227427, "grad_norm": 0.42661139369010925, "learning_rate": 8.994779925644831e-05, "loss": 0.6824, "step": 840 }, { "epoch": 0.38062910160669833, "grad_norm": 0.5633373856544495, "learning_rate": 8.974745567495768e-05, "loss": 0.896, "step": 841 }, { "epoch": 0.381081692690654, "grad_norm": 0.5182005167007446, "learning_rate": 8.954715367323468e-05, "loss": 0.8394, "step": 842 }, { "epoch": 0.38153428377460963, "grad_norm": 0.4467911720275879, "learning_rate": 8.934689406361521e-05, "loss": 0.8438, "step": 843 }, { "epoch": 0.3819868748585653, "grad_norm": 0.3690691590309143, "learning_rate": 8.914667765826338e-05, "loss": 0.5102, "step": 844 }, { "epoch": 0.38243946594252093, "grad_norm": 0.5445937514305115, "learning_rate": 8.894650526916803e-05, "loss": 0.8618, "step": 845 }, { "epoch": 0.38289205702647655, "grad_norm": 0.5911957621574402, "learning_rate": 8.874637770813946e-05, "loss": 1.0281, "step": 846 }, { "epoch": 0.38334464811043223, "grad_norm": 0.4994654953479767, "learning_rate": 8.854629578680624e-05, "loss": 1.0337, "step": 847 }, { "epoch": 0.38379723919438785, "grad_norm": 0.48666390776634216, "learning_rate": 8.834626031661178e-05, "loss": 0.79, "step": 848 }, { "epoch": 0.38424983027834353, "grad_norm": 0.4750109314918518, "learning_rate": 8.814627210881117e-05, "loss": 0.8021, "step": 849 }, { "epoch": 0.38470242136229915, "grad_norm": 0.46276208758354187, "learning_rate": 8.79463319744677e-05, "loss": 0.7748, "step": 850 }, { "epoch": 0.38515501244625483, "grad_norm": 0.5422043204307556, "learning_rate": 8.774644072444986e-05, "loss": 0.87, "step": 851 }, { "epoch": 0.38560760353021045, "grad_norm": 0.5052550435066223, "learning_rate": 8.75465991694278e-05, "loss": 0.8123, "step": 852 }, { "epoch": 0.3860601946141661, "grad_norm": 0.4825555384159088, "learning_rate": 8.73468081198701e-05, "loss": 0.7966, "step": 853 }, { "epoch": 0.38651278569812175, "grad_norm": 0.5406911373138428, "learning_rate": 8.714706838604055e-05, "loss": 1.0109, "step": 854 }, { "epoch": 0.3869653767820774, "grad_norm": 0.5366398096084595, "learning_rate": 8.694738077799488e-05, "loss": 1.0179, "step": 855 }, { "epoch": 0.38741796786603305, "grad_norm": 0.5232101678848267, "learning_rate": 8.674774610557728e-05, "loss": 0.7802, "step": 856 }, { "epoch": 0.3878705589499887, "grad_norm": 0.47566697001457214, "learning_rate": 8.654816517841741e-05, "loss": 0.8271, "step": 857 }, { "epoch": 0.38832315003394435, "grad_norm": 0.5479469895362854, "learning_rate": 8.634863880592686e-05, "loss": 1.0688, "step": 858 }, { "epoch": 0.3887757411179, "grad_norm": 0.47462597489356995, "learning_rate": 8.614916779729603e-05, "loss": 0.8378, "step": 859 }, { "epoch": 0.3892283322018556, "grad_norm": 0.5492944121360779, "learning_rate": 8.594975296149076e-05, "loss": 0.9606, "step": 860 }, { "epoch": 0.3896809232858113, "grad_norm": 0.40749794244766235, "learning_rate": 8.575039510724905e-05, "loss": 0.6763, "step": 861 }, { "epoch": 0.3901335143697669, "grad_norm": 0.5637528300285339, "learning_rate": 8.55510950430779e-05, "loss": 0.9517, "step": 862 }, { "epoch": 0.3905861054537226, "grad_norm": 0.48673295974731445, "learning_rate": 8.535185357724982e-05, "loss": 0.7629, "step": 863 }, { "epoch": 0.3910386965376782, "grad_norm": 0.49013152718544006, "learning_rate": 8.515267151779974e-05, "loss": 0.7738, "step": 864 }, { "epoch": 0.3914912876216339, "grad_norm": 0.5754150748252869, "learning_rate": 8.495354967252169e-05, "loss": 1.0042, "step": 865 }, { "epoch": 0.3919438787055895, "grad_norm": 0.5181888937950134, "learning_rate": 8.475448884896547e-05, "loss": 1.0482, "step": 866 }, { "epoch": 0.3923964697895451, "grad_norm": 0.6057239770889282, "learning_rate": 8.455548985443334e-05, "loss": 1.2377, "step": 867 }, { "epoch": 0.3928490608735008, "grad_norm": 0.5204604864120483, "learning_rate": 8.435655349597689e-05, "loss": 0.8495, "step": 868 }, { "epoch": 0.3933016519574564, "grad_norm": 0.48216450214385986, "learning_rate": 8.41576805803937e-05, "loss": 0.6864, "step": 869 }, { "epoch": 0.3937542430414121, "grad_norm": 0.40034219622612, "learning_rate": 8.395887191422397e-05, "loss": 0.5482, "step": 870 }, { "epoch": 0.3942068341253677, "grad_norm": 0.49549373984336853, "learning_rate": 8.37601283037474e-05, "loss": 0.8129, "step": 871 }, { "epoch": 0.3946594252093234, "grad_norm": 0.4993366599082947, "learning_rate": 8.356145055497981e-05, "loss": 0.7104, "step": 872 }, { "epoch": 0.395112016293279, "grad_norm": 0.5507094264030457, "learning_rate": 8.336283947366992e-05, "loss": 0.9944, "step": 873 }, { "epoch": 0.39556460737723464, "grad_norm": 0.5083698034286499, "learning_rate": 8.316429586529615e-05, "loss": 0.9294, "step": 874 }, { "epoch": 0.3960171984611903, "grad_norm": 0.5367676615715027, "learning_rate": 8.296582053506317e-05, "loss": 0.996, "step": 875 }, { "epoch": 0.39646978954514595, "grad_norm": 0.4346739649772644, "learning_rate": 8.276741428789883e-05, "loss": 0.633, "step": 876 }, { "epoch": 0.3969223806291016, "grad_norm": 0.5170437097549438, "learning_rate": 8.256907792845072e-05, "loss": 1.0517, "step": 877 }, { "epoch": 0.39737497171305725, "grad_norm": 0.45783472061157227, "learning_rate": 8.237081226108311e-05, "loss": 0.7076, "step": 878 }, { "epoch": 0.3978275627970129, "grad_norm": 0.7020736336708069, "learning_rate": 8.217261808987351e-05, "loss": 0.9009, "step": 879 }, { "epoch": 0.39828015388096855, "grad_norm": 0.5104503631591797, "learning_rate": 8.197449621860943e-05, "loss": 0.9576, "step": 880 }, { "epoch": 0.39873274496492417, "grad_norm": 0.4474295675754547, "learning_rate": 8.177644745078526e-05, "loss": 0.6848, "step": 881 }, { "epoch": 0.39918533604887985, "grad_norm": 0.5501548051834106, "learning_rate": 8.157847258959885e-05, "loss": 1.0901, "step": 882 }, { "epoch": 0.39963792713283547, "grad_norm": 0.4706266522407532, "learning_rate": 8.138057243794833e-05, "loss": 0.7566, "step": 883 }, { "epoch": 0.40009051821679115, "grad_norm": 0.46165910363197327, "learning_rate": 8.118274779842888e-05, "loss": 0.7071, "step": 884 }, { "epoch": 0.40054310930074677, "grad_norm": 0.5035961270332336, "learning_rate": 8.098499947332934e-05, "loss": 0.8654, "step": 885 }, { "epoch": 0.40099570038470245, "grad_norm": 0.464796781539917, "learning_rate": 8.078732826462915e-05, "loss": 0.6857, "step": 886 }, { "epoch": 0.40144829146865807, "grad_norm": 0.5464978814125061, "learning_rate": 8.0589734973995e-05, "loss": 0.9781, "step": 887 }, { "epoch": 0.4019008825526137, "grad_norm": 0.42914673686027527, "learning_rate": 8.03922204027775e-05, "loss": 0.7287, "step": 888 }, { "epoch": 0.40235347363656937, "grad_norm": 0.5215734839439392, "learning_rate": 8.019478535200806e-05, "loss": 0.9839, "step": 889 }, { "epoch": 0.402806064720525, "grad_norm": 0.5467308163642883, "learning_rate": 7.999743062239557e-05, "loss": 0.8938, "step": 890 }, { "epoch": 0.40325865580448067, "grad_norm": 0.5061361789703369, "learning_rate": 7.98001570143232e-05, "loss": 0.7561, "step": 891 }, { "epoch": 0.4037112468884363, "grad_norm": 0.4637202322483063, "learning_rate": 7.960296532784515e-05, "loss": 0.8056, "step": 892 }, { "epoch": 0.40416383797239197, "grad_norm": 0.47489434480667114, "learning_rate": 7.940585636268327e-05, "loss": 0.833, "step": 893 }, { "epoch": 0.4046164290563476, "grad_norm": 0.555900514125824, "learning_rate": 7.920883091822408e-05, "loss": 0.8873, "step": 894 }, { "epoch": 0.4050690201403032, "grad_norm": 0.5026564002037048, "learning_rate": 7.901188979351526e-05, "loss": 1.0811, "step": 895 }, { "epoch": 0.4055216112242589, "grad_norm": 0.5691704154014587, "learning_rate": 7.881503378726258e-05, "loss": 0.7082, "step": 896 }, { "epoch": 0.4059742023082145, "grad_norm": 0.42396095395088196, "learning_rate": 7.86182636978266e-05, "loss": 0.6481, "step": 897 }, { "epoch": 0.4064267933921702, "grad_norm": 0.4842861294746399, "learning_rate": 7.84215803232194e-05, "loss": 0.6791, "step": 898 }, { "epoch": 0.4068793844761258, "grad_norm": 0.4645484685897827, "learning_rate": 7.822498446110145e-05, "loss": 0.6694, "step": 899 }, { "epoch": 0.4073319755600815, "grad_norm": 0.5308927297592163, "learning_rate": 7.802847690877832e-05, "loss": 0.9689, "step": 900 }, { "epoch": 0.4077845666440371, "grad_norm": 0.4862045347690582, "learning_rate": 7.78320584631973e-05, "loss": 0.8439, "step": 901 }, { "epoch": 0.40823715772799274, "grad_norm": 0.5172306299209595, "learning_rate": 7.763572992094447e-05, "loss": 0.9196, "step": 902 }, { "epoch": 0.4086897488119484, "grad_norm": 0.4229103624820709, "learning_rate": 7.743949207824119e-05, "loss": 0.5522, "step": 903 }, { "epoch": 0.40914233989590404, "grad_norm": 0.45848751068115234, "learning_rate": 7.7243345730941e-05, "loss": 0.8773, "step": 904 }, { "epoch": 0.4095949309798597, "grad_norm": 0.4387413263320923, "learning_rate": 7.704729167452647e-05, "loss": 0.8242, "step": 905 }, { "epoch": 0.41004752206381534, "grad_norm": 0.49762117862701416, "learning_rate": 7.685133070410571e-05, "loss": 0.8721, "step": 906 }, { "epoch": 0.41050011314777096, "grad_norm": 0.47309303283691406, "learning_rate": 7.66554636144095e-05, "loss": 0.7279, "step": 907 }, { "epoch": 0.41095270423172664, "grad_norm": 0.6089125275611877, "learning_rate": 7.64596911997877e-05, "loss": 0.9264, "step": 908 }, { "epoch": 0.41140529531568226, "grad_norm": 0.46339258551597595, "learning_rate": 7.626401425420634e-05, "loss": 0.6112, "step": 909 }, { "epoch": 0.41185788639963794, "grad_norm": 0.5053806900978088, "learning_rate": 7.606843357124426e-05, "loss": 0.8201, "step": 910 }, { "epoch": 0.41231047748359356, "grad_norm": 0.5062787532806396, "learning_rate": 7.587294994408981e-05, "loss": 0.8513, "step": 911 }, { "epoch": 0.41276306856754924, "grad_norm": 0.5039675235748291, "learning_rate": 7.56775641655378e-05, "loss": 0.8486, "step": 912 }, { "epoch": 0.41321565965150486, "grad_norm": 0.528287410736084, "learning_rate": 7.548227702798624e-05, "loss": 0.9241, "step": 913 }, { "epoch": 0.4136682507354605, "grad_norm": 0.5342898368835449, "learning_rate": 7.528708932343304e-05, "loss": 0.911, "step": 914 }, { "epoch": 0.41412084181941616, "grad_norm": 0.5621039271354675, "learning_rate": 7.509200184347282e-05, "loss": 1.0581, "step": 915 }, { "epoch": 0.4145734329033718, "grad_norm": 0.5111497044563293, "learning_rate": 7.489701537929384e-05, "loss": 0.8369, "step": 916 }, { "epoch": 0.41502602398732746, "grad_norm": 0.5669052600860596, "learning_rate": 7.470213072167456e-05, "loss": 0.9215, "step": 917 }, { "epoch": 0.4154786150712831, "grad_norm": 0.4463648200035095, "learning_rate": 7.450734866098066e-05, "loss": 0.7924, "step": 918 }, { "epoch": 0.41593120615523876, "grad_norm": 0.5927613973617554, "learning_rate": 7.431266998716171e-05, "loss": 1.084, "step": 919 }, { "epoch": 0.4163837972391944, "grad_norm": 0.47022631764411926, "learning_rate": 7.411809548974792e-05, "loss": 0.6784, "step": 920 }, { "epoch": 0.41683638832315, "grad_norm": 0.46409207582473755, "learning_rate": 7.392362595784711e-05, "loss": 0.6136, "step": 921 }, { "epoch": 0.4172889794071057, "grad_norm": 0.5186243653297424, "learning_rate": 7.372926218014131e-05, "loss": 0.8043, "step": 922 }, { "epoch": 0.4177415704910613, "grad_norm": 0.5423625707626343, "learning_rate": 7.353500494488373e-05, "loss": 0.931, "step": 923 }, { "epoch": 0.418194161575017, "grad_norm": 0.5815045833587646, "learning_rate": 7.334085503989547e-05, "loss": 0.9268, "step": 924 }, { "epoch": 0.4186467526589726, "grad_norm": 0.5250252485275269, "learning_rate": 7.314681325256232e-05, "loss": 0.962, "step": 925 }, { "epoch": 0.4190993437429283, "grad_norm": 0.48460298776626587, "learning_rate": 7.295288036983163e-05, "loss": 0.6581, "step": 926 }, { "epoch": 0.4195519348268839, "grad_norm": 0.5446643829345703, "learning_rate": 7.275905717820908e-05, "loss": 1.0548, "step": 927 }, { "epoch": 0.42000452591083953, "grad_norm": 0.4789769649505615, "learning_rate": 7.256534446375542e-05, "loss": 0.8688, "step": 928 }, { "epoch": 0.4204571169947952, "grad_norm": 0.4612770080566406, "learning_rate": 7.237174301208349e-05, "loss": 0.748, "step": 929 }, { "epoch": 0.42090970807875083, "grad_norm": 0.4356364607810974, "learning_rate": 7.217825360835473e-05, "loss": 0.6854, "step": 930 }, { "epoch": 0.4213622991627065, "grad_norm": 0.6402705907821655, "learning_rate": 7.198487703727632e-05, "loss": 0.9116, "step": 931 }, { "epoch": 0.42181489024666213, "grad_norm": 0.4785003066062927, "learning_rate": 7.179161408309776e-05, "loss": 0.9363, "step": 932 }, { "epoch": 0.4222674813306178, "grad_norm": 0.5057145357131958, "learning_rate": 7.159846552960774e-05, "loss": 0.7782, "step": 933 }, { "epoch": 0.42272007241457343, "grad_norm": 0.5009840726852417, "learning_rate": 7.14054321601311e-05, "loss": 0.8482, "step": 934 }, { "epoch": 0.42317266349852906, "grad_norm": 0.5882793664932251, "learning_rate": 7.121251475752539e-05, "loss": 1.0156, "step": 935 }, { "epoch": 0.42362525458248473, "grad_norm": 0.5955524444580078, "learning_rate": 7.101971410417803e-05, "loss": 0.9331, "step": 936 }, { "epoch": 0.42407784566644036, "grad_norm": 0.44543153047561646, "learning_rate": 7.082703098200282e-05, "loss": 0.7012, "step": 937 }, { "epoch": 0.42453043675039603, "grad_norm": 0.5537339448928833, "learning_rate": 7.063446617243694e-05, "loss": 0.71, "step": 938 }, { "epoch": 0.42498302783435166, "grad_norm": 0.6707674264907837, "learning_rate": 7.04420204564378e-05, "loss": 0.9452, "step": 939 }, { "epoch": 0.42543561891830733, "grad_norm": 0.4922984540462494, "learning_rate": 7.024969461447972e-05, "loss": 0.7476, "step": 940 }, { "epoch": 0.42588821000226296, "grad_norm": 0.5744497179985046, "learning_rate": 7.005748942655095e-05, "loss": 0.8709, "step": 941 }, { "epoch": 0.4263408010862186, "grad_norm": 0.49316924810409546, "learning_rate": 6.986540567215044e-05, "loss": 0.6702, "step": 942 }, { "epoch": 0.42679339217017426, "grad_norm": 0.5075472593307495, "learning_rate": 6.967344413028452e-05, "loss": 0.8238, "step": 943 }, { "epoch": 0.4272459832541299, "grad_norm": 0.5124140381813049, "learning_rate": 6.948160557946404e-05, "loss": 0.8917, "step": 944 }, { "epoch": 0.42769857433808556, "grad_norm": 0.49430105090141296, "learning_rate": 6.928989079770094e-05, "loss": 0.9076, "step": 945 }, { "epoch": 0.4281511654220412, "grad_norm": 0.536520779132843, "learning_rate": 6.909830056250527e-05, "loss": 0.9427, "step": 946 }, { "epoch": 0.42860375650599686, "grad_norm": 0.514545202255249, "learning_rate": 6.890683565088198e-05, "loss": 0.8755, "step": 947 }, { "epoch": 0.4290563475899525, "grad_norm": 0.43919333815574646, "learning_rate": 6.87154968393277e-05, "loss": 0.6828, "step": 948 }, { "epoch": 0.4295089386739081, "grad_norm": 0.6306238770484924, "learning_rate": 6.852428490382773e-05, "loss": 1.099, "step": 949 }, { "epoch": 0.4299615297578638, "grad_norm": 0.7785803079605103, "learning_rate": 6.833320061985277e-05, "loss": 0.7355, "step": 950 }, { "epoch": 0.4304141208418194, "grad_norm": 0.4950277805328369, "learning_rate": 6.814224476235581e-05, "loss": 0.6854, "step": 951 }, { "epoch": 0.4308667119257751, "grad_norm": 0.6018353700637817, "learning_rate": 6.795141810576906e-05, "loss": 1.2355, "step": 952 }, { "epoch": 0.4313193030097307, "grad_norm": 0.4305374026298523, "learning_rate": 6.776072142400067e-05, "loss": 0.6234, "step": 953 }, { "epoch": 0.4317718940936864, "grad_norm": 0.39642637968063354, "learning_rate": 6.757015549043175e-05, "loss": 0.4528, "step": 954 }, { "epoch": 0.432224485177642, "grad_norm": 0.44513657689094543, "learning_rate": 6.73797210779131e-05, "loss": 0.6681, "step": 955 }, { "epoch": 0.4326770762615976, "grad_norm": 0.47976213693618774, "learning_rate": 6.718941895876212e-05, "loss": 0.6925, "step": 956 }, { "epoch": 0.4331296673455533, "grad_norm": 0.4550023078918457, "learning_rate": 6.699924990475974e-05, "loss": 0.6719, "step": 957 }, { "epoch": 0.4335822584295089, "grad_norm": 0.4960489273071289, "learning_rate": 6.680921468714719e-05, "loss": 0.9812, "step": 958 }, { "epoch": 0.4340348495134646, "grad_norm": 0.4192532002925873, "learning_rate": 6.661931407662292e-05, "loss": 0.5652, "step": 959 }, { "epoch": 0.4344874405974202, "grad_norm": 0.7804063558578491, "learning_rate": 6.642954884333955e-05, "loss": 0.9131, "step": 960 }, { "epoch": 0.43494003168137585, "grad_norm": 0.5083318948745728, "learning_rate": 6.623991975690051e-05, "loss": 0.7885, "step": 961 }, { "epoch": 0.4353926227653315, "grad_norm": 0.4479636549949646, "learning_rate": 6.605042758635729e-05, "loss": 0.7053, "step": 962 }, { "epoch": 0.43584521384928715, "grad_norm": 0.5218529105186462, "learning_rate": 6.586107310020591e-05, "loss": 0.8669, "step": 963 }, { "epoch": 0.4362978049332428, "grad_norm": 0.42930254340171814, "learning_rate": 6.567185706638417e-05, "loss": 0.6542, "step": 964 }, { "epoch": 0.43675039601719845, "grad_norm": 0.4791978895664215, "learning_rate": 6.548278025226821e-05, "loss": 0.8341, "step": 965 }, { "epoch": 0.43720298710115413, "grad_norm": 0.5348832011222839, "learning_rate": 6.52938434246697e-05, "loss": 0.8451, "step": 966 }, { "epoch": 0.43765557818510975, "grad_norm": 0.662418007850647, "learning_rate": 6.510504734983249e-05, "loss": 0.8109, "step": 967 }, { "epoch": 0.4381081692690654, "grad_norm": 0.48103776574134827, "learning_rate": 6.491639279342963e-05, "loss": 0.8619, "step": 968 }, { "epoch": 0.43856076035302105, "grad_norm": 0.5524132251739502, "learning_rate": 6.472788052056027e-05, "loss": 0.8855, "step": 969 }, { "epoch": 0.4390133514369767, "grad_norm": 0.5164328217506409, "learning_rate": 6.453951129574644e-05, "loss": 0.7278, "step": 970 }, { "epoch": 0.43946594252093235, "grad_norm": 0.5808908343315125, "learning_rate": 6.435128588293012e-05, "loss": 0.8667, "step": 971 }, { "epoch": 0.439918533604888, "grad_norm": 0.5309447646141052, "learning_rate": 6.416320504546997e-05, "loss": 0.8923, "step": 972 }, { "epoch": 0.44037112468884365, "grad_norm": 0.4193963408470154, "learning_rate": 6.397526954613839e-05, "loss": 0.7471, "step": 973 }, { "epoch": 0.4408237157727993, "grad_norm": 0.4619600772857666, "learning_rate": 6.378748014711834e-05, "loss": 0.7625, "step": 974 }, { "epoch": 0.4412763068567549, "grad_norm": 0.4345484673976898, "learning_rate": 6.359983761000018e-05, "loss": 0.6369, "step": 975 }, { "epoch": 0.4417288979407106, "grad_norm": 0.4431121051311493, "learning_rate": 6.341234269577879e-05, "loss": 0.804, "step": 976 }, { "epoch": 0.4421814890246662, "grad_norm": 0.49312248826026917, "learning_rate": 6.322499616485025e-05, "loss": 0.8464, "step": 977 }, { "epoch": 0.4426340801086219, "grad_norm": 0.4761488139629364, "learning_rate": 6.30377987770089e-05, "loss": 0.7871, "step": 978 }, { "epoch": 0.4430866711925775, "grad_norm": 0.48850172758102417, "learning_rate": 6.285075129144428e-05, "loss": 0.8349, "step": 979 }, { "epoch": 0.4435392622765332, "grad_norm": 0.4868282973766327, "learning_rate": 6.26638544667379e-05, "loss": 0.777, "step": 980 }, { "epoch": 0.4439918533604888, "grad_norm": 0.5236530303955078, "learning_rate": 6.247710906086031e-05, "loss": 0.793, "step": 981 }, { "epoch": 0.4444444444444444, "grad_norm": 0.6053972840309143, "learning_rate": 6.229051583116796e-05, "loss": 1.0982, "step": 982 }, { "epoch": 0.4448970355284001, "grad_norm": 0.4751902222633362, "learning_rate": 6.210407553440015e-05, "loss": 0.7435, "step": 983 }, { "epoch": 0.4453496266123557, "grad_norm": 0.610129177570343, "learning_rate": 6.191778892667591e-05, "loss": 1.1366, "step": 984 }, { "epoch": 0.4458022176963114, "grad_norm": 0.5146445631980896, "learning_rate": 6.173165676349103e-05, "loss": 0.8541, "step": 985 }, { "epoch": 0.446254808780267, "grad_norm": 0.42905983328819275, "learning_rate": 6.154567979971493e-05, "loss": 0.6065, "step": 986 }, { "epoch": 0.4467073998642227, "grad_norm": 0.4927644431591034, "learning_rate": 6.135985878958759e-05, "loss": 0.7915, "step": 987 }, { "epoch": 0.4471599909481783, "grad_norm": 0.5202013850212097, "learning_rate": 6.117419448671651e-05, "loss": 0.7805, "step": 988 }, { "epoch": 0.44761258203213394, "grad_norm": 0.45878079533576965, "learning_rate": 6.098868764407371e-05, "loss": 0.8074, "step": 989 }, { "epoch": 0.4480651731160896, "grad_norm": 0.5383991599082947, "learning_rate": 6.080333901399251e-05, "loss": 0.7669, "step": 990 }, { "epoch": 0.44851776420004524, "grad_norm": 0.47985830903053284, "learning_rate": 6.0618149348164696e-05, "loss": 0.7841, "step": 991 }, { "epoch": 0.4489703552840009, "grad_norm": 0.49758774042129517, "learning_rate": 6.043311939763734e-05, "loss": 0.7702, "step": 992 }, { "epoch": 0.44942294636795654, "grad_norm": 0.582724392414093, "learning_rate": 6.024824991280974e-05, "loss": 0.9283, "step": 993 }, { "epoch": 0.4498755374519122, "grad_norm": 0.5484330058097839, "learning_rate": 6.006354164343046e-05, "loss": 0.9831, "step": 994 }, { "epoch": 0.45032812853586784, "grad_norm": 0.5154725909233093, "learning_rate": 5.9878995338594224e-05, "loss": 0.8368, "step": 995 }, { "epoch": 0.45078071961982347, "grad_norm": 0.5858908891677856, "learning_rate": 5.969461174673889e-05, "loss": 1.2287, "step": 996 }, { "epoch": 0.45123331070377914, "grad_norm": 0.46006911993026733, "learning_rate": 5.9510391615642466e-05, "loss": 0.8285, "step": 997 }, { "epoch": 0.45168590178773477, "grad_norm": 0.5323997139930725, "learning_rate": 5.9326335692419995e-05, "loss": 1.0234, "step": 998 }, { "epoch": 0.45213849287169044, "grad_norm": 0.5194817781448364, "learning_rate": 5.914244472352057e-05, "loss": 0.8194, "step": 999 }, { "epoch": 0.45259108395564607, "grad_norm": 0.5339561700820923, "learning_rate": 5.8958719454724346e-05, "loss": 0.9298, "step": 1000 }, { "epoch": 0.45304367503960175, "grad_norm": 0.5592015981674194, "learning_rate": 5.877516063113939e-05, "loss": 1.0499, "step": 1001 }, { "epoch": 0.45349626612355737, "grad_norm": 0.476650208234787, "learning_rate": 5.859176899719883e-05, "loss": 0.7622, "step": 1002 }, { "epoch": 0.453948857207513, "grad_norm": 0.5177839398384094, "learning_rate": 5.840854529665767e-05, "loss": 0.7409, "step": 1003 }, { "epoch": 0.45440144829146867, "grad_norm": 0.48446404933929443, "learning_rate": 5.8225490272589933e-05, "loss": 0.7528, "step": 1004 }, { "epoch": 0.4548540393754243, "grad_norm": 0.5130894780158997, "learning_rate": 5.804260466738547e-05, "loss": 0.6405, "step": 1005 }, { "epoch": 0.45530663045937997, "grad_norm": 0.38971513509750366, "learning_rate": 5.785988922274711e-05, "loss": 0.4938, "step": 1006 }, { "epoch": 0.4557592215433356, "grad_norm": 0.5229670405387878, "learning_rate": 5.767734467968757e-05, "loss": 0.8747, "step": 1007 }, { "epoch": 0.45621181262729127, "grad_norm": 0.4669683873653412, "learning_rate": 5.7494971778526486e-05, "loss": 0.7369, "step": 1008 }, { "epoch": 0.4566644037112469, "grad_norm": 0.5144645571708679, "learning_rate": 5.7312771258887386e-05, "loss": 0.8298, "step": 1009 }, { "epoch": 0.4571169947952025, "grad_norm": 0.47048383951187134, "learning_rate": 5.713074385969457e-05, "loss": 0.6594, "step": 1010 }, { "epoch": 0.4575695858791582, "grad_norm": 0.5354034304618835, "learning_rate": 5.694889031917047e-05, "loss": 0.8264, "step": 1011 }, { "epoch": 0.4580221769631138, "grad_norm": 0.5005225539207458, "learning_rate": 5.676721137483225e-05, "loss": 0.7441, "step": 1012 }, { "epoch": 0.4584747680470695, "grad_norm": 0.5038537979125977, "learning_rate": 5.658570776348911e-05, "loss": 0.8668, "step": 1013 }, { "epoch": 0.4589273591310251, "grad_norm": 0.4663868546485901, "learning_rate": 5.6404380221238985e-05, "loss": 0.7556, "step": 1014 }, { "epoch": 0.4593799502149808, "grad_norm": 0.5414367318153381, "learning_rate": 5.622322948346594e-05, "loss": 0.9308, "step": 1015 }, { "epoch": 0.4598325412989364, "grad_norm": 0.49297547340393066, "learning_rate": 5.604225628483695e-05, "loss": 0.9007, "step": 1016 }, { "epoch": 0.46028513238289204, "grad_norm": 0.4712825417518616, "learning_rate": 5.586146135929893e-05, "loss": 0.7742, "step": 1017 }, { "epoch": 0.4607377234668477, "grad_norm": 0.46004435420036316, "learning_rate": 5.568084544007588e-05, "loss": 0.5906, "step": 1018 }, { "epoch": 0.46119031455080334, "grad_norm": 0.48832663893699646, "learning_rate": 5.550040925966569e-05, "loss": 0.8536, "step": 1019 }, { "epoch": 0.461642905634759, "grad_norm": 0.5183213949203491, "learning_rate": 5.5320153549837415e-05, "loss": 0.8216, "step": 1020 }, { "epoch": 0.46209549671871464, "grad_norm": 0.5739022493362427, "learning_rate": 5.5140079041628214e-05, "loss": 0.9626, "step": 1021 }, { "epoch": 0.46254808780267026, "grad_norm": 0.5354647636413574, "learning_rate": 5.4960186465340316e-05, "loss": 0.952, "step": 1022 }, { "epoch": 0.46300067888662594, "grad_norm": 0.5027756094932556, "learning_rate": 5.4780476550538174e-05, "loss": 0.6244, "step": 1023 }, { "epoch": 0.46345326997058156, "grad_norm": 0.4999634623527527, "learning_rate": 5.4600950026045326e-05, "loss": 0.6141, "step": 1024 }, { "epoch": 0.46390586105453724, "grad_norm": 0.5566645860671997, "learning_rate": 5.4421607619941706e-05, "loss": 1.0411, "step": 1025 }, { "epoch": 0.46435845213849286, "grad_norm": 0.5371447801589966, "learning_rate": 5.424245005956048e-05, "loss": 0.932, "step": 1026 }, { "epoch": 0.46481104322244854, "grad_norm": 0.546616792678833, "learning_rate": 5.406347807148515e-05, "loss": 0.8634, "step": 1027 }, { "epoch": 0.46526363430640416, "grad_norm": 0.5544886589050293, "learning_rate": 5.3884692381546676e-05, "loss": 0.9725, "step": 1028 }, { "epoch": 0.4657162253903598, "grad_norm": 0.4882948398590088, "learning_rate": 5.370609371482036e-05, "loss": 0.8355, "step": 1029 }, { "epoch": 0.46616881647431546, "grad_norm": 0.4552709758281708, "learning_rate": 5.3527682795623146e-05, "loss": 0.6487, "step": 1030 }, { "epoch": 0.4666214075582711, "grad_norm": 0.46513643860816956, "learning_rate": 5.334946034751049e-05, "loss": 0.7491, "step": 1031 }, { "epoch": 0.46707399864222676, "grad_norm": 0.5479960441589355, "learning_rate": 5.317142709327354e-05, "loss": 1.0504, "step": 1032 }, { "epoch": 0.4675265897261824, "grad_norm": 0.5057997703552246, "learning_rate": 5.2993583754936126e-05, "loss": 0.9572, "step": 1033 }, { "epoch": 0.46797918081013806, "grad_norm": 0.4689159691333771, "learning_rate": 5.28159310537518e-05, "loss": 0.6862, "step": 1034 }, { "epoch": 0.4684317718940937, "grad_norm": 0.4798116385936737, "learning_rate": 5.263846971020108e-05, "loss": 0.698, "step": 1035 }, { "epoch": 0.4688843629780493, "grad_norm": 0.5007181167602539, "learning_rate": 5.246120044398839e-05, "loss": 0.811, "step": 1036 }, { "epoch": 0.469336954062005, "grad_norm": 0.46686723828315735, "learning_rate": 5.2284123974039154e-05, "loss": 0.7168, "step": 1037 }, { "epoch": 0.4697895451459606, "grad_norm": 0.4993205666542053, "learning_rate": 5.210724101849696e-05, "loss": 0.8641, "step": 1038 }, { "epoch": 0.4702421362299163, "grad_norm": 0.45969900488853455, "learning_rate": 5.193055229472045e-05, "loss": 0.6912, "step": 1039 }, { "epoch": 0.4706947273138719, "grad_norm": 0.570006251335144, "learning_rate": 5.175405851928068e-05, "loss": 1.0181, "step": 1040 }, { "epoch": 0.4711473183978276, "grad_norm": 0.5396348237991333, "learning_rate": 5.157776040795804e-05, "loss": 0.8275, "step": 1041 }, { "epoch": 0.4715999094817832, "grad_norm": 0.5312735438346863, "learning_rate": 5.14016586757394e-05, "loss": 0.8963, "step": 1042 }, { "epoch": 0.47205250056573883, "grad_norm": 0.5774843096733093, "learning_rate": 5.122575403681521e-05, "loss": 0.9631, "step": 1043 }, { "epoch": 0.4725050916496945, "grad_norm": 0.5320578813552856, "learning_rate": 5.105004720457653e-05, "loss": 0.9338, "step": 1044 }, { "epoch": 0.47295768273365013, "grad_norm": 0.45046183466911316, "learning_rate": 5.087453889161229e-05, "loss": 0.7609, "step": 1045 }, { "epoch": 0.4734102738176058, "grad_norm": 0.5660046935081482, "learning_rate": 5.069922980970626e-05, "loss": 0.9304, "step": 1046 }, { "epoch": 0.47386286490156143, "grad_norm": 0.45797666907310486, "learning_rate": 5.052412066983427e-05, "loss": 0.7153, "step": 1047 }, { "epoch": 0.4743154559855171, "grad_norm": 0.49813908338546753, "learning_rate": 5.0349212182161254e-05, "loss": 0.9467, "step": 1048 }, { "epoch": 0.47476804706947273, "grad_norm": 0.42716631293296814, "learning_rate": 5.017450505603831e-05, "loss": 0.6052, "step": 1049 }, { "epoch": 0.47522063815342835, "grad_norm": 0.48885422945022583, "learning_rate": 5.000000000000002e-05, "loss": 0.8695, "step": 1050 }, { "epoch": 0.47567322923738403, "grad_norm": 0.46352043747901917, "learning_rate": 4.98256977217614e-05, "loss": 0.7086, "step": 1051 }, { "epoch": 0.47612582032133965, "grad_norm": 0.5039620995521545, "learning_rate": 4.965159892821509e-05, "loss": 0.6385, "step": 1052 }, { "epoch": 0.47657841140529533, "grad_norm": 0.5147011876106262, "learning_rate": 4.9477704325428554e-05, "loss": 0.8561, "step": 1053 }, { "epoch": 0.47703100248925095, "grad_norm": 0.46321332454681396, "learning_rate": 4.9304014618640995e-05, "loss": 0.8561, "step": 1054 }, { "epoch": 0.47748359357320663, "grad_norm": 0.5085314512252808, "learning_rate": 4.913053051226079e-05, "loss": 0.7139, "step": 1055 }, { "epoch": 0.47793618465716226, "grad_norm": 0.5991241335868835, "learning_rate": 4.895725270986244e-05, "loss": 1.1444, "step": 1056 }, { "epoch": 0.4783887757411179, "grad_norm": 0.4854184687137604, "learning_rate": 4.87841819141838e-05, "loss": 0.7903, "step": 1057 }, { "epoch": 0.47884136682507356, "grad_norm": 0.5258088111877441, "learning_rate": 4.861131882712314e-05, "loss": 0.9007, "step": 1058 }, { "epoch": 0.4792939579090292, "grad_norm": 0.4994451105594635, "learning_rate": 4.843866414973641e-05, "loss": 0.7204, "step": 1059 }, { "epoch": 0.47974654899298486, "grad_norm": 0.483104944229126, "learning_rate": 4.826621858223431e-05, "loss": 0.8054, "step": 1060 }, { "epoch": 0.4801991400769405, "grad_norm": 0.5926142930984497, "learning_rate": 4.809398282397951e-05, "loss": 0.9783, "step": 1061 }, { "epoch": 0.48065173116089616, "grad_norm": 0.5814982056617737, "learning_rate": 4.7921957573483754e-05, "loss": 1.1615, "step": 1062 }, { "epoch": 0.4811043222448518, "grad_norm": 0.5472806692123413, "learning_rate": 4.7750143528405126e-05, "loss": 0.8907, "step": 1063 }, { "epoch": 0.4815569133288074, "grad_norm": 0.5430026054382324, "learning_rate": 4.7578541385545014e-05, "loss": 1.1033, "step": 1064 }, { "epoch": 0.4820095044127631, "grad_norm": 0.4955959916114807, "learning_rate": 4.740715184084559e-05, "loss": 0.8295, "step": 1065 }, { "epoch": 0.4824620954967187, "grad_norm": 0.44894957542419434, "learning_rate": 4.723597558938672e-05, "loss": 0.6721, "step": 1066 }, { "epoch": 0.4829146865806744, "grad_norm": 0.4289432764053345, "learning_rate": 4.7065013325383275e-05, "loss": 0.6993, "step": 1067 }, { "epoch": 0.48336727766463, "grad_norm": 0.5182718634605408, "learning_rate": 4.6894265742182344e-05, "loss": 0.6838, "step": 1068 }, { "epoch": 0.4838198687485857, "grad_norm": 0.6270569562911987, "learning_rate": 4.672373353226023e-05, "loss": 1.0023, "step": 1069 }, { "epoch": 0.4842724598325413, "grad_norm": 0.4682745039463043, "learning_rate": 4.6553417387219886e-05, "loss": 0.7835, "step": 1070 }, { "epoch": 0.4847250509164969, "grad_norm": 0.5611541271209717, "learning_rate": 4.6383317997787986e-05, "loss": 0.8775, "step": 1071 }, { "epoch": 0.4851776420004526, "grad_norm": 0.5803195238113403, "learning_rate": 4.6213436053812144e-05, "loss": 1.2508, "step": 1072 }, { "epoch": 0.4856302330844082, "grad_norm": 0.4491642117500305, "learning_rate": 4.6043772244258096e-05, "loss": 0.7083, "step": 1073 }, { "epoch": 0.4860828241683639, "grad_norm": 0.5523327589035034, "learning_rate": 4.587432725720687e-05, "loss": 0.9192, "step": 1074 }, { "epoch": 0.4865354152523195, "grad_norm": 0.5153692960739136, "learning_rate": 4.5705101779852135e-05, "loss": 0.8575, "step": 1075 }, { "epoch": 0.48698800633627515, "grad_norm": 0.5204266309738159, "learning_rate": 4.5536096498497295e-05, "loss": 0.7705, "step": 1076 }, { "epoch": 0.4874405974202308, "grad_norm": 0.5776071548461914, "learning_rate": 4.5367312098552705e-05, "loss": 1.0659, "step": 1077 }, { "epoch": 0.48789318850418645, "grad_norm": 0.4952472448348999, "learning_rate": 4.519874926453302e-05, "loss": 0.8679, "step": 1078 }, { "epoch": 0.4883457795881421, "grad_norm": 0.4542064964771271, "learning_rate": 4.503040868005416e-05, "loss": 0.796, "step": 1079 }, { "epoch": 0.48879837067209775, "grad_norm": 0.5350983142852783, "learning_rate": 4.486229102783084e-05, "loss": 0.9615, "step": 1080 }, { "epoch": 0.4892509617560534, "grad_norm": 0.6435551047325134, "learning_rate": 4.469439698967359e-05, "loss": 1.4427, "step": 1081 }, { "epoch": 0.48970355284000905, "grad_norm": 0.5093064308166504, "learning_rate": 4.452672724648611e-05, "loss": 0.9063, "step": 1082 }, { "epoch": 0.49015614392396467, "grad_norm": 0.5286267399787903, "learning_rate": 4.4359282478262454e-05, "loss": 0.7893, "step": 1083 }, { "epoch": 0.49060873500792035, "grad_norm": 0.46403610706329346, "learning_rate": 4.419206336408418e-05, "loss": 0.6207, "step": 1084 }, { "epoch": 0.49106132609187597, "grad_norm": 0.5149431824684143, "learning_rate": 4.40250705821178e-05, "loss": 0.8015, "step": 1085 }, { "epoch": 0.49151391717583165, "grad_norm": 0.543387234210968, "learning_rate": 4.385830480961192e-05, "loss": 0.9494, "step": 1086 }, { "epoch": 0.49196650825978727, "grad_norm": 0.5150254964828491, "learning_rate": 4.3691766722894435e-05, "loss": 0.8872, "step": 1087 }, { "epoch": 0.49241909934374295, "grad_norm": 0.48482635617256165, "learning_rate": 4.3525456997369926e-05, "loss": 0.5804, "step": 1088 }, { "epoch": 0.49287169042769857, "grad_norm": 0.5497539639472961, "learning_rate": 4.335937630751674e-05, "loss": 1.0462, "step": 1089 }, { "epoch": 0.4933242815116542, "grad_norm": 0.4731244742870331, "learning_rate": 4.3193525326884435e-05, "loss": 0.7626, "step": 1090 }, { "epoch": 0.4937768725956099, "grad_norm": 0.5599950551986694, "learning_rate": 4.3027904728090954e-05, "loss": 0.893, "step": 1091 }, { "epoch": 0.4942294636795655, "grad_norm": 0.5236995220184326, "learning_rate": 4.2862515182819904e-05, "loss": 0.9606, "step": 1092 }, { "epoch": 0.4946820547635212, "grad_norm": 0.6051554679870605, "learning_rate": 4.26973573618179e-05, "loss": 0.9776, "step": 1093 }, { "epoch": 0.4951346458474768, "grad_norm": 0.550380527973175, "learning_rate": 4.253243193489165e-05, "loss": 0.8356, "step": 1094 }, { "epoch": 0.4955872369314325, "grad_norm": 0.4601718783378601, "learning_rate": 4.236773957090548e-05, "loss": 0.75, "step": 1095 }, { "epoch": 0.4960398280153881, "grad_norm": 0.5237348675727844, "learning_rate": 4.220328093777851e-05, "loss": 0.8505, "step": 1096 }, { "epoch": 0.4964924190993437, "grad_norm": 0.5931401252746582, "learning_rate": 4.203905670248194e-05, "loss": 0.8769, "step": 1097 }, { "epoch": 0.4969450101832994, "grad_norm": 0.5065420269966125, "learning_rate": 4.1875067531036374e-05, "loss": 0.9189, "step": 1098 }, { "epoch": 0.497397601267255, "grad_norm": 0.49355390667915344, "learning_rate": 4.1711314088509e-05, "loss": 0.7092, "step": 1099 }, { "epoch": 0.4978501923512107, "grad_norm": 0.5571166276931763, "learning_rate": 4.154779703901114e-05, "loss": 0.9334, "step": 1100 }, { "epoch": 0.4983027834351663, "grad_norm": 0.4783550500869751, "learning_rate": 4.1384517045695316e-05, "loss": 0.8249, "step": 1101 }, { "epoch": 0.498755374519122, "grad_norm": 0.42690062522888184, "learning_rate": 4.12214747707527e-05, "loss": 0.5945, "step": 1102 }, { "epoch": 0.4992079656030776, "grad_norm": 0.5278857946395874, "learning_rate": 4.1058670875410386e-05, "loss": 0.9126, "step": 1103 }, { "epoch": 0.49966055668703324, "grad_norm": 0.5173430442810059, "learning_rate": 4.089610601992864e-05, "loss": 1.0533, "step": 1104 }, { "epoch": 0.5001131477709889, "grad_norm": 0.6061631441116333, "learning_rate": 4.0733780863598335e-05, "loss": 1.1095, "step": 1105 }, { "epoch": 0.5005657388549446, "grad_norm": 0.5131955742835999, "learning_rate": 4.057169606473827e-05, "loss": 0.765, "step": 1106 }, { "epoch": 0.5010183299389002, "grad_norm": 0.6769198775291443, "learning_rate": 4.04098522806924e-05, "loss": 0.5315, "step": 1107 }, { "epoch": 0.5014709210228558, "grad_norm": 0.5840115547180176, "learning_rate": 4.0248250167827275e-05, "loss": 0.8269, "step": 1108 }, { "epoch": 0.5019235121068115, "grad_norm": 0.5163032412528992, "learning_rate": 4.00868903815293e-05, "loss": 0.9032, "step": 1109 }, { "epoch": 0.5023761031907671, "grad_norm": 0.5210195779800415, "learning_rate": 3.99257735762021e-05, "loss": 0.6677, "step": 1110 }, { "epoch": 0.5028286942747228, "grad_norm": 0.4478912651538849, "learning_rate": 3.976490040526394e-05, "loss": 0.7087, "step": 1111 }, { "epoch": 0.5032812853586784, "grad_norm": 0.6642077565193176, "learning_rate": 3.960427152114494e-05, "loss": 1.0827, "step": 1112 }, { "epoch": 0.5037338764426341, "grad_norm": 0.5077605247497559, "learning_rate": 3.9443887575284586e-05, "loss": 0.8169, "step": 1113 }, { "epoch": 0.5041864675265897, "grad_norm": 0.5410700440406799, "learning_rate": 3.9283749218128885e-05, "loss": 0.929, "step": 1114 }, { "epoch": 0.5046390586105454, "grad_norm": 0.5579005479812622, "learning_rate": 3.9123857099127936e-05, "loss": 0.8948, "step": 1115 }, { "epoch": 0.505091649694501, "grad_norm": 0.44208788871765137, "learning_rate": 3.896421186673318e-05, "loss": 0.7895, "step": 1116 }, { "epoch": 0.5055442407784566, "grad_norm": 0.6037198305130005, "learning_rate": 3.88048141683948e-05, "loss": 0.9484, "step": 1117 }, { "epoch": 0.5059968318624123, "grad_norm": 0.5426963567733765, "learning_rate": 3.864566465055912e-05, "loss": 0.8515, "step": 1118 }, { "epoch": 0.506449422946368, "grad_norm": 0.5471535921096802, "learning_rate": 3.848676395866586e-05, "loss": 0.952, "step": 1119 }, { "epoch": 0.5069020140303236, "grad_norm": 0.5362775921821594, "learning_rate": 3.832811273714569e-05, "loss": 0.8603, "step": 1120 }, { "epoch": 0.5073546051142792, "grad_norm": 0.47162777185440063, "learning_rate": 3.816971162941755e-05, "loss": 0.737, "step": 1121 }, { "epoch": 0.5078071961982349, "grad_norm": 0.6005630493164062, "learning_rate": 3.8011561277885964e-05, "loss": 0.5545, "step": 1122 }, { "epoch": 0.5082597872821906, "grad_norm": 0.585284948348999, "learning_rate": 3.785366232393861e-05, "loss": 0.8955, "step": 1123 }, { "epoch": 0.5087123783661461, "grad_norm": 0.47206246852874756, "learning_rate": 3.769601540794344e-05, "loss": 0.4943, "step": 1124 }, { "epoch": 0.5091649694501018, "grad_norm": 0.5819693803787231, "learning_rate": 3.75386211692464e-05, "loss": 0.8945, "step": 1125 }, { "epoch": 0.5096175605340575, "grad_norm": 0.5495473146438599, "learning_rate": 3.738148024616863e-05, "loss": 1.0552, "step": 1126 }, { "epoch": 0.5100701516180132, "grad_norm": 0.5882697105407715, "learning_rate": 3.722459327600395e-05, "loss": 0.9964, "step": 1127 }, { "epoch": 0.5105227427019687, "grad_norm": 0.5525474548339844, "learning_rate": 3.7067960895016275e-05, "loss": 0.9653, "step": 1128 }, { "epoch": 0.5109753337859244, "grad_norm": 0.4547380208969116, "learning_rate": 3.691158373843694e-05, "loss": 0.6892, "step": 1129 }, { "epoch": 0.5114279248698801, "grad_norm": 0.5613864064216614, "learning_rate": 3.675546244046228e-05, "loss": 0.8805, "step": 1130 }, { "epoch": 0.5118805159538357, "grad_norm": 0.5161027908325195, "learning_rate": 3.659959763425098e-05, "loss": 0.8082, "step": 1131 }, { "epoch": 0.5123331070377913, "grad_norm": 0.550102710723877, "learning_rate": 3.644398995192147e-05, "loss": 0.865, "step": 1132 }, { "epoch": 0.512785698121747, "grad_norm": 0.562684178352356, "learning_rate": 3.628864002454947e-05, "loss": 0.9495, "step": 1133 }, { "epoch": 0.5132382892057027, "grad_norm": 0.5930166840553284, "learning_rate": 3.6133548482165225e-05, "loss": 1.1959, "step": 1134 }, { "epoch": 0.5136908802896583, "grad_norm": 0.5563511848449707, "learning_rate": 3.597871595375121e-05, "loss": 1.0088, "step": 1135 }, { "epoch": 0.5141434713736139, "grad_norm": 0.4959903061389923, "learning_rate": 3.582414306723941e-05, "loss": 0.8666, "step": 1136 }, { "epoch": 0.5145960624575696, "grad_norm": 0.5866261720657349, "learning_rate": 3.5669830449508836e-05, "loss": 0.9295, "step": 1137 }, { "epoch": 0.5150486535415252, "grad_norm": 0.46393883228302, "learning_rate": 3.5515778726382966e-05, "loss": 0.8025, "step": 1138 }, { "epoch": 0.5155012446254809, "grad_norm": 0.49082475900650024, "learning_rate": 3.536198852262713e-05, "loss": 0.9733, "step": 1139 }, { "epoch": 0.5159538357094365, "grad_norm": 0.5178489089012146, "learning_rate": 3.520846046194614e-05, "loss": 0.8229, "step": 1140 }, { "epoch": 0.5164064267933922, "grad_norm": 0.42519283294677734, "learning_rate": 3.5055195166981645e-05, "loss": 0.6869, "step": 1141 }, { "epoch": 0.5168590178773478, "grad_norm": 0.5158776044845581, "learning_rate": 3.490219325930962e-05, "loss": 0.8341, "step": 1142 }, { "epoch": 0.5173116089613035, "grad_norm": 0.6154707670211792, "learning_rate": 3.474945535943793e-05, "loss": 1.0029, "step": 1143 }, { "epoch": 0.5177642000452591, "grad_norm": 0.45592236518859863, "learning_rate": 3.459698208680359e-05, "loss": 0.7145, "step": 1144 }, { "epoch": 0.5182167911292147, "grad_norm": 0.546812117099762, "learning_rate": 3.4444774059770536e-05, "loss": 0.7955, "step": 1145 }, { "epoch": 0.5186693822131704, "grad_norm": 0.5020256638526917, "learning_rate": 3.429283189562694e-05, "loss": 0.8758, "step": 1146 }, { "epoch": 0.5191219732971261, "grad_norm": 0.45688191056251526, "learning_rate": 3.4141156210582756e-05, "loss": 0.7281, "step": 1147 }, { "epoch": 0.5195745643810817, "grad_norm": 0.5124973058700562, "learning_rate": 3.398974761976725e-05, "loss": 0.9177, "step": 1148 }, { "epoch": 0.5200271554650373, "grad_norm": 0.5134281516075134, "learning_rate": 3.383860673722639e-05, "loss": 0.6541, "step": 1149 }, { "epoch": 0.520479746548993, "grad_norm": 0.47039201855659485, "learning_rate": 3.36877341759205e-05, "loss": 0.6795, "step": 1150 }, { "epoch": 0.5209323376329487, "grad_norm": 0.41255295276641846, "learning_rate": 3.353713054772171e-05, "loss": 0.4722, "step": 1151 }, { "epoch": 0.5213849287169042, "grad_norm": 0.5607662796974182, "learning_rate": 3.338679646341146e-05, "loss": 0.7539, "step": 1152 }, { "epoch": 0.5218375198008599, "grad_norm": 0.475154310464859, "learning_rate": 3.3236732532678096e-05, "loss": 0.753, "step": 1153 }, { "epoch": 0.5222901108848156, "grad_norm": 0.5787057280540466, "learning_rate": 3.308693936411421e-05, "loss": 1.0778, "step": 1154 }, { "epoch": 0.5227427019687713, "grad_norm": 0.49256566166877747, "learning_rate": 3.293741756521442e-05, "loss": 0.7686, "step": 1155 }, { "epoch": 0.5231952930527268, "grad_norm": 0.5235592722892761, "learning_rate": 3.2788167742372725e-05, "loss": 1.0292, "step": 1156 }, { "epoch": 0.5236478841366825, "grad_norm": 0.4235650300979614, "learning_rate": 3.263919050088023e-05, "loss": 0.6254, "step": 1157 }, { "epoch": 0.5241004752206382, "grad_norm": 0.4604433476924896, "learning_rate": 3.24904864449224e-05, "loss": 0.7407, "step": 1158 }, { "epoch": 0.5245530663045938, "grad_norm": 0.41169530153274536, "learning_rate": 3.234205617757686e-05, "loss": 0.6558, "step": 1159 }, { "epoch": 0.5250056573885494, "grad_norm": 0.4996432065963745, "learning_rate": 3.219390030081091e-05, "loss": 0.7899, "step": 1160 }, { "epoch": 0.5254582484725051, "grad_norm": 0.4621078073978424, "learning_rate": 3.204601941547897e-05, "loss": 0.8085, "step": 1161 }, { "epoch": 0.5259108395564608, "grad_norm": 0.46135109663009644, "learning_rate": 3.1898414121320276e-05, "loss": 0.7147, "step": 1162 }, { "epoch": 0.5263634306404164, "grad_norm": 0.47625815868377686, "learning_rate": 3.1751085016956374e-05, "loss": 0.8159, "step": 1163 }, { "epoch": 0.526816021724372, "grad_norm": 0.5127959251403809, "learning_rate": 3.160403269988864e-05, "loss": 0.8973, "step": 1164 }, { "epoch": 0.5272686128083277, "grad_norm": 0.43038976192474365, "learning_rate": 3.1457257766496015e-05, "loss": 0.4513, "step": 1165 }, { "epoch": 0.5277212038922833, "grad_norm": 0.38239526748657227, "learning_rate": 3.131076081203247e-05, "loss": 0.589, "step": 1166 }, { "epoch": 0.528173794976239, "grad_norm": 0.40409696102142334, "learning_rate": 3.116454243062459e-05, "loss": 0.5848, "step": 1167 }, { "epoch": 0.5286263860601946, "grad_norm": 0.5500158667564392, "learning_rate": 3.101860321526924e-05, "loss": 0.9011, "step": 1168 }, { "epoch": 0.5290789771441503, "grad_norm": 0.5197808742523193, "learning_rate": 3.087294375783103e-05, "loss": 0.7307, "step": 1169 }, { "epoch": 0.5295315682281059, "grad_norm": 0.4519001841545105, "learning_rate": 3.072756464904006e-05, "loss": 0.6155, "step": 1170 }, { "epoch": 0.5299841593120616, "grad_norm": 0.5058773756027222, "learning_rate": 3.0582466478489455e-05, "loss": 0.8084, "step": 1171 }, { "epoch": 0.5304367503960172, "grad_norm": 0.5140249133110046, "learning_rate": 3.0437649834632977e-05, "loss": 0.8015, "step": 1172 }, { "epoch": 0.5308893414799728, "grad_norm": 0.4667006731033325, "learning_rate": 3.029311530478266e-05, "loss": 0.7651, "step": 1173 }, { "epoch": 0.5313419325639285, "grad_norm": 0.5502526164054871, "learning_rate": 3.0148863475106314e-05, "loss": 0.9117, "step": 1174 }, { "epoch": 0.5317945236478842, "grad_norm": 0.45357632637023926, "learning_rate": 3.000489493062535e-05, "loss": 0.6998, "step": 1175 }, { "epoch": 0.5322471147318398, "grad_norm": 0.5894111394882202, "learning_rate": 2.9861210255212245e-05, "loss": 0.88, "step": 1176 }, { "epoch": 0.5326997058157954, "grad_norm": 0.4162091314792633, "learning_rate": 2.9717810031588277e-05, "loss": 0.5883, "step": 1177 }, { "epoch": 0.5331522968997511, "grad_norm": 0.5021141171455383, "learning_rate": 2.9574694841321082e-05, "loss": 0.7265, "step": 1178 }, { "epoch": 0.5336048879837068, "grad_norm": 0.5346493721008301, "learning_rate": 2.943186526482229e-05, "loss": 0.84, "step": 1179 }, { "epoch": 0.5336048879837068, "eval_loss": 0.20086538791656494, "eval_runtime": 51.6164, "eval_samples_per_second": 18.037, "eval_steps_per_second": 9.028, "step": 1179 }, { "epoch": 0.5340574790676623, "grad_norm": 0.48865947127342224, "learning_rate": 2.9289321881345254e-05, "loss": 0.762, "step": 1180 }, { "epoch": 0.534510070151618, "grad_norm": 0.5784907341003418, "learning_rate": 2.9147065268982666e-05, "loss": 1.0548, "step": 1181 }, { "epoch": 0.5349626612355737, "grad_norm": 0.5535488128662109, "learning_rate": 2.9005096004664177e-05, "loss": 0.8442, "step": 1182 }, { "epoch": 0.5354152523195294, "grad_norm": 0.541508674621582, "learning_rate": 2.886341466415412e-05, "loss": 0.6527, "step": 1183 }, { "epoch": 0.5358678434034849, "grad_norm": 0.5890561938285828, "learning_rate": 2.8722021822049027e-05, "loss": 0.8764, "step": 1184 }, { "epoch": 0.5363204344874406, "grad_norm": 0.4952785074710846, "learning_rate": 2.858091805177554e-05, "loss": 0.8619, "step": 1185 }, { "epoch": 0.5367730255713963, "grad_norm": 0.4867090582847595, "learning_rate": 2.84401039255879e-05, "loss": 0.7619, "step": 1186 }, { "epoch": 0.5372256166553518, "grad_norm": 0.5630788207054138, "learning_rate": 2.8299580014565664e-05, "loss": 0.9779, "step": 1187 }, { "epoch": 0.5376782077393075, "grad_norm": 0.5053603053092957, "learning_rate": 2.815934688861146e-05, "loss": 0.8156, "step": 1188 }, { "epoch": 0.5381307988232632, "grad_norm": 0.463606595993042, "learning_rate": 2.8019405116448516e-05, "loss": 0.6967, "step": 1189 }, { "epoch": 0.5385833899072189, "grad_norm": 0.4266325533390045, "learning_rate": 2.7879755265618555e-05, "loss": 0.6799, "step": 1190 }, { "epoch": 0.5390359809911744, "grad_norm": 0.4817747175693512, "learning_rate": 2.7740397902479387e-05, "loss": 0.765, "step": 1191 }, { "epoch": 0.5394885720751301, "grad_norm": 0.5528002381324768, "learning_rate": 2.7601333592202583e-05, "loss": 0.8884, "step": 1192 }, { "epoch": 0.5399411631590858, "grad_norm": 0.6013336777687073, "learning_rate": 2.746256289877126e-05, "loss": 1.1148, "step": 1193 }, { "epoch": 0.5403937542430414, "grad_norm": 0.45258426666259766, "learning_rate": 2.7324086384977698e-05, "loss": 0.6737, "step": 1194 }, { "epoch": 0.540846345326997, "grad_norm": 0.4618009030818939, "learning_rate": 2.7185904612421176e-05, "loss": 0.6267, "step": 1195 }, { "epoch": 0.5412989364109527, "grad_norm": 0.5477524399757385, "learning_rate": 2.7048018141505604e-05, "loss": 0.6358, "step": 1196 }, { "epoch": 0.5417515274949084, "grad_norm": 0.5210055112838745, "learning_rate": 2.6910427531437287e-05, "loss": 0.8353, "step": 1197 }, { "epoch": 0.542204118578864, "grad_norm": 0.489916056394577, "learning_rate": 2.677313334022268e-05, "loss": 0.7661, "step": 1198 }, { "epoch": 0.5426567096628196, "grad_norm": 0.5317739844322205, "learning_rate": 2.6636136124666e-05, "loss": 0.8718, "step": 1199 }, { "epoch": 0.5431093007467753, "grad_norm": 0.563149094581604, "learning_rate": 2.6499436440367165e-05, "loss": 1.0695, "step": 1200 }, { "epoch": 0.5435618918307309, "grad_norm": 0.6181795597076416, "learning_rate": 2.6363034841719392e-05, "loss": 0.6715, "step": 1201 }, { "epoch": 0.5440144829146866, "grad_norm": 0.5255316495895386, "learning_rate": 2.622693188190699e-05, "loss": 0.7982, "step": 1202 }, { "epoch": 0.5444670739986422, "grad_norm": 0.5731631517410278, "learning_rate": 2.609112811290315e-05, "loss": 0.9418, "step": 1203 }, { "epoch": 0.5449196650825979, "grad_norm": 0.44508278369903564, "learning_rate": 2.59556240854677e-05, "loss": 0.7452, "step": 1204 }, { "epoch": 0.5453722561665535, "grad_norm": 0.5581162571907043, "learning_rate": 2.5820420349144693e-05, "loss": 0.9905, "step": 1205 }, { "epoch": 0.5458248472505092, "grad_norm": 0.4913443922996521, "learning_rate": 2.5685517452260567e-05, "loss": 0.8525, "step": 1206 }, { "epoch": 0.5462774383344648, "grad_norm": 0.6032697558403015, "learning_rate": 2.5550915941921526e-05, "loss": 0.8903, "step": 1207 }, { "epoch": 0.5467300294184204, "grad_norm": 0.5988163352012634, "learning_rate": 2.541661636401157e-05, "loss": 0.8709, "step": 1208 }, { "epoch": 0.5471826205023761, "grad_norm": 0.4508473873138428, "learning_rate": 2.52826192631901e-05, "loss": 0.6725, "step": 1209 }, { "epoch": 0.5476352115863318, "grad_norm": 0.5168390274047852, "learning_rate": 2.514892518288988e-05, "loss": 0.6997, "step": 1210 }, { "epoch": 0.5480878026702874, "grad_norm": 0.48190394043922424, "learning_rate": 2.5015534665314755e-05, "loss": 0.8461, "step": 1211 }, { "epoch": 0.548540393754243, "grad_norm": 0.5142277479171753, "learning_rate": 2.488244825143743e-05, "loss": 0.9458, "step": 1212 }, { "epoch": 0.5489929848381987, "grad_norm": 0.5585710406303406, "learning_rate": 2.4749666480997337e-05, "loss": 0.8265, "step": 1213 }, { "epoch": 0.5494455759221544, "grad_norm": 0.5171539783477783, "learning_rate": 2.4617189892498327e-05, "loss": 0.8442, "step": 1214 }, { "epoch": 0.5498981670061099, "grad_norm": 0.5711890459060669, "learning_rate": 2.4485019023206635e-05, "loss": 0.9659, "step": 1215 }, { "epoch": 0.5503507580900656, "grad_norm": 0.5380867719650269, "learning_rate": 2.4353154409148637e-05, "loss": 0.8394, "step": 1216 }, { "epoch": 0.5508033491740213, "grad_norm": 0.4734165370464325, "learning_rate": 2.4221596585108663e-05, "loss": 0.5909, "step": 1217 }, { "epoch": 0.551255940257977, "grad_norm": 0.5847920775413513, "learning_rate": 2.409034608462686e-05, "loss": 0.9881, "step": 1218 }, { "epoch": 0.5517085313419325, "grad_norm": 0.5413870811462402, "learning_rate": 2.3959403439996907e-05, "loss": 0.9771, "step": 1219 }, { "epoch": 0.5521611224258882, "grad_norm": 0.5978078842163086, "learning_rate": 2.382876918226409e-05, "loss": 0.7637, "step": 1220 }, { "epoch": 0.5526137135098439, "grad_norm": 0.5320140719413757, "learning_rate": 2.369844384122293e-05, "loss": 0.8887, "step": 1221 }, { "epoch": 0.5530663045937995, "grad_norm": 0.38298875093460083, "learning_rate": 2.356842794541516e-05, "loss": 0.4509, "step": 1222 }, { "epoch": 0.5535188956777551, "grad_norm": 0.4932439625263214, "learning_rate": 2.3438722022127546e-05, "loss": 0.7125, "step": 1223 }, { "epoch": 0.5539714867617108, "grad_norm": 0.4653611481189728, "learning_rate": 2.330932659738967e-05, "loss": 0.7279, "step": 1224 }, { "epoch": 0.5544240778456664, "grad_norm": 0.6703446507453918, "learning_rate": 2.318024219597196e-05, "loss": 1.2327, "step": 1225 }, { "epoch": 0.5548766689296221, "grad_norm": 0.5315276980400085, "learning_rate": 2.3051469341383402e-05, "loss": 0.9666, "step": 1226 }, { "epoch": 0.5553292600135777, "grad_norm": 0.4821360111236572, "learning_rate": 2.2923008555869552e-05, "loss": 0.7568, "step": 1227 }, { "epoch": 0.5557818510975334, "grad_norm": 0.7255955338478088, "learning_rate": 2.2794860360410342e-05, "loss": 1.0656, "step": 1228 }, { "epoch": 0.556234442181489, "grad_norm": 0.5304329991340637, "learning_rate": 2.266702527471788e-05, "loss": 0.8769, "step": 1229 }, { "epoch": 0.5566870332654447, "grad_norm": 0.5752689242362976, "learning_rate": 2.2539503817234553e-05, "loss": 1.0325, "step": 1230 }, { "epoch": 0.5571396243494003, "grad_norm": 0.5128302574157715, "learning_rate": 2.241229650513077e-05, "loss": 0.8263, "step": 1231 }, { "epoch": 0.5575922154333559, "grad_norm": 0.4604417383670807, "learning_rate": 2.2285403854302912e-05, "loss": 0.6971, "step": 1232 }, { "epoch": 0.5580448065173116, "grad_norm": 0.5629538893699646, "learning_rate": 2.2158826379371258e-05, "loss": 0.7893, "step": 1233 }, { "epoch": 0.5584973976012673, "grad_norm": 0.5437077879905701, "learning_rate": 2.2032564593677774e-05, "loss": 0.6816, "step": 1234 }, { "epoch": 0.5589499886852229, "grad_norm": 0.550563633441925, "learning_rate": 2.1906619009284257e-05, "loss": 0.8575, "step": 1235 }, { "epoch": 0.5594025797691785, "grad_norm": 0.5454031825065613, "learning_rate": 2.178099013697005e-05, "loss": 0.7705, "step": 1236 }, { "epoch": 0.5598551708531342, "grad_norm": 0.5278245210647583, "learning_rate": 2.165567848623009e-05, "loss": 0.9149, "step": 1237 }, { "epoch": 0.5603077619370899, "grad_norm": 0.4524843394756317, "learning_rate": 2.153068456527283e-05, "loss": 0.6749, "step": 1238 }, { "epoch": 0.5607603530210454, "grad_norm": 0.5835239887237549, "learning_rate": 2.1406008881018047e-05, "loss": 0.8808, "step": 1239 }, { "epoch": 0.5612129441050011, "grad_norm": 0.5145358443260193, "learning_rate": 2.1281651939094992e-05, "loss": 0.9971, "step": 1240 }, { "epoch": 0.5616655351889568, "grad_norm": 0.5458048582077026, "learning_rate": 2.1157614243840206e-05, "loss": 0.9558, "step": 1241 }, { "epoch": 0.5621181262729125, "grad_norm": 0.502225935459137, "learning_rate": 2.1033896298295508e-05, "loss": 0.7903, "step": 1242 }, { "epoch": 0.562570717356868, "grad_norm": 0.5830442905426025, "learning_rate": 2.0910498604205986e-05, "loss": 1.1186, "step": 1243 }, { "epoch": 0.5630233084408237, "grad_norm": 0.47947055101394653, "learning_rate": 2.0787421662017825e-05, "loss": 0.7421, "step": 1244 }, { "epoch": 0.5634758995247794, "grad_norm": 0.5443291068077087, "learning_rate": 2.0664665970876496e-05, "loss": 1.0004, "step": 1245 }, { "epoch": 0.563928490608735, "grad_norm": 0.6094164848327637, "learning_rate": 2.0542232028624586e-05, "loss": 0.9367, "step": 1246 }, { "epoch": 0.5643810816926906, "grad_norm": 0.5203437805175781, "learning_rate": 2.0420120331799786e-05, "loss": 1.035, "step": 1247 }, { "epoch": 0.5648336727766463, "grad_norm": 0.5212861895561218, "learning_rate": 2.0298331375632962e-05, "loss": 1.0298, "step": 1248 }, { "epoch": 0.565286263860602, "grad_norm": 0.5760666728019714, "learning_rate": 2.0176865654045974e-05, "loss": 1.0492, "step": 1249 }, { "epoch": 0.5657388549445576, "grad_norm": 0.5951511859893799, "learning_rate": 2.0055723659649904e-05, "loss": 1.0702, "step": 1250 }, { "epoch": 0.5661914460285132, "grad_norm": 0.4950634837150574, "learning_rate": 1.9934905883742882e-05, "loss": 0.6607, "step": 1251 }, { "epoch": 0.5666440371124689, "grad_norm": 0.42447158694267273, "learning_rate": 1.981441281630816e-05, "loss": 0.6971, "step": 1252 }, { "epoch": 0.5670966281964245, "grad_norm": 0.5706565976142883, "learning_rate": 1.969424494601213e-05, "loss": 1.0281, "step": 1253 }, { "epoch": 0.5675492192803802, "grad_norm": 0.5865153074264526, "learning_rate": 1.9574402760202315e-05, "loss": 1.1408, "step": 1254 }, { "epoch": 0.5680018103643358, "grad_norm": 0.5091719627380371, "learning_rate": 1.94548867449054e-05, "loss": 0.7702, "step": 1255 }, { "epoch": 0.5684544014482915, "grad_norm": 0.5343332290649414, "learning_rate": 1.933569738482529e-05, "loss": 0.9963, "step": 1256 }, { "epoch": 0.5689069925322471, "grad_norm": 0.5233331918716431, "learning_rate": 1.9216835163341106e-05, "loss": 0.9176, "step": 1257 }, { "epoch": 0.5693595836162028, "grad_norm": 0.45247891545295715, "learning_rate": 1.9098300562505266e-05, "loss": 0.6073, "step": 1258 }, { "epoch": 0.5698121747001584, "grad_norm": 0.4159603416919708, "learning_rate": 1.8980094063041432e-05, "loss": 0.6153, "step": 1259 }, { "epoch": 0.570264765784114, "grad_norm": 0.45843058824539185, "learning_rate": 1.8862216144342692e-05, "loss": 0.5988, "step": 1260 }, { "epoch": 0.5707173568680697, "grad_norm": 0.5220310091972351, "learning_rate": 1.8744667284469575e-05, "loss": 0.8899, "step": 1261 }, { "epoch": 0.5711699479520254, "grad_norm": 0.529350221157074, "learning_rate": 1.8627447960148037e-05, "loss": 0.818, "step": 1262 }, { "epoch": 0.571622539035981, "grad_norm": 0.48126599192619324, "learning_rate": 1.851055864676765e-05, "loss": 0.7028, "step": 1263 }, { "epoch": 0.5720751301199366, "grad_norm": 0.4716106653213501, "learning_rate": 1.8393999818379525e-05, "loss": 0.7965, "step": 1264 }, { "epoch": 0.5725277212038923, "grad_norm": 0.550206184387207, "learning_rate": 1.8277771947694523e-05, "loss": 1.1316, "step": 1265 }, { "epoch": 0.572980312287848, "grad_norm": 0.5137092471122742, "learning_rate": 1.8161875506081293e-05, "loss": 0.8549, "step": 1266 }, { "epoch": 0.5734329033718035, "grad_norm": 0.6171010136604309, "learning_rate": 1.804631096356435e-05, "loss": 0.9547, "step": 1267 }, { "epoch": 0.5738854944557592, "grad_norm": 0.5315536260604858, "learning_rate": 1.7931078788822175e-05, "loss": 0.896, "step": 1268 }, { "epoch": 0.5743380855397149, "grad_norm": 0.5210160613059998, "learning_rate": 1.781617944918528e-05, "loss": 0.8643, "step": 1269 }, { "epoch": 0.5747906766236706, "grad_norm": 0.4929242432117462, "learning_rate": 1.7701613410634365e-05, "loss": 0.7251, "step": 1270 }, { "epoch": 0.5752432677076261, "grad_norm": 0.5473180413246155, "learning_rate": 1.7587381137798432e-05, "loss": 0.9597, "step": 1271 }, { "epoch": 0.5756958587915818, "grad_norm": 0.5931307077407837, "learning_rate": 1.747348309395286e-05, "loss": 0.9572, "step": 1272 }, { "epoch": 0.5761484498755375, "grad_norm": 0.6130375862121582, "learning_rate": 1.735991974101756e-05, "loss": 1.0984, "step": 1273 }, { "epoch": 0.576601040959493, "grad_norm": 0.5877550840377808, "learning_rate": 1.7246691539555028e-05, "loss": 1.08, "step": 1274 }, { "epoch": 0.5770536320434487, "grad_norm": 0.498898446559906, "learning_rate": 1.7133798948768597e-05, "loss": 0.8566, "step": 1275 }, { "epoch": 0.5775062231274044, "grad_norm": 0.4698163568973541, "learning_rate": 1.7021242426500493e-05, "loss": 0.7786, "step": 1276 }, { "epoch": 0.5779588142113601, "grad_norm": 0.4652780592441559, "learning_rate": 1.6909022429229982e-05, "loss": 0.8628, "step": 1277 }, { "epoch": 0.5784114052953157, "grad_norm": 0.4151431620121002, "learning_rate": 1.6797139412071584e-05, "loss": 0.6402, "step": 1278 }, { "epoch": 0.5788639963792713, "grad_norm": 0.47546225786209106, "learning_rate": 1.6685593828773095e-05, "loss": 0.7404, "step": 1279 }, { "epoch": 0.579316587463227, "grad_norm": 0.47583967447280884, "learning_rate": 1.657438613171387e-05, "loss": 0.7162, "step": 1280 }, { "epoch": 0.5797691785471826, "grad_norm": 0.5199639797210693, "learning_rate": 1.6463516771902988e-05, "loss": 0.8366, "step": 1281 }, { "epoch": 0.5802217696311383, "grad_norm": 0.5876262784004211, "learning_rate": 1.6352986198977325e-05, "loss": 1.0567, "step": 1282 }, { "epoch": 0.5806743607150939, "grad_norm": 0.5289322137832642, "learning_rate": 1.624279486119984e-05, "loss": 0.9105, "step": 1283 }, { "epoch": 0.5811269517990496, "grad_norm": 0.5706743597984314, "learning_rate": 1.6132943205457606e-05, "loss": 1.1233, "step": 1284 }, { "epoch": 0.5815795428830052, "grad_norm": 0.5402660369873047, "learning_rate": 1.6023431677260214e-05, "loss": 0.7923, "step": 1285 }, { "epoch": 0.5820321339669609, "grad_norm": 0.4756923317909241, "learning_rate": 1.5914260720737795e-05, "loss": 0.7638, "step": 1286 }, { "epoch": 0.5824847250509165, "grad_norm": 0.5244125127792358, "learning_rate": 1.5805430778639263e-05, "loss": 0.7667, "step": 1287 }, { "epoch": 0.5829373161348721, "grad_norm": 0.5348141193389893, "learning_rate": 1.5696942292330576e-05, "loss": 0.8909, "step": 1288 }, { "epoch": 0.5833899072188278, "grad_norm": 0.5148683786392212, "learning_rate": 1.5588795701792803e-05, "loss": 0.9972, "step": 1289 }, { "epoch": 0.5838424983027835, "grad_norm": 0.5113561749458313, "learning_rate": 1.5480991445620542e-05, "loss": 0.8348, "step": 1290 }, { "epoch": 0.5842950893867391, "grad_norm": 0.4689948260784149, "learning_rate": 1.5373529961019974e-05, "loss": 0.7019, "step": 1291 }, { "epoch": 0.5847476804706947, "grad_norm": 0.45713579654693604, "learning_rate": 1.5266411683807168e-05, "loss": 0.6865, "step": 1292 }, { "epoch": 0.5852002715546504, "grad_norm": 0.46141064167022705, "learning_rate": 1.5159637048406328e-05, "loss": 0.6813, "step": 1293 }, { "epoch": 0.585652862638606, "grad_norm": 0.6206589341163635, "learning_rate": 1.5053206487847914e-05, "loss": 0.9386, "step": 1294 }, { "epoch": 0.5861054537225616, "grad_norm": 0.4815228283405304, "learning_rate": 1.4947120433767047e-05, "loss": 0.7759, "step": 1295 }, { "epoch": 0.5865580448065173, "grad_norm": 0.6688939929008484, "learning_rate": 1.484137931640167e-05, "loss": 0.9813, "step": 1296 }, { "epoch": 0.587010635890473, "grad_norm": 0.4881376624107361, "learning_rate": 1.4735983564590783e-05, "loss": 0.8419, "step": 1297 }, { "epoch": 0.5874632269744287, "grad_norm": 0.49161502718925476, "learning_rate": 1.4630933605772801e-05, "loss": 0.8166, "step": 1298 }, { "epoch": 0.5879158180583842, "grad_norm": 0.4156933128833771, "learning_rate": 1.4526229865983665e-05, "loss": 0.4593, "step": 1299 }, { "epoch": 0.5883684091423399, "grad_norm": 0.5126404166221619, "learning_rate": 1.442187276985526e-05, "loss": 0.952, "step": 1300 }, { "epoch": 0.5888210002262956, "grad_norm": 0.5474767088890076, "learning_rate": 1.4317862740613664e-05, "loss": 0.8347, "step": 1301 }, { "epoch": 0.5892735913102511, "grad_norm": 0.5409084558486938, "learning_rate": 1.4214200200077343e-05, "loss": 0.9025, "step": 1302 }, { "epoch": 0.5897261823942068, "grad_norm": 0.48775801062583923, "learning_rate": 1.4110885568655564e-05, "loss": 0.8533, "step": 1303 }, { "epoch": 0.5901787734781625, "grad_norm": 0.5462921857833862, "learning_rate": 1.400791926534657e-05, "loss": 0.9145, "step": 1304 }, { "epoch": 0.5906313645621182, "grad_norm": 0.5149967074394226, "learning_rate": 1.3905301707735985e-05, "loss": 0.8007, "step": 1305 }, { "epoch": 0.5910839556460737, "grad_norm": 0.4886469841003418, "learning_rate": 1.3803033311995072e-05, "loss": 0.6551, "step": 1306 }, { "epoch": 0.5915365467300294, "grad_norm": 0.42015543580055237, "learning_rate": 1.3701114492879007e-05, "loss": 0.5759, "step": 1307 }, { "epoch": 0.5919891378139851, "grad_norm": 0.43609780073165894, "learning_rate": 1.3599545663725321e-05, "loss": 0.639, "step": 1308 }, { "epoch": 0.5924417288979407, "grad_norm": 0.4982304871082306, "learning_rate": 1.3498327236452013e-05, "loss": 0.8886, "step": 1309 }, { "epoch": 0.5928943199818963, "grad_norm": 0.7850152850151062, "learning_rate": 1.339745962155613e-05, "loss": 0.7553, "step": 1310 }, { "epoch": 0.593346911065852, "grad_norm": 0.6349214911460876, "learning_rate": 1.3296943228111925e-05, "loss": 0.825, "step": 1311 }, { "epoch": 0.5937995021498077, "grad_norm": 0.509701669216156, "learning_rate": 1.3196778463769255e-05, "loss": 0.7534, "step": 1312 }, { "epoch": 0.5942520932337633, "grad_norm": 0.4454813301563263, "learning_rate": 1.3096965734751943e-05, "loss": 0.69, "step": 1313 }, { "epoch": 0.594704684317719, "grad_norm": 0.5413982272148132, "learning_rate": 1.2997505445856084e-05, "loss": 0.8602, "step": 1314 }, { "epoch": 0.5951572754016746, "grad_norm": 0.460597425699234, "learning_rate": 1.2898398000448443e-05, "loss": 0.6843, "step": 1315 }, { "epoch": 0.5956098664856302, "grad_norm": 0.49289670586586, "learning_rate": 1.2799643800464834e-05, "loss": 0.6259, "step": 1316 }, { "epoch": 0.5960624575695859, "grad_norm": 0.5211944580078125, "learning_rate": 1.2701243246408422e-05, "loss": 0.8669, "step": 1317 }, { "epoch": 0.5965150486535415, "grad_norm": 0.42615196108818054, "learning_rate": 1.260319673734821e-05, "loss": 0.6003, "step": 1318 }, { "epoch": 0.5969676397374972, "grad_norm": 0.6688699722290039, "learning_rate": 1.2505504670917256e-05, "loss": 1.1166, "step": 1319 }, { "epoch": 0.5974202308214528, "grad_norm": 0.4943235218524933, "learning_rate": 1.2408167443311214e-05, "loss": 0.726, "step": 1320 }, { "epoch": 0.5978728219054085, "grad_norm": 0.5515215992927551, "learning_rate": 1.2311185449286677e-05, "loss": 1.0609, "step": 1321 }, { "epoch": 0.5983254129893641, "grad_norm": 0.5061673521995544, "learning_rate": 1.2214559082159537e-05, "loss": 1.0484, "step": 1322 }, { "epoch": 0.5987780040733197, "grad_norm": 0.5210549831390381, "learning_rate": 1.2118288733803473e-05, "loss": 0.81, "step": 1323 }, { "epoch": 0.5992305951572754, "grad_norm": 1.2081853151321411, "learning_rate": 1.2022374794648228e-05, "loss": 0.7087, "step": 1324 }, { "epoch": 0.5996831862412311, "grad_norm": 0.5520872473716736, "learning_rate": 1.1926817653678157e-05, "loss": 0.6642, "step": 1325 }, { "epoch": 0.6001357773251867, "grad_norm": 0.5837110280990601, "learning_rate": 1.1831617698430609e-05, "loss": 0.8913, "step": 1326 }, { "epoch": 0.6005883684091423, "grad_norm": 0.47259557247161865, "learning_rate": 1.1736775314994342e-05, "loss": 0.6563, "step": 1327 }, { "epoch": 0.601040959493098, "grad_norm": 0.4800024628639221, "learning_rate": 1.1642290888007956e-05, "loss": 0.7851, "step": 1328 }, { "epoch": 0.6014935505770537, "grad_norm": 0.566657304763794, "learning_rate": 1.15481648006583e-05, "loss": 1.0709, "step": 1329 }, { "epoch": 0.6019461416610092, "grad_norm": 0.40888628363609314, "learning_rate": 1.1454397434679021e-05, "loss": 0.3452, "step": 1330 }, { "epoch": 0.6023987327449649, "grad_norm": 0.4961101710796356, "learning_rate": 1.1360989170348902e-05, "loss": 0.7467, "step": 1331 }, { "epoch": 0.6028513238289206, "grad_norm": 0.4577196538448334, "learning_rate": 1.1267940386490416e-05, "loss": 0.6897, "step": 1332 }, { "epoch": 0.6033039149128763, "grad_norm": 0.4734448194503784, "learning_rate": 1.1175251460468117e-05, "loss": 0.8086, "step": 1333 }, { "epoch": 0.6037565059968318, "grad_norm": 0.48683977127075195, "learning_rate": 1.10829227681871e-05, "loss": 0.8685, "step": 1334 }, { "epoch": 0.6042090970807875, "grad_norm": 0.46204137802124023, "learning_rate": 1.0990954684091558e-05, "loss": 0.5823, "step": 1335 }, { "epoch": 0.6046616881647432, "grad_norm": 0.47728225588798523, "learning_rate": 1.0899347581163221e-05, "loss": 0.8974, "step": 1336 }, { "epoch": 0.6051142792486988, "grad_norm": 0.4536563754081726, "learning_rate": 1.0808101830919814e-05, "loss": 0.7476, "step": 1337 }, { "epoch": 0.6055668703326544, "grad_norm": 0.519241452217102, "learning_rate": 1.0717217803413604e-05, "loss": 0.8848, "step": 1338 }, { "epoch": 0.6060194614166101, "grad_norm": 0.5880079865455627, "learning_rate": 1.062669586722983e-05, "loss": 1.1778, "step": 1339 }, { "epoch": 0.6064720525005657, "grad_norm": 0.5012646317481995, "learning_rate": 1.0536536389485275e-05, "loss": 0.9761, "step": 1340 }, { "epoch": 0.6069246435845214, "grad_norm": 0.5676561594009399, "learning_rate": 1.044673973582675e-05, "loss": 0.9059, "step": 1341 }, { "epoch": 0.607377234668477, "grad_norm": 0.5131206512451172, "learning_rate": 1.0357306270429624e-05, "loss": 0.9466, "step": 1342 }, { "epoch": 0.6078298257524327, "grad_norm": 0.5565215945243835, "learning_rate": 1.0268236355996341e-05, "loss": 1.1076, "step": 1343 }, { "epoch": 0.6082824168363883, "grad_norm": 0.5122553110122681, "learning_rate": 1.0179530353754874e-05, "loss": 0.8366, "step": 1344 }, { "epoch": 0.608735007920344, "grad_norm": 0.4143733084201813, "learning_rate": 1.0091188623457415e-05, "loss": 0.5818, "step": 1345 }, { "epoch": 0.6091875990042996, "grad_norm": 0.4871106445789337, "learning_rate": 1.0003211523378796e-05, "loss": 0.8138, "step": 1346 }, { "epoch": 0.6096401900882552, "grad_norm": 0.5023435354232788, "learning_rate": 9.915599410315068e-06, "loss": 0.8095, "step": 1347 }, { "epoch": 0.6100927811722109, "grad_norm": 0.5308644771575928, "learning_rate": 9.828352639582072e-06, "loss": 0.955, "step": 1348 }, { "epoch": 0.6105453722561666, "grad_norm": 0.46719393134117126, "learning_rate": 9.74147156501396e-06, "loss": 0.7615, "step": 1349 }, { "epoch": 0.6109979633401222, "grad_norm": 0.4629960358142853, "learning_rate": 9.65495653896179e-06, "loss": 0.6945, "step": 1350 }, { "epoch": 0.6114505544240778, "grad_norm": 0.4505084455013275, "learning_rate": 9.568807912292077e-06, "loss": 0.7088, "step": 1351 }, { "epoch": 0.6119031455080335, "grad_norm": 0.4612928032875061, "learning_rate": 9.483026034385467e-06, "loss": 0.5966, "step": 1352 }, { "epoch": 0.6123557365919892, "grad_norm": 0.81995689868927, "learning_rate": 9.397611253135118e-06, "loss": 0.751, "step": 1353 }, { "epoch": 0.6128083276759447, "grad_norm": 0.49387305974960327, "learning_rate": 9.31256391494546e-06, "loss": 0.8838, "step": 1354 }, { "epoch": 0.6132609187599004, "grad_norm": 0.5457798838615417, "learning_rate": 9.227884364730744e-06, "loss": 0.851, "step": 1355 }, { "epoch": 0.6137135098438561, "grad_norm": 0.4536944031715393, "learning_rate": 9.143572945913614e-06, "loss": 0.8056, "step": 1356 }, { "epoch": 0.6141661009278118, "grad_norm": 0.5066845417022705, "learning_rate": 9.05963000042378e-06, "loss": 0.8617, "step": 1357 }, { "epoch": 0.6146186920117673, "grad_norm": 0.5071107745170593, "learning_rate": 8.976055868696542e-06, "loss": 0.8629, "step": 1358 }, { "epoch": 0.615071283095723, "grad_norm": 0.5773394107818604, "learning_rate": 8.892850889671455e-06, "loss": 1.1922, "step": 1359 }, { "epoch": 0.6155238741796787, "grad_norm": 0.48692917823791504, "learning_rate": 8.810015400790994e-06, "loss": 0.7168, "step": 1360 }, { "epoch": 0.6159764652636343, "grad_norm": 0.4717068374156952, "learning_rate": 8.727549737999097e-06, "loss": 0.8643, "step": 1361 }, { "epoch": 0.6164290563475899, "grad_norm": 0.5000625252723694, "learning_rate": 8.645454235739903e-06, "loss": 0.7273, "step": 1362 }, { "epoch": 0.6168816474315456, "grad_norm": 0.43107035756111145, "learning_rate": 8.563729226956319e-06, "loss": 0.6671, "step": 1363 }, { "epoch": 0.6173342385155013, "grad_norm": 0.6068969368934631, "learning_rate": 8.482375043088665e-06, "loss": 1.2765, "step": 1364 }, { "epoch": 0.6177868295994569, "grad_norm": 0.5019914507865906, "learning_rate": 8.401392014073405e-06, "loss": 0.9452, "step": 1365 }, { "epoch": 0.6182394206834125, "grad_norm": 0.5206013917922974, "learning_rate": 8.32078046834176e-06, "loss": 0.9439, "step": 1366 }, { "epoch": 0.6186920117673682, "grad_norm": 0.4639444649219513, "learning_rate": 8.240540732818347e-06, "loss": 0.802, "step": 1367 }, { "epoch": 0.6191446028513238, "grad_norm": 0.5678917765617371, "learning_rate": 8.160673132919938e-06, "loss": 1.0333, "step": 1368 }, { "epoch": 0.6195971939352795, "grad_norm": 0.5168341994285583, "learning_rate": 8.081177992554013e-06, "loss": 0.8182, "step": 1369 }, { "epoch": 0.6200497850192351, "grad_norm": 0.611613929271698, "learning_rate": 8.002055634117578e-06, "loss": 1.0066, "step": 1370 }, { "epoch": 0.6205023761031908, "grad_norm": 0.5084848999977112, "learning_rate": 7.923306378495809e-06, "loss": 1.0312, "step": 1371 }, { "epoch": 0.6209549671871464, "grad_norm": 0.3988780081272125, "learning_rate": 7.844930545060703e-06, "loss": 0.4544, "step": 1372 }, { "epoch": 0.6214075582711021, "grad_norm": 0.6262059211730957, "learning_rate": 7.766928451669863e-06, "loss": 1.004, "step": 1373 }, { "epoch": 0.6218601493550577, "grad_norm": 0.5216922760009766, "learning_rate": 7.689300414665124e-06, "loss": 0.7166, "step": 1374 }, { "epoch": 0.6223127404390133, "grad_norm": 0.5540860891342163, "learning_rate": 7.612046748871327e-06, "loss": 0.8727, "step": 1375 }, { "epoch": 0.622765331522969, "grad_norm": 0.44992414116859436, "learning_rate": 7.5351677675950635e-06, "loss": 0.6504, "step": 1376 }, { "epoch": 0.6232179226069247, "grad_norm": 0.4714016616344452, "learning_rate": 7.458663782623343e-06, "loss": 0.8419, "step": 1377 }, { "epoch": 0.6236705136908803, "grad_norm": 0.49986764788627625, "learning_rate": 7.382535104222366e-06, "loss": 0.7894, "step": 1378 }, { "epoch": 0.6241231047748359, "grad_norm": 0.5532403588294983, "learning_rate": 7.306782041136218e-06, "loss": 0.901, "step": 1379 }, { "epoch": 0.6245756958587916, "grad_norm": 0.5384380221366882, "learning_rate": 7.231404900585714e-06, "loss": 0.9753, "step": 1380 }, { "epoch": 0.6250282869427473, "grad_norm": 0.5002140402793884, "learning_rate": 7.156403988267069e-06, "loss": 0.8285, "step": 1381 }, { "epoch": 0.6254808780267028, "grad_norm": 0.5726694464683533, "learning_rate": 7.08177960835068e-06, "loss": 0.8634, "step": 1382 }, { "epoch": 0.6259334691106585, "grad_norm": 0.5283421874046326, "learning_rate": 7.0075320634799045e-06, "loss": 0.7802, "step": 1383 }, { "epoch": 0.6263860601946142, "grad_norm": 0.4411744475364685, "learning_rate": 6.9336616547697965e-06, "loss": 0.5788, "step": 1384 }, { "epoch": 0.6268386512785699, "grad_norm": 0.5233549475669861, "learning_rate": 6.860168681805945e-06, "loss": 0.8098, "step": 1385 }, { "epoch": 0.6272912423625254, "grad_norm": 0.5534676313400269, "learning_rate": 6.787053442643232e-06, "loss": 1.0433, "step": 1386 }, { "epoch": 0.6277438334464811, "grad_norm": 0.5603635907173157, "learning_rate": 6.714316233804574e-06, "loss": 0.8382, "step": 1387 }, { "epoch": 0.6281964245304368, "grad_norm": 0.48828354477882385, "learning_rate": 6.6419573502798374e-06, "loss": 0.7261, "step": 1388 }, { "epoch": 0.6286490156143923, "grad_norm": 0.46339505910873413, "learning_rate": 6.5699770855244815e-06, "loss": 0.6944, "step": 1389 }, { "epoch": 0.629101606698348, "grad_norm": 0.5434744954109192, "learning_rate": 6.498375731458528e-06, "loss": 0.9542, "step": 1390 }, { "epoch": 0.6295541977823037, "grad_norm": 0.49759843945503235, "learning_rate": 6.427153578465262e-06, "loss": 0.7949, "step": 1391 }, { "epoch": 0.6300067888662594, "grad_norm": 0.5009887218475342, "learning_rate": 6.356310915390118e-06, "loss": 0.8088, "step": 1392 }, { "epoch": 0.630459379950215, "grad_norm": 0.5697285532951355, "learning_rate": 6.28584802953951e-06, "loss": 0.9026, "step": 1393 }, { "epoch": 0.6309119710341706, "grad_norm": 0.48114413022994995, "learning_rate": 6.215765206679569e-06, "loss": 0.733, "step": 1394 }, { "epoch": 0.6313645621181263, "grad_norm": 0.5739166140556335, "learning_rate": 6.146062731035129e-06, "loss": 1.1062, "step": 1395 }, { "epoch": 0.6318171532020819, "grad_norm": 0.475238561630249, "learning_rate": 6.076740885288479e-06, "loss": 0.7195, "step": 1396 }, { "epoch": 0.6322697442860375, "grad_norm": 0.5954925417900085, "learning_rate": 6.007799950578264e-06, "loss": 1.0236, "step": 1397 }, { "epoch": 0.6327223353699932, "grad_norm": 0.5766705870628357, "learning_rate": 5.939240206498287e-06, "loss": 0.9263, "step": 1398 }, { "epoch": 0.6331749264539489, "grad_norm": 0.4999594986438751, "learning_rate": 5.8710619310964445e-06, "loss": 0.696, "step": 1399 }, { "epoch": 0.6336275175379045, "grad_norm": 0.4891878068447113, "learning_rate": 5.803265400873514e-06, "loss": 0.9738, "step": 1400 }, { "epoch": 0.6340801086218602, "grad_norm": 0.5583204627037048, "learning_rate": 5.735850890782157e-06, "loss": 1.0242, "step": 1401 }, { "epoch": 0.6345326997058158, "grad_norm": 0.49677279591560364, "learning_rate": 5.668818674225685e-06, "loss": 0.757, "step": 1402 }, { "epoch": 0.6349852907897714, "grad_norm": 0.4881908595561981, "learning_rate": 5.602169023057013e-06, "loss": 0.7328, "step": 1403 }, { "epoch": 0.6354378818737271, "grad_norm": 0.5154109597206116, "learning_rate": 5.5359022075775146e-06, "loss": 0.8986, "step": 1404 }, { "epoch": 0.6358904729576828, "grad_norm": 0.585472047328949, "learning_rate": 5.470018496535967e-06, "loss": 0.7595, "step": 1405 }, { "epoch": 0.6363430640416384, "grad_norm": 0.5411213636398315, "learning_rate": 5.40451815712748e-06, "loss": 0.9672, "step": 1406 }, { "epoch": 0.636795655125594, "grad_norm": 0.4666892886161804, "learning_rate": 5.33940145499231e-06, "loss": 0.6498, "step": 1407 }, { "epoch": 0.6372482462095497, "grad_norm": 0.4871276319026947, "learning_rate": 5.274668654214932e-06, "loss": 0.6612, "step": 1408 }, { "epoch": 0.6377008372935054, "grad_norm": 0.6037775874137878, "learning_rate": 5.210320017322812e-06, "loss": 1.0683, "step": 1409 }, { "epoch": 0.6381534283774609, "grad_norm": 0.5718627572059631, "learning_rate": 5.146355805285452e-06, "loss": 1.0986, "step": 1410 }, { "epoch": 0.6386060194614166, "grad_norm": 0.5335869789123535, "learning_rate": 5.08277627751329e-06, "loss": 0.6686, "step": 1411 }, { "epoch": 0.6390586105453723, "grad_norm": 0.44057121872901917, "learning_rate": 5.01958169185669e-06, "loss": 0.587, "step": 1412 }, { "epoch": 0.639511201629328, "grad_norm": 0.4762479066848755, "learning_rate": 4.956772304604818e-06, "loss": 0.6572, "step": 1413 }, { "epoch": 0.6399637927132835, "grad_norm": 0.5564635396003723, "learning_rate": 4.8943483704846475e-06, "loss": 1.0787, "step": 1414 }, { "epoch": 0.6404163837972392, "grad_norm": 0.461028128862381, "learning_rate": 4.832310142659946e-06, "loss": 0.6813, "step": 1415 }, { "epoch": 0.6408689748811949, "grad_norm": 0.5097917318344116, "learning_rate": 4.7706578727302224e-06, "loss": 0.892, "step": 1416 }, { "epoch": 0.6413215659651504, "grad_norm": 0.5178496241569519, "learning_rate": 4.709391810729713e-06, "loss": 0.9012, "step": 1417 }, { "epoch": 0.6417741570491061, "grad_norm": 0.49436789751052856, "learning_rate": 4.648512205126376e-06, "loss": 0.7641, "step": 1418 }, { "epoch": 0.6422267481330618, "grad_norm": 0.8753583431243896, "learning_rate": 4.588019302820834e-06, "loss": 0.9981, "step": 1419 }, { "epoch": 0.6426793392170175, "grad_norm": 0.5542361736297607, "learning_rate": 4.527913349145441e-06, "loss": 0.9366, "step": 1420 }, { "epoch": 0.643131930300973, "grad_norm": 0.41615116596221924, "learning_rate": 4.468194587863273e-06, "loss": 0.5197, "step": 1421 }, { "epoch": 0.6435845213849287, "grad_norm": 0.5194451212882996, "learning_rate": 4.408863261167096e-06, "loss": 0.7452, "step": 1422 }, { "epoch": 0.6440371124688844, "grad_norm": 0.43391209840774536, "learning_rate": 4.349919609678455e-06, "loss": 0.6486, "step": 1423 }, { "epoch": 0.64448970355284, "grad_norm": 0.4887462556362152, "learning_rate": 4.291363872446597e-06, "loss": 0.6932, "step": 1424 }, { "epoch": 0.6449422946367956, "grad_norm": 0.6093336343765259, "learning_rate": 4.233196286947605e-06, "loss": 1.1266, "step": 1425 }, { "epoch": 0.6453948857207513, "grad_norm": 0.4346993565559387, "learning_rate": 4.175417089083378e-06, "loss": 0.5141, "step": 1426 }, { "epoch": 0.645847476804707, "grad_norm": 0.6119694709777832, "learning_rate": 4.118026513180695e-06, "loss": 0.9554, "step": 1427 }, { "epoch": 0.6463000678886626, "grad_norm": 0.5438103079795837, "learning_rate": 4.061024791990253e-06, "loss": 0.8614, "step": 1428 }, { "epoch": 0.6467526589726182, "grad_norm": 0.6257548928260803, "learning_rate": 4.004412156685711e-06, "loss": 0.9136, "step": 1429 }, { "epoch": 0.6472052500565739, "grad_norm": 0.5468961000442505, "learning_rate": 3.948188836862776e-06, "loss": 0.7189, "step": 1430 }, { "epoch": 0.6476578411405295, "grad_norm": 0.44468608498573303, "learning_rate": 3.892355060538289e-06, "loss": 0.6663, "step": 1431 }, { "epoch": 0.6481104322244852, "grad_norm": 0.5603303909301758, "learning_rate": 3.836911054149239e-06, "loss": 0.8297, "step": 1432 }, { "epoch": 0.6485630233084408, "grad_norm": 0.5190114974975586, "learning_rate": 3.7818570425519173e-06, "loss": 0.7831, "step": 1433 }, { "epoch": 0.6490156143923965, "grad_norm": 0.5360861420631409, "learning_rate": 3.7271932490209328e-06, "loss": 0.9773, "step": 1434 }, { "epoch": 0.6494682054763521, "grad_norm": 0.5341346263885498, "learning_rate": 3.6729198952483724e-06, "loss": 0.7224, "step": 1435 }, { "epoch": 0.6499207965603078, "grad_norm": 0.6777652502059937, "learning_rate": 3.6190372013428562e-06, "loss": 1.1764, "step": 1436 }, { "epoch": 0.6503733876442634, "grad_norm": 0.5760977864265442, "learning_rate": 3.5655453858286614e-06, "loss": 1.1423, "step": 1437 }, { "epoch": 0.650825978728219, "grad_norm": 0.44717341661453247, "learning_rate": 3.512444665644865e-06, "loss": 0.5806, "step": 1438 }, { "epoch": 0.6512785698121747, "grad_norm": 0.5548418760299683, "learning_rate": 3.4597352561443807e-06, "loss": 0.8524, "step": 1439 }, { "epoch": 0.6517311608961304, "grad_norm": 0.5654526948928833, "learning_rate": 3.40741737109318e-06, "loss": 1.0357, "step": 1440 }, { "epoch": 0.652183751980086, "grad_norm": 0.5160826444625854, "learning_rate": 3.355491222669371e-06, "loss": 0.7621, "step": 1441 }, { "epoch": 0.6526363430640416, "grad_norm": 0.4938196539878845, "learning_rate": 3.3039570214623782e-06, "loss": 0.7649, "step": 1442 }, { "epoch": 0.6530889341479973, "grad_norm": 0.5363398790359497, "learning_rate": 3.2528149764720186e-06, "loss": 0.8831, "step": 1443 }, { "epoch": 0.653541525231953, "grad_norm": 0.6059714555740356, "learning_rate": 3.202065295107726e-06, "loss": 0.9239, "step": 1444 }, { "epoch": 0.6539941163159085, "grad_norm": 0.5283812284469604, "learning_rate": 3.1517081831876737e-06, "loss": 0.8756, "step": 1445 }, { "epoch": 0.6544467073998642, "grad_norm": 0.5337501764297485, "learning_rate": 3.1017438449379434e-06, "loss": 1.0205, "step": 1446 }, { "epoch": 0.6548992984838199, "grad_norm": 0.5220997929573059, "learning_rate": 3.052172482991711e-06, "loss": 1.0293, "step": 1447 }, { "epoch": 0.6553518895677756, "grad_norm": 0.6735256314277649, "learning_rate": 3.0029942983884173e-06, "loss": 0.9027, "step": 1448 }, { "epoch": 0.6558044806517311, "grad_norm": 0.5760489106178284, "learning_rate": 2.9542094905729457e-06, "loss": 1.1187, "step": 1449 }, { "epoch": 0.6562570717356868, "grad_norm": 0.5203390717506409, "learning_rate": 2.905818257394799e-06, "loss": 0.9437, "step": 1450 }, { "epoch": 0.6567096628196425, "grad_norm": 0.4524308741092682, "learning_rate": 2.8578207951073353e-06, "loss": 0.6448, "step": 1451 }, { "epoch": 0.6571622539035981, "grad_norm": 0.4728708863258362, "learning_rate": 2.810217298366968e-06, "loss": 0.6843, "step": 1452 }, { "epoch": 0.6576148449875537, "grad_norm": 0.5334341526031494, "learning_rate": 2.7630079602323442e-06, "loss": 0.8001, "step": 1453 }, { "epoch": 0.6580674360715094, "grad_norm": 0.484829306602478, "learning_rate": 2.716192972163556e-06, "loss": 0.7185, "step": 1454 }, { "epoch": 0.658520027155465, "grad_norm": 0.5486847162246704, "learning_rate": 2.6697725240214076e-06, "loss": 0.9219, "step": 1455 }, { "epoch": 0.6589726182394207, "grad_norm": 0.551567792892456, "learning_rate": 2.6237468040666512e-06, "loss": 1.0728, "step": 1456 }, { "epoch": 0.6594252093233763, "grad_norm": 0.4729478657245636, "learning_rate": 2.578115998959152e-06, "loss": 0.655, "step": 1457 }, { "epoch": 0.659877800407332, "grad_norm": 0.5266134738922119, "learning_rate": 2.532880293757223e-06, "loss": 0.9098, "step": 1458 }, { "epoch": 0.6603303914912876, "grad_norm": 0.408477246761322, "learning_rate": 2.4880398719167586e-06, "loss": 0.644, "step": 1459 }, { "epoch": 0.6607829825752433, "grad_norm": 0.5005697011947632, "learning_rate": 2.4435949152906145e-06, "loss": 0.8143, "step": 1460 }, { "epoch": 0.6612355736591989, "grad_norm": 0.5645555257797241, "learning_rate": 2.3995456041278066e-06, "loss": 1.0237, "step": 1461 }, { "epoch": 0.6616881647431545, "grad_norm": 0.5656578540802002, "learning_rate": 2.3558921170727888e-06, "loss": 0.746, "step": 1462 }, { "epoch": 0.6621407558271102, "grad_norm": 0.48828980326652527, "learning_rate": 2.312634631164723e-06, "loss": 0.8299, "step": 1463 }, { "epoch": 0.6625933469110659, "grad_norm": 0.5643355250358582, "learning_rate": 2.2697733218367436e-06, "loss": 0.987, "step": 1464 }, { "epoch": 0.6630459379950215, "grad_norm": 0.49453938007354736, "learning_rate": 2.2273083629153147e-06, "loss": 0.7006, "step": 1465 }, { "epoch": 0.6634985290789771, "grad_norm": 0.5133518576622009, "learning_rate": 2.1852399266194314e-06, "loss": 0.8049, "step": 1466 }, { "epoch": 0.6639511201629328, "grad_norm": 0.456297367811203, "learning_rate": 2.1435681835600184e-06, "loss": 0.5972, "step": 1467 }, { "epoch": 0.6644037112468885, "grad_norm": 0.5126147270202637, "learning_rate": 2.1022933027391555e-06, "loss": 1.0061, "step": 1468 }, { "epoch": 0.664856302330844, "grad_norm": 0.5274229645729065, "learning_rate": 2.06141545154942e-06, "loss": 0.8853, "step": 1469 }, { "epoch": 0.6653088934147997, "grad_norm": 0.4462442100048065, "learning_rate": 2.0209347957732328e-06, "loss": 0.6457, "step": 1470 }, { "epoch": 0.6657614844987554, "grad_norm": 0.5552085041999817, "learning_rate": 1.9808514995821593e-06, "loss": 0.9793, "step": 1471 }, { "epoch": 0.6662140755827111, "grad_norm": 0.4626757800579071, "learning_rate": 1.941165725536265e-06, "loss": 0.7582, "step": 1472 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6225503087043762, "learning_rate": 1.9018776345834155e-06, "loss": 0.5593, "step": 1473 }, { "epoch": 0.6671192577506223, "grad_norm": 0.4841187000274658, "learning_rate": 1.8629873860586566e-06, "loss": 0.8542, "step": 1474 }, { "epoch": 0.667571848834578, "grad_norm": 0.636572539806366, "learning_rate": 1.8244951376835906e-06, "loss": 1.1556, "step": 1475 }, { "epoch": 0.6680244399185336, "grad_norm": 0.5484120845794678, "learning_rate": 1.7864010455656554e-06, "loss": 1.1598, "step": 1476 }, { "epoch": 0.6684770310024892, "grad_norm": 0.5752256512641907, "learning_rate": 1.7487052641976032e-06, "loss": 0.9162, "step": 1477 }, { "epoch": 0.6689296220864449, "grad_norm": 0.5652234554290771, "learning_rate": 1.7114079464567888e-06, "loss": 0.8911, "step": 1478 }, { "epoch": 0.6693822131704006, "grad_norm": 0.4782993495464325, "learning_rate": 1.6745092436045494e-06, "loss": 0.8625, "step": 1479 }, { "epoch": 0.6698348042543562, "grad_norm": 0.5361889600753784, "learning_rate": 1.6380093052856483e-06, "loss": 0.8206, "step": 1480 }, { "epoch": 0.6702873953383118, "grad_norm": 0.47865453362464905, "learning_rate": 1.6019082795276307e-06, "loss": 0.7713, "step": 1481 }, { "epoch": 0.6707399864222675, "grad_norm": 0.457772821187973, "learning_rate": 1.566206312740226e-06, "loss": 0.6757, "step": 1482 }, { "epoch": 0.6711925775062231, "grad_norm": 0.5296614170074463, "learning_rate": 1.5309035497147684e-06, "loss": 0.9659, "step": 1483 }, { "epoch": 0.6716451685901788, "grad_norm": 0.496402770280838, "learning_rate": 1.4960001336235875e-06, "loss": 0.8881, "step": 1484 }, { "epoch": 0.6720977596741344, "grad_norm": 0.43575870990753174, "learning_rate": 1.4614962060194304e-06, "loss": 0.6084, "step": 1485 }, { "epoch": 0.6725503507580901, "grad_norm": 0.6141435503959656, "learning_rate": 1.4273919068349184e-06, "loss": 0.8805, "step": 1486 }, { "epoch": 0.6730029418420457, "grad_norm": 0.5889500975608826, "learning_rate": 1.3936873743819357e-06, "loss": 1.056, "step": 1487 }, { "epoch": 0.6734555329260014, "grad_norm": 0.4447315037250519, "learning_rate": 1.3603827453511186e-06, "loss": 0.6903, "step": 1488 }, { "epoch": 0.673908124009957, "grad_norm": 0.5051842331886292, "learning_rate": 1.3274781548112458e-06, "loss": 0.7553, "step": 1489 }, { "epoch": 0.6743607150939126, "grad_norm": 0.5147336721420288, "learning_rate": 1.2949737362087156e-06, "loss": 0.8062, "step": 1490 }, { "epoch": 0.6748133061778683, "grad_norm": 0.5651899576187134, "learning_rate": 1.2628696213670355e-06, "loss": 0.9131, "step": 1491 }, { "epoch": 0.675265897261824, "grad_norm": 0.569429337978363, "learning_rate": 1.231165940486234e-06, "loss": 0.9232, "step": 1492 }, { "epoch": 0.6757184883457796, "grad_norm": 0.5901250839233398, "learning_rate": 1.1998628221423614e-06, "loss": 1.138, "step": 1493 }, { "epoch": 0.6761710794297352, "grad_norm": 0.47215431928634644, "learning_rate": 1.1689603932869665e-06, "loss": 0.7919, "step": 1494 }, { "epoch": 0.6766236705136909, "grad_norm": 0.5352398753166199, "learning_rate": 1.1384587792465872e-06, "loss": 0.6431, "step": 1495 }, { "epoch": 0.6770762615976466, "grad_norm": 0.50892174243927, "learning_rate": 1.1083581037222068e-06, "loss": 0.7254, "step": 1496 }, { "epoch": 0.6775288526816021, "grad_norm": 0.5627516508102417, "learning_rate": 1.0786584887888307e-06, "loss": 0.949, "step": 1497 }, { "epoch": 0.6779814437655578, "grad_norm": 0.4339214265346527, "learning_rate": 1.0493600548948878e-06, "loss": 0.6264, "step": 1498 }, { "epoch": 0.6784340348495135, "grad_norm": 0.42282262444496155, "learning_rate": 1.020462920861831e-06, "loss": 0.5289, "step": 1499 }, { "epoch": 0.6788866259334692, "grad_norm": 0.46268293261528015, "learning_rate": 9.919672038835925e-07, "loss": 0.6008, "step": 1500 }, { "epoch": 0.6793392170174247, "grad_norm": 0.5364608764648438, "learning_rate": 9.638730195261625e-07, "loss": 0.6824, "step": 1501 }, { "epoch": 0.6797918081013804, "grad_norm": 0.5147013664245605, "learning_rate": 9.36180481727067e-07, "loss": 0.8468, "step": 1502 }, { "epoch": 0.6802443991853361, "grad_norm": 0.5776438117027283, "learning_rate": 9.088897027949462e-07, "loss": 0.7729, "step": 1503 }, { "epoch": 0.6806969902692916, "grad_norm": 0.47045034170150757, "learning_rate": 8.820007934090879e-07, "loss": 0.8525, "step": 1504 }, { "epoch": 0.6811495813532473, "grad_norm": 0.554664134979248, "learning_rate": 8.555138626189618e-07, "loss": 0.8944, "step": 1505 }, { "epoch": 0.681602172437203, "grad_norm": 0.5782079696655273, "learning_rate": 8.294290178437969e-07, "loss": 0.8888, "step": 1506 }, { "epoch": 0.6820547635211587, "grad_norm": 0.5328008532524109, "learning_rate": 8.037463648721488e-07, "loss": 0.8906, "step": 1507 }, { "epoch": 0.6825073546051142, "grad_norm": 0.5287159085273743, "learning_rate": 7.78466007861467e-07, "loss": 0.757, "step": 1508 }, { "epoch": 0.6829599456890699, "grad_norm": 0.5075718760490417, "learning_rate": 7.535880493376279e-07, "loss": 0.8139, "step": 1509 }, { "epoch": 0.6834125367730256, "grad_norm": 0.5284056067466736, "learning_rate": 7.291125901946027e-07, "loss": 0.9401, "step": 1510 }, { "epoch": 0.6838651278569812, "grad_norm": 0.7645094394683838, "learning_rate": 7.050397296939792e-07, "loss": 0.7314, "step": 1511 }, { "epoch": 0.6843177189409368, "grad_norm": 0.602204442024231, "learning_rate": 6.813695654645957e-07, "loss": 1.0208, "step": 1512 }, { "epoch": 0.6847703100248925, "grad_norm": 0.5655729174613953, "learning_rate": 6.581021935021304e-07, "loss": 1.0402, "step": 1513 }, { "epoch": 0.6852229011088482, "grad_norm": 0.6173549294471741, "learning_rate": 6.352377081687011e-07, "loss": 1.0761, "step": 1514 }, { "epoch": 0.6856754921928038, "grad_norm": 0.6081221103668213, "learning_rate": 6.127762021925221e-07, "loss": 1.0481, "step": 1515 }, { "epoch": 0.6861280832767594, "grad_norm": 0.6282268166542053, "learning_rate": 5.907177666674812e-07, "loss": 1.0363, "step": 1516 }, { "epoch": 0.6865806743607151, "grad_norm": 0.5493825674057007, "learning_rate": 5.690624910527964e-07, "loss": 0.8727, "step": 1517 }, { "epoch": 0.6870332654446707, "grad_norm": 0.5676363706588745, "learning_rate": 5.478104631726711e-07, "loss": 0.8488, "step": 1518 }, { "epoch": 0.6874858565286264, "grad_norm": 0.49481111764907837, "learning_rate": 5.269617692158613e-07, "loss": 0.6527, "step": 1519 }, { "epoch": 0.687938447612582, "grad_norm": 0.4594171643257141, "learning_rate": 5.065164937354428e-07, "loss": 0.8464, "step": 1520 }, { "epoch": 0.6883910386965377, "grad_norm": 0.5010755062103271, "learning_rate": 4.864747196483554e-07, "loss": 0.6373, "step": 1521 }, { "epoch": 0.6888436297804933, "grad_norm": 0.5262997150421143, "learning_rate": 4.668365282351372e-07, "loss": 0.7576, "step": 1522 }, { "epoch": 0.689296220864449, "grad_norm": 0.4877280592918396, "learning_rate": 4.476019991395908e-07, "loss": 0.7472, "step": 1523 }, { "epoch": 0.6897488119484046, "grad_norm": 0.5093807578086853, "learning_rate": 4.2877121036840606e-07, "loss": 0.7657, "step": 1524 }, { "epoch": 0.6902014030323602, "grad_norm": 0.5577916502952576, "learning_rate": 4.103442382909051e-07, "loss": 0.8773, "step": 1525 }, { "epoch": 0.6906539941163159, "grad_norm": 0.5437626242637634, "learning_rate": 3.923211576387087e-07, "loss": 0.8471, "step": 1526 }, { "epoch": 0.6911065852002716, "grad_norm": 0.4851123094558716, "learning_rate": 3.74702041505437e-07, "loss": 0.7631, "step": 1527 }, { "epoch": 0.6915591762842273, "grad_norm": 0.511060893535614, "learning_rate": 3.5748696134639825e-07, "loss": 0.7885, "step": 1528 }, { "epoch": 0.6920117673681828, "grad_norm": 0.44935715198516846, "learning_rate": 3.406759869783005e-07, "loss": 0.5878, "step": 1529 }, { "epoch": 0.6924643584521385, "grad_norm": 0.5244868397712708, "learning_rate": 3.2426918657900704e-07, "loss": 0.8548, "step": 1530 }, { "epoch": 0.6929169495360942, "grad_norm": 0.467731237411499, "learning_rate": 3.0826662668720364e-07, "loss": 0.8462, "step": 1531 }, { "epoch": 0.6933695406200497, "grad_norm": 0.49825143814086914, "learning_rate": 2.9266837220217613e-07, "loss": 0.6598, "step": 1532 }, { "epoch": 0.6938221317040054, "grad_norm": 0.48928219079971313, "learning_rate": 2.7747448638352215e-07, "loss": 0.7955, "step": 1533 }, { "epoch": 0.6942747227879611, "grad_norm": 0.5015487670898438, "learning_rate": 2.6268503085089547e-07, "loss": 0.9172, "step": 1534 }, { "epoch": 0.6947273138719168, "grad_norm": 0.44026800990104675, "learning_rate": 2.4830006558373973e-07, "loss": 0.6144, "step": 1535 }, { "epoch": 0.6951799049558723, "grad_norm": 0.5041724443435669, "learning_rate": 2.343196489211219e-07, "loss": 0.6941, "step": 1536 }, { "epoch": 0.695632496039828, "grad_norm": 0.5276736617088318, "learning_rate": 2.2074383756137686e-07, "loss": 0.9376, "step": 1537 }, { "epoch": 0.6960850871237837, "grad_norm": 0.539641261100769, "learning_rate": 2.0757268656198537e-07, "loss": 0.8445, "step": 1538 }, { "epoch": 0.6965376782077393, "grad_norm": 0.6825346350669861, "learning_rate": 1.948062493392744e-07, "loss": 1.1659, "step": 1539 }, { "epoch": 0.6969902692916949, "grad_norm": 0.5396426320075989, "learning_rate": 1.824445776682504e-07, "loss": 0.8319, "step": 1540 }, { "epoch": 0.6974428603756506, "grad_norm": 0.47626543045043945, "learning_rate": 1.7048772168237748e-07, "loss": 0.6278, "step": 1541 }, { "epoch": 0.6978954514596063, "grad_norm": 0.5386638641357422, "learning_rate": 1.5893572987333293e-07, "loss": 0.8372, "step": 1542 }, { "epoch": 0.6983480425435619, "grad_norm": 0.47005295753479004, "learning_rate": 1.477886490908742e-07, "loss": 0.7101, "step": 1543 }, { "epoch": 0.6988006336275175, "grad_norm": 0.48174676299095154, "learning_rate": 1.3704652454261668e-07, "loss": 0.6952, "step": 1544 }, { "epoch": 0.6992532247114732, "grad_norm": 0.46316617727279663, "learning_rate": 1.2670939979384512e-07, "loss": 0.7623, "step": 1545 }, { "epoch": 0.6997058157954288, "grad_norm": 0.6071258783340454, "learning_rate": 1.1677731676733584e-07, "loss": 1.0641, "step": 1546 }, { "epoch": 0.7001584068793845, "grad_norm": 0.4970654249191284, "learning_rate": 1.0725031574323474e-07, "loss": 0.7059, "step": 1547 }, { "epoch": 0.7006109979633401, "grad_norm": 0.4778405725955963, "learning_rate": 9.8128435358813e-08, "loss": 0.7019, "step": 1548 }, { "epoch": 0.7010635890472958, "grad_norm": 0.653716504573822, "learning_rate": 8.941171260835601e-08, "loss": 1.1438, "step": 1549 }, { "epoch": 0.7015161801312514, "grad_norm": 0.480570524930954, "learning_rate": 8.110018284304133e-08, "loss": 0.7621, "step": 1550 }, { "epoch": 0.7019687712152071, "grad_norm": 0.5458505153656006, "learning_rate": 7.319387977072766e-08, "loss": 1.0065, "step": 1551 }, { "epoch": 0.7024213622991627, "grad_norm": 0.4744986593723297, "learning_rate": 6.569283545587724e-08, "loss": 0.7546, "step": 1552 }, { "epoch": 0.7028739533831183, "grad_norm": 0.5370805859565735, "learning_rate": 5.8597080319389156e-08, "loss": 1.1206, "step": 1553 }, { "epoch": 0.703326544467074, "grad_norm": 0.49838805198669434, "learning_rate": 5.190664313851068e-08, "loss": 0.69, "step": 1554 }, { "epoch": 0.7037791355510297, "grad_norm": 0.5323602557182312, "learning_rate": 4.562155104665955e-08, "loss": 0.7611, "step": 1555 }, { "epoch": 0.7042317266349853, "grad_norm": 0.5184367299079895, "learning_rate": 3.9741829533401775e-08, "loss": 0.9261, "step": 1556 }, { "epoch": 0.7046843177189409, "grad_norm": 0.5408939719200134, "learning_rate": 3.4267502444274015e-08, "loss": 0.7767, "step": 1557 }, { "epoch": 0.7051369088028966, "grad_norm": 0.49419355392456055, "learning_rate": 2.9198591980705848e-08, "loss": 0.8236, "step": 1558 }, { "epoch": 0.7055894998868523, "grad_norm": 0.5697168111801147, "learning_rate": 2.4535118699953176e-08, "loss": 1.0097, "step": 1559 }, { "epoch": 0.7060420909708078, "grad_norm": 0.6209654808044434, "learning_rate": 2.0277101514987184e-08, "loss": 1.1677, "step": 1560 }, { "epoch": 0.7064946820547635, "grad_norm": 0.4766218364238739, "learning_rate": 1.642455769444995e-08, "loss": 0.6273, "step": 1561 }, { "epoch": 0.7069472731387192, "grad_norm": 0.5815181136131287, "learning_rate": 1.2977502862532297e-08, "loss": 0.8964, "step": 1562 }, { "epoch": 0.7073998642226749, "grad_norm": 0.5831205248832703, "learning_rate": 9.935950998962717e-09, "loss": 0.8613, "step": 1563 }, { "epoch": 0.7078524553066304, "grad_norm": 0.4257395565509796, "learning_rate": 7.2999144389296335e-09, "loss": 0.6569, "step": 1564 }, { "epoch": 0.7083050463905861, "grad_norm": 0.6523287296295166, "learning_rate": 5.069403873025902e-09, "loss": 1.023, "step": 1565 }, { "epoch": 0.7087576374745418, "grad_norm": 0.5527802109718323, "learning_rate": 3.244428347204398e-09, "loss": 0.8362, "step": 1566 }, { "epoch": 0.7092102285584974, "grad_norm": 0.42681992053985596, "learning_rate": 1.8249952627669154e-09, "loss": 0.5779, "step": 1567 }, { "epoch": 0.709662819642453, "grad_norm": 0.489521861076355, "learning_rate": 8.111103762975524e-10, "loss": 0.8613, "step": 1568 }, { "epoch": 0.7101154107264087, "grad_norm": 0.55333411693573, "learning_rate": 2.027777996738145e-10, "loss": 1.0663, "step": 1569 }, { "epoch": 0.7105680018103643, "grad_norm": 0.5273703336715698, "learning_rate": 0.0, "loss": 0.7856, "step": 1570 } ], "logging_steps": 1, "max_steps": 1570, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 393, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8612606783861555e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }