{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9997117878758766, "eval_steps": 500, "global_step": 7806, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007685656643289462, "grad_norm": 46.33760446804886, "learning_rate": 2.560819462227913e-08, "loss": 1.2444, "step": 2 }, { "epoch": 0.0015371313286578923, "grad_norm": 99.00918408435471, "learning_rate": 5.121638924455826e-08, "loss": 1.2778, "step": 4 }, { "epoch": 0.0023056969929868385, "grad_norm": 278.841985179111, "learning_rate": 7.68245838668374e-08, "loss": 1.2635, "step": 6 }, { "epoch": 0.0030742626573157846, "grad_norm": 130.6324503245223, "learning_rate": 1.0243277848911652e-07, "loss": 1.1947, "step": 8 }, { "epoch": 0.0038428283216447303, "grad_norm": 99.83521645646486, "learning_rate": 1.2804097311139564e-07, "loss": 1.2812, "step": 10 }, { "epoch": 0.004611393985973677, "grad_norm": 26.994304734641553, "learning_rate": 1.536491677336748e-07, "loss": 1.2019, "step": 12 }, { "epoch": 0.005379959650302623, "grad_norm": 56.60543598211865, "learning_rate": 1.7925736235595394e-07, "loss": 1.2043, "step": 14 }, { "epoch": 0.006148525314631569, "grad_norm": 76.12809456819475, "learning_rate": 2.0486555697823304e-07, "loss": 1.1762, "step": 16 }, { "epoch": 0.006917090978960515, "grad_norm": 112.78301429406974, "learning_rate": 2.304737516005122e-07, "loss": 1.26, "step": 18 }, { "epoch": 0.007685656643289461, "grad_norm": 229.46495896037206, "learning_rate": 2.560819462227913e-07, "loss": 1.2362, "step": 20 }, { "epoch": 0.008454222307618407, "grad_norm": 159.394719450483, "learning_rate": 2.8169014084507043e-07, "loss": 1.2009, "step": 22 }, { "epoch": 0.009222787971947354, "grad_norm": 64.7957161714447, "learning_rate": 3.072983354673496e-07, "loss": 1.1783, "step": 24 }, { "epoch": 0.009991353636276299, "grad_norm": 40.46403062047373, "learning_rate": 3.3290653008962873e-07, "loss": 1.1725, "step": 26 }, { "epoch": 0.010759919300605245, "grad_norm": 92.03152810292384, "learning_rate": 3.585147247119079e-07, "loss": 1.134, "step": 28 }, { "epoch": 0.011528484964934192, "grad_norm": 20.150498784472134, "learning_rate": 3.841229193341869e-07, "loss": 1.1751, "step": 30 }, { "epoch": 0.012297050629263138, "grad_norm": 18.76023416311018, "learning_rate": 4.097311139564661e-07, "loss": 1.0067, "step": 32 }, { "epoch": 0.013065616293592083, "grad_norm": 53.46254740639248, "learning_rate": 4.353393085787452e-07, "loss": 1.0379, "step": 34 }, { "epoch": 0.01383418195792103, "grad_norm": 186.29397816771157, "learning_rate": 4.609475032010244e-07, "loss": 1.0321, "step": 36 }, { "epoch": 0.014602747622249976, "grad_norm": 25.88362620566277, "learning_rate": 4.865556978233036e-07, "loss": 0.965, "step": 38 }, { "epoch": 0.015371313286578921, "grad_norm": 30.41217627723473, "learning_rate": 5.121638924455826e-07, "loss": 0.8547, "step": 40 }, { "epoch": 0.01613987895090787, "grad_norm": 89.8253565825706, "learning_rate": 5.377720870678618e-07, "loss": 0.7236, "step": 42 }, { "epoch": 0.016908444615236815, "grad_norm": 202.0087192398533, "learning_rate": 5.633802816901409e-07, "loss": 0.6885, "step": 44 }, { "epoch": 0.01767701027956576, "grad_norm": 48.82156064197885, "learning_rate": 5.889884763124201e-07, "loss": 0.6668, "step": 46 }, { "epoch": 0.018445575943894708, "grad_norm": 62.89166750847767, "learning_rate": 6.145966709346992e-07, "loss": 0.6492, "step": 48 }, { "epoch": 0.019214141608223653, "grad_norm": 115.69089469175242, "learning_rate": 6.402048655569783e-07, "loss": 0.6003, "step": 50 }, { "epoch": 0.019982707272552597, "grad_norm": 51.61618021030156, "learning_rate": 6.658130601792575e-07, "loss": 0.568, "step": 52 }, { "epoch": 0.020751272936881546, "grad_norm": 37.81069880056541, "learning_rate": 6.914212548015366e-07, "loss": 0.5168, "step": 54 }, { "epoch": 0.02151983860121049, "grad_norm": 181.43202282474087, "learning_rate": 7.170294494238158e-07, "loss": 0.4823, "step": 56 }, { "epoch": 0.022288404265539435, "grad_norm": 18.1856641954327, "learning_rate": 7.426376440460949e-07, "loss": 0.482, "step": 58 }, { "epoch": 0.023056969929868384, "grad_norm": 103.75984178526056, "learning_rate": 7.682458386683739e-07, "loss": 0.4577, "step": 60 }, { "epoch": 0.02382553559419733, "grad_norm": 76.84176444216983, "learning_rate": 7.93854033290653e-07, "loss": 0.4511, "step": 62 }, { "epoch": 0.024594101258526277, "grad_norm": 76.57556116884466, "learning_rate": 8.194622279129321e-07, "loss": 0.4657, "step": 64 }, { "epoch": 0.025362666922855222, "grad_norm": 58.016983864584, "learning_rate": 8.450704225352114e-07, "loss": 0.4119, "step": 66 }, { "epoch": 0.026131232587184167, "grad_norm": 20.418420514400733, "learning_rate": 8.706786171574904e-07, "loss": 0.408, "step": 68 }, { "epoch": 0.026899798251513115, "grad_norm": 18.901942801466426, "learning_rate": 8.962868117797697e-07, "loss": 0.3535, "step": 70 }, { "epoch": 0.02766836391584206, "grad_norm": 34.40200360130968, "learning_rate": 9.218950064020487e-07, "loss": 0.3787, "step": 72 }, { "epoch": 0.028436929580171005, "grad_norm": 60.94963075839824, "learning_rate": 9.47503201024328e-07, "loss": 0.3996, "step": 74 }, { "epoch": 0.029205495244499953, "grad_norm": 49.14575111031186, "learning_rate": 9.731113956466072e-07, "loss": 0.3641, "step": 76 }, { "epoch": 0.029974060908828898, "grad_norm": 22.703245272597435, "learning_rate": 9.987195902688861e-07, "loss": 0.3611, "step": 78 }, { "epoch": 0.030742626573157843, "grad_norm": 56.4478223538861, "learning_rate": 1.0243277848911651e-06, "loss": 0.3509, "step": 80 }, { "epoch": 0.03151119223748679, "grad_norm": 50.056515073419874, "learning_rate": 1.0499359795134443e-06, "loss": 0.3597, "step": 82 }, { "epoch": 0.03227975790181574, "grad_norm": 29.104363728108776, "learning_rate": 1.0755441741357235e-06, "loss": 0.3626, "step": 84 }, { "epoch": 0.033048323566144684, "grad_norm": 10.58596977231829, "learning_rate": 1.1011523687580027e-06, "loss": 0.3564, "step": 86 }, { "epoch": 0.03381688923047363, "grad_norm": 36.51034085604627, "learning_rate": 1.1267605633802817e-06, "loss": 0.3419, "step": 88 }, { "epoch": 0.034585454894802574, "grad_norm": 18.466602538538957, "learning_rate": 1.152368758002561e-06, "loss": 0.3448, "step": 90 }, { "epoch": 0.03535402055913152, "grad_norm": 59.523255850573314, "learning_rate": 1.1779769526248401e-06, "loss": 0.3146, "step": 92 }, { "epoch": 0.036122586223460464, "grad_norm": 14.865723956726159, "learning_rate": 1.2035851472471191e-06, "loss": 0.3532, "step": 94 }, { "epoch": 0.036891151887789415, "grad_norm": 17.716236557321658, "learning_rate": 1.2291933418693983e-06, "loss": 0.3388, "step": 96 }, { "epoch": 0.03765971755211836, "grad_norm": 14.310115347082816, "learning_rate": 1.2548015364916773e-06, "loss": 0.3198, "step": 98 }, { "epoch": 0.038428283216447305, "grad_norm": 22.380971998056655, "learning_rate": 1.2804097311139565e-06, "loss": 0.3123, "step": 100 }, { "epoch": 0.03919684888077625, "grad_norm": 18.324746830977965, "learning_rate": 1.3060179257362357e-06, "loss": 0.3334, "step": 102 }, { "epoch": 0.039965414545105195, "grad_norm": 42.805326289798906, "learning_rate": 1.331626120358515e-06, "loss": 0.3332, "step": 104 }, { "epoch": 0.04073398020943415, "grad_norm": 7.2589484837044855, "learning_rate": 1.357234314980794e-06, "loss": 0.2914, "step": 106 }, { "epoch": 0.04150254587376309, "grad_norm": 30.232749091643036, "learning_rate": 1.3828425096030731e-06, "loss": 0.3049, "step": 108 }, { "epoch": 0.042271111538092036, "grad_norm": 18.914452521420156, "learning_rate": 1.4084507042253523e-06, "loss": 0.3039, "step": 110 }, { "epoch": 0.04303967720242098, "grad_norm": 52.089845235790506, "learning_rate": 1.4340588988476315e-06, "loss": 0.3164, "step": 112 }, { "epoch": 0.043808242866749926, "grad_norm": 82.05236455147292, "learning_rate": 1.4596670934699105e-06, "loss": 0.3181, "step": 114 }, { "epoch": 0.04457680853107887, "grad_norm": 65.23236879823888, "learning_rate": 1.4852752880921897e-06, "loss": 0.3119, "step": 116 }, { "epoch": 0.04534537419540782, "grad_norm": 11.770508694194396, "learning_rate": 1.510883482714469e-06, "loss": 0.3202, "step": 118 }, { "epoch": 0.04611393985973677, "grad_norm": 7.330090852002793, "learning_rate": 1.5364916773367477e-06, "loss": 0.2745, "step": 120 }, { "epoch": 0.04688250552406571, "grad_norm": 4.286643357818288, "learning_rate": 1.562099871959027e-06, "loss": 0.3032, "step": 122 }, { "epoch": 0.04765107118839466, "grad_norm": 52.62910552257363, "learning_rate": 1.587708066581306e-06, "loss": 0.3025, "step": 124 }, { "epoch": 0.0484196368527236, "grad_norm": 10.319262334358426, "learning_rate": 1.613316261203585e-06, "loss": 0.32, "step": 126 }, { "epoch": 0.049188202517052554, "grad_norm": 24.088667751970913, "learning_rate": 1.6389244558258643e-06, "loss": 0.3146, "step": 128 }, { "epoch": 0.0499567681813815, "grad_norm": 25.790528746745665, "learning_rate": 1.6645326504481435e-06, "loss": 0.3202, "step": 130 }, { "epoch": 0.050725333845710444, "grad_norm": 34.77446800937756, "learning_rate": 1.6901408450704227e-06, "loss": 0.2811, "step": 132 }, { "epoch": 0.05149389951003939, "grad_norm": 27.69815761019934, "learning_rate": 1.7157490396927017e-06, "loss": 0.3223, "step": 134 }, { "epoch": 0.05226246517436833, "grad_norm": 10.699141603333432, "learning_rate": 1.741357234314981e-06, "loss": 0.3034, "step": 136 }, { "epoch": 0.05303103083869728, "grad_norm": 34.05292524966218, "learning_rate": 1.76696542893726e-06, "loss": 0.2923, "step": 138 }, { "epoch": 0.05379959650302623, "grad_norm": 13.580271860479359, "learning_rate": 1.7925736235595393e-06, "loss": 0.3042, "step": 140 }, { "epoch": 0.054568162167355175, "grad_norm": 17.4296343438217, "learning_rate": 1.8181818181818183e-06, "loss": 0.3225, "step": 142 }, { "epoch": 0.05533672783168412, "grad_norm": 8.122944925855485, "learning_rate": 1.8437900128040975e-06, "loss": 0.2878, "step": 144 }, { "epoch": 0.056105293496013064, "grad_norm": 21.563755319242443, "learning_rate": 1.8693982074263767e-06, "loss": 0.2774, "step": 146 }, { "epoch": 0.05687385916034201, "grad_norm": 21.44682395047032, "learning_rate": 1.895006402048656e-06, "loss": 0.3038, "step": 148 }, { "epoch": 0.05764242482467096, "grad_norm": 22.61939300619328, "learning_rate": 1.920614596670935e-06, "loss": 0.2927, "step": 150 }, { "epoch": 0.058410990488999906, "grad_norm": 6.78447995455534, "learning_rate": 1.9462227912932143e-06, "loss": 0.2892, "step": 152 }, { "epoch": 0.05917955615332885, "grad_norm": 3.5561830907263037, "learning_rate": 1.971830985915493e-06, "loss": 0.2795, "step": 154 }, { "epoch": 0.059948121817657796, "grad_norm": 19.011423538158418, "learning_rate": 1.9974391805377723e-06, "loss": 0.2948, "step": 156 }, { "epoch": 0.06071668748198674, "grad_norm": 18.2031545321998, "learning_rate": 2.0230473751600515e-06, "loss": 0.2771, "step": 158 }, { "epoch": 0.061485253146315685, "grad_norm": 7.405269794453134, "learning_rate": 2.0486555697823303e-06, "loss": 0.2751, "step": 160 }, { "epoch": 0.06225381881064464, "grad_norm": 7.0671939800121235, "learning_rate": 2.0742637644046095e-06, "loss": 0.2748, "step": 162 }, { "epoch": 0.06302238447497358, "grad_norm": 5.226630733557134, "learning_rate": 2.0998719590268887e-06, "loss": 0.2723, "step": 164 }, { "epoch": 0.06379095013930253, "grad_norm": 21.994420461511652, "learning_rate": 2.125480153649168e-06, "loss": 0.2553, "step": 166 }, { "epoch": 0.06455951580363148, "grad_norm": 8.814928713265397, "learning_rate": 2.151088348271447e-06, "loss": 0.2578, "step": 168 }, { "epoch": 0.06532808146796042, "grad_norm": 7.6210049534261035, "learning_rate": 2.1766965428937263e-06, "loss": 0.2687, "step": 170 }, { "epoch": 0.06609664713228937, "grad_norm": 7.578605236731984, "learning_rate": 2.2023047375160055e-06, "loss": 0.2823, "step": 172 }, { "epoch": 0.0668652127966183, "grad_norm": 14.778431947255093, "learning_rate": 2.2279129321382843e-06, "loss": 0.2519, "step": 174 }, { "epoch": 0.06763377846094726, "grad_norm": 25.739717706809145, "learning_rate": 2.2535211267605635e-06, "loss": 0.2786, "step": 176 }, { "epoch": 0.06840234412527621, "grad_norm": 10.61691952004806, "learning_rate": 2.2791293213828427e-06, "loss": 0.2805, "step": 178 }, { "epoch": 0.06917090978960515, "grad_norm": 17.775179831393846, "learning_rate": 2.304737516005122e-06, "loss": 0.2794, "step": 180 }, { "epoch": 0.0699394754539341, "grad_norm": 15.371623626609324, "learning_rate": 2.330345710627401e-06, "loss": 0.2628, "step": 182 }, { "epoch": 0.07070804111826304, "grad_norm": 4.91851606157634, "learning_rate": 2.3559539052496803e-06, "loss": 0.2542, "step": 184 }, { "epoch": 0.07147660678259199, "grad_norm": 18.8763123919504, "learning_rate": 2.3815620998719595e-06, "loss": 0.2485, "step": 186 }, { "epoch": 0.07224517244692093, "grad_norm": 5.301159487030119, "learning_rate": 2.4071702944942383e-06, "loss": 0.2546, "step": 188 }, { "epoch": 0.07301373811124988, "grad_norm": 3.2125271107004547, "learning_rate": 2.4327784891165175e-06, "loss": 0.2369, "step": 190 }, { "epoch": 0.07378230377557883, "grad_norm": 3.515920910783621, "learning_rate": 2.4583866837387967e-06, "loss": 0.2521, "step": 192 }, { "epoch": 0.07455086943990777, "grad_norm": 6.648596291375742, "learning_rate": 2.483994878361076e-06, "loss": 0.2743, "step": 194 }, { "epoch": 0.07531943510423672, "grad_norm": 3.3692024930881495, "learning_rate": 2.5096030729833546e-06, "loss": 0.229, "step": 196 }, { "epoch": 0.07608800076856566, "grad_norm": 3.5841651803338537, "learning_rate": 2.535211267605634e-06, "loss": 0.2637, "step": 198 }, { "epoch": 0.07685656643289461, "grad_norm": 3.054809995334705, "learning_rate": 2.560819462227913e-06, "loss": 0.2361, "step": 200 }, { "epoch": 0.07762513209722356, "grad_norm": 4.364940101217546, "learning_rate": 2.5864276568501922e-06, "loss": 0.2368, "step": 202 }, { "epoch": 0.0783936977615525, "grad_norm": 2.9332591848215994, "learning_rate": 2.6120358514724715e-06, "loss": 0.2204, "step": 204 }, { "epoch": 0.07916226342588145, "grad_norm": 4.015420697191092, "learning_rate": 2.6376440460947507e-06, "loss": 0.2552, "step": 206 }, { "epoch": 0.07993082909021039, "grad_norm": 3.4118515140127768, "learning_rate": 2.66325224071703e-06, "loss": 0.2458, "step": 208 }, { "epoch": 0.08069939475453934, "grad_norm": 3.5508039267022884, "learning_rate": 2.688860435339309e-06, "loss": 0.2335, "step": 210 }, { "epoch": 0.0814679604188683, "grad_norm": 5.007267781607216, "learning_rate": 2.714468629961588e-06, "loss": 0.2326, "step": 212 }, { "epoch": 0.08223652608319723, "grad_norm": 4.675835164788218, "learning_rate": 2.740076824583867e-06, "loss": 0.2531, "step": 214 }, { "epoch": 0.08300509174752618, "grad_norm": 4.655849365654854, "learning_rate": 2.7656850192061462e-06, "loss": 0.2362, "step": 216 }, { "epoch": 0.08377365741185512, "grad_norm": 3.917294824835895, "learning_rate": 2.7912932138284254e-06, "loss": 0.2129, "step": 218 }, { "epoch": 0.08454222307618407, "grad_norm": 7.24921337165743, "learning_rate": 2.8169014084507046e-06, "loss": 0.2294, "step": 220 }, { "epoch": 0.08531078874051302, "grad_norm": 6.921955166636701, "learning_rate": 2.842509603072984e-06, "loss": 0.254, "step": 222 }, { "epoch": 0.08607935440484196, "grad_norm": 4.2213157555866365, "learning_rate": 2.868117797695263e-06, "loss": 0.2522, "step": 224 }, { "epoch": 0.08684792006917091, "grad_norm": 3.9840395695402284, "learning_rate": 2.893725992317542e-06, "loss": 0.2417, "step": 226 }, { "epoch": 0.08761648573349985, "grad_norm": 3.9975525395065183, "learning_rate": 2.919334186939821e-06, "loss": 0.2161, "step": 228 }, { "epoch": 0.0883850513978288, "grad_norm": 3.7308986325531084, "learning_rate": 2.9449423815621002e-06, "loss": 0.2372, "step": 230 }, { "epoch": 0.08915361706215774, "grad_norm": 11.719833565584912, "learning_rate": 2.9705505761843794e-06, "loss": 0.2331, "step": 232 }, { "epoch": 0.0899221827264867, "grad_norm": 11.808354164840601, "learning_rate": 2.9961587708066586e-06, "loss": 0.2176, "step": 234 }, { "epoch": 0.09069074839081565, "grad_norm": 13.172308486984463, "learning_rate": 3.021766965428938e-06, "loss": 0.2507, "step": 236 }, { "epoch": 0.09145931405514458, "grad_norm": 5.042954153443494, "learning_rate": 3.047375160051216e-06, "loss": 0.2291, "step": 238 }, { "epoch": 0.09222787971947354, "grad_norm": 3.3903492106223156, "learning_rate": 3.0729833546734954e-06, "loss": 0.237, "step": 240 }, { "epoch": 0.09299644538380247, "grad_norm": 4.974360581535012, "learning_rate": 3.0985915492957746e-06, "loss": 0.221, "step": 242 }, { "epoch": 0.09376501104813142, "grad_norm": 9.574667517835657, "learning_rate": 3.124199743918054e-06, "loss": 0.2256, "step": 244 }, { "epoch": 0.09453357671246038, "grad_norm": 6.097974673455306, "learning_rate": 3.149807938540333e-06, "loss": 0.2252, "step": 246 }, { "epoch": 0.09530214237678931, "grad_norm": 2.966520861878796, "learning_rate": 3.175416133162612e-06, "loss": 0.2137, "step": 248 }, { "epoch": 0.09607070804111827, "grad_norm": 2.8715917051815523, "learning_rate": 3.2010243277848914e-06, "loss": 0.2308, "step": 250 }, { "epoch": 0.0968392737054472, "grad_norm": 3.1033990511581733, "learning_rate": 3.22663252240717e-06, "loss": 0.2279, "step": 252 }, { "epoch": 0.09760783936977616, "grad_norm": 3.446155462824691, "learning_rate": 3.2522407170294494e-06, "loss": 0.2367, "step": 254 }, { "epoch": 0.09837640503410511, "grad_norm": 2.9266184156182793, "learning_rate": 3.2778489116517286e-06, "loss": 0.2193, "step": 256 }, { "epoch": 0.09914497069843405, "grad_norm": 2.925455168943199, "learning_rate": 3.303457106274008e-06, "loss": 0.2284, "step": 258 }, { "epoch": 0.099913536362763, "grad_norm": 3.5728455339347325, "learning_rate": 3.329065300896287e-06, "loss": 0.2103, "step": 260 }, { "epoch": 0.10068210202709194, "grad_norm": 4.221093381068644, "learning_rate": 3.354673495518566e-06, "loss": 0.2246, "step": 262 }, { "epoch": 0.10145066769142089, "grad_norm": 2.9632356409256424, "learning_rate": 3.3802816901408454e-06, "loss": 0.2346, "step": 264 }, { "epoch": 0.10221923335574983, "grad_norm": 2.8033401050013307, "learning_rate": 3.4058898847631246e-06, "loss": 0.2319, "step": 266 }, { "epoch": 0.10298779902007878, "grad_norm": 2.8665125380986467, "learning_rate": 3.4314980793854034e-06, "loss": 0.2298, "step": 268 }, { "epoch": 0.10375636468440773, "grad_norm": 2.5878367982420047, "learning_rate": 3.4571062740076826e-06, "loss": 0.2173, "step": 270 }, { "epoch": 0.10452493034873667, "grad_norm": 2.5055747015053336, "learning_rate": 3.482714468629962e-06, "loss": 0.2331, "step": 272 }, { "epoch": 0.10529349601306562, "grad_norm": 2.6044664633628547, "learning_rate": 3.508322663252241e-06, "loss": 0.2183, "step": 274 }, { "epoch": 0.10606206167739456, "grad_norm": 3.0595848685423794, "learning_rate": 3.53393085787452e-06, "loss": 0.2606, "step": 276 }, { "epoch": 0.10683062734172351, "grad_norm": 2.9336268094871465, "learning_rate": 3.5595390524967994e-06, "loss": 0.2448, "step": 278 }, { "epoch": 0.10759919300605246, "grad_norm": 2.4864538433915326, "learning_rate": 3.5851472471190786e-06, "loss": 0.2293, "step": 280 }, { "epoch": 0.1083677586703814, "grad_norm": 2.9732405504319495, "learning_rate": 3.610755441741358e-06, "loss": 0.2334, "step": 282 }, { "epoch": 0.10913632433471035, "grad_norm": 2.2374012733548274, "learning_rate": 3.6363636363636366e-06, "loss": 0.234, "step": 284 }, { "epoch": 0.10990488999903929, "grad_norm": 2.6626338027134144, "learning_rate": 3.6619718309859158e-06, "loss": 0.2374, "step": 286 }, { "epoch": 0.11067345566336824, "grad_norm": 2.828448121185741, "learning_rate": 3.687580025608195e-06, "loss": 0.2294, "step": 288 }, { "epoch": 0.11144202132769719, "grad_norm": 3.0404593247177796, "learning_rate": 3.713188220230474e-06, "loss": 0.2456, "step": 290 }, { "epoch": 0.11221058699202613, "grad_norm": 2.390604511770431, "learning_rate": 3.7387964148527534e-06, "loss": 0.2092, "step": 292 }, { "epoch": 0.11297915265635508, "grad_norm": 2.8009136526188185, "learning_rate": 3.7644046094750326e-06, "loss": 0.2325, "step": 294 }, { "epoch": 0.11374771832068402, "grad_norm": 3.103666072000564, "learning_rate": 3.790012804097312e-06, "loss": 0.2118, "step": 296 }, { "epoch": 0.11451628398501297, "grad_norm": 2.2410844360611475, "learning_rate": 3.815620998719591e-06, "loss": 0.2171, "step": 298 }, { "epoch": 0.11528484964934192, "grad_norm": 3.0380959937429437, "learning_rate": 3.84122919334187e-06, "loss": 0.238, "step": 300 }, { "epoch": 0.11605341531367086, "grad_norm": 8.011506402704828, "learning_rate": 3.866837387964149e-06, "loss": 0.2498, "step": 302 }, { "epoch": 0.11682198097799981, "grad_norm": 2.8285095764068546, "learning_rate": 3.892445582586429e-06, "loss": 0.2379, "step": 304 }, { "epoch": 0.11759054664232875, "grad_norm": 2.6414619840153635, "learning_rate": 3.9180537772087065e-06, "loss": 0.2262, "step": 306 }, { "epoch": 0.1183591123066577, "grad_norm": 3.3617004265256676, "learning_rate": 3.943661971830986e-06, "loss": 0.2189, "step": 308 }, { "epoch": 0.11912767797098664, "grad_norm": 2.6171321270603447, "learning_rate": 3.969270166453265e-06, "loss": 0.2354, "step": 310 }, { "epoch": 0.11989624363531559, "grad_norm": 2.5265082940689605, "learning_rate": 3.9948783610755446e-06, "loss": 0.2347, "step": 312 }, { "epoch": 0.12066480929964454, "grad_norm": 4.310377424246406, "learning_rate": 4.020486555697823e-06, "loss": 0.2173, "step": 314 }, { "epoch": 0.12143337496397348, "grad_norm": 3.33108638388813, "learning_rate": 4.046094750320103e-06, "loss": 0.2372, "step": 316 }, { "epoch": 0.12220194062830243, "grad_norm": 2.645227447839728, "learning_rate": 4.071702944942382e-06, "loss": 0.2376, "step": 318 }, { "epoch": 0.12297050629263137, "grad_norm": 2.350386448021577, "learning_rate": 4.0973111395646605e-06, "loss": 0.2192, "step": 320 }, { "epoch": 0.12373907195696032, "grad_norm": 2.485480210594397, "learning_rate": 4.12291933418694e-06, "loss": 0.2309, "step": 322 }, { "epoch": 0.12450763762128927, "grad_norm": 2.483522681244722, "learning_rate": 4.148527528809219e-06, "loss": 0.2372, "step": 324 }, { "epoch": 0.12527620328561823, "grad_norm": 2.3357815065562355, "learning_rate": 4.1741357234314986e-06, "loss": 0.23, "step": 326 }, { "epoch": 0.12604476894994715, "grad_norm": 2.613847062947084, "learning_rate": 4.199743918053777e-06, "loss": 0.2373, "step": 328 }, { "epoch": 0.1268133346142761, "grad_norm": 3.467846462423581, "learning_rate": 4.225352112676057e-06, "loss": 0.2154, "step": 330 }, { "epoch": 0.12758190027860505, "grad_norm": 2.453886912627419, "learning_rate": 4.250960307298336e-06, "loss": 0.2488, "step": 332 }, { "epoch": 0.128350465942934, "grad_norm": 2.4993670033701894, "learning_rate": 4.2765685019206145e-06, "loss": 0.1983, "step": 334 }, { "epoch": 0.12911903160726296, "grad_norm": 2.793092958475753, "learning_rate": 4.302176696542894e-06, "loss": 0.2137, "step": 336 }, { "epoch": 0.12988759727159188, "grad_norm": 2.3350196436584993, "learning_rate": 4.327784891165173e-06, "loss": 0.2494, "step": 338 }, { "epoch": 0.13065616293592083, "grad_norm": 2.3915721776764376, "learning_rate": 4.3533930857874526e-06, "loss": 0.2049, "step": 340 }, { "epoch": 0.13142472860024978, "grad_norm": 3.887494715792718, "learning_rate": 4.379001280409731e-06, "loss": 0.2288, "step": 342 }, { "epoch": 0.13219329426457874, "grad_norm": 3.75521134278938, "learning_rate": 4.404609475032011e-06, "loss": 0.2156, "step": 344 }, { "epoch": 0.1329618599289077, "grad_norm": 2.412791555507996, "learning_rate": 4.43021766965429e-06, "loss": 0.2418, "step": 346 }, { "epoch": 0.1337304255932366, "grad_norm": 2.518395112843478, "learning_rate": 4.4558258642765685e-06, "loss": 0.204, "step": 348 }, { "epoch": 0.13449899125756556, "grad_norm": 2.077327313582346, "learning_rate": 4.481434058898848e-06, "loss": 0.2289, "step": 350 }, { "epoch": 0.13526755692189452, "grad_norm": 4.103585870373693, "learning_rate": 4.507042253521127e-06, "loss": 0.1986, "step": 352 }, { "epoch": 0.13603612258622347, "grad_norm": 2.5096612416184407, "learning_rate": 4.5326504481434066e-06, "loss": 0.2217, "step": 354 }, { "epoch": 0.13680468825055242, "grad_norm": 2.4322865844998303, "learning_rate": 4.558258642765685e-06, "loss": 0.2038, "step": 356 }, { "epoch": 0.13757325391488134, "grad_norm": 2.43276733770992, "learning_rate": 4.583866837387965e-06, "loss": 0.2386, "step": 358 }, { "epoch": 0.1383418195792103, "grad_norm": 2.4354806247218623, "learning_rate": 4.609475032010244e-06, "loss": 0.2314, "step": 360 }, { "epoch": 0.13911038524353925, "grad_norm": 2.2584368701382136, "learning_rate": 4.635083226632523e-06, "loss": 0.238, "step": 362 }, { "epoch": 0.1398789509078682, "grad_norm": 2.818731287250581, "learning_rate": 4.660691421254802e-06, "loss": 0.2128, "step": 364 }, { "epoch": 0.14064751657219712, "grad_norm": 2.211892198107575, "learning_rate": 4.686299615877081e-06, "loss": 0.2258, "step": 366 }, { "epoch": 0.14141608223652608, "grad_norm": 2.3425369086966343, "learning_rate": 4.7119078104993605e-06, "loss": 0.2133, "step": 368 }, { "epoch": 0.14218464790085503, "grad_norm": 2.273504250152576, "learning_rate": 4.737516005121639e-06, "loss": 0.2167, "step": 370 }, { "epoch": 0.14295321356518398, "grad_norm": 2.288943038262385, "learning_rate": 4.763124199743919e-06, "loss": 0.2453, "step": 372 }, { "epoch": 0.14372177922951293, "grad_norm": 3.1052357577334155, "learning_rate": 4.788732394366197e-06, "loss": 0.2316, "step": 374 }, { "epoch": 0.14449034489384185, "grad_norm": 2.120098144512718, "learning_rate": 4.8143405889884765e-06, "loss": 0.2181, "step": 376 }, { "epoch": 0.1452589105581708, "grad_norm": 2.1347654859773537, "learning_rate": 4.839948783610755e-06, "loss": 0.2179, "step": 378 }, { "epoch": 0.14602747622249976, "grad_norm": 2.1998709774658445, "learning_rate": 4.865556978233035e-06, "loss": 0.2032, "step": 380 }, { "epoch": 0.1467960418868287, "grad_norm": 2.207761509875039, "learning_rate": 4.891165172855314e-06, "loss": 0.243, "step": 382 }, { "epoch": 0.14756460755115766, "grad_norm": 2.2980262641602147, "learning_rate": 4.916773367477593e-06, "loss": 0.2216, "step": 384 }, { "epoch": 0.14833317321548659, "grad_norm": 2.2127660134283222, "learning_rate": 4.942381562099872e-06, "loss": 0.2056, "step": 386 }, { "epoch": 0.14910173887981554, "grad_norm": 2.285792155460432, "learning_rate": 4.967989756722152e-06, "loss": 0.2205, "step": 388 }, { "epoch": 0.1498703045441445, "grad_norm": 2.2333018817420447, "learning_rate": 4.9935979513444305e-06, "loss": 0.2274, "step": 390 }, { "epoch": 0.15063887020847344, "grad_norm": 2.6989386989616264, "learning_rate": 5.019206145966709e-06, "loss": 0.2354, "step": 392 }, { "epoch": 0.1514074358728024, "grad_norm": 2.2271185955960764, "learning_rate": 5.04481434058899e-06, "loss": 0.2259, "step": 394 }, { "epoch": 0.15217600153713132, "grad_norm": 2.0484665140161673, "learning_rate": 5.070422535211268e-06, "loss": 0.211, "step": 396 }, { "epoch": 0.15294456720146027, "grad_norm": 2.1188988800845387, "learning_rate": 5.096030729833547e-06, "loss": 0.2332, "step": 398 }, { "epoch": 0.15371313286578922, "grad_norm": 2.018697346973585, "learning_rate": 5.121638924455826e-06, "loss": 0.2248, "step": 400 }, { "epoch": 0.15448169853011817, "grad_norm": 2.114185429963843, "learning_rate": 5.147247119078106e-06, "loss": 0.2378, "step": 402 }, { "epoch": 0.15525026419444712, "grad_norm": 3.9919067721909753, "learning_rate": 5.1728553137003845e-06, "loss": 0.2261, "step": 404 }, { "epoch": 0.15601882985877605, "grad_norm": 2.1645921811836897, "learning_rate": 5.198463508322664e-06, "loss": 0.2135, "step": 406 }, { "epoch": 0.156787395523105, "grad_norm": 2.2050450060731683, "learning_rate": 5.224071702944943e-06, "loss": 0.2533, "step": 408 }, { "epoch": 0.15755596118743395, "grad_norm": 2.265484660611347, "learning_rate": 5.249679897567222e-06, "loss": 0.2333, "step": 410 }, { "epoch": 0.1583245268517629, "grad_norm": 2.3290087842602962, "learning_rate": 5.275288092189501e-06, "loss": 0.2091, "step": 412 }, { "epoch": 0.15909309251609186, "grad_norm": 2.2053249544591957, "learning_rate": 5.30089628681178e-06, "loss": 0.2231, "step": 414 }, { "epoch": 0.15986165818042078, "grad_norm": 2.14968865606181, "learning_rate": 5.32650448143406e-06, "loss": 0.2164, "step": 416 }, { "epoch": 0.16063022384474973, "grad_norm": 2.278173827662444, "learning_rate": 5.352112676056338e-06, "loss": 0.2164, "step": 418 }, { "epoch": 0.16139878950907868, "grad_norm": 2.4193934409100555, "learning_rate": 5.377720870678618e-06, "loss": 0.2254, "step": 420 }, { "epoch": 0.16216735517340763, "grad_norm": 2.425415361094944, "learning_rate": 5.403329065300896e-06, "loss": 0.2219, "step": 422 }, { "epoch": 0.1629359208377366, "grad_norm": 1.942343841994695, "learning_rate": 5.428937259923176e-06, "loss": 0.2226, "step": 424 }, { "epoch": 0.1637044865020655, "grad_norm": 1.9187106343966556, "learning_rate": 5.4545454545454545e-06, "loss": 0.2153, "step": 426 }, { "epoch": 0.16447305216639446, "grad_norm": 2.1493501443075123, "learning_rate": 5.480153649167734e-06, "loss": 0.2211, "step": 428 }, { "epoch": 0.16524161783072341, "grad_norm": 1.9422980546951032, "learning_rate": 5.505761843790013e-06, "loss": 0.2128, "step": 430 }, { "epoch": 0.16601018349505237, "grad_norm": 2.2501655443469257, "learning_rate": 5.5313700384122925e-06, "loss": 0.2238, "step": 432 }, { "epoch": 0.16677874915938132, "grad_norm": 2.126594105132844, "learning_rate": 5.556978233034571e-06, "loss": 0.2316, "step": 434 }, { "epoch": 0.16754731482371024, "grad_norm": 2.2385656814485335, "learning_rate": 5.582586427656851e-06, "loss": 0.2444, "step": 436 }, { "epoch": 0.1683158804880392, "grad_norm": 1.8505580715570082, "learning_rate": 5.60819462227913e-06, "loss": 0.1994, "step": 438 }, { "epoch": 0.16908444615236815, "grad_norm": 2.2742421327410063, "learning_rate": 5.633802816901409e-06, "loss": 0.2291, "step": 440 }, { "epoch": 0.1698530118166971, "grad_norm": 1.9619880811230088, "learning_rate": 5.659411011523688e-06, "loss": 0.2114, "step": 442 }, { "epoch": 0.17062157748102605, "grad_norm": 1.9725158178786575, "learning_rate": 5.685019206145968e-06, "loss": 0.2059, "step": 444 }, { "epoch": 0.17139014314535497, "grad_norm": 2.1990416874330143, "learning_rate": 5.7106274007682465e-06, "loss": 0.2239, "step": 446 }, { "epoch": 0.17215870880968392, "grad_norm": 2.141728497508181, "learning_rate": 5.736235595390526e-06, "loss": 0.2226, "step": 448 }, { "epoch": 0.17292727447401288, "grad_norm": 2.205752219022877, "learning_rate": 5.761843790012804e-06, "loss": 0.2129, "step": 450 }, { "epoch": 0.17369584013834183, "grad_norm": 2.159245396857092, "learning_rate": 5.787451984635084e-06, "loss": 0.2208, "step": 452 }, { "epoch": 0.17446440580267075, "grad_norm": 1.9621396275876677, "learning_rate": 5.8130601792573624e-06, "loss": 0.2123, "step": 454 }, { "epoch": 0.1752329714669997, "grad_norm": 2.112283844957891, "learning_rate": 5.838668373879642e-06, "loss": 0.2175, "step": 456 }, { "epoch": 0.17600153713132866, "grad_norm": 2.128350525848297, "learning_rate": 5.864276568501921e-06, "loss": 0.2315, "step": 458 }, { "epoch": 0.1767701027956576, "grad_norm": 1.8988984170410825, "learning_rate": 5.8898847631242005e-06, "loss": 0.2162, "step": 460 }, { "epoch": 0.17753866845998656, "grad_norm": 1.955550175017778, "learning_rate": 5.915492957746479e-06, "loss": 0.2214, "step": 462 }, { "epoch": 0.17830723412431548, "grad_norm": 2.0337389544624083, "learning_rate": 5.941101152368759e-06, "loss": 0.222, "step": 464 }, { "epoch": 0.17907579978864444, "grad_norm": 2.1607080127896046, "learning_rate": 5.966709346991038e-06, "loss": 0.2364, "step": 466 }, { "epoch": 0.1798443654529734, "grad_norm": 2.2508901655238507, "learning_rate": 5.992317541613317e-06, "loss": 0.2386, "step": 468 }, { "epoch": 0.18061293111730234, "grad_norm": 1.9216692255221042, "learning_rate": 6.017925736235596e-06, "loss": 0.1998, "step": 470 }, { "epoch": 0.1813814967816313, "grad_norm": 2.020919071830767, "learning_rate": 6.043533930857876e-06, "loss": 0.2376, "step": 472 }, { "epoch": 0.18215006244596021, "grad_norm": 2.0500831734753433, "learning_rate": 6.0691421254801545e-06, "loss": 0.205, "step": 474 }, { "epoch": 0.18291862811028917, "grad_norm": 2.1070550954123837, "learning_rate": 6.094750320102432e-06, "loss": 0.216, "step": 476 }, { "epoch": 0.18368719377461812, "grad_norm": 2.221541209559589, "learning_rate": 6.120358514724712e-06, "loss": 0.2255, "step": 478 }, { "epoch": 0.18445575943894707, "grad_norm": 1.9962832396190466, "learning_rate": 6.145966709346991e-06, "loss": 0.2079, "step": 480 }, { "epoch": 0.18522432510327602, "grad_norm": 2.2312902389210767, "learning_rate": 6.1715749039692704e-06, "loss": 0.2167, "step": 482 }, { "epoch": 0.18599289076760495, "grad_norm": 2.2192619712416457, "learning_rate": 6.197183098591549e-06, "loss": 0.2261, "step": 484 }, { "epoch": 0.1867614564319339, "grad_norm": 1.954095080003812, "learning_rate": 6.222791293213829e-06, "loss": 0.2489, "step": 486 }, { "epoch": 0.18753002209626285, "grad_norm": 2.322241833731236, "learning_rate": 6.248399487836108e-06, "loss": 0.2338, "step": 488 }, { "epoch": 0.1882985877605918, "grad_norm": 2.1994192307707126, "learning_rate": 6.274007682458387e-06, "loss": 0.2163, "step": 490 }, { "epoch": 0.18906715342492075, "grad_norm": 1.913876713202932, "learning_rate": 6.299615877080666e-06, "loss": 0.2268, "step": 492 }, { "epoch": 0.18983571908924968, "grad_norm": 2.271597754890768, "learning_rate": 6.325224071702946e-06, "loss": 0.2136, "step": 494 }, { "epoch": 0.19060428475357863, "grad_norm": 2.2648640319548967, "learning_rate": 6.350832266325224e-06, "loss": 0.215, "step": 496 }, { "epoch": 0.19137285041790758, "grad_norm": 2.1197051832058973, "learning_rate": 6.376440460947504e-06, "loss": 0.2115, "step": 498 }, { "epoch": 0.19214141608223653, "grad_norm": 2.3870231881018458, "learning_rate": 6.402048655569783e-06, "loss": 0.2074, "step": 500 }, { "epoch": 0.19214141608223653, "eval_loss": 0.19993986189365387, "eval_runtime": 390.2407, "eval_samples_per_second": 47.419, "eval_steps_per_second": 5.93, "step": 500 }, { "epoch": 0.19290998174656548, "grad_norm": 2.146004493465879, "learning_rate": 6.4276568501920625e-06, "loss": 0.2004, "step": 502 }, { "epoch": 0.1936785474108944, "grad_norm": 2.9276061247969167, "learning_rate": 6.45326504481434e-06, "loss": 0.2084, "step": 504 }, { "epoch": 0.19444711307522336, "grad_norm": 60.11828056773781, "learning_rate": 6.478873239436621e-06, "loss": 0.2285, "step": 506 }, { "epoch": 0.1952156787395523, "grad_norm": 5.111524393300991, "learning_rate": 6.504481434058899e-06, "loss": 0.2106, "step": 508 }, { "epoch": 0.19598424440388126, "grad_norm": 30.517321099027328, "learning_rate": 6.530089628681178e-06, "loss": 0.2464, "step": 510 }, { "epoch": 0.19675281006821022, "grad_norm": 13.772873134175958, "learning_rate": 6.555697823303457e-06, "loss": 0.2502, "step": 512 }, { "epoch": 0.19752137573253914, "grad_norm": 17.132070472979734, "learning_rate": 6.581306017925737e-06, "loss": 0.2304, "step": 514 }, { "epoch": 0.1982899413968681, "grad_norm": 36.46478343986076, "learning_rate": 6.606914212548016e-06, "loss": 0.2493, "step": 516 }, { "epoch": 0.19905850706119704, "grad_norm": 3.4133540940378113, "learning_rate": 6.632522407170295e-06, "loss": 0.2406, "step": 518 }, { "epoch": 0.199827072725526, "grad_norm": 4.0341516234071975, "learning_rate": 6.658130601792574e-06, "loss": 0.2143, "step": 520 }, { "epoch": 0.20059563838985495, "grad_norm": 1.985172594317315, "learning_rate": 6.683738796414854e-06, "loss": 0.2255, "step": 522 }, { "epoch": 0.20136420405418387, "grad_norm": 2.4112146087868895, "learning_rate": 6.709346991037132e-06, "loss": 0.2038, "step": 524 }, { "epoch": 0.20213276971851282, "grad_norm": 2.420847145367826, "learning_rate": 6.734955185659412e-06, "loss": 0.1999, "step": 526 }, { "epoch": 0.20290133538284177, "grad_norm": 4.660587707413427, "learning_rate": 6.760563380281691e-06, "loss": 0.23, "step": 528 }, { "epoch": 0.20366990104717073, "grad_norm": 3.898772091039189, "learning_rate": 6.7861715749039704e-06, "loss": 0.2266, "step": 530 }, { "epoch": 0.20443846671149965, "grad_norm": 3.1422178093350293, "learning_rate": 6.811779769526249e-06, "loss": 0.2159, "step": 532 }, { "epoch": 0.2052070323758286, "grad_norm": 3.7243181010371065, "learning_rate": 6.837387964148529e-06, "loss": 0.1945, "step": 534 }, { "epoch": 0.20597559804015755, "grad_norm": 2.005810013390093, "learning_rate": 6.862996158770807e-06, "loss": 0.207, "step": 536 }, { "epoch": 0.2067441637044865, "grad_norm": 4.946318541178533, "learning_rate": 6.888604353393087e-06, "loss": 0.2079, "step": 538 }, { "epoch": 0.20751272936881546, "grad_norm": 2.4289382237883217, "learning_rate": 6.914212548015365e-06, "loss": 0.2163, "step": 540 }, { "epoch": 0.20828129503314438, "grad_norm": 2.161436954100355, "learning_rate": 6.939820742637645e-06, "loss": 0.2214, "step": 542 }, { "epoch": 0.20904986069747333, "grad_norm": 3.6295892856350815, "learning_rate": 6.965428937259924e-06, "loss": 0.2116, "step": 544 }, { "epoch": 0.20981842636180228, "grad_norm": 2.8107108509331846, "learning_rate": 6.991037131882202e-06, "loss": 0.2216, "step": 546 }, { "epoch": 0.21058699202613124, "grad_norm": 1.9519124075899597, "learning_rate": 7.016645326504482e-06, "loss": 0.226, "step": 548 }, { "epoch": 0.2113555576904602, "grad_norm": 2.0733210274364167, "learning_rate": 7.042253521126761e-06, "loss": 0.2039, "step": 550 }, { "epoch": 0.2121241233547891, "grad_norm": 2.372644281784759, "learning_rate": 7.06786171574904e-06, "loss": 0.2097, "step": 552 }, { "epoch": 0.21289268901911806, "grad_norm": 2.624268552112254, "learning_rate": 7.093469910371319e-06, "loss": 0.1917, "step": 554 }, { "epoch": 0.21366125468344702, "grad_norm": 3.4983526278460895, "learning_rate": 7.119078104993599e-06, "loss": 0.2175, "step": 556 }, { "epoch": 0.21442982034777597, "grad_norm": 1.7491884946585117, "learning_rate": 7.144686299615878e-06, "loss": 0.2046, "step": 558 }, { "epoch": 0.21519838601210492, "grad_norm": 1.899614330432619, "learning_rate": 7.170294494238157e-06, "loss": 0.2152, "step": 560 }, { "epoch": 0.21596695167643384, "grad_norm": 2.120337028605035, "learning_rate": 7.195902688860435e-06, "loss": 0.2191, "step": 562 }, { "epoch": 0.2167355173407628, "grad_norm": 1.9084002774266735, "learning_rate": 7.221510883482716e-06, "loss": 0.2045, "step": 564 }, { "epoch": 0.21750408300509175, "grad_norm": 2.4147526418736525, "learning_rate": 7.2471190781049935e-06, "loss": 0.194, "step": 566 }, { "epoch": 0.2182726486694207, "grad_norm": 2.0258272072973376, "learning_rate": 7.272727272727273e-06, "loss": 0.2375, "step": 568 }, { "epoch": 0.21904121433374965, "grad_norm": 2.1968670290428927, "learning_rate": 7.298335467349552e-06, "loss": 0.2202, "step": 570 }, { "epoch": 0.21980977999807857, "grad_norm": 1.8034701440001886, "learning_rate": 7.3239436619718316e-06, "loss": 0.2211, "step": 572 }, { "epoch": 0.22057834566240753, "grad_norm": 1.838708964570849, "learning_rate": 7.34955185659411e-06, "loss": 0.2074, "step": 574 }, { "epoch": 0.22134691132673648, "grad_norm": 1.870675982250777, "learning_rate": 7.37516005121639e-06, "loss": 0.2328, "step": 576 }, { "epoch": 0.22211547699106543, "grad_norm": 1.9795695801218918, "learning_rate": 7.400768245838669e-06, "loss": 0.221, "step": 578 }, { "epoch": 0.22288404265539438, "grad_norm": 1.9677216918316087, "learning_rate": 7.426376440460948e-06, "loss": 0.2185, "step": 580 }, { "epoch": 0.2236526083197233, "grad_norm": 1.9109623259222404, "learning_rate": 7.451984635083227e-06, "loss": 0.2132, "step": 582 }, { "epoch": 0.22442117398405226, "grad_norm": 1.9864091703963151, "learning_rate": 7.477592829705507e-06, "loss": 0.199, "step": 584 }, { "epoch": 0.2251897396483812, "grad_norm": 1.8088712597796701, "learning_rate": 7.5032010243277856e-06, "loss": 0.2425, "step": 586 }, { "epoch": 0.22595830531271016, "grad_norm": 1.71633714098637, "learning_rate": 7.528809218950065e-06, "loss": 0.2126, "step": 588 }, { "epoch": 0.2267268709770391, "grad_norm": 1.8186648829494232, "learning_rate": 7.554417413572344e-06, "loss": 0.2041, "step": 590 }, { "epoch": 0.22749543664136804, "grad_norm": 1.8053144499116875, "learning_rate": 7.580025608194624e-06, "loss": 0.2089, "step": 592 }, { "epoch": 0.228264002305697, "grad_norm": 1.9672356726560407, "learning_rate": 7.6056338028169015e-06, "loss": 0.2256, "step": 594 }, { "epoch": 0.22903256797002594, "grad_norm": 1.958467884313613, "learning_rate": 7.631241997439181e-06, "loss": 0.2155, "step": 596 }, { "epoch": 0.2298011336343549, "grad_norm": 1.8350614257330995, "learning_rate": 7.65685019206146e-06, "loss": 0.235, "step": 598 }, { "epoch": 0.23056969929868384, "grad_norm": 1.8677699861124337, "learning_rate": 7.68245838668374e-06, "loss": 0.2035, "step": 600 }, { "epoch": 0.23133826496301277, "grad_norm": 1.897854727375343, "learning_rate": 7.708066581306018e-06, "loss": 0.2309, "step": 602 }, { "epoch": 0.23210683062734172, "grad_norm": 1.8846670754728845, "learning_rate": 7.733674775928298e-06, "loss": 0.1862, "step": 604 }, { "epoch": 0.23287539629167067, "grad_norm": 2.0711705006800454, "learning_rate": 7.759282970550578e-06, "loss": 0.2179, "step": 606 }, { "epoch": 0.23364396195599962, "grad_norm": 1.8865023177510742, "learning_rate": 7.784891165172857e-06, "loss": 0.2384, "step": 608 }, { "epoch": 0.23441252762032855, "grad_norm": 1.8413262983772125, "learning_rate": 7.810499359795135e-06, "loss": 0.2256, "step": 610 }, { "epoch": 0.2351810932846575, "grad_norm": 1.7472393690098726, "learning_rate": 7.836107554417413e-06, "loss": 0.2428, "step": 612 }, { "epoch": 0.23594965894898645, "grad_norm": 1.8762869282099917, "learning_rate": 7.861715749039693e-06, "loss": 0.2428, "step": 614 }, { "epoch": 0.2367182246133154, "grad_norm": 1.9586965437669182, "learning_rate": 7.887323943661972e-06, "loss": 0.2307, "step": 616 }, { "epoch": 0.23748679027764436, "grad_norm": 1.7706967070088564, "learning_rate": 7.912932138284252e-06, "loss": 0.2218, "step": 618 }, { "epoch": 0.23825535594197328, "grad_norm": 1.9519624836743898, "learning_rate": 7.93854033290653e-06, "loss": 0.2452, "step": 620 }, { "epoch": 0.23902392160630223, "grad_norm": 2.196954211116194, "learning_rate": 7.96414852752881e-06, "loss": 0.2259, "step": 622 }, { "epoch": 0.23979248727063118, "grad_norm": 1.7490487162778352, "learning_rate": 7.989756722151089e-06, "loss": 0.2302, "step": 624 }, { "epoch": 0.24056105293496013, "grad_norm": 1.8017043166572984, "learning_rate": 8.015364916773369e-06, "loss": 0.2149, "step": 626 }, { "epoch": 0.2413296185992891, "grad_norm": 1.6890632209422813, "learning_rate": 8.040973111395647e-06, "loss": 0.2083, "step": 628 }, { "epoch": 0.242098184263618, "grad_norm": 2.10836705269315, "learning_rate": 8.066581306017926e-06, "loss": 0.2261, "step": 630 }, { "epoch": 0.24286674992794696, "grad_norm": 1.9181365180554364, "learning_rate": 8.092189500640206e-06, "loss": 0.2208, "step": 632 }, { "epoch": 0.24363531559227591, "grad_norm": 2.0626557910397714, "learning_rate": 8.117797695262486e-06, "loss": 0.2455, "step": 634 }, { "epoch": 0.24440388125660487, "grad_norm": 1.906497011161951, "learning_rate": 8.143405889884764e-06, "loss": 0.2173, "step": 636 }, { "epoch": 0.24517244692093382, "grad_norm": 1.7037750710453679, "learning_rate": 8.169014084507043e-06, "loss": 0.2329, "step": 638 }, { "epoch": 0.24594101258526274, "grad_norm": 1.6995814197528412, "learning_rate": 8.194622279129321e-06, "loss": 0.2058, "step": 640 }, { "epoch": 0.2467095782495917, "grad_norm": 1.885080878069524, "learning_rate": 8.2202304737516e-06, "loss": 0.2408, "step": 642 }, { "epoch": 0.24747814391392065, "grad_norm": 1.8882217893666564, "learning_rate": 8.24583866837388e-06, "loss": 0.2299, "step": 644 }, { "epoch": 0.2482467095782496, "grad_norm": 1.760675821524176, "learning_rate": 8.27144686299616e-06, "loss": 0.2184, "step": 646 }, { "epoch": 0.24901527524257855, "grad_norm": 1.8060794891644503, "learning_rate": 8.297055057618438e-06, "loss": 0.2108, "step": 648 }, { "epoch": 0.24978384090690747, "grad_norm": 1.7701199580499865, "learning_rate": 8.322663252240718e-06, "loss": 0.2253, "step": 650 }, { "epoch": 0.25055240657123645, "grad_norm": 1.7741432929857883, "learning_rate": 8.348271446862997e-06, "loss": 0.2241, "step": 652 }, { "epoch": 0.2513209722355654, "grad_norm": 1.7567463256981144, "learning_rate": 8.373879641485277e-06, "loss": 0.22, "step": 654 }, { "epoch": 0.2520895378998943, "grad_norm": 1.7753261349788123, "learning_rate": 8.399487836107555e-06, "loss": 0.2217, "step": 656 }, { "epoch": 0.2528581035642233, "grad_norm": 2.059576783531928, "learning_rate": 8.425096030729834e-06, "loss": 0.2436, "step": 658 }, { "epoch": 0.2536266692285522, "grad_norm": 1.9498526325523997, "learning_rate": 8.450704225352114e-06, "loss": 0.2168, "step": 660 }, { "epoch": 0.2543952348928812, "grad_norm": 1.6766704710158786, "learning_rate": 8.476312419974394e-06, "loss": 0.2165, "step": 662 }, { "epoch": 0.2551638005572101, "grad_norm": 1.9104044492513372, "learning_rate": 8.501920614596671e-06, "loss": 0.2114, "step": 664 }, { "epoch": 0.25593236622153903, "grad_norm": 2.204867840580165, "learning_rate": 8.527528809218951e-06, "loss": 0.2138, "step": 666 }, { "epoch": 0.256700931885868, "grad_norm": 2.008264233872534, "learning_rate": 8.553137003841229e-06, "loss": 0.2407, "step": 668 }, { "epoch": 0.25746949755019694, "grad_norm": 1.6110127162322228, "learning_rate": 8.578745198463509e-06, "loss": 0.197, "step": 670 }, { "epoch": 0.2582380632145259, "grad_norm": 1.8580746119932927, "learning_rate": 8.604353393085788e-06, "loss": 0.2227, "step": 672 }, { "epoch": 0.25900662887885484, "grad_norm": 1.7098642643674544, "learning_rate": 8.629961587708068e-06, "loss": 0.2185, "step": 674 }, { "epoch": 0.25977519454318376, "grad_norm": 1.8024059713658118, "learning_rate": 8.655569782330346e-06, "loss": 0.2113, "step": 676 }, { "epoch": 0.26054376020751274, "grad_norm": 1.7559666726014331, "learning_rate": 8.681177976952625e-06, "loss": 0.1985, "step": 678 }, { "epoch": 0.26131232587184167, "grad_norm": 1.9800681825794015, "learning_rate": 8.706786171574905e-06, "loss": 0.2475, "step": 680 }, { "epoch": 0.26208089153617065, "grad_norm": 2.43895751414997, "learning_rate": 8.732394366197183e-06, "loss": 0.2299, "step": 682 }, { "epoch": 0.26284945720049957, "grad_norm": 1.8756661075149896, "learning_rate": 8.758002560819463e-06, "loss": 0.2443, "step": 684 }, { "epoch": 0.2636180228648285, "grad_norm": 1.838217610350611, "learning_rate": 8.783610755441742e-06, "loss": 0.2388, "step": 686 }, { "epoch": 0.2643865885291575, "grad_norm": 1.646505710494678, "learning_rate": 8.809218950064022e-06, "loss": 0.2199, "step": 688 }, { "epoch": 0.2651551541934864, "grad_norm": 1.698113305745756, "learning_rate": 8.8348271446863e-06, "loss": 0.2227, "step": 690 }, { "epoch": 0.2659237198578154, "grad_norm": 1.6682589800420018, "learning_rate": 8.86043533930858e-06, "loss": 0.2246, "step": 692 }, { "epoch": 0.2666922855221443, "grad_norm": 1.560867301105333, "learning_rate": 8.886043533930857e-06, "loss": 0.2117, "step": 694 }, { "epoch": 0.2674608511864732, "grad_norm": 1.8850587892390305, "learning_rate": 8.911651728553137e-06, "loss": 0.2377, "step": 696 }, { "epoch": 0.2682294168508022, "grad_norm": 1.6340840572592261, "learning_rate": 8.937259923175417e-06, "loss": 0.2191, "step": 698 }, { "epoch": 0.26899798251513113, "grad_norm": 1.6666636572621132, "learning_rate": 8.962868117797696e-06, "loss": 0.2053, "step": 700 }, { "epoch": 0.2697665481794601, "grad_norm": 1.9357223697248132, "learning_rate": 8.988476312419974e-06, "loss": 0.2362, "step": 702 }, { "epoch": 0.27053511384378903, "grad_norm": 1.8466666239128657, "learning_rate": 9.014084507042254e-06, "loss": 0.239, "step": 704 }, { "epoch": 0.27130367950811796, "grad_norm": 1.652081916506016, "learning_rate": 9.039692701664533e-06, "loss": 0.221, "step": 706 }, { "epoch": 0.27207224517244694, "grad_norm": 1.6561779700801502, "learning_rate": 9.065300896286813e-06, "loss": 0.2201, "step": 708 }, { "epoch": 0.27284081083677586, "grad_norm": 1.7726416494639383, "learning_rate": 9.090909090909091e-06, "loss": 0.2076, "step": 710 }, { "epoch": 0.27360937650110484, "grad_norm": 1.7812313811285239, "learning_rate": 9.11651728553137e-06, "loss": 0.2231, "step": 712 }, { "epoch": 0.27437794216543376, "grad_norm": 1.8472008413256538, "learning_rate": 9.14212548015365e-06, "loss": 0.2376, "step": 714 }, { "epoch": 0.2751465078297627, "grad_norm": 1.7286400060201044, "learning_rate": 9.16773367477593e-06, "loss": 0.2506, "step": 716 }, { "epoch": 0.27591507349409167, "grad_norm": 1.65422650822449, "learning_rate": 9.193341869398208e-06, "loss": 0.1989, "step": 718 }, { "epoch": 0.2766836391584206, "grad_norm": 1.726923997163801, "learning_rate": 9.218950064020487e-06, "loss": 0.2136, "step": 720 }, { "epoch": 0.27745220482274957, "grad_norm": 1.8761815370497523, "learning_rate": 9.244558258642765e-06, "loss": 0.2329, "step": 722 }, { "epoch": 0.2782207704870785, "grad_norm": 1.619563200315737, "learning_rate": 9.270166453265047e-06, "loss": 0.2209, "step": 724 }, { "epoch": 0.2789893361514074, "grad_norm": 1.8581959041841127, "learning_rate": 9.295774647887325e-06, "loss": 0.2226, "step": 726 }, { "epoch": 0.2797579018157364, "grad_norm": 1.8798668828665128, "learning_rate": 9.321382842509604e-06, "loss": 0.2473, "step": 728 }, { "epoch": 0.2805264674800653, "grad_norm": 1.6711087895284482, "learning_rate": 9.346991037131882e-06, "loss": 0.2376, "step": 730 }, { "epoch": 0.28129503314439425, "grad_norm": 1.6467532835752412, "learning_rate": 9.372599231754162e-06, "loss": 0.2227, "step": 732 }, { "epoch": 0.2820635988087232, "grad_norm": 1.5741466743526495, "learning_rate": 9.398207426376441e-06, "loss": 0.2272, "step": 734 }, { "epoch": 0.28283216447305215, "grad_norm": 1.486959198069152, "learning_rate": 9.423815620998721e-06, "loss": 0.1876, "step": 736 }, { "epoch": 0.28360073013738113, "grad_norm": 2.0616838587395665, "learning_rate": 9.449423815620999e-06, "loss": 0.2288, "step": 738 }, { "epoch": 0.28436929580171005, "grad_norm": 1.6775590554421191, "learning_rate": 9.475032010243279e-06, "loss": 0.2195, "step": 740 }, { "epoch": 0.285137861466039, "grad_norm": 2.1078150287339033, "learning_rate": 9.500640204865558e-06, "loss": 0.2573, "step": 742 }, { "epoch": 0.28590642713036796, "grad_norm": 1.92314221992431, "learning_rate": 9.526248399487838e-06, "loss": 0.2238, "step": 744 }, { "epoch": 0.2866749927946969, "grad_norm": 1.6617272690870843, "learning_rate": 9.551856594110116e-06, "loss": 0.2269, "step": 746 }, { "epoch": 0.28744355845902586, "grad_norm": 1.792421830293697, "learning_rate": 9.577464788732394e-06, "loss": 0.2264, "step": 748 }, { "epoch": 0.2882121241233548, "grad_norm": 1.5019780859120824, "learning_rate": 9.603072983354675e-06, "loss": 0.2048, "step": 750 }, { "epoch": 0.2889806897876837, "grad_norm": 1.8075010585083238, "learning_rate": 9.628681177976953e-06, "loss": 0.2413, "step": 752 }, { "epoch": 0.2897492554520127, "grad_norm": 1.8334002693346336, "learning_rate": 9.654289372599233e-06, "loss": 0.2099, "step": 754 }, { "epoch": 0.2905178211163416, "grad_norm": 1.939789884195804, "learning_rate": 9.67989756722151e-06, "loss": 0.2489, "step": 756 }, { "epoch": 0.2912863867806706, "grad_norm": 1.7898567506495586, "learning_rate": 9.70550576184379e-06, "loss": 0.2308, "step": 758 }, { "epoch": 0.2920549524449995, "grad_norm": 1.8777625272383993, "learning_rate": 9.73111395646607e-06, "loss": 0.2467, "step": 760 }, { "epoch": 0.29282351810932844, "grad_norm": 1.5439545318889065, "learning_rate": 9.75672215108835e-06, "loss": 0.1982, "step": 762 }, { "epoch": 0.2935920837736574, "grad_norm": 1.7380216250939193, "learning_rate": 9.782330345710627e-06, "loss": 0.2083, "step": 764 }, { "epoch": 0.29436064943798634, "grad_norm": 1.6882690322760967, "learning_rate": 9.807938540332907e-06, "loss": 0.2143, "step": 766 }, { "epoch": 0.2951292151023153, "grad_norm": 1.7087179743726464, "learning_rate": 9.833546734955187e-06, "loss": 0.1977, "step": 768 }, { "epoch": 0.29589778076664425, "grad_norm": 1.77701814293098, "learning_rate": 9.859154929577466e-06, "loss": 0.2173, "step": 770 }, { "epoch": 0.29666634643097317, "grad_norm": 1.67105732549748, "learning_rate": 9.884763124199744e-06, "loss": 0.2121, "step": 772 }, { "epoch": 0.29743491209530215, "grad_norm": 1.7334027238302565, "learning_rate": 9.910371318822024e-06, "loss": 0.2162, "step": 774 }, { "epoch": 0.2982034777596311, "grad_norm": 1.5977274424293693, "learning_rate": 9.935979513444303e-06, "loss": 0.2076, "step": 776 }, { "epoch": 0.29897204342396005, "grad_norm": 2.1578550017159026, "learning_rate": 9.961587708066583e-06, "loss": 0.2429, "step": 778 }, { "epoch": 0.299740609088289, "grad_norm": 1.8253585297169743, "learning_rate": 9.987195902688861e-06, "loss": 0.2208, "step": 780 }, { "epoch": 0.3005091747526179, "grad_norm": 1.5136330417205952, "learning_rate": 9.999999500026381e-06, "loss": 0.2289, "step": 782 }, { "epoch": 0.3012777404169469, "grad_norm": 1.539299350416108, "learning_rate": 9.999995500238028e-06, "loss": 0.2262, "step": 784 }, { "epoch": 0.3020463060812758, "grad_norm": 1.904835000800668, "learning_rate": 9.999987500664521e-06, "loss": 0.2253, "step": 786 }, { "epoch": 0.3028148717456048, "grad_norm": 1.7421514454420284, "learning_rate": 9.999975501312258e-06, "loss": 0.2069, "step": 788 }, { "epoch": 0.3035834374099337, "grad_norm": 1.7076664984413368, "learning_rate": 9.999959502190839e-06, "loss": 0.2154, "step": 790 }, { "epoch": 0.30435200307426263, "grad_norm": 2.0251016633263057, "learning_rate": 9.999939503313063e-06, "loss": 0.2273, "step": 792 }, { "epoch": 0.3051205687385916, "grad_norm": 1.7763832744607833, "learning_rate": 9.999915504694929e-06, "loss": 0.2131, "step": 794 }, { "epoch": 0.30588913440292054, "grad_norm": 1.9039966019733678, "learning_rate": 9.999887506355635e-06, "loss": 0.2584, "step": 796 }, { "epoch": 0.3066577000672495, "grad_norm": 1.631619906001044, "learning_rate": 9.999855508317574e-06, "loss": 0.2075, "step": 798 }, { "epoch": 0.30742626573157844, "grad_norm": 1.6897237769106512, "learning_rate": 9.99981951060635e-06, "loss": 0.2266, "step": 800 }, { "epoch": 0.30819483139590736, "grad_norm": 1.7953372656341966, "learning_rate": 9.999779513250754e-06, "loss": 0.2196, "step": 802 }, { "epoch": 0.30896339706023634, "grad_norm": 1.6735768094545143, "learning_rate": 9.999735516282784e-06, "loss": 0.2159, "step": 804 }, { "epoch": 0.30973196272456527, "grad_norm": 2.015417348280801, "learning_rate": 9.999687519737639e-06, "loss": 0.2017, "step": 806 }, { "epoch": 0.31050052838889425, "grad_norm": 1.6829288583267141, "learning_rate": 9.999635523653709e-06, "loss": 0.2312, "step": 808 }, { "epoch": 0.31126909405322317, "grad_norm": 1.5888814947672238, "learning_rate": 9.999579528072592e-06, "loss": 0.2412, "step": 810 }, { "epoch": 0.3120376597175521, "grad_norm": 1.727733929181802, "learning_rate": 9.999519533039079e-06, "loss": 0.2323, "step": 812 }, { "epoch": 0.3128062253818811, "grad_norm": 1.802582673235118, "learning_rate": 9.999455538601168e-06, "loss": 0.216, "step": 814 }, { "epoch": 0.31357479104621, "grad_norm": 1.4540483176379686, "learning_rate": 9.999387544810049e-06, "loss": 0.1976, "step": 816 }, { "epoch": 0.314343356710539, "grad_norm": 1.8178902278356643, "learning_rate": 9.999315551720115e-06, "loss": 0.2174, "step": 818 }, { "epoch": 0.3151119223748679, "grad_norm": 1.8551293445230332, "learning_rate": 9.999239559388955e-06, "loss": 0.2367, "step": 820 }, { "epoch": 0.3158804880391968, "grad_norm": 1.5648508454906225, "learning_rate": 9.999159567877363e-06, "loss": 0.2094, "step": 822 }, { "epoch": 0.3166490537035258, "grad_norm": 1.5747707151299892, "learning_rate": 9.999075577249327e-06, "loss": 0.2102, "step": 824 }, { "epoch": 0.31741761936785473, "grad_norm": 1.6406933894996116, "learning_rate": 9.998987587572037e-06, "loss": 0.2109, "step": 826 }, { "epoch": 0.3181861850321837, "grad_norm": 1.8794329526479572, "learning_rate": 9.99889559891588e-06, "loss": 0.2342, "step": 828 }, { "epoch": 0.31895475069651263, "grad_norm": 1.5016826761509714, "learning_rate": 9.998799611354445e-06, "loss": 0.2252, "step": 830 }, { "epoch": 0.31972331636084156, "grad_norm": 1.662666740335047, "learning_rate": 9.998699624964514e-06, "loss": 0.203, "step": 832 }, { "epoch": 0.32049188202517054, "grad_norm": 1.6955726862348701, "learning_rate": 9.998595639826077e-06, "loss": 0.2477, "step": 834 }, { "epoch": 0.32126044768949946, "grad_norm": 1.7220824075715226, "learning_rate": 9.998487656022314e-06, "loss": 0.2219, "step": 836 }, { "epoch": 0.32202901335382844, "grad_norm": 1.746863428272635, "learning_rate": 9.998375673639606e-06, "loss": 0.2108, "step": 838 }, { "epoch": 0.32279757901815737, "grad_norm": 1.7628610974051768, "learning_rate": 9.998259692767541e-06, "loss": 0.2248, "step": 840 }, { "epoch": 0.3235661446824863, "grad_norm": 1.7316419822316227, "learning_rate": 9.998139713498891e-06, "loss": 0.2349, "step": 842 }, { "epoch": 0.32433471034681527, "grad_norm": 1.6423036324310099, "learning_rate": 9.99801573592964e-06, "loss": 0.2292, "step": 844 }, { "epoch": 0.3251032760111442, "grad_norm": 1.4806048196893955, "learning_rate": 9.997887760158962e-06, "loss": 0.2134, "step": 846 }, { "epoch": 0.3258718416754732, "grad_norm": 1.737282275576078, "learning_rate": 9.997755786289234e-06, "loss": 0.21, "step": 848 }, { "epoch": 0.3266404073398021, "grad_norm": 1.7999409221225962, "learning_rate": 9.997619814426027e-06, "loss": 0.2584, "step": 850 }, { "epoch": 0.327408973004131, "grad_norm": 1.7751286822079229, "learning_rate": 9.997479844678116e-06, "loss": 0.2331, "step": 852 }, { "epoch": 0.32817753866846, "grad_norm": 1.7255727642988363, "learning_rate": 9.997335877157468e-06, "loss": 0.216, "step": 854 }, { "epoch": 0.3289461043327889, "grad_norm": 1.6405323934314253, "learning_rate": 9.997187911979252e-06, "loss": 0.2099, "step": 856 }, { "epoch": 0.3297146699971179, "grad_norm": 1.7486992998451107, "learning_rate": 9.997035949261833e-06, "loss": 0.2343, "step": 858 }, { "epoch": 0.33048323566144683, "grad_norm": 1.7404903403459362, "learning_rate": 9.996879989126776e-06, "loss": 0.2187, "step": 860 }, { "epoch": 0.33125180132577575, "grad_norm": 1.6835407654620318, "learning_rate": 9.996720031698843e-06, "loss": 0.2219, "step": 862 }, { "epoch": 0.33202036699010473, "grad_norm": 1.5138609087572352, "learning_rate": 9.996556077105992e-06, "loss": 0.2217, "step": 864 }, { "epoch": 0.33278893265443366, "grad_norm": 1.5964984706112475, "learning_rate": 9.99638812547938e-06, "loss": 0.235, "step": 866 }, { "epoch": 0.33355749831876264, "grad_norm": 1.8509465154312439, "learning_rate": 9.996216176953361e-06, "loss": 0.2365, "step": 868 }, { "epoch": 0.33432606398309156, "grad_norm": 1.634932075866118, "learning_rate": 9.99604023166549e-06, "loss": 0.2223, "step": 870 }, { "epoch": 0.3350946296474205, "grad_norm": 1.5166229410942782, "learning_rate": 9.99586028975651e-06, "loss": 0.2248, "step": 872 }, { "epoch": 0.33586319531174946, "grad_norm": 1.3844603573545156, "learning_rate": 9.995676351370369e-06, "loss": 0.2108, "step": 874 }, { "epoch": 0.3366317609760784, "grad_norm": 1.6619826014783978, "learning_rate": 9.99548841665421e-06, "loss": 0.2063, "step": 876 }, { "epoch": 0.33740032664040737, "grad_norm": 1.6162140941140468, "learning_rate": 9.995296485758374e-06, "loss": 0.2351, "step": 878 }, { "epoch": 0.3381688923047363, "grad_norm": 1.7816350058389543, "learning_rate": 9.9951005588364e-06, "loss": 0.2092, "step": 880 }, { "epoch": 0.3389374579690652, "grad_norm": 1.5867659357460926, "learning_rate": 9.994900636045016e-06, "loss": 0.2423, "step": 882 }, { "epoch": 0.3397060236333942, "grad_norm": 1.6594587031358168, "learning_rate": 9.994696717544153e-06, "loss": 0.2119, "step": 884 }, { "epoch": 0.3404745892977231, "grad_norm": 1.6286504685149525, "learning_rate": 9.994488803496937e-06, "loss": 0.2082, "step": 886 }, { "epoch": 0.3412431549620521, "grad_norm": 1.6837980522206613, "learning_rate": 9.994276894069694e-06, "loss": 0.2348, "step": 888 }, { "epoch": 0.342011720626381, "grad_norm": 1.8272852519054432, "learning_rate": 9.99406098943194e-06, "loss": 0.239, "step": 890 }, { "epoch": 0.34278028629070995, "grad_norm": 1.767866620759406, "learning_rate": 9.993841089756391e-06, "loss": 0.2548, "step": 892 }, { "epoch": 0.3435488519550389, "grad_norm": 1.6186264121534149, "learning_rate": 9.993617195218955e-06, "loss": 0.241, "step": 894 }, { "epoch": 0.34431741761936785, "grad_norm": 1.593912858274301, "learning_rate": 9.99338930599874e-06, "loss": 0.2266, "step": 896 }, { "epoch": 0.3450859832836968, "grad_norm": 1.612725041033444, "learning_rate": 9.993157422278046e-06, "loss": 0.2174, "step": 898 }, { "epoch": 0.34585454894802575, "grad_norm": 1.5835841773492154, "learning_rate": 9.992921544242372e-06, "loss": 0.2298, "step": 900 }, { "epoch": 0.3466231146123547, "grad_norm": 1.7697543937437346, "learning_rate": 9.99268167208041e-06, "loss": 0.2006, "step": 902 }, { "epoch": 0.34739168027668366, "grad_norm": 1.4849059712227983, "learning_rate": 9.992437805984047e-06, "loss": 0.2049, "step": 904 }, { "epoch": 0.3481602459410126, "grad_norm": 1.5920552131892343, "learning_rate": 9.992189946148366e-06, "loss": 0.2372, "step": 906 }, { "epoch": 0.3489288116053415, "grad_norm": 1.726111653529476, "learning_rate": 9.991938092771645e-06, "loss": 0.2423, "step": 908 }, { "epoch": 0.3496973772696705, "grad_norm": 1.517851327314567, "learning_rate": 9.991682246055355e-06, "loss": 0.2123, "step": 910 }, { "epoch": 0.3504659429339994, "grad_norm": 1.6036791944811652, "learning_rate": 9.991422406204163e-06, "loss": 0.2202, "step": 912 }, { "epoch": 0.3512345085983284, "grad_norm": 1.5721291487484743, "learning_rate": 9.99115857342593e-06, "loss": 0.2179, "step": 914 }, { "epoch": 0.3520030742626573, "grad_norm": 1.5613448266498815, "learning_rate": 9.990890747931712e-06, "loss": 0.2216, "step": 916 }, { "epoch": 0.35277163992698624, "grad_norm": 1.4535788255859194, "learning_rate": 9.990618929935756e-06, "loss": 0.2171, "step": 918 }, { "epoch": 0.3535402055913152, "grad_norm": 1.574747207775964, "learning_rate": 9.990343119655507e-06, "loss": 0.211, "step": 920 }, { "epoch": 0.35430877125564414, "grad_norm": 1.5794813991353758, "learning_rate": 9.9900633173116e-06, "loss": 0.211, "step": 922 }, { "epoch": 0.3550773369199731, "grad_norm": 1.6644941231234982, "learning_rate": 9.989779523127865e-06, "loss": 0.2067, "step": 924 }, { "epoch": 0.35584590258430204, "grad_norm": 1.6167552783466348, "learning_rate": 9.989491737331327e-06, "loss": 0.2387, "step": 926 }, { "epoch": 0.35661446824863097, "grad_norm": 1.3827951594663042, "learning_rate": 9.989199960152202e-06, "loss": 0.2043, "step": 928 }, { "epoch": 0.35738303391295995, "grad_norm": 1.4449044982812944, "learning_rate": 9.988904191823897e-06, "loss": 0.1877, "step": 930 }, { "epoch": 0.35815159957728887, "grad_norm": 1.4854707318143534, "learning_rate": 9.988604432583018e-06, "loss": 0.2025, "step": 932 }, { "epoch": 0.35892016524161785, "grad_norm": 1.5239844780977823, "learning_rate": 9.988300682669357e-06, "loss": 0.2027, "step": 934 }, { "epoch": 0.3596887309059468, "grad_norm": 1.5922830040145939, "learning_rate": 9.987992942325902e-06, "loss": 0.2175, "step": 936 }, { "epoch": 0.3604572965702757, "grad_norm": 1.6133028585244196, "learning_rate": 9.987681211798829e-06, "loss": 0.2265, "step": 938 }, { "epoch": 0.3612258622346047, "grad_norm": 1.6216534977399493, "learning_rate": 9.987365491337517e-06, "loss": 0.2226, "step": 940 }, { "epoch": 0.3619944278989336, "grad_norm": 1.5231399321369041, "learning_rate": 9.987045781194522e-06, "loss": 0.2153, "step": 942 }, { "epoch": 0.3627629935632626, "grad_norm": 1.5210063360054469, "learning_rate": 9.986722081625601e-06, "loss": 0.2209, "step": 944 }, { "epoch": 0.3635315592275915, "grad_norm": 1.4685724902401287, "learning_rate": 9.986394392889702e-06, "loss": 0.1952, "step": 946 }, { "epoch": 0.36430012489192043, "grad_norm": 1.49923961996504, "learning_rate": 9.98606271524896e-06, "loss": 0.2184, "step": 948 }, { "epoch": 0.3650686905562494, "grad_norm": 1.5495077174798066, "learning_rate": 9.985727048968701e-06, "loss": 0.2416, "step": 950 }, { "epoch": 0.36583725622057833, "grad_norm": 1.4006269360211432, "learning_rate": 9.985387394317448e-06, "loss": 0.2274, "step": 952 }, { "epoch": 0.3666058218849073, "grad_norm": 1.3283689747877068, "learning_rate": 9.985043751566907e-06, "loss": 0.2053, "step": 954 }, { "epoch": 0.36737438754923624, "grad_norm": 1.4416385350886187, "learning_rate": 9.984696120991979e-06, "loss": 0.2091, "step": 956 }, { "epoch": 0.36814295321356516, "grad_norm": 1.7260769913360674, "learning_rate": 9.984344502870756e-06, "loss": 0.2416, "step": 958 }, { "epoch": 0.36891151887789414, "grad_norm": 1.5868283490365476, "learning_rate": 9.983988897484513e-06, "loss": 0.2308, "step": 960 }, { "epoch": 0.36968008454222306, "grad_norm": 1.5814824300202786, "learning_rate": 9.983629305117724e-06, "loss": 0.2284, "step": 962 }, { "epoch": 0.37044865020655204, "grad_norm": 1.4917166348568545, "learning_rate": 9.983265726058044e-06, "loss": 0.2277, "step": 964 }, { "epoch": 0.37121721587088097, "grad_norm": 1.6710537918985473, "learning_rate": 9.982898160596324e-06, "loss": 0.2354, "step": 966 }, { "epoch": 0.3719857815352099, "grad_norm": 1.5170398535279166, "learning_rate": 9.982526609026599e-06, "loss": 0.2374, "step": 968 }, { "epoch": 0.37275434719953887, "grad_norm": 1.4106407150606228, "learning_rate": 9.982151071646094e-06, "loss": 0.1951, "step": 970 }, { "epoch": 0.3735229128638678, "grad_norm": 1.5723550197892044, "learning_rate": 9.981771548755223e-06, "loss": 0.2229, "step": 972 }, { "epoch": 0.3742914785281968, "grad_norm": 1.4518481995901078, "learning_rate": 9.981388040657591e-06, "loss": 0.1932, "step": 974 }, { "epoch": 0.3750600441925257, "grad_norm": 1.4588940786137212, "learning_rate": 9.981000547659985e-06, "loss": 0.2076, "step": 976 }, { "epoch": 0.3758286098568546, "grad_norm": 1.338838211373192, "learning_rate": 9.980609070072386e-06, "loss": 0.1964, "step": 978 }, { "epoch": 0.3765971755211836, "grad_norm": 1.5649173933724436, "learning_rate": 9.980213608207958e-06, "loss": 0.2254, "step": 980 }, { "epoch": 0.3773657411855125, "grad_norm": 1.4762409477491152, "learning_rate": 9.979814162383052e-06, "loss": 0.2088, "step": 982 }, { "epoch": 0.3781343068498415, "grad_norm": 1.3761332895955432, "learning_rate": 9.979410732917211e-06, "loss": 0.207, "step": 984 }, { "epoch": 0.37890287251417043, "grad_norm": 1.9229580325433038, "learning_rate": 9.97900332013316e-06, "loss": 0.2224, "step": 986 }, { "epoch": 0.37967143817849935, "grad_norm": 1.3718400408418019, "learning_rate": 9.978591924356811e-06, "loss": 0.215, "step": 988 }, { "epoch": 0.38044000384282833, "grad_norm": 1.365549921507905, "learning_rate": 9.978176545917266e-06, "loss": 0.2082, "step": 990 }, { "epoch": 0.38120856950715726, "grad_norm": 1.3327405366607776, "learning_rate": 9.977757185146808e-06, "loss": 0.2019, "step": 992 }, { "epoch": 0.38197713517148624, "grad_norm": 1.598436545959415, "learning_rate": 9.977333842380907e-06, "loss": 0.2415, "step": 994 }, { "epoch": 0.38274570083581516, "grad_norm": 1.3796846564628238, "learning_rate": 9.976906517958222e-06, "loss": 0.2028, "step": 996 }, { "epoch": 0.3835142665001441, "grad_norm": 1.3890364616044417, "learning_rate": 9.976475212220593e-06, "loss": 0.1938, "step": 998 }, { "epoch": 0.38428283216447306, "grad_norm": 1.6860009251926829, "learning_rate": 9.976039925513048e-06, "loss": 0.2202, "step": 1000 }, { "epoch": 0.38428283216447306, "eval_loss": 0.19420889019966125, "eval_runtime": 391.3553, "eval_samples_per_second": 47.284, "eval_steps_per_second": 5.913, "step": 1000 }, { "epoch": 0.385051397828802, "grad_norm": 1.4377302790001703, "learning_rate": 9.975600658183795e-06, "loss": 0.2114, "step": 1002 }, { "epoch": 0.38581996349313097, "grad_norm": 1.6433335566125242, "learning_rate": 9.97515741058423e-06, "loss": 0.2267, "step": 1004 }, { "epoch": 0.3865885291574599, "grad_norm": 1.6130638373523, "learning_rate": 9.974710183068935e-06, "loss": 0.2208, "step": 1006 }, { "epoch": 0.3873570948217888, "grad_norm": 1.2768140688573653, "learning_rate": 9.974258975995672e-06, "loss": 0.1753, "step": 1008 }, { "epoch": 0.3881256604861178, "grad_norm": 1.5675808149911346, "learning_rate": 9.973803789725384e-06, "loss": 0.2143, "step": 1010 }, { "epoch": 0.3888942261504467, "grad_norm": 1.645024677358577, "learning_rate": 9.973344624622206e-06, "loss": 0.2148, "step": 1012 }, { "epoch": 0.3896627918147757, "grad_norm": 1.6250878916780662, "learning_rate": 9.972881481053449e-06, "loss": 0.2248, "step": 1014 }, { "epoch": 0.3904313574791046, "grad_norm": 1.517433598564765, "learning_rate": 9.972414359389606e-06, "loss": 0.1974, "step": 1016 }, { "epoch": 0.39119992314343355, "grad_norm": 1.8093481654939418, "learning_rate": 9.971943260004357e-06, "loss": 0.2146, "step": 1018 }, { "epoch": 0.3919684888077625, "grad_norm": 1.5368568343870446, "learning_rate": 9.971468183274562e-06, "loss": 0.219, "step": 1020 }, { "epoch": 0.39273705447209145, "grad_norm": 1.4636719560928695, "learning_rate": 9.970989129580259e-06, "loss": 0.2109, "step": 1022 }, { "epoch": 0.39350562013642043, "grad_norm": 1.570171338880823, "learning_rate": 9.970506099304676e-06, "loss": 0.2346, "step": 1024 }, { "epoch": 0.39427418580074935, "grad_norm": 1.5041645972989865, "learning_rate": 9.970019092834211e-06, "loss": 0.2326, "step": 1026 }, { "epoch": 0.3950427514650783, "grad_norm": 1.3187953081820838, "learning_rate": 9.969528110558452e-06, "loss": 0.2063, "step": 1028 }, { "epoch": 0.39581131712940726, "grad_norm": 1.5504447664704968, "learning_rate": 9.969033152870164e-06, "loss": 0.2198, "step": 1030 }, { "epoch": 0.3965798827937362, "grad_norm": 1.5128169686046529, "learning_rate": 9.968534220165292e-06, "loss": 0.1993, "step": 1032 }, { "epoch": 0.39734844845806516, "grad_norm": 1.4790798641804794, "learning_rate": 9.968031312842959e-06, "loss": 0.2005, "step": 1034 }, { "epoch": 0.3981170141223941, "grad_norm": 1.4536952563898289, "learning_rate": 9.967524431305473e-06, "loss": 0.1978, "step": 1036 }, { "epoch": 0.398885579786723, "grad_norm": 1.4641405709293691, "learning_rate": 9.967013575958315e-06, "loss": 0.2199, "step": 1038 }, { "epoch": 0.399654145451052, "grad_norm": 1.3634966807389484, "learning_rate": 9.96649874721015e-06, "loss": 0.2112, "step": 1040 }, { "epoch": 0.4004227111153809, "grad_norm": 1.4176855400080206, "learning_rate": 9.965979945472818e-06, "loss": 0.1958, "step": 1042 }, { "epoch": 0.4011912767797099, "grad_norm": 1.3999801618516174, "learning_rate": 9.965457171161339e-06, "loss": 0.2075, "step": 1044 }, { "epoch": 0.4019598424440388, "grad_norm": 1.4827612508898096, "learning_rate": 9.964930424693908e-06, "loss": 0.2155, "step": 1046 }, { "epoch": 0.40272840810836774, "grad_norm": 1.6901454630859463, "learning_rate": 9.964399706491904e-06, "loss": 0.2439, "step": 1048 }, { "epoch": 0.4034969737726967, "grad_norm": 1.4272831946801088, "learning_rate": 9.963865016979877e-06, "loss": 0.1958, "step": 1050 }, { "epoch": 0.40426553943702564, "grad_norm": 1.3238454021245833, "learning_rate": 9.963326356585554e-06, "loss": 0.2066, "step": 1052 }, { "epoch": 0.4050341051013546, "grad_norm": 1.560396917599262, "learning_rate": 9.962783725739844e-06, "loss": 0.2001, "step": 1054 }, { "epoch": 0.40580267076568355, "grad_norm": 1.544397827502799, "learning_rate": 9.962237124876828e-06, "loss": 0.2191, "step": 1056 }, { "epoch": 0.4065712364300125, "grad_norm": 1.6123703170989347, "learning_rate": 9.961686554433762e-06, "loss": 0.2398, "step": 1058 }, { "epoch": 0.40733980209434145, "grad_norm": 1.7689778480719758, "learning_rate": 9.961132014851079e-06, "loss": 0.2051, "step": 1060 }, { "epoch": 0.4081083677586704, "grad_norm": 1.470223690030404, "learning_rate": 9.960573506572391e-06, "loss": 0.1976, "step": 1062 }, { "epoch": 0.4088769334229993, "grad_norm": 1.3432958854227, "learning_rate": 9.960011030044477e-06, "loss": 0.2079, "step": 1064 }, { "epoch": 0.4096454990873283, "grad_norm": 1.3343477077671035, "learning_rate": 9.959444585717295e-06, "loss": 0.1908, "step": 1066 }, { "epoch": 0.4104140647516572, "grad_norm": 1.418379672436081, "learning_rate": 9.958874174043977e-06, "loss": 0.2055, "step": 1068 }, { "epoch": 0.4111826304159862, "grad_norm": 1.4075578888446751, "learning_rate": 9.95829979548083e-06, "loss": 0.2043, "step": 1070 }, { "epoch": 0.4119511960803151, "grad_norm": 1.5171084849414287, "learning_rate": 9.95772145048733e-06, "loss": 0.2195, "step": 1072 }, { "epoch": 0.41271976174464403, "grad_norm": 1.67359157463377, "learning_rate": 9.95713913952613e-06, "loss": 0.2219, "step": 1074 }, { "epoch": 0.413488327408973, "grad_norm": 1.3371422173923004, "learning_rate": 9.956552863063053e-06, "loss": 0.2088, "step": 1076 }, { "epoch": 0.41425689307330194, "grad_norm": 1.409739202870157, "learning_rate": 9.955962621567099e-06, "loss": 0.1989, "step": 1078 }, { "epoch": 0.4150254587376309, "grad_norm": 1.5582939160453675, "learning_rate": 9.955368415510432e-06, "loss": 0.2348, "step": 1080 }, { "epoch": 0.41579402440195984, "grad_norm": 1.469571743550721, "learning_rate": 9.954770245368392e-06, "loss": 0.2215, "step": 1082 }, { "epoch": 0.41656259006628876, "grad_norm": 1.583794633502873, "learning_rate": 9.95416811161949e-06, "loss": 0.2209, "step": 1084 }, { "epoch": 0.41733115573061774, "grad_norm": 1.3969644412496027, "learning_rate": 9.953562014745412e-06, "loss": 0.2207, "step": 1086 }, { "epoch": 0.41809972139494667, "grad_norm": 1.5068831141198915, "learning_rate": 9.952951955231005e-06, "loss": 0.222, "step": 1088 }, { "epoch": 0.41886828705927565, "grad_norm": 1.4071045200147443, "learning_rate": 9.952337933564294e-06, "loss": 0.1902, "step": 1090 }, { "epoch": 0.41963685272360457, "grad_norm": 1.5205728615811025, "learning_rate": 9.951719950236467e-06, "loss": 0.2198, "step": 1092 }, { "epoch": 0.4204054183879335, "grad_norm": 1.4835021135787179, "learning_rate": 9.951098005741885e-06, "loss": 0.2321, "step": 1094 }, { "epoch": 0.4211739840522625, "grad_norm": 1.4951824832422167, "learning_rate": 9.950472100578082e-06, "loss": 0.2046, "step": 1096 }, { "epoch": 0.4219425497165914, "grad_norm": 1.4761365689645203, "learning_rate": 9.94984223524575e-06, "loss": 0.2168, "step": 1098 }, { "epoch": 0.4227111153809204, "grad_norm": 1.4533464362815016, "learning_rate": 9.949208410248758e-06, "loss": 0.2019, "step": 1100 }, { "epoch": 0.4234796810452493, "grad_norm": 1.8288740536029644, "learning_rate": 9.948570626094138e-06, "loss": 0.2026, "step": 1102 }, { "epoch": 0.4242482467095782, "grad_norm": 1.5587733785631785, "learning_rate": 9.94792888329209e-06, "loss": 0.22, "step": 1104 }, { "epoch": 0.4250168123739072, "grad_norm": 1.4381655106715092, "learning_rate": 9.947283182355982e-06, "loss": 0.2153, "step": 1106 }, { "epoch": 0.42578537803823613, "grad_norm": 1.5483615532493467, "learning_rate": 9.946633523802346e-06, "loss": 0.208, "step": 1108 }, { "epoch": 0.4265539437025651, "grad_norm": 2.3388380676655514, "learning_rate": 9.945979908150884e-06, "loss": 0.2288, "step": 1110 }, { "epoch": 0.42732250936689403, "grad_norm": 1.570959182428413, "learning_rate": 9.94532233592446e-06, "loss": 0.1958, "step": 1112 }, { "epoch": 0.42809107503122296, "grad_norm": 1.716615907306885, "learning_rate": 9.944660807649102e-06, "loss": 0.2407, "step": 1114 }, { "epoch": 0.42885964069555194, "grad_norm": 1.4910927831768912, "learning_rate": 9.943995323854007e-06, "loss": 0.2078, "step": 1116 }, { "epoch": 0.42962820635988086, "grad_norm": 1.5723222965438568, "learning_rate": 9.943325885071534e-06, "loss": 0.246, "step": 1118 }, { "epoch": 0.43039677202420984, "grad_norm": 1.748642080906803, "learning_rate": 9.942652491837203e-06, "loss": 0.2096, "step": 1120 }, { "epoch": 0.43116533768853876, "grad_norm": 1.3719125900188793, "learning_rate": 9.941975144689705e-06, "loss": 0.2025, "step": 1122 }, { "epoch": 0.4319339033528677, "grad_norm": 1.6117229210404536, "learning_rate": 9.941293844170882e-06, "loss": 0.2223, "step": 1124 }, { "epoch": 0.43270246901719667, "grad_norm": 1.324631993620901, "learning_rate": 9.940608590825751e-06, "loss": 0.1937, "step": 1126 }, { "epoch": 0.4334710346815256, "grad_norm": 1.4633145698686136, "learning_rate": 9.939919385202485e-06, "loss": 0.2068, "step": 1128 }, { "epoch": 0.43423960034585457, "grad_norm": 1.5027005896450814, "learning_rate": 9.93922622785242e-06, "loss": 0.2324, "step": 1130 }, { "epoch": 0.4350081660101835, "grad_norm": 1.3601616538842676, "learning_rate": 9.93852911933005e-06, "loss": 0.2173, "step": 1132 }, { "epoch": 0.4357767316745124, "grad_norm": 1.4426128618252998, "learning_rate": 9.937828060193033e-06, "loss": 0.1847, "step": 1134 }, { "epoch": 0.4365452973388414, "grad_norm": 1.4227251687672742, "learning_rate": 9.937123051002188e-06, "loss": 0.196, "step": 1136 }, { "epoch": 0.4373138630031703, "grad_norm": 1.3816094846443345, "learning_rate": 9.93641409232149e-06, "loss": 0.201, "step": 1138 }, { "epoch": 0.4380824286674993, "grad_norm": 1.4887031645182767, "learning_rate": 9.935701184718082e-06, "loss": 0.2149, "step": 1140 }, { "epoch": 0.4388509943318282, "grad_norm": 1.3527337426257124, "learning_rate": 9.934984328762254e-06, "loss": 0.1957, "step": 1142 }, { "epoch": 0.43961955999615715, "grad_norm": 1.5741435171097717, "learning_rate": 9.934263525027463e-06, "loss": 0.2051, "step": 1144 }, { "epoch": 0.44038812566048613, "grad_norm": 1.4056372404524864, "learning_rate": 9.93353877409032e-06, "loss": 0.2168, "step": 1146 }, { "epoch": 0.44115669132481505, "grad_norm": 1.4655500499307113, "learning_rate": 9.932810076530597e-06, "loss": 0.2107, "step": 1148 }, { "epoch": 0.44192525698914403, "grad_norm": 1.4244329718946038, "learning_rate": 9.93207743293122e-06, "loss": 0.2022, "step": 1150 }, { "epoch": 0.44269382265347296, "grad_norm": 1.514129838141836, "learning_rate": 9.931340843878272e-06, "loss": 0.197, "step": 1152 }, { "epoch": 0.4434623883178019, "grad_norm": 1.3967935643739222, "learning_rate": 9.930600309960997e-06, "loss": 0.2246, "step": 1154 }, { "epoch": 0.44423095398213086, "grad_norm": 1.4685506395183272, "learning_rate": 9.929855831771787e-06, "loss": 0.1872, "step": 1156 }, { "epoch": 0.4449995196464598, "grad_norm": 1.5262905809942995, "learning_rate": 9.929107409906193e-06, "loss": 0.1879, "step": 1158 }, { "epoch": 0.44576808531078876, "grad_norm": 1.4537216254807916, "learning_rate": 9.928355044962923e-06, "loss": 0.2121, "step": 1160 }, { "epoch": 0.4465366509751177, "grad_norm": 1.2620566754236306, "learning_rate": 9.927598737543838e-06, "loss": 0.2019, "step": 1162 }, { "epoch": 0.4473052166394466, "grad_norm": 1.6530128730492903, "learning_rate": 9.926838488253948e-06, "loss": 0.22, "step": 1164 }, { "epoch": 0.4480737823037756, "grad_norm": 1.7672282909479509, "learning_rate": 9.926074297701425e-06, "loss": 0.2018, "step": 1166 }, { "epoch": 0.4488423479681045, "grad_norm": 1.3764740895920753, "learning_rate": 9.925306166497585e-06, "loss": 0.1933, "step": 1168 }, { "epoch": 0.4496109136324335, "grad_norm": 2.1872868950165096, "learning_rate": 9.924534095256903e-06, "loss": 0.199, "step": 1170 }, { "epoch": 0.4503794792967624, "grad_norm": 1.5031329735387424, "learning_rate": 9.923758084597002e-06, "loss": 0.1886, "step": 1172 }, { "epoch": 0.45114804496109134, "grad_norm": 1.2825307649374589, "learning_rate": 9.922978135138658e-06, "loss": 0.2049, "step": 1174 }, { "epoch": 0.4519166106254203, "grad_norm": 1.4283510779058315, "learning_rate": 9.9221942475058e-06, "loss": 0.2022, "step": 1176 }, { "epoch": 0.45268517628974925, "grad_norm": 1.774710145759505, "learning_rate": 9.921406422325504e-06, "loss": 0.1942, "step": 1178 }, { "epoch": 0.4534537419540782, "grad_norm": 2.394937422873874, "learning_rate": 9.920614660227992e-06, "loss": 0.2137, "step": 1180 }, { "epoch": 0.45422230761840715, "grad_norm": 1.3969713287857692, "learning_rate": 9.919818961846647e-06, "loss": 0.1883, "step": 1182 }, { "epoch": 0.4549908732827361, "grad_norm": 1.3585655674432215, "learning_rate": 9.919019327817991e-06, "loss": 0.1841, "step": 1184 }, { "epoch": 0.45575943894706505, "grad_norm": 1.310686159488752, "learning_rate": 9.918215758781698e-06, "loss": 0.2229, "step": 1186 }, { "epoch": 0.456528004611394, "grad_norm": 1.4038306047900346, "learning_rate": 9.917408255380587e-06, "loss": 0.2173, "step": 1188 }, { "epoch": 0.45729657027572296, "grad_norm": 1.4358616905401873, "learning_rate": 9.91659681826063e-06, "loss": 0.2052, "step": 1190 }, { "epoch": 0.4580651359400519, "grad_norm": 1.2713239411509525, "learning_rate": 9.91578144807094e-06, "loss": 0.2001, "step": 1192 }, { "epoch": 0.4588337016043808, "grad_norm": 1.4090235600951024, "learning_rate": 9.914962145463782e-06, "loss": 0.2104, "step": 1194 }, { "epoch": 0.4596022672687098, "grad_norm": 1.3378404521119245, "learning_rate": 9.91413891109456e-06, "loss": 0.2136, "step": 1196 }, { "epoch": 0.4603708329330387, "grad_norm": 1.4299272125388527, "learning_rate": 9.913311745621827e-06, "loss": 0.2008, "step": 1198 }, { "epoch": 0.4611393985973677, "grad_norm": 1.3254645678987609, "learning_rate": 9.912480649707282e-06, "loss": 0.2154, "step": 1200 }, { "epoch": 0.4619079642616966, "grad_norm": 1.6689726886918228, "learning_rate": 9.911645624015764e-06, "loss": 0.2257, "step": 1202 }, { "epoch": 0.46267652992602554, "grad_norm": 1.2972838599335297, "learning_rate": 9.910806669215263e-06, "loss": 0.1876, "step": 1204 }, { "epoch": 0.4634450955903545, "grad_norm": 1.4935257751458701, "learning_rate": 9.909963785976902e-06, "loss": 0.2022, "step": 1206 }, { "epoch": 0.46421366125468344, "grad_norm": 2.0315721408061327, "learning_rate": 9.909116974974957e-06, "loss": 0.2081, "step": 1208 }, { "epoch": 0.4649822269190124, "grad_norm": 1.4038873466676118, "learning_rate": 9.908266236886836e-06, "loss": 0.1927, "step": 1210 }, { "epoch": 0.46575079258334134, "grad_norm": 1.3774743413972566, "learning_rate": 9.907411572393098e-06, "loss": 0.2149, "step": 1212 }, { "epoch": 0.46651935824767027, "grad_norm": 1.517335976203843, "learning_rate": 9.906552982177438e-06, "loss": 0.2283, "step": 1214 }, { "epoch": 0.46728792391199925, "grad_norm": 1.3890627630447039, "learning_rate": 9.905690466926687e-06, "loss": 0.196, "step": 1216 }, { "epoch": 0.46805648957632817, "grad_norm": 1.3725597232193358, "learning_rate": 9.904824027330825e-06, "loss": 0.1935, "step": 1218 }, { "epoch": 0.4688250552406571, "grad_norm": 1.4756798540863383, "learning_rate": 9.903953664082969e-06, "loss": 0.2082, "step": 1220 }, { "epoch": 0.4695936209049861, "grad_norm": 1.3693969986245964, "learning_rate": 9.903079377879368e-06, "loss": 0.1867, "step": 1222 }, { "epoch": 0.470362186569315, "grad_norm": 1.6173276804614833, "learning_rate": 9.902201169419416e-06, "loss": 0.2128, "step": 1224 }, { "epoch": 0.471130752233644, "grad_norm": 1.3551491545931513, "learning_rate": 9.901319039405643e-06, "loss": 0.2135, "step": 1226 }, { "epoch": 0.4718993178979729, "grad_norm": 1.26217892839208, "learning_rate": 9.900432988543718e-06, "loss": 0.185, "step": 1228 }, { "epoch": 0.4726678835623018, "grad_norm": 1.3955434453276478, "learning_rate": 9.899543017542439e-06, "loss": 0.2277, "step": 1230 }, { "epoch": 0.4734364492266308, "grad_norm": 1.4315317015796312, "learning_rate": 9.89864912711375e-06, "loss": 0.1972, "step": 1232 }, { "epoch": 0.47420501489095973, "grad_norm": 1.5285413369307779, "learning_rate": 9.89775131797272e-06, "loss": 0.2554, "step": 1234 }, { "epoch": 0.4749735805552887, "grad_norm": 1.4083099168166353, "learning_rate": 9.896849590837567e-06, "loss": 0.2088, "step": 1236 }, { "epoch": 0.47574214621961763, "grad_norm": 1.3641208993547835, "learning_rate": 9.895943946429626e-06, "loss": 0.1886, "step": 1238 }, { "epoch": 0.47651071188394656, "grad_norm": 1.288315610967532, "learning_rate": 9.895034385473379e-06, "loss": 0.2182, "step": 1240 }, { "epoch": 0.47727927754827554, "grad_norm": 1.4548393277223735, "learning_rate": 9.894120908696435e-06, "loss": 0.2079, "step": 1242 }, { "epoch": 0.47804784321260446, "grad_norm": 1.3988581265230586, "learning_rate": 9.893203516829537e-06, "loss": 0.2182, "step": 1244 }, { "epoch": 0.47881640887693344, "grad_norm": 1.4719421555376868, "learning_rate": 9.892282210606561e-06, "loss": 0.2034, "step": 1246 }, { "epoch": 0.47958497454126237, "grad_norm": 1.490420779310944, "learning_rate": 9.89135699076451e-06, "loss": 0.2246, "step": 1248 }, { "epoch": 0.4803535402055913, "grad_norm": 1.595495867841866, "learning_rate": 9.890427858043523e-06, "loss": 0.208, "step": 1250 }, { "epoch": 0.48112210586992027, "grad_norm": 1.4853250409414513, "learning_rate": 9.889494813186867e-06, "loss": 0.1954, "step": 1252 }, { "epoch": 0.4818906715342492, "grad_norm": 1.1582287632452946, "learning_rate": 9.888557856940938e-06, "loss": 0.185, "step": 1254 }, { "epoch": 0.4826592371985782, "grad_norm": 1.3788824890919835, "learning_rate": 9.887616990055262e-06, "loss": 0.1919, "step": 1256 }, { "epoch": 0.4834278028629071, "grad_norm": 1.307081191264779, "learning_rate": 9.886672213282491e-06, "loss": 0.2175, "step": 1258 }, { "epoch": 0.484196368527236, "grad_norm": 1.354066978624733, "learning_rate": 9.885723527378407e-06, "loss": 0.2153, "step": 1260 }, { "epoch": 0.484964934191565, "grad_norm": 1.4483194071709067, "learning_rate": 9.884770933101922e-06, "loss": 0.2003, "step": 1262 }, { "epoch": 0.4857334998558939, "grad_norm": 1.80775067918897, "learning_rate": 9.883814431215066e-06, "loss": 0.2044, "step": 1264 }, { "epoch": 0.4865020655202229, "grad_norm": 1.4454189147029592, "learning_rate": 9.882854022483005e-06, "loss": 0.2164, "step": 1266 }, { "epoch": 0.48727063118455183, "grad_norm": 1.2626466034612045, "learning_rate": 9.881889707674019e-06, "loss": 0.2103, "step": 1268 }, { "epoch": 0.48803919684888075, "grad_norm": 1.4409015400800942, "learning_rate": 9.880921487559526e-06, "loss": 0.2106, "step": 1270 }, { "epoch": 0.48880776251320973, "grad_norm": 1.232539097320994, "learning_rate": 9.879949362914059e-06, "loss": 0.2102, "step": 1272 }, { "epoch": 0.48957632817753866, "grad_norm": 1.3289181439833337, "learning_rate": 9.878973334515274e-06, "loss": 0.2079, "step": 1274 }, { "epoch": 0.49034489384186764, "grad_norm": 1.2977712144852023, "learning_rate": 9.877993403143956e-06, "loss": 0.2173, "step": 1276 }, { "epoch": 0.49111345950619656, "grad_norm": 1.3786915735363086, "learning_rate": 9.877009569584005e-06, "loss": 0.2048, "step": 1278 }, { "epoch": 0.4918820251705255, "grad_norm": 1.4309611246277152, "learning_rate": 9.87602183462245e-06, "loss": 0.2237, "step": 1280 }, { "epoch": 0.49265059083485446, "grad_norm": 1.2695524207709121, "learning_rate": 9.875030199049436e-06, "loss": 0.2071, "step": 1282 }, { "epoch": 0.4934191564991834, "grad_norm": 1.3206912530411206, "learning_rate": 9.874034663658227e-06, "loss": 0.1885, "step": 1284 }, { "epoch": 0.49418772216351237, "grad_norm": 1.5548416804083998, "learning_rate": 9.873035229245214e-06, "loss": 0.2129, "step": 1286 }, { "epoch": 0.4949562878278413, "grad_norm": 1.3513363590542624, "learning_rate": 9.872031896609899e-06, "loss": 0.2072, "step": 1288 }, { "epoch": 0.4957248534921702, "grad_norm": 1.2436363136248427, "learning_rate": 9.871024666554905e-06, "loss": 0.1933, "step": 1290 }, { "epoch": 0.4964934191564992, "grad_norm": 1.4783879051516648, "learning_rate": 9.870013539885976e-06, "loss": 0.1738, "step": 1292 }, { "epoch": 0.4972619848208281, "grad_norm": 1.4245292256956743, "learning_rate": 9.86899851741197e-06, "loss": 0.2062, "step": 1294 }, { "epoch": 0.4980305504851571, "grad_norm": 1.4476867016230186, "learning_rate": 9.867979599944861e-06, "loss": 0.1981, "step": 1296 }, { "epoch": 0.498799116149486, "grad_norm": 1.4592529343491007, "learning_rate": 9.86695678829974e-06, "loss": 0.2144, "step": 1298 }, { "epoch": 0.49956768181381495, "grad_norm": 1.3923850375945273, "learning_rate": 9.865930083294815e-06, "loss": 0.2111, "step": 1300 }, { "epoch": 0.5003362474781439, "grad_norm": 1.329078521265029, "learning_rate": 9.864899485751405e-06, "loss": 0.193, "step": 1302 }, { "epoch": 0.5011048131424729, "grad_norm": 1.4484624509299158, "learning_rate": 9.863864996493944e-06, "loss": 0.217, "step": 1304 }, { "epoch": 0.5018733788068018, "grad_norm": 1.2172876509674269, "learning_rate": 9.862826616349981e-06, "loss": 0.1815, "step": 1306 }, { "epoch": 0.5026419444711308, "grad_norm": 1.2738939573020764, "learning_rate": 9.861784346150175e-06, "loss": 0.1947, "step": 1308 }, { "epoch": 0.5034105101354597, "grad_norm": 1.4173782402079247, "learning_rate": 9.8607381867283e-06, "loss": 0.2006, "step": 1310 }, { "epoch": 0.5041790757997886, "grad_norm": 1.4632440254365329, "learning_rate": 9.859688138921239e-06, "loss": 0.1763, "step": 1312 }, { "epoch": 0.5049476414641176, "grad_norm": 1.3630638349772646, "learning_rate": 9.858634203568984e-06, "loss": 0.203, "step": 1314 }, { "epoch": 0.5057162071284466, "grad_norm": 1.356434403108063, "learning_rate": 9.85757638151464e-06, "loss": 0.1846, "step": 1316 }, { "epoch": 0.5064847727927755, "grad_norm": 1.3452093312535778, "learning_rate": 9.85651467360442e-06, "loss": 0.2136, "step": 1318 }, { "epoch": 0.5072533384571044, "grad_norm": 1.1604783048807306, "learning_rate": 9.855449080687645e-06, "loss": 0.1955, "step": 1320 }, { "epoch": 0.5080219041214333, "grad_norm": 1.1772070306212823, "learning_rate": 9.854379603616743e-06, "loss": 0.2054, "step": 1322 }, { "epoch": 0.5087904697857624, "grad_norm": 1.2901033171832479, "learning_rate": 9.853306243247253e-06, "loss": 0.1915, "step": 1324 }, { "epoch": 0.5095590354500913, "grad_norm": 1.361118039363656, "learning_rate": 9.852229000437819e-06, "loss": 0.1965, "step": 1326 }, { "epoch": 0.5103276011144202, "grad_norm": 1.5807294884042364, "learning_rate": 9.851147876050187e-06, "loss": 0.2168, "step": 1328 }, { "epoch": 0.5110961667787491, "grad_norm": 1.418540631483589, "learning_rate": 9.850062870949209e-06, "loss": 0.2024, "step": 1330 }, { "epoch": 0.5118647324430781, "grad_norm": 1.3271900080762289, "learning_rate": 9.848973986002847e-06, "loss": 0.2159, "step": 1332 }, { "epoch": 0.5126332981074071, "grad_norm": 1.3407357605084491, "learning_rate": 9.847881222082165e-06, "loss": 0.2027, "step": 1334 }, { "epoch": 0.513401863771736, "grad_norm": 1.3308906378900534, "learning_rate": 9.846784580061322e-06, "loss": 0.1867, "step": 1336 }, { "epoch": 0.514170429436065, "grad_norm": 1.3949769272229728, "learning_rate": 9.845684060817588e-06, "loss": 0.2192, "step": 1338 }, { "epoch": 0.5149389951003939, "grad_norm": 1.2888696936225625, "learning_rate": 9.844579665231331e-06, "loss": 0.1977, "step": 1340 }, { "epoch": 0.5157075607647228, "grad_norm": 1.406520161910905, "learning_rate": 9.843471394186024e-06, "loss": 0.221, "step": 1342 }, { "epoch": 0.5164761264290518, "grad_norm": 1.37703443666149, "learning_rate": 9.842359248568232e-06, "loss": 0.2024, "step": 1344 }, { "epoch": 0.5172446920933808, "grad_norm": 1.309393854784345, "learning_rate": 9.84124322926763e-06, "loss": 0.2011, "step": 1346 }, { "epoch": 0.5180132577577097, "grad_norm": 1.3528852548502097, "learning_rate": 9.840123337176981e-06, "loss": 0.1996, "step": 1348 }, { "epoch": 0.5187818234220386, "grad_norm": 1.3621288779999257, "learning_rate": 9.838999573192156e-06, "loss": 0.1865, "step": 1350 }, { "epoch": 0.5195503890863675, "grad_norm": 1.5539198751018202, "learning_rate": 9.837871938212116e-06, "loss": 0.2108, "step": 1352 }, { "epoch": 0.5203189547506966, "grad_norm": 1.1928473042119738, "learning_rate": 9.836740433138922e-06, "loss": 0.198, "step": 1354 }, { "epoch": 0.5210875204150255, "grad_norm": 1.287109770173762, "learning_rate": 9.83560505887773e-06, "loss": 0.2062, "step": 1356 }, { "epoch": 0.5218560860793544, "grad_norm": 1.3345148934113393, "learning_rate": 9.834465816336791e-06, "loss": 0.2123, "step": 1358 }, { "epoch": 0.5226246517436833, "grad_norm": 1.2834945880979858, "learning_rate": 9.833322706427451e-06, "loss": 0.195, "step": 1360 }, { "epoch": 0.5233932174080123, "grad_norm": 1.3753731448704771, "learning_rate": 9.832175730064154e-06, "loss": 0.1918, "step": 1362 }, { "epoch": 0.5241617830723413, "grad_norm": 1.3794928499297416, "learning_rate": 9.831024888164424e-06, "loss": 0.201, "step": 1364 }, { "epoch": 0.5249303487366702, "grad_norm": 1.3295977056087662, "learning_rate": 9.829870181648892e-06, "loss": 0.191, "step": 1366 }, { "epoch": 0.5256989144009991, "grad_norm": 1.5835788347506514, "learning_rate": 9.828711611441275e-06, "loss": 0.2179, "step": 1368 }, { "epoch": 0.5264674800653281, "grad_norm": 1.4062257781877492, "learning_rate": 9.827549178468376e-06, "loss": 0.1996, "step": 1370 }, { "epoch": 0.527236045729657, "grad_norm": 1.3016394709768888, "learning_rate": 9.826382883660097e-06, "loss": 0.2085, "step": 1372 }, { "epoch": 0.528004611393986, "grad_norm": 1.4242252559437534, "learning_rate": 9.82521272794942e-06, "loss": 0.2042, "step": 1374 }, { "epoch": 0.528773177058315, "grad_norm": 1.3579480640265167, "learning_rate": 9.824038712272423e-06, "loss": 0.2217, "step": 1376 }, { "epoch": 0.5295417427226439, "grad_norm": 1.439747343806765, "learning_rate": 9.82286083756827e-06, "loss": 0.2173, "step": 1378 }, { "epoch": 0.5303103083869728, "grad_norm": 1.3461877561493047, "learning_rate": 9.821679104779208e-06, "loss": 0.1908, "step": 1380 }, { "epoch": 0.5310788740513017, "grad_norm": 1.4730359360930283, "learning_rate": 9.820493514850574e-06, "loss": 0.2162, "step": 1382 }, { "epoch": 0.5318474397156308, "grad_norm": 1.4477489253250448, "learning_rate": 9.819304068730791e-06, "loss": 0.2066, "step": 1384 }, { "epoch": 0.5326160053799597, "grad_norm": 1.4362128410631725, "learning_rate": 9.818110767371365e-06, "loss": 0.1925, "step": 1386 }, { "epoch": 0.5333845710442886, "grad_norm": 1.3986814785631492, "learning_rate": 9.816913611726886e-06, "loss": 0.2297, "step": 1388 }, { "epoch": 0.5341531367086175, "grad_norm": 1.3300939750290368, "learning_rate": 9.81571260275503e-06, "loss": 0.1925, "step": 1390 }, { "epoch": 0.5349217023729465, "grad_norm": 1.3277732840383096, "learning_rate": 9.81450774141655e-06, "loss": 0.1858, "step": 1392 }, { "epoch": 0.5356902680372755, "grad_norm": 1.3708987845252871, "learning_rate": 9.813299028675288e-06, "loss": 0.209, "step": 1394 }, { "epoch": 0.5364588337016044, "grad_norm": 1.2349689292533583, "learning_rate": 9.812086465498161e-06, "loss": 0.1827, "step": 1396 }, { "epoch": 0.5372273993659333, "grad_norm": 1.3333174267670862, "learning_rate": 9.810870052855168e-06, "loss": 0.2253, "step": 1398 }, { "epoch": 0.5379959650302623, "grad_norm": 1.3614257350584085, "learning_rate": 9.80964979171939e-06, "loss": 0.2139, "step": 1400 }, { "epoch": 0.5387645306945912, "grad_norm": 1.2571688607617737, "learning_rate": 9.808425683066982e-06, "loss": 0.2206, "step": 1402 }, { "epoch": 0.5395330963589202, "grad_norm": 1.486397005807379, "learning_rate": 9.807197727877179e-06, "loss": 0.2091, "step": 1404 }, { "epoch": 0.5403016620232491, "grad_norm": 1.2708852857353887, "learning_rate": 9.805965927132294e-06, "loss": 0.179, "step": 1406 }, { "epoch": 0.5410702276875781, "grad_norm": 1.3423725048308324, "learning_rate": 9.804730281817718e-06, "loss": 0.2176, "step": 1408 }, { "epoch": 0.541838793351907, "grad_norm": 1.449081275675123, "learning_rate": 9.803490792921912e-06, "loss": 0.2335, "step": 1410 }, { "epoch": 0.5426073590162359, "grad_norm": 1.3894909498222905, "learning_rate": 9.802247461436415e-06, "loss": 0.1955, "step": 1412 }, { "epoch": 0.543375924680565, "grad_norm": 1.5789386976972783, "learning_rate": 9.80100028835584e-06, "loss": 0.2192, "step": 1414 }, { "epoch": 0.5441444903448939, "grad_norm": 1.4331990017051242, "learning_rate": 9.799749274677875e-06, "loss": 0.1873, "step": 1416 }, { "epoch": 0.5449130560092228, "grad_norm": 1.3725789713422325, "learning_rate": 9.798494421403274e-06, "loss": 0.2024, "step": 1418 }, { "epoch": 0.5456816216735517, "grad_norm": 1.3207936028855949, "learning_rate": 9.797235729535868e-06, "loss": 0.2004, "step": 1420 }, { "epoch": 0.5464501873378806, "grad_norm": 1.3370028156891876, "learning_rate": 9.79597320008256e-06, "loss": 0.2056, "step": 1422 }, { "epoch": 0.5472187530022097, "grad_norm": 1.4293601542479812, "learning_rate": 9.794706834053319e-06, "loss": 0.1995, "step": 1424 }, { "epoch": 0.5479873186665386, "grad_norm": 1.4223871811350175, "learning_rate": 9.79343663246118e-06, "loss": 0.1862, "step": 1426 }, { "epoch": 0.5487558843308675, "grad_norm": 1.4577422608125201, "learning_rate": 9.792162596322257e-06, "loss": 0.211, "step": 1428 }, { "epoch": 0.5495244499951965, "grad_norm": 1.4129154138717641, "learning_rate": 9.790884726655723e-06, "loss": 0.209, "step": 1430 }, { "epoch": 0.5502930156595254, "grad_norm": 1.198315912486837, "learning_rate": 9.789603024483816e-06, "loss": 0.1726, "step": 1432 }, { "epoch": 0.5510615813238544, "grad_norm": 1.2982214491245976, "learning_rate": 9.78831749083185e-06, "loss": 0.2087, "step": 1434 }, { "epoch": 0.5518301469881833, "grad_norm": 1.339997021513598, "learning_rate": 9.78702812672819e-06, "loss": 0.2031, "step": 1436 }, { "epoch": 0.5525987126525123, "grad_norm": 1.3295828461572072, "learning_rate": 9.78573493320428e-06, "loss": 0.1933, "step": 1438 }, { "epoch": 0.5533672783168412, "grad_norm": 1.3927372461090053, "learning_rate": 9.784437911294618e-06, "loss": 0.1776, "step": 1440 }, { "epoch": 0.5541358439811701, "grad_norm": 1.3562209501189604, "learning_rate": 9.783137062036765e-06, "loss": 0.2081, "step": 1442 }, { "epoch": 0.5549044096454991, "grad_norm": 1.298105937310008, "learning_rate": 9.781832386471346e-06, "loss": 0.2105, "step": 1444 }, { "epoch": 0.5556729753098281, "grad_norm": 1.30280262242187, "learning_rate": 9.780523885642046e-06, "loss": 0.1927, "step": 1446 }, { "epoch": 0.556441540974157, "grad_norm": 1.2483549864637862, "learning_rate": 9.779211560595611e-06, "loss": 0.1788, "step": 1448 }, { "epoch": 0.5572101066384859, "grad_norm": 1.2238811479420058, "learning_rate": 9.777895412381847e-06, "loss": 0.2074, "step": 1450 }, { "epoch": 0.5579786723028148, "grad_norm": 1.450742186708242, "learning_rate": 9.776575442053616e-06, "loss": 0.2053, "step": 1452 }, { "epoch": 0.5587472379671439, "grad_norm": 1.320882678690354, "learning_rate": 9.775251650666838e-06, "loss": 0.1937, "step": 1454 }, { "epoch": 0.5595158036314728, "grad_norm": 1.3642837210428673, "learning_rate": 9.773924039280488e-06, "loss": 0.1886, "step": 1456 }, { "epoch": 0.5602843692958017, "grad_norm": 1.4184347395377566, "learning_rate": 9.772592608956604e-06, "loss": 0.1788, "step": 1458 }, { "epoch": 0.5610529349601306, "grad_norm": 1.4740979532717917, "learning_rate": 9.771257360760268e-06, "loss": 0.2341, "step": 1460 }, { "epoch": 0.5618215006244596, "grad_norm": 1.4215665224560925, "learning_rate": 9.769918295759628e-06, "loss": 0.1981, "step": 1462 }, { "epoch": 0.5625900662887885, "grad_norm": 1.4124349135188115, "learning_rate": 9.768575415025878e-06, "loss": 0.2026, "step": 1464 }, { "epoch": 0.5633586319531175, "grad_norm": 1.2540874956346166, "learning_rate": 9.767228719633265e-06, "loss": 0.2, "step": 1466 }, { "epoch": 0.5641271976174465, "grad_norm": 1.3763632009209785, "learning_rate": 9.765878210659085e-06, "loss": 0.192, "step": 1468 }, { "epoch": 0.5648957632817754, "grad_norm": 1.28802520769284, "learning_rate": 9.764523889183693e-06, "loss": 0.1865, "step": 1470 }, { "epoch": 0.5656643289461043, "grad_norm": 1.2936195400951103, "learning_rate": 9.763165756290486e-06, "loss": 0.1862, "step": 1472 }, { "epoch": 0.5664328946104332, "grad_norm": 1.4057603899652855, "learning_rate": 9.761803813065915e-06, "loss": 0.224, "step": 1474 }, { "epoch": 0.5672014602747623, "grad_norm": 1.316535286851872, "learning_rate": 9.760438060599478e-06, "loss": 0.2157, "step": 1476 }, { "epoch": 0.5679700259390912, "grad_norm": 1.313641141432758, "learning_rate": 9.759068499983715e-06, "loss": 0.2137, "step": 1478 }, { "epoch": 0.5687385916034201, "grad_norm": 1.2486341006678794, "learning_rate": 9.75769513231422e-06, "loss": 0.19, "step": 1480 }, { "epoch": 0.569507157267749, "grad_norm": 1.2713345419796058, "learning_rate": 9.75631795868963e-06, "loss": 0.2162, "step": 1482 }, { "epoch": 0.570275722932078, "grad_norm": 1.2999523551620669, "learning_rate": 9.754936980211623e-06, "loss": 0.2059, "step": 1484 }, { "epoch": 0.571044288596407, "grad_norm": 1.314536889796257, "learning_rate": 9.753552197984922e-06, "loss": 0.2106, "step": 1486 }, { "epoch": 0.5718128542607359, "grad_norm": 1.2644556991901343, "learning_rate": 9.7521636131173e-06, "loss": 0.2061, "step": 1488 }, { "epoch": 0.5725814199250648, "grad_norm": 1.4922625202371915, "learning_rate": 9.75077122671956e-06, "loss": 0.202, "step": 1490 }, { "epoch": 0.5733499855893938, "grad_norm": 1.2414180004432929, "learning_rate": 9.749375039905557e-06, "loss": 0.2104, "step": 1492 }, { "epoch": 0.5741185512537227, "grad_norm": 1.3859175543840374, "learning_rate": 9.74797505379218e-06, "loss": 0.2125, "step": 1494 }, { "epoch": 0.5748871169180517, "grad_norm": 1.2561991963728598, "learning_rate": 9.746571269499356e-06, "loss": 0.1895, "step": 1496 }, { "epoch": 0.5756556825823806, "grad_norm": 1.239706370515697, "learning_rate": 9.74516368815006e-06, "loss": 0.1896, "step": 1498 }, { "epoch": 0.5764242482467096, "grad_norm": 1.3391964317739775, "learning_rate": 9.743752310870289e-06, "loss": 0.1933, "step": 1500 }, { "epoch": 0.5764242482467096, "eval_loss": 0.18106360733509064, "eval_runtime": 390.6218, "eval_samples_per_second": 47.373, "eval_steps_per_second": 5.924, "step": 1500 }, { "epoch": 0.5771928139110385, "grad_norm": 1.287426954916706, "learning_rate": 9.74233713878909e-06, "loss": 0.1888, "step": 1502 }, { "epoch": 0.5779613795753674, "grad_norm": 1.167198031275383, "learning_rate": 9.74091817303854e-06, "loss": 0.181, "step": 1504 }, { "epoch": 0.5787299452396965, "grad_norm": 1.418949258572521, "learning_rate": 9.739495414753754e-06, "loss": 0.2172, "step": 1506 }, { "epoch": 0.5794985109040254, "grad_norm": 1.497337608679801, "learning_rate": 9.738068865072874e-06, "loss": 0.2061, "step": 1508 }, { "epoch": 0.5802670765683543, "grad_norm": 1.3408473944395172, "learning_rate": 9.736638525137083e-06, "loss": 0.2263, "step": 1510 }, { "epoch": 0.5810356422326832, "grad_norm": 1.2355905356330328, "learning_rate": 9.735204396090593e-06, "loss": 0.1861, "step": 1512 }, { "epoch": 0.5818042078970121, "grad_norm": 1.369304238075575, "learning_rate": 9.733766479080643e-06, "loss": 0.1943, "step": 1514 }, { "epoch": 0.5825727735613412, "grad_norm": 1.308221787760333, "learning_rate": 9.732324775257507e-06, "loss": 0.205, "step": 1516 }, { "epoch": 0.5833413392256701, "grad_norm": 1.2506045757459512, "learning_rate": 9.73087928577449e-06, "loss": 0.2003, "step": 1518 }, { "epoch": 0.584109904889999, "grad_norm": 1.3078632468939588, "learning_rate": 9.72943001178792e-06, "loss": 0.2086, "step": 1520 }, { "epoch": 0.584878470554328, "grad_norm": 1.3441107640267025, "learning_rate": 9.727976954457154e-06, "loss": 0.2207, "step": 1522 }, { "epoch": 0.5856470362186569, "grad_norm": 1.1124377403418169, "learning_rate": 9.72652011494458e-06, "loss": 0.1805, "step": 1524 }, { "epoch": 0.5864156018829859, "grad_norm": 1.2505849198023506, "learning_rate": 9.725059494415605e-06, "loss": 0.1924, "step": 1526 }, { "epoch": 0.5871841675473148, "grad_norm": 1.3084539974954903, "learning_rate": 9.723595094038664e-06, "loss": 0.2007, "step": 1528 }, { "epoch": 0.5879527332116438, "grad_norm": 1.4045312411189377, "learning_rate": 9.722126914985218e-06, "loss": 0.1702, "step": 1530 }, { "epoch": 0.5887212988759727, "grad_norm": 1.421442505706631, "learning_rate": 9.720654958429744e-06, "loss": 0.2239, "step": 1532 }, { "epoch": 0.5894898645403016, "grad_norm": 1.3206061659271495, "learning_rate": 9.71917922554975e-06, "loss": 0.2046, "step": 1534 }, { "epoch": 0.5902584302046306, "grad_norm": 1.2716231961836306, "learning_rate": 9.717699717525757e-06, "loss": 0.2031, "step": 1536 }, { "epoch": 0.5910269958689596, "grad_norm": 1.1948915545558434, "learning_rate": 9.716216435541308e-06, "loss": 0.1714, "step": 1538 }, { "epoch": 0.5917955615332885, "grad_norm": 1.231034634495294, "learning_rate": 9.714729380782968e-06, "loss": 0.2066, "step": 1540 }, { "epoch": 0.5925641271976174, "grad_norm": 1.1758320616098612, "learning_rate": 9.713238554440316e-06, "loss": 0.1869, "step": 1542 }, { "epoch": 0.5933326928619463, "grad_norm": 1.2949441484368311, "learning_rate": 9.711743957705952e-06, "loss": 0.2195, "step": 1544 }, { "epoch": 0.5941012585262754, "grad_norm": 1.216016797701703, "learning_rate": 9.710245591775491e-06, "loss": 0.1896, "step": 1546 }, { "epoch": 0.5948698241906043, "grad_norm": 1.391559887876485, "learning_rate": 9.708743457847561e-06, "loss": 0.1969, "step": 1548 }, { "epoch": 0.5956383898549332, "grad_norm": 1.3990595473773835, "learning_rate": 9.707237557123804e-06, "loss": 0.2096, "step": 1550 }, { "epoch": 0.5964069555192621, "grad_norm": 1.3073850095721629, "learning_rate": 9.705727890808879e-06, "loss": 0.2071, "step": 1552 }, { "epoch": 0.5971755211835911, "grad_norm": 1.2496266878831166, "learning_rate": 9.704214460110454e-06, "loss": 0.19, "step": 1554 }, { "epoch": 0.5979440868479201, "grad_norm": 1.3048901983236318, "learning_rate": 9.702697266239211e-06, "loss": 0.203, "step": 1556 }, { "epoch": 0.598712652512249, "grad_norm": 3.854617101477928, "learning_rate": 9.701176310408843e-06, "loss": 0.1988, "step": 1558 }, { "epoch": 0.599481218176578, "grad_norm": 1.3969969542451366, "learning_rate": 9.699651593836045e-06, "loss": 0.2007, "step": 1560 }, { "epoch": 0.6002497838409069, "grad_norm": 1.3429314677058979, "learning_rate": 9.698123117740528e-06, "loss": 0.1925, "step": 1562 }, { "epoch": 0.6010183495052358, "grad_norm": 1.2358959869537518, "learning_rate": 9.696590883345011e-06, "loss": 0.1763, "step": 1564 }, { "epoch": 0.6017869151695648, "grad_norm": 1.2870100673390357, "learning_rate": 9.695054891875216e-06, "loss": 0.209, "step": 1566 }, { "epoch": 0.6025554808338938, "grad_norm": 1.293264886254371, "learning_rate": 9.693515144559868e-06, "loss": 0.2068, "step": 1568 }, { "epoch": 0.6033240464982227, "grad_norm": 2.3206124677828277, "learning_rate": 9.691971642630702e-06, "loss": 0.2052, "step": 1570 }, { "epoch": 0.6040926121625516, "grad_norm": 1.3348611684485352, "learning_rate": 9.690424387322454e-06, "loss": 0.2104, "step": 1572 }, { "epoch": 0.6048611778268805, "grad_norm": 1.1751768720449665, "learning_rate": 9.688873379872865e-06, "loss": 0.1996, "step": 1574 }, { "epoch": 0.6056297434912096, "grad_norm": 1.0843781283481289, "learning_rate": 9.687318621522672e-06, "loss": 0.1647, "step": 1576 }, { "epoch": 0.6063983091555385, "grad_norm": 1.387543890066641, "learning_rate": 9.685760113515617e-06, "loss": 0.23, "step": 1578 }, { "epoch": 0.6071668748198674, "grad_norm": 1.2932676189231018, "learning_rate": 9.684197857098443e-06, "loss": 0.2006, "step": 1580 }, { "epoch": 0.6079354404841963, "grad_norm": 1.2672160624781967, "learning_rate": 9.682631853520885e-06, "loss": 0.2052, "step": 1582 }, { "epoch": 0.6087040061485253, "grad_norm": 1.2467158221920345, "learning_rate": 9.681062104035684e-06, "loss": 0.2004, "step": 1584 }, { "epoch": 0.6094725718128543, "grad_norm": 1.1772332784398907, "learning_rate": 9.679488609898569e-06, "loss": 0.203, "step": 1586 }, { "epoch": 0.6102411374771832, "grad_norm": 1.4165970531499326, "learning_rate": 9.677911372368273e-06, "loss": 0.201, "step": 1588 }, { "epoch": 0.6110097031415122, "grad_norm": 1.2328433029772616, "learning_rate": 9.676330392706517e-06, "loss": 0.2035, "step": 1590 }, { "epoch": 0.6117782688058411, "grad_norm": 1.4582149523785455, "learning_rate": 9.674745672178017e-06, "loss": 0.1928, "step": 1592 }, { "epoch": 0.61254683447017, "grad_norm": 1.4136249438028552, "learning_rate": 9.673157212050485e-06, "loss": 0.1938, "step": 1594 }, { "epoch": 0.613315400134499, "grad_norm": 1.2456679250911895, "learning_rate": 9.67156501359462e-06, "loss": 0.2089, "step": 1596 }, { "epoch": 0.614083965798828, "grad_norm": 1.1669162731243092, "learning_rate": 9.669969078084116e-06, "loss": 0.201, "step": 1598 }, { "epoch": 0.6148525314631569, "grad_norm": 1.4412718707298333, "learning_rate": 9.668369406795651e-06, "loss": 0.213, "step": 1600 }, { "epoch": 0.6156210971274858, "grad_norm": 1.2838459708474366, "learning_rate": 9.666766001008897e-06, "loss": 0.1842, "step": 1602 }, { "epoch": 0.6163896627918147, "grad_norm": 1.4243731142475187, "learning_rate": 9.665158862006508e-06, "loss": 0.2034, "step": 1604 }, { "epoch": 0.6171582284561438, "grad_norm": 1.2966223052269372, "learning_rate": 9.663547991074129e-06, "loss": 0.2032, "step": 1606 }, { "epoch": 0.6179267941204727, "grad_norm": 1.5477313602864722, "learning_rate": 9.66193338950039e-06, "loss": 0.2126, "step": 1608 }, { "epoch": 0.6186953597848016, "grad_norm": 1.2346085737138752, "learning_rate": 9.6603150585769e-06, "loss": 0.172, "step": 1610 }, { "epoch": 0.6194639254491305, "grad_norm": 1.3638509738796714, "learning_rate": 9.658692999598258e-06, "loss": 0.1729, "step": 1612 }, { "epoch": 0.6202324911134595, "grad_norm": 1.513196620853587, "learning_rate": 9.657067213862046e-06, "loss": 0.2124, "step": 1614 }, { "epoch": 0.6210010567777885, "grad_norm": 1.1539813330256896, "learning_rate": 9.655437702668818e-06, "loss": 0.1709, "step": 1616 }, { "epoch": 0.6217696224421174, "grad_norm": 1.408814870139716, "learning_rate": 9.653804467322117e-06, "loss": 0.2013, "step": 1618 }, { "epoch": 0.6225381881064463, "grad_norm": 1.2188821177821376, "learning_rate": 9.652167509128462e-06, "loss": 0.197, "step": 1620 }, { "epoch": 0.6233067537707753, "grad_norm": 1.1824541177299042, "learning_rate": 9.650526829397352e-06, "loss": 0.197, "step": 1622 }, { "epoch": 0.6240753194351042, "grad_norm": 1.2963843642246893, "learning_rate": 9.648882429441258e-06, "loss": 0.1984, "step": 1624 }, { "epoch": 0.6248438850994332, "grad_norm": 1.5267206157428852, "learning_rate": 9.647234310575632e-06, "loss": 0.2064, "step": 1626 }, { "epoch": 0.6256124507637622, "grad_norm": 1.1155367886285708, "learning_rate": 9.645582474118902e-06, "loss": 0.186, "step": 1628 }, { "epoch": 0.6263810164280911, "grad_norm": 1.3431087115032683, "learning_rate": 9.643926921392464e-06, "loss": 0.2024, "step": 1630 }, { "epoch": 0.62714958209242, "grad_norm": 1.297088077350878, "learning_rate": 9.64226765372069e-06, "loss": 0.2088, "step": 1632 }, { "epoch": 0.6279181477567489, "grad_norm": 1.5806076209306625, "learning_rate": 9.640604672430927e-06, "loss": 0.2114, "step": 1634 }, { "epoch": 0.628686713421078, "grad_norm": 2.4547394088894867, "learning_rate": 9.638937978853486e-06, "loss": 0.1957, "step": 1636 }, { "epoch": 0.6294552790854069, "grad_norm": 1.33620608117729, "learning_rate": 9.637267574321655e-06, "loss": 0.1878, "step": 1638 }, { "epoch": 0.6302238447497358, "grad_norm": 1.3265251224176011, "learning_rate": 9.635593460171688e-06, "loss": 0.2069, "step": 1640 }, { "epoch": 0.6309924104140647, "grad_norm": 1.2416642675451406, "learning_rate": 9.6339156377428e-06, "loss": 0.1971, "step": 1642 }, { "epoch": 0.6317609760783937, "grad_norm": 1.2723773079973835, "learning_rate": 9.632234108377183e-06, "loss": 0.1898, "step": 1644 }, { "epoch": 0.6325295417427227, "grad_norm": 1.3080972838129097, "learning_rate": 9.630548873419987e-06, "loss": 0.1798, "step": 1646 }, { "epoch": 0.6332981074070516, "grad_norm": 1.330490064174828, "learning_rate": 9.62885993421933e-06, "loss": 0.2032, "step": 1648 }, { "epoch": 0.6340666730713805, "grad_norm": 1.1846594460557267, "learning_rate": 9.627167292126291e-06, "loss": 0.1837, "step": 1650 }, { "epoch": 0.6348352387357095, "grad_norm": 1.1834527971396693, "learning_rate": 9.625470948494913e-06, "loss": 0.1856, "step": 1652 }, { "epoch": 0.6356038044000384, "grad_norm": 1.2562362806144844, "learning_rate": 9.6237709046822e-06, "loss": 0.201, "step": 1654 }, { "epoch": 0.6363723700643674, "grad_norm": 1.4497079835938032, "learning_rate": 9.622067162048111e-06, "loss": 0.2171, "step": 1656 }, { "epoch": 0.6371409357286963, "grad_norm": 1.4480590150674777, "learning_rate": 9.620359721955574e-06, "loss": 0.2001, "step": 1658 }, { "epoch": 0.6379095013930253, "grad_norm": 1.162035868543118, "learning_rate": 9.618648585770465e-06, "loss": 0.1759, "step": 1660 }, { "epoch": 0.6386780670573542, "grad_norm": 1.3199546494522103, "learning_rate": 9.616933754861622e-06, "loss": 0.1923, "step": 1662 }, { "epoch": 0.6394466327216831, "grad_norm": 1.4478798846309864, "learning_rate": 9.615215230600839e-06, "loss": 0.2087, "step": 1664 }, { "epoch": 0.6402151983860122, "grad_norm": 1.2694890701541321, "learning_rate": 9.61349301436286e-06, "loss": 0.206, "step": 1666 }, { "epoch": 0.6409837640503411, "grad_norm": 1.315829791331175, "learning_rate": 9.611767107525384e-06, "loss": 0.2107, "step": 1668 }, { "epoch": 0.64175232971467, "grad_norm": 1.1882043928935548, "learning_rate": 9.610037511469069e-06, "loss": 0.1967, "step": 1670 }, { "epoch": 0.6425208953789989, "grad_norm": 1.2157782559370218, "learning_rate": 9.608304227577512e-06, "loss": 0.1814, "step": 1672 }, { "epoch": 0.6432894610433278, "grad_norm": 1.2458376047570976, "learning_rate": 9.606567257237274e-06, "loss": 0.1948, "step": 1674 }, { "epoch": 0.6440580267076569, "grad_norm": 1.3343808579324128, "learning_rate": 9.604826601837852e-06, "loss": 0.2046, "step": 1676 }, { "epoch": 0.6448265923719858, "grad_norm": 1.2306891251956384, "learning_rate": 9.6030822627717e-06, "loss": 0.207, "step": 1678 }, { "epoch": 0.6455951580363147, "grad_norm": 1.4394973570491079, "learning_rate": 9.601334241434214e-06, "loss": 0.207, "step": 1680 }, { "epoch": 0.6463637237006437, "grad_norm": 1.262223145662826, "learning_rate": 9.599582539223737e-06, "loss": 0.1936, "step": 1682 }, { "epoch": 0.6471322893649726, "grad_norm": 1.221108271012134, "learning_rate": 9.597827157541559e-06, "loss": 0.2009, "step": 1684 }, { "epoch": 0.6479008550293016, "grad_norm": 1.1308112960855634, "learning_rate": 9.596068097791909e-06, "loss": 0.1915, "step": 1686 }, { "epoch": 0.6486694206936305, "grad_norm": 1.2033779074795024, "learning_rate": 9.594305361381963e-06, "loss": 0.194, "step": 1688 }, { "epoch": 0.6494379863579595, "grad_norm": 1.1322871543574708, "learning_rate": 9.592538949721832e-06, "loss": 0.1747, "step": 1690 }, { "epoch": 0.6502065520222884, "grad_norm": 1.2106230166256158, "learning_rate": 9.590768864224574e-06, "loss": 0.2095, "step": 1692 }, { "epoch": 0.6509751176866173, "grad_norm": 1.3174240422105115, "learning_rate": 9.58899510630618e-06, "loss": 0.1971, "step": 1694 }, { "epoch": 0.6517436833509463, "grad_norm": 1.2165698242552831, "learning_rate": 9.587217677385584e-06, "loss": 0.1945, "step": 1696 }, { "epoch": 0.6525122490152753, "grad_norm": 1.2419229777406855, "learning_rate": 9.58543657888465e-06, "loss": 0.1991, "step": 1698 }, { "epoch": 0.6532808146796042, "grad_norm": 1.2613908388585762, "learning_rate": 9.583651812228186e-06, "loss": 0.1817, "step": 1700 }, { "epoch": 0.6540493803439331, "grad_norm": 1.3644417522104724, "learning_rate": 9.581863378843928e-06, "loss": 0.1816, "step": 1702 }, { "epoch": 0.654817946008262, "grad_norm": 1.4077333844274722, "learning_rate": 9.580071280162546e-06, "loss": 0.2072, "step": 1704 }, { "epoch": 0.6555865116725911, "grad_norm": 1.1808268434718832, "learning_rate": 9.578275517617646e-06, "loss": 0.1744, "step": 1706 }, { "epoch": 0.65635507733692, "grad_norm": 1.4884312303543485, "learning_rate": 9.576476092645762e-06, "loss": 0.2107, "step": 1708 }, { "epoch": 0.6571236430012489, "grad_norm": 1.4475760015355676, "learning_rate": 9.574673006686354e-06, "loss": 0.1949, "step": 1710 }, { "epoch": 0.6578922086655778, "grad_norm": 1.1674508432883988, "learning_rate": 9.572866261181818e-06, "loss": 0.1841, "step": 1712 }, { "epoch": 0.6586607743299068, "grad_norm": 1.3203650124263, "learning_rate": 9.571055857577473e-06, "loss": 0.2062, "step": 1714 }, { "epoch": 0.6594293399942358, "grad_norm": 1.2033881350118112, "learning_rate": 9.569241797321566e-06, "loss": 0.1865, "step": 1716 }, { "epoch": 0.6601979056585647, "grad_norm": 1.3915392700565332, "learning_rate": 9.567424081865268e-06, "loss": 0.2121, "step": 1718 }, { "epoch": 0.6609664713228937, "grad_norm": 1.278979272247038, "learning_rate": 9.565602712662675e-06, "loss": 0.216, "step": 1720 }, { "epoch": 0.6617350369872226, "grad_norm": 1.3564891026189372, "learning_rate": 9.563777691170806e-06, "loss": 0.2246, "step": 1722 }, { "epoch": 0.6625036026515515, "grad_norm": 1.2129684939143928, "learning_rate": 9.561949018849598e-06, "loss": 0.1743, "step": 1724 }, { "epoch": 0.6632721683158805, "grad_norm": 1.3701442132664645, "learning_rate": 9.560116697161916e-06, "loss": 0.1926, "step": 1726 }, { "epoch": 0.6640407339802095, "grad_norm": 1.149095507322513, "learning_rate": 9.558280727573538e-06, "loss": 0.1788, "step": 1728 }, { "epoch": 0.6648092996445384, "grad_norm": 1.2814058841584446, "learning_rate": 9.556441111553161e-06, "loss": 0.2084, "step": 1730 }, { "epoch": 0.6655778653088673, "grad_norm": 1.329426835055138, "learning_rate": 9.554597850572402e-06, "loss": 0.19, "step": 1732 }, { "epoch": 0.6663464309731962, "grad_norm": 1.1823674936023603, "learning_rate": 9.55275094610579e-06, "loss": 0.188, "step": 1734 }, { "epoch": 0.6671149966375253, "grad_norm": 1.2609115666668476, "learning_rate": 9.550900399630772e-06, "loss": 0.1903, "step": 1736 }, { "epoch": 0.6678835623018542, "grad_norm": 1.3263135694013082, "learning_rate": 9.549046212627706e-06, "loss": 0.192, "step": 1738 }, { "epoch": 0.6686521279661831, "grad_norm": 1.2300779701720115, "learning_rate": 9.547188386579863e-06, "loss": 0.1686, "step": 1740 }, { "epoch": 0.669420693630512, "grad_norm": 1.3031134577606849, "learning_rate": 9.545326922973428e-06, "loss": 0.2023, "step": 1742 }, { "epoch": 0.670189259294841, "grad_norm": 1.4104566424724636, "learning_rate": 9.54346182329749e-06, "loss": 0.2011, "step": 1744 }, { "epoch": 0.67095782495917, "grad_norm": 1.3173969460246497, "learning_rate": 9.541593089044052e-06, "loss": 0.1984, "step": 1746 }, { "epoch": 0.6717263906234989, "grad_norm": 1.427794323621257, "learning_rate": 9.53972072170802e-06, "loss": 0.2131, "step": 1748 }, { "epoch": 0.6724949562878278, "grad_norm": 1.516582391410596, "learning_rate": 9.537844722787211e-06, "loss": 0.1873, "step": 1750 }, { "epoch": 0.6732635219521568, "grad_norm": 1.1723521017593788, "learning_rate": 9.535965093782347e-06, "loss": 0.1805, "step": 1752 }, { "epoch": 0.6740320876164857, "grad_norm": 1.3864689994086057, "learning_rate": 9.534081836197047e-06, "loss": 0.2015, "step": 1754 }, { "epoch": 0.6748006532808147, "grad_norm": 1.3040052855001218, "learning_rate": 9.532194951537838e-06, "loss": 0.1915, "step": 1756 }, { "epoch": 0.6755692189451437, "grad_norm": 1.1974369816844495, "learning_rate": 9.530304441314154e-06, "loss": 0.1808, "step": 1758 }, { "epoch": 0.6763377846094726, "grad_norm": 1.2733320357502782, "learning_rate": 9.528410307038314e-06, "loss": 0.1923, "step": 1760 }, { "epoch": 0.6771063502738015, "grad_norm": 1.3621342410770065, "learning_rate": 9.526512550225554e-06, "loss": 0.2106, "step": 1762 }, { "epoch": 0.6778749159381304, "grad_norm": 1.277921296304779, "learning_rate": 9.524611172393994e-06, "loss": 0.1698, "step": 1764 }, { "epoch": 0.6786434816024595, "grad_norm": 1.2818291040352963, "learning_rate": 9.522706175064657e-06, "loss": 0.1946, "step": 1766 }, { "epoch": 0.6794120472667884, "grad_norm": 1.3502554834254876, "learning_rate": 9.520797559761463e-06, "loss": 0.2049, "step": 1768 }, { "epoch": 0.6801806129311173, "grad_norm": 1.551070247316441, "learning_rate": 9.51888532801122e-06, "loss": 0.2254, "step": 1770 }, { "epoch": 0.6809491785954462, "grad_norm": 1.2048401774419955, "learning_rate": 9.516969481343636e-06, "loss": 0.1979, "step": 1772 }, { "epoch": 0.6817177442597752, "grad_norm": 1.0976758853794555, "learning_rate": 9.515050021291303e-06, "loss": 0.2078, "step": 1774 }, { "epoch": 0.6824863099241042, "grad_norm": 1.1907512799279973, "learning_rate": 9.513126949389713e-06, "loss": 0.1998, "step": 1776 }, { "epoch": 0.6832548755884331, "grad_norm": 1.136485933206815, "learning_rate": 9.511200267177239e-06, "loss": 0.1879, "step": 1778 }, { "epoch": 0.684023441252762, "grad_norm": 1.3239377678409967, "learning_rate": 9.509269976195144e-06, "loss": 0.2069, "step": 1780 }, { "epoch": 0.684792006917091, "grad_norm": 1.2531488217711504, "learning_rate": 9.507336077987584e-06, "loss": 0.1974, "step": 1782 }, { "epoch": 0.6855605725814199, "grad_norm": 1.412082027357498, "learning_rate": 9.505398574101591e-06, "loss": 0.2034, "step": 1784 }, { "epoch": 0.6863291382457489, "grad_norm": 1.3001650413870924, "learning_rate": 9.50345746608709e-06, "loss": 0.1754, "step": 1786 }, { "epoch": 0.6870977039100779, "grad_norm": 1.4513014663050383, "learning_rate": 9.501512755496885e-06, "loss": 0.1886, "step": 1788 }, { "epoch": 0.6878662695744068, "grad_norm": 1.3454684717224001, "learning_rate": 9.499564443886658e-06, "loss": 0.2002, "step": 1790 }, { "epoch": 0.6886348352387357, "grad_norm": 1.2967319250315876, "learning_rate": 9.497612532814981e-06, "loss": 0.2028, "step": 1792 }, { "epoch": 0.6894034009030646, "grad_norm": 1.173576917449772, "learning_rate": 9.495657023843298e-06, "loss": 0.1694, "step": 1794 }, { "epoch": 0.6901719665673935, "grad_norm": 1.2198031617334804, "learning_rate": 9.493697918535935e-06, "loss": 0.2038, "step": 1796 }, { "epoch": 0.6909405322317226, "grad_norm": 1.1624911283765307, "learning_rate": 9.491735218460092e-06, "loss": 0.1928, "step": 1798 }, { "epoch": 0.6917090978960515, "grad_norm": 1.3353346726174404, "learning_rate": 9.489768925185848e-06, "loss": 0.1971, "step": 1800 }, { "epoch": 0.6924776635603804, "grad_norm": 1.1435940388247579, "learning_rate": 9.487799040286152e-06, "loss": 0.1829, "step": 1802 }, { "epoch": 0.6932462292247094, "grad_norm": 1.3422804823350556, "learning_rate": 9.48582556533683e-06, "loss": 0.2, "step": 1804 }, { "epoch": 0.6940147948890383, "grad_norm": 1.2502191921170718, "learning_rate": 9.483848501916578e-06, "loss": 0.2075, "step": 1806 }, { "epoch": 0.6947833605533673, "grad_norm": 1.5305187115885368, "learning_rate": 9.481867851606965e-06, "loss": 0.1924, "step": 1808 }, { "epoch": 0.6955519262176962, "grad_norm": 1.1776389621342327, "learning_rate": 9.479883615992424e-06, "loss": 0.1934, "step": 1810 }, { "epoch": 0.6963204918820252, "grad_norm": 1.285799940265511, "learning_rate": 9.477895796660265e-06, "loss": 0.204, "step": 1812 }, { "epoch": 0.6970890575463541, "grad_norm": 1.2548318065621586, "learning_rate": 9.475904395200655e-06, "loss": 0.1898, "step": 1814 }, { "epoch": 0.697857623210683, "grad_norm": 1.2301910260305635, "learning_rate": 9.473909413206633e-06, "loss": 0.188, "step": 1816 }, { "epoch": 0.698626188875012, "grad_norm": 1.5304283762181896, "learning_rate": 9.471910852274098e-06, "loss": 0.2017, "step": 1818 }, { "epoch": 0.699394754539341, "grad_norm": 1.2617384380579768, "learning_rate": 9.46990871400182e-06, "loss": 0.1895, "step": 1820 }, { "epoch": 0.7001633202036699, "grad_norm": 1.31453287106243, "learning_rate": 9.467902999991416e-06, "loss": 0.1797, "step": 1822 }, { "epoch": 0.7009318858679988, "grad_norm": 1.280774750701666, "learning_rate": 9.465893711847381e-06, "loss": 0.1811, "step": 1824 }, { "epoch": 0.7017004515323277, "grad_norm": 1.246958794934018, "learning_rate": 9.463880851177056e-06, "loss": 0.1944, "step": 1826 }, { "epoch": 0.7024690171966568, "grad_norm": 1.1855252597971575, "learning_rate": 9.461864419590647e-06, "loss": 0.1884, "step": 1828 }, { "epoch": 0.7032375828609857, "grad_norm": 1.2810503035520533, "learning_rate": 9.459844418701212e-06, "loss": 0.1885, "step": 1830 }, { "epoch": 0.7040061485253146, "grad_norm": 1.3761888108731573, "learning_rate": 9.457820850124666e-06, "loss": 0.1945, "step": 1832 }, { "epoch": 0.7047747141896435, "grad_norm": 1.3058863107914824, "learning_rate": 9.455793715479782e-06, "loss": 0.207, "step": 1834 }, { "epoch": 0.7055432798539725, "grad_norm": 1.3297033950069876, "learning_rate": 9.453763016388178e-06, "loss": 0.1886, "step": 1836 }, { "epoch": 0.7063118455183015, "grad_norm": 1.3506726981635433, "learning_rate": 9.45172875447433e-06, "loss": 0.2095, "step": 1838 }, { "epoch": 0.7070804111826304, "grad_norm": 1.2620180323023928, "learning_rate": 9.44969093136556e-06, "loss": 0.1993, "step": 1840 }, { "epoch": 0.7078489768469594, "grad_norm": 1.3058209088092634, "learning_rate": 9.447649548692041e-06, "loss": 0.1881, "step": 1842 }, { "epoch": 0.7086175425112883, "grad_norm": 1.358866956024039, "learning_rate": 9.445604608086792e-06, "loss": 0.1881, "step": 1844 }, { "epoch": 0.7093861081756172, "grad_norm": 1.296621040587658, "learning_rate": 9.443556111185682e-06, "loss": 0.1806, "step": 1846 }, { "epoch": 0.7101546738399462, "grad_norm": 1.247094310071278, "learning_rate": 9.441504059627418e-06, "loss": 0.1784, "step": 1848 }, { "epoch": 0.7109232395042752, "grad_norm": 1.2144501231447247, "learning_rate": 9.439448455053558e-06, "loss": 0.1837, "step": 1850 }, { "epoch": 0.7116918051686041, "grad_norm": 1.1503297906974492, "learning_rate": 9.437389299108496e-06, "loss": 0.1863, "step": 1852 }, { "epoch": 0.712460370832933, "grad_norm": 1.218150241579278, "learning_rate": 9.435326593439474e-06, "loss": 0.1897, "step": 1854 }, { "epoch": 0.7132289364972619, "grad_norm": 1.2431965478373257, "learning_rate": 9.433260339696564e-06, "loss": 0.1893, "step": 1856 }, { "epoch": 0.713997502161591, "grad_norm": 1.419365542215161, "learning_rate": 9.431190539532684e-06, "loss": 0.1878, "step": 1858 }, { "epoch": 0.7147660678259199, "grad_norm": 1.2833006416425194, "learning_rate": 9.42911719460359e-06, "loss": 0.1848, "step": 1860 }, { "epoch": 0.7155346334902488, "grad_norm": 1.3192268387230512, "learning_rate": 9.427040306567864e-06, "loss": 0.2088, "step": 1862 }, { "epoch": 0.7163031991545777, "grad_norm": 1.3602645148748442, "learning_rate": 9.424959877086931e-06, "loss": 0.2047, "step": 1864 }, { "epoch": 0.7170717648189067, "grad_norm": 1.2226660384011676, "learning_rate": 9.422875907825052e-06, "loss": 0.1898, "step": 1866 }, { "epoch": 0.7178403304832357, "grad_norm": 1.3839375212400795, "learning_rate": 9.420788400449307e-06, "loss": 0.1904, "step": 1868 }, { "epoch": 0.7186088961475646, "grad_norm": 1.3095257725985094, "learning_rate": 9.418697356629618e-06, "loss": 0.1976, "step": 1870 }, { "epoch": 0.7193774618118935, "grad_norm": 1.1393698712745064, "learning_rate": 9.416602778038728e-06, "loss": 0.1808, "step": 1872 }, { "epoch": 0.7201460274762225, "grad_norm": 1.2579053743733817, "learning_rate": 9.414504666352216e-06, "loss": 0.2172, "step": 1874 }, { "epoch": 0.7209145931405514, "grad_norm": 1.2564432999841113, "learning_rate": 9.41240302324848e-06, "loss": 0.1843, "step": 1876 }, { "epoch": 0.7216831588048804, "grad_norm": 1.241087289636078, "learning_rate": 9.410297850408745e-06, "loss": 0.1905, "step": 1878 }, { "epoch": 0.7224517244692094, "grad_norm": 1.1959776928861516, "learning_rate": 9.408189149517062e-06, "loss": 0.1604, "step": 1880 }, { "epoch": 0.7232202901335383, "grad_norm": 1.071614907204551, "learning_rate": 9.406076922260304e-06, "loss": 0.1796, "step": 1882 }, { "epoch": 0.7239888557978672, "grad_norm": 1.3612928350244509, "learning_rate": 9.40396117032816e-06, "loss": 0.1859, "step": 1884 }, { "epoch": 0.7247574214621961, "grad_norm": 1.4137503889340293, "learning_rate": 9.401841895413144e-06, "loss": 0.1997, "step": 1886 }, { "epoch": 0.7255259871265252, "grad_norm": 1.2927852036791396, "learning_rate": 9.399719099210587e-06, "loss": 0.1905, "step": 1888 }, { "epoch": 0.7262945527908541, "grad_norm": 1.2242170885327133, "learning_rate": 9.397592783418634e-06, "loss": 0.169, "step": 1890 }, { "epoch": 0.727063118455183, "grad_norm": 1.260390755888933, "learning_rate": 9.39546294973825e-06, "loss": 0.1844, "step": 1892 }, { "epoch": 0.7278316841195119, "grad_norm": 1.2856220294669751, "learning_rate": 9.393329599873212e-06, "loss": 0.1929, "step": 1894 }, { "epoch": 0.7286002497838409, "grad_norm": 1.414370640734507, "learning_rate": 9.391192735530108e-06, "loss": 0.1969, "step": 1896 }, { "epoch": 0.7293688154481699, "grad_norm": 1.3168699843551137, "learning_rate": 9.389052358418341e-06, "loss": 0.1888, "step": 1898 }, { "epoch": 0.7301373811124988, "grad_norm": 1.2812540171318416, "learning_rate": 9.38690847025012e-06, "loss": 0.1962, "step": 1900 }, { "epoch": 0.7309059467768277, "grad_norm": 1.1207144056878078, "learning_rate": 9.38476107274047e-06, "loss": 0.1808, "step": 1902 }, { "epoch": 0.7316745124411567, "grad_norm": 1.431504860562175, "learning_rate": 9.382610167607212e-06, "loss": 0.2037, "step": 1904 }, { "epoch": 0.7324430781054856, "grad_norm": 1.2226155463752022, "learning_rate": 9.38045575657098e-06, "loss": 0.1827, "step": 1906 }, { "epoch": 0.7332116437698146, "grad_norm": 1.266673716522782, "learning_rate": 9.378297841355217e-06, "loss": 0.1831, "step": 1908 }, { "epoch": 0.7339802094341435, "grad_norm": 1.1585480319336692, "learning_rate": 9.376136423686161e-06, "loss": 0.1641, "step": 1910 }, { "epoch": 0.7347487750984725, "grad_norm": 1.5272005277473866, "learning_rate": 9.373971505292852e-06, "loss": 0.2032, "step": 1912 }, { "epoch": 0.7355173407628014, "grad_norm": 1.2168555449955432, "learning_rate": 9.371803087907138e-06, "loss": 0.1732, "step": 1914 }, { "epoch": 0.7362859064271303, "grad_norm": 1.1922122088558418, "learning_rate": 9.369631173263658e-06, "loss": 0.194, "step": 1916 }, { "epoch": 0.7370544720914594, "grad_norm": 1.3501459549607584, "learning_rate": 9.367455763099853e-06, "loss": 0.1895, "step": 1918 }, { "epoch": 0.7378230377557883, "grad_norm": 1.2505729038803686, "learning_rate": 9.36527685915596e-06, "loss": 0.1642, "step": 1920 }, { "epoch": 0.7385916034201172, "grad_norm": 1.2715742195862592, "learning_rate": 9.363094463175009e-06, "loss": 0.212, "step": 1922 }, { "epoch": 0.7393601690844461, "grad_norm": 1.1511796942609833, "learning_rate": 9.360908576902825e-06, "loss": 0.1945, "step": 1924 }, { "epoch": 0.740128734748775, "grad_norm": 1.2321097302137263, "learning_rate": 9.358719202088025e-06, "loss": 0.1937, "step": 1926 }, { "epoch": 0.7408973004131041, "grad_norm": 1.03989993313692, "learning_rate": 9.356526340482015e-06, "loss": 0.1771, "step": 1928 }, { "epoch": 0.741665866077433, "grad_norm": 1.2167849289650687, "learning_rate": 9.354329993838994e-06, "loss": 0.1811, "step": 1930 }, { "epoch": 0.7424344317417619, "grad_norm": 1.2185197260858038, "learning_rate": 9.352130163915944e-06, "loss": 0.1922, "step": 1932 }, { "epoch": 0.7432029974060909, "grad_norm": 1.250112168063517, "learning_rate": 9.349926852472638e-06, "loss": 0.2126, "step": 1934 }, { "epoch": 0.7439715630704198, "grad_norm": 1.1251498636436688, "learning_rate": 9.347720061271631e-06, "loss": 0.1702, "step": 1936 }, { "epoch": 0.7447401287347488, "grad_norm": 1.215312756533942, "learning_rate": 9.345509792078263e-06, "loss": 0.1925, "step": 1938 }, { "epoch": 0.7455086943990777, "grad_norm": 1.3315238711987494, "learning_rate": 9.343296046660655e-06, "loss": 0.208, "step": 1940 }, { "epoch": 0.7462772600634067, "grad_norm": 1.3639445252570699, "learning_rate": 9.341078826789711e-06, "loss": 0.2048, "step": 1942 }, { "epoch": 0.7470458257277356, "grad_norm": 1.2623425052580295, "learning_rate": 9.338858134239114e-06, "loss": 0.1816, "step": 1944 }, { "epoch": 0.7478143913920645, "grad_norm": 1.1076715160264892, "learning_rate": 9.336633970785323e-06, "loss": 0.1849, "step": 1946 }, { "epoch": 0.7485829570563935, "grad_norm": 1.2483899842301078, "learning_rate": 9.334406338207574e-06, "loss": 0.1815, "step": 1948 }, { "epoch": 0.7493515227207225, "grad_norm": 1.2641131836171817, "learning_rate": 9.33217523828788e-06, "loss": 0.1903, "step": 1950 }, { "epoch": 0.7501200883850514, "grad_norm": 1.0854558937788896, "learning_rate": 9.329940672811028e-06, "loss": 0.1479, "step": 1952 }, { "epoch": 0.7508886540493803, "grad_norm": 1.222945280629528, "learning_rate": 9.327702643564574e-06, "loss": 0.1873, "step": 1954 }, { "epoch": 0.7516572197137092, "grad_norm": 1.2476276398307693, "learning_rate": 9.325461152338846e-06, "loss": 0.1898, "step": 1956 }, { "epoch": 0.7524257853780383, "grad_norm": 1.135242573598646, "learning_rate": 9.323216200926945e-06, "loss": 0.1846, "step": 1958 }, { "epoch": 0.7531943510423672, "grad_norm": 1.3781448682049446, "learning_rate": 9.320967791124735e-06, "loss": 0.1892, "step": 1960 }, { "epoch": 0.7539629167066961, "grad_norm": 1.347074939220756, "learning_rate": 9.31871592473085e-06, "loss": 0.1713, "step": 1962 }, { "epoch": 0.754731482371025, "grad_norm": 1.3304948599639612, "learning_rate": 9.31646060354669e-06, "loss": 0.192, "step": 1964 }, { "epoch": 0.755500048035354, "grad_norm": 1.207983865583764, "learning_rate": 9.31420182937641e-06, "loss": 0.1844, "step": 1966 }, { "epoch": 0.756268613699683, "grad_norm": 1.249548278917204, "learning_rate": 9.31193960402694e-06, "loss": 0.1831, "step": 1968 }, { "epoch": 0.7570371793640119, "grad_norm": 1.275154575856212, "learning_rate": 9.309673929307966e-06, "loss": 0.1775, "step": 1970 }, { "epoch": 0.7578057450283409, "grad_norm": 1.2280086973865334, "learning_rate": 9.307404807031926e-06, "loss": 0.1661, "step": 1972 }, { "epoch": 0.7585743106926698, "grad_norm": 1.289517335555379, "learning_rate": 9.305132239014025e-06, "loss": 0.2009, "step": 1974 }, { "epoch": 0.7593428763569987, "grad_norm": 1.285312700225704, "learning_rate": 9.302856227072223e-06, "loss": 0.1921, "step": 1976 }, { "epoch": 0.7601114420213277, "grad_norm": 1.1548159082914387, "learning_rate": 9.300576773027233e-06, "loss": 0.1593, "step": 1978 }, { "epoch": 0.7608800076856567, "grad_norm": 1.2053693553772773, "learning_rate": 9.298293878702518e-06, "loss": 0.2097, "step": 1980 }, { "epoch": 0.7616485733499856, "grad_norm": 1.0738130191156703, "learning_rate": 9.2960075459243e-06, "loss": 0.1689, "step": 1982 }, { "epoch": 0.7624171390143145, "grad_norm": 1.2214629124398166, "learning_rate": 9.29371777652155e-06, "loss": 0.1713, "step": 1984 }, { "epoch": 0.7631857046786434, "grad_norm": 1.4234787856633797, "learning_rate": 9.291424572325984e-06, "loss": 0.2115, "step": 1986 }, { "epoch": 0.7639542703429725, "grad_norm": 1.3146811048313856, "learning_rate": 9.289127935172068e-06, "loss": 0.2051, "step": 1988 }, { "epoch": 0.7647228360073014, "grad_norm": 1.2121159712488319, "learning_rate": 9.28682786689702e-06, "loss": 0.1903, "step": 1990 }, { "epoch": 0.7654914016716303, "grad_norm": 1.339646392569142, "learning_rate": 9.284524369340793e-06, "loss": 0.1855, "step": 1992 }, { "epoch": 0.7662599673359592, "grad_norm": 1.2705944498483268, "learning_rate": 9.282217444346087e-06, "loss": 0.194, "step": 1994 }, { "epoch": 0.7670285330002882, "grad_norm": 1.2764110954625452, "learning_rate": 9.279907093758346e-06, "loss": 0.1883, "step": 1996 }, { "epoch": 0.7677970986646172, "grad_norm": 1.2044463759690873, "learning_rate": 9.277593319425753e-06, "loss": 0.1797, "step": 1998 }, { "epoch": 0.7685656643289461, "grad_norm": 1.4738467596120135, "learning_rate": 9.275276123199231e-06, "loss": 0.1909, "step": 2000 }, { "epoch": 0.7685656643289461, "eval_loss": 0.17198452353477478, "eval_runtime": 391.2391, "eval_samples_per_second": 47.298, "eval_steps_per_second": 5.915, "step": 2000 }, { "epoch": 0.769334229993275, "grad_norm": 1.261820375024299, "learning_rate": 9.272955506932437e-06, "loss": 0.1902, "step": 2002 }, { "epoch": 0.770102795657604, "grad_norm": 1.3402796827572112, "learning_rate": 9.270631472481767e-06, "loss": 0.1755, "step": 2004 }, { "epoch": 0.7708713613219329, "grad_norm": 1.1552148429814055, "learning_rate": 9.26830402170635e-06, "loss": 0.1768, "step": 2006 }, { "epoch": 0.7716399269862619, "grad_norm": 1.16415168973856, "learning_rate": 9.265973156468046e-06, "loss": 0.169, "step": 2008 }, { "epoch": 0.7724084926505909, "grad_norm": 1.1170099890783545, "learning_rate": 9.263638878631453e-06, "loss": 0.1763, "step": 2010 }, { "epoch": 0.7731770583149198, "grad_norm": 1.1746924453146292, "learning_rate": 9.261301190063893e-06, "loss": 0.1939, "step": 2012 }, { "epoch": 0.7739456239792487, "grad_norm": 1.2574041202369828, "learning_rate": 9.258960092635419e-06, "loss": 0.189, "step": 2014 }, { "epoch": 0.7747141896435776, "grad_norm": 1.2144038041029122, "learning_rate": 9.256615588218806e-06, "loss": 0.1998, "step": 2016 }, { "epoch": 0.7754827553079067, "grad_norm": 1.2187870793543332, "learning_rate": 9.254267678689562e-06, "loss": 0.1965, "step": 2018 }, { "epoch": 0.7762513209722356, "grad_norm": 1.1817810411991718, "learning_rate": 9.251916365925917e-06, "loss": 0.1827, "step": 2020 }, { "epoch": 0.7770198866365645, "grad_norm": 1.2010857223271103, "learning_rate": 9.249561651808816e-06, "loss": 0.1834, "step": 2022 }, { "epoch": 0.7777884523008934, "grad_norm": 1.2503680248520013, "learning_rate": 9.247203538221936e-06, "loss": 0.1844, "step": 2024 }, { "epoch": 0.7785570179652224, "grad_norm": 1.2708584285157831, "learning_rate": 9.244842027051667e-06, "loss": 0.1957, "step": 2026 }, { "epoch": 0.7793255836295514, "grad_norm": 1.1378365107912802, "learning_rate": 9.242477120187118e-06, "loss": 0.1918, "step": 2028 }, { "epoch": 0.7800941492938803, "grad_norm": 1.288391936084676, "learning_rate": 9.240108819520114e-06, "loss": 0.1941, "step": 2030 }, { "epoch": 0.7808627149582092, "grad_norm": 1.2752177731882801, "learning_rate": 9.237737126945196e-06, "loss": 0.1922, "step": 2032 }, { "epoch": 0.7816312806225382, "grad_norm": 1.2436701197739084, "learning_rate": 9.235362044359618e-06, "loss": 0.2008, "step": 2034 }, { "epoch": 0.7823998462868671, "grad_norm": 1.0739027760167292, "learning_rate": 9.232983573663345e-06, "loss": 0.1619, "step": 2036 }, { "epoch": 0.7831684119511961, "grad_norm": 1.2350486243273902, "learning_rate": 9.230601716759052e-06, "loss": 0.1939, "step": 2038 }, { "epoch": 0.783936977615525, "grad_norm": 1.0858836541452441, "learning_rate": 9.228216475552128e-06, "loss": 0.201, "step": 2040 }, { "epoch": 0.784705543279854, "grad_norm": 1.2536821489207608, "learning_rate": 9.22582785195066e-06, "loss": 0.1919, "step": 2042 }, { "epoch": 0.7854741089441829, "grad_norm": 1.218024448127749, "learning_rate": 9.223435847865451e-06, "loss": 0.2013, "step": 2044 }, { "epoch": 0.7862426746085118, "grad_norm": 1.2762680984154162, "learning_rate": 9.22104046521e-06, "loss": 0.1972, "step": 2046 }, { "epoch": 0.7870112402728409, "grad_norm": 1.1356375732217165, "learning_rate": 9.218641705900513e-06, "loss": 0.1792, "step": 2048 }, { "epoch": 0.7877798059371698, "grad_norm": 1.2027684188360803, "learning_rate": 9.216239571855893e-06, "loss": 0.1778, "step": 2050 }, { "epoch": 0.7885483716014987, "grad_norm": 1.2117561835361554, "learning_rate": 9.21383406499775e-06, "loss": 0.1906, "step": 2052 }, { "epoch": 0.7893169372658276, "grad_norm": 1.387301105626476, "learning_rate": 9.211425187250388e-06, "loss": 0.2011, "step": 2054 }, { "epoch": 0.7900855029301566, "grad_norm": 1.4113053022744988, "learning_rate": 9.209012940540806e-06, "loss": 0.1685, "step": 2056 }, { "epoch": 0.7908540685944856, "grad_norm": 1.1991499243107075, "learning_rate": 9.206597326798697e-06, "loss": 0.1917, "step": 2058 }, { "epoch": 0.7916226342588145, "grad_norm": 1.189093078007415, "learning_rate": 9.204178347956455e-06, "loss": 0.1712, "step": 2060 }, { "epoch": 0.7923911999231434, "grad_norm": 1.1684314590319023, "learning_rate": 9.201756005949155e-06, "loss": 0.1789, "step": 2062 }, { "epoch": 0.7931597655874724, "grad_norm": 1.1322856484415094, "learning_rate": 9.199330302714573e-06, "loss": 0.1899, "step": 2064 }, { "epoch": 0.7939283312518013, "grad_norm": 1.1347215928536178, "learning_rate": 9.196901240193166e-06, "loss": 0.1548, "step": 2066 }, { "epoch": 0.7946968969161303, "grad_norm": 1.1808860665442598, "learning_rate": 9.194468820328084e-06, "loss": 0.1808, "step": 2068 }, { "epoch": 0.7954654625804592, "grad_norm": 1.2001369715725392, "learning_rate": 9.192033045065158e-06, "loss": 0.1895, "step": 2070 }, { "epoch": 0.7962340282447882, "grad_norm": 1.275675318794341, "learning_rate": 9.189593916352907e-06, "loss": 0.2004, "step": 2072 }, { "epoch": 0.7970025939091171, "grad_norm": 1.2500341138954048, "learning_rate": 9.187151436142531e-06, "loss": 0.1911, "step": 2074 }, { "epoch": 0.797771159573446, "grad_norm": 1.2281871822993962, "learning_rate": 9.184705606387911e-06, "loss": 0.1721, "step": 2076 }, { "epoch": 0.798539725237775, "grad_norm": 1.3655817571729876, "learning_rate": 9.182256429045605e-06, "loss": 0.2039, "step": 2078 }, { "epoch": 0.799308290902104, "grad_norm": 1.3762987325207334, "learning_rate": 9.179803906074852e-06, "loss": 0.2015, "step": 2080 }, { "epoch": 0.8000768565664329, "grad_norm": 1.4265582902703169, "learning_rate": 9.177348039437568e-06, "loss": 0.1748, "step": 2082 }, { "epoch": 0.8008454222307618, "grad_norm": 1.2047103403956998, "learning_rate": 9.174888831098343e-06, "loss": 0.1893, "step": 2084 }, { "epoch": 0.8016139878950908, "grad_norm": 1.332466569466646, "learning_rate": 9.17242628302444e-06, "loss": 0.2117, "step": 2086 }, { "epoch": 0.8023825535594198, "grad_norm": 1.19075340615867, "learning_rate": 9.169960397185791e-06, "loss": 0.1856, "step": 2088 }, { "epoch": 0.8031511192237487, "grad_norm": 1.2603141214395528, "learning_rate": 9.167491175555004e-06, "loss": 0.1861, "step": 2090 }, { "epoch": 0.8039196848880776, "grad_norm": 1.256064196988792, "learning_rate": 9.165018620107348e-06, "loss": 0.1881, "step": 2092 }, { "epoch": 0.8046882505524066, "grad_norm": 1.1213424611941971, "learning_rate": 9.162542732820765e-06, "loss": 0.1903, "step": 2094 }, { "epoch": 0.8054568162167355, "grad_norm": 1.2287475513708368, "learning_rate": 9.160063515675862e-06, "loss": 0.1908, "step": 2096 }, { "epoch": 0.8062253818810645, "grad_norm": 1.3371141508488003, "learning_rate": 9.157580970655904e-06, "loss": 0.1938, "step": 2098 }, { "epoch": 0.8069939475453934, "grad_norm": 1.2265883488427984, "learning_rate": 9.155095099746825e-06, "loss": 0.2096, "step": 2100 }, { "epoch": 0.8077625132097224, "grad_norm": 1.1430503066718267, "learning_rate": 9.152605904937215e-06, "loss": 0.1972, "step": 2102 }, { "epoch": 0.8085310788740513, "grad_norm": 1.2567144554181124, "learning_rate": 9.150113388218327e-06, "loss": 0.1899, "step": 2104 }, { "epoch": 0.8092996445383802, "grad_norm": 1.2187807111087547, "learning_rate": 9.147617551584066e-06, "loss": 0.1898, "step": 2106 }, { "epoch": 0.8100682102027092, "grad_norm": 1.2144690309674413, "learning_rate": 9.145118397031e-06, "loss": 0.1865, "step": 2108 }, { "epoch": 0.8108367758670382, "grad_norm": 1.2661717778297978, "learning_rate": 9.142615926558342e-06, "loss": 0.1804, "step": 2110 }, { "epoch": 0.8116053415313671, "grad_norm": 1.2189758034025457, "learning_rate": 9.140110142167966e-06, "loss": 0.1826, "step": 2112 }, { "epoch": 0.812373907195696, "grad_norm": 1.2161076002973408, "learning_rate": 9.137601045864393e-06, "loss": 0.2049, "step": 2114 }, { "epoch": 0.813142472860025, "grad_norm": 1.2734186363193822, "learning_rate": 9.135088639654795e-06, "loss": 0.1954, "step": 2116 }, { "epoch": 0.8139110385243539, "grad_norm": 1.2431306626877, "learning_rate": 9.132572925548988e-06, "loss": 0.1827, "step": 2118 }, { "epoch": 0.8146796041886829, "grad_norm": 1.3445399436809387, "learning_rate": 9.13005390555944e-06, "loss": 0.1982, "step": 2120 }, { "epoch": 0.8154481698530118, "grad_norm": 1.20163916196121, "learning_rate": 9.127531581701257e-06, "loss": 0.2019, "step": 2122 }, { "epoch": 0.8162167355173408, "grad_norm": 1.1799933036068841, "learning_rate": 9.125005955992196e-06, "loss": 0.1807, "step": 2124 }, { "epoch": 0.8169853011816697, "grad_norm": 1.1266483834843526, "learning_rate": 9.122477030452647e-06, "loss": 0.1926, "step": 2126 }, { "epoch": 0.8177538668459986, "grad_norm": 1.1903176338778183, "learning_rate": 9.119944807105648e-06, "loss": 0.1946, "step": 2128 }, { "epoch": 0.8185224325103276, "grad_norm": 1.1114705191839893, "learning_rate": 9.117409287976867e-06, "loss": 0.2028, "step": 2130 }, { "epoch": 0.8192909981746566, "grad_norm": 1.2220175435203755, "learning_rate": 9.11487047509461e-06, "loss": 0.1942, "step": 2132 }, { "epoch": 0.8200595638389855, "grad_norm": 1.1217016868872014, "learning_rate": 9.112328370489827e-06, "loss": 0.1759, "step": 2134 }, { "epoch": 0.8208281295033144, "grad_norm": 1.2782197699685234, "learning_rate": 9.10978297619609e-06, "loss": 0.1966, "step": 2136 }, { "epoch": 0.8215966951676433, "grad_norm": 1.0875158269786116, "learning_rate": 9.107234294249604e-06, "loss": 0.1834, "step": 2138 }, { "epoch": 0.8223652608319724, "grad_norm": 1.1652679269256225, "learning_rate": 9.104682326689214e-06, "loss": 0.1824, "step": 2140 }, { "epoch": 0.8231338264963013, "grad_norm": 1.0784368463618048, "learning_rate": 9.10212707555638e-06, "loss": 0.1674, "step": 2142 }, { "epoch": 0.8239023921606302, "grad_norm": 1.2196507044127218, "learning_rate": 9.0995685428952e-06, "loss": 0.1902, "step": 2144 }, { "epoch": 0.8246709578249591, "grad_norm": 1.251272013141529, "learning_rate": 9.097006730752387e-06, "loss": 0.1798, "step": 2146 }, { "epoch": 0.8254395234892881, "grad_norm": 1.1189370891537207, "learning_rate": 9.094441641177284e-06, "loss": 0.1861, "step": 2148 }, { "epoch": 0.8262080891536171, "grad_norm": 1.288003060202175, "learning_rate": 9.091873276221858e-06, "loss": 0.1812, "step": 2150 }, { "epoch": 0.826976654817946, "grad_norm": 1.1455438895758303, "learning_rate": 9.089301637940688e-06, "loss": 0.1765, "step": 2152 }, { "epoch": 0.827745220482275, "grad_norm": 1.2426325989824412, "learning_rate": 9.086726728390979e-06, "loss": 0.1957, "step": 2154 }, { "epoch": 0.8285137861466039, "grad_norm": 1.2525106839369142, "learning_rate": 9.084148549632547e-06, "loss": 0.1902, "step": 2156 }, { "epoch": 0.8292823518109328, "grad_norm": 1.2916747609984376, "learning_rate": 9.081567103727829e-06, "loss": 0.1895, "step": 2158 }, { "epoch": 0.8300509174752618, "grad_norm": 1.2431329407115455, "learning_rate": 9.07898239274187e-06, "loss": 0.1826, "step": 2160 }, { "epoch": 0.8308194831395908, "grad_norm": 1.1093295940142172, "learning_rate": 9.076394418742332e-06, "loss": 0.2017, "step": 2162 }, { "epoch": 0.8315880488039197, "grad_norm": 1.1275488416000359, "learning_rate": 9.073803183799483e-06, "loss": 0.1886, "step": 2164 }, { "epoch": 0.8323566144682486, "grad_norm": 1.1573398979704723, "learning_rate": 9.071208689986202e-06, "loss": 0.1765, "step": 2166 }, { "epoch": 0.8331251801325775, "grad_norm": 1.2724909267914235, "learning_rate": 9.068610939377974e-06, "loss": 0.1896, "step": 2168 }, { "epoch": 0.8338937457969066, "grad_norm": 1.303081085611057, "learning_rate": 9.066009934052891e-06, "loss": 0.1904, "step": 2170 }, { "epoch": 0.8346623114612355, "grad_norm": 1.3623763655207586, "learning_rate": 9.063405676091645e-06, "loss": 0.1811, "step": 2172 }, { "epoch": 0.8354308771255644, "grad_norm": 1.1725501505491063, "learning_rate": 9.060798167577536e-06, "loss": 0.188, "step": 2174 }, { "epoch": 0.8361994427898933, "grad_norm": 1.1629672384431131, "learning_rate": 9.058187410596458e-06, "loss": 0.1989, "step": 2176 }, { "epoch": 0.8369680084542223, "grad_norm": 1.4038542867881196, "learning_rate": 9.055573407236908e-06, "loss": 0.1751, "step": 2178 }, { "epoch": 0.8377365741185513, "grad_norm": 1.185662095409636, "learning_rate": 9.052956159589977e-06, "loss": 0.178, "step": 2180 }, { "epoch": 0.8385051397828802, "grad_norm": 1.3093541167858365, "learning_rate": 9.05033566974935e-06, "loss": 0.1916, "step": 2182 }, { "epoch": 0.8392737054472091, "grad_norm": 1.5245414212299586, "learning_rate": 9.047711939811315e-06, "loss": 0.1881, "step": 2184 }, { "epoch": 0.8400422711115381, "grad_norm": 1.234223472485929, "learning_rate": 9.045084971874738e-06, "loss": 0.1875, "step": 2186 }, { "epoch": 0.840810836775867, "grad_norm": 1.149828559962546, "learning_rate": 9.042454768041087e-06, "loss": 0.1967, "step": 2188 }, { "epoch": 0.841579402440196, "grad_norm": 1.373330303893253, "learning_rate": 9.03982133041441e-06, "loss": 0.1903, "step": 2190 }, { "epoch": 0.842347968104525, "grad_norm": 1.2134137837380758, "learning_rate": 9.03718466110135e-06, "loss": 0.1876, "step": 2192 }, { "epoch": 0.8431165337688539, "grad_norm": 1.2793655376305941, "learning_rate": 9.034544762211129e-06, "loss": 0.1774, "step": 2194 }, { "epoch": 0.8438850994331828, "grad_norm": 1.247980967531676, "learning_rate": 9.031901635855555e-06, "loss": 0.2052, "step": 2196 }, { "epoch": 0.8446536650975117, "grad_norm": 1.2088927604608937, "learning_rate": 9.029255284149017e-06, "loss": 0.1633, "step": 2198 }, { "epoch": 0.8454222307618408, "grad_norm": 1.2141612522413139, "learning_rate": 9.026605709208484e-06, "loss": 0.2129, "step": 2200 }, { "epoch": 0.8461907964261697, "grad_norm": 1.165926276868925, "learning_rate": 9.023952913153505e-06, "loss": 0.1905, "step": 2202 }, { "epoch": 0.8469593620904986, "grad_norm": 1.2909621412109802, "learning_rate": 9.021296898106205e-06, "loss": 0.1748, "step": 2204 }, { "epoch": 0.8477279277548275, "grad_norm": 1.1858605505810673, "learning_rate": 9.018637666191284e-06, "loss": 0.1708, "step": 2206 }, { "epoch": 0.8484964934191565, "grad_norm": 1.312083400253972, "learning_rate": 9.015975219536013e-06, "loss": 0.1984, "step": 2208 }, { "epoch": 0.8492650590834855, "grad_norm": 1.3286137282509032, "learning_rate": 9.01330956027024e-06, "loss": 0.1743, "step": 2210 }, { "epoch": 0.8500336247478144, "grad_norm": 1.245905471240792, "learning_rate": 9.010640690526376e-06, "loss": 0.1829, "step": 2212 }, { "epoch": 0.8508021904121433, "grad_norm": 1.2367649103585008, "learning_rate": 9.007968612439407e-06, "loss": 0.1922, "step": 2214 }, { "epoch": 0.8515707560764723, "grad_norm": 1.2707008358292793, "learning_rate": 9.005293328146882e-06, "loss": 0.1831, "step": 2216 }, { "epoch": 0.8523393217408012, "grad_norm": 1.368672506938956, "learning_rate": 9.002614839788916e-06, "loss": 0.2017, "step": 2218 }, { "epoch": 0.8531078874051302, "grad_norm": 1.2394895812748155, "learning_rate": 8.999933149508185e-06, "loss": 0.1676, "step": 2220 }, { "epoch": 0.8538764530694591, "grad_norm": 1.1861427859560083, "learning_rate": 8.997248259449928e-06, "loss": 0.1982, "step": 2222 }, { "epoch": 0.8546450187337881, "grad_norm": 1.1695957437744628, "learning_rate": 8.994560171761945e-06, "loss": 0.1768, "step": 2224 }, { "epoch": 0.855413584398117, "grad_norm": 1.1139574846194964, "learning_rate": 8.991868888594591e-06, "loss": 0.1735, "step": 2226 }, { "epoch": 0.8561821500624459, "grad_norm": 1.2759198445611144, "learning_rate": 8.989174412100781e-06, "loss": 0.1951, "step": 2228 }, { "epoch": 0.856950715726775, "grad_norm": 1.4771255806208772, "learning_rate": 8.986476744435979e-06, "loss": 0.2009, "step": 2230 }, { "epoch": 0.8577192813911039, "grad_norm": 1.196764765932786, "learning_rate": 8.983775887758209e-06, "loss": 0.1859, "step": 2232 }, { "epoch": 0.8584878470554328, "grad_norm": 1.1484691167065375, "learning_rate": 8.98107184422804e-06, "loss": 0.177, "step": 2234 }, { "epoch": 0.8592564127197617, "grad_norm": 1.2173024834864858, "learning_rate": 8.978364616008592e-06, "loss": 0.1797, "step": 2236 }, { "epoch": 0.8600249783840906, "grad_norm": 1.180066282119778, "learning_rate": 8.975654205265535e-06, "loss": 0.1765, "step": 2238 }, { "epoch": 0.8607935440484197, "grad_norm": 1.3470818206970216, "learning_rate": 8.972940614167083e-06, "loss": 0.1896, "step": 2240 }, { "epoch": 0.8615621097127486, "grad_norm": 1.2498791623672432, "learning_rate": 8.970223844883991e-06, "loss": 0.1821, "step": 2242 }, { "epoch": 0.8623306753770775, "grad_norm": 1.2378559227786265, "learning_rate": 8.967503899589565e-06, "loss": 0.2121, "step": 2244 }, { "epoch": 0.8630992410414065, "grad_norm": 1.3046462776812422, "learning_rate": 8.96478078045964e-06, "loss": 0.1966, "step": 2246 }, { "epoch": 0.8638678067057354, "grad_norm": 1.1990895238784551, "learning_rate": 8.962054489672602e-06, "loss": 0.16, "step": 2248 }, { "epoch": 0.8646363723700644, "grad_norm": 1.194431344204316, "learning_rate": 8.959325029409368e-06, "loss": 0.1803, "step": 2250 }, { "epoch": 0.8654049380343933, "grad_norm": 1.2016103999064283, "learning_rate": 8.956592401853385e-06, "loss": 0.1867, "step": 2252 }, { "epoch": 0.8661735036987223, "grad_norm": 1.253494590937789, "learning_rate": 8.953856609190645e-06, "loss": 0.1862, "step": 2254 }, { "epoch": 0.8669420693630512, "grad_norm": 1.2202309918881187, "learning_rate": 8.951117653609666e-06, "loss": 0.1923, "step": 2256 }, { "epoch": 0.8677106350273801, "grad_norm": 1.1488471824949893, "learning_rate": 8.948375537301497e-06, "loss": 0.1829, "step": 2258 }, { "epoch": 0.8684792006917091, "grad_norm": 1.158624220625633, "learning_rate": 8.945630262459713e-06, "loss": 0.1785, "step": 2260 }, { "epoch": 0.8692477663560381, "grad_norm": 1.251921294849819, "learning_rate": 8.942881831280418e-06, "loss": 0.1693, "step": 2262 }, { "epoch": 0.870016332020367, "grad_norm": 1.1775304143130512, "learning_rate": 8.940130245962242e-06, "loss": 0.1586, "step": 2264 }, { "epoch": 0.8707848976846959, "grad_norm": 1.1293732984826816, "learning_rate": 8.937375508706338e-06, "loss": 0.1839, "step": 2266 }, { "epoch": 0.8715534633490248, "grad_norm": 1.2712572832275575, "learning_rate": 8.934617621716377e-06, "loss": 0.196, "step": 2268 }, { "epoch": 0.8723220290133539, "grad_norm": 1.1328907592795852, "learning_rate": 8.931856587198555e-06, "loss": 0.1683, "step": 2270 }, { "epoch": 0.8730905946776828, "grad_norm": 1.1864156684347011, "learning_rate": 8.929092407361579e-06, "loss": 0.193, "step": 2272 }, { "epoch": 0.8738591603420117, "grad_norm": 1.007703123761032, "learning_rate": 8.926325084416679e-06, "loss": 0.1762, "step": 2274 }, { "epoch": 0.8746277260063406, "grad_norm": 1.2123340801499867, "learning_rate": 8.923554620577597e-06, "loss": 0.1942, "step": 2276 }, { "epoch": 0.8753962916706696, "grad_norm": 1.1133761792375818, "learning_rate": 8.920781018060585e-06, "loss": 0.1828, "step": 2278 }, { "epoch": 0.8761648573349986, "grad_norm": 1.1487109602546244, "learning_rate": 8.91800427908441e-06, "loss": 0.1719, "step": 2280 }, { "epoch": 0.8769334229993275, "grad_norm": 1.1963562792758868, "learning_rate": 8.915224405870343e-06, "loss": 0.1957, "step": 2282 }, { "epoch": 0.8777019886636565, "grad_norm": 1.107529585559194, "learning_rate": 8.912441400642166e-06, "loss": 0.1844, "step": 2284 }, { "epoch": 0.8784705543279854, "grad_norm": 1.1495335130753808, "learning_rate": 8.909655265626167e-06, "loss": 0.1662, "step": 2286 }, { "epoch": 0.8792391199923143, "grad_norm": 1.1374511936948153, "learning_rate": 8.906866003051136e-06, "loss": 0.1797, "step": 2288 }, { "epoch": 0.8800076856566433, "grad_norm": 1.2073449922918444, "learning_rate": 8.904073615148363e-06, "loss": 0.2063, "step": 2290 }, { "epoch": 0.8807762513209723, "grad_norm": 1.1559555074628627, "learning_rate": 8.901278104151644e-06, "loss": 0.1927, "step": 2292 }, { "epoch": 0.8815448169853012, "grad_norm": 1.162001448050792, "learning_rate": 8.898479472297268e-06, "loss": 0.1674, "step": 2294 }, { "epoch": 0.8823133826496301, "grad_norm": 1.1400199790884669, "learning_rate": 8.895677721824023e-06, "loss": 0.1851, "step": 2296 }, { "epoch": 0.883081948313959, "grad_norm": 1.1266784872377675, "learning_rate": 8.892872854973188e-06, "loss": 0.185, "step": 2298 }, { "epoch": 0.8838505139782881, "grad_norm": 1.1877371838914534, "learning_rate": 8.89006487398854e-06, "loss": 0.1903, "step": 2300 }, { "epoch": 0.884619079642617, "grad_norm": 1.3130760756792879, "learning_rate": 8.887253781116345e-06, "loss": 0.1947, "step": 2302 }, { "epoch": 0.8853876453069459, "grad_norm": 1.1213078605180262, "learning_rate": 8.88443957860536e-06, "loss": 0.1672, "step": 2304 }, { "epoch": 0.8861562109712748, "grad_norm": 1.1129557356365196, "learning_rate": 8.881622268706825e-06, "loss": 0.1899, "step": 2306 }, { "epoch": 0.8869247766356038, "grad_norm": 1.252983559937864, "learning_rate": 8.878801853674473e-06, "loss": 0.1865, "step": 2308 }, { "epoch": 0.8876933422999328, "grad_norm": 1.292901935666689, "learning_rate": 8.875978335764512e-06, "loss": 0.1816, "step": 2310 }, { "epoch": 0.8884619079642617, "grad_norm": 1.230210773989238, "learning_rate": 8.873151717235642e-06, "loss": 0.1649, "step": 2312 }, { "epoch": 0.8892304736285906, "grad_norm": 1.1218363925358446, "learning_rate": 8.870322000349034e-06, "loss": 0.1598, "step": 2314 }, { "epoch": 0.8899990392929196, "grad_norm": 1.2720799808630978, "learning_rate": 8.867489187368346e-06, "loss": 0.1939, "step": 2316 }, { "epoch": 0.8907676049572485, "grad_norm": 1.1621874150546059, "learning_rate": 8.864653280559706e-06, "loss": 0.1779, "step": 2318 }, { "epoch": 0.8915361706215775, "grad_norm": 1.1389835742499064, "learning_rate": 8.86181428219172e-06, "loss": 0.1683, "step": 2320 }, { "epoch": 0.8923047362859065, "grad_norm": 1.2017565394487877, "learning_rate": 8.858972194535466e-06, "loss": 0.2091, "step": 2322 }, { "epoch": 0.8930733019502354, "grad_norm": 1.133362198034487, "learning_rate": 8.856127019864495e-06, "loss": 0.1841, "step": 2324 }, { "epoch": 0.8938418676145643, "grad_norm": 1.3509744366959464, "learning_rate": 8.853278760454828e-06, "loss": 0.1845, "step": 2326 }, { "epoch": 0.8946104332788932, "grad_norm": 1.157091302773675, "learning_rate": 8.850427418584951e-06, "loss": 0.1932, "step": 2328 }, { "epoch": 0.8953789989432223, "grad_norm": 1.2147725723299043, "learning_rate": 8.847572996535817e-06, "loss": 0.198, "step": 2330 }, { "epoch": 0.8961475646075512, "grad_norm": 1.1573730266585227, "learning_rate": 8.844715496590842e-06, "loss": 0.1815, "step": 2332 }, { "epoch": 0.8969161302718801, "grad_norm": 1.2541925794316908, "learning_rate": 8.841854921035907e-06, "loss": 0.1922, "step": 2334 }, { "epoch": 0.897684695936209, "grad_norm": 1.2028078979390076, "learning_rate": 8.83899127215935e-06, "loss": 0.1792, "step": 2336 }, { "epoch": 0.898453261600538, "grad_norm": 1.170904333137329, "learning_rate": 8.83612455225197e-06, "loss": 0.1941, "step": 2338 }, { "epoch": 0.899221827264867, "grad_norm": 1.240885701094925, "learning_rate": 8.83325476360702e-06, "loss": 0.1919, "step": 2340 }, { "epoch": 0.8999903929291959, "grad_norm": 1.1504187922474298, "learning_rate": 8.830381908520213e-06, "loss": 0.1865, "step": 2342 }, { "epoch": 0.9007589585935248, "grad_norm": 1.1954446112506525, "learning_rate": 8.82750598928971e-06, "loss": 0.1829, "step": 2344 }, { "epoch": 0.9015275242578538, "grad_norm": 1.3599470639492717, "learning_rate": 8.824627008216124e-06, "loss": 0.1789, "step": 2346 }, { "epoch": 0.9022960899221827, "grad_norm": 1.288839595143552, "learning_rate": 8.821744967602522e-06, "loss": 0.1836, "step": 2348 }, { "epoch": 0.9030646555865117, "grad_norm": 1.155109406616065, "learning_rate": 8.818859869754409e-06, "loss": 0.1957, "step": 2350 }, { "epoch": 0.9038332212508406, "grad_norm": 1.2757721699172633, "learning_rate": 8.815971716979745e-06, "loss": 0.2025, "step": 2352 }, { "epoch": 0.9046017869151696, "grad_norm": 1.357029186816927, "learning_rate": 8.813080511588929e-06, "loss": 0.1914, "step": 2354 }, { "epoch": 0.9053703525794985, "grad_norm": 1.1825479705946487, "learning_rate": 8.810186255894804e-06, "loss": 0.1972, "step": 2356 }, { "epoch": 0.9061389182438274, "grad_norm": 1.5724395350595985, "learning_rate": 8.807288952212652e-06, "loss": 0.1722, "step": 2358 }, { "epoch": 0.9069074839081565, "grad_norm": 1.2650440936114473, "learning_rate": 8.804388602860191e-06, "loss": 0.1793, "step": 2360 }, { "epoch": 0.9076760495724854, "grad_norm": 1.1563956168754435, "learning_rate": 8.801485210157583e-06, "loss": 0.1893, "step": 2362 }, { "epoch": 0.9084446152368143, "grad_norm": 1.2074951705626022, "learning_rate": 8.798578776427415e-06, "loss": 0.1822, "step": 2364 }, { "epoch": 0.9092131809011432, "grad_norm": 1.1116256514386857, "learning_rate": 8.795669303994715e-06, "loss": 0.1708, "step": 2366 }, { "epoch": 0.9099817465654721, "grad_norm": 1.0578178368107343, "learning_rate": 8.792756795186936e-06, "loss": 0.1615, "step": 2368 }, { "epoch": 0.9107503122298012, "grad_norm": 1.338812110311402, "learning_rate": 8.78984125233396e-06, "loss": 0.1811, "step": 2370 }, { "epoch": 0.9115188778941301, "grad_norm": 1.423542754676097, "learning_rate": 8.786922677768101e-06, "loss": 0.1817, "step": 2372 }, { "epoch": 0.912287443558459, "grad_norm": 1.1128338643795046, "learning_rate": 8.784001073824095e-06, "loss": 0.1837, "step": 2374 }, { "epoch": 0.913056009222788, "grad_norm": 1.0659960722720763, "learning_rate": 8.7810764428391e-06, "loss": 0.1672, "step": 2376 }, { "epoch": 0.9138245748871169, "grad_norm": 1.1533394029961748, "learning_rate": 8.7781487871527e-06, "loss": 0.1849, "step": 2378 }, { "epoch": 0.9145931405514459, "grad_norm": 1.237049334806909, "learning_rate": 8.775218109106893e-06, "loss": 0.1794, "step": 2380 }, { "epoch": 0.9153617062157748, "grad_norm": 1.160989884976689, "learning_rate": 8.772284411046099e-06, "loss": 0.1749, "step": 2382 }, { "epoch": 0.9161302718801038, "grad_norm": 1.2610495074320225, "learning_rate": 8.769347695317154e-06, "loss": 0.1709, "step": 2384 }, { "epoch": 0.9168988375444327, "grad_norm": 1.212109787710417, "learning_rate": 8.766407964269304e-06, "loss": 0.1829, "step": 2386 }, { "epoch": 0.9176674032087616, "grad_norm": 1.1677407014366483, "learning_rate": 8.76346522025421e-06, "loss": 0.1812, "step": 2388 }, { "epoch": 0.9184359688730906, "grad_norm": 1.6014951745826158, "learning_rate": 8.760519465625943e-06, "loss": 0.1702, "step": 2390 }, { "epoch": 0.9192045345374196, "grad_norm": 1.1011210728359908, "learning_rate": 8.757570702740983e-06, "loss": 0.1505, "step": 2392 }, { "epoch": 0.9199731002017485, "grad_norm": 1.1628627528475448, "learning_rate": 8.754618933958216e-06, "loss": 0.1795, "step": 2394 }, { "epoch": 0.9207416658660774, "grad_norm": 1.2032685991627565, "learning_rate": 8.75166416163893e-06, "loss": 0.1591, "step": 2396 }, { "epoch": 0.9215102315304063, "grad_norm": 1.196133388930768, "learning_rate": 8.748706388146823e-06, "loss": 0.1815, "step": 2398 }, { "epoch": 0.9222787971947354, "grad_norm": 0.9956867529641326, "learning_rate": 8.745745615847984e-06, "loss": 0.1556, "step": 2400 }, { "epoch": 0.9230473628590643, "grad_norm": 1.268150156370348, "learning_rate": 8.742781847110907e-06, "loss": 0.1882, "step": 2402 }, { "epoch": 0.9238159285233932, "grad_norm": 1.0912540027240771, "learning_rate": 8.739815084306482e-06, "loss": 0.1638, "step": 2404 }, { "epoch": 0.9245844941877222, "grad_norm": 1.315238448191983, "learning_rate": 8.736845329807994e-06, "loss": 0.1929, "step": 2406 }, { "epoch": 0.9253530598520511, "grad_norm": 1.3345823718166399, "learning_rate": 8.733872585991121e-06, "loss": 0.1951, "step": 2408 }, { "epoch": 0.9261216255163801, "grad_norm": 1.3288357985033, "learning_rate": 8.730896855233933e-06, "loss": 0.1765, "step": 2410 }, { "epoch": 0.926890191180709, "grad_norm": 1.250069968579694, "learning_rate": 8.727918139916887e-06, "loss": 0.1802, "step": 2412 }, { "epoch": 0.927658756845038, "grad_norm": 1.080820496452964, "learning_rate": 8.724936442422834e-06, "loss": 0.1821, "step": 2414 }, { "epoch": 0.9284273225093669, "grad_norm": 1.1128807961908334, "learning_rate": 8.721951765137e-06, "loss": 0.1644, "step": 2416 }, { "epoch": 0.9291958881736958, "grad_norm": 1.1432548474573283, "learning_rate": 8.718964110447003e-06, "loss": 0.1606, "step": 2418 }, { "epoch": 0.9299644538380248, "grad_norm": 1.2885722974958567, "learning_rate": 8.715973480742843e-06, "loss": 0.1955, "step": 2420 }, { "epoch": 0.9307330195023538, "grad_norm": 1.2252978188570895, "learning_rate": 8.712979878416896e-06, "loss": 0.1904, "step": 2422 }, { "epoch": 0.9315015851666827, "grad_norm": 1.2746054266370066, "learning_rate": 8.709983305863916e-06, "loss": 0.1876, "step": 2424 }, { "epoch": 0.9322701508310116, "grad_norm": 1.2789519137955567, "learning_rate": 8.706983765481035e-06, "loss": 0.1652, "step": 2426 }, { "epoch": 0.9330387164953405, "grad_norm": 1.249435016520877, "learning_rate": 8.70398125966776e-06, "loss": 0.1845, "step": 2428 }, { "epoch": 0.9338072821596696, "grad_norm": 1.1855074417609208, "learning_rate": 8.700975790825967e-06, "loss": 0.1736, "step": 2430 }, { "epoch": 0.9345758478239985, "grad_norm": 1.2480948486996701, "learning_rate": 8.697967361359905e-06, "loss": 0.1823, "step": 2432 }, { "epoch": 0.9353444134883274, "grad_norm": 1.0575330863257393, "learning_rate": 8.69495597367619e-06, "loss": 0.168, "step": 2434 }, { "epoch": 0.9361129791526563, "grad_norm": 1.083703915739838, "learning_rate": 8.691941630183805e-06, "loss": 0.1512, "step": 2436 }, { "epoch": 0.9368815448169853, "grad_norm": 1.092997160615736, "learning_rate": 8.688924333294098e-06, "loss": 0.1666, "step": 2438 }, { "epoch": 0.9376501104813142, "grad_norm": 1.2437218826073353, "learning_rate": 8.68590408542078e-06, "loss": 0.1692, "step": 2440 }, { "epoch": 0.9384186761456432, "grad_norm": 1.1889162188401965, "learning_rate": 8.68288088897992e-06, "loss": 0.1765, "step": 2442 }, { "epoch": 0.9391872418099722, "grad_norm": 1.346616442015079, "learning_rate": 8.679854746389947e-06, "loss": 0.1792, "step": 2444 }, { "epoch": 0.9399558074743011, "grad_norm": 1.4030795525368809, "learning_rate": 8.676825660071645e-06, "loss": 0.1704, "step": 2446 }, { "epoch": 0.94072437313863, "grad_norm": 1.0920729282036086, "learning_rate": 8.673793632448162e-06, "loss": 0.1758, "step": 2448 }, { "epoch": 0.9414929388029589, "grad_norm": 1.2119874332061875, "learning_rate": 8.670758665944988e-06, "loss": 0.1903, "step": 2450 }, { "epoch": 0.942261504467288, "grad_norm": 1.1041178888843832, "learning_rate": 8.667720762989967e-06, "loss": 0.1685, "step": 2452 }, { "epoch": 0.9430300701316169, "grad_norm": 1.0870291023647063, "learning_rate": 8.664679926013294e-06, "loss": 0.1675, "step": 2454 }, { "epoch": 0.9437986357959458, "grad_norm": 1.2036893726740805, "learning_rate": 8.661636157447511e-06, "loss": 0.1669, "step": 2456 }, { "epoch": 0.9445672014602747, "grad_norm": 1.2042718490596327, "learning_rate": 8.658589459727503e-06, "loss": 0.1928, "step": 2458 }, { "epoch": 0.9453357671246037, "grad_norm": 1.1182056109457212, "learning_rate": 8.655539835290501e-06, "loss": 0.1662, "step": 2460 }, { "epoch": 0.9461043327889327, "grad_norm": 1.2308979046278397, "learning_rate": 8.652487286576073e-06, "loss": 0.1765, "step": 2462 }, { "epoch": 0.9468728984532616, "grad_norm": 1.1528102203209936, "learning_rate": 8.649431816026131e-06, "loss": 0.1746, "step": 2464 }, { "epoch": 0.9476414641175905, "grad_norm": 1.2014934491120075, "learning_rate": 8.646373426084922e-06, "loss": 0.1721, "step": 2466 }, { "epoch": 0.9484100297819195, "grad_norm": 1.2848122188474367, "learning_rate": 8.643312119199029e-06, "loss": 0.1849, "step": 2468 }, { "epoch": 0.9491785954462484, "grad_norm": 1.1277721420383973, "learning_rate": 8.64024789781737e-06, "loss": 0.1856, "step": 2470 }, { "epoch": 0.9499471611105774, "grad_norm": 1.0312386948538232, "learning_rate": 8.637180764391186e-06, "loss": 0.1441, "step": 2472 }, { "epoch": 0.9507157267749063, "grad_norm": 1.287138102548142, "learning_rate": 8.634110721374061e-06, "loss": 0.1935, "step": 2474 }, { "epoch": 0.9514842924392353, "grad_norm": 1.187206595496294, "learning_rate": 8.631037771221898e-06, "loss": 0.1939, "step": 2476 }, { "epoch": 0.9522528581035642, "grad_norm": 1.198979154010338, "learning_rate": 8.627961916392926e-06, "loss": 0.1804, "step": 2478 }, { "epoch": 0.9530214237678931, "grad_norm": 1.067763914503609, "learning_rate": 8.624883159347698e-06, "loss": 0.1694, "step": 2480 }, { "epoch": 0.9537899894322222, "grad_norm": 1.1831654122648927, "learning_rate": 8.621801502549095e-06, "loss": 0.1639, "step": 2482 }, { "epoch": 0.9545585550965511, "grad_norm": 1.127577070638523, "learning_rate": 8.618716948462305e-06, "loss": 0.1767, "step": 2484 }, { "epoch": 0.95532712076088, "grad_norm": 1.1670397292161896, "learning_rate": 8.615629499554845e-06, "loss": 0.186, "step": 2486 }, { "epoch": 0.9560956864252089, "grad_norm": 1.2033170747165403, "learning_rate": 8.612539158296544e-06, "loss": 0.1856, "step": 2488 }, { "epoch": 0.9568642520895378, "grad_norm": 1.1059957852633127, "learning_rate": 8.609445927159542e-06, "loss": 0.1717, "step": 2490 }, { "epoch": 0.9576328177538669, "grad_norm": 1.1911658462655756, "learning_rate": 8.606349808618297e-06, "loss": 0.1754, "step": 2492 }, { "epoch": 0.9584013834181958, "grad_norm": 1.1816862242884691, "learning_rate": 8.60325080514957e-06, "loss": 0.1927, "step": 2494 }, { "epoch": 0.9591699490825247, "grad_norm": 1.0951676904686158, "learning_rate": 8.600148919232432e-06, "loss": 0.1667, "step": 2496 }, { "epoch": 0.9599385147468537, "grad_norm": 1.1045107610551284, "learning_rate": 8.597044153348264e-06, "loss": 0.1673, "step": 2498 }, { "epoch": 0.9607070804111826, "grad_norm": 1.1602975370007549, "learning_rate": 8.593936509980744e-06, "loss": 0.1637, "step": 2500 }, { "epoch": 0.9607070804111826, "eval_loss": 0.16385692358016968, "eval_runtime": 391.4754, "eval_samples_per_second": 47.27, "eval_steps_per_second": 5.911, "step": 2500 }, { "epoch": 0.9614756460755116, "grad_norm": 1.3366198294958178, "learning_rate": 8.590825991615858e-06, "loss": 0.1905, "step": 2502 }, { "epoch": 0.9622442117398405, "grad_norm": 1.2595281594107641, "learning_rate": 8.58771260074189e-06, "loss": 0.1814, "step": 2504 }, { "epoch": 0.9630127774041695, "grad_norm": 1.172489339425924, "learning_rate": 8.584596339849419e-06, "loss": 0.1789, "step": 2506 }, { "epoch": 0.9637813430684984, "grad_norm": 1.1509483729528238, "learning_rate": 8.581477211431322e-06, "loss": 0.1787, "step": 2508 }, { "epoch": 0.9645499087328273, "grad_norm": 1.1928501324066068, "learning_rate": 8.57835521798277e-06, "loss": 0.1744, "step": 2510 }, { "epoch": 0.9653184743971563, "grad_norm": 1.1241561464760947, "learning_rate": 8.575230362001227e-06, "loss": 0.1765, "step": 2512 }, { "epoch": 0.9660870400614853, "grad_norm": 1.3925286885850316, "learning_rate": 8.572102645986444e-06, "loss": 0.1868, "step": 2514 }, { "epoch": 0.9668556057258142, "grad_norm": 1.184595084002648, "learning_rate": 8.568972072440464e-06, "loss": 0.192, "step": 2516 }, { "epoch": 0.9676241713901431, "grad_norm": 1.1520414128962253, "learning_rate": 8.565838643867615e-06, "loss": 0.1816, "step": 2518 }, { "epoch": 0.968392737054472, "grad_norm": 1.1259344170796861, "learning_rate": 8.562702362774502e-06, "loss": 0.1758, "step": 2520 }, { "epoch": 0.9691613027188011, "grad_norm": 1.2164411595374587, "learning_rate": 8.55956323167002e-06, "loss": 0.1855, "step": 2522 }, { "epoch": 0.96992986838313, "grad_norm": 1.1680167745652408, "learning_rate": 8.556421253065344e-06, "loss": 0.1895, "step": 2524 }, { "epoch": 0.9706984340474589, "grad_norm": 1.677238161839133, "learning_rate": 8.553276429473918e-06, "loss": 0.1813, "step": 2526 }, { "epoch": 0.9714669997117878, "grad_norm": 1.3237362919087992, "learning_rate": 8.550128763411474e-06, "loss": 0.1921, "step": 2528 }, { "epoch": 0.9722355653761168, "grad_norm": 1.1978764456154631, "learning_rate": 8.54697825739601e-06, "loss": 0.17, "step": 2530 }, { "epoch": 0.9730041310404458, "grad_norm": 1.267731323600252, "learning_rate": 8.543824913947798e-06, "loss": 0.1634, "step": 2532 }, { "epoch": 0.9737726967047747, "grad_norm": 1.3498452039637066, "learning_rate": 8.540668735589377e-06, "loss": 0.1903, "step": 2534 }, { "epoch": 0.9745412623691037, "grad_norm": 1.2865426546489835, "learning_rate": 8.537509724845558e-06, "loss": 0.1938, "step": 2536 }, { "epoch": 0.9753098280334326, "grad_norm": 1.09496239287967, "learning_rate": 8.534347884243417e-06, "loss": 0.1655, "step": 2538 }, { "epoch": 0.9760783936977615, "grad_norm": 1.34590558032455, "learning_rate": 8.531183216312293e-06, "loss": 0.1955, "step": 2540 }, { "epoch": 0.9768469593620905, "grad_norm": 1.2854882464777597, "learning_rate": 8.528015723583785e-06, "loss": 0.1893, "step": 2542 }, { "epoch": 0.9776155250264195, "grad_norm": 1.0174709700000482, "learning_rate": 8.524845408591755e-06, "loss": 0.1673, "step": 2544 }, { "epoch": 0.9783840906907484, "grad_norm": 0.9965601014572996, "learning_rate": 8.521672273872318e-06, "loss": 0.1806, "step": 2546 }, { "epoch": 0.9791526563550773, "grad_norm": 1.0488697052474119, "learning_rate": 8.518496321963851e-06, "loss": 0.1742, "step": 2548 }, { "epoch": 0.9799212220194062, "grad_norm": 1.1937225121311539, "learning_rate": 8.515317555406982e-06, "loss": 0.1766, "step": 2550 }, { "epoch": 0.9806897876837353, "grad_norm": 1.2410364965697114, "learning_rate": 8.512135976744586e-06, "loss": 0.1734, "step": 2552 }, { "epoch": 0.9814583533480642, "grad_norm": 1.2454463503090711, "learning_rate": 8.508951588521797e-06, "loss": 0.1855, "step": 2554 }, { "epoch": 0.9822269190123931, "grad_norm": 1.3470877582999545, "learning_rate": 8.505764393285985e-06, "loss": 0.2021, "step": 2556 }, { "epoch": 0.982995484676722, "grad_norm": 1.114660339475549, "learning_rate": 8.502574393586774e-06, "loss": 0.164, "step": 2558 }, { "epoch": 0.983764050341051, "grad_norm": 1.2507594828637805, "learning_rate": 8.499381591976031e-06, "loss": 0.1789, "step": 2560 }, { "epoch": 0.98453261600538, "grad_norm": 1.1320678044884365, "learning_rate": 8.49618599100786e-06, "loss": 0.1728, "step": 2562 }, { "epoch": 0.9853011816697089, "grad_norm": 1.241879119478018, "learning_rate": 8.492987593238606e-06, "loss": 0.1914, "step": 2564 }, { "epoch": 0.9860697473340378, "grad_norm": 1.33169238267083, "learning_rate": 8.489786401226855e-06, "loss": 0.1755, "step": 2566 }, { "epoch": 0.9868383129983668, "grad_norm": 1.156785322578118, "learning_rate": 8.486582417533424e-06, "loss": 0.1836, "step": 2568 }, { "epoch": 0.9876068786626957, "grad_norm": 1.2386576759385486, "learning_rate": 8.483375644721365e-06, "loss": 0.1884, "step": 2570 }, { "epoch": 0.9883754443270247, "grad_norm": 1.1639405945710612, "learning_rate": 8.48016608535596e-06, "loss": 0.1695, "step": 2572 }, { "epoch": 0.9891440099913537, "grad_norm": 1.2062872797633537, "learning_rate": 8.476953742004721e-06, "loss": 0.1823, "step": 2574 }, { "epoch": 0.9899125756556826, "grad_norm": 1.0122697783696166, "learning_rate": 8.473738617237387e-06, "loss": 0.1814, "step": 2576 }, { "epoch": 0.9906811413200115, "grad_norm": 1.2490962831992136, "learning_rate": 8.470520713625921e-06, "loss": 0.1907, "step": 2578 }, { "epoch": 0.9914497069843404, "grad_norm": 1.1347706749440867, "learning_rate": 8.467300033744513e-06, "loss": 0.1671, "step": 2580 }, { "epoch": 0.9922182726486695, "grad_norm": 1.2581208961297028, "learning_rate": 8.464076580169566e-06, "loss": 0.2039, "step": 2582 }, { "epoch": 0.9929868383129984, "grad_norm": 1.2286515675479035, "learning_rate": 8.460850355479713e-06, "loss": 0.1792, "step": 2584 }, { "epoch": 0.9937554039773273, "grad_norm": 1.1868333485155889, "learning_rate": 8.457621362255792e-06, "loss": 0.1751, "step": 2586 }, { "epoch": 0.9945239696416562, "grad_norm": 1.1604667515835358, "learning_rate": 8.454389603080863e-06, "loss": 0.1694, "step": 2588 }, { "epoch": 0.9952925353059852, "grad_norm": 1.16242025872779, "learning_rate": 8.451155080540199e-06, "loss": 0.172, "step": 2590 }, { "epoch": 0.9960611009703142, "grad_norm": 1.1174413169968995, "learning_rate": 8.447917797221276e-06, "loss": 0.1652, "step": 2592 }, { "epoch": 0.9968296666346431, "grad_norm": 1.1219665884503818, "learning_rate": 8.44467775571379e-06, "loss": 0.1557, "step": 2594 }, { "epoch": 0.997598232298972, "grad_norm": 1.3580042440043738, "learning_rate": 8.441434958609633e-06, "loss": 0.1977, "step": 2596 }, { "epoch": 0.998366797963301, "grad_norm": 1.4298640052539462, "learning_rate": 8.438189408502908e-06, "loss": 0.1841, "step": 2598 }, { "epoch": 0.9991353636276299, "grad_norm": 1.2247327455840082, "learning_rate": 8.434941107989916e-06, "loss": 0.1961, "step": 2600 }, { "epoch": 0.9999039292919589, "grad_norm": 1.1018939578251297, "learning_rate": 8.431690059669162e-06, "loss": 0.1713, "step": 2602 }, { "epoch": 1.0006724949562877, "grad_norm": 0.941215678626101, "learning_rate": 8.428436266141347e-06, "loss": 0.2243, "step": 2604 }, { "epoch": 1.0014410606206168, "grad_norm": 0.845203573482625, "learning_rate": 8.425179730009368e-06, "loss": 0.11, "step": 2606 }, { "epoch": 1.0022096262849458, "grad_norm": 0.8286000808342657, "learning_rate": 8.421920453878317e-06, "loss": 0.101, "step": 2608 }, { "epoch": 1.0029781919492746, "grad_norm": 0.9824638704014862, "learning_rate": 8.418658440355476e-06, "loss": 0.1079, "step": 2610 }, { "epoch": 1.0037467576136037, "grad_norm": 1.1958854345513863, "learning_rate": 8.415393692050317e-06, "loss": 0.1122, "step": 2612 }, { "epoch": 1.0045153232779325, "grad_norm": 1.2791164931958283, "learning_rate": 8.412126211574503e-06, "loss": 0.1112, "step": 2614 }, { "epoch": 1.0052838889422615, "grad_norm": 1.3723071854049773, "learning_rate": 8.408856001541881e-06, "loss": 0.1217, "step": 2616 }, { "epoch": 1.0060524546065905, "grad_norm": 1.481031446885998, "learning_rate": 8.405583064568478e-06, "loss": 0.1023, "step": 2618 }, { "epoch": 1.0068210202709194, "grad_norm": 1.0656507804650803, "learning_rate": 8.402307403272506e-06, "loss": 0.1089, "step": 2620 }, { "epoch": 1.0075895859352484, "grad_norm": 1.1042627830684681, "learning_rate": 8.399029020274358e-06, "loss": 0.1102, "step": 2622 }, { "epoch": 1.0083581515995772, "grad_norm": 1.163672132746451, "learning_rate": 8.3957479181966e-06, "loss": 0.1025, "step": 2624 }, { "epoch": 1.0091267172639062, "grad_norm": 1.165400514721663, "learning_rate": 8.392464099663975e-06, "loss": 0.1187, "step": 2626 }, { "epoch": 1.0098952829282353, "grad_norm": 1.1456605346460322, "learning_rate": 8.3891775673034e-06, "loss": 0.1144, "step": 2628 }, { "epoch": 1.010663848592564, "grad_norm": 1.0774762812653393, "learning_rate": 8.38588832374396e-06, "loss": 0.1116, "step": 2630 }, { "epoch": 1.0114324142568931, "grad_norm": 1.0878598481335413, "learning_rate": 8.382596371616912e-06, "loss": 0.1121, "step": 2632 }, { "epoch": 1.012200979921222, "grad_norm": 1.054902152887174, "learning_rate": 8.37930171355568e-06, "loss": 0.1086, "step": 2634 }, { "epoch": 1.012969545585551, "grad_norm": 1.1220046180028624, "learning_rate": 8.37600435219585e-06, "loss": 0.1074, "step": 2636 }, { "epoch": 1.01373811124988, "grad_norm": 1.1735017187629988, "learning_rate": 8.372704290175173e-06, "loss": 0.1189, "step": 2638 }, { "epoch": 1.0145066769142088, "grad_norm": 1.0218404687189255, "learning_rate": 8.369401530133557e-06, "loss": 0.1029, "step": 2640 }, { "epoch": 1.0152752425785379, "grad_norm": 1.2184242248606438, "learning_rate": 8.36609607471307e-06, "loss": 0.1071, "step": 2642 }, { "epoch": 1.0160438082428667, "grad_norm": 1.2368075082360919, "learning_rate": 8.36278792655794e-06, "loss": 0.0978, "step": 2644 }, { "epoch": 1.0168123739071957, "grad_norm": 1.1264447855037276, "learning_rate": 8.359477088314544e-06, "loss": 0.1055, "step": 2646 }, { "epoch": 1.0175809395715247, "grad_norm": 1.1216987744710032, "learning_rate": 8.356163562631413e-06, "loss": 0.1132, "step": 2648 }, { "epoch": 1.0183495052358535, "grad_norm": 1.1761302549482537, "learning_rate": 8.352847352159228e-06, "loss": 0.1111, "step": 2650 }, { "epoch": 1.0191180709001826, "grad_norm": 1.1048021602538607, "learning_rate": 8.349528459550816e-06, "loss": 0.1098, "step": 2652 }, { "epoch": 1.0198866365645114, "grad_norm": 1.1602024266050668, "learning_rate": 8.34620688746115e-06, "loss": 0.116, "step": 2654 }, { "epoch": 1.0206552022288404, "grad_norm": 1.114520497388927, "learning_rate": 8.342882638547351e-06, "loss": 0.1037, "step": 2656 }, { "epoch": 1.0214237678931695, "grad_norm": 1.1358721988831975, "learning_rate": 8.339555715468674e-06, "loss": 0.1083, "step": 2658 }, { "epoch": 1.0221923335574983, "grad_norm": 1.0452438230817858, "learning_rate": 8.336226120886519e-06, "loss": 0.0903, "step": 2660 }, { "epoch": 1.0229608992218273, "grad_norm": 1.132166121848424, "learning_rate": 8.33289385746442e-06, "loss": 0.1065, "step": 2662 }, { "epoch": 1.0237294648861561, "grad_norm": 1.112259383630313, "learning_rate": 8.329558927868047e-06, "loss": 0.1083, "step": 2664 }, { "epoch": 1.0244980305504852, "grad_norm": 1.1586981178659672, "learning_rate": 8.326221334765202e-06, "loss": 0.1079, "step": 2666 }, { "epoch": 1.0252665962148142, "grad_norm": 0.959611745214435, "learning_rate": 8.322881080825823e-06, "loss": 0.0944, "step": 2668 }, { "epoch": 1.026035161879143, "grad_norm": 1.1740853377834575, "learning_rate": 8.319538168721966e-06, "loss": 0.1105, "step": 2670 }, { "epoch": 1.026803727543472, "grad_norm": 1.0604865626745612, "learning_rate": 8.316192601127824e-06, "loss": 0.1019, "step": 2672 }, { "epoch": 1.0275722932078009, "grad_norm": 1.0622334484443028, "learning_rate": 8.312844380719707e-06, "loss": 0.1058, "step": 2674 }, { "epoch": 1.02834085887213, "grad_norm": 1.0719589037560313, "learning_rate": 8.30949351017605e-06, "loss": 0.1085, "step": 2676 }, { "epoch": 1.029109424536459, "grad_norm": 1.0870796697222216, "learning_rate": 8.30613999217741e-06, "loss": 0.1038, "step": 2678 }, { "epoch": 1.0298779902007877, "grad_norm": 1.1271223221146631, "learning_rate": 8.302783829406456e-06, "loss": 0.1022, "step": 2680 }, { "epoch": 1.0306465558651168, "grad_norm": 1.0699671223446336, "learning_rate": 8.299425024547978e-06, "loss": 0.1108, "step": 2682 }, { "epoch": 1.0314151215294456, "grad_norm": 1.2942296552277686, "learning_rate": 8.296063580288881e-06, "loss": 0.1286, "step": 2684 }, { "epoch": 1.0321836871937746, "grad_norm": 1.2703938865021622, "learning_rate": 8.292699499318175e-06, "loss": 0.1189, "step": 2686 }, { "epoch": 1.0329522528581037, "grad_norm": 1.0758087606705409, "learning_rate": 8.289332784326984e-06, "loss": 0.1153, "step": 2688 }, { "epoch": 1.0337208185224325, "grad_norm": 1.0781684727648344, "learning_rate": 8.285963438008537e-06, "loss": 0.1055, "step": 2690 }, { "epoch": 1.0344893841867615, "grad_norm": 1.0209338243220296, "learning_rate": 8.28259146305817e-06, "loss": 0.1014, "step": 2692 }, { "epoch": 1.0352579498510903, "grad_norm": 1.2053410964094091, "learning_rate": 8.27921686217332e-06, "loss": 0.1096, "step": 2694 }, { "epoch": 1.0360265155154194, "grad_norm": 1.196076659142551, "learning_rate": 8.275839638053524e-06, "loss": 0.1126, "step": 2696 }, { "epoch": 1.0367950811797484, "grad_norm": 1.0643194746272406, "learning_rate": 8.272459793400419e-06, "loss": 0.11, "step": 2698 }, { "epoch": 1.0375636468440772, "grad_norm": 1.1431870004056324, "learning_rate": 8.269077330917738e-06, "loss": 0.1085, "step": 2700 }, { "epoch": 1.0383322125084062, "grad_norm": 1.1437660805614343, "learning_rate": 8.265692253311308e-06, "loss": 0.1028, "step": 2702 }, { "epoch": 1.039100778172735, "grad_norm": 1.1469067958706156, "learning_rate": 8.26230456328905e-06, "loss": 0.1161, "step": 2704 }, { "epoch": 1.039869343837064, "grad_norm": 1.138497120602533, "learning_rate": 8.258914263560971e-06, "loss": 0.103, "step": 2706 }, { "epoch": 1.0406379095013931, "grad_norm": 1.1225398710088408, "learning_rate": 8.255521356839167e-06, "loss": 0.1269, "step": 2708 }, { "epoch": 1.041406475165722, "grad_norm": 1.3677929374232802, "learning_rate": 8.252125845837821e-06, "loss": 0.1138, "step": 2710 }, { "epoch": 1.042175040830051, "grad_norm": 1.2217729438934801, "learning_rate": 8.2487277332732e-06, "loss": 0.0945, "step": 2712 }, { "epoch": 1.0429436064943798, "grad_norm": 0.9933595413777809, "learning_rate": 8.245327021863648e-06, "loss": 0.1118, "step": 2714 }, { "epoch": 1.0437121721587088, "grad_norm": 1.214660335435361, "learning_rate": 8.241923714329589e-06, "loss": 0.1383, "step": 2716 }, { "epoch": 1.0444807378230379, "grad_norm": 1.1773948493708033, "learning_rate": 8.238517813393529e-06, "loss": 0.1151, "step": 2718 }, { "epoch": 1.0452493034873667, "grad_norm": 1.0620787605922537, "learning_rate": 8.235109321780044e-06, "loss": 0.0935, "step": 2720 }, { "epoch": 1.0460178691516957, "grad_norm": 1.0658062006262228, "learning_rate": 8.231698242215782e-06, "loss": 0.1136, "step": 2722 }, { "epoch": 1.0467864348160245, "grad_norm": 1.079823878615438, "learning_rate": 8.228284577429462e-06, "loss": 0.1117, "step": 2724 }, { "epoch": 1.0475550004803535, "grad_norm": 1.203565579363539, "learning_rate": 8.224868330151875e-06, "loss": 0.1165, "step": 2726 }, { "epoch": 1.0483235661446826, "grad_norm": 1.1118213447466752, "learning_rate": 8.22144950311587e-06, "loss": 0.1113, "step": 2728 }, { "epoch": 1.0490921318090114, "grad_norm": 1.2038053541076286, "learning_rate": 8.218028099056366e-06, "loss": 0.1278, "step": 2730 }, { "epoch": 1.0498606974733404, "grad_norm": 1.2880184192758635, "learning_rate": 8.214604120710344e-06, "loss": 0.1244, "step": 2732 }, { "epoch": 1.0506292631376692, "grad_norm": 1.1181231311928075, "learning_rate": 8.211177570816839e-06, "loss": 0.1047, "step": 2734 }, { "epoch": 1.0513978288019983, "grad_norm": 1.128333681952738, "learning_rate": 8.207748452116948e-06, "loss": 0.1111, "step": 2736 }, { "epoch": 1.0521663944663273, "grad_norm": 1.0521796863717088, "learning_rate": 8.204316767353819e-06, "loss": 0.0979, "step": 2738 }, { "epoch": 1.0529349601306561, "grad_norm": 1.1256843608181222, "learning_rate": 8.200882519272658e-06, "loss": 0.1165, "step": 2740 }, { "epoch": 1.0537035257949852, "grad_norm": 1.3255242355134025, "learning_rate": 8.197445710620716e-06, "loss": 0.1223, "step": 2742 }, { "epoch": 1.054472091459314, "grad_norm": 1.1225130030847648, "learning_rate": 8.194006344147294e-06, "loss": 0.0997, "step": 2744 }, { "epoch": 1.055240657123643, "grad_norm": 0.8885970174235238, "learning_rate": 8.19056442260374e-06, "loss": 0.0996, "step": 2746 }, { "epoch": 1.056009222787972, "grad_norm": 1.1007021750405572, "learning_rate": 8.18711994874345e-06, "loss": 0.1153, "step": 2748 }, { "epoch": 1.0567777884523009, "grad_norm": 0.953263990664266, "learning_rate": 8.183672925321852e-06, "loss": 0.1005, "step": 2750 }, { "epoch": 1.05754635411663, "grad_norm": 1.0011407963312944, "learning_rate": 8.180223355096423e-06, "loss": 0.1083, "step": 2752 }, { "epoch": 1.0583149197809587, "grad_norm": 1.2249538096021981, "learning_rate": 8.176771240826671e-06, "loss": 0.1129, "step": 2754 }, { "epoch": 1.0590834854452877, "grad_norm": 1.0509425982714722, "learning_rate": 8.173316585274144e-06, "loss": 0.1011, "step": 2756 }, { "epoch": 1.0598520511096168, "grad_norm": 1.067194784195954, "learning_rate": 8.169859391202418e-06, "loss": 0.1076, "step": 2758 }, { "epoch": 1.0606206167739456, "grad_norm": 1.1895474199819274, "learning_rate": 8.166399661377104e-06, "loss": 0.1071, "step": 2760 }, { "epoch": 1.0613891824382746, "grad_norm": 1.109174578005238, "learning_rate": 8.162937398565838e-06, "loss": 0.1128, "step": 2762 }, { "epoch": 1.0621577481026034, "grad_norm": 1.192405954591908, "learning_rate": 8.159472605538286e-06, "loss": 0.1019, "step": 2764 }, { "epoch": 1.0629263137669325, "grad_norm": 1.1330843565636592, "learning_rate": 8.156005285066135e-06, "loss": 0.1152, "step": 2766 }, { "epoch": 1.0636948794312615, "grad_norm": 1.1238987546353898, "learning_rate": 8.152535439923093e-06, "loss": 0.1138, "step": 2768 }, { "epoch": 1.0644634450955903, "grad_norm": 0.9256061466707445, "learning_rate": 8.149063072884893e-06, "loss": 0.1069, "step": 2770 }, { "epoch": 1.0652320107599194, "grad_norm": 1.1734127728656698, "learning_rate": 8.14558818672928e-06, "loss": 0.1162, "step": 2772 }, { "epoch": 1.0660005764242482, "grad_norm": 1.0211066153983668, "learning_rate": 8.142110784236015e-06, "loss": 0.0961, "step": 2774 }, { "epoch": 1.0667691420885772, "grad_norm": 1.0454242828223508, "learning_rate": 8.138630868186876e-06, "loss": 0.1002, "step": 2776 }, { "epoch": 1.0675377077529062, "grad_norm": 1.1055548735706058, "learning_rate": 8.135148441365647e-06, "loss": 0.1194, "step": 2778 }, { "epoch": 1.068306273417235, "grad_norm": 1.1266002758215572, "learning_rate": 8.131663506558123e-06, "loss": 0.1088, "step": 2780 }, { "epoch": 1.069074839081564, "grad_norm": 0.954807488549767, "learning_rate": 8.128176066552104e-06, "loss": 0.0849, "step": 2782 }, { "epoch": 1.069843404745893, "grad_norm": 1.1071800808369954, "learning_rate": 8.124686124137395e-06, "loss": 0.1108, "step": 2784 }, { "epoch": 1.070611970410222, "grad_norm": 1.1213466867697637, "learning_rate": 8.121193682105802e-06, "loss": 0.0991, "step": 2786 }, { "epoch": 1.071380536074551, "grad_norm": 1.3642690059749647, "learning_rate": 8.117698743251131e-06, "loss": 0.1089, "step": 2788 }, { "epoch": 1.0721491017388798, "grad_norm": 1.0944476680201871, "learning_rate": 8.114201310369186e-06, "loss": 0.1058, "step": 2790 }, { "epoch": 1.0729176674032088, "grad_norm": 1.0201115518080996, "learning_rate": 8.110701386257767e-06, "loss": 0.1138, "step": 2792 }, { "epoch": 1.0736862330675376, "grad_norm": 1.1692114405249554, "learning_rate": 8.10719897371666e-06, "loss": 0.1117, "step": 2794 }, { "epoch": 1.0744547987318667, "grad_norm": 1.2541093980987905, "learning_rate": 8.103694075547655e-06, "loss": 0.1184, "step": 2796 }, { "epoch": 1.0752233643961957, "grad_norm": 1.1413333227144755, "learning_rate": 8.100186694554517e-06, "loss": 0.1195, "step": 2798 }, { "epoch": 1.0759919300605245, "grad_norm": 1.0990972370502985, "learning_rate": 8.096676833543002e-06, "loss": 0.1137, "step": 2800 }, { "epoch": 1.0767604957248536, "grad_norm": 1.1109843215389517, "learning_rate": 8.093164495320855e-06, "loss": 0.1034, "step": 2802 }, { "epoch": 1.0775290613891824, "grad_norm": 1.129627250791017, "learning_rate": 8.089649682697795e-06, "loss": 0.1046, "step": 2804 }, { "epoch": 1.0782976270535114, "grad_norm": 1.0793256785623686, "learning_rate": 8.086132398485525e-06, "loss": 0.1039, "step": 2806 }, { "epoch": 1.0790661927178404, "grad_norm": 1.091298107023866, "learning_rate": 8.082612645497723e-06, "loss": 0.1178, "step": 2808 }, { "epoch": 1.0798347583821692, "grad_norm": 1.0849322549969822, "learning_rate": 8.07909042655004e-06, "loss": 0.1056, "step": 2810 }, { "epoch": 1.0806033240464983, "grad_norm": 1.0490742244310978, "learning_rate": 8.075565744460107e-06, "loss": 0.1036, "step": 2812 }, { "epoch": 1.081371889710827, "grad_norm": 1.0548346429897657, "learning_rate": 8.072038602047518e-06, "loss": 0.1048, "step": 2814 }, { "epoch": 1.0821404553751561, "grad_norm": 1.1459680087444282, "learning_rate": 8.06850900213384e-06, "loss": 0.1159, "step": 2816 }, { "epoch": 1.0829090210394852, "grad_norm": 1.1420807652658578, "learning_rate": 8.064976947542603e-06, "loss": 0.1133, "step": 2818 }, { "epoch": 1.083677586703814, "grad_norm": 1.1564477717576003, "learning_rate": 8.061442441099299e-06, "loss": 0.1112, "step": 2820 }, { "epoch": 1.084446152368143, "grad_norm": 1.0978819998243232, "learning_rate": 8.057905485631388e-06, "loss": 0.105, "step": 2822 }, { "epoch": 1.0852147180324718, "grad_norm": 1.018671591722812, "learning_rate": 8.054366083968281e-06, "loss": 0.0944, "step": 2824 }, { "epoch": 1.0859832836968009, "grad_norm": 0.9603360078746535, "learning_rate": 8.05082423894135e-06, "loss": 0.0996, "step": 2826 }, { "epoch": 1.08675184936113, "grad_norm": 1.646852493673257, "learning_rate": 8.047279953383927e-06, "loss": 0.1108, "step": 2828 }, { "epoch": 1.0875204150254587, "grad_norm": 1.286766388100262, "learning_rate": 8.043733230131284e-06, "loss": 0.1206, "step": 2830 }, { "epoch": 1.0882889806897877, "grad_norm": 1.1784569892916377, "learning_rate": 8.040184072020653e-06, "loss": 0.104, "step": 2832 }, { "epoch": 1.0890575463541166, "grad_norm": 1.182338111913254, "learning_rate": 8.03663248189121e-06, "loss": 0.1019, "step": 2834 }, { "epoch": 1.0898261120184456, "grad_norm": 1.1670025141363154, "learning_rate": 8.033078462584077e-06, "loss": 0.107, "step": 2836 }, { "epoch": 1.0905946776827746, "grad_norm": 1.0990035709345962, "learning_rate": 8.029522016942318e-06, "loss": 0.1006, "step": 2838 }, { "epoch": 1.0913632433471034, "grad_norm": 1.218098831702478, "learning_rate": 8.025963147810942e-06, "loss": 0.1215, "step": 2840 }, { "epoch": 1.0921318090114325, "grad_norm": 1.0492615341639566, "learning_rate": 8.022401858036892e-06, "loss": 0.1036, "step": 2842 }, { "epoch": 1.0929003746757613, "grad_norm": 1.0794423392436632, "learning_rate": 8.018838150469051e-06, "loss": 0.1077, "step": 2844 }, { "epoch": 1.0936689403400903, "grad_norm": 1.2164229286264856, "learning_rate": 8.015272027958231e-06, "loss": 0.1207, "step": 2846 }, { "epoch": 1.0944375060044194, "grad_norm": 1.2326747416241952, "learning_rate": 8.011703493357184e-06, "loss": 0.1172, "step": 2848 }, { "epoch": 1.0952060716687482, "grad_norm": 1.058248811649057, "learning_rate": 8.008132549520581e-06, "loss": 0.0992, "step": 2850 }, { "epoch": 1.0959746373330772, "grad_norm": 1.0647617631116704, "learning_rate": 8.004559199305034e-06, "loss": 0.112, "step": 2852 }, { "epoch": 1.096743202997406, "grad_norm": 1.0612867141317304, "learning_rate": 8.000983445569066e-06, "loss": 0.0943, "step": 2854 }, { "epoch": 1.097511768661735, "grad_norm": 1.2649862306551645, "learning_rate": 7.99740529117313e-06, "loss": 0.1088, "step": 2856 }, { "epoch": 1.098280334326064, "grad_norm": 1.1602047811752911, "learning_rate": 7.9938247389796e-06, "loss": 0.122, "step": 2858 }, { "epoch": 1.099048899990393, "grad_norm": 1.2027515066974948, "learning_rate": 7.990241791852766e-06, "loss": 0.1039, "step": 2860 }, { "epoch": 1.099817465654722, "grad_norm": 1.1745154777734554, "learning_rate": 7.986656452658834e-06, "loss": 0.1011, "step": 2862 }, { "epoch": 1.1005860313190508, "grad_norm": 1.0936478617703165, "learning_rate": 7.983068724265924e-06, "loss": 0.1058, "step": 2864 }, { "epoch": 1.1013545969833798, "grad_norm": 1.2168680901879056, "learning_rate": 7.979478609544067e-06, "loss": 0.1145, "step": 2866 }, { "epoch": 1.1021231626477088, "grad_norm": 1.0737509937699807, "learning_rate": 7.975886111365203e-06, "loss": 0.1097, "step": 2868 }, { "epoch": 1.1028917283120376, "grad_norm": 1.128739628379647, "learning_rate": 7.972291232603178e-06, "loss": 0.1121, "step": 2870 }, { "epoch": 1.1036602939763667, "grad_norm": 1.098921624459344, "learning_rate": 7.968693976133745e-06, "loss": 0.1059, "step": 2872 }, { "epoch": 1.1044288596406955, "grad_norm": 1.1438077947265362, "learning_rate": 7.965094344834556e-06, "loss": 0.1134, "step": 2874 }, { "epoch": 1.1051974253050245, "grad_norm": 1.2480944349993752, "learning_rate": 7.961492341585163e-06, "loss": 0.1164, "step": 2876 }, { "epoch": 1.1059659909693536, "grad_norm": 1.0745221130212947, "learning_rate": 7.95788796926702e-06, "loss": 0.1065, "step": 2878 }, { "epoch": 1.1067345566336824, "grad_norm": 1.1428067169667604, "learning_rate": 7.95428123076347e-06, "loss": 0.1197, "step": 2880 }, { "epoch": 1.1075031222980114, "grad_norm": 1.0024128020779763, "learning_rate": 7.95067212895975e-06, "loss": 0.0984, "step": 2882 }, { "epoch": 1.1082716879623402, "grad_norm": 1.2259364688636618, "learning_rate": 7.94706066674299e-06, "loss": 0.116, "step": 2884 }, { "epoch": 1.1090402536266692, "grad_norm": 0.9633995105055172, "learning_rate": 7.94344684700221e-06, "loss": 0.1053, "step": 2886 }, { "epoch": 1.1098088192909983, "grad_norm": 1.039506864712118, "learning_rate": 7.93983067262831e-06, "loss": 0.0957, "step": 2888 }, { "epoch": 1.110577384955327, "grad_norm": 1.1928810701973873, "learning_rate": 7.936212146514075e-06, "loss": 0.1106, "step": 2890 }, { "epoch": 1.1113459506196561, "grad_norm": 1.0754069223946474, "learning_rate": 7.93259127155418e-06, "loss": 0.103, "step": 2892 }, { "epoch": 1.112114516283985, "grad_norm": 1.0636856469833205, "learning_rate": 7.928968050645164e-06, "loss": 0.1085, "step": 2894 }, { "epoch": 1.112883081948314, "grad_norm": 1.3015651670096517, "learning_rate": 7.925342486685457e-06, "loss": 0.1105, "step": 2896 }, { "epoch": 1.113651647612643, "grad_norm": 1.1741340008767698, "learning_rate": 7.921714582575354e-06, "loss": 0.1048, "step": 2898 }, { "epoch": 1.1144202132769718, "grad_norm": 1.0705216972116909, "learning_rate": 7.918084341217022e-06, "loss": 0.0952, "step": 2900 }, { "epoch": 1.1151887789413009, "grad_norm": 1.1747377463135593, "learning_rate": 7.914451765514506e-06, "loss": 0.1211, "step": 2902 }, { "epoch": 1.1159573446056297, "grad_norm": 1.0814911679561685, "learning_rate": 7.910816858373711e-06, "loss": 0.1088, "step": 2904 }, { "epoch": 1.1167259102699587, "grad_norm": 1.0632083763860813, "learning_rate": 7.907179622702409e-06, "loss": 0.1089, "step": 2906 }, { "epoch": 1.1174944759342877, "grad_norm": 1.1008046074771187, "learning_rate": 7.903540061410235e-06, "loss": 0.1104, "step": 2908 }, { "epoch": 1.1182630415986166, "grad_norm": 1.0442386802949002, "learning_rate": 7.899898177408684e-06, "loss": 0.1114, "step": 2910 }, { "epoch": 1.1190316072629456, "grad_norm": 1.1037985273933621, "learning_rate": 7.89625397361111e-06, "loss": 0.1031, "step": 2912 }, { "epoch": 1.1198001729272744, "grad_norm": 1.1032392062547325, "learning_rate": 7.892607452932724e-06, "loss": 0.1136, "step": 2914 }, { "epoch": 1.1205687385916034, "grad_norm": 1.1419456677441853, "learning_rate": 7.888958618290584e-06, "loss": 0.1103, "step": 2916 }, { "epoch": 1.1213373042559325, "grad_norm": 1.0887666514949357, "learning_rate": 7.885307472603605e-06, "loss": 0.0919, "step": 2918 }, { "epoch": 1.1221058699202613, "grad_norm": 1.0925681979406274, "learning_rate": 7.881654018792551e-06, "loss": 0.1163, "step": 2920 }, { "epoch": 1.1228744355845903, "grad_norm": 1.0232471711307733, "learning_rate": 7.877998259780029e-06, "loss": 0.1061, "step": 2922 }, { "epoch": 1.1236430012489191, "grad_norm": 1.061074627728053, "learning_rate": 7.874340198490494e-06, "loss": 0.1163, "step": 2924 }, { "epoch": 1.1244115669132482, "grad_norm": 1.0027230004682393, "learning_rate": 7.870679837850238e-06, "loss": 0.1156, "step": 2926 }, { "epoch": 1.1251801325775772, "grad_norm": 1.2203522145575008, "learning_rate": 7.867017180787395e-06, "loss": 0.1213, "step": 2928 }, { "epoch": 1.125948698241906, "grad_norm": 1.0154632678953242, "learning_rate": 7.863352230231938e-06, "loss": 0.1117, "step": 2930 }, { "epoch": 1.126717263906235, "grad_norm": 1.1718908371374452, "learning_rate": 7.85968498911567e-06, "loss": 0.1156, "step": 2932 }, { "epoch": 1.1274858295705639, "grad_norm": 1.0512790721444059, "learning_rate": 7.856015460372232e-06, "loss": 0.1106, "step": 2934 }, { "epoch": 1.128254395234893, "grad_norm": 1.0747344037468036, "learning_rate": 7.852343646937089e-06, "loss": 0.1044, "step": 2936 }, { "epoch": 1.129022960899222, "grad_norm": 1.164376482511858, "learning_rate": 7.848669551747536e-06, "loss": 0.11, "step": 2938 }, { "epoch": 1.1297915265635508, "grad_norm": 1.2332822007878137, "learning_rate": 7.8449931777427e-06, "loss": 0.1273, "step": 2940 }, { "epoch": 1.1305600922278798, "grad_norm": 1.0472881654758228, "learning_rate": 7.841314527863517e-06, "loss": 0.106, "step": 2942 }, { "epoch": 1.1313286578922086, "grad_norm": 1.1146252383396411, "learning_rate": 7.837633605052757e-06, "loss": 0.1011, "step": 2944 }, { "epoch": 1.1320972235565376, "grad_norm": 1.074373472916478, "learning_rate": 7.833950412255003e-06, "loss": 0.1096, "step": 2946 }, { "epoch": 1.1328657892208667, "grad_norm": 1.1970175422750928, "learning_rate": 7.830264952416648e-06, "loss": 0.1108, "step": 2948 }, { "epoch": 1.1336343548851955, "grad_norm": 1.168287622381479, "learning_rate": 7.826577228485907e-06, "loss": 0.1144, "step": 2950 }, { "epoch": 1.1344029205495245, "grad_norm": 1.1162347622821915, "learning_rate": 7.822887243412807e-06, "loss": 0.1117, "step": 2952 }, { "epoch": 1.1351714862138533, "grad_norm": 1.1476271628305141, "learning_rate": 7.819195000149175e-06, "loss": 0.1132, "step": 2954 }, { "epoch": 1.1359400518781824, "grad_norm": 1.1384386485949898, "learning_rate": 7.815500501648654e-06, "loss": 0.1041, "step": 2956 }, { "epoch": 1.1367086175425114, "grad_norm": 1.0762661149465182, "learning_rate": 7.811803750866683e-06, "loss": 0.1071, "step": 2958 }, { "epoch": 1.1374771832068402, "grad_norm": 1.201837215026128, "learning_rate": 7.808104750760507e-06, "loss": 0.1108, "step": 2960 }, { "epoch": 1.1382457488711692, "grad_norm": 1.005644439408976, "learning_rate": 7.804403504289174e-06, "loss": 0.1081, "step": 2962 }, { "epoch": 1.139014314535498, "grad_norm": 1.0733790282352023, "learning_rate": 7.800700014413519e-06, "loss": 0.102, "step": 2964 }, { "epoch": 1.139782880199827, "grad_norm": 1.2142845825592516, "learning_rate": 7.79699428409618e-06, "loss": 0.11, "step": 2966 }, { "epoch": 1.1405514458641561, "grad_norm": 0.988799743292227, "learning_rate": 7.793286316301585e-06, "loss": 0.0989, "step": 2968 }, { "epoch": 1.141320011528485, "grad_norm": 1.1999121271970359, "learning_rate": 7.78957611399595e-06, "loss": 0.102, "step": 2970 }, { "epoch": 1.142088577192814, "grad_norm": 1.1769489434013929, "learning_rate": 7.785863680147284e-06, "loss": 0.1011, "step": 2972 }, { "epoch": 1.1428571428571428, "grad_norm": 1.0766711921955072, "learning_rate": 7.782149017725373e-06, "loss": 0.1049, "step": 2974 }, { "epoch": 1.1436257085214718, "grad_norm": 1.2133723629483677, "learning_rate": 7.778432129701792e-06, "loss": 0.1059, "step": 2976 }, { "epoch": 1.1443942741858009, "grad_norm": 1.1616819026999314, "learning_rate": 7.774713019049894e-06, "loss": 0.1179, "step": 2978 }, { "epoch": 1.1451628398501297, "grad_norm": 1.2475487023076068, "learning_rate": 7.77099168874481e-06, "loss": 0.1161, "step": 2980 }, { "epoch": 1.1459314055144587, "grad_norm": 1.0016240248208936, "learning_rate": 7.767268141763447e-06, "loss": 0.1047, "step": 2982 }, { "epoch": 1.1466999711787875, "grad_norm": 1.2076124976998917, "learning_rate": 7.763542381084486e-06, "loss": 0.1212, "step": 2984 }, { "epoch": 1.1474685368431166, "grad_norm": 1.1076932761316667, "learning_rate": 7.759814409688378e-06, "loss": 0.1119, "step": 2986 }, { "epoch": 1.1482371025074456, "grad_norm": 1.113143266873669, "learning_rate": 7.756084230557342e-06, "loss": 0.1065, "step": 2988 }, { "epoch": 1.1490056681717744, "grad_norm": 1.2579699092962155, "learning_rate": 7.752351846675364e-06, "loss": 0.108, "step": 2990 }, { "epoch": 1.1497742338361034, "grad_norm": 1.0746377694035296, "learning_rate": 7.748617261028196e-06, "loss": 0.1184, "step": 2992 }, { "epoch": 1.1505427995004323, "grad_norm": 0.9437230950636191, "learning_rate": 7.744880476603343e-06, "loss": 0.1043, "step": 2994 }, { "epoch": 1.1513113651647613, "grad_norm": 1.2386816923935147, "learning_rate": 7.741141496390077e-06, "loss": 0.1196, "step": 2996 }, { "epoch": 1.1520799308290903, "grad_norm": 1.0983968321068993, "learning_rate": 7.737400323379427e-06, "loss": 0.1078, "step": 2998 }, { "epoch": 1.1528484964934191, "grad_norm": 1.0288292517585145, "learning_rate": 7.733656960564173e-06, "loss": 0.0983, "step": 3000 }, { "epoch": 1.1528484964934191, "eval_loss": 0.16636966168880463, "eval_runtime": 389.9058, "eval_samples_per_second": 47.46, "eval_steps_per_second": 5.935, "step": 3000 }, { "epoch": 1.1536170621577482, "grad_norm": 1.1388684737278791, "learning_rate": 7.729911410938843e-06, "loss": 0.1228, "step": 3002 }, { "epoch": 1.154385627822077, "grad_norm": 1.209879097510645, "learning_rate": 7.726163677499722e-06, "loss": 0.1111, "step": 3004 }, { "epoch": 1.155154193486406, "grad_norm": 0.996561332904351, "learning_rate": 7.722413763244837e-06, "loss": 0.094, "step": 3006 }, { "epoch": 1.155922759150735, "grad_norm": 1.1306756364051458, "learning_rate": 7.718661671173962e-06, "loss": 0.1178, "step": 3008 }, { "epoch": 1.1566913248150639, "grad_norm": 1.186545119440085, "learning_rate": 7.714907404288611e-06, "loss": 0.1188, "step": 3010 }, { "epoch": 1.157459890479393, "grad_norm": 1.1597821200861673, "learning_rate": 7.71115096559204e-06, "loss": 0.1171, "step": 3012 }, { "epoch": 1.1582284561437217, "grad_norm": 1.1326657450889184, "learning_rate": 7.70739235808924e-06, "loss": 0.1086, "step": 3014 }, { "epoch": 1.1589970218080508, "grad_norm": 1.3213706237895122, "learning_rate": 7.703631584786939e-06, "loss": 0.109, "step": 3016 }, { "epoch": 1.1597655874723798, "grad_norm": 1.1320716514383662, "learning_rate": 7.699868648693596e-06, "loss": 0.0973, "step": 3018 }, { "epoch": 1.1605341531367086, "grad_norm": 1.0026139371849767, "learning_rate": 7.696103552819403e-06, "loss": 0.0961, "step": 3020 }, { "epoch": 1.1613027188010376, "grad_norm": 1.194673713789019, "learning_rate": 7.692336300176275e-06, "loss": 0.1117, "step": 3022 }, { "epoch": 1.1620712844653664, "grad_norm": 1.1628584873988672, "learning_rate": 7.688566893777856e-06, "loss": 0.1106, "step": 3024 }, { "epoch": 1.1628398501296955, "grad_norm": 1.1400680286849034, "learning_rate": 7.684795336639512e-06, "loss": 0.1094, "step": 3026 }, { "epoch": 1.1636084157940245, "grad_norm": 1.2048577148789437, "learning_rate": 7.68102163177833e-06, "loss": 0.102, "step": 3028 }, { "epoch": 1.1643769814583533, "grad_norm": 1.1872015747991478, "learning_rate": 7.677245782213114e-06, "loss": 0.1085, "step": 3030 }, { "epoch": 1.1651455471226824, "grad_norm": 1.0572392999846876, "learning_rate": 7.673467790964382e-06, "loss": 0.1024, "step": 3032 }, { "epoch": 1.1659141127870112, "grad_norm": 1.0491169718277014, "learning_rate": 7.669687661054373e-06, "loss": 0.1036, "step": 3034 }, { "epoch": 1.1666826784513402, "grad_norm": 1.0817365630556637, "learning_rate": 7.665905395507023e-06, "loss": 0.1125, "step": 3036 }, { "epoch": 1.1674512441156693, "grad_norm": 1.034044111167132, "learning_rate": 7.66212099734799e-06, "loss": 0.1067, "step": 3038 }, { "epoch": 1.168219809779998, "grad_norm": 0.9769787109074675, "learning_rate": 7.658334469604635e-06, "loss": 0.1107, "step": 3040 }, { "epoch": 1.168988375444327, "grad_norm": 1.0724190256524988, "learning_rate": 7.654545815306014e-06, "loss": 0.1119, "step": 3042 }, { "epoch": 1.169756941108656, "grad_norm": 1.1019892412318584, "learning_rate": 7.650755037482893e-06, "loss": 0.1026, "step": 3044 }, { "epoch": 1.170525506772985, "grad_norm": 0.9886880565430526, "learning_rate": 7.646962139167738e-06, "loss": 0.1077, "step": 3046 }, { "epoch": 1.171294072437314, "grad_norm": 1.0868554335327594, "learning_rate": 7.6431671233947e-06, "loss": 0.0983, "step": 3048 }, { "epoch": 1.1720626381016428, "grad_norm": 1.1664472133521484, "learning_rate": 7.639369993199636e-06, "loss": 0.1053, "step": 3050 }, { "epoch": 1.1728312037659718, "grad_norm": 1.124832579439523, "learning_rate": 7.635570751620091e-06, "loss": 0.11, "step": 3052 }, { "epoch": 1.1735997694303006, "grad_norm": 1.1124534354029376, "learning_rate": 7.631769401695294e-06, "loss": 0.1015, "step": 3054 }, { "epoch": 1.1743683350946297, "grad_norm": 1.183974604020884, "learning_rate": 7.627965946466167e-06, "loss": 0.1074, "step": 3056 }, { "epoch": 1.1751369007589587, "grad_norm": 1.0181194611697484, "learning_rate": 7.6241603889753115e-06, "loss": 0.1112, "step": 3058 }, { "epoch": 1.1759054664232875, "grad_norm": 1.2117100708449913, "learning_rate": 7.6203527322670136e-06, "loss": 0.1027, "step": 3060 }, { "epoch": 1.1766740320876166, "grad_norm": 1.0209732642666094, "learning_rate": 7.616542979387238e-06, "loss": 0.0985, "step": 3062 }, { "epoch": 1.1774425977519454, "grad_norm": 1.0803665740641333, "learning_rate": 7.612731133383624e-06, "loss": 0.1165, "step": 3064 }, { "epoch": 1.1782111634162744, "grad_norm": 1.1047886086571497, "learning_rate": 7.608917197305492e-06, "loss": 0.1019, "step": 3066 }, { "epoch": 1.1789797290806034, "grad_norm": 1.0814628984862982, "learning_rate": 7.605101174203826e-06, "loss": 0.11, "step": 3068 }, { "epoch": 1.1797482947449323, "grad_norm": 1.0574304109582588, "learning_rate": 7.601283067131284e-06, "loss": 0.0918, "step": 3070 }, { "epoch": 1.1805168604092613, "grad_norm": 1.3627143486235347, "learning_rate": 7.5974628791421925e-06, "loss": 0.121, "step": 3072 }, { "epoch": 1.18128542607359, "grad_norm": 1.0741020021910836, "learning_rate": 7.593640613292537e-06, "loss": 0.1136, "step": 3074 }, { "epoch": 1.1820539917379191, "grad_norm": 1.2195733034504286, "learning_rate": 7.589816272639969e-06, "loss": 0.1196, "step": 3076 }, { "epoch": 1.1828225574022482, "grad_norm": 1.0890561316330765, "learning_rate": 7.585989860243802e-06, "loss": 0.1138, "step": 3078 }, { "epoch": 1.183591123066577, "grad_norm": 1.115765608983075, "learning_rate": 7.582161379165003e-06, "loss": 0.112, "step": 3080 }, { "epoch": 1.184359688730906, "grad_norm": 1.1158655904406714, "learning_rate": 7.578330832466194e-06, "loss": 0.1114, "step": 3082 }, { "epoch": 1.1851282543952348, "grad_norm": 1.0628690763601882, "learning_rate": 7.5744982232116524e-06, "loss": 0.0992, "step": 3084 }, { "epoch": 1.1858968200595639, "grad_norm": 1.0071383391068325, "learning_rate": 7.570663554467305e-06, "loss": 0.1065, "step": 3086 }, { "epoch": 1.186665385723893, "grad_norm": 1.1486677041500748, "learning_rate": 7.5668268293007195e-06, "loss": 0.0991, "step": 3088 }, { "epoch": 1.1874339513882217, "grad_norm": 1.3428962593637535, "learning_rate": 7.5629880507811184e-06, "loss": 0.1162, "step": 3090 }, { "epoch": 1.1882025170525508, "grad_norm": 1.0278621810563175, "learning_rate": 7.559147221979362e-06, "loss": 0.0982, "step": 3092 }, { "epoch": 1.1889710827168796, "grad_norm": 1.0640061136558494, "learning_rate": 7.555304345967949e-06, "loss": 0.0969, "step": 3094 }, { "epoch": 1.1897396483812086, "grad_norm": 1.0881857222519369, "learning_rate": 7.551459425821017e-06, "loss": 0.1111, "step": 3096 }, { "epoch": 1.1905082140455376, "grad_norm": 1.062888424885625, "learning_rate": 7.547612464614345e-06, "loss": 0.1019, "step": 3098 }, { "epoch": 1.1912767797098665, "grad_norm": 1.2265208207453993, "learning_rate": 7.543763465425335e-06, "loss": 0.1182, "step": 3100 }, { "epoch": 1.1920453453741955, "grad_norm": 1.1400704975151388, "learning_rate": 7.539912431333023e-06, "loss": 0.104, "step": 3102 }, { "epoch": 1.1928139110385243, "grad_norm": 1.0243649513138324, "learning_rate": 7.536059365418078e-06, "loss": 0.1026, "step": 3104 }, { "epoch": 1.1935824767028533, "grad_norm": 1.1624014180631546, "learning_rate": 7.532204270762786e-06, "loss": 0.1088, "step": 3106 }, { "epoch": 1.1943510423671824, "grad_norm": 1.123731081495633, "learning_rate": 7.528347150451061e-06, "loss": 0.1182, "step": 3108 }, { "epoch": 1.1951196080315112, "grad_norm": 1.0899629000241524, "learning_rate": 7.524488007568435e-06, "loss": 0.1056, "step": 3110 }, { "epoch": 1.1958881736958402, "grad_norm": 1.0857275588730884, "learning_rate": 7.5206268452020634e-06, "loss": 0.0879, "step": 3112 }, { "epoch": 1.196656739360169, "grad_norm": 1.0462656748330923, "learning_rate": 7.5167636664407094e-06, "loss": 0.1117, "step": 3114 }, { "epoch": 1.197425305024498, "grad_norm": 0.993872433173416, "learning_rate": 7.5128984743747535e-06, "loss": 0.1062, "step": 3116 }, { "epoch": 1.198193870688827, "grad_norm": 1.266704024936471, "learning_rate": 7.509031272096187e-06, "loss": 0.1111, "step": 3118 }, { "epoch": 1.198962436353156, "grad_norm": 1.0827458170001596, "learning_rate": 7.505162062698607e-06, "loss": 0.1033, "step": 3120 }, { "epoch": 1.199731002017485, "grad_norm": 1.0301442341753162, "learning_rate": 7.501290849277216e-06, "loss": 0.1061, "step": 3122 }, { "epoch": 1.2004995676818138, "grad_norm": 1.0759522654557878, "learning_rate": 7.4974176349288265e-06, "loss": 0.1114, "step": 3124 }, { "epoch": 1.2012681333461428, "grad_norm": 1.119537274499154, "learning_rate": 7.493542422751842e-06, "loss": 0.117, "step": 3126 }, { "epoch": 1.2020366990104718, "grad_norm": 1.0556344665240842, "learning_rate": 7.489665215846268e-06, "loss": 0.1146, "step": 3128 }, { "epoch": 1.2028052646748006, "grad_norm": 0.9462907005911091, "learning_rate": 7.485786017313709e-06, "loss": 0.0971, "step": 3130 }, { "epoch": 1.2035738303391297, "grad_norm": 0.9861698549842547, "learning_rate": 7.48190483025736e-06, "loss": 0.1095, "step": 3132 }, { "epoch": 1.2043423960034585, "grad_norm": 0.9987671565055024, "learning_rate": 7.478021657782004e-06, "loss": 0.0973, "step": 3134 }, { "epoch": 1.2051109616677875, "grad_norm": 1.0602976010286351, "learning_rate": 7.474136502994016e-06, "loss": 0.0949, "step": 3136 }, { "epoch": 1.2058795273321163, "grad_norm": 1.0775096332334504, "learning_rate": 7.470249369001358e-06, "loss": 0.1061, "step": 3138 }, { "epoch": 1.2066480929964454, "grad_norm": 1.093911860114731, "learning_rate": 7.466360258913569e-06, "loss": 0.1007, "step": 3140 }, { "epoch": 1.2074166586607744, "grad_norm": 1.1341553797903292, "learning_rate": 7.462469175841774e-06, "loss": 0.1064, "step": 3142 }, { "epoch": 1.2081852243251032, "grad_norm": 1.2211139779821596, "learning_rate": 7.4585761228986795e-06, "loss": 0.1045, "step": 3144 }, { "epoch": 1.2089537899894323, "grad_norm": 1.1027904092820553, "learning_rate": 7.454681103198557e-06, "loss": 0.0987, "step": 3146 }, { "epoch": 1.209722355653761, "grad_norm": 0.985580604886226, "learning_rate": 7.45078411985726e-06, "loss": 0.0958, "step": 3148 }, { "epoch": 1.21049092131809, "grad_norm": 1.1659308773390764, "learning_rate": 7.446885175992213e-06, "loss": 0.111, "step": 3150 }, { "epoch": 1.2112594869824191, "grad_norm": 1.056243429657134, "learning_rate": 7.442984274722402e-06, "loss": 0.0983, "step": 3152 }, { "epoch": 1.212028052646748, "grad_norm": 0.9371141450212414, "learning_rate": 7.439081419168387e-06, "loss": 0.0896, "step": 3154 }, { "epoch": 1.212796618311077, "grad_norm": 1.0021835750356736, "learning_rate": 7.435176612452286e-06, "loss": 0.1046, "step": 3156 }, { "epoch": 1.2135651839754058, "grad_norm": 1.0504331301281937, "learning_rate": 7.4312698576977795e-06, "loss": 0.106, "step": 3158 }, { "epoch": 1.2143337496397348, "grad_norm": 1.1895450276967143, "learning_rate": 7.427361158030105e-06, "loss": 0.1187, "step": 3160 }, { "epoch": 1.2151023153040639, "grad_norm": 1.191196759934383, "learning_rate": 7.423450516576059e-06, "loss": 0.1183, "step": 3162 }, { "epoch": 1.2158708809683927, "grad_norm": 1.1180956311092907, "learning_rate": 7.41953793646399e-06, "loss": 0.115, "step": 3164 }, { "epoch": 1.2166394466327217, "grad_norm": 1.0735924016293423, "learning_rate": 7.415623420823793e-06, "loss": 0.0997, "step": 3166 }, { "epoch": 1.2174080122970505, "grad_norm": 1.0102459997595263, "learning_rate": 7.41170697278692e-06, "loss": 0.105, "step": 3168 }, { "epoch": 1.2181765779613796, "grad_norm": 1.0557236658563456, "learning_rate": 7.40778859548636e-06, "loss": 0.1149, "step": 3170 }, { "epoch": 1.2189451436257086, "grad_norm": 1.2685608244662554, "learning_rate": 7.403868292056653e-06, "loss": 0.1184, "step": 3172 }, { "epoch": 1.2197137092900374, "grad_norm": 1.034838091210697, "learning_rate": 7.399946065633872e-06, "loss": 0.1084, "step": 3174 }, { "epoch": 1.2204822749543665, "grad_norm": 1.0628754584067615, "learning_rate": 7.396021919355634e-06, "loss": 0.1115, "step": 3176 }, { "epoch": 1.2212508406186953, "grad_norm": 1.034626128874365, "learning_rate": 7.392095856361092e-06, "loss": 0.0959, "step": 3178 }, { "epoch": 1.2220194062830243, "grad_norm": 1.125476898555156, "learning_rate": 7.388167879790929e-06, "loss": 0.1076, "step": 3180 }, { "epoch": 1.2227879719473533, "grad_norm": 1.0572960614367095, "learning_rate": 7.38423799278736e-06, "loss": 0.1047, "step": 3182 }, { "epoch": 1.2235565376116821, "grad_norm": 1.160892885244426, "learning_rate": 7.38030619849413e-06, "loss": 0.1129, "step": 3184 }, { "epoch": 1.2243251032760112, "grad_norm": 1.300658793608965, "learning_rate": 7.376372500056506e-06, "loss": 0.1056, "step": 3186 }, { "epoch": 1.22509366894034, "grad_norm": 1.0229423716940362, "learning_rate": 7.372436900621283e-06, "loss": 0.0855, "step": 3188 }, { "epoch": 1.225862234604669, "grad_norm": 1.204140617633634, "learning_rate": 7.368499403336774e-06, "loss": 0.1168, "step": 3190 }, { "epoch": 1.226630800268998, "grad_norm": 1.2303071737867488, "learning_rate": 7.364560011352808e-06, "loss": 0.0937, "step": 3192 }, { "epoch": 1.2273993659333269, "grad_norm": 1.0891950080436623, "learning_rate": 7.3606187278207344e-06, "loss": 0.1079, "step": 3194 }, { "epoch": 1.228167931597656, "grad_norm": 1.0327625572819445, "learning_rate": 7.356675555893412e-06, "loss": 0.107, "step": 3196 }, { "epoch": 1.2289364972619847, "grad_norm": 1.1498490430121964, "learning_rate": 7.352730498725215e-06, "loss": 0.1012, "step": 3198 }, { "epoch": 1.2297050629263138, "grad_norm": 1.2630752132337697, "learning_rate": 7.348783559472017e-06, "loss": 0.1037, "step": 3200 }, { "epoch": 1.2304736285906428, "grad_norm": 1.3511005368636377, "learning_rate": 7.344834741291209e-06, "loss": 0.1179, "step": 3202 }, { "epoch": 1.2312421942549716, "grad_norm": 1.0789755115306352, "learning_rate": 7.340884047341674e-06, "loss": 0.1215, "step": 3204 }, { "epoch": 1.2320107599193006, "grad_norm": 1.1685368017978282, "learning_rate": 7.336931480783801e-06, "loss": 0.1052, "step": 3206 }, { "epoch": 1.2327793255836295, "grad_norm": 1.1704340440467762, "learning_rate": 7.332977044779478e-06, "loss": 0.1075, "step": 3208 }, { "epoch": 1.2335478912479585, "grad_norm": 1.1808677164914771, "learning_rate": 7.329020742492086e-06, "loss": 0.1096, "step": 3210 }, { "epoch": 1.2343164569122875, "grad_norm": 0.9900566213744348, "learning_rate": 7.3250625770864995e-06, "loss": 0.1193, "step": 3212 }, { "epoch": 1.2350850225766163, "grad_norm": 0.9252470045047413, "learning_rate": 7.3211025517290825e-06, "loss": 0.1043, "step": 3214 }, { "epoch": 1.2358535882409454, "grad_norm": 1.0265059243114736, "learning_rate": 7.317140669587689e-06, "loss": 0.1001, "step": 3216 }, { "epoch": 1.2366221539052742, "grad_norm": 1.1074748548231843, "learning_rate": 7.313176933831658e-06, "loss": 0.1047, "step": 3218 }, { "epoch": 1.2373907195696032, "grad_norm": 1.2124613138522577, "learning_rate": 7.3092113476318095e-06, "loss": 0.1142, "step": 3220 }, { "epoch": 1.2381592852339323, "grad_norm": 1.033195898329858, "learning_rate": 7.305243914160445e-06, "loss": 0.0997, "step": 3222 }, { "epoch": 1.238927850898261, "grad_norm": 1.1022800037595823, "learning_rate": 7.301274636591345e-06, "loss": 0.1131, "step": 3224 }, { "epoch": 1.23969641656259, "grad_norm": 1.1630471536381635, "learning_rate": 7.297303518099763e-06, "loss": 0.1209, "step": 3226 }, { "epoch": 1.240464982226919, "grad_norm": 1.1159239937137992, "learning_rate": 7.293330561862425e-06, "loss": 0.1052, "step": 3228 }, { "epoch": 1.241233547891248, "grad_norm": 1.0805088718810811, "learning_rate": 7.28935577105753e-06, "loss": 0.1073, "step": 3230 }, { "epoch": 1.242002113555577, "grad_norm": 1.00924590655784, "learning_rate": 7.285379148864741e-06, "loss": 0.0955, "step": 3232 }, { "epoch": 1.2427706792199058, "grad_norm": 1.086690375179855, "learning_rate": 7.281400698465189e-06, "loss": 0.1103, "step": 3234 }, { "epoch": 1.2435392448842348, "grad_norm": 1.114355757767786, "learning_rate": 7.277420423041466e-06, "loss": 0.1097, "step": 3236 }, { "epoch": 1.2443078105485637, "grad_norm": 1.1788730239197907, "learning_rate": 7.273438325777622e-06, "loss": 0.0979, "step": 3238 }, { "epoch": 1.2450763762128927, "grad_norm": 1.0450637627039217, "learning_rate": 7.269454409859169e-06, "loss": 0.0946, "step": 3240 }, { "epoch": 1.2458449418772217, "grad_norm": 0.994734999846052, "learning_rate": 7.265468678473072e-06, "loss": 0.0911, "step": 3242 }, { "epoch": 1.2466135075415505, "grad_norm": 1.214369348167086, "learning_rate": 7.261481134807746e-06, "loss": 0.1173, "step": 3244 }, { "epoch": 1.2473820732058796, "grad_norm": 1.2181136122206986, "learning_rate": 7.257491782053059e-06, "loss": 0.112, "step": 3246 }, { "epoch": 1.2481506388702084, "grad_norm": 1.217206068004002, "learning_rate": 7.2535006234003225e-06, "loss": 0.1182, "step": 3248 }, { "epoch": 1.2489192045345374, "grad_norm": 1.0231552958725898, "learning_rate": 7.2495076620422964e-06, "loss": 0.1008, "step": 3250 }, { "epoch": 1.2496877701988665, "grad_norm": 1.0212425261670999, "learning_rate": 7.2455129011731806e-06, "loss": 0.1109, "step": 3252 }, { "epoch": 1.2504563358631953, "grad_norm": 0.9739818513776388, "learning_rate": 7.241516343988616e-06, "loss": 0.1039, "step": 3254 }, { "epoch": 1.2512249015275243, "grad_norm": 0.9696623547627686, "learning_rate": 7.2375179936856775e-06, "loss": 0.0916, "step": 3256 }, { "epoch": 1.2519934671918533, "grad_norm": 1.1832988620830618, "learning_rate": 7.233517853462878e-06, "loss": 0.0995, "step": 3258 }, { "epoch": 1.2527620328561822, "grad_norm": 1.1183050762955713, "learning_rate": 7.229515926520161e-06, "loss": 0.1148, "step": 3260 }, { "epoch": 1.253530598520511, "grad_norm": 0.9555880897968847, "learning_rate": 7.225512216058897e-06, "loss": 0.0979, "step": 3262 }, { "epoch": 1.25429916418484, "grad_norm": 0.9820787372201224, "learning_rate": 7.221506725281886e-06, "loss": 0.0931, "step": 3264 }, { "epoch": 1.255067729849169, "grad_norm": 1.2593917902547296, "learning_rate": 7.217499457393354e-06, "loss": 0.1289, "step": 3266 }, { "epoch": 1.255836295513498, "grad_norm": 0.865915442580309, "learning_rate": 7.2134904155989405e-06, "loss": 0.0951, "step": 3268 }, { "epoch": 1.2566048611778269, "grad_norm": 1.0150446256924714, "learning_rate": 7.209479603105716e-06, "loss": 0.1106, "step": 3270 }, { "epoch": 1.2573734268421557, "grad_norm": 1.0938221816094018, "learning_rate": 7.205467023122156e-06, "loss": 0.1122, "step": 3272 }, { "epoch": 1.2581419925064847, "grad_norm": 1.1817589275859, "learning_rate": 7.201452678858156e-06, "loss": 0.0967, "step": 3274 }, { "epoch": 1.2589105581708138, "grad_norm": 1.2400440413447142, "learning_rate": 7.197436573525025e-06, "loss": 0.1055, "step": 3276 }, { "epoch": 1.2596791238351428, "grad_norm": 1.0786193661041186, "learning_rate": 7.193418710335475e-06, "loss": 0.0997, "step": 3278 }, { "epoch": 1.2604476894994716, "grad_norm": 1.0984147725715754, "learning_rate": 7.189399092503626e-06, "loss": 0.1257, "step": 3280 }, { "epoch": 1.2612162551638004, "grad_norm": 1.1217056842127744, "learning_rate": 7.185377723245004e-06, "loss": 0.1061, "step": 3282 }, { "epoch": 1.2619848208281295, "grad_norm": 1.23346552547768, "learning_rate": 7.1813546057765335e-06, "loss": 0.1158, "step": 3284 }, { "epoch": 1.2627533864924585, "grad_norm": 1.2160567223857799, "learning_rate": 7.177329743316539e-06, "loss": 0.1128, "step": 3286 }, { "epoch": 1.2635219521567875, "grad_norm": 1.093895401378369, "learning_rate": 7.1733031390847415e-06, "loss": 0.1144, "step": 3288 }, { "epoch": 1.2642905178211163, "grad_norm": 1.1170106665729227, "learning_rate": 7.1692747963022524e-06, "loss": 0.1098, "step": 3290 }, { "epoch": 1.2650590834854452, "grad_norm": 1.1368686275633446, "learning_rate": 7.165244718191576e-06, "loss": 0.1051, "step": 3292 }, { "epoch": 1.2658276491497742, "grad_norm": 1.195959681550762, "learning_rate": 7.161212907976607e-06, "loss": 0.1192, "step": 3294 }, { "epoch": 1.2665962148141032, "grad_norm": 1.0236627770175084, "learning_rate": 7.15717936888262e-06, "loss": 0.0979, "step": 3296 }, { "epoch": 1.2673647804784323, "grad_norm": 1.1031404172398462, "learning_rate": 7.153144104136276e-06, "loss": 0.1127, "step": 3298 }, { "epoch": 1.268133346142761, "grad_norm": 1.0929297114727132, "learning_rate": 7.149107116965619e-06, "loss": 0.1079, "step": 3300 }, { "epoch": 1.2689019118070899, "grad_norm": 1.0769007923925822, "learning_rate": 7.145068410600065e-06, "loss": 0.0968, "step": 3302 }, { "epoch": 1.269670477471419, "grad_norm": 1.1189645230165786, "learning_rate": 7.141027988270411e-06, "loss": 0.0947, "step": 3304 }, { "epoch": 1.270439043135748, "grad_norm": 1.0488074651146888, "learning_rate": 7.136985853208824e-06, "loss": 0.104, "step": 3306 }, { "epoch": 1.271207608800077, "grad_norm": 1.255745143442445, "learning_rate": 7.13294200864884e-06, "loss": 0.1211, "step": 3308 }, { "epoch": 1.2719761744644058, "grad_norm": 0.9602219680575647, "learning_rate": 7.128896457825364e-06, "loss": 0.0944, "step": 3310 }, { "epoch": 1.2727447401287346, "grad_norm": 1.013698405755513, "learning_rate": 7.124849203974666e-06, "loss": 0.1028, "step": 3312 }, { "epoch": 1.2735133057930637, "grad_norm": 1.1522109737061788, "learning_rate": 7.1208002503343785e-06, "loss": 0.113, "step": 3314 }, { "epoch": 1.2742818714573927, "grad_norm": 1.3052664999406849, "learning_rate": 7.116749600143494e-06, "loss": 0.1112, "step": 3316 }, { "epoch": 1.2750504371217217, "grad_norm": 1.1367808653695193, "learning_rate": 7.112697256642358e-06, "loss": 0.1157, "step": 3318 }, { "epoch": 1.2758190027860505, "grad_norm": 0.9910679974550004, "learning_rate": 7.108643223072678e-06, "loss": 0.0837, "step": 3320 }, { "epoch": 1.2765875684503794, "grad_norm": 1.2455430516663863, "learning_rate": 7.104587502677508e-06, "loss": 0.105, "step": 3322 }, { "epoch": 1.2773561341147084, "grad_norm": 1.2810005980328256, "learning_rate": 7.100530098701254e-06, "loss": 0.1172, "step": 3324 }, { "epoch": 1.2781246997790374, "grad_norm": 1.1957026671292306, "learning_rate": 7.0964710143896655e-06, "loss": 0.1023, "step": 3326 }, { "epoch": 1.2788932654433665, "grad_norm": 1.0961651528256708, "learning_rate": 7.09241025298984e-06, "loss": 0.1022, "step": 3328 }, { "epoch": 1.2796618311076953, "grad_norm": 1.1847031281189029, "learning_rate": 7.088347817750216e-06, "loss": 0.114, "step": 3330 }, { "epoch": 1.280430396772024, "grad_norm": 1.309124021199123, "learning_rate": 7.084283711920567e-06, "loss": 0.1103, "step": 3332 }, { "epoch": 1.2811989624363531, "grad_norm": 1.1309650165420526, "learning_rate": 7.0802179387520095e-06, "loss": 0.109, "step": 3334 }, { "epoch": 1.2819675281006822, "grad_norm": 1.0797496042807837, "learning_rate": 7.076150501496989e-06, "loss": 0.1054, "step": 3336 }, { "epoch": 1.282736093765011, "grad_norm": 1.1338372156244034, "learning_rate": 7.072081403409281e-06, "loss": 0.1044, "step": 3338 }, { "epoch": 1.28350465942934, "grad_norm": 1.0403061087622487, "learning_rate": 7.068010647743997e-06, "loss": 0.1038, "step": 3340 }, { "epoch": 1.2842732250936688, "grad_norm": 1.042608322093075, "learning_rate": 7.063938237757565e-06, "loss": 0.1116, "step": 3342 }, { "epoch": 1.2850417907579978, "grad_norm": 1.0478260975455944, "learning_rate": 7.059864176707742e-06, "loss": 0.1157, "step": 3344 }, { "epoch": 1.2858103564223269, "grad_norm": 1.1419565045042874, "learning_rate": 7.055788467853607e-06, "loss": 0.1082, "step": 3346 }, { "epoch": 1.2865789220866557, "grad_norm": 1.013438940031721, "learning_rate": 7.051711114455553e-06, "loss": 0.1025, "step": 3348 }, { "epoch": 1.2873474877509847, "grad_norm": 1.043141854147516, "learning_rate": 7.0476321197752895e-06, "loss": 0.1011, "step": 3350 }, { "epoch": 1.2881160534153135, "grad_norm": 1.026449417183067, "learning_rate": 7.043551487075843e-06, "loss": 0.1039, "step": 3352 }, { "epoch": 1.2888846190796426, "grad_norm": 1.158221241215838, "learning_rate": 7.039469219621543e-06, "loss": 0.1179, "step": 3354 }, { "epoch": 1.2896531847439716, "grad_norm": 1.0898437871363447, "learning_rate": 7.035385320678035e-06, "loss": 0.0984, "step": 3356 }, { "epoch": 1.2904217504083004, "grad_norm": 1.090327418416237, "learning_rate": 7.031299793512265e-06, "loss": 0.1001, "step": 3358 }, { "epoch": 1.2911903160726295, "grad_norm": 1.2612288925630497, "learning_rate": 7.027212641392478e-06, "loss": 0.1123, "step": 3360 }, { "epoch": 1.2919588817369583, "grad_norm": 1.0593799379942432, "learning_rate": 7.023123867588229e-06, "loss": 0.1027, "step": 3362 }, { "epoch": 1.2927274474012873, "grad_norm": 0.9552797549105786, "learning_rate": 7.0190334753703605e-06, "loss": 0.1041, "step": 3364 }, { "epoch": 1.2934960130656163, "grad_norm": 1.3474410007663173, "learning_rate": 7.014941468011014e-06, "loss": 0.1095, "step": 3366 }, { "epoch": 1.2942645787299452, "grad_norm": 1.0215404174916893, "learning_rate": 7.010847848783623e-06, "loss": 0.1071, "step": 3368 }, { "epoch": 1.2950331443942742, "grad_norm": 1.159807611039063, "learning_rate": 7.00675262096291e-06, "loss": 0.1155, "step": 3370 }, { "epoch": 1.295801710058603, "grad_norm": 1.1350102984207238, "learning_rate": 7.002655787824884e-06, "loss": 0.1056, "step": 3372 }, { "epoch": 1.296570275722932, "grad_norm": 1.1868272803826567, "learning_rate": 6.998557352646837e-06, "loss": 0.1053, "step": 3374 }, { "epoch": 1.297338841387261, "grad_norm": 1.09555836251897, "learning_rate": 6.994457318707348e-06, "loss": 0.1025, "step": 3376 }, { "epoch": 1.29810740705159, "grad_norm": 0.9892691550549848, "learning_rate": 6.990355689286266e-06, "loss": 0.0978, "step": 3378 }, { "epoch": 1.298875972715919, "grad_norm": 1.0170330550625946, "learning_rate": 6.986252467664726e-06, "loss": 0.0978, "step": 3380 }, { "epoch": 1.2996445383802477, "grad_norm": 1.0766248043802122, "learning_rate": 6.982147657125129e-06, "loss": 0.1076, "step": 3382 }, { "epoch": 1.3004131040445768, "grad_norm": 1.1024397171362048, "learning_rate": 6.9780412609511495e-06, "loss": 0.1046, "step": 3384 }, { "epoch": 1.3011816697089058, "grad_norm": 1.1908911296318505, "learning_rate": 6.973933282427733e-06, "loss": 0.1103, "step": 3386 }, { "epoch": 1.3019502353732346, "grad_norm": 1.2155738728693821, "learning_rate": 6.9698237248410875e-06, "loss": 0.114, "step": 3388 }, { "epoch": 1.3027188010375637, "grad_norm": 1.0339132972074925, "learning_rate": 6.965712591478685e-06, "loss": 0.0951, "step": 3390 }, { "epoch": 1.3034873667018925, "grad_norm": 1.244660376879418, "learning_rate": 6.96159988562926e-06, "loss": 0.1186, "step": 3392 }, { "epoch": 1.3042559323662215, "grad_norm": 1.1220519888938487, "learning_rate": 6.957485610582803e-06, "loss": 0.1064, "step": 3394 }, { "epoch": 1.3050244980305505, "grad_norm": 0.9735538859888435, "learning_rate": 6.953369769630558e-06, "loss": 0.0946, "step": 3396 }, { "epoch": 1.3057930636948794, "grad_norm": 1.211418707917989, "learning_rate": 6.949252366065027e-06, "loss": 0.1216, "step": 3398 }, { "epoch": 1.3065616293592084, "grad_norm": 1.0826523412188112, "learning_rate": 6.945133403179958e-06, "loss": 0.1169, "step": 3400 }, { "epoch": 1.3073301950235372, "grad_norm": 1.0092906440805864, "learning_rate": 6.9410128842703465e-06, "loss": 0.112, "step": 3402 }, { "epoch": 1.3080987606878662, "grad_norm": 1.312292021809481, "learning_rate": 6.936890812632433e-06, "loss": 0.1302, "step": 3404 }, { "epoch": 1.3088673263521953, "grad_norm": 0.9649694835006564, "learning_rate": 6.932767191563703e-06, "loss": 0.1016, "step": 3406 }, { "epoch": 1.309635892016524, "grad_norm": 1.175062380907642, "learning_rate": 6.928642024362877e-06, "loss": 0.1217, "step": 3408 }, { "epoch": 1.3104044576808531, "grad_norm": 1.170670452572567, "learning_rate": 6.924515314329916e-06, "loss": 0.1126, "step": 3410 }, { "epoch": 1.311173023345182, "grad_norm": 1.05418057023813, "learning_rate": 6.920387064766012e-06, "loss": 0.0985, "step": 3412 }, { "epoch": 1.311941589009511, "grad_norm": 1.0077736942003874, "learning_rate": 6.916257278973592e-06, "loss": 0.0939, "step": 3414 }, { "epoch": 1.31271015467384, "grad_norm": 1.2829752415627378, "learning_rate": 6.912125960256308e-06, "loss": 0.1064, "step": 3416 }, { "epoch": 1.3134787203381688, "grad_norm": 1.1832542260300816, "learning_rate": 6.9079931119190425e-06, "loss": 0.108, "step": 3418 }, { "epoch": 1.3142472860024978, "grad_norm": 1.1677072419149293, "learning_rate": 6.9038587372678985e-06, "loss": 0.1131, "step": 3420 }, { "epoch": 1.3150158516668267, "grad_norm": 1.0499567493599085, "learning_rate": 6.899722839610202e-06, "loss": 0.1005, "step": 3422 }, { "epoch": 1.3157844173311557, "grad_norm": 0.9767669577265266, "learning_rate": 6.895585422254495e-06, "loss": 0.1003, "step": 3424 }, { "epoch": 1.3165529829954847, "grad_norm": 1.02407952451726, "learning_rate": 6.891446488510538e-06, "loss": 0.1001, "step": 3426 }, { "epoch": 1.3173215486598135, "grad_norm": 1.1664637021054485, "learning_rate": 6.887306041689301e-06, "loss": 0.1098, "step": 3428 }, { "epoch": 1.3180901143241426, "grad_norm": 1.1729047189510806, "learning_rate": 6.883164085102969e-06, "loss": 0.0998, "step": 3430 }, { "epoch": 1.3188586799884714, "grad_norm": 1.125335840709974, "learning_rate": 6.879020622064931e-06, "loss": 0.1015, "step": 3432 }, { "epoch": 1.3196272456528004, "grad_norm": 1.125955921065118, "learning_rate": 6.874875655889783e-06, "loss": 0.1053, "step": 3434 }, { "epoch": 1.3203958113171295, "grad_norm": 1.1213492551896838, "learning_rate": 6.870729189893322e-06, "loss": 0.1051, "step": 3436 }, { "epoch": 1.3211643769814583, "grad_norm": 1.160865758778921, "learning_rate": 6.866581227392546e-06, "loss": 0.1054, "step": 3438 }, { "epoch": 1.3219329426457873, "grad_norm": 1.093865632900907, "learning_rate": 6.862431771705649e-06, "loss": 0.1049, "step": 3440 }, { "epoch": 1.3227015083101161, "grad_norm": 1.2687494575550735, "learning_rate": 6.858280826152022e-06, "loss": 0.1093, "step": 3442 }, { "epoch": 1.3234700739744452, "grad_norm": 0.99282407970078, "learning_rate": 6.8541283940522454e-06, "loss": 0.1036, "step": 3444 }, { "epoch": 1.3242386396387742, "grad_norm": 1.1016898788277454, "learning_rate": 6.849974478728088e-06, "loss": 0.1024, "step": 3446 }, { "epoch": 1.325007205303103, "grad_norm": 1.1846088455699622, "learning_rate": 6.845819083502509e-06, "loss": 0.1109, "step": 3448 }, { "epoch": 1.325775770967432, "grad_norm": 1.0901707812271892, "learning_rate": 6.841662211699647e-06, "loss": 0.1116, "step": 3450 }, { "epoch": 1.3265443366317609, "grad_norm": 1.0876562825996763, "learning_rate": 6.837503866644824e-06, "loss": 0.1028, "step": 3452 }, { "epoch": 1.32731290229609, "grad_norm": 1.173639427694616, "learning_rate": 6.833344051664541e-06, "loss": 0.1115, "step": 3454 }, { "epoch": 1.328081467960419, "grad_norm": 1.1426971024667996, "learning_rate": 6.829182770086474e-06, "loss": 0.1024, "step": 3456 }, { "epoch": 1.3288500336247477, "grad_norm": 1.0759701166866695, "learning_rate": 6.825020025239472e-06, "loss": 0.1082, "step": 3458 }, { "epoch": 1.3296185992890768, "grad_norm": 1.2121078407118115, "learning_rate": 6.820855820453558e-06, "loss": 0.1155, "step": 3460 }, { "epoch": 1.3303871649534056, "grad_norm": 0.9691762379441079, "learning_rate": 6.816690159059916e-06, "loss": 0.0931, "step": 3462 }, { "epoch": 1.3311557306177346, "grad_norm": 1.4861532425723871, "learning_rate": 6.8125230443908995e-06, "loss": 0.1088, "step": 3464 }, { "epoch": 1.3319242962820637, "grad_norm": 1.226862051222767, "learning_rate": 6.808354479780027e-06, "loss": 0.116, "step": 3466 }, { "epoch": 1.3326928619463925, "grad_norm": 1.2005692389740918, "learning_rate": 6.804184468561972e-06, "loss": 0.1161, "step": 3468 }, { "epoch": 1.3334614276107215, "grad_norm": 0.9778631913093287, "learning_rate": 6.800013014072566e-06, "loss": 0.098, "step": 3470 }, { "epoch": 1.3342299932750503, "grad_norm": 1.114897502662505, "learning_rate": 6.7958401196488e-06, "loss": 0.1121, "step": 3472 }, { "epoch": 1.3349985589393794, "grad_norm": 1.0822433993876044, "learning_rate": 6.791665788628811e-06, "loss": 0.1035, "step": 3474 }, { "epoch": 1.3357671246037084, "grad_norm": 1.2205156807495892, "learning_rate": 6.787490024351886e-06, "loss": 0.1045, "step": 3476 }, { "epoch": 1.3365356902680372, "grad_norm": 1.4128283398987758, "learning_rate": 6.783312830158463e-06, "loss": 0.1204, "step": 3478 }, { "epoch": 1.3373042559323662, "grad_norm": 1.1322069169180353, "learning_rate": 6.779134209390119e-06, "loss": 0.1019, "step": 3480 }, { "epoch": 1.338072821596695, "grad_norm": 1.382866626088024, "learning_rate": 6.774954165389574e-06, "loss": 0.1141, "step": 3482 }, { "epoch": 1.338841387261024, "grad_norm": 1.1807863007533579, "learning_rate": 6.770772701500688e-06, "loss": 0.1033, "step": 3484 }, { "epoch": 1.3396099529253531, "grad_norm": 1.1126309925602025, "learning_rate": 6.766589821068455e-06, "loss": 0.0897, "step": 3486 }, { "epoch": 1.340378518589682, "grad_norm": 1.0575851125889175, "learning_rate": 6.762405527439002e-06, "loss": 0.1215, "step": 3488 }, { "epoch": 1.341147084254011, "grad_norm": 1.1495913367732435, "learning_rate": 6.758219823959588e-06, "loss": 0.1108, "step": 3490 }, { "epoch": 1.3419156499183398, "grad_norm": 0.9137242883090909, "learning_rate": 6.754032713978595e-06, "loss": 0.098, "step": 3492 }, { "epoch": 1.3426842155826688, "grad_norm": 1.1638374242898861, "learning_rate": 6.749844200845541e-06, "loss": 0.1188, "step": 3494 }, { "epoch": 1.3434527812469979, "grad_norm": 1.2121378046315916, "learning_rate": 6.7456542879110554e-06, "loss": 0.115, "step": 3496 }, { "epoch": 1.3442213469113267, "grad_norm": 1.2397125583693025, "learning_rate": 6.741462978526892e-06, "loss": 0.1164, "step": 3498 }, { "epoch": 1.3449899125756557, "grad_norm": 1.0782582385761723, "learning_rate": 6.737270276045919e-06, "loss": 0.1033, "step": 3500 }, { "epoch": 1.3449899125756557, "eval_loss": 0.16024567186832428, "eval_runtime": 390.6968, "eval_samples_per_second": 47.364, "eval_steps_per_second": 5.923, "step": 3500 }, { "epoch": 1.3457584782399845, "grad_norm": 1.0741240990192813, "learning_rate": 6.733076183822127e-06, "loss": 0.1038, "step": 3502 }, { "epoch": 1.3465270439043135, "grad_norm": 1.0943067559079702, "learning_rate": 6.728880705210606e-06, "loss": 0.1093, "step": 3504 }, { "epoch": 1.3472956095686426, "grad_norm": 1.089833068399713, "learning_rate": 6.724683843567567e-06, "loss": 0.1263, "step": 3506 }, { "epoch": 1.3480641752329714, "grad_norm": 1.1246622280512546, "learning_rate": 6.72048560225032e-06, "loss": 0.0991, "step": 3508 }, { "epoch": 1.3488327408973004, "grad_norm": 0.9990368811628265, "learning_rate": 6.716285984617279e-06, "loss": 0.1058, "step": 3510 }, { "epoch": 1.3496013065616292, "grad_norm": 1.1561697222833236, "learning_rate": 6.712084994027963e-06, "loss": 0.1175, "step": 3512 }, { "epoch": 1.3503698722259583, "grad_norm": 1.066029855130969, "learning_rate": 6.707882633842986e-06, "loss": 0.1001, "step": 3514 }, { "epoch": 1.3511384378902873, "grad_norm": 1.2496332838900837, "learning_rate": 6.703678907424057e-06, "loss": 0.1002, "step": 3516 }, { "epoch": 1.3519070035546161, "grad_norm": 1.1053770811839736, "learning_rate": 6.699473818133983e-06, "loss": 0.1102, "step": 3518 }, { "epoch": 1.3526755692189452, "grad_norm": 1.107672699687679, "learning_rate": 6.695267369336656e-06, "loss": 0.1105, "step": 3520 }, { "epoch": 1.353444134883274, "grad_norm": 1.0826241300642498, "learning_rate": 6.6910595643970565e-06, "loss": 0.0989, "step": 3522 }, { "epoch": 1.354212700547603, "grad_norm": 1.0259464583593318, "learning_rate": 6.686850406681253e-06, "loss": 0.1049, "step": 3524 }, { "epoch": 1.354981266211932, "grad_norm": 1.0148196505032745, "learning_rate": 6.6826398995563916e-06, "loss": 0.111, "step": 3526 }, { "epoch": 1.3557498318762609, "grad_norm": 0.9624547744607049, "learning_rate": 6.678428046390699e-06, "loss": 0.1054, "step": 3528 }, { "epoch": 1.35651839754059, "grad_norm": 1.1993752879433752, "learning_rate": 6.674214850553484e-06, "loss": 0.1172, "step": 3530 }, { "epoch": 1.3572869632049187, "grad_norm": 1.1072774302736579, "learning_rate": 6.670000315415122e-06, "loss": 0.1059, "step": 3532 }, { "epoch": 1.3580555288692477, "grad_norm": 1.1604350432665413, "learning_rate": 6.665784444347063e-06, "loss": 0.1064, "step": 3534 }, { "epoch": 1.3588240945335768, "grad_norm": 1.191123687578186, "learning_rate": 6.661567240721829e-06, "loss": 0.1094, "step": 3536 }, { "epoch": 1.3595926601979056, "grad_norm": 1.125112859941398, "learning_rate": 6.657348707913e-06, "loss": 0.1032, "step": 3538 }, { "epoch": 1.3603612258622346, "grad_norm": 0.9690730150341403, "learning_rate": 6.653128849295228e-06, "loss": 0.0924, "step": 3540 }, { "epoch": 1.3611297915265634, "grad_norm": 1.1365753561461795, "learning_rate": 6.64890766824422e-06, "loss": 0.1049, "step": 3542 }, { "epoch": 1.3618983571908925, "grad_norm": 1.0016186335053132, "learning_rate": 6.644685168136742e-06, "loss": 0.1076, "step": 3544 }, { "epoch": 1.3626669228552215, "grad_norm": 0.9812801817529536, "learning_rate": 6.6404613523506155e-06, "loss": 0.0917, "step": 3546 }, { "epoch": 1.3634354885195503, "grad_norm": 1.2295864378791501, "learning_rate": 6.6362362242647175e-06, "loss": 0.1118, "step": 3548 }, { "epoch": 1.3642040541838794, "grad_norm": 1.2049793815166547, "learning_rate": 6.632009787258969e-06, "loss": 0.1021, "step": 3550 }, { "epoch": 1.3649726198482082, "grad_norm": 0.9932801035891162, "learning_rate": 6.627782044714339e-06, "loss": 0.0985, "step": 3552 }, { "epoch": 1.3657411855125372, "grad_norm": 1.1647515648479663, "learning_rate": 6.62355300001285e-06, "loss": 0.1135, "step": 3554 }, { "epoch": 1.3665097511768662, "grad_norm": 0.9925683729204299, "learning_rate": 6.619322656537552e-06, "loss": 0.1042, "step": 3556 }, { "epoch": 1.367278316841195, "grad_norm": 1.0267960652018984, "learning_rate": 6.615091017672546e-06, "loss": 0.0957, "step": 3558 }, { "epoch": 1.368046882505524, "grad_norm": 1.0532662513884856, "learning_rate": 6.610858086802963e-06, "loss": 0.0951, "step": 3560 }, { "epoch": 1.368815448169853, "grad_norm": 1.1082248687844498, "learning_rate": 6.606623867314965e-06, "loss": 0.1046, "step": 3562 }, { "epoch": 1.369584013834182, "grad_norm": 1.110636205877885, "learning_rate": 6.602388362595753e-06, "loss": 0.092, "step": 3564 }, { "epoch": 1.370352579498511, "grad_norm": 1.0511491323273428, "learning_rate": 6.598151576033552e-06, "loss": 0.105, "step": 3566 }, { "epoch": 1.3711211451628398, "grad_norm": 1.1733965513502098, "learning_rate": 6.593913511017609e-06, "loss": 0.1026, "step": 3568 }, { "epoch": 1.3718897108271688, "grad_norm": 1.0864525359771768, "learning_rate": 6.589674170938201e-06, "loss": 0.1046, "step": 3570 }, { "epoch": 1.3726582764914976, "grad_norm": 1.1695568387571893, "learning_rate": 6.5854335591866166e-06, "loss": 0.1083, "step": 3572 }, { "epoch": 1.3734268421558267, "grad_norm": 1.0180300308551624, "learning_rate": 6.5811916791551686e-06, "loss": 0.0922, "step": 3574 }, { "epoch": 1.3741954078201557, "grad_norm": 1.1574840107176207, "learning_rate": 6.576948534237183e-06, "loss": 0.0978, "step": 3576 }, { "epoch": 1.3749639734844845, "grad_norm": 1.1973928484970118, "learning_rate": 6.572704127826993e-06, "loss": 0.1114, "step": 3578 }, { "epoch": 1.3757325391488135, "grad_norm": 1.1017882456930859, "learning_rate": 6.568458463319948e-06, "loss": 0.112, "step": 3580 }, { "epoch": 1.3765011048131424, "grad_norm": 1.1779847906763161, "learning_rate": 6.564211544112398e-06, "loss": 0.1107, "step": 3582 }, { "epoch": 1.3772696704774714, "grad_norm": 1.2417773748028165, "learning_rate": 6.559963373601699e-06, "loss": 0.1109, "step": 3584 }, { "epoch": 1.3780382361418004, "grad_norm": 1.0428732095101327, "learning_rate": 6.555713955186209e-06, "loss": 0.0985, "step": 3586 }, { "epoch": 1.3788068018061292, "grad_norm": 0.9919687815110036, "learning_rate": 6.551463292265283e-06, "loss": 0.1026, "step": 3588 }, { "epoch": 1.3795753674704583, "grad_norm": 1.107766993905677, "learning_rate": 6.54721138823927e-06, "loss": 0.1175, "step": 3590 }, { "epoch": 1.380343933134787, "grad_norm": 0.9844604098535292, "learning_rate": 6.542958246509517e-06, "loss": 0.1089, "step": 3592 }, { "epoch": 1.3811124987991161, "grad_norm": 1.0613819652031307, "learning_rate": 6.538703870478354e-06, "loss": 0.1095, "step": 3594 }, { "epoch": 1.3818810644634452, "grad_norm": 1.049441657136399, "learning_rate": 6.5344482635491026e-06, "loss": 0.0959, "step": 3596 }, { "epoch": 1.382649630127774, "grad_norm": 1.047741993677556, "learning_rate": 6.530191429126071e-06, "loss": 0.094, "step": 3598 }, { "epoch": 1.383418195792103, "grad_norm": 1.0072441838066508, "learning_rate": 6.525933370614546e-06, "loss": 0.0887, "step": 3600 }, { "epoch": 1.3841867614564318, "grad_norm": 1.1695659719646245, "learning_rate": 6.521674091420793e-06, "loss": 0.1132, "step": 3602 }, { "epoch": 1.3849553271207609, "grad_norm": 1.0530973050556078, "learning_rate": 6.517413594952058e-06, "loss": 0.0915, "step": 3604 }, { "epoch": 1.38572389278509, "grad_norm": 1.0721775937951348, "learning_rate": 6.513151884616556e-06, "loss": 0.1143, "step": 3606 }, { "epoch": 1.3864924584494187, "grad_norm": 1.2288371300707386, "learning_rate": 6.508888963823476e-06, "loss": 0.1196, "step": 3608 }, { "epoch": 1.3872610241137477, "grad_norm": 1.118363108476703, "learning_rate": 6.504624835982973e-06, "loss": 0.1013, "step": 3610 }, { "epoch": 1.3880295897780766, "grad_norm": 1.0300261225167895, "learning_rate": 6.500359504506173e-06, "loss": 0.1135, "step": 3612 }, { "epoch": 1.3887981554424056, "grad_norm": 1.0733868739953827, "learning_rate": 6.496092972805157e-06, "loss": 0.0985, "step": 3614 }, { "epoch": 1.3895667211067346, "grad_norm": 1.159224268931053, "learning_rate": 6.491825244292971e-06, "loss": 0.1137, "step": 3616 }, { "epoch": 1.3903352867710634, "grad_norm": 0.9549161353951655, "learning_rate": 6.487556322383618e-06, "loss": 0.0959, "step": 3618 }, { "epoch": 1.3911038524353925, "grad_norm": 1.0767506921106442, "learning_rate": 6.483286210492057e-06, "loss": 0.1066, "step": 3620 }, { "epoch": 1.3918724180997213, "grad_norm": 1.0065096203247952, "learning_rate": 6.479014912034194e-06, "loss": 0.0953, "step": 3622 }, { "epoch": 1.3926409837640503, "grad_norm": 1.1803610958390895, "learning_rate": 6.474742430426888e-06, "loss": 0.1046, "step": 3624 }, { "epoch": 1.3934095494283794, "grad_norm": 0.9950809879971344, "learning_rate": 6.470468769087946e-06, "loss": 0.1072, "step": 3626 }, { "epoch": 1.3941781150927082, "grad_norm": 1.2056017296313122, "learning_rate": 6.466193931436115e-06, "loss": 0.0988, "step": 3628 }, { "epoch": 1.3949466807570372, "grad_norm": 1.0301932148644475, "learning_rate": 6.4619179208910834e-06, "loss": 0.0925, "step": 3630 }, { "epoch": 1.395715246421366, "grad_norm": 1.1392882414234315, "learning_rate": 6.457640740873481e-06, "loss": 0.1152, "step": 3632 }, { "epoch": 1.396483812085695, "grad_norm": 1.1523298411010923, "learning_rate": 6.45336239480487e-06, "loss": 0.0912, "step": 3634 }, { "epoch": 1.397252377750024, "grad_norm": 2.08830317129801, "learning_rate": 6.449082886107745e-06, "loss": 0.1047, "step": 3636 }, { "epoch": 1.398020943414353, "grad_norm": 1.1304617311761007, "learning_rate": 6.444802218205536e-06, "loss": 0.0994, "step": 3638 }, { "epoch": 1.398789509078682, "grad_norm": 0.9692181575643294, "learning_rate": 6.4405203945225936e-06, "loss": 0.1018, "step": 3640 }, { "epoch": 1.3995580747430107, "grad_norm": 0.9169919412216945, "learning_rate": 6.436237418484195e-06, "loss": 0.0923, "step": 3642 }, { "epoch": 1.4003266404073398, "grad_norm": 1.0678072137741828, "learning_rate": 6.431953293516542e-06, "loss": 0.1083, "step": 3644 }, { "epoch": 1.4010952060716688, "grad_norm": 1.0042928065878147, "learning_rate": 6.427668023046756e-06, "loss": 0.093, "step": 3646 }, { "epoch": 1.4018637717359976, "grad_norm": 1.0535465103257822, "learning_rate": 6.423381610502866e-06, "loss": 0.1019, "step": 3648 }, { "epoch": 1.4026323374003267, "grad_norm": 1.2184708145187475, "learning_rate": 6.4190940593138255e-06, "loss": 0.1054, "step": 3650 }, { "epoch": 1.4034009030646555, "grad_norm": 0.9471401778950829, "learning_rate": 6.414805372909495e-06, "loss": 0.0978, "step": 3652 }, { "epoch": 1.4041694687289845, "grad_norm": 0.9445487328525355, "learning_rate": 6.410515554720638e-06, "loss": 0.1027, "step": 3654 }, { "epoch": 1.4049380343933136, "grad_norm": 0.9955525577038581, "learning_rate": 6.4062246081789316e-06, "loss": 0.0999, "step": 3656 }, { "epoch": 1.4057066000576424, "grad_norm": 1.1916448120551637, "learning_rate": 6.4019325367169515e-06, "loss": 0.1109, "step": 3658 }, { "epoch": 1.4064751657219714, "grad_norm": 1.0615833066719316, "learning_rate": 6.397639343768171e-06, "loss": 0.1013, "step": 3660 }, { "epoch": 1.4072437313863002, "grad_norm": 1.1528907238241664, "learning_rate": 6.393345032766966e-06, "loss": 0.102, "step": 3662 }, { "epoch": 1.4080122970506292, "grad_norm": 1.0909562489778688, "learning_rate": 6.389049607148602e-06, "loss": 0.1069, "step": 3664 }, { "epoch": 1.4087808627149583, "grad_norm": 1.1105441424860834, "learning_rate": 6.384753070349239e-06, "loss": 0.0952, "step": 3666 }, { "epoch": 1.409549428379287, "grad_norm": 1.11743579955878, "learning_rate": 6.380455425805923e-06, "loss": 0.0923, "step": 3668 }, { "epoch": 1.4103179940436161, "grad_norm": 0.987884454671256, "learning_rate": 6.376156676956591e-06, "loss": 0.0943, "step": 3670 }, { "epoch": 1.411086559707945, "grad_norm": 1.159146280348909, "learning_rate": 6.3718568272400594e-06, "loss": 0.1066, "step": 3672 }, { "epoch": 1.411855125372274, "grad_norm": 1.0835601879807268, "learning_rate": 6.367555880096025e-06, "loss": 0.1065, "step": 3674 }, { "epoch": 1.412623691036603, "grad_norm": 1.117266679224832, "learning_rate": 6.3632538389650655e-06, "loss": 0.0983, "step": 3676 }, { "epoch": 1.4133922567009318, "grad_norm": 1.2224781017359008, "learning_rate": 6.358950707288631e-06, "loss": 0.1096, "step": 3678 }, { "epoch": 1.4141608223652609, "grad_norm": 0.9492177707481574, "learning_rate": 6.354646488509044e-06, "loss": 0.0894, "step": 3680 }, { "epoch": 1.4149293880295897, "grad_norm": 1.073076671687902, "learning_rate": 6.350341186069502e-06, "loss": 0.0965, "step": 3682 }, { "epoch": 1.4156979536939187, "grad_norm": 1.0806095410247387, "learning_rate": 6.34603480341406e-06, "loss": 0.0941, "step": 3684 }, { "epoch": 1.4164665193582477, "grad_norm": 1.259201510944982, "learning_rate": 6.341727343987645e-06, "loss": 0.123, "step": 3686 }, { "epoch": 1.4172350850225766, "grad_norm": 1.0625596898505931, "learning_rate": 6.33741881123604e-06, "loss": 0.0957, "step": 3688 }, { "epoch": 1.4180036506869056, "grad_norm": 1.0562913275108412, "learning_rate": 6.333109208605892e-06, "loss": 0.1016, "step": 3690 }, { "epoch": 1.4187722163512344, "grad_norm": 1.0106747116549863, "learning_rate": 6.328798539544702e-06, "loss": 0.107, "step": 3692 }, { "epoch": 1.4195407820155634, "grad_norm": 1.1921838484379144, "learning_rate": 6.324486807500818e-06, "loss": 0.1084, "step": 3694 }, { "epoch": 1.4203093476798925, "grad_norm": 1.261257561682368, "learning_rate": 6.320174015923448e-06, "loss": 0.1008, "step": 3696 }, { "epoch": 1.4210779133442213, "grad_norm": 1.0703088080570242, "learning_rate": 6.315860168262643e-06, "loss": 0.0979, "step": 3698 }, { "epoch": 1.4218464790085503, "grad_norm": 1.1541781393359445, "learning_rate": 6.311545267969296e-06, "loss": 0.0904, "step": 3700 }, { "epoch": 1.4226150446728791, "grad_norm": 1.0604643277461208, "learning_rate": 6.307229318495145e-06, "loss": 0.097, "step": 3702 }, { "epoch": 1.4233836103372082, "grad_norm": 1.219703628743773, "learning_rate": 6.302912323292773e-06, "loss": 0.1188, "step": 3704 }, { "epoch": 1.4241521760015372, "grad_norm": 1.158441096005232, "learning_rate": 6.298594285815585e-06, "loss": 0.112, "step": 3706 }, { "epoch": 1.424920741665866, "grad_norm": 1.2547666803289148, "learning_rate": 6.294275209517834e-06, "loss": 0.1031, "step": 3708 }, { "epoch": 1.425689307330195, "grad_norm": 0.921713670668529, "learning_rate": 6.289955097854599e-06, "loss": 0.109, "step": 3710 }, { "epoch": 1.4264578729945239, "grad_norm": 1.1196293958984331, "learning_rate": 6.285633954281785e-06, "loss": 0.1143, "step": 3712 }, { "epoch": 1.427226438658853, "grad_norm": 1.2331574382402006, "learning_rate": 6.281311782256123e-06, "loss": 0.1133, "step": 3714 }, { "epoch": 1.427995004323182, "grad_norm": 0.9466890782752775, "learning_rate": 6.276988585235172e-06, "loss": 0.1048, "step": 3716 }, { "epoch": 1.4287635699875108, "grad_norm": 1.127432854656734, "learning_rate": 6.2726643666773034e-06, "loss": 0.1001, "step": 3718 }, { "epoch": 1.4295321356518398, "grad_norm": 1.2067773459706788, "learning_rate": 6.2683391300417105e-06, "loss": 0.1068, "step": 3720 }, { "epoch": 1.4303007013161686, "grad_norm": 1.056041784591762, "learning_rate": 6.2640128787884005e-06, "loss": 0.0976, "step": 3722 }, { "epoch": 1.4310692669804976, "grad_norm": 1.1640203359625638, "learning_rate": 6.259685616378192e-06, "loss": 0.1065, "step": 3724 }, { "epoch": 1.4318378326448267, "grad_norm": 1.11969019655386, "learning_rate": 6.255357346272708e-06, "loss": 0.0998, "step": 3726 }, { "epoch": 1.4326063983091555, "grad_norm": 1.090765640007368, "learning_rate": 6.251028071934388e-06, "loss": 0.109, "step": 3728 }, { "epoch": 1.4333749639734845, "grad_norm": 1.025582893524102, "learning_rate": 6.246697796826464e-06, "loss": 0.1004, "step": 3730 }, { "epoch": 1.4341435296378133, "grad_norm": 0.892029526081978, "learning_rate": 6.242366524412975e-06, "loss": 0.0976, "step": 3732 }, { "epoch": 1.4349120953021424, "grad_norm": 1.0463784281651205, "learning_rate": 6.238034258158756e-06, "loss": 0.101, "step": 3734 }, { "epoch": 1.4356806609664714, "grad_norm": 1.2598998147669427, "learning_rate": 6.233701001529435e-06, "loss": 0.1149, "step": 3736 }, { "epoch": 1.4364492266308002, "grad_norm": 1.0081974127272186, "learning_rate": 6.229366757991437e-06, "loss": 0.1023, "step": 3738 }, { "epoch": 1.4372177922951292, "grad_norm": 1.1480901400270764, "learning_rate": 6.225031531011973e-06, "loss": 0.1018, "step": 3740 }, { "epoch": 1.437986357959458, "grad_norm": 0.9490395314194845, "learning_rate": 6.22069532405904e-06, "loss": 0.0876, "step": 3742 }, { "epoch": 1.438754923623787, "grad_norm": 1.1656532253933602, "learning_rate": 6.216358140601422e-06, "loss": 0.116, "step": 3744 }, { "epoch": 1.4395234892881161, "grad_norm": 1.2102678075947397, "learning_rate": 6.212019984108681e-06, "loss": 0.1013, "step": 3746 }, { "epoch": 1.440292054952445, "grad_norm": 1.0948358527547788, "learning_rate": 6.207680858051158e-06, "loss": 0.0918, "step": 3748 }, { "epoch": 1.441060620616774, "grad_norm": 1.160439744807667, "learning_rate": 6.203340765899976e-06, "loss": 0.1057, "step": 3750 }, { "epoch": 1.4418291862811028, "grad_norm": 1.1710475548229597, "learning_rate": 6.198999711127018e-06, "loss": 0.0939, "step": 3752 }, { "epoch": 1.4425977519454318, "grad_norm": 1.102741981324908, "learning_rate": 6.1946576972049484e-06, "loss": 0.0951, "step": 3754 }, { "epoch": 1.4433663176097609, "grad_norm": 1.21792257926401, "learning_rate": 6.190314727607196e-06, "loss": 0.1023, "step": 3756 }, { "epoch": 1.4441348832740897, "grad_norm": 1.182142894642489, "learning_rate": 6.185970805807951e-06, "loss": 0.1093, "step": 3758 }, { "epoch": 1.4449034489384187, "grad_norm": 1.198584878197929, "learning_rate": 6.181625935282167e-06, "loss": 0.101, "step": 3760 }, { "epoch": 1.4456720146027475, "grad_norm": 1.0878463332685986, "learning_rate": 6.177280119505559e-06, "loss": 0.0947, "step": 3762 }, { "epoch": 1.4464405802670766, "grad_norm": 1.2087850480297482, "learning_rate": 6.172933361954594e-06, "loss": 0.108, "step": 3764 }, { "epoch": 1.4472091459314056, "grad_norm": 1.1079845511628184, "learning_rate": 6.168585666106495e-06, "loss": 0.1074, "step": 3766 }, { "epoch": 1.4479777115957344, "grad_norm": 1.0365549724380647, "learning_rate": 6.164237035439235e-06, "loss": 0.1022, "step": 3768 }, { "epoch": 1.4487462772600634, "grad_norm": 1.203625347360839, "learning_rate": 6.1598874734315335e-06, "loss": 0.0944, "step": 3770 }, { "epoch": 1.4495148429243923, "grad_norm": 1.1514316792883843, "learning_rate": 6.155536983562858e-06, "loss": 0.1075, "step": 3772 }, { "epoch": 1.4502834085887213, "grad_norm": 1.1181898482337458, "learning_rate": 6.151185569313418e-06, "loss": 0.1093, "step": 3774 }, { "epoch": 1.4510519742530503, "grad_norm": 1.0395611705504353, "learning_rate": 6.146833234164158e-06, "loss": 0.0961, "step": 3776 }, { "epoch": 1.4518205399173791, "grad_norm": 1.0888503284153743, "learning_rate": 6.142479981596764e-06, "loss": 0.1109, "step": 3778 }, { "epoch": 1.4525891055817082, "grad_norm": 1.1078687476511964, "learning_rate": 6.138125815093654e-06, "loss": 0.099, "step": 3780 }, { "epoch": 1.453357671246037, "grad_norm": 1.0329227601729036, "learning_rate": 6.133770738137975e-06, "loss": 0.0923, "step": 3782 }, { "epoch": 1.454126236910366, "grad_norm": 1.0742968578199084, "learning_rate": 6.129414754213608e-06, "loss": 0.0928, "step": 3784 }, { "epoch": 1.454894802574695, "grad_norm": 1.0736124840076708, "learning_rate": 6.125057866805155e-06, "loss": 0.1082, "step": 3786 }, { "epoch": 1.4556633682390239, "grad_norm": 1.1128696606277442, "learning_rate": 6.12070007939794e-06, "loss": 0.1035, "step": 3788 }, { "epoch": 1.456431933903353, "grad_norm": 1.101062953369874, "learning_rate": 6.116341395478012e-06, "loss": 0.0957, "step": 3790 }, { "epoch": 1.4572004995676817, "grad_norm": 1.1216886115838474, "learning_rate": 6.1119818185321315e-06, "loss": 0.1055, "step": 3792 }, { "epoch": 1.4579690652320108, "grad_norm": 1.2777112120672336, "learning_rate": 6.1076213520477745e-06, "loss": 0.1169, "step": 3794 }, { "epoch": 1.4587376308963398, "grad_norm": 1.2310830689447756, "learning_rate": 6.103259999513135e-06, "loss": 0.0992, "step": 3796 }, { "epoch": 1.4595061965606686, "grad_norm": 1.1029882962706423, "learning_rate": 6.098897764417106e-06, "loss": 0.0995, "step": 3798 }, { "epoch": 1.4602747622249976, "grad_norm": 1.1202044959124011, "learning_rate": 6.0945346502492945e-06, "loss": 0.0975, "step": 3800 }, { "epoch": 1.4610433278893264, "grad_norm": 1.0994835267093073, "learning_rate": 6.090170660500006e-06, "loss": 0.1035, "step": 3802 }, { "epoch": 1.4618118935536555, "grad_norm": 1.0002019479570095, "learning_rate": 6.085805798660247e-06, "loss": 0.0977, "step": 3804 }, { "epoch": 1.4625804592179845, "grad_norm": 1.1405188355838538, "learning_rate": 6.0814400682217236e-06, "loss": 0.101, "step": 3806 }, { "epoch": 1.4633490248823133, "grad_norm": 1.1025288053178182, "learning_rate": 6.077073472676835e-06, "loss": 0.1076, "step": 3808 }, { "epoch": 1.4641175905466424, "grad_norm": 1.18314579822248, "learning_rate": 6.072706015518675e-06, "loss": 0.1029, "step": 3810 }, { "epoch": 1.4648861562109712, "grad_norm": 1.0430136579135172, "learning_rate": 6.068337700241024e-06, "loss": 0.0954, "step": 3812 }, { "epoch": 1.4656547218753002, "grad_norm": 1.0758458463351674, "learning_rate": 6.063968530338349e-06, "loss": 0.1095, "step": 3814 }, { "epoch": 1.4664232875396292, "grad_norm": 1.0394557777419875, "learning_rate": 6.059598509305802e-06, "loss": 0.1059, "step": 3816 }, { "epoch": 1.467191853203958, "grad_norm": 1.1004510695162526, "learning_rate": 6.055227640639213e-06, "loss": 0.1028, "step": 3818 }, { "epoch": 1.467960418868287, "grad_norm": 0.970486562436618, "learning_rate": 6.0508559278350956e-06, "loss": 0.0886, "step": 3820 }, { "epoch": 1.468728984532616, "grad_norm": 1.0883702886243292, "learning_rate": 6.046483374390633e-06, "loss": 0.1027, "step": 3822 }, { "epoch": 1.469497550196945, "grad_norm": 1.0252582803141115, "learning_rate": 6.042109983803685e-06, "loss": 0.0991, "step": 3824 }, { "epoch": 1.470266115861274, "grad_norm": 1.2508942560032796, "learning_rate": 6.037735759572777e-06, "loss": 0.1108, "step": 3826 }, { "epoch": 1.4710346815256028, "grad_norm": 1.0468580153637583, "learning_rate": 6.033360705197105e-06, "loss": 0.1027, "step": 3828 }, { "epoch": 1.4718032471899318, "grad_norm": 1.2265584865001273, "learning_rate": 6.0289848241765295e-06, "loss": 0.1016, "step": 3830 }, { "epoch": 1.4725718128542606, "grad_norm": 1.106068496941919, "learning_rate": 6.024608120011567e-06, "loss": 0.1233, "step": 3832 }, { "epoch": 1.4733403785185897, "grad_norm": 1.0442567475851559, "learning_rate": 6.020230596203397e-06, "loss": 0.0997, "step": 3834 }, { "epoch": 1.4741089441829187, "grad_norm": 1.0271806501408283, "learning_rate": 6.015852256253855e-06, "loss": 0.1005, "step": 3836 }, { "epoch": 1.4748775098472475, "grad_norm": 1.104396675693817, "learning_rate": 6.011473103665427e-06, "loss": 0.098, "step": 3838 }, { "epoch": 1.4756460755115766, "grad_norm": 1.151240020274314, "learning_rate": 6.007093141941249e-06, "loss": 0.0968, "step": 3840 }, { "epoch": 1.4764146411759054, "grad_norm": 0.9074892033812044, "learning_rate": 6.002712374585108e-06, "loss": 0.0978, "step": 3842 }, { "epoch": 1.4771832068402344, "grad_norm": 1.0638079839241026, "learning_rate": 5.99833080510143e-06, "loss": 0.098, "step": 3844 }, { "epoch": 1.4779517725045634, "grad_norm": 1.1250650600934071, "learning_rate": 5.993948436995286e-06, "loss": 0.1038, "step": 3846 }, { "epoch": 1.4787203381688923, "grad_norm": 1.0561466502625547, "learning_rate": 5.989565273772387e-06, "loss": 0.091, "step": 3848 }, { "epoch": 1.4794889038332213, "grad_norm": 1.1511396627611667, "learning_rate": 5.985181318939078e-06, "loss": 0.0931, "step": 3850 }, { "epoch": 1.48025746949755, "grad_norm": 1.1012646986725858, "learning_rate": 5.980796576002337e-06, "loss": 0.0928, "step": 3852 }, { "epoch": 1.4810260351618791, "grad_norm": 1.1737520408035, "learning_rate": 5.9764110484697724e-06, "loss": 0.1113, "step": 3854 }, { "epoch": 1.4817946008262082, "grad_norm": 1.1893086850380306, "learning_rate": 5.972024739849622e-06, "loss": 0.1109, "step": 3856 }, { "epoch": 1.482563166490537, "grad_norm": 0.9191952639030616, "learning_rate": 5.967637653650746e-06, "loss": 0.089, "step": 3858 }, { "epoch": 1.483331732154866, "grad_norm": 1.1410369881678053, "learning_rate": 5.963249793382631e-06, "loss": 0.1151, "step": 3860 }, { "epoch": 1.4841002978191948, "grad_norm": 1.18440137726923, "learning_rate": 5.958861162555376e-06, "loss": 0.1113, "step": 3862 }, { "epoch": 1.4848688634835239, "grad_norm": 1.0215086605742358, "learning_rate": 5.954471764679702e-06, "loss": 0.1001, "step": 3864 }, { "epoch": 1.485637429147853, "grad_norm": 1.0599184080922641, "learning_rate": 5.9500816032669436e-06, "loss": 0.1128, "step": 3866 }, { "epoch": 1.4864059948121817, "grad_norm": 1.08470438932806, "learning_rate": 5.945690681829042e-06, "loss": 0.1001, "step": 3868 }, { "epoch": 1.4871745604765108, "grad_norm": 0.9977848002925234, "learning_rate": 5.94129900387855e-06, "loss": 0.0958, "step": 3870 }, { "epoch": 1.4879431261408396, "grad_norm": 1.1754565675112634, "learning_rate": 5.936906572928625e-06, "loss": 0.1013, "step": 3872 }, { "epoch": 1.4887116918051686, "grad_norm": 1.148457692856843, "learning_rate": 5.932513392493022e-06, "loss": 0.1055, "step": 3874 }, { "epoch": 1.4894802574694976, "grad_norm": 1.1134281416143208, "learning_rate": 5.928119466086106e-06, "loss": 0.1056, "step": 3876 }, { "epoch": 1.4902488231338265, "grad_norm": 1.0207128219230177, "learning_rate": 5.92372479722283e-06, "loss": 0.0933, "step": 3878 }, { "epoch": 1.4910173887981555, "grad_norm": 1.0002542948056468, "learning_rate": 5.91932938941874e-06, "loss": 0.1062, "step": 3880 }, { "epoch": 1.4917859544624843, "grad_norm": 1.1374502885376412, "learning_rate": 5.9149332461899815e-06, "loss": 0.1027, "step": 3882 }, { "epoch": 1.4925545201268133, "grad_norm": 1.2890511101788094, "learning_rate": 5.910536371053281e-06, "loss": 0.0957, "step": 3884 }, { "epoch": 1.4933230857911424, "grad_norm": 0.9614053065331762, "learning_rate": 5.906138767525952e-06, "loss": 0.0838, "step": 3886 }, { "epoch": 1.4940916514554712, "grad_norm": 1.1227883430604573, "learning_rate": 5.901740439125893e-06, "loss": 0.0938, "step": 3888 }, { "epoch": 1.4948602171198002, "grad_norm": 1.124577160021568, "learning_rate": 5.897341389371583e-06, "loss": 0.1072, "step": 3890 }, { "epoch": 1.495628782784129, "grad_norm": 1.0602606795313014, "learning_rate": 5.8929416217820715e-06, "loss": 0.102, "step": 3892 }, { "epoch": 1.496397348448458, "grad_norm": 1.1026140514678286, "learning_rate": 5.888541139876989e-06, "loss": 0.0999, "step": 3894 }, { "epoch": 1.497165914112787, "grad_norm": 1.1120264771487292, "learning_rate": 5.884139947176535e-06, "loss": 0.1101, "step": 3896 }, { "epoch": 1.497934479777116, "grad_norm": 1.1004499091483855, "learning_rate": 5.879738047201476e-06, "loss": 0.0915, "step": 3898 }, { "epoch": 1.498703045441445, "grad_norm": 1.1883542787342556, "learning_rate": 5.8753354434731495e-06, "loss": 0.1024, "step": 3900 }, { "epoch": 1.4994716111057738, "grad_norm": 1.1881404259472568, "learning_rate": 5.87093213951345e-06, "loss": 0.0933, "step": 3902 }, { "epoch": 1.5002401767701028, "grad_norm": 1.0926658514443548, "learning_rate": 5.866528138844834e-06, "loss": 0.1146, "step": 3904 }, { "epoch": 1.5010087424344318, "grad_norm": 1.2952104988303337, "learning_rate": 5.862123444990319e-06, "loss": 0.1098, "step": 3906 }, { "epoch": 1.5017773080987606, "grad_norm": 1.2185537268155087, "learning_rate": 5.857718061473471e-06, "loss": 0.1075, "step": 3908 }, { "epoch": 1.5025458737630897, "grad_norm": 1.120409007824226, "learning_rate": 5.853311991818411e-06, "loss": 0.1142, "step": 3910 }, { "epoch": 1.5033144394274185, "grad_norm": 1.1380104507060913, "learning_rate": 5.84890523954981e-06, "loss": 0.1023, "step": 3912 }, { "epoch": 1.5040830050917475, "grad_norm": 1.1167825348071683, "learning_rate": 5.844497808192882e-06, "loss": 0.1139, "step": 3914 }, { "epoch": 1.5048515707560766, "grad_norm": 1.3082901712886794, "learning_rate": 5.840089701273389e-06, "loss": 0.1117, "step": 3916 }, { "epoch": 1.5056201364204054, "grad_norm": 1.047177511551079, "learning_rate": 5.835680922317628e-06, "loss": 0.1057, "step": 3918 }, { "epoch": 1.5063887020847344, "grad_norm": 1.0565241318834728, "learning_rate": 5.831271474852434e-06, "loss": 0.0941, "step": 3920 }, { "epoch": 1.5071572677490632, "grad_norm": 0.9916532632548073, "learning_rate": 5.826861362405183e-06, "loss": 0.1003, "step": 3922 }, { "epoch": 1.5079258334133923, "grad_norm": 0.9579420060145515, "learning_rate": 5.822450588503775e-06, "loss": 0.1043, "step": 3924 }, { "epoch": 1.5086943990777213, "grad_norm": 1.1449719407126229, "learning_rate": 5.818039156676644e-06, "loss": 0.1035, "step": 3926 }, { "epoch": 1.50946296474205, "grad_norm": 0.9780284666004697, "learning_rate": 5.81362707045275e-06, "loss": 0.0967, "step": 3928 }, { "epoch": 1.5102315304063791, "grad_norm": 1.069797388376964, "learning_rate": 5.809214333361575e-06, "loss": 0.1008, "step": 3930 }, { "epoch": 1.511000096070708, "grad_norm": 1.1090718778401774, "learning_rate": 5.804800948933121e-06, "loss": 0.1103, "step": 3932 }, { "epoch": 1.511768661735037, "grad_norm": 0.9829633164637174, "learning_rate": 5.80038692069791e-06, "loss": 0.0982, "step": 3934 }, { "epoch": 1.512537227399366, "grad_norm": 1.0562128033815545, "learning_rate": 5.795972252186979e-06, "loss": 0.0965, "step": 3936 }, { "epoch": 1.5133057930636948, "grad_norm": 1.0121557027671988, "learning_rate": 5.791556946931874e-06, "loss": 0.0933, "step": 3938 }, { "epoch": 1.5140743587280239, "grad_norm": 1.1976917663937399, "learning_rate": 5.787141008464657e-06, "loss": 0.1169, "step": 3940 }, { "epoch": 1.5148429243923527, "grad_norm": 1.1568289660804782, "learning_rate": 5.7827244403178875e-06, "loss": 0.0941, "step": 3942 }, { "epoch": 1.5156114900566817, "grad_norm": 1.0976743111067402, "learning_rate": 5.778307246024635e-06, "loss": 0.0941, "step": 3944 }, { "epoch": 1.5163800557210108, "grad_norm": 1.1085722649871232, "learning_rate": 5.773889429118469e-06, "loss": 0.0954, "step": 3946 }, { "epoch": 1.5171486213853396, "grad_norm": 1.158295600198378, "learning_rate": 5.769470993133457e-06, "loss": 0.1028, "step": 3948 }, { "epoch": 1.5179171870496686, "grad_norm": 1.0464943298564833, "learning_rate": 5.765051941604157e-06, "loss": 0.1, "step": 3950 }, { "epoch": 1.5186857527139974, "grad_norm": 1.2343915577082534, "learning_rate": 5.76063227806563e-06, "loss": 0.1083, "step": 3952 }, { "epoch": 1.5194543183783265, "grad_norm": 1.1337147024125651, "learning_rate": 5.7562120060534145e-06, "loss": 0.1044, "step": 3954 }, { "epoch": 1.5202228840426555, "grad_norm": 1.1595712532641342, "learning_rate": 5.751791129103545e-06, "loss": 0.1017, "step": 3956 }, { "epoch": 1.5209914497069843, "grad_norm": 1.2252541382581092, "learning_rate": 5.747369650752535e-06, "loss": 0.1139, "step": 3958 }, { "epoch": 1.5217600153713133, "grad_norm": 1.1534897058316913, "learning_rate": 5.7429475745373786e-06, "loss": 0.1019, "step": 3960 }, { "epoch": 1.5225285810356421, "grad_norm": 1.144614063825106, "learning_rate": 5.738524903995553e-06, "loss": 0.11, "step": 3962 }, { "epoch": 1.5232971466999712, "grad_norm": 0.939276901594189, "learning_rate": 5.734101642665005e-06, "loss": 0.0768, "step": 3964 }, { "epoch": 1.5240657123643002, "grad_norm": 1.255315536564419, "learning_rate": 5.729677794084159e-06, "loss": 0.1041, "step": 3966 }, { "epoch": 1.524834278028629, "grad_norm": 1.1709484498873615, "learning_rate": 5.725253361791906e-06, "loss": 0.1072, "step": 3968 }, { "epoch": 1.525602843692958, "grad_norm": 1.0529678780446243, "learning_rate": 5.720828349327605e-06, "loss": 0.1012, "step": 3970 }, { "epoch": 1.5263714093572869, "grad_norm": 1.0323426051770197, "learning_rate": 5.716402760231079e-06, "loss": 0.0847, "step": 3972 }, { "epoch": 1.527139975021616, "grad_norm": 1.1913103230346327, "learning_rate": 5.711976598042612e-06, "loss": 0.1034, "step": 3974 }, { "epoch": 1.527908540685945, "grad_norm": 0.9992001125835379, "learning_rate": 5.707549866302948e-06, "loss": 0.1025, "step": 3976 }, { "epoch": 1.5286771063502738, "grad_norm": 1.3366286828263152, "learning_rate": 5.703122568553283e-06, "loss": 0.1023, "step": 3978 }, { "epoch": 1.5294456720146028, "grad_norm": 1.2291331917564936, "learning_rate": 5.698694708335271e-06, "loss": 0.1087, "step": 3980 }, { "epoch": 1.5302142376789316, "grad_norm": 1.0429529122734822, "learning_rate": 5.694266289191011e-06, "loss": 0.097, "step": 3982 }, { "epoch": 1.5309828033432606, "grad_norm": 1.149857542112524, "learning_rate": 5.689837314663051e-06, "loss": 0.1123, "step": 3984 }, { "epoch": 1.5317513690075897, "grad_norm": 1.0377043574129101, "learning_rate": 5.685407788294386e-06, "loss": 0.0956, "step": 3986 }, { "epoch": 1.5325199346719185, "grad_norm": 1.1569570187750808, "learning_rate": 5.680977713628447e-06, "loss": 0.0973, "step": 3988 }, { "epoch": 1.5332885003362475, "grad_norm": 1.189388301198451, "learning_rate": 5.6765470942091084e-06, "loss": 0.1001, "step": 3990 }, { "epoch": 1.5340570660005763, "grad_norm": 1.0431818020505303, "learning_rate": 5.672115933580679e-06, "loss": 0.0978, "step": 3992 }, { "epoch": 1.5348256316649054, "grad_norm": 1.0908162940919754, "learning_rate": 5.667684235287898e-06, "loss": 0.0958, "step": 3994 }, { "epoch": 1.5355941973292344, "grad_norm": 1.051052444137121, "learning_rate": 5.663252002875938e-06, "loss": 0.0996, "step": 3996 }, { "epoch": 1.5363627629935632, "grad_norm": 0.8759797874344064, "learning_rate": 5.658819239890398e-06, "loss": 0.0864, "step": 3998 }, { "epoch": 1.5371313286578923, "grad_norm": 1.1757604644786424, "learning_rate": 5.654385949877299e-06, "loss": 0.1145, "step": 4000 }, { "epoch": 1.5371313286578923, "eval_loss": 0.1557261347770691, "eval_runtime": 390.0565, "eval_samples_per_second": 47.442, "eval_steps_per_second": 5.932, "step": 4000 }, { "epoch": 1.537899894322221, "grad_norm": 0.8996625828127834, "learning_rate": 5.64995213638309e-06, "loss": 0.1085, "step": 4002 }, { "epoch": 1.53866845998655, "grad_norm": 1.0833568918259162, "learning_rate": 5.645517802954631e-06, "loss": 0.1004, "step": 4004 }, { "epoch": 1.5394370256508791, "grad_norm": 1.1933465078361665, "learning_rate": 5.641082953139201e-06, "loss": 0.1152, "step": 4006 }, { "epoch": 1.540205591315208, "grad_norm": 1.075858911215391, "learning_rate": 5.636647590484496e-06, "loss": 0.0935, "step": 4008 }, { "epoch": 1.540974156979537, "grad_norm": 1.0922575588786656, "learning_rate": 5.632211718538615e-06, "loss": 0.0907, "step": 4010 }, { "epoch": 1.5417427226438658, "grad_norm": 1.0777041268907455, "learning_rate": 5.62777534085007e-06, "loss": 0.1079, "step": 4012 }, { "epoch": 1.5425112883081948, "grad_norm": 1.0896959639749095, "learning_rate": 5.623338460967775e-06, "loss": 0.0957, "step": 4014 }, { "epoch": 1.5432798539725239, "grad_norm": 1.1573987992171724, "learning_rate": 5.618901082441048e-06, "loss": 0.113, "step": 4016 }, { "epoch": 1.5440484196368527, "grad_norm": 0.9747224369775391, "learning_rate": 5.6144632088196015e-06, "loss": 0.0941, "step": 4018 }, { "epoch": 1.5448169853011817, "grad_norm": 1.0404214246825718, "learning_rate": 5.610024843653549e-06, "loss": 0.1046, "step": 4020 }, { "epoch": 1.5455855509655105, "grad_norm": 1.1705670923885618, "learning_rate": 5.605585990493396e-06, "loss": 0.1071, "step": 4022 }, { "epoch": 1.5463541166298396, "grad_norm": 1.0991822502593884, "learning_rate": 5.601146652890035e-06, "loss": 0.0979, "step": 4024 }, { "epoch": 1.5471226822941686, "grad_norm": 1.1691193930941142, "learning_rate": 5.5967068343947505e-06, "loss": 0.0993, "step": 4026 }, { "epoch": 1.5478912479584974, "grad_norm": 1.0701538517992706, "learning_rate": 5.592266538559208e-06, "loss": 0.0922, "step": 4028 }, { "epoch": 1.5486598136228265, "grad_norm": 1.0770291994593728, "learning_rate": 5.587825768935458e-06, "loss": 0.0907, "step": 4030 }, { "epoch": 1.5494283792871553, "grad_norm": 1.0178953735199736, "learning_rate": 5.583384529075928e-06, "loss": 0.096, "step": 4032 }, { "epoch": 1.5501969449514843, "grad_norm": 1.058174754723859, "learning_rate": 5.578942822533422e-06, "loss": 0.1099, "step": 4034 }, { "epoch": 1.5509655106158133, "grad_norm": 1.1683875547883316, "learning_rate": 5.574500652861118e-06, "loss": 0.1042, "step": 4036 }, { "epoch": 1.5517340762801421, "grad_norm": 1.1368401069114986, "learning_rate": 5.570058023612564e-06, "loss": 0.1006, "step": 4038 }, { "epoch": 1.5525026419444712, "grad_norm": 1.041929511709789, "learning_rate": 5.565614938341676e-06, "loss": 0.0924, "step": 4040 }, { "epoch": 1.5532712076088, "grad_norm": 0.988260257315592, "learning_rate": 5.561171400602733e-06, "loss": 0.0942, "step": 4042 }, { "epoch": 1.554039773273129, "grad_norm": 1.0840662169816615, "learning_rate": 5.55672741395038e-06, "loss": 0.0996, "step": 4044 }, { "epoch": 1.554808338937458, "grad_norm": 1.0345981817744938, "learning_rate": 5.552282981939616e-06, "loss": 0.092, "step": 4046 }, { "epoch": 1.5555769046017869, "grad_norm": 1.0793543486112303, "learning_rate": 5.547838108125801e-06, "loss": 0.0991, "step": 4048 }, { "epoch": 1.556345470266116, "grad_norm": 1.0518170836084724, "learning_rate": 5.5433927960646446e-06, "loss": 0.0957, "step": 4050 }, { "epoch": 1.5571140359304447, "grad_norm": 1.1323081323093318, "learning_rate": 5.538947049312209e-06, "loss": 0.0966, "step": 4052 }, { "epoch": 1.5578826015947738, "grad_norm": 1.0796498818721039, "learning_rate": 5.534500871424905e-06, "loss": 0.0915, "step": 4054 }, { "epoch": 1.5586511672591028, "grad_norm": 1.0610004303285088, "learning_rate": 5.530054265959486e-06, "loss": 0.1084, "step": 4056 }, { "epoch": 1.5594197329234316, "grad_norm": 0.9038863914353208, "learning_rate": 5.525607236473047e-06, "loss": 0.0828, "step": 4058 }, { "epoch": 1.5601882985877606, "grad_norm": 1.10444210118182, "learning_rate": 5.521159786523027e-06, "loss": 0.0898, "step": 4060 }, { "epoch": 1.5609568642520895, "grad_norm": 1.0695899165005824, "learning_rate": 5.516711919667197e-06, "loss": 0.0919, "step": 4062 }, { "epoch": 1.5617254299164185, "grad_norm": 1.1734199530904525, "learning_rate": 5.5122636394636595e-06, "loss": 0.0985, "step": 4064 }, { "epoch": 1.5624939955807475, "grad_norm": 1.0212608725368688, "learning_rate": 5.5078149494708545e-06, "loss": 0.0957, "step": 4066 }, { "epoch": 1.5632625612450763, "grad_norm": 1.324771993731093, "learning_rate": 5.5033658532475465e-06, "loss": 0.1171, "step": 4068 }, { "epoch": 1.5640311269094052, "grad_norm": 1.1002570486689707, "learning_rate": 5.49891635435282e-06, "loss": 0.0911, "step": 4070 }, { "epoch": 1.5647996925737342, "grad_norm": 1.0477658173986293, "learning_rate": 5.49446645634609e-06, "loss": 0.1051, "step": 4072 }, { "epoch": 1.5655682582380632, "grad_norm": 1.1196079301268613, "learning_rate": 5.4900161627870875e-06, "loss": 0.1183, "step": 4074 }, { "epoch": 1.5663368239023923, "grad_norm": 1.0455662753315662, "learning_rate": 5.485565477235854e-06, "loss": 0.0979, "step": 4076 }, { "epoch": 1.567105389566721, "grad_norm": 1.0922388562647183, "learning_rate": 5.481114403252755e-06, "loss": 0.1063, "step": 4078 }, { "epoch": 1.5678739552310499, "grad_norm": 1.0393579479691581, "learning_rate": 5.476662944398462e-06, "loss": 0.0954, "step": 4080 }, { "epoch": 1.568642520895379, "grad_norm": 1.1236538615123706, "learning_rate": 5.47221110423395e-06, "loss": 0.1, "step": 4082 }, { "epoch": 1.569411086559708, "grad_norm": 1.013423235327647, "learning_rate": 5.467758886320507e-06, "loss": 0.0995, "step": 4084 }, { "epoch": 1.570179652224037, "grad_norm": 1.013234358825571, "learning_rate": 5.463306294219715e-06, "loss": 0.1056, "step": 4086 }, { "epoch": 1.5709482178883658, "grad_norm": 1.0599310396270378, "learning_rate": 5.458853331493463e-06, "loss": 0.0972, "step": 4088 }, { "epoch": 1.5717167835526946, "grad_norm": 1.2486178024951609, "learning_rate": 5.454400001703934e-06, "loss": 0.1113, "step": 4090 }, { "epoch": 1.5724853492170237, "grad_norm": 1.113534231867734, "learning_rate": 5.4499463084135985e-06, "loss": 0.0945, "step": 4092 }, { "epoch": 1.5732539148813527, "grad_norm": 1.1380517680023963, "learning_rate": 5.445492255185228e-06, "loss": 0.1052, "step": 4094 }, { "epoch": 1.5740224805456817, "grad_norm": 1.1355319338394019, "learning_rate": 5.441037845581874e-06, "loss": 0.0953, "step": 4096 }, { "epoch": 1.5747910462100105, "grad_norm": 1.1764943675944426, "learning_rate": 5.436583083166878e-06, "loss": 0.0913, "step": 4098 }, { "epoch": 1.5755596118743393, "grad_norm": 1.2230690252388632, "learning_rate": 5.432127971503861e-06, "loss": 0.1027, "step": 4100 }, { "epoch": 1.5763281775386684, "grad_norm": 1.1895317411454491, "learning_rate": 5.427672514156723e-06, "loss": 0.0961, "step": 4102 }, { "epoch": 1.5770967432029974, "grad_norm": 1.177045395952928, "learning_rate": 5.423216714689643e-06, "loss": 0.1044, "step": 4104 }, { "epoch": 1.5778653088673265, "grad_norm": 1.202301328808032, "learning_rate": 5.418760576667071e-06, "loss": 0.1018, "step": 4106 }, { "epoch": 1.5786338745316553, "grad_norm": 0.9876457090501944, "learning_rate": 5.4143041036537326e-06, "loss": 0.0995, "step": 4108 }, { "epoch": 1.579402440195984, "grad_norm": 1.0500317195831161, "learning_rate": 5.409847299214612e-06, "loss": 0.0946, "step": 4110 }, { "epoch": 1.5801710058603131, "grad_norm": 1.0918106261976868, "learning_rate": 5.405390166914969e-06, "loss": 0.1004, "step": 4112 }, { "epoch": 1.5809395715246422, "grad_norm": 1.0580092883436174, "learning_rate": 5.400932710320321e-06, "loss": 0.105, "step": 4114 }, { "epoch": 1.5817081371889712, "grad_norm": 1.0500843141682523, "learning_rate": 5.396474932996443e-06, "loss": 0.1026, "step": 4116 }, { "epoch": 1.5824767028533, "grad_norm": 1.1873655080159962, "learning_rate": 5.392016838509369e-06, "loss": 0.1087, "step": 4118 }, { "epoch": 1.5832452685176288, "grad_norm": 1.0082176062143526, "learning_rate": 5.387558430425388e-06, "loss": 0.0897, "step": 4120 }, { "epoch": 1.5840138341819578, "grad_norm": 1.111477541842581, "learning_rate": 5.383099712311035e-06, "loss": 0.1076, "step": 4122 }, { "epoch": 1.5847823998462869, "grad_norm": 1.039580519374182, "learning_rate": 5.378640687733098e-06, "loss": 0.1005, "step": 4124 }, { "epoch": 1.585550965510616, "grad_norm": 1.1419156922544695, "learning_rate": 5.374181360258609e-06, "loss": 0.0945, "step": 4126 }, { "epoch": 1.5863195311749447, "grad_norm": 1.0315281710887503, "learning_rate": 5.36972173345484e-06, "loss": 0.1004, "step": 4128 }, { "epoch": 1.5870880968392735, "grad_norm": 1.0689486916105735, "learning_rate": 5.3652618108893036e-06, "loss": 0.099, "step": 4130 }, { "epoch": 1.5878566625036026, "grad_norm": 0.9757782313117511, "learning_rate": 5.3608015961297534e-06, "loss": 0.0999, "step": 4132 }, { "epoch": 1.5886252281679316, "grad_norm": 0.9810381243847355, "learning_rate": 5.356341092744169e-06, "loss": 0.0863, "step": 4134 }, { "epoch": 1.5893937938322606, "grad_norm": 1.0593192735896777, "learning_rate": 5.351880304300764e-06, "loss": 0.0927, "step": 4136 }, { "epoch": 1.5901623594965895, "grad_norm": 2.2486734521259395, "learning_rate": 5.347419234367983e-06, "loss": 0.1073, "step": 4138 }, { "epoch": 1.5909309251609183, "grad_norm": 1.0835399994602122, "learning_rate": 5.342957886514494e-06, "loss": 0.0916, "step": 4140 }, { "epoch": 1.5916994908252473, "grad_norm": 1.1691873519782132, "learning_rate": 5.338496264309185e-06, "loss": 0.0966, "step": 4142 }, { "epoch": 1.5924680564895763, "grad_norm": 1.2095390536264123, "learning_rate": 5.334034371321164e-06, "loss": 0.1093, "step": 4144 }, { "epoch": 1.5932366221539054, "grad_norm": 1.2928860942720883, "learning_rate": 5.32957221111976e-06, "loss": 0.108, "step": 4146 }, { "epoch": 1.5940051878182342, "grad_norm": 1.0961668577587071, "learning_rate": 5.325109787274512e-06, "loss": 0.0965, "step": 4148 }, { "epoch": 1.594773753482563, "grad_norm": 1.0790803613576734, "learning_rate": 5.3206471033551675e-06, "loss": 0.0958, "step": 4150 }, { "epoch": 1.595542319146892, "grad_norm": 0.9521720692841917, "learning_rate": 5.3161841629316905e-06, "loss": 0.0974, "step": 4152 }, { "epoch": 1.596310884811221, "grad_norm": 1.0804175375916798, "learning_rate": 5.31172096957424e-06, "loss": 0.1045, "step": 4154 }, { "epoch": 1.59707945047555, "grad_norm": 1.138905998358467, "learning_rate": 5.3072575268531835e-06, "loss": 0.0933, "step": 4156 }, { "epoch": 1.597848016139879, "grad_norm": 1.1831554899580299, "learning_rate": 5.302793838339086e-06, "loss": 0.1103, "step": 4158 }, { "epoch": 1.5986165818042077, "grad_norm": 1.1798257507453935, "learning_rate": 5.298329907602714e-06, "loss": 0.1023, "step": 4160 }, { "epoch": 1.5993851474685368, "grad_norm": 1.0792569982425844, "learning_rate": 5.293865738215017e-06, "loss": 0.1018, "step": 4162 }, { "epoch": 1.6001537131328658, "grad_norm": 1.1270617064913668, "learning_rate": 5.289401333747145e-06, "loss": 0.0928, "step": 4164 }, { "epoch": 1.6009222787971948, "grad_norm": 1.2488971321033662, "learning_rate": 5.284936697770436e-06, "loss": 0.1007, "step": 4166 }, { "epoch": 1.6016908444615237, "grad_norm": 1.2177462601085065, "learning_rate": 5.280471833856404e-06, "loss": 0.0942, "step": 4168 }, { "epoch": 1.6024594101258525, "grad_norm": 0.8796351749176001, "learning_rate": 5.276006745576756e-06, "loss": 0.0958, "step": 4170 }, { "epoch": 1.6032279757901815, "grad_norm": 0.9383840738616891, "learning_rate": 5.271541436503373e-06, "loss": 0.095, "step": 4172 }, { "epoch": 1.6039965414545105, "grad_norm": 1.149572619632874, "learning_rate": 5.2670759102083125e-06, "loss": 0.0983, "step": 4174 }, { "epoch": 1.6047651071188396, "grad_norm": 1.1362165646943878, "learning_rate": 5.262610170263808e-06, "loss": 0.1012, "step": 4176 }, { "epoch": 1.6055336727831684, "grad_norm": 1.1545159411552848, "learning_rate": 5.258144220242262e-06, "loss": 0.1031, "step": 4178 }, { "epoch": 1.6063022384474972, "grad_norm": 1.1676383243601172, "learning_rate": 5.253678063716247e-06, "loss": 0.1066, "step": 4180 }, { "epoch": 1.6070708041118262, "grad_norm": 0.9718359335361042, "learning_rate": 5.249211704258498e-06, "loss": 0.0965, "step": 4182 }, { "epoch": 1.6078393697761553, "grad_norm": 1.1598033448140876, "learning_rate": 5.2447451454419154e-06, "loss": 0.1006, "step": 4184 }, { "epoch": 1.6086079354404843, "grad_norm": 1.098644799707403, "learning_rate": 5.240278390839556e-06, "loss": 0.0986, "step": 4186 }, { "epoch": 1.6093765011048131, "grad_norm": 1.0586269348329453, "learning_rate": 5.235811444024635e-06, "loss": 0.0887, "step": 4188 }, { "epoch": 1.610145066769142, "grad_norm": 1.150489163477807, "learning_rate": 5.231344308570523e-06, "loss": 0.104, "step": 4190 }, { "epoch": 1.610913632433471, "grad_norm": 0.9560108681628459, "learning_rate": 5.226876988050737e-06, "loss": 0.0897, "step": 4192 }, { "epoch": 1.6116821980978, "grad_norm": 1.0883464298467864, "learning_rate": 5.222409486038947e-06, "loss": 0.0913, "step": 4194 }, { "epoch": 1.612450763762129, "grad_norm": 1.1682960395078923, "learning_rate": 5.217941806108964e-06, "loss": 0.0996, "step": 4196 }, { "epoch": 1.6132193294264578, "grad_norm": 1.053432173654743, "learning_rate": 5.213473951834744e-06, "loss": 0.0853, "step": 4198 }, { "epoch": 1.6139878950907867, "grad_norm": 1.1896123343703984, "learning_rate": 5.209005926790383e-06, "loss": 0.1137, "step": 4200 }, { "epoch": 1.6147564607551157, "grad_norm": 1.193782032092963, "learning_rate": 5.2045377345501095e-06, "loss": 0.1007, "step": 4202 }, { "epoch": 1.6155250264194447, "grad_norm": 1.1593294669837775, "learning_rate": 5.20006937868829e-06, "loss": 0.0931, "step": 4204 }, { "epoch": 1.6162935920837738, "grad_norm": 0.9791891115658414, "learning_rate": 5.195600862779421e-06, "loss": 0.0917, "step": 4206 }, { "epoch": 1.6170621577481026, "grad_norm": 1.00260143403568, "learning_rate": 5.191132190398125e-06, "loss": 0.0965, "step": 4208 }, { "epoch": 1.6178307234124314, "grad_norm": 1.0892332377232443, "learning_rate": 5.186663365119151e-06, "loss": 0.0991, "step": 4210 }, { "epoch": 1.6185992890767604, "grad_norm": 1.0934481234111713, "learning_rate": 5.182194390517374e-06, "loss": 0.1035, "step": 4212 }, { "epoch": 1.6193678547410895, "grad_norm": 1.0987059791187679, "learning_rate": 5.17772527016778e-06, "loss": 0.0969, "step": 4214 }, { "epoch": 1.6201364204054185, "grad_norm": 1.0533692055016426, "learning_rate": 5.17325600764548e-06, "loss": 0.1016, "step": 4216 }, { "epoch": 1.6209049860697473, "grad_norm": 0.953322439286603, "learning_rate": 5.168786606525694e-06, "loss": 0.0897, "step": 4218 }, { "epoch": 1.6216735517340761, "grad_norm": 1.013410835651436, "learning_rate": 5.164317070383752e-06, "loss": 0.1058, "step": 4220 }, { "epoch": 1.6224421173984052, "grad_norm": 1.122658253852591, "learning_rate": 5.159847402795096e-06, "loss": 0.1111, "step": 4222 }, { "epoch": 1.6232106830627342, "grad_norm": 0.9967154290369378, "learning_rate": 5.155377607335274e-06, "loss": 0.0959, "step": 4224 }, { "epoch": 1.6239792487270632, "grad_norm": 1.07169584334969, "learning_rate": 5.150907687579928e-06, "loss": 0.0945, "step": 4226 }, { "epoch": 1.624747814391392, "grad_norm": 1.115830827597116, "learning_rate": 5.146437647104808e-06, "loss": 0.0977, "step": 4228 }, { "epoch": 1.6255163800557209, "grad_norm": 1.1032863404164617, "learning_rate": 5.141967489485758e-06, "loss": 0.1004, "step": 4230 }, { "epoch": 1.62628494572005, "grad_norm": 1.1706660066870445, "learning_rate": 5.137497218298715e-06, "loss": 0.1058, "step": 4232 }, { "epoch": 1.627053511384379, "grad_norm": 1.0605406710912553, "learning_rate": 5.133026837119706e-06, "loss": 0.0934, "step": 4234 }, { "epoch": 1.627822077048708, "grad_norm": 1.087060077667683, "learning_rate": 5.128556349524847e-06, "loss": 0.1028, "step": 4236 }, { "epoch": 1.6285906427130368, "grad_norm": 1.1298440717454814, "learning_rate": 5.124085759090341e-06, "loss": 0.1052, "step": 4238 }, { "epoch": 1.6293592083773656, "grad_norm": 1.0242472472929134, "learning_rate": 5.119615069392468e-06, "loss": 0.0946, "step": 4240 }, { "epoch": 1.6301277740416946, "grad_norm": 0.9835469600277115, "learning_rate": 5.115144284007595e-06, "loss": 0.0925, "step": 4242 }, { "epoch": 1.6308963397060237, "grad_norm": 1.0982818763415192, "learning_rate": 5.110673406512159e-06, "loss": 0.1068, "step": 4244 }, { "epoch": 1.6316649053703527, "grad_norm": 0.9553114082054188, "learning_rate": 5.106202440482674e-06, "loss": 0.0919, "step": 4246 }, { "epoch": 1.6324334710346815, "grad_norm": 1.0059439357744524, "learning_rate": 5.1017313894957235e-06, "loss": 0.0987, "step": 4248 }, { "epoch": 1.6332020366990103, "grad_norm": 1.0560839864496332, "learning_rate": 5.097260257127958e-06, "loss": 0.1012, "step": 4250 }, { "epoch": 1.6339706023633394, "grad_norm": 1.0512570098545782, "learning_rate": 5.092789046956098e-06, "loss": 0.0863, "step": 4252 }, { "epoch": 1.6347391680276684, "grad_norm": 1.0375806015383973, "learning_rate": 5.08831776255692e-06, "loss": 0.0931, "step": 4254 }, { "epoch": 1.6355077336919974, "grad_norm": 1.0547310969739654, "learning_rate": 5.083846407507263e-06, "loss": 0.1078, "step": 4256 }, { "epoch": 1.6362762993563262, "grad_norm": 1.1103862804622826, "learning_rate": 5.079374985384025e-06, "loss": 0.0926, "step": 4258 }, { "epoch": 1.637044865020655, "grad_norm": 1.2256710351713689, "learning_rate": 5.074903499764149e-06, "loss": 0.0911, "step": 4260 }, { "epoch": 1.637813430684984, "grad_norm": 1.0274709541948006, "learning_rate": 5.0704319542246385e-06, "loss": 0.1076, "step": 4262 }, { "epoch": 1.6385819963493131, "grad_norm": 1.1836074140728108, "learning_rate": 5.0659603523425414e-06, "loss": 0.0945, "step": 4264 }, { "epoch": 1.6393505620136422, "grad_norm": 1.1329434887290986, "learning_rate": 5.061488697694947e-06, "loss": 0.1043, "step": 4266 }, { "epoch": 1.640119127677971, "grad_norm": 1.0514822259994563, "learning_rate": 5.057016993858994e-06, "loss": 0.0966, "step": 4268 }, { "epoch": 1.6408876933422998, "grad_norm": 1.1565589858964724, "learning_rate": 5.052545244411855e-06, "loss": 0.1086, "step": 4270 }, { "epoch": 1.6416562590066288, "grad_norm": 0.9747993996967705, "learning_rate": 5.048073452930741e-06, "loss": 0.0902, "step": 4272 }, { "epoch": 1.6424248246709579, "grad_norm": 1.107388228949875, "learning_rate": 5.043601622992893e-06, "loss": 0.1035, "step": 4274 }, { "epoch": 1.6431933903352869, "grad_norm": 1.1711781356848476, "learning_rate": 5.039129758175592e-06, "loss": 0.0979, "step": 4276 }, { "epoch": 1.6439619559996157, "grad_norm": 1.0293525229409812, "learning_rate": 5.034657862056136e-06, "loss": 0.0954, "step": 4278 }, { "epoch": 1.6447305216639445, "grad_norm": 0.9744924814356439, "learning_rate": 5.030185938211853e-06, "loss": 0.0945, "step": 4280 }, { "epoch": 1.6454990873282735, "grad_norm": 1.2292576699161564, "learning_rate": 5.025713990220098e-06, "loss": 0.1113, "step": 4282 }, { "epoch": 1.6462676529926026, "grad_norm": 1.1042727189972055, "learning_rate": 5.021242021658235e-06, "loss": 0.0954, "step": 4284 }, { "epoch": 1.6470362186569316, "grad_norm": 1.0907419752610512, "learning_rate": 5.016770036103653e-06, "loss": 0.1082, "step": 4286 }, { "epoch": 1.6478047843212604, "grad_norm": 1.0177923783238343, "learning_rate": 5.0122980371337506e-06, "loss": 0.1086, "step": 4288 }, { "epoch": 1.6485733499855892, "grad_norm": 1.118512751268935, "learning_rate": 5.007826028325937e-06, "loss": 0.0985, "step": 4290 }, { "epoch": 1.6493419156499183, "grad_norm": 1.0210146231422188, "learning_rate": 5.003354013257633e-06, "loss": 0.087, "step": 4292 }, { "epoch": 1.6501104813142473, "grad_norm": 1.0385433583560306, "learning_rate": 4.998881995506261e-06, "loss": 0.0991, "step": 4294 }, { "epoch": 1.6508790469785763, "grad_norm": 0.9762258627904882, "learning_rate": 4.9944099786492435e-06, "loss": 0.0831, "step": 4296 }, { "epoch": 1.6516476126429052, "grad_norm": 1.1294100191668344, "learning_rate": 4.989937966264006e-06, "loss": 0.0956, "step": 4298 }, { "epoch": 1.652416178307234, "grad_norm": 1.248807961698218, "learning_rate": 4.9854659619279725e-06, "loss": 0.1029, "step": 4300 }, { "epoch": 1.653184743971563, "grad_norm": 1.0391508067054098, "learning_rate": 4.980993969218554e-06, "loss": 0.0887, "step": 4302 }, { "epoch": 1.653953309635892, "grad_norm": 1.013148336967748, "learning_rate": 4.976521991713158e-06, "loss": 0.0939, "step": 4304 }, { "epoch": 1.654721875300221, "grad_norm": 1.0650065489702976, "learning_rate": 4.9720500329891755e-06, "loss": 0.0989, "step": 4306 }, { "epoch": 1.65549044096455, "grad_norm": 1.1650755192446582, "learning_rate": 4.967578096623988e-06, "loss": 0.1099, "step": 4308 }, { "epoch": 1.6562590066288787, "grad_norm": 1.2249209244273576, "learning_rate": 4.9631061861949524e-06, "loss": 0.1, "step": 4310 }, { "epoch": 1.6570275722932077, "grad_norm": 1.1012964612753433, "learning_rate": 4.958634305279409e-06, "loss": 0.099, "step": 4312 }, { "epoch": 1.6577961379575368, "grad_norm": 1.166005219224878, "learning_rate": 4.9541624574546725e-06, "loss": 0.1025, "step": 4314 }, { "epoch": 1.6585647036218658, "grad_norm": 0.9373584934216532, "learning_rate": 4.949690646298034e-06, "loss": 0.0883, "step": 4316 }, { "epoch": 1.6593332692861946, "grad_norm": 0.9245081367808976, "learning_rate": 4.945218875386752e-06, "loss": 0.0911, "step": 4318 }, { "epoch": 1.6601018349505234, "grad_norm": 0.9506924766281506, "learning_rate": 4.9407471482980585e-06, "loss": 0.101, "step": 4320 }, { "epoch": 1.6608704006148525, "grad_norm": 1.0749352292610275, "learning_rate": 4.936275468609141e-06, "loss": 0.0981, "step": 4322 }, { "epoch": 1.6616389662791815, "grad_norm": 0.969768197945117, "learning_rate": 4.931803839897156e-06, "loss": 0.093, "step": 4324 }, { "epoch": 1.6624075319435105, "grad_norm": 1.040086960905704, "learning_rate": 4.927332265739217e-06, "loss": 0.0982, "step": 4326 }, { "epoch": 1.6631760976078394, "grad_norm": 1.195484669843597, "learning_rate": 4.922860749712397e-06, "loss": 0.1093, "step": 4328 }, { "epoch": 1.6639446632721682, "grad_norm": 1.2475984327506298, "learning_rate": 4.918389295393717e-06, "loss": 0.1118, "step": 4330 }, { "epoch": 1.6647132289364972, "grad_norm": 1.0464607723382573, "learning_rate": 4.913917906360154e-06, "loss": 0.0955, "step": 4332 }, { "epoch": 1.6654817946008262, "grad_norm": 1.1390628842199608, "learning_rate": 4.909446586188629e-06, "loss": 0.0936, "step": 4334 }, { "epoch": 1.6662503602651553, "grad_norm": 1.049950683920262, "learning_rate": 4.904975338456008e-06, "loss": 0.0965, "step": 4336 }, { "epoch": 1.667018925929484, "grad_norm": 1.132830504258452, "learning_rate": 4.9005041667391015e-06, "loss": 0.1055, "step": 4338 }, { "epoch": 1.667787491593813, "grad_norm": 1.1406425750293647, "learning_rate": 4.896033074614659e-06, "loss": 0.1053, "step": 4340 }, { "epoch": 1.668556057258142, "grad_norm": 0.9923075127044864, "learning_rate": 4.8915620656593624e-06, "loss": 0.0851, "step": 4342 }, { "epoch": 1.669324622922471, "grad_norm": 1.1551388798272597, "learning_rate": 4.8870911434498346e-06, "loss": 0.0917, "step": 4344 }, { "epoch": 1.6700931885868, "grad_norm": 1.1841277223601747, "learning_rate": 4.882620311562619e-06, "loss": 0.1004, "step": 4346 }, { "epoch": 1.6708617542511288, "grad_norm": 1.2319532588011064, "learning_rate": 4.878149573574198e-06, "loss": 0.1043, "step": 4348 }, { "epoch": 1.6716303199154576, "grad_norm": 1.2722269970329474, "learning_rate": 4.873678933060967e-06, "loss": 0.1007, "step": 4350 }, { "epoch": 1.6723988855797867, "grad_norm": 1.037606034019622, "learning_rate": 4.869208393599255e-06, "loss": 0.1015, "step": 4352 }, { "epoch": 1.6731674512441157, "grad_norm": 1.1751765484237247, "learning_rate": 4.8647379587653e-06, "loss": 0.115, "step": 4354 }, { "epoch": 1.6739360169084447, "grad_norm": 1.1665180274315108, "learning_rate": 4.8602676321352646e-06, "loss": 0.0923, "step": 4356 }, { "epoch": 1.6747045825727735, "grad_norm": 1.1362494293738439, "learning_rate": 4.855797417285219e-06, "loss": 0.1007, "step": 4358 }, { "epoch": 1.6754731482371024, "grad_norm": 0.9981815296569643, "learning_rate": 4.8513273177911494e-06, "loss": 0.0885, "step": 4360 }, { "epoch": 1.6762417139014314, "grad_norm": 1.100468845984561, "learning_rate": 4.846857337228943e-06, "loss": 0.0941, "step": 4362 }, { "epoch": 1.6770102795657604, "grad_norm": 1.0492871695836505, "learning_rate": 4.842387479174396e-06, "loss": 0.0926, "step": 4364 }, { "epoch": 1.6777788452300895, "grad_norm": 1.07078955043494, "learning_rate": 4.837917747203207e-06, "loss": 0.0905, "step": 4366 }, { "epoch": 1.6785474108944183, "grad_norm": 1.11087147362112, "learning_rate": 4.833448144890972e-06, "loss": 0.0951, "step": 4368 }, { "epoch": 1.679315976558747, "grad_norm": 1.064647752204919, "learning_rate": 4.828978675813184e-06, "loss": 0.0883, "step": 4370 }, { "epoch": 1.6800845422230761, "grad_norm": 1.0186129752399111, "learning_rate": 4.8245093435452316e-06, "loss": 0.0943, "step": 4372 }, { "epoch": 1.6808531078874052, "grad_norm": 1.0419150410990519, "learning_rate": 4.82004015166239e-06, "loss": 0.096, "step": 4374 }, { "epoch": 1.6816216735517342, "grad_norm": 1.1647545295647859, "learning_rate": 4.815571103739822e-06, "loss": 0.1033, "step": 4376 }, { "epoch": 1.682390239216063, "grad_norm": 1.1502723807585877, "learning_rate": 4.811102203352579e-06, "loss": 0.1003, "step": 4378 }, { "epoch": 1.6831588048803918, "grad_norm": 1.126239894706306, "learning_rate": 4.806633454075595e-06, "loss": 0.1015, "step": 4380 }, { "epoch": 1.6839273705447209, "grad_norm": 1.1336448385814963, "learning_rate": 4.802164859483675e-06, "loss": 0.0975, "step": 4382 }, { "epoch": 1.68469593620905, "grad_norm": 1.0397483759985617, "learning_rate": 4.797696423151511e-06, "loss": 0.0957, "step": 4384 }, { "epoch": 1.685464501873379, "grad_norm": 1.0439500444880077, "learning_rate": 4.793228148653663e-06, "loss": 0.0905, "step": 4386 }, { "epoch": 1.6862330675377077, "grad_norm": 1.0334923996414913, "learning_rate": 4.788760039564557e-06, "loss": 0.0842, "step": 4388 }, { "epoch": 1.6870016332020366, "grad_norm": 1.1246968608192889, "learning_rate": 4.784292099458495e-06, "loss": 0.0906, "step": 4390 }, { "epoch": 1.6877701988663656, "grad_norm": 1.183360002654018, "learning_rate": 4.779824331909641e-06, "loss": 0.0998, "step": 4392 }, { "epoch": 1.6885387645306946, "grad_norm": 1.0218370469076647, "learning_rate": 4.775356740492019e-06, "loss": 0.089, "step": 4394 }, { "epoch": 1.6893073301950237, "grad_norm": 1.1053384139237952, "learning_rate": 4.770889328779513e-06, "loss": 0.1093, "step": 4396 }, { "epoch": 1.6900758958593525, "grad_norm": 1.1105174244280536, "learning_rate": 4.766422100345865e-06, "loss": 0.1038, "step": 4398 }, { "epoch": 1.6908444615236813, "grad_norm": 1.1584117885897274, "learning_rate": 4.761955058764669e-06, "loss": 0.108, "step": 4400 }, { "epoch": 1.6916130271880103, "grad_norm": 0.9704969728568139, "learning_rate": 4.757488207609366e-06, "loss": 0.0905, "step": 4402 }, { "epoch": 1.6923815928523394, "grad_norm": 1.0370312570884674, "learning_rate": 4.753021550453254e-06, "loss": 0.0922, "step": 4404 }, { "epoch": 1.6931501585166684, "grad_norm": 1.1564052797829698, "learning_rate": 4.748555090869464e-06, "loss": 0.0983, "step": 4406 }, { "epoch": 1.6939187241809972, "grad_norm": 1.3598504190350496, "learning_rate": 4.744088832430981e-06, "loss": 0.0947, "step": 4408 }, { "epoch": 1.694687289845326, "grad_norm": 1.0630331492035852, "learning_rate": 4.739622778710617e-06, "loss": 0.0999, "step": 4410 }, { "epoch": 1.695455855509655, "grad_norm": 1.1351041356649334, "learning_rate": 4.735156933281031e-06, "loss": 0.0994, "step": 4412 }, { "epoch": 1.696224421173984, "grad_norm": 1.3006694214256223, "learning_rate": 4.73069129971471e-06, "loss": 0.1047, "step": 4414 }, { "epoch": 1.6969929868383131, "grad_norm": 1.0360618046706056, "learning_rate": 4.72622588158397e-06, "loss": 0.108, "step": 4416 }, { "epoch": 1.697761552502642, "grad_norm": 1.2418998442548914, "learning_rate": 4.721760682460957e-06, "loss": 0.0941, "step": 4418 }, { "epoch": 1.6985301181669707, "grad_norm": 1.1661980931944445, "learning_rate": 4.717295705917644e-06, "loss": 0.1032, "step": 4420 }, { "epoch": 1.6992986838312998, "grad_norm": 1.120863095641424, "learning_rate": 4.712830955525821e-06, "loss": 0.087, "step": 4422 }, { "epoch": 1.7000672494956288, "grad_norm": 1.2863742306674617, "learning_rate": 4.708366434857102e-06, "loss": 0.11, "step": 4424 }, { "epoch": 1.7008358151599579, "grad_norm": 0.9738226158841401, "learning_rate": 4.703902147482915e-06, "loss": 0.0821, "step": 4426 }, { "epoch": 1.7016043808242867, "grad_norm": 0.8915946729782415, "learning_rate": 4.699438096974498e-06, "loss": 0.0927, "step": 4428 }, { "epoch": 1.7023729464886155, "grad_norm": 1.1903706217214982, "learning_rate": 4.694974286902905e-06, "loss": 0.1066, "step": 4430 }, { "epoch": 1.7031415121529445, "grad_norm": 0.976023824892685, "learning_rate": 4.690510720838996e-06, "loss": 0.0915, "step": 4432 }, { "epoch": 1.7039100778172735, "grad_norm": 1.284921065938603, "learning_rate": 4.686047402353433e-06, "loss": 0.1079, "step": 4434 }, { "epoch": 1.7046786434816026, "grad_norm": 0.9701001474811831, "learning_rate": 4.681584335016686e-06, "loss": 0.0953, "step": 4436 }, { "epoch": 1.7054472091459314, "grad_norm": 1.1304145372309693, "learning_rate": 4.6771215223990165e-06, "loss": 0.0991, "step": 4438 }, { "epoch": 1.7062157748102602, "grad_norm": 0.9395206665064343, "learning_rate": 4.6726589680704905e-06, "loss": 0.0887, "step": 4440 }, { "epoch": 1.7069843404745892, "grad_norm": 0.965938580996638, "learning_rate": 4.6681966756009575e-06, "loss": 0.0916, "step": 4442 }, { "epoch": 1.7077529061389183, "grad_norm": 0.972554493097576, "learning_rate": 4.663734648560067e-06, "loss": 0.0909, "step": 4444 }, { "epoch": 1.7085214718032473, "grad_norm": 1.0547067370630778, "learning_rate": 4.659272890517249e-06, "loss": 0.091, "step": 4446 }, { "epoch": 1.7092900374675761, "grad_norm": 1.0477688453424583, "learning_rate": 4.6548114050417245e-06, "loss": 0.0932, "step": 4448 }, { "epoch": 1.710058603131905, "grad_norm": 1.2073521735755093, "learning_rate": 4.650350195702491e-06, "loss": 0.0966, "step": 4450 }, { "epoch": 1.710827168796234, "grad_norm": 1.0938740228364676, "learning_rate": 4.645889266068331e-06, "loss": 0.0908, "step": 4452 }, { "epoch": 1.711595734460563, "grad_norm": 1.1656620821654, "learning_rate": 4.641428619707794e-06, "loss": 0.1074, "step": 4454 }, { "epoch": 1.712364300124892, "grad_norm": 1.089907072428517, "learning_rate": 4.636968260189214e-06, "loss": 0.0925, "step": 4456 }, { "epoch": 1.7131328657892209, "grad_norm": 1.0524526396394076, "learning_rate": 4.6325081910806865e-06, "loss": 0.0902, "step": 4458 }, { "epoch": 1.7139014314535497, "grad_norm": 1.0212730045411351, "learning_rate": 4.628048415950082e-06, "loss": 0.0891, "step": 4460 }, { "epoch": 1.7146699971178787, "grad_norm": 1.0888672085829354, "learning_rate": 4.6235889383650276e-06, "loss": 0.0972, "step": 4462 }, { "epoch": 1.7154385627822077, "grad_norm": 1.1287120142463452, "learning_rate": 4.6191297618929195e-06, "loss": 0.1093, "step": 4464 }, { "epoch": 1.7162071284465368, "grad_norm": 1.0662330182045934, "learning_rate": 4.614670890100913e-06, "loss": 0.0926, "step": 4466 }, { "epoch": 1.7169756941108656, "grad_norm": 1.0155831083467535, "learning_rate": 4.610212326555913e-06, "loss": 0.092, "step": 4468 }, { "epoch": 1.7177442597751944, "grad_norm": 1.0649734108067508, "learning_rate": 4.605754074824583e-06, "loss": 0.0928, "step": 4470 }, { "epoch": 1.7185128254395234, "grad_norm": 1.0662255757169796, "learning_rate": 4.601296138473338e-06, "loss": 0.0968, "step": 4472 }, { "epoch": 1.7192813911038525, "grad_norm": 1.2886870169427302, "learning_rate": 4.596838521068335e-06, "loss": 0.0916, "step": 4474 }, { "epoch": 1.7200499567681815, "grad_norm": 1.145201287529512, "learning_rate": 4.592381226175484e-06, "loss": 0.0944, "step": 4476 }, { "epoch": 1.7208185224325103, "grad_norm": 1.0437979871153111, "learning_rate": 4.5879242573604286e-06, "loss": 0.0892, "step": 4478 }, { "epoch": 1.7215870880968391, "grad_norm": 1.074964868901433, "learning_rate": 4.58346761818856e-06, "loss": 0.0832, "step": 4480 }, { "epoch": 1.7223556537611682, "grad_norm": 1.1354917656782955, "learning_rate": 4.579011312224997e-06, "loss": 0.1091, "step": 4482 }, { "epoch": 1.7231242194254972, "grad_norm": 1.0388359671512601, "learning_rate": 4.574555343034598e-06, "loss": 0.0932, "step": 4484 }, { "epoch": 1.7238927850898262, "grad_norm": 1.3221151202882861, "learning_rate": 4.570099714181949e-06, "loss": 0.0949, "step": 4486 }, { "epoch": 1.724661350754155, "grad_norm": 1.1115080163541293, "learning_rate": 4.565644429231367e-06, "loss": 0.1041, "step": 4488 }, { "epoch": 1.7254299164184839, "grad_norm": 1.4102488640737516, "learning_rate": 4.561189491746889e-06, "loss": 0.0875, "step": 4490 }, { "epoch": 1.726198482082813, "grad_norm": 1.1875509514358318, "learning_rate": 4.55673490529228e-06, "loss": 0.0979, "step": 4492 }, { "epoch": 1.726967047747142, "grad_norm": 1.0491033394068108, "learning_rate": 4.552280673431018e-06, "loss": 0.0968, "step": 4494 }, { "epoch": 1.727735613411471, "grad_norm": 1.1056225032590432, "learning_rate": 4.547826799726302e-06, "loss": 0.0973, "step": 4496 }, { "epoch": 1.7285041790757998, "grad_norm": 1.0085136764086857, "learning_rate": 4.543373287741041e-06, "loss": 0.0939, "step": 4498 }, { "epoch": 1.7292727447401286, "grad_norm": 1.2693845579532759, "learning_rate": 4.538920141037859e-06, "loss": 0.105, "step": 4500 }, { "epoch": 1.7292727447401286, "eval_loss": 0.15057328343391418, "eval_runtime": 390.3911, "eval_samples_per_second": 47.401, "eval_steps_per_second": 5.927, "step": 4500 }, { "epoch": 1.7300413104044576, "grad_norm": 1.1227814458492138, "learning_rate": 4.534467363179083e-06, "loss": 0.0998, "step": 4502 }, { "epoch": 1.7308098760687867, "grad_norm": 1.2199384307288825, "learning_rate": 4.530014957726747e-06, "loss": 0.0955, "step": 4504 }, { "epoch": 1.7315784417331157, "grad_norm": 1.1061212428108498, "learning_rate": 4.525562928242592e-06, "loss": 0.105, "step": 4506 }, { "epoch": 1.7323470073974445, "grad_norm": 1.0405562691363472, "learning_rate": 4.521111278288047e-06, "loss": 0.0996, "step": 4508 }, { "epoch": 1.7331155730617733, "grad_norm": 0.9980896354164529, "learning_rate": 4.516660011424247e-06, "loss": 0.0886, "step": 4510 }, { "epoch": 1.7338841387261024, "grad_norm": 1.2477820221913218, "learning_rate": 4.512209131212017e-06, "loss": 0.1085, "step": 4512 }, { "epoch": 1.7346527043904314, "grad_norm": 0.923374588657426, "learning_rate": 4.507758641211873e-06, "loss": 0.0877, "step": 4514 }, { "epoch": 1.7354212700547604, "grad_norm": 1.0984082771932984, "learning_rate": 4.50330854498402e-06, "loss": 0.1012, "step": 4516 }, { "epoch": 1.7361898357190892, "grad_norm": 1.1553794139710218, "learning_rate": 4.498858846088347e-06, "loss": 0.1034, "step": 4518 }, { "epoch": 1.736958401383418, "grad_norm": 0.9361286525270438, "learning_rate": 4.494409548084424e-06, "loss": 0.0745, "step": 4520 }, { "epoch": 1.737726967047747, "grad_norm": 0.9131071822467857, "learning_rate": 4.4899606545315e-06, "loss": 0.0822, "step": 4522 }, { "epoch": 1.7384955327120761, "grad_norm": 1.1213576185216674, "learning_rate": 4.485512168988505e-06, "loss": 0.1012, "step": 4524 }, { "epoch": 1.7392640983764052, "grad_norm": 1.2462979021863378, "learning_rate": 4.481064095014036e-06, "loss": 0.0998, "step": 4526 }, { "epoch": 1.740032664040734, "grad_norm": 1.0961453969236288, "learning_rate": 4.476616436166368e-06, "loss": 0.0976, "step": 4528 }, { "epoch": 1.7408012297050628, "grad_norm": 1.0806906055128513, "learning_rate": 4.472169196003439e-06, "loss": 0.0896, "step": 4530 }, { "epoch": 1.7415697953693918, "grad_norm": 1.2760003427575768, "learning_rate": 4.4677223780828535e-06, "loss": 0.1051, "step": 4532 }, { "epoch": 1.7423383610337209, "grad_norm": 1.0928421042194842, "learning_rate": 4.463275985961874e-06, "loss": 0.0952, "step": 4534 }, { "epoch": 1.74310692669805, "grad_norm": 1.1251574378568248, "learning_rate": 4.458830023197432e-06, "loss": 0.0925, "step": 4536 }, { "epoch": 1.7438754923623787, "grad_norm": 1.112631215377929, "learning_rate": 4.454384493346107e-06, "loss": 0.0961, "step": 4538 }, { "epoch": 1.7446440580267075, "grad_norm": 1.0341813609292942, "learning_rate": 4.449939399964135e-06, "loss": 0.089, "step": 4540 }, { "epoch": 1.7454126236910366, "grad_norm": 1.1241700698586683, "learning_rate": 4.445494746607403e-06, "loss": 0.092, "step": 4542 }, { "epoch": 1.7461811893553656, "grad_norm": 1.0058534868791218, "learning_rate": 4.441050536831449e-06, "loss": 0.0824, "step": 4544 }, { "epoch": 1.7469497550196946, "grad_norm": 1.069707195574756, "learning_rate": 4.4366067741914475e-06, "loss": 0.1009, "step": 4546 }, { "epoch": 1.7477183206840234, "grad_norm": 0.9890580723490138, "learning_rate": 4.432163462242225e-06, "loss": 0.1006, "step": 4548 }, { "epoch": 1.7484868863483523, "grad_norm": 1.0711409162445578, "learning_rate": 4.42772060453824e-06, "loss": 0.0955, "step": 4550 }, { "epoch": 1.7492554520126813, "grad_norm": 1.0666684560116635, "learning_rate": 4.423278204633596e-06, "loss": 0.0947, "step": 4552 }, { "epoch": 1.7500240176770103, "grad_norm": 1.144016178793511, "learning_rate": 4.418836266082019e-06, "loss": 0.09, "step": 4554 }, { "epoch": 1.7507925833413394, "grad_norm": 1.1193531505837582, "learning_rate": 4.414394792436877e-06, "loss": 0.0971, "step": 4556 }, { "epoch": 1.7515611490056682, "grad_norm": 1.1643676311299362, "learning_rate": 4.409953787251161e-06, "loss": 0.0857, "step": 4558 }, { "epoch": 1.752329714669997, "grad_norm": 1.053911095937465, "learning_rate": 4.405513254077485e-06, "loss": 0.0948, "step": 4560 }, { "epoch": 1.753098280334326, "grad_norm": 1.0475356169662415, "learning_rate": 4.401073196468089e-06, "loss": 0.0869, "step": 4562 }, { "epoch": 1.753866845998655, "grad_norm": 0.9577727064549215, "learning_rate": 4.396633617974831e-06, "loss": 0.0923, "step": 4564 }, { "epoch": 1.754635411662984, "grad_norm": 1.1166777247277255, "learning_rate": 4.392194522149187e-06, "loss": 0.0878, "step": 4566 }, { "epoch": 1.755403977327313, "grad_norm": 1.190341812780619, "learning_rate": 4.387755912542247e-06, "loss": 0.0975, "step": 4568 }, { "epoch": 1.7561725429916417, "grad_norm": 1.0540537960031893, "learning_rate": 4.383317792704709e-06, "loss": 0.0817, "step": 4570 }, { "epoch": 1.7569411086559708, "grad_norm": 1.0675508488513656, "learning_rate": 4.378880166186884e-06, "loss": 0.081, "step": 4572 }, { "epoch": 1.7577096743202998, "grad_norm": 1.037703263273538, "learning_rate": 4.374443036538683e-06, "loss": 0.099, "step": 4574 }, { "epoch": 1.7584782399846288, "grad_norm": 1.0338937017003604, "learning_rate": 4.370006407309624e-06, "loss": 0.104, "step": 4576 }, { "epoch": 1.7592468056489576, "grad_norm": 1.04448810709474, "learning_rate": 4.365570282048822e-06, "loss": 0.0828, "step": 4578 }, { "epoch": 1.7600153713132864, "grad_norm": 1.1462115955649337, "learning_rate": 4.361134664304989e-06, "loss": 0.1035, "step": 4580 }, { "epoch": 1.7607839369776155, "grad_norm": 1.0017379402542474, "learning_rate": 4.356699557626435e-06, "loss": 0.1028, "step": 4582 }, { "epoch": 1.7615525026419445, "grad_norm": 1.1382025974319936, "learning_rate": 4.352264965561056e-06, "loss": 0.0969, "step": 4584 }, { "epoch": 1.7623210683062736, "grad_norm": 1.0296887949158422, "learning_rate": 4.3478308916563375e-06, "loss": 0.0862, "step": 4586 }, { "epoch": 1.7630896339706024, "grad_norm": 1.1098310359727674, "learning_rate": 4.343397339459352e-06, "loss": 0.0984, "step": 4588 }, { "epoch": 1.7638581996349312, "grad_norm": 0.9963891464406937, "learning_rate": 4.338964312516753e-06, "loss": 0.0991, "step": 4590 }, { "epoch": 1.7646267652992602, "grad_norm": 1.1806243732883759, "learning_rate": 4.334531814374777e-06, "loss": 0.0905, "step": 4592 }, { "epoch": 1.7653953309635892, "grad_norm": 1.0735361264534495, "learning_rate": 4.330099848579232e-06, "loss": 0.0866, "step": 4594 }, { "epoch": 1.7661638966279183, "grad_norm": 1.2150333401966407, "learning_rate": 4.325668418675507e-06, "loss": 0.1027, "step": 4596 }, { "epoch": 1.766932462292247, "grad_norm": 1.0282248832196519, "learning_rate": 4.321237528208558e-06, "loss": 0.084, "step": 4598 }, { "epoch": 1.767701027956576, "grad_norm": 1.1729737549503059, "learning_rate": 4.316807180722908e-06, "loss": 0.098, "step": 4600 }, { "epoch": 1.768469593620905, "grad_norm": 1.030830391163099, "learning_rate": 4.312377379762649e-06, "loss": 0.0907, "step": 4602 }, { "epoch": 1.769238159285234, "grad_norm": 1.167363677322885, "learning_rate": 4.307948128871435e-06, "loss": 0.0976, "step": 4604 }, { "epoch": 1.770006724949563, "grad_norm": 1.1750951055424361, "learning_rate": 4.303519431592479e-06, "loss": 0.091, "step": 4606 }, { "epoch": 1.7707752906138918, "grad_norm": 1.0998564642761142, "learning_rate": 4.299091291468551e-06, "loss": 0.0968, "step": 4608 }, { "epoch": 1.7715438562782206, "grad_norm": 1.1872208410696674, "learning_rate": 4.294663712041979e-06, "loss": 0.0954, "step": 4610 }, { "epoch": 1.7723124219425497, "grad_norm": 0.9997692134608994, "learning_rate": 4.290236696854637e-06, "loss": 0.1035, "step": 4612 }, { "epoch": 1.7730809876068787, "grad_norm": 0.8814845034086696, "learning_rate": 4.285810249447949e-06, "loss": 0.0838, "step": 4614 }, { "epoch": 1.7738495532712077, "grad_norm": 0.9925377465070174, "learning_rate": 4.281384373362889e-06, "loss": 0.081, "step": 4616 }, { "epoch": 1.7746181189355366, "grad_norm": 1.089352495185396, "learning_rate": 4.276959072139968e-06, "loss": 0.0971, "step": 4618 }, { "epoch": 1.7753866845998654, "grad_norm": 1.05683172251349, "learning_rate": 4.272534349319241e-06, "loss": 0.0981, "step": 4620 }, { "epoch": 1.7761552502641944, "grad_norm": 1.038134252869432, "learning_rate": 4.268110208440302e-06, "loss": 0.0884, "step": 4622 }, { "epoch": 1.7769238159285234, "grad_norm": 1.0652957834637722, "learning_rate": 4.263686653042274e-06, "loss": 0.1016, "step": 4624 }, { "epoch": 1.7776923815928525, "grad_norm": 1.001531706865138, "learning_rate": 4.259263686663814e-06, "loss": 0.0945, "step": 4626 }, { "epoch": 1.7784609472571813, "grad_norm": 0.9248766025300692, "learning_rate": 4.254841312843109e-06, "loss": 0.0998, "step": 4628 }, { "epoch": 1.77922951292151, "grad_norm": 1.0501406502690522, "learning_rate": 4.25041953511787e-06, "loss": 0.0898, "step": 4630 }, { "epoch": 1.7799980785858391, "grad_norm": 1.1988980309840631, "learning_rate": 4.245998357025335e-06, "loss": 0.094, "step": 4632 }, { "epoch": 1.7807666442501682, "grad_norm": 1.0026867996278013, "learning_rate": 4.241577782102258e-06, "loss": 0.0915, "step": 4634 }, { "epoch": 1.7815352099144972, "grad_norm": 1.0575048327429968, "learning_rate": 4.237157813884913e-06, "loss": 0.1031, "step": 4636 }, { "epoch": 1.782303775578826, "grad_norm": 1.0508820340207679, "learning_rate": 4.232738455909089e-06, "loss": 0.0943, "step": 4638 }, { "epoch": 1.7830723412431548, "grad_norm": 1.0369736573005697, "learning_rate": 4.228319711710083e-06, "loss": 0.095, "step": 4640 }, { "epoch": 1.7838409069074839, "grad_norm": 1.1182668546367236, "learning_rate": 4.223901584822703e-06, "loss": 0.0948, "step": 4642 }, { "epoch": 1.784609472571813, "grad_norm": 0.9959864529029774, "learning_rate": 4.219484078781266e-06, "loss": 0.0911, "step": 4644 }, { "epoch": 1.785378038236142, "grad_norm": 1.0035563526884537, "learning_rate": 4.215067197119591e-06, "loss": 0.0897, "step": 4646 }, { "epoch": 1.7861466039004708, "grad_norm": 0.9984163738368774, "learning_rate": 4.210650943370994e-06, "loss": 0.0854, "step": 4648 }, { "epoch": 1.7869151695647996, "grad_norm": 1.0725412085746084, "learning_rate": 4.206235321068296e-06, "loss": 0.0863, "step": 4650 }, { "epoch": 1.7876837352291286, "grad_norm": 1.05973404693804, "learning_rate": 4.201820333743803e-06, "loss": 0.0894, "step": 4652 }, { "epoch": 1.7884523008934576, "grad_norm": 0.9799062772946924, "learning_rate": 4.197405984929319e-06, "loss": 0.0841, "step": 4654 }, { "epoch": 1.7892208665577867, "grad_norm": 1.1245905485884407, "learning_rate": 4.192992278156141e-06, "loss": 0.0973, "step": 4656 }, { "epoch": 1.7899894322221155, "grad_norm": 1.3452239774912416, "learning_rate": 4.188579216955042e-06, "loss": 0.1044, "step": 4658 }, { "epoch": 1.7907579978864443, "grad_norm": 1.212847306105373, "learning_rate": 4.184166804856289e-06, "loss": 0.1063, "step": 4660 }, { "epoch": 1.7915265635507733, "grad_norm": 0.9642356213181992, "learning_rate": 4.179755045389624e-06, "loss": 0.102, "step": 4662 }, { "epoch": 1.7922951292151024, "grad_norm": 1.083872757153357, "learning_rate": 4.175343942084269e-06, "loss": 0.1033, "step": 4664 }, { "epoch": 1.7930636948794314, "grad_norm": 1.115234797205667, "learning_rate": 4.170933498468918e-06, "loss": 0.0895, "step": 4666 }, { "epoch": 1.7938322605437602, "grad_norm": 1.1433640024198233, "learning_rate": 4.166523718071741e-06, "loss": 0.0839, "step": 4668 }, { "epoch": 1.794600826208089, "grad_norm": 1.0428943637833972, "learning_rate": 4.162114604420374e-06, "loss": 0.0927, "step": 4670 }, { "epoch": 1.795369391872418, "grad_norm": 1.0136871048155578, "learning_rate": 4.1577061610419246e-06, "loss": 0.085, "step": 4672 }, { "epoch": 1.796137957536747, "grad_norm": 1.0480390926617074, "learning_rate": 4.1532983914629606e-06, "loss": 0.0919, "step": 4674 }, { "epoch": 1.7969065232010761, "grad_norm": 1.0500888980915524, "learning_rate": 4.148891299209513e-06, "loss": 0.0819, "step": 4676 }, { "epoch": 1.797675088865405, "grad_norm": 1.157085807104773, "learning_rate": 4.144484887807064e-06, "loss": 0.1041, "step": 4678 }, { "epoch": 1.7984436545297338, "grad_norm": 1.0181345614874848, "learning_rate": 4.140079160780561e-06, "loss": 0.089, "step": 4680 }, { "epoch": 1.7992122201940628, "grad_norm": 1.1266406730557732, "learning_rate": 4.135674121654398e-06, "loss": 0.0957, "step": 4682 }, { "epoch": 1.7999807858583918, "grad_norm": 1.0966761526060145, "learning_rate": 4.13126977395242e-06, "loss": 0.0838, "step": 4684 }, { "epoch": 1.8007493515227209, "grad_norm": 1.1268195564261492, "learning_rate": 4.126866121197921e-06, "loss": 0.0934, "step": 4686 }, { "epoch": 1.8015179171870497, "grad_norm": 1.1365970789889293, "learning_rate": 4.122463166913635e-06, "loss": 0.0954, "step": 4688 }, { "epoch": 1.8022864828513785, "grad_norm": 1.2365859193047417, "learning_rate": 4.1180609146217416e-06, "loss": 0.1027, "step": 4690 }, { "epoch": 1.8030550485157075, "grad_norm": 1.2299043237327973, "learning_rate": 4.113659367843854e-06, "loss": 0.1019, "step": 4692 }, { "epoch": 1.8038236141800366, "grad_norm": 0.9524458304708779, "learning_rate": 4.109258530101025e-06, "loss": 0.085, "step": 4694 }, { "epoch": 1.8045921798443656, "grad_norm": 1.0637659868631943, "learning_rate": 4.104858404913737e-06, "loss": 0.0842, "step": 4696 }, { "epoch": 1.8053607455086944, "grad_norm": 1.203912151761302, "learning_rate": 4.1004589958019085e-06, "loss": 0.0983, "step": 4698 }, { "epoch": 1.8061293111730232, "grad_norm": 1.0371221567077997, "learning_rate": 4.096060306284877e-06, "loss": 0.0947, "step": 4700 }, { "epoch": 1.8068978768373523, "grad_norm": 1.0099838252426303, "learning_rate": 4.091662339881411e-06, "loss": 0.0855, "step": 4702 }, { "epoch": 1.8076664425016813, "grad_norm": 1.1533798582382528, "learning_rate": 4.087265100109697e-06, "loss": 0.121, "step": 4704 }, { "epoch": 1.8084350081660103, "grad_norm": 1.021263818738618, "learning_rate": 4.082868590487339e-06, "loss": 0.0839, "step": 4706 }, { "epoch": 1.8092035738303391, "grad_norm": 0.990568809744524, "learning_rate": 4.078472814531361e-06, "loss": 0.0871, "step": 4708 }, { "epoch": 1.809972139494668, "grad_norm": 1.1367456737803596, "learning_rate": 4.074077775758197e-06, "loss": 0.0926, "step": 4710 }, { "epoch": 1.810740705158997, "grad_norm": 1.1698055083864654, "learning_rate": 4.069683477683692e-06, "loss": 0.0949, "step": 4712 }, { "epoch": 1.811509270823326, "grad_norm": 1.1941798838911144, "learning_rate": 4.0652899238231016e-06, "loss": 0.107, "step": 4714 }, { "epoch": 1.812277836487655, "grad_norm": 1.1562362223047369, "learning_rate": 4.060897117691082e-06, "loss": 0.091, "step": 4716 }, { "epoch": 1.8130464021519839, "grad_norm": 1.1651749940505352, "learning_rate": 4.05650506280169e-06, "loss": 0.1023, "step": 4718 }, { "epoch": 1.8138149678163127, "grad_norm": 1.2234796218845423, "learning_rate": 4.052113762668386e-06, "loss": 0.0938, "step": 4720 }, { "epoch": 1.8145835334806417, "grad_norm": 1.0468155552003473, "learning_rate": 4.047723220804024e-06, "loss": 0.1018, "step": 4722 }, { "epoch": 1.8153520991449708, "grad_norm": 1.0209252596049274, "learning_rate": 4.043333440720853e-06, "loss": 0.0919, "step": 4724 }, { "epoch": 1.8161206648092998, "grad_norm": 1.047104540212793, "learning_rate": 4.038944425930511e-06, "loss": 0.0939, "step": 4726 }, { "epoch": 1.8168892304736286, "grad_norm": 0.9469641867154628, "learning_rate": 4.034556179944024e-06, "loss": 0.0852, "step": 4728 }, { "epoch": 1.8176577961379574, "grad_norm": 1.0317281862428862, "learning_rate": 4.030168706271805e-06, "loss": 0.0866, "step": 4730 }, { "epoch": 1.8184263618022865, "grad_norm": 1.1317858605415871, "learning_rate": 4.0257820084236465e-06, "loss": 0.0895, "step": 4732 }, { "epoch": 1.8191949274666155, "grad_norm": 1.0928591505252303, "learning_rate": 4.02139608990872e-06, "loss": 0.1004, "step": 4734 }, { "epoch": 1.8199634931309445, "grad_norm": 1.1298391139943653, "learning_rate": 4.0170109542355755e-06, "loss": 0.1002, "step": 4736 }, { "epoch": 1.8207320587952733, "grad_norm": 1.1946772960226626, "learning_rate": 4.012626604912138e-06, "loss": 0.0946, "step": 4738 }, { "epoch": 1.8215006244596021, "grad_norm": 1.2216716564098675, "learning_rate": 4.0082430454457005e-06, "loss": 0.0988, "step": 4740 }, { "epoch": 1.8222691901239312, "grad_norm": 1.1216778674197248, "learning_rate": 4.003860279342927e-06, "loss": 0.0871, "step": 4742 }, { "epoch": 1.8230377557882602, "grad_norm": 1.1144918644343427, "learning_rate": 3.999478310109843e-06, "loss": 0.0899, "step": 4744 }, { "epoch": 1.8238063214525893, "grad_norm": 0.9930313533731132, "learning_rate": 3.995097141251839e-06, "loss": 0.0827, "step": 4746 }, { "epoch": 1.824574887116918, "grad_norm": 1.1300722455488208, "learning_rate": 3.990716776273666e-06, "loss": 0.0866, "step": 4748 }, { "epoch": 1.8253434527812469, "grad_norm": 1.0591205466583202, "learning_rate": 3.986337218679431e-06, "loss": 0.091, "step": 4750 }, { "epoch": 1.826112018445576, "grad_norm": 1.0521721287821877, "learning_rate": 3.981958471972593e-06, "loss": 0.0873, "step": 4752 }, { "epoch": 1.826880584109905, "grad_norm": 1.1168644720505567, "learning_rate": 3.977580539655967e-06, "loss": 0.0921, "step": 4754 }, { "epoch": 1.827649149774234, "grad_norm": 1.1623218814182714, "learning_rate": 3.973203425231715e-06, "loss": 0.0953, "step": 4756 }, { "epoch": 1.8284177154385628, "grad_norm": 1.0605710582779981, "learning_rate": 3.968827132201339e-06, "loss": 0.0973, "step": 4758 }, { "epoch": 1.8291862811028916, "grad_norm": 1.2495052943750342, "learning_rate": 3.96445166406569e-06, "loss": 0.1088, "step": 4760 }, { "epoch": 1.8299548467672206, "grad_norm": 1.023522831965709, "learning_rate": 3.96007702432496e-06, "loss": 0.0888, "step": 4762 }, { "epoch": 1.8307234124315497, "grad_norm": 1.100119470658054, "learning_rate": 3.9557032164786735e-06, "loss": 0.1017, "step": 4764 }, { "epoch": 1.8314919780958787, "grad_norm": 1.0375537214808888, "learning_rate": 3.9513302440256945e-06, "loss": 0.0986, "step": 4766 }, { "epoch": 1.8322605437602075, "grad_norm": 1.0941905700943497, "learning_rate": 3.946958110464216e-06, "loss": 0.0977, "step": 4768 }, { "epoch": 1.8330291094245363, "grad_norm": 1.0767762941897385, "learning_rate": 3.942586819291756e-06, "loss": 0.1021, "step": 4770 }, { "epoch": 1.8337976750888654, "grad_norm": 1.042690910720241, "learning_rate": 3.938216374005167e-06, "loss": 0.1022, "step": 4772 }, { "epoch": 1.8345662407531944, "grad_norm": 1.116393042155018, "learning_rate": 3.933846778100619e-06, "loss": 0.0863, "step": 4774 }, { "epoch": 1.8353348064175234, "grad_norm": 0.9979373505620446, "learning_rate": 3.929478035073603e-06, "loss": 0.091, "step": 4776 }, { "epoch": 1.8361033720818523, "grad_norm": 1.0946128842759564, "learning_rate": 3.925110148418932e-06, "loss": 0.0962, "step": 4778 }, { "epoch": 1.836871937746181, "grad_norm": 1.062600924951768, "learning_rate": 3.9207431216307275e-06, "loss": 0.0826, "step": 4780 }, { "epoch": 1.83764050341051, "grad_norm": 0.9712904491662524, "learning_rate": 3.916376958202431e-06, "loss": 0.0853, "step": 4782 }, { "epoch": 1.8384090690748391, "grad_norm": 1.0615733510646372, "learning_rate": 3.912011661626783e-06, "loss": 0.0987, "step": 4784 }, { "epoch": 1.8391776347391682, "grad_norm": 1.2893849953174341, "learning_rate": 3.907647235395839e-06, "loss": 0.0894, "step": 4786 }, { "epoch": 1.839946200403497, "grad_norm": 1.0674601839978484, "learning_rate": 3.903283683000955e-06, "loss": 0.0875, "step": 4788 }, { "epoch": 1.8407147660678258, "grad_norm": 1.1320420177716157, "learning_rate": 3.898921007932791e-06, "loss": 0.1026, "step": 4790 }, { "epoch": 1.8414833317321548, "grad_norm": 1.1142743084030489, "learning_rate": 3.894559213681299e-06, "loss": 0.0869, "step": 4792 }, { "epoch": 1.8422518973964839, "grad_norm": 1.0852750301220597, "learning_rate": 3.890198303735734e-06, "loss": 0.0861, "step": 4794 }, { "epoch": 1.843020463060813, "grad_norm": 1.279793927156826, "learning_rate": 3.885838281584638e-06, "loss": 0.0995, "step": 4796 }, { "epoch": 1.8437890287251417, "grad_norm": 1.1214554382277662, "learning_rate": 3.881479150715842e-06, "loss": 0.0919, "step": 4798 }, { "epoch": 1.8445575943894705, "grad_norm": 0.9417748919094449, "learning_rate": 3.877120914616469e-06, "loss": 0.08, "step": 4800 }, { "epoch": 1.8453261600537996, "grad_norm": 1.0742320594241295, "learning_rate": 3.872763576772925e-06, "loss": 0.0881, "step": 4802 }, { "epoch": 1.8460947257181286, "grad_norm": 1.0522285674391252, "learning_rate": 3.8684071406708935e-06, "loss": 0.0991, "step": 4804 }, { "epoch": 1.8468632913824576, "grad_norm": 1.071434437477333, "learning_rate": 3.86405160979534e-06, "loss": 0.0923, "step": 4806 }, { "epoch": 1.8476318570467865, "grad_norm": 1.1154165601424344, "learning_rate": 3.859696987630509e-06, "loss": 0.1016, "step": 4808 }, { "epoch": 1.8484004227111153, "grad_norm": 1.0845937223494313, "learning_rate": 3.8553432776599085e-06, "loss": 0.0953, "step": 4810 }, { "epoch": 1.8491689883754443, "grad_norm": 1.1254340281287118, "learning_rate": 3.8509904833663235e-06, "loss": 0.0884, "step": 4812 }, { "epoch": 1.8499375540397733, "grad_norm": 1.1070646705138054, "learning_rate": 3.846638608231809e-06, "loss": 0.0937, "step": 4814 }, { "epoch": 1.8507061197041024, "grad_norm": 1.2029479894997508, "learning_rate": 3.842287655737678e-06, "loss": 0.1113, "step": 4816 }, { "epoch": 1.8514746853684312, "grad_norm": 1.1070558608648937, "learning_rate": 3.8379376293645115e-06, "loss": 0.087, "step": 4818 }, { "epoch": 1.85224325103276, "grad_norm": 1.1979097537558734, "learning_rate": 3.833588532592143e-06, "loss": 0.0939, "step": 4820 }, { "epoch": 1.853011816697089, "grad_norm": 1.0350659883277837, "learning_rate": 3.829240368899672e-06, "loss": 0.0793, "step": 4822 }, { "epoch": 1.853780382361418, "grad_norm": 0.984840969077864, "learning_rate": 3.82489314176544e-06, "loss": 0.0932, "step": 4824 }, { "epoch": 1.854548948025747, "grad_norm": 1.201813893914687, "learning_rate": 3.820546854667047e-06, "loss": 0.0958, "step": 4826 }, { "epoch": 1.855317513690076, "grad_norm": 1.0029484228220262, "learning_rate": 3.816201511081339e-06, "loss": 0.0869, "step": 4828 }, { "epoch": 1.8560860793544047, "grad_norm": 0.9539061255557659, "learning_rate": 3.811857114484408e-06, "loss": 0.0948, "step": 4830 }, { "epoch": 1.8568546450187338, "grad_norm": 1.1102054930813652, "learning_rate": 3.807513668351586e-06, "loss": 0.0934, "step": 4832 }, { "epoch": 1.8576232106830628, "grad_norm": 1.1892569878107253, "learning_rate": 3.8031711761574506e-06, "loss": 0.0968, "step": 4834 }, { "epoch": 1.8583917763473918, "grad_norm": 1.0730579363844297, "learning_rate": 3.798829641375808e-06, "loss": 0.0897, "step": 4836 }, { "epoch": 1.8591603420117206, "grad_norm": 0.9759701681369963, "learning_rate": 3.7944890674797024e-06, "loss": 0.088, "step": 4838 }, { "epoch": 1.8599289076760495, "grad_norm": 1.1685631283972417, "learning_rate": 3.790149457941411e-06, "loss": 0.0932, "step": 4840 }, { "epoch": 1.8606974733403785, "grad_norm": 1.121131149696113, "learning_rate": 3.7858108162324386e-06, "loss": 0.0931, "step": 4842 }, { "epoch": 1.8614660390047075, "grad_norm": 1.038183502102688, "learning_rate": 3.7814731458235142e-06, "loss": 0.0811, "step": 4844 }, { "epoch": 1.8622346046690366, "grad_norm": 1.1559073074918258, "learning_rate": 3.777136450184592e-06, "loss": 0.1015, "step": 4846 }, { "epoch": 1.8630031703333654, "grad_norm": 0.9573810023204248, "learning_rate": 3.772800732784846e-06, "loss": 0.0916, "step": 4848 }, { "epoch": 1.8637717359976942, "grad_norm": 1.271498041354363, "learning_rate": 3.768465997092663e-06, "loss": 0.1153, "step": 4850 }, { "epoch": 1.8645403016620232, "grad_norm": 1.0300917312519247, "learning_rate": 3.7641322465756508e-06, "loss": 0.0894, "step": 4852 }, { "epoch": 1.8653088673263523, "grad_norm": 1.1612954100103559, "learning_rate": 3.759799484700628e-06, "loss": 0.0947, "step": 4854 }, { "epoch": 1.8660774329906813, "grad_norm": 1.0620574474416404, "learning_rate": 3.7554677149336186e-06, "loss": 0.0915, "step": 4856 }, { "epoch": 1.86684599865501, "grad_norm": 1.1943505287343492, "learning_rate": 3.7511369407398584e-06, "loss": 0.1082, "step": 4858 }, { "epoch": 1.867614564319339, "grad_norm": 1.178051212876048, "learning_rate": 3.7468071655837805e-06, "loss": 0.1087, "step": 4860 }, { "epoch": 1.868383129983668, "grad_norm": 1.003335057274852, "learning_rate": 3.7424783929290263e-06, "loss": 0.0924, "step": 4862 }, { "epoch": 1.869151695647997, "grad_norm": 1.0524488529140674, "learning_rate": 3.738150626238426e-06, "loss": 0.0961, "step": 4864 }, { "epoch": 1.869920261312326, "grad_norm": 1.179031815265349, "learning_rate": 3.7338238689740137e-06, "loss": 0.0979, "step": 4866 }, { "epoch": 1.8706888269766548, "grad_norm": 1.055910668467737, "learning_rate": 3.7294981245970096e-06, "loss": 0.0935, "step": 4868 }, { "epoch": 1.8714573926409837, "grad_norm": 1.034253989545545, "learning_rate": 3.7251733965678293e-06, "loss": 0.0884, "step": 4870 }, { "epoch": 1.8722259583053127, "grad_norm": 1.095218259043101, "learning_rate": 3.7208496883460704e-06, "loss": 0.1005, "step": 4872 }, { "epoch": 1.8729945239696417, "grad_norm": 0.9464149061687653, "learning_rate": 3.71652700339052e-06, "loss": 0.0896, "step": 4874 }, { "epoch": 1.8737630896339708, "grad_norm": 1.0599012153095106, "learning_rate": 3.712205345159139e-06, "loss": 0.0995, "step": 4876 }, { "epoch": 1.8745316552982996, "grad_norm": 1.1876342032342249, "learning_rate": 3.7078847171090737e-06, "loss": 0.1003, "step": 4878 }, { "epoch": 1.8753002209626284, "grad_norm": 1.064321583508167, "learning_rate": 3.703565122696644e-06, "loss": 0.0905, "step": 4880 }, { "epoch": 1.8760687866269574, "grad_norm": 1.1890320722151528, "learning_rate": 3.6992465653773425e-06, "loss": 0.093, "step": 4882 }, { "epoch": 1.8768373522912865, "grad_norm": 1.0063593724706235, "learning_rate": 3.6949290486058326e-06, "loss": 0.0903, "step": 4884 }, { "epoch": 1.8776059179556155, "grad_norm": 1.1610092882160976, "learning_rate": 3.690612575835947e-06, "loss": 0.1076, "step": 4886 }, { "epoch": 1.8783744836199443, "grad_norm": 1.0151462357489245, "learning_rate": 3.6862971505206803e-06, "loss": 0.0859, "step": 4888 }, { "epoch": 1.8791430492842731, "grad_norm": 0.9309711369347431, "learning_rate": 3.6819827761121885e-06, "loss": 0.0863, "step": 4890 }, { "epoch": 1.8799116149486021, "grad_norm": 1.136682075309015, "learning_rate": 3.67766945606179e-06, "loss": 0.0833, "step": 4892 }, { "epoch": 1.8806801806129312, "grad_norm": 0.9888111308289552, "learning_rate": 3.673357193819961e-06, "loss": 0.0954, "step": 4894 }, { "epoch": 1.8814487462772602, "grad_norm": 1.033964390502172, "learning_rate": 3.6690459928363255e-06, "loss": 0.0878, "step": 4896 }, { "epoch": 1.882217311941589, "grad_norm": 1.2145273428415089, "learning_rate": 3.664735856559665e-06, "loss": 0.1026, "step": 4898 }, { "epoch": 1.8829858776059178, "grad_norm": 1.0237552760467146, "learning_rate": 3.660426788437907e-06, "loss": 0.0936, "step": 4900 }, { "epoch": 1.8837544432702469, "grad_norm": 0.9256912883002052, "learning_rate": 3.6561187919181195e-06, "loss": 0.0777, "step": 4902 }, { "epoch": 1.884523008934576, "grad_norm": 1.144722383329025, "learning_rate": 3.651811870446521e-06, "loss": 0.0817, "step": 4904 }, { "epoch": 1.885291574598905, "grad_norm": 1.2050805652083687, "learning_rate": 3.647506027468467e-06, "loss": 0.0959, "step": 4906 }, { "epoch": 1.8860601402632338, "grad_norm": 0.9111607815858482, "learning_rate": 3.643201266428448e-06, "loss": 0.0815, "step": 4908 }, { "epoch": 1.8868287059275626, "grad_norm": 1.0772061793552512, "learning_rate": 3.6388975907700935e-06, "loss": 0.0893, "step": 4910 }, { "epoch": 1.8875972715918916, "grad_norm": 1.1129189608734384, "learning_rate": 3.63459500393616e-06, "loss": 0.094, "step": 4912 }, { "epoch": 1.8883658372562206, "grad_norm": 1.2238880215877073, "learning_rate": 3.6302935093685375e-06, "loss": 0.1115, "step": 4914 }, { "epoch": 1.8891344029205497, "grad_norm": 1.0386614000295777, "learning_rate": 3.625993110508236e-06, "loss": 0.086, "step": 4916 }, { "epoch": 1.8899029685848785, "grad_norm": 1.0188056315035443, "learning_rate": 3.6216938107953967e-06, "loss": 0.0905, "step": 4918 }, { "epoch": 1.8906715342492073, "grad_norm": 0.950889624958373, "learning_rate": 3.6173956136692763e-06, "loss": 0.0907, "step": 4920 }, { "epoch": 1.8914400999135363, "grad_norm": 1.1159321177982797, "learning_rate": 3.6130985225682512e-06, "loss": 0.0968, "step": 4922 }, { "epoch": 1.8922086655778654, "grad_norm": 1.0725296671502595, "learning_rate": 3.608802540929811e-06, "loss": 0.0995, "step": 4924 }, { "epoch": 1.8929772312421944, "grad_norm": 0.9405815519142091, "learning_rate": 3.6045076721905615e-06, "loss": 0.087, "step": 4926 }, { "epoch": 1.8937457969065232, "grad_norm": 1.0780386830106605, "learning_rate": 3.6002139197862177e-06, "loss": 0.0953, "step": 4928 }, { "epoch": 1.894514362570852, "grad_norm": 1.053222357931341, "learning_rate": 3.595921287151597e-06, "loss": 0.0898, "step": 4930 }, { "epoch": 1.895282928235181, "grad_norm": 0.9189038192495935, "learning_rate": 3.5916297777206243e-06, "loss": 0.0764, "step": 4932 }, { "epoch": 1.89605149389951, "grad_norm": 1.0884497701155853, "learning_rate": 3.587339394926328e-06, "loss": 0.0982, "step": 4934 }, { "epoch": 1.8968200595638391, "grad_norm": 1.1324273158940088, "learning_rate": 3.5830501422008313e-06, "loss": 0.0831, "step": 4936 }, { "epoch": 1.897588625228168, "grad_norm": 1.0752868024028321, "learning_rate": 3.578762022975356e-06, "loss": 0.0992, "step": 4938 }, { "epoch": 1.8983571908924968, "grad_norm": 1.0492733446602431, "learning_rate": 3.5744750406802173e-06, "loss": 0.0881, "step": 4940 }, { "epoch": 1.8991257565568258, "grad_norm": 1.007424470918158, "learning_rate": 3.5701891987448165e-06, "loss": 0.0865, "step": 4942 }, { "epoch": 1.8998943222211548, "grad_norm": 1.2107700099699354, "learning_rate": 3.565904500597649e-06, "loss": 0.0924, "step": 4944 }, { "epoch": 1.9006628878854839, "grad_norm": 1.0970701199014863, "learning_rate": 3.5616209496662918e-06, "loss": 0.0922, "step": 4946 }, { "epoch": 1.9014314535498127, "grad_norm": 1.0514022877985034, "learning_rate": 3.5573385493774038e-06, "loss": 0.0915, "step": 4948 }, { "epoch": 1.9022000192141415, "grad_norm": 1.1091276651208755, "learning_rate": 3.5530573031567254e-06, "loss": 0.0862, "step": 4950 }, { "epoch": 1.9029685848784705, "grad_norm": 1.0059786606841976, "learning_rate": 3.548777214429072e-06, "loss": 0.0919, "step": 4952 }, { "epoch": 1.9037371505427996, "grad_norm": 1.0706104930756644, "learning_rate": 3.5444982866183354e-06, "loss": 0.0977, "step": 4954 }, { "epoch": 1.9045057162071286, "grad_norm": 0.986331021184669, "learning_rate": 3.540220523147474e-06, "loss": 0.0912, "step": 4956 }, { "epoch": 1.9052742818714574, "grad_norm": 1.075728011331635, "learning_rate": 3.53594392743852e-06, "loss": 0.0862, "step": 4958 }, { "epoch": 1.9060428475357862, "grad_norm": 1.0540976916613054, "learning_rate": 3.531668502912568e-06, "loss": 0.0864, "step": 4960 }, { "epoch": 1.9068114132001153, "grad_norm": 1.1046130159081344, "learning_rate": 3.5273942529897785e-06, "loss": 0.0952, "step": 4962 }, { "epoch": 1.9075799788644443, "grad_norm": 0.9376106804277698, "learning_rate": 3.5231211810893694e-06, "loss": 0.0904, "step": 4964 }, { "epoch": 1.9083485445287733, "grad_norm": 1.1903378635998756, "learning_rate": 3.5188492906296207e-06, "loss": 0.0904, "step": 4966 }, { "epoch": 1.9091171101931022, "grad_norm": 1.163617130642281, "learning_rate": 3.5145785850278597e-06, "loss": 0.0954, "step": 4968 }, { "epoch": 1.909885675857431, "grad_norm": 1.0933266067826832, "learning_rate": 3.510309067700474e-06, "loss": 0.0918, "step": 4970 }, { "epoch": 1.91065424152176, "grad_norm": 1.060310961707363, "learning_rate": 3.506040742062894e-06, "loss": 0.0971, "step": 4972 }, { "epoch": 1.911422807186089, "grad_norm": 1.0050410026625032, "learning_rate": 3.501773611529603e-06, "loss": 0.0918, "step": 4974 }, { "epoch": 1.912191372850418, "grad_norm": 1.0403980094969703, "learning_rate": 3.497507679514123e-06, "loss": 0.0826, "step": 4976 }, { "epoch": 1.9129599385147469, "grad_norm": 1.182731862735671, "learning_rate": 3.4932429494290206e-06, "loss": 0.0972, "step": 4978 }, { "epoch": 1.9137285041790757, "grad_norm": 0.9617924427222939, "learning_rate": 3.488979424685901e-06, "loss": 0.0862, "step": 4980 }, { "epoch": 1.9144970698434047, "grad_norm": 0.9575349681592041, "learning_rate": 3.4847171086954006e-06, "loss": 0.0831, "step": 4982 }, { "epoch": 1.9152656355077338, "grad_norm": 1.2161127329423909, "learning_rate": 3.4804560048671932e-06, "loss": 0.1073, "step": 4984 }, { "epoch": 1.9160342011720628, "grad_norm": 1.1021459299717744, "learning_rate": 3.4761961166099833e-06, "loss": 0.0856, "step": 4986 }, { "epoch": 1.9168027668363916, "grad_norm": 1.1310973786925955, "learning_rate": 3.471937447331499e-06, "loss": 0.0854, "step": 4988 }, { "epoch": 1.9175713325007204, "grad_norm": 1.1454632601729926, "learning_rate": 3.4676800004384994e-06, "loss": 0.0859, "step": 4990 }, { "epoch": 1.9183398981650495, "grad_norm": 1.0660129825739313, "learning_rate": 3.4634237793367576e-06, "loss": 0.0964, "step": 4992 }, { "epoch": 1.9191084638293785, "grad_norm": 1.0048737991198042, "learning_rate": 3.4591687874310765e-06, "loss": 0.0859, "step": 4994 }, { "epoch": 1.9198770294937075, "grad_norm": 1.1117006142617296, "learning_rate": 3.4549150281252635e-06, "loss": 0.1021, "step": 4996 }, { "epoch": 1.9206455951580363, "grad_norm": 1.0639550601082344, "learning_rate": 3.450662504822151e-06, "loss": 0.0869, "step": 4998 }, { "epoch": 1.9214141608223652, "grad_norm": 1.187548857283023, "learning_rate": 3.4464112209235744e-06, "loss": 0.1008, "step": 5000 }, { "epoch": 1.9214141608223652, "eval_loss": 0.14540965855121613, "eval_runtime": 389.9586, "eval_samples_per_second": 47.454, "eval_steps_per_second": 5.934, "step": 5000 }, { "epoch": 1.9221827264866942, "grad_norm": 1.0640255652907669, "learning_rate": 3.442161179830385e-06, "loss": 0.0819, "step": 5002 }, { "epoch": 1.9229512921510232, "grad_norm": 1.0768089311448044, "learning_rate": 3.4379123849424323e-06, "loss": 0.0889, "step": 5004 }, { "epoch": 1.9237198578153523, "grad_norm": 1.0461439405614625, "learning_rate": 3.4336648396585777e-06, "loss": 0.1012, "step": 5006 }, { "epoch": 1.924488423479681, "grad_norm": 1.1735305129389457, "learning_rate": 3.4294185473766724e-06, "loss": 0.0861, "step": 5008 }, { "epoch": 1.92525698914401, "grad_norm": 1.1233947971674458, "learning_rate": 3.4251735114935737e-06, "loss": 0.0925, "step": 5010 }, { "epoch": 1.926025554808339, "grad_norm": 1.0148486982947422, "learning_rate": 3.42092973540513e-06, "loss": 0.088, "step": 5012 }, { "epoch": 1.926794120472668, "grad_norm": 1.0053126323544739, "learning_rate": 3.4166872225061846e-06, "loss": 0.0905, "step": 5014 }, { "epoch": 1.927562686136997, "grad_norm": 1.067334167153116, "learning_rate": 3.4124459761905664e-06, "loss": 0.0917, "step": 5016 }, { "epoch": 1.9283312518013258, "grad_norm": 1.279363700116257, "learning_rate": 3.408205999851094e-06, "loss": 0.1072, "step": 5018 }, { "epoch": 1.9290998174656546, "grad_norm": 1.0736496664390172, "learning_rate": 3.4039672968795725e-06, "loss": 0.0828, "step": 5020 }, { "epoch": 1.9298683831299837, "grad_norm": 1.071732054597203, "learning_rate": 3.3997298706667805e-06, "loss": 0.0967, "step": 5022 }, { "epoch": 1.9306369487943127, "grad_norm": 1.1350840007770846, "learning_rate": 3.3954937246024797e-06, "loss": 0.0921, "step": 5024 }, { "epoch": 1.9314055144586417, "grad_norm": 0.9733898662264998, "learning_rate": 3.3912588620754127e-06, "loss": 0.0822, "step": 5026 }, { "epoch": 1.9321740801229705, "grad_norm": 0.8974299870222441, "learning_rate": 3.387025286473286e-06, "loss": 0.0733, "step": 5028 }, { "epoch": 1.9329426457872994, "grad_norm": 1.1287548179639084, "learning_rate": 3.3827930011827837e-06, "loss": 0.0903, "step": 5030 }, { "epoch": 1.9337112114516284, "grad_norm": 1.035329830914374, "learning_rate": 3.3785620095895564e-06, "loss": 0.0897, "step": 5032 }, { "epoch": 1.9344797771159574, "grad_norm": 1.2618085835087465, "learning_rate": 3.374332315078216e-06, "loss": 0.0904, "step": 5034 }, { "epoch": 1.9352483427802865, "grad_norm": 1.004696730356712, "learning_rate": 3.3701039210323387e-06, "loss": 0.0911, "step": 5036 }, { "epoch": 1.9360169084446153, "grad_norm": 0.9294516447847803, "learning_rate": 3.3658768308344645e-06, "loss": 0.0866, "step": 5038 }, { "epoch": 1.936785474108944, "grad_norm": 1.104237227275698, "learning_rate": 3.3616510478660835e-06, "loss": 0.0926, "step": 5040 }, { "epoch": 1.9375540397732731, "grad_norm": 1.112844263876308, "learning_rate": 3.3574265755076452e-06, "loss": 0.0888, "step": 5042 }, { "epoch": 1.9383226054376022, "grad_norm": 1.0243076607095885, "learning_rate": 3.353203417138551e-06, "loss": 0.0731, "step": 5044 }, { "epoch": 1.939091171101931, "grad_norm": 1.0985542603737928, "learning_rate": 3.348981576137147e-06, "loss": 0.0905, "step": 5046 }, { "epoch": 1.93985973676626, "grad_norm": 1.0514640232707118, "learning_rate": 3.3447610558807262e-06, "loss": 0.0854, "step": 5048 }, { "epoch": 1.9406283024305888, "grad_norm": 1.019549110475732, "learning_rate": 3.3405418597455287e-06, "loss": 0.0917, "step": 5050 }, { "epoch": 1.9413968680949178, "grad_norm": 1.0885477027583415, "learning_rate": 3.3363239911067325e-06, "loss": 0.0849, "step": 5052 }, { "epoch": 1.9421654337592469, "grad_norm": 1.0929009472129827, "learning_rate": 3.3321074533384545e-06, "loss": 0.0847, "step": 5054 }, { "epoch": 1.9429339994235757, "grad_norm": 1.1632459494469913, "learning_rate": 3.3278922498137455e-06, "loss": 0.0956, "step": 5056 }, { "epoch": 1.9437025650879047, "grad_norm": 1.0458447120447687, "learning_rate": 3.323678383904594e-06, "loss": 0.0941, "step": 5058 }, { "epoch": 1.9444711307522335, "grad_norm": 1.0549216762622893, "learning_rate": 3.3194658589819096e-06, "loss": 0.0812, "step": 5060 }, { "epoch": 1.9452396964165626, "grad_norm": 1.0375140100403766, "learning_rate": 3.315254678415537e-06, "loss": 0.0916, "step": 5062 }, { "epoch": 1.9460082620808916, "grad_norm": 1.0734096478714452, "learning_rate": 3.3110448455742427e-06, "loss": 0.0911, "step": 5064 }, { "epoch": 1.9467768277452204, "grad_norm": 0.9973192388787151, "learning_rate": 3.306836363825715e-06, "loss": 0.084, "step": 5066 }, { "epoch": 1.9475453934095495, "grad_norm": 1.039758717309691, "learning_rate": 3.3026292365365608e-06, "loss": 0.094, "step": 5068 }, { "epoch": 1.9483139590738783, "grad_norm": 1.040193199040735, "learning_rate": 3.2984234670723046e-06, "loss": 0.0814, "step": 5070 }, { "epoch": 1.9490825247382073, "grad_norm": 1.1469173621737845, "learning_rate": 3.2942190587973867e-06, "loss": 0.0898, "step": 5072 }, { "epoch": 1.9498510904025363, "grad_norm": 1.160951353550111, "learning_rate": 3.290016015075152e-06, "loss": 0.089, "step": 5074 }, { "epoch": 1.9506196560668652, "grad_norm": 1.0675773898950676, "learning_rate": 3.285814339267859e-06, "loss": 0.0923, "step": 5076 }, { "epoch": 1.9513882217311942, "grad_norm": 1.216161501697383, "learning_rate": 3.2816140347366725e-06, "loss": 0.1021, "step": 5078 }, { "epoch": 1.952156787395523, "grad_norm": 1.2093675143900617, "learning_rate": 3.2774151048416567e-06, "loss": 0.0956, "step": 5080 }, { "epoch": 1.952925353059852, "grad_norm": 1.0636683965498348, "learning_rate": 3.273217552941779e-06, "loss": 0.0862, "step": 5082 }, { "epoch": 1.953693918724181, "grad_norm": 1.199142124327118, "learning_rate": 3.269021382394905e-06, "loss": 0.0876, "step": 5084 }, { "epoch": 1.95446248438851, "grad_norm": 1.0836690357929495, "learning_rate": 3.2648265965577936e-06, "loss": 0.096, "step": 5086 }, { "epoch": 1.955231050052839, "grad_norm": 1.1152117539616224, "learning_rate": 3.260633198786093e-06, "loss": 0.0842, "step": 5088 }, { "epoch": 1.9559996157171677, "grad_norm": 1.0586258139616203, "learning_rate": 3.2564411924343477e-06, "loss": 0.0822, "step": 5090 }, { "epoch": 1.9567681813814968, "grad_norm": 1.1041461169386806, "learning_rate": 3.2522505808559823e-06, "loss": 0.087, "step": 5092 }, { "epoch": 1.9575367470458258, "grad_norm": 1.1821109633791267, "learning_rate": 3.248061367403312e-06, "loss": 0.0998, "step": 5094 }, { "epoch": 1.9583053127101546, "grad_norm": 1.0791140522965197, "learning_rate": 3.2438735554275304e-06, "loss": 0.0878, "step": 5096 }, { "epoch": 1.9590738783744837, "grad_norm": 1.01311442177349, "learning_rate": 3.2396871482787107e-06, "loss": 0.0977, "step": 5098 }, { "epoch": 1.9598424440388125, "grad_norm": 1.1582970878045296, "learning_rate": 3.235502149305798e-06, "loss": 0.0885, "step": 5100 }, { "epoch": 1.9606110097031415, "grad_norm": 1.2035694051117733, "learning_rate": 3.2313185618566186e-06, "loss": 0.0883, "step": 5102 }, { "epoch": 1.9613795753674705, "grad_norm": 1.041172922448694, "learning_rate": 3.227136389277863e-06, "loss": 0.0916, "step": 5104 }, { "epoch": 1.9621481410317994, "grad_norm": 1.0722758956950673, "learning_rate": 3.2229556349150947e-06, "loss": 0.086, "step": 5106 }, { "epoch": 1.9629167066961284, "grad_norm": 1.0462558415102876, "learning_rate": 3.218776302112738e-06, "loss": 0.0915, "step": 5108 }, { "epoch": 1.9636852723604572, "grad_norm": 1.0667185834442787, "learning_rate": 3.214598394214086e-06, "loss": 0.0895, "step": 5110 }, { "epoch": 1.9644538380247862, "grad_norm": 1.1299580700164114, "learning_rate": 3.210421914561287e-06, "loss": 0.0906, "step": 5112 }, { "epoch": 1.9652224036891153, "grad_norm": 0.9479065965035364, "learning_rate": 3.2062468664953485e-06, "loss": 0.084, "step": 5114 }, { "epoch": 1.965990969353444, "grad_norm": 0.9546008919071777, "learning_rate": 3.2020732533561296e-06, "loss": 0.079, "step": 5116 }, { "epoch": 1.9667595350177731, "grad_norm": 1.3189747436273875, "learning_rate": 3.197901078482349e-06, "loss": 0.0923, "step": 5118 }, { "epoch": 1.967528100682102, "grad_norm": 1.0055350281229603, "learning_rate": 3.1937303452115663e-06, "loss": 0.0875, "step": 5120 }, { "epoch": 1.968296666346431, "grad_norm": 1.2036440514149829, "learning_rate": 3.189561056880195e-06, "loss": 0.1046, "step": 5122 }, { "epoch": 1.96906523201076, "grad_norm": 0.9935621225258402, "learning_rate": 3.18539321682349e-06, "loss": 0.0809, "step": 5124 }, { "epoch": 1.9698337976750888, "grad_norm": 1.0731661662463428, "learning_rate": 3.1812268283755444e-06, "loss": 0.0927, "step": 5126 }, { "epoch": 1.9706023633394179, "grad_norm": 1.0501596699969362, "learning_rate": 3.1770618948692935e-06, "loss": 0.09, "step": 5128 }, { "epoch": 1.9713709290037467, "grad_norm": 1.1344780894958448, "learning_rate": 3.1728984196365097e-06, "loss": 0.0848, "step": 5130 }, { "epoch": 1.9721394946680757, "grad_norm": 1.068551060648597, "learning_rate": 3.1687364060077965e-06, "loss": 0.0868, "step": 5132 }, { "epoch": 1.9729080603324047, "grad_norm": 1.0178326531696174, "learning_rate": 3.164575857312588e-06, "loss": 0.0867, "step": 5134 }, { "epoch": 1.9736766259967335, "grad_norm": 1.1170355819671527, "learning_rate": 3.160416776879149e-06, "loss": 0.0959, "step": 5136 }, { "epoch": 1.9744451916610626, "grad_norm": 1.0718809565630478, "learning_rate": 3.15625916803457e-06, "loss": 0.0939, "step": 5138 }, { "epoch": 1.9752137573253914, "grad_norm": 0.9836688699727424, "learning_rate": 3.1521030341047564e-06, "loss": 0.079, "step": 5140 }, { "epoch": 1.9759823229897204, "grad_norm": 1.0343439393682807, "learning_rate": 3.147948378414445e-06, "loss": 0.0812, "step": 5142 }, { "epoch": 1.9767508886540495, "grad_norm": 1.2523617945769072, "learning_rate": 3.143795204287182e-06, "loss": 0.0952, "step": 5144 }, { "epoch": 1.9775194543183783, "grad_norm": 1.186682107011038, "learning_rate": 3.139643515045332e-06, "loss": 0.085, "step": 5146 }, { "epoch": 1.9782880199827073, "grad_norm": 1.099467549115586, "learning_rate": 3.1354933140100725e-06, "loss": 0.0874, "step": 5148 }, { "epoch": 1.9790565856470361, "grad_norm": 0.9245152146753606, "learning_rate": 3.1313446045013863e-06, "loss": 0.0809, "step": 5150 }, { "epoch": 1.9798251513113652, "grad_norm": 1.1012285018689663, "learning_rate": 3.1271973898380697e-06, "loss": 0.0911, "step": 5152 }, { "epoch": 1.9805937169756942, "grad_norm": 1.123409209203501, "learning_rate": 3.123051673337716e-06, "loss": 0.0849, "step": 5154 }, { "epoch": 1.981362282640023, "grad_norm": 1.1273026283984429, "learning_rate": 3.118907458316722e-06, "loss": 0.0899, "step": 5156 }, { "epoch": 1.982130848304352, "grad_norm": 1.0992188793190392, "learning_rate": 3.1147647480902876e-06, "loss": 0.0907, "step": 5158 }, { "epoch": 1.9828994139686809, "grad_norm": 1.0576301883033128, "learning_rate": 3.1106235459724053e-06, "loss": 0.0884, "step": 5160 }, { "epoch": 1.98366797963301, "grad_norm": 1.038220969000617, "learning_rate": 3.1064838552758602e-06, "loss": 0.0895, "step": 5162 }, { "epoch": 1.984436545297339, "grad_norm": 1.1243005101535595, "learning_rate": 3.1023456793122343e-06, "loss": 0.0942, "step": 5164 }, { "epoch": 1.9852051109616677, "grad_norm": 1.0801124428556277, "learning_rate": 3.098209021391888e-06, "loss": 0.0878, "step": 5166 }, { "epoch": 1.9859736766259968, "grad_norm": 1.0985176873336528, "learning_rate": 3.0940738848239753e-06, "loss": 0.0897, "step": 5168 }, { "epoch": 1.9867422422903256, "grad_norm": 1.0800958102749034, "learning_rate": 3.089940272916432e-06, "loss": 0.089, "step": 5170 }, { "epoch": 1.9875108079546546, "grad_norm": 1.0535626883223101, "learning_rate": 3.08580818897597e-06, "loss": 0.0869, "step": 5172 }, { "epoch": 1.9882793736189837, "grad_norm": 1.2074365276884156, "learning_rate": 3.0816776363080835e-06, "loss": 0.0925, "step": 5174 }, { "epoch": 1.9890479392833125, "grad_norm": 1.1236928678120803, "learning_rate": 3.077548618217042e-06, "loss": 0.0908, "step": 5176 }, { "epoch": 1.9898165049476415, "grad_norm": 0.9760050642601811, "learning_rate": 3.0734211380058842e-06, "loss": 0.0804, "step": 5178 }, { "epoch": 1.9905850706119703, "grad_norm": 1.1857422892573475, "learning_rate": 3.0692951989764182e-06, "loss": 0.098, "step": 5180 }, { "epoch": 1.9913536362762994, "grad_norm": 1.1796342451255912, "learning_rate": 3.065170804429223e-06, "loss": 0.104, "step": 5182 }, { "epoch": 1.9921222019406284, "grad_norm": 1.0875762747399709, "learning_rate": 3.0610479576636374e-06, "loss": 0.1003, "step": 5184 }, { "epoch": 1.9928907676049572, "grad_norm": 0.9627288827682604, "learning_rate": 3.056926661977767e-06, "loss": 0.0831, "step": 5186 }, { "epoch": 1.9936593332692862, "grad_norm": 1.0139332033564048, "learning_rate": 3.052806920668474e-06, "loss": 0.0907, "step": 5188 }, { "epoch": 1.994427898933615, "grad_norm": 0.9673378456720474, "learning_rate": 3.048688737031379e-06, "loss": 0.0815, "step": 5190 }, { "epoch": 1.995196464597944, "grad_norm": 1.1733977656641028, "learning_rate": 3.0445721143608503e-06, "loss": 0.098, "step": 5192 }, { "epoch": 1.9959650302622731, "grad_norm": 0.998580448782166, "learning_rate": 3.0404570559500158e-06, "loss": 0.0866, "step": 5194 }, { "epoch": 1.996733595926602, "grad_norm": 1.0805784680067922, "learning_rate": 3.0363435650907465e-06, "loss": 0.0789, "step": 5196 }, { "epoch": 1.997502161590931, "grad_norm": 1.1170820148042138, "learning_rate": 3.0322316450736617e-06, "loss": 0.0991, "step": 5198 }, { "epoch": 1.9982707272552598, "grad_norm": 1.0263460923634076, "learning_rate": 3.0281212991881253e-06, "loss": 0.0751, "step": 5200 }, { "epoch": 1.9990392929195888, "grad_norm": 0.9929046083965724, "learning_rate": 3.024012530722238e-06, "loss": 0.0828, "step": 5202 }, { "epoch": 1.9998078585839179, "grad_norm": 1.0703021469495189, "learning_rate": 3.0199053429628446e-06, "loss": 0.08, "step": 5204 }, { "epoch": 2.000576424248247, "grad_norm": 0.6967303770946301, "learning_rate": 3.0157997391955172e-06, "loss": 0.0851, "step": 5206 }, { "epoch": 2.0013449899125755, "grad_norm": 0.6913193547731548, "learning_rate": 3.0116957227045673e-06, "loss": 0.0423, "step": 5208 }, { "epoch": 2.0021135555769045, "grad_norm": 0.7677236297646391, "learning_rate": 3.007593296773035e-06, "loss": 0.0441, "step": 5210 }, { "epoch": 2.0028821212412335, "grad_norm": 0.9274799237689426, "learning_rate": 3.0034924646826885e-06, "loss": 0.0402, "step": 5212 }, { "epoch": 2.0036506869055626, "grad_norm": 0.8973855288045031, "learning_rate": 2.999393229714018e-06, "loss": 0.0402, "step": 5214 }, { "epoch": 2.0044192525698916, "grad_norm": 0.7236518561321047, "learning_rate": 2.9952955951462404e-06, "loss": 0.0332, "step": 5216 }, { "epoch": 2.00518781823422, "grad_norm": 0.9330060390865927, "learning_rate": 2.991199564257291e-06, "loss": 0.0384, "step": 5218 }, { "epoch": 2.0059563838985492, "grad_norm": 0.9000133530724655, "learning_rate": 2.987105140323817e-06, "loss": 0.0365, "step": 5220 }, { "epoch": 2.0067249495628783, "grad_norm": 1.1812497032259401, "learning_rate": 2.9830123266211895e-06, "loss": 0.0377, "step": 5222 }, { "epoch": 2.0074935152272073, "grad_norm": 0.9915677675658211, "learning_rate": 2.978921126423483e-06, "loss": 0.0333, "step": 5224 }, { "epoch": 2.0082620808915363, "grad_norm": 0.9690820829419907, "learning_rate": 2.9748315430034856e-06, "loss": 0.0351, "step": 5226 }, { "epoch": 2.009030646555865, "grad_norm": 0.8538563679801177, "learning_rate": 2.970743579632694e-06, "loss": 0.034, "step": 5228 }, { "epoch": 2.009799212220194, "grad_norm": 1.087434004462708, "learning_rate": 2.9666572395813042e-06, "loss": 0.0373, "step": 5230 }, { "epoch": 2.010567777884523, "grad_norm": 0.9318354046193885, "learning_rate": 2.9625725261182143e-06, "loss": 0.036, "step": 5232 }, { "epoch": 2.011336343548852, "grad_norm": 0.9789027640449266, "learning_rate": 2.9584894425110245e-06, "loss": 0.0379, "step": 5234 }, { "epoch": 2.012104909213181, "grad_norm": 0.8050037914622996, "learning_rate": 2.954407992026027e-06, "loss": 0.034, "step": 5236 }, { "epoch": 2.0128734748775097, "grad_norm": 0.9190701516238533, "learning_rate": 2.9503281779282113e-06, "loss": 0.0376, "step": 5238 }, { "epoch": 2.0136420405418387, "grad_norm": 0.9219974461164094, "learning_rate": 2.9462500034812575e-06, "loss": 0.036, "step": 5240 }, { "epoch": 2.0144106062061677, "grad_norm": 0.9213774315735066, "learning_rate": 2.9421734719475304e-06, "loss": 0.0358, "step": 5242 }, { "epoch": 2.0151791718704968, "grad_norm": 0.85460607086415, "learning_rate": 2.9380985865880853e-06, "loss": 0.0366, "step": 5244 }, { "epoch": 2.015947737534826, "grad_norm": 0.8076676482123725, "learning_rate": 2.934025350662657e-06, "loss": 0.0316, "step": 5246 }, { "epoch": 2.0167163031991544, "grad_norm": 0.9035669936510498, "learning_rate": 2.929953767429661e-06, "loss": 0.0374, "step": 5248 }, { "epoch": 2.0174848688634834, "grad_norm": 0.8671990189571651, "learning_rate": 2.9258838401461933e-06, "loss": 0.0323, "step": 5250 }, { "epoch": 2.0182534345278125, "grad_norm": 0.8542920817226674, "learning_rate": 2.9218155720680242e-06, "loss": 0.0312, "step": 5252 }, { "epoch": 2.0190220001921415, "grad_norm": 0.8799039281028046, "learning_rate": 2.917748966449595e-06, "loss": 0.0358, "step": 5254 }, { "epoch": 2.0197905658564705, "grad_norm": 0.9053236395380021, "learning_rate": 2.9136840265440213e-06, "loss": 0.0342, "step": 5256 }, { "epoch": 2.020559131520799, "grad_norm": 0.7548162433871463, "learning_rate": 2.909620755603078e-06, "loss": 0.0284, "step": 5258 }, { "epoch": 2.021327697185128, "grad_norm": 0.8586484625720349, "learning_rate": 2.905559156877215e-06, "loss": 0.0331, "step": 5260 }, { "epoch": 2.022096262849457, "grad_norm": 0.7602604508030791, "learning_rate": 2.9014992336155377e-06, "loss": 0.0377, "step": 5262 }, { "epoch": 2.0228648285137862, "grad_norm": 0.894796490953474, "learning_rate": 2.897440989065812e-06, "loss": 0.035, "step": 5264 }, { "epoch": 2.0236333941781153, "grad_norm": 0.9501776346118852, "learning_rate": 2.893384426474465e-06, "loss": 0.039, "step": 5266 }, { "epoch": 2.024401959842444, "grad_norm": 0.9066512218859033, "learning_rate": 2.8893295490865743e-06, "loss": 0.0361, "step": 5268 }, { "epoch": 2.025170525506773, "grad_norm": 0.8058371551450385, "learning_rate": 2.8852763601458696e-06, "loss": 0.0339, "step": 5270 }, { "epoch": 2.025939091171102, "grad_norm": 0.9654999594459138, "learning_rate": 2.8812248628947313e-06, "loss": 0.0334, "step": 5272 }, { "epoch": 2.026707656835431, "grad_norm": 0.9230711342374563, "learning_rate": 2.877175060574186e-06, "loss": 0.0365, "step": 5274 }, { "epoch": 2.02747622249976, "grad_norm": 1.0624982800430127, "learning_rate": 2.8731269564239027e-06, "loss": 0.0357, "step": 5276 }, { "epoch": 2.0282447881640886, "grad_norm": 0.856158209207151, "learning_rate": 2.8690805536821976e-06, "loss": 0.0393, "step": 5278 }, { "epoch": 2.0290133538284176, "grad_norm": 0.8737316316305836, "learning_rate": 2.865035855586019e-06, "loss": 0.0349, "step": 5280 }, { "epoch": 2.0297819194927467, "grad_norm": 1.0411192916160443, "learning_rate": 2.860992865370956e-06, "loss": 0.0391, "step": 5282 }, { "epoch": 2.0305504851570757, "grad_norm": 0.9466860219249282, "learning_rate": 2.8569515862712287e-06, "loss": 0.0438, "step": 5284 }, { "epoch": 2.0313190508214047, "grad_norm": 0.9626445096271713, "learning_rate": 2.8529120215196904e-06, "loss": 0.0368, "step": 5286 }, { "epoch": 2.0320876164857333, "grad_norm": 0.9805027439999172, "learning_rate": 2.8488741743478206e-06, "loss": 0.0363, "step": 5288 }, { "epoch": 2.0328561821500624, "grad_norm": 1.0076312018318667, "learning_rate": 2.844838047985729e-06, "loss": 0.0406, "step": 5290 }, { "epoch": 2.0336247478143914, "grad_norm": 0.6383236406192649, "learning_rate": 2.8408036456621455e-06, "loss": 0.0278, "step": 5292 }, { "epoch": 2.0343933134787204, "grad_norm": 0.8743791816235245, "learning_rate": 2.836770970604421e-06, "loss": 0.0372, "step": 5294 }, { "epoch": 2.0351618791430495, "grad_norm": 0.8253170139326124, "learning_rate": 2.8327400260385253e-06, "loss": 0.0342, "step": 5296 }, { "epoch": 2.035930444807378, "grad_norm": 1.0443565515792934, "learning_rate": 2.828710815189043e-06, "loss": 0.0342, "step": 5298 }, { "epoch": 2.036699010471707, "grad_norm": 1.3087632087845673, "learning_rate": 2.8246833412791717e-06, "loss": 0.0347, "step": 5300 }, { "epoch": 2.037467576136036, "grad_norm": 1.1440787330493287, "learning_rate": 2.8206576075307233e-06, "loss": 0.0379, "step": 5302 }, { "epoch": 2.038236141800365, "grad_norm": 0.9659906513727816, "learning_rate": 2.816633617164114e-06, "loss": 0.0306, "step": 5304 }, { "epoch": 2.039004707464694, "grad_norm": 0.9894117229487729, "learning_rate": 2.812611373398365e-06, "loss": 0.0332, "step": 5306 }, { "epoch": 2.039773273129023, "grad_norm": 0.903528139716795, "learning_rate": 2.8085908794511e-06, "loss": 0.0337, "step": 5308 }, { "epoch": 2.040541838793352, "grad_norm": 0.8556572953529274, "learning_rate": 2.8045721385385492e-06, "loss": 0.0317, "step": 5310 }, { "epoch": 2.041310404457681, "grad_norm": 0.9733700815355953, "learning_rate": 2.8005551538755316e-06, "loss": 0.0366, "step": 5312 }, { "epoch": 2.04207897012201, "grad_norm": 0.9938684965928717, "learning_rate": 2.7965399286754637e-06, "loss": 0.0361, "step": 5314 }, { "epoch": 2.042847535786339, "grad_norm": 0.9439754003986521, "learning_rate": 2.7925264661503594e-06, "loss": 0.0348, "step": 5316 }, { "epoch": 2.0436161014506675, "grad_norm": 0.8237652093257505, "learning_rate": 2.7885147695108182e-06, "loss": 0.0306, "step": 5318 }, { "epoch": 2.0443846671149966, "grad_norm": 1.1352500156172587, "learning_rate": 2.784504841966027e-06, "loss": 0.0343, "step": 5320 }, { "epoch": 2.0451532327793256, "grad_norm": 0.8691982473923895, "learning_rate": 2.780496686723762e-06, "loss": 0.0315, "step": 5322 }, { "epoch": 2.0459217984436546, "grad_norm": 0.8580624679769172, "learning_rate": 2.7764903069903736e-06, "loss": 0.0291, "step": 5324 }, { "epoch": 2.0466903641079837, "grad_norm": 1.0456217034372917, "learning_rate": 2.772485705970796e-06, "loss": 0.0348, "step": 5326 }, { "epoch": 2.0474589297723123, "grad_norm": 0.9502547165346061, "learning_rate": 2.768482886868545e-06, "loss": 0.036, "step": 5328 }, { "epoch": 2.0482274954366413, "grad_norm": 0.9263969585476259, "learning_rate": 2.7644818528857044e-06, "loss": 0.035, "step": 5330 }, { "epoch": 2.0489960611009703, "grad_norm": 0.9418308343347142, "learning_rate": 2.76048260722293e-06, "loss": 0.0301, "step": 5332 }, { "epoch": 2.0497646267652994, "grad_norm": 0.9964204649181274, "learning_rate": 2.7564851530794547e-06, "loss": 0.0333, "step": 5334 }, { "epoch": 2.0505331924296284, "grad_norm": 0.8559698360970853, "learning_rate": 2.7524894936530715e-06, "loss": 0.0333, "step": 5336 }, { "epoch": 2.051301758093957, "grad_norm": 0.934930135783441, "learning_rate": 2.748495632140134e-06, "loss": 0.0403, "step": 5338 }, { "epoch": 2.052070323758286, "grad_norm": 0.9421623866863678, "learning_rate": 2.744503571735568e-06, "loss": 0.0348, "step": 5340 }, { "epoch": 2.052838889422615, "grad_norm": 0.9717501649370395, "learning_rate": 2.740513315632852e-06, "loss": 0.0352, "step": 5342 }, { "epoch": 2.053607455086944, "grad_norm": 0.9815363186191622, "learning_rate": 2.73652486702402e-06, "loss": 0.0361, "step": 5344 }, { "epoch": 2.054376020751273, "grad_norm": 0.9605372526188493, "learning_rate": 2.732538229099666e-06, "loss": 0.0346, "step": 5346 }, { "epoch": 2.0551445864156017, "grad_norm": 1.1405761471498232, "learning_rate": 2.728553405048932e-06, "loss": 0.0373, "step": 5348 }, { "epoch": 2.0559131520799308, "grad_norm": 1.0202834605545252, "learning_rate": 2.7245703980595033e-06, "loss": 0.0357, "step": 5350 }, { "epoch": 2.05668171774426, "grad_norm": 1.070997249875782, "learning_rate": 2.7205892113176223e-06, "loss": 0.0327, "step": 5352 }, { "epoch": 2.057450283408589, "grad_norm": 0.8713883291321272, "learning_rate": 2.7166098480080694e-06, "loss": 0.0314, "step": 5354 }, { "epoch": 2.058218849072918, "grad_norm": 0.9413637813128269, "learning_rate": 2.712632311314165e-06, "loss": 0.034, "step": 5356 }, { "epoch": 2.0589874147372464, "grad_norm": 0.8438484703897325, "learning_rate": 2.7086566044177733e-06, "loss": 0.0329, "step": 5358 }, { "epoch": 2.0597559804015755, "grad_norm": 0.8618112953119047, "learning_rate": 2.7046827304992907e-06, "loss": 0.0328, "step": 5360 }, { "epoch": 2.0605245460659045, "grad_norm": 1.1118338728677368, "learning_rate": 2.700710692737648e-06, "loss": 0.0397, "step": 5362 }, { "epoch": 2.0612931117302336, "grad_norm": 0.8905921521632625, "learning_rate": 2.696740494310308e-06, "loss": 0.029, "step": 5364 }, { "epoch": 2.0620616773945626, "grad_norm": 0.7881500181157219, "learning_rate": 2.6927721383932615e-06, "loss": 0.0337, "step": 5366 }, { "epoch": 2.062830243058891, "grad_norm": 1.041161211330062, "learning_rate": 2.6888056281610236e-06, "loss": 0.0384, "step": 5368 }, { "epoch": 2.06359880872322, "grad_norm": 0.9602786349553348, "learning_rate": 2.6848409667866393e-06, "loss": 0.0361, "step": 5370 }, { "epoch": 2.0643673743875492, "grad_norm": 0.8651692529393904, "learning_rate": 2.6808781574416674e-06, "loss": 0.0333, "step": 5372 }, { "epoch": 2.0651359400518783, "grad_norm": 1.0090970820741607, "learning_rate": 2.6769172032961883e-06, "loss": 0.0337, "step": 5374 }, { "epoch": 2.0659045057162073, "grad_norm": 1.011258597612014, "learning_rate": 2.672958107518798e-06, "loss": 0.0343, "step": 5376 }, { "epoch": 2.066673071380536, "grad_norm": 0.8551182221632782, "learning_rate": 2.669000873276606e-06, "loss": 0.0331, "step": 5378 }, { "epoch": 2.067441637044865, "grad_norm": 0.8876493135322141, "learning_rate": 2.665045503735231e-06, "loss": 0.034, "step": 5380 }, { "epoch": 2.068210202709194, "grad_norm": 0.7429920413200745, "learning_rate": 2.661092002058804e-06, "loss": 0.0321, "step": 5382 }, { "epoch": 2.068978768373523, "grad_norm": 0.9784720258078888, "learning_rate": 2.657140371409959e-06, "loss": 0.0333, "step": 5384 }, { "epoch": 2.069747334037852, "grad_norm": 0.9818469844496855, "learning_rate": 2.653190614949832e-06, "loss": 0.0352, "step": 5386 }, { "epoch": 2.0705158997021806, "grad_norm": 0.8908332540800143, "learning_rate": 2.6492427358380625e-06, "loss": 0.0314, "step": 5388 }, { "epoch": 2.0712844653665097, "grad_norm": 1.0239895759931983, "learning_rate": 2.6452967372327865e-06, "loss": 0.0357, "step": 5390 }, { "epoch": 2.0720530310308387, "grad_norm": 1.0404599120623501, "learning_rate": 2.6413526222906336e-06, "loss": 0.0363, "step": 5392 }, { "epoch": 2.0728215966951677, "grad_norm": 1.028093297438215, "learning_rate": 2.637410394166735e-06, "loss": 0.0354, "step": 5394 }, { "epoch": 2.073590162359497, "grad_norm": 1.1147568889532244, "learning_rate": 2.633470056014703e-06, "loss": 0.033, "step": 5396 }, { "epoch": 2.0743587280238254, "grad_norm": 0.9601702604564353, "learning_rate": 2.629531610986642e-06, "loss": 0.0391, "step": 5398 }, { "epoch": 2.0751272936881544, "grad_norm": 0.8571923940027621, "learning_rate": 2.6255950622331395e-06, "loss": 0.0366, "step": 5400 }, { "epoch": 2.0758958593524834, "grad_norm": 0.9142768617918895, "learning_rate": 2.621660412903274e-06, "loss": 0.0291, "step": 5402 }, { "epoch": 2.0766644250168125, "grad_norm": 0.8441929721531168, "learning_rate": 2.6177276661445916e-06, "loss": 0.0309, "step": 5404 }, { "epoch": 2.0774329906811415, "grad_norm": 0.7607236253810837, "learning_rate": 2.613796825103129e-06, "loss": 0.0284, "step": 5406 }, { "epoch": 2.07820155634547, "grad_norm": 0.895523122300879, "learning_rate": 2.609867892923391e-06, "loss": 0.0343, "step": 5408 }, { "epoch": 2.078970122009799, "grad_norm": 0.9811461903011947, "learning_rate": 2.605940872748357e-06, "loss": 0.0335, "step": 5410 }, { "epoch": 2.079738687674128, "grad_norm": 0.9345613337345444, "learning_rate": 2.602015767719476e-06, "loss": 0.0314, "step": 5412 }, { "epoch": 2.080507253338457, "grad_norm": 0.9101119430240284, "learning_rate": 2.5980925809766717e-06, "loss": 0.0322, "step": 5414 }, { "epoch": 2.0812758190027862, "grad_norm": 0.9537557880707668, "learning_rate": 2.59417131565832e-06, "loss": 0.0305, "step": 5416 }, { "epoch": 2.082044384667115, "grad_norm": 0.8867174611255556, "learning_rate": 2.5902519749012728e-06, "loss": 0.0348, "step": 5418 }, { "epoch": 2.082812950331444, "grad_norm": 0.9665402810437599, "learning_rate": 2.5863345618408357e-06, "loss": 0.0413, "step": 5420 }, { "epoch": 2.083581515995773, "grad_norm": 0.9829314297222597, "learning_rate": 2.582419079610774e-06, "loss": 0.0335, "step": 5422 }, { "epoch": 2.084350081660102, "grad_norm": 0.8799369856242469, "learning_rate": 2.578505531343305e-06, "loss": 0.0342, "step": 5424 }, { "epoch": 2.085118647324431, "grad_norm": 0.8704134072245727, "learning_rate": 2.5745939201691074e-06, "loss": 0.0269, "step": 5426 }, { "epoch": 2.0858872129887596, "grad_norm": 0.8679827390638687, "learning_rate": 2.5706842492173035e-06, "loss": 0.0332, "step": 5428 }, { "epoch": 2.0866557786530886, "grad_norm": 0.9675872385169061, "learning_rate": 2.5667765216154596e-06, "loss": 0.0372, "step": 5430 }, { "epoch": 2.0874243443174176, "grad_norm": 0.7917559544040728, "learning_rate": 2.562870740489598e-06, "loss": 0.0361, "step": 5432 }, { "epoch": 2.0881929099817467, "grad_norm": 0.9620601800175693, "learning_rate": 2.5589669089641774e-06, "loss": 0.033, "step": 5434 }, { "epoch": 2.0889614756460757, "grad_norm": 0.9878833426791778, "learning_rate": 2.555065030162096e-06, "loss": 0.0341, "step": 5436 }, { "epoch": 2.0897300413104043, "grad_norm": 1.133160531347125, "learning_rate": 2.5511651072046957e-06, "loss": 0.0378, "step": 5438 }, { "epoch": 2.0904986069747333, "grad_norm": 0.9630230031237749, "learning_rate": 2.547267143211748e-06, "loss": 0.0359, "step": 5440 }, { "epoch": 2.0912671726390624, "grad_norm": 0.9766440114184034, "learning_rate": 2.5433711413014593e-06, "loss": 0.0336, "step": 5442 }, { "epoch": 2.0920357383033914, "grad_norm": 0.9736714307165565, "learning_rate": 2.5394771045904675e-06, "loss": 0.0359, "step": 5444 }, { "epoch": 2.0928043039677204, "grad_norm": 0.8475739684059423, "learning_rate": 2.535585036193836e-06, "loss": 0.0326, "step": 5446 }, { "epoch": 2.093572869632049, "grad_norm": 0.8803705690767365, "learning_rate": 2.5316949392250543e-06, "loss": 0.031, "step": 5448 }, { "epoch": 2.094341435296378, "grad_norm": 0.8849674287106765, "learning_rate": 2.5278068167960394e-06, "loss": 0.0366, "step": 5450 }, { "epoch": 2.095110000960707, "grad_norm": 0.8767753943199241, "learning_rate": 2.523920672017123e-06, "loss": 0.0325, "step": 5452 }, { "epoch": 2.095878566625036, "grad_norm": 1.0117983358225484, "learning_rate": 2.5200365079970556e-06, "loss": 0.0369, "step": 5454 }, { "epoch": 2.096647132289365, "grad_norm": 0.844937201692542, "learning_rate": 2.5161543278430055e-06, "loss": 0.0335, "step": 5456 }, { "epoch": 2.0974156979536938, "grad_norm": 0.895965927648342, "learning_rate": 2.5122741346605523e-06, "loss": 0.0399, "step": 5458 }, { "epoch": 2.098184263618023, "grad_norm": 1.0298378873817724, "learning_rate": 2.5083959315536853e-06, "loss": 0.0332, "step": 5460 }, { "epoch": 2.098952829282352, "grad_norm": 0.8371627842356378, "learning_rate": 2.5045197216248064e-06, "loss": 0.0338, "step": 5462 }, { "epoch": 2.099721394946681, "grad_norm": 1.0453222534394124, "learning_rate": 2.500645507974718e-06, "loss": 0.0358, "step": 5464 }, { "epoch": 2.10048996061101, "grad_norm": 1.0846150574928535, "learning_rate": 2.4967732937026252e-06, "loss": 0.0352, "step": 5466 }, { "epoch": 2.1012585262753385, "grad_norm": 0.9479737238965682, "learning_rate": 2.492903081906142e-06, "loss": 0.0331, "step": 5468 }, { "epoch": 2.1020270919396675, "grad_norm": 0.9451982315571857, "learning_rate": 2.4890348756812684e-06, "loss": 0.0354, "step": 5470 }, { "epoch": 2.1027956576039966, "grad_norm": 0.8515481891543908, "learning_rate": 2.4851686781224056e-06, "loss": 0.03, "step": 5472 }, { "epoch": 2.1035642232683256, "grad_norm": 0.7415331978870945, "learning_rate": 2.4813044923223524e-06, "loss": 0.0336, "step": 5474 }, { "epoch": 2.1043327889326546, "grad_norm": 0.76721170465044, "learning_rate": 2.4774423213722925e-06, "loss": 0.0339, "step": 5476 }, { "epoch": 2.105101354596983, "grad_norm": 0.7467804353437315, "learning_rate": 2.4735821683617996e-06, "loss": 0.0311, "step": 5478 }, { "epoch": 2.1058699202613123, "grad_norm": 1.015949910072625, "learning_rate": 2.4697240363788327e-06, "loss": 0.0355, "step": 5480 }, { "epoch": 2.1066384859256413, "grad_norm": 0.8534256386603646, "learning_rate": 2.4658679285097344e-06, "loss": 0.032, "step": 5482 }, { "epoch": 2.1074070515899703, "grad_norm": 0.9007266051587063, "learning_rate": 2.4620138478392257e-06, "loss": 0.0361, "step": 5484 }, { "epoch": 2.1081756172542994, "grad_norm": 1.0626978422126214, "learning_rate": 2.458161797450413e-06, "loss": 0.0352, "step": 5486 }, { "epoch": 2.108944182918628, "grad_norm": 0.8925793471629259, "learning_rate": 2.4543117804247713e-06, "loss": 0.0338, "step": 5488 }, { "epoch": 2.109712748582957, "grad_norm": 0.9473846842440328, "learning_rate": 2.450463799842152e-06, "loss": 0.0317, "step": 5490 }, { "epoch": 2.110481314247286, "grad_norm": 1.0299192089716633, "learning_rate": 2.446617858780775e-06, "loss": 0.0375, "step": 5492 }, { "epoch": 2.111249879911615, "grad_norm": 0.810498097897491, "learning_rate": 2.4427739603172364e-06, "loss": 0.0359, "step": 5494 }, { "epoch": 2.112018445575944, "grad_norm": 0.8951406620680875, "learning_rate": 2.4389321075264845e-06, "loss": 0.0327, "step": 5496 }, { "epoch": 2.1127870112402727, "grad_norm": 1.0359730490689578, "learning_rate": 2.435092303481845e-06, "loss": 0.0352, "step": 5498 }, { "epoch": 2.1135555769046017, "grad_norm": 0.8764145114668275, "learning_rate": 2.431254551254998e-06, "loss": 0.031, "step": 5500 }, { "epoch": 2.1135555769046017, "eval_loss": 0.17527556419372559, "eval_runtime": 392.1468, "eval_samples_per_second": 47.189, "eval_steps_per_second": 5.901, "step": 5500 }, { "epoch": 2.1143241425689308, "grad_norm": 1.007244798051405, "learning_rate": 2.4274188539159814e-06, "loss": 0.0418, "step": 5502 }, { "epoch": 2.11509270823326, "grad_norm": 1.1175507276278467, "learning_rate": 2.423585214533191e-06, "loss": 0.0428, "step": 5504 }, { "epoch": 2.115861273897589, "grad_norm": 0.7920291487342939, "learning_rate": 2.4197536361733792e-06, "loss": 0.0288, "step": 5506 }, { "epoch": 2.1166298395619174, "grad_norm": 1.1076807970128675, "learning_rate": 2.4159241219016447e-06, "loss": 0.0393, "step": 5508 }, { "epoch": 2.1173984052262464, "grad_norm": 0.8563362134010912, "learning_rate": 2.412096674781437e-06, "loss": 0.0373, "step": 5510 }, { "epoch": 2.1181669708905755, "grad_norm": 0.8154636883261094, "learning_rate": 2.4082712978745533e-06, "loss": 0.0316, "step": 5512 }, { "epoch": 2.1189355365549045, "grad_norm": 0.7724308185224815, "learning_rate": 2.4044479942411325e-06, "loss": 0.0355, "step": 5514 }, { "epoch": 2.1197041022192336, "grad_norm": 0.8768933380897748, "learning_rate": 2.4006267669396537e-06, "loss": 0.0389, "step": 5516 }, { "epoch": 2.120472667883562, "grad_norm": 0.850801926907641, "learning_rate": 2.3968076190269423e-06, "loss": 0.0306, "step": 5518 }, { "epoch": 2.121241233547891, "grad_norm": 0.7231397978823041, "learning_rate": 2.392990553558152e-06, "loss": 0.0327, "step": 5520 }, { "epoch": 2.12200979921222, "grad_norm": 1.0815168885911781, "learning_rate": 2.3891755735867754e-06, "loss": 0.0326, "step": 5522 }, { "epoch": 2.1227783648765493, "grad_norm": 0.8085032225279724, "learning_rate": 2.385362682164634e-06, "loss": 0.0315, "step": 5524 }, { "epoch": 2.1235469305408783, "grad_norm": 0.7734641723423052, "learning_rate": 2.381551882341881e-06, "loss": 0.0354, "step": 5526 }, { "epoch": 2.124315496205207, "grad_norm": 1.1786443966602602, "learning_rate": 2.3777431771669916e-06, "loss": 0.0389, "step": 5528 }, { "epoch": 2.125084061869536, "grad_norm": 0.9945081395611932, "learning_rate": 2.373936569686775e-06, "loss": 0.0379, "step": 5530 }, { "epoch": 2.125852627533865, "grad_norm": 0.8848104326422569, "learning_rate": 2.370132062946353e-06, "loss": 0.0303, "step": 5532 }, { "epoch": 2.126621193198194, "grad_norm": 1.0826944626958832, "learning_rate": 2.3663296599891712e-06, "loss": 0.0368, "step": 5534 }, { "epoch": 2.127389758862523, "grad_norm": 0.9199979586227872, "learning_rate": 2.36252936385699e-06, "loss": 0.0306, "step": 5536 }, { "epoch": 2.1281583245268516, "grad_norm": 0.9806908587655794, "learning_rate": 2.3587311775898867e-06, "loss": 0.0368, "step": 5538 }, { "epoch": 2.1289268901911806, "grad_norm": 0.8590757047044847, "learning_rate": 2.354935104226248e-06, "loss": 0.0353, "step": 5540 }, { "epoch": 2.1296954558555097, "grad_norm": 1.1013317253929398, "learning_rate": 2.3511411468027757e-06, "loss": 0.0395, "step": 5542 }, { "epoch": 2.1304640215198387, "grad_norm": 0.8932338104366823, "learning_rate": 2.3473493083544736e-06, "loss": 0.0361, "step": 5544 }, { "epoch": 2.1312325871841677, "grad_norm": 0.9928380486232576, "learning_rate": 2.3435595919146524e-06, "loss": 0.0334, "step": 5546 }, { "epoch": 2.1320011528484963, "grad_norm": 0.8715664991592926, "learning_rate": 2.339772000514925e-06, "loss": 0.0329, "step": 5548 }, { "epoch": 2.1327697185128254, "grad_norm": 1.0882980896091325, "learning_rate": 2.335986537185204e-06, "loss": 0.037, "step": 5550 }, { "epoch": 2.1335382841771544, "grad_norm": 1.0337833546364696, "learning_rate": 2.3322032049537e-06, "loss": 0.0359, "step": 5552 }, { "epoch": 2.1343068498414834, "grad_norm": 1.0431177640469862, "learning_rate": 2.3284220068469203e-06, "loss": 0.0302, "step": 5554 }, { "epoch": 2.1350754155058125, "grad_norm": 1.0012168807463313, "learning_rate": 2.3246429458896637e-06, "loss": 0.0407, "step": 5556 }, { "epoch": 2.135843981170141, "grad_norm": 0.9173645994130253, "learning_rate": 2.320866025105016e-06, "loss": 0.0312, "step": 5558 }, { "epoch": 2.13661254683447, "grad_norm": 0.9527802668900744, "learning_rate": 2.3170912475143607e-06, "loss": 0.0325, "step": 5560 }, { "epoch": 2.137381112498799, "grad_norm": 1.1082518817372102, "learning_rate": 2.313318616137355e-06, "loss": 0.0351, "step": 5562 }, { "epoch": 2.138149678163128, "grad_norm": 1.01923937745911, "learning_rate": 2.309548133991944e-06, "loss": 0.0338, "step": 5564 }, { "epoch": 2.138918243827457, "grad_norm": 1.118411538036161, "learning_rate": 2.305779804094358e-06, "loss": 0.038, "step": 5566 }, { "epoch": 2.139686809491786, "grad_norm": 1.020571737434286, "learning_rate": 2.3020136294591e-06, "loss": 0.034, "step": 5568 }, { "epoch": 2.140455375156115, "grad_norm": 0.9372058461233294, "learning_rate": 2.2982496130989484e-06, "loss": 0.0309, "step": 5570 }, { "epoch": 2.141223940820444, "grad_norm": 0.9193675344499723, "learning_rate": 2.294487758024963e-06, "loss": 0.0367, "step": 5572 }, { "epoch": 2.141992506484773, "grad_norm": 0.8976841182404491, "learning_rate": 2.290728067246464e-06, "loss": 0.0303, "step": 5574 }, { "epoch": 2.142761072149102, "grad_norm": 0.9858389739408078, "learning_rate": 2.286970543771044e-06, "loss": 0.0328, "step": 5576 }, { "epoch": 2.1435296378134305, "grad_norm": 1.1409126906010232, "learning_rate": 2.283215190604568e-06, "loss": 0.0391, "step": 5578 }, { "epoch": 2.1442982034777596, "grad_norm": 0.883330161575807, "learning_rate": 2.2794620107511573e-06, "loss": 0.0292, "step": 5580 }, { "epoch": 2.1450667691420886, "grad_norm": 0.9836344247337597, "learning_rate": 2.275711007213195e-06, "loss": 0.0313, "step": 5582 }, { "epoch": 2.1458353348064176, "grad_norm": 0.8724986466342735, "learning_rate": 2.2719621829913295e-06, "loss": 0.0335, "step": 5584 }, { "epoch": 2.1466039004707467, "grad_norm": 0.8447721381110733, "learning_rate": 2.268215541084462e-06, "loss": 0.0339, "step": 5586 }, { "epoch": 2.1473724661350753, "grad_norm": 0.7533272259630476, "learning_rate": 2.264471084489742e-06, "loss": 0.0325, "step": 5588 }, { "epoch": 2.1481410317994043, "grad_norm": 1.057576739914508, "learning_rate": 2.2607288162025825e-06, "loss": 0.0396, "step": 5590 }, { "epoch": 2.1489095974637333, "grad_norm": 0.9211758323183467, "learning_rate": 2.256988739216638e-06, "loss": 0.0309, "step": 5592 }, { "epoch": 2.1496781631280624, "grad_norm": 0.8709712922243326, "learning_rate": 2.2532508565238117e-06, "loss": 0.0346, "step": 5594 }, { "epoch": 2.1504467287923914, "grad_norm": 0.9237974753870393, "learning_rate": 2.2495151711142502e-06, "loss": 0.0339, "step": 5596 }, { "epoch": 2.15121529445672, "grad_norm": 0.9945453610028653, "learning_rate": 2.2457816859763475e-06, "loss": 0.0352, "step": 5598 }, { "epoch": 2.151983860121049, "grad_norm": 0.8781181494491378, "learning_rate": 2.2420504040967326e-06, "loss": 0.0331, "step": 5600 }, { "epoch": 2.152752425785378, "grad_norm": 1.1240525505951002, "learning_rate": 2.2383213284602727e-06, "loss": 0.0366, "step": 5602 }, { "epoch": 2.153520991449707, "grad_norm": 0.921190186887827, "learning_rate": 2.234594462050072e-06, "loss": 0.0335, "step": 5604 }, { "epoch": 2.154289557114036, "grad_norm": 0.9542515897608899, "learning_rate": 2.2308698078474645e-06, "loss": 0.0356, "step": 5606 }, { "epoch": 2.1550581227783647, "grad_norm": 0.962820307745428, "learning_rate": 2.2271473688320156e-06, "loss": 0.032, "step": 5608 }, { "epoch": 2.1558266884426938, "grad_norm": 0.8444992948430475, "learning_rate": 2.2234271479815224e-06, "loss": 0.0286, "step": 5610 }, { "epoch": 2.156595254107023, "grad_norm": 0.8721569746391332, "learning_rate": 2.2197091482720024e-06, "loss": 0.0351, "step": 5612 }, { "epoch": 2.157363819771352, "grad_norm": 0.7917410681946669, "learning_rate": 2.215993372677699e-06, "loss": 0.0319, "step": 5614 }, { "epoch": 2.158132385435681, "grad_norm": 1.1676857576683979, "learning_rate": 2.2122798241710753e-06, "loss": 0.0429, "step": 5616 }, { "epoch": 2.1589009511000095, "grad_norm": 0.8763013105714923, "learning_rate": 2.2085685057228125e-06, "loss": 0.0321, "step": 5618 }, { "epoch": 2.1596695167643385, "grad_norm": 0.9860474887757009, "learning_rate": 2.2048594203018076e-06, "loss": 0.0365, "step": 5620 }, { "epoch": 2.1604380824286675, "grad_norm": 0.8386888815996094, "learning_rate": 2.201152570875176e-06, "loss": 0.0277, "step": 5622 }, { "epoch": 2.1612066480929966, "grad_norm": 0.8440665765364018, "learning_rate": 2.1974479604082384e-06, "loss": 0.0296, "step": 5624 }, { "epoch": 2.1619752137573256, "grad_norm": 0.9937578212189544, "learning_rate": 2.193745591864526e-06, "loss": 0.039, "step": 5626 }, { "epoch": 2.162743779421654, "grad_norm": 1.061909282193711, "learning_rate": 2.1900454682057786e-06, "loss": 0.0363, "step": 5628 }, { "epoch": 2.1635123450859832, "grad_norm": 0.9492261928561503, "learning_rate": 2.1863475923919386e-06, "loss": 0.0327, "step": 5630 }, { "epoch": 2.1642809107503123, "grad_norm": 0.9074383982859054, "learning_rate": 2.1826519673811477e-06, "loss": 0.0316, "step": 5632 }, { "epoch": 2.1650494764146413, "grad_norm": 0.8649531704661869, "learning_rate": 2.178958596129753e-06, "loss": 0.0336, "step": 5634 }, { "epoch": 2.1658180420789703, "grad_norm": 1.0389247236285257, "learning_rate": 2.1752674815922957e-06, "loss": 0.0359, "step": 5636 }, { "epoch": 2.166586607743299, "grad_norm": 0.9908947090256345, "learning_rate": 2.17157862672151e-06, "loss": 0.0335, "step": 5638 }, { "epoch": 2.167355173407628, "grad_norm": 0.9183557342430625, "learning_rate": 2.1678920344683234e-06, "loss": 0.0332, "step": 5640 }, { "epoch": 2.168123739071957, "grad_norm": 0.8317334450618388, "learning_rate": 2.1642077077818555e-06, "loss": 0.0323, "step": 5642 }, { "epoch": 2.168892304736286, "grad_norm": 0.994848880435368, "learning_rate": 2.1605256496094094e-06, "loss": 0.0415, "step": 5644 }, { "epoch": 2.169660870400615, "grad_norm": 0.7966643254246877, "learning_rate": 2.15684586289648e-06, "loss": 0.0341, "step": 5646 }, { "epoch": 2.1704294360649437, "grad_norm": 1.0167133897967953, "learning_rate": 2.1531683505867386e-06, "loss": 0.0354, "step": 5648 }, { "epoch": 2.1711980017292727, "grad_norm": 1.028078974966357, "learning_rate": 2.149493115622038e-06, "loss": 0.0286, "step": 5650 }, { "epoch": 2.1719665673936017, "grad_norm": 0.9688094755064882, "learning_rate": 2.145820160942418e-06, "loss": 0.0349, "step": 5652 }, { "epoch": 2.1727351330579308, "grad_norm": 0.9025340193219336, "learning_rate": 2.1421494894860797e-06, "loss": 0.0301, "step": 5654 }, { "epoch": 2.17350369872226, "grad_norm": 1.0902690945308096, "learning_rate": 2.1384811041894055e-06, "loss": 0.0371, "step": 5656 }, { "epoch": 2.1742722643865884, "grad_norm": 1.0561816695216633, "learning_rate": 2.1348150079869533e-06, "loss": 0.0334, "step": 5658 }, { "epoch": 2.1750408300509174, "grad_norm": 0.9077516411872621, "learning_rate": 2.1311512038114425e-06, "loss": 0.0333, "step": 5660 }, { "epoch": 2.1758093957152465, "grad_norm": 1.0049267358040312, "learning_rate": 2.12748969459376e-06, "loss": 0.0349, "step": 5662 }, { "epoch": 2.1765779613795755, "grad_norm": 0.9585688795327988, "learning_rate": 2.1238304832629627e-06, "loss": 0.0336, "step": 5664 }, { "epoch": 2.1773465270439045, "grad_norm": 0.8767199555700053, "learning_rate": 2.1201735727462643e-06, "loss": 0.0362, "step": 5666 }, { "epoch": 2.178115092708233, "grad_norm": 0.9442236678102335, "learning_rate": 2.1165189659690333e-06, "loss": 0.0366, "step": 5668 }, { "epoch": 2.178883658372562, "grad_norm": 0.9684612924061815, "learning_rate": 2.112866665854807e-06, "loss": 0.0316, "step": 5670 }, { "epoch": 2.179652224036891, "grad_norm": 0.9667016882244809, "learning_rate": 2.109216675325268e-06, "loss": 0.0314, "step": 5672 }, { "epoch": 2.18042078970122, "grad_norm": 0.9756191216895358, "learning_rate": 2.105568997300254e-06, "loss": 0.0288, "step": 5674 }, { "epoch": 2.1811893553655493, "grad_norm": 0.770781737543144, "learning_rate": 2.1019236346977562e-06, "loss": 0.03, "step": 5676 }, { "epoch": 2.181957921029878, "grad_norm": 0.9606229455015266, "learning_rate": 2.098280590433911e-06, "loss": 0.0323, "step": 5678 }, { "epoch": 2.182726486694207, "grad_norm": 0.8467059906564933, "learning_rate": 2.0946398674229944e-06, "loss": 0.0285, "step": 5680 }, { "epoch": 2.183495052358536, "grad_norm": 1.0635838816725236, "learning_rate": 2.091001468577436e-06, "loss": 0.0375, "step": 5682 }, { "epoch": 2.184263618022865, "grad_norm": 0.8330308140768923, "learning_rate": 2.0873653968078e-06, "loss": 0.032, "step": 5684 }, { "epoch": 2.185032183687194, "grad_norm": 1.016591353918576, "learning_rate": 2.0837316550227884e-06, "loss": 0.0371, "step": 5686 }, { "epoch": 2.1858007493515226, "grad_norm": 1.1360504253264305, "learning_rate": 2.0801002461292437e-06, "loss": 0.0358, "step": 5688 }, { "epoch": 2.1865693150158516, "grad_norm": 0.9230413767432099, "learning_rate": 2.0764711730321384e-06, "loss": 0.0305, "step": 5690 }, { "epoch": 2.1873378806801806, "grad_norm": 0.9197310443613181, "learning_rate": 2.072844438634579e-06, "loss": 0.0319, "step": 5692 }, { "epoch": 2.1881064463445097, "grad_norm": 1.018239028307765, "learning_rate": 2.069220045837797e-06, "loss": 0.0323, "step": 5694 }, { "epoch": 2.1888750120088387, "grad_norm": 0.8382513681109507, "learning_rate": 2.0655979975411562e-06, "loss": 0.0324, "step": 5696 }, { "epoch": 2.1896435776731673, "grad_norm": 0.9415408935288803, "learning_rate": 2.0619782966421386e-06, "loss": 0.0351, "step": 5698 }, { "epoch": 2.1904121433374963, "grad_norm": 0.9521172854192993, "learning_rate": 2.0583609460363564e-06, "loss": 0.031, "step": 5700 }, { "epoch": 2.1911807090018254, "grad_norm": 1.0416217868177622, "learning_rate": 2.054745948617535e-06, "loss": 0.0339, "step": 5702 }, { "epoch": 2.1919492746661544, "grad_norm": 0.7930932396666955, "learning_rate": 2.05113330727752e-06, "loss": 0.032, "step": 5704 }, { "epoch": 2.1927178403304834, "grad_norm": 0.961591911229594, "learning_rate": 2.0475230249062727e-06, "loss": 0.0313, "step": 5706 }, { "epoch": 2.193486405994812, "grad_norm": 1.0671314639039378, "learning_rate": 2.0439151043918647e-06, "loss": 0.0358, "step": 5708 }, { "epoch": 2.194254971659141, "grad_norm": 0.8614205203026544, "learning_rate": 2.0403095486204805e-06, "loss": 0.0275, "step": 5710 }, { "epoch": 2.19502353732347, "grad_norm": 1.0525233632423925, "learning_rate": 2.0367063604764113e-06, "loss": 0.0401, "step": 5712 }, { "epoch": 2.195792102987799, "grad_norm": 0.91013442424793, "learning_rate": 2.033105542842059e-06, "loss": 0.0331, "step": 5714 }, { "epoch": 2.196560668652128, "grad_norm": 1.0300363146387705, "learning_rate": 2.0295070985979226e-06, "loss": 0.0323, "step": 5716 }, { "epoch": 2.1973292343164568, "grad_norm": 0.8128071023116268, "learning_rate": 2.025911030622606e-06, "loss": 0.0286, "step": 5718 }, { "epoch": 2.198097799980786, "grad_norm": 0.9035154511625811, "learning_rate": 2.022317341792813e-06, "loss": 0.0281, "step": 5720 }, { "epoch": 2.198866365645115, "grad_norm": 0.8659645413566112, "learning_rate": 2.0187260349833405e-06, "loss": 0.0258, "step": 5722 }, { "epoch": 2.199634931309444, "grad_norm": 0.9570758868963121, "learning_rate": 2.0151371130670823e-06, "loss": 0.0289, "step": 5724 }, { "epoch": 2.200403496973773, "grad_norm": 1.0601239293312077, "learning_rate": 2.0115505789150264e-06, "loss": 0.0362, "step": 5726 }, { "epoch": 2.2011720626381015, "grad_norm": 0.7787907837730097, "learning_rate": 2.007966435396248e-06, "loss": 0.0286, "step": 5728 }, { "epoch": 2.2019406283024305, "grad_norm": 0.892829536276527, "learning_rate": 2.0043846853779085e-06, "loss": 0.0307, "step": 5730 }, { "epoch": 2.2027091939667596, "grad_norm": 1.043012805250879, "learning_rate": 2.0008053317252617e-06, "loss": 0.0345, "step": 5732 }, { "epoch": 2.2034777596310886, "grad_norm": 1.0645198296568987, "learning_rate": 1.997228377301634e-06, "loss": 0.0345, "step": 5734 }, { "epoch": 2.2042463252954176, "grad_norm": 0.8194982357769905, "learning_rate": 1.993653824968438e-06, "loss": 0.0296, "step": 5736 }, { "epoch": 2.2050148909597462, "grad_norm": 1.0191785719337654, "learning_rate": 1.9900816775851674e-06, "loss": 0.0368, "step": 5738 }, { "epoch": 2.2057834566240753, "grad_norm": 0.9448498225776976, "learning_rate": 1.9865119380093884e-06, "loss": 0.0353, "step": 5740 }, { "epoch": 2.2065520222884043, "grad_norm": 1.0084461015886654, "learning_rate": 1.9829446090967392e-06, "loss": 0.0329, "step": 5742 }, { "epoch": 2.2073205879527333, "grad_norm": 0.9309459861925766, "learning_rate": 1.979379693700938e-06, "loss": 0.0316, "step": 5744 }, { "epoch": 2.2080891536170624, "grad_norm": 0.8704446408601946, "learning_rate": 1.975817194673761e-06, "loss": 0.0305, "step": 5746 }, { "epoch": 2.208857719281391, "grad_norm": 0.9066867444861842, "learning_rate": 1.972257114865057e-06, "loss": 0.0314, "step": 5748 }, { "epoch": 2.20962628494572, "grad_norm": 0.8965127565410671, "learning_rate": 1.968699457122743e-06, "loss": 0.0313, "step": 5750 }, { "epoch": 2.210394850610049, "grad_norm": 0.8532232125709417, "learning_rate": 1.965144224292793e-06, "loss": 0.0332, "step": 5752 }, { "epoch": 2.211163416274378, "grad_norm": 0.933921612007712, "learning_rate": 1.961591419219242e-06, "loss": 0.034, "step": 5754 }, { "epoch": 2.211931981938707, "grad_norm": 0.8807264071926647, "learning_rate": 1.958041044744186e-06, "loss": 0.0321, "step": 5756 }, { "epoch": 2.2127005476030357, "grad_norm": 0.8867305253812364, "learning_rate": 1.9544931037077762e-06, "loss": 0.0299, "step": 5758 }, { "epoch": 2.2134691132673647, "grad_norm": 1.0023253083013148, "learning_rate": 1.9509475989482092e-06, "loss": 0.0359, "step": 5760 }, { "epoch": 2.2142376789316938, "grad_norm": 0.967001597347489, "learning_rate": 1.9474045333017445e-06, "loss": 0.0324, "step": 5762 }, { "epoch": 2.215006244596023, "grad_norm": 0.9823286734167501, "learning_rate": 1.9438639096026834e-06, "loss": 0.0349, "step": 5764 }, { "epoch": 2.215774810260352, "grad_norm": 0.9086186144718693, "learning_rate": 1.9403257306833733e-06, "loss": 0.0398, "step": 5766 }, { "epoch": 2.2165433759246804, "grad_norm": 0.9808240288652214, "learning_rate": 1.9367899993742116e-06, "loss": 0.0326, "step": 5768 }, { "epoch": 2.2173119415890095, "grad_norm": 0.8349529694352418, "learning_rate": 1.933256718503634e-06, "loss": 0.0303, "step": 5770 }, { "epoch": 2.2180805072533385, "grad_norm": 0.8054523209741483, "learning_rate": 1.92972589089811e-06, "loss": 0.0291, "step": 5772 }, { "epoch": 2.2188490729176675, "grad_norm": 0.8692674976162635, "learning_rate": 1.926197519382158e-06, "loss": 0.0323, "step": 5774 }, { "epoch": 2.2196176385819966, "grad_norm": 1.0931970555066712, "learning_rate": 1.9226716067783254e-06, "loss": 0.0342, "step": 5776 }, { "epoch": 2.220386204246325, "grad_norm": 1.1785294282744294, "learning_rate": 1.91914815590719e-06, "loss": 0.0335, "step": 5778 }, { "epoch": 2.221154769910654, "grad_norm": 0.960009224785792, "learning_rate": 1.9156271695873683e-06, "loss": 0.0359, "step": 5780 }, { "epoch": 2.2219233355749832, "grad_norm": 0.9783958840957145, "learning_rate": 1.9121086506354984e-06, "loss": 0.0342, "step": 5782 }, { "epoch": 2.2226919012393123, "grad_norm": 1.0565502846334769, "learning_rate": 1.908592601866247e-06, "loss": 0.0349, "step": 5784 }, { "epoch": 2.2234604669036413, "grad_norm": 1.020718566087012, "learning_rate": 1.905079026092304e-06, "loss": 0.034, "step": 5786 }, { "epoch": 2.22422903256797, "grad_norm": 0.9865234945861185, "learning_rate": 1.9015679261243824e-06, "loss": 0.0317, "step": 5788 }, { "epoch": 2.224997598232299, "grad_norm": 0.9121074582862952, "learning_rate": 1.8980593047712115e-06, "loss": 0.0318, "step": 5790 }, { "epoch": 2.225766163896628, "grad_norm": 0.9638654405077426, "learning_rate": 1.8945531648395437e-06, "loss": 0.0283, "step": 5792 }, { "epoch": 2.226534729560957, "grad_norm": 0.9369156731201441, "learning_rate": 1.891049509134142e-06, "loss": 0.0317, "step": 5794 }, { "epoch": 2.227303295225286, "grad_norm": 0.8568922297464745, "learning_rate": 1.8875483404577817e-06, "loss": 0.0268, "step": 5796 }, { "epoch": 2.2280718608896146, "grad_norm": 0.965198758499901, "learning_rate": 1.8840496616112496e-06, "loss": 0.034, "step": 5798 }, { "epoch": 2.2288404265539437, "grad_norm": 0.9794419749540484, "learning_rate": 1.8805534753933423e-06, "loss": 0.0362, "step": 5800 }, { "epoch": 2.2296089922182727, "grad_norm": 1.4641371643928158, "learning_rate": 1.877059784600858e-06, "loss": 0.0368, "step": 5802 }, { "epoch": 2.2303775578826017, "grad_norm": 0.9134991335348968, "learning_rate": 1.8735685920286068e-06, "loss": 0.0319, "step": 5804 }, { "epoch": 2.2311461235469308, "grad_norm": 0.9244502020508493, "learning_rate": 1.870079900469392e-06, "loss": 0.0334, "step": 5806 }, { "epoch": 2.2319146892112594, "grad_norm": 1.0396904706853876, "learning_rate": 1.8665937127140199e-06, "loss": 0.0322, "step": 5808 }, { "epoch": 2.2326832548755884, "grad_norm": 0.886743036060259, "learning_rate": 1.863110031551294e-06, "loss": 0.0296, "step": 5810 }, { "epoch": 2.2334518205399174, "grad_norm": 0.8210257829318581, "learning_rate": 1.8596288597680113e-06, "loss": 0.0345, "step": 5812 }, { "epoch": 2.2342203862042465, "grad_norm": 0.9878266693312766, "learning_rate": 1.8561502001489606e-06, "loss": 0.0363, "step": 5814 }, { "epoch": 2.2349889518685755, "grad_norm": 0.9125199917396424, "learning_rate": 1.8526740554769262e-06, "loss": 0.0334, "step": 5816 }, { "epoch": 2.235757517532904, "grad_norm": 1.016335785153987, "learning_rate": 1.8492004285326754e-06, "loss": 0.0371, "step": 5818 }, { "epoch": 2.236526083197233, "grad_norm": 0.9632245062470354, "learning_rate": 1.8457293220949624e-06, "loss": 0.0373, "step": 5820 }, { "epoch": 2.237294648861562, "grad_norm": 1.0697467176439988, "learning_rate": 1.842260738940525e-06, "loss": 0.0368, "step": 5822 }, { "epoch": 2.238063214525891, "grad_norm": 0.9939310258777823, "learning_rate": 1.8387946818440871e-06, "loss": 0.0363, "step": 5824 }, { "epoch": 2.23883178019022, "grad_norm": 0.9543120490502437, "learning_rate": 1.8353311535783442e-06, "loss": 0.0355, "step": 5826 }, { "epoch": 2.239600345854549, "grad_norm": 1.117136623619639, "learning_rate": 1.8318701569139714e-06, "loss": 0.0357, "step": 5828 }, { "epoch": 2.240368911518878, "grad_norm": 0.8271494076691652, "learning_rate": 1.8284116946196235e-06, "loss": 0.0283, "step": 5830 }, { "epoch": 2.241137477183207, "grad_norm": 0.961307431990351, "learning_rate": 1.824955769461923e-06, "loss": 0.034, "step": 5832 }, { "epoch": 2.241906042847536, "grad_norm": 0.7711404420000467, "learning_rate": 1.8215023842054614e-06, "loss": 0.0302, "step": 5834 }, { "epoch": 2.242674608511865, "grad_norm": 0.7964690347598511, "learning_rate": 1.818051541612807e-06, "loss": 0.0292, "step": 5836 }, { "epoch": 2.2434431741761935, "grad_norm": 1.0958128729742755, "learning_rate": 1.8146032444444827e-06, "loss": 0.0382, "step": 5838 }, { "epoch": 2.2442117398405226, "grad_norm": 1.0899618795490402, "learning_rate": 1.8111574954589805e-06, "loss": 0.0331, "step": 5840 }, { "epoch": 2.2449803055048516, "grad_norm": 0.8215619044994982, "learning_rate": 1.807714297412757e-06, "loss": 0.0281, "step": 5842 }, { "epoch": 2.2457488711691806, "grad_norm": 0.8700283992370094, "learning_rate": 1.8042736530602245e-06, "loss": 0.0302, "step": 5844 }, { "epoch": 2.2465174368335097, "grad_norm": 0.9298043333969758, "learning_rate": 1.8008355651537502e-06, "loss": 0.0354, "step": 5846 }, { "epoch": 2.2472860024978383, "grad_norm": 0.9569160392373498, "learning_rate": 1.7974000364436634e-06, "loss": 0.0322, "step": 5848 }, { "epoch": 2.2480545681621673, "grad_norm": 0.8815810667720636, "learning_rate": 1.793967069678242e-06, "loss": 0.0303, "step": 5850 }, { "epoch": 2.2488231338264963, "grad_norm": 0.9717801463534375, "learning_rate": 1.7905366676037083e-06, "loss": 0.0388, "step": 5852 }, { "epoch": 2.2495916994908254, "grad_norm": 0.7709472830241082, "learning_rate": 1.787108832964245e-06, "loss": 0.0293, "step": 5854 }, { "epoch": 2.2503602651551544, "grad_norm": 1.011726348672562, "learning_rate": 1.7836835685019732e-06, "loss": 0.0337, "step": 5856 }, { "epoch": 2.251128830819483, "grad_norm": 1.0666884090668347, "learning_rate": 1.780260876956958e-06, "loss": 0.0317, "step": 5858 }, { "epoch": 2.251897396483812, "grad_norm": 1.1242896872803971, "learning_rate": 1.7768407610672112e-06, "loss": 0.0372, "step": 5860 }, { "epoch": 2.252665962148141, "grad_norm": 0.9551643144084437, "learning_rate": 1.773423223568681e-06, "loss": 0.0289, "step": 5862 }, { "epoch": 2.25343452781247, "grad_norm": 0.8851343041797383, "learning_rate": 1.7700082671952485e-06, "loss": 0.0305, "step": 5864 }, { "epoch": 2.254203093476799, "grad_norm": 1.0839640288666947, "learning_rate": 1.7665958946787388e-06, "loss": 0.0326, "step": 5866 }, { "epoch": 2.2549716591411277, "grad_norm": 0.8062362409978662, "learning_rate": 1.7631861087489043e-06, "loss": 0.0297, "step": 5868 }, { "epoch": 2.2557402248054568, "grad_norm": 0.9393494624121664, "learning_rate": 1.7597789121334285e-06, "loss": 0.0326, "step": 5870 }, { "epoch": 2.256508790469786, "grad_norm": 1.0749320015340558, "learning_rate": 1.7563743075579276e-06, "loss": 0.0349, "step": 5872 }, { "epoch": 2.257277356134115, "grad_norm": 1.1120210582475876, "learning_rate": 1.7529722977459395e-06, "loss": 0.0344, "step": 5874 }, { "epoch": 2.258045921798444, "grad_norm": 0.8823758683643312, "learning_rate": 1.7495728854189293e-06, "loss": 0.0326, "step": 5876 }, { "epoch": 2.2588144874627725, "grad_norm": 1.0726665319886974, "learning_rate": 1.7461760732962824e-06, "loss": 0.0349, "step": 5878 }, { "epoch": 2.2595830531271015, "grad_norm": 1.0950937779972867, "learning_rate": 1.742781864095305e-06, "loss": 0.0342, "step": 5880 }, { "epoch": 2.2603516187914305, "grad_norm": 1.115025640202343, "learning_rate": 1.7393902605312195e-06, "loss": 0.0334, "step": 5882 }, { "epoch": 2.2611201844557596, "grad_norm": 0.920267698110576, "learning_rate": 1.7360012653171694e-06, "loss": 0.0301, "step": 5884 }, { "epoch": 2.2618887501200886, "grad_norm": 0.9646461409073334, "learning_rate": 1.7326148811642052e-06, "loss": 0.0311, "step": 5886 }, { "epoch": 2.262657315784417, "grad_norm": 0.9754518720009274, "learning_rate": 1.7292311107812914e-06, "loss": 0.0346, "step": 5888 }, { "epoch": 2.2634258814487462, "grad_norm": 1.073563603323224, "learning_rate": 1.7258499568753012e-06, "loss": 0.0345, "step": 5890 }, { "epoch": 2.2641944471130753, "grad_norm": 0.8641364011618404, "learning_rate": 1.7224714221510157e-06, "loss": 0.0286, "step": 5892 }, { "epoch": 2.2649630127774043, "grad_norm": 1.1112647792278614, "learning_rate": 1.7190955093111172e-06, "loss": 0.0318, "step": 5894 }, { "epoch": 2.2657315784417333, "grad_norm": 0.9608765299245043, "learning_rate": 1.7157222210561974e-06, "loss": 0.0377, "step": 5896 }, { "epoch": 2.266500144106062, "grad_norm": 0.8379424191223472, "learning_rate": 1.7123515600847424e-06, "loss": 0.0335, "step": 5898 }, { "epoch": 2.267268709770391, "grad_norm": 0.9279988710430845, "learning_rate": 1.7089835290931384e-06, "loss": 0.0357, "step": 5900 }, { "epoch": 2.26803727543472, "grad_norm": 0.9402541959844827, "learning_rate": 1.705618130775668e-06, "loss": 0.0338, "step": 5902 }, { "epoch": 2.268805841099049, "grad_norm": 1.0239131965434327, "learning_rate": 1.7022553678245084e-06, "loss": 0.0368, "step": 5904 }, { "epoch": 2.269574406763378, "grad_norm": 0.8273355412305304, "learning_rate": 1.698895242929725e-06, "loss": 0.0323, "step": 5906 }, { "epoch": 2.2703429724277067, "grad_norm": 0.9607304890259447, "learning_rate": 1.69553775877928e-06, "loss": 0.0326, "step": 5908 }, { "epoch": 2.2711115380920357, "grad_norm": 0.9741732353262201, "learning_rate": 1.6921829180590172e-06, "loss": 0.0356, "step": 5910 }, { "epoch": 2.2718801037563647, "grad_norm": 0.9932674118627284, "learning_rate": 1.6888307234526663e-06, "loss": 0.0292, "step": 5912 }, { "epoch": 2.2726486694206938, "grad_norm": 0.8659402671517408, "learning_rate": 1.6854811776418411e-06, "loss": 0.0325, "step": 5914 }, { "epoch": 2.273417235085023, "grad_norm": 0.9827396636004431, "learning_rate": 1.6821342833060412e-06, "loss": 0.0359, "step": 5916 }, { "epoch": 2.2741858007493514, "grad_norm": 1.1167607119470615, "learning_rate": 1.6787900431226333e-06, "loss": 0.0342, "step": 5918 }, { "epoch": 2.2749543664136804, "grad_norm": 0.8076207551866152, "learning_rate": 1.6754484597668746e-06, "loss": 0.0265, "step": 5920 }, { "epoch": 2.2757229320780095, "grad_norm": 0.8746815310291105, "learning_rate": 1.6721095359118884e-06, "loss": 0.0337, "step": 5922 }, { "epoch": 2.2764914977423385, "grad_norm": 0.8846435799658079, "learning_rate": 1.6687732742286723e-06, "loss": 0.0298, "step": 5924 }, { "epoch": 2.2772600634066675, "grad_norm": 1.0593624791807377, "learning_rate": 1.6654396773860927e-06, "loss": 0.0332, "step": 5926 }, { "epoch": 2.278028629070996, "grad_norm": 0.8426559991259504, "learning_rate": 1.6621087480508924e-06, "loss": 0.0294, "step": 5928 }, { "epoch": 2.278797194735325, "grad_norm": 0.8670971810162272, "learning_rate": 1.6587804888876669e-06, "loss": 0.0317, "step": 5930 }, { "epoch": 2.279565760399654, "grad_norm": 1.1133323087860758, "learning_rate": 1.6554549025588878e-06, "loss": 0.0327, "step": 5932 }, { "epoch": 2.2803343260639832, "grad_norm": 0.9167053171407388, "learning_rate": 1.6521319917248818e-06, "loss": 0.0287, "step": 5934 }, { "epoch": 2.2811028917283123, "grad_norm": 0.9199466504007597, "learning_rate": 1.648811759043838e-06, "loss": 0.0316, "step": 5936 }, { "epoch": 2.281871457392641, "grad_norm": 0.9240619753466709, "learning_rate": 1.6454942071717995e-06, "loss": 0.0353, "step": 5938 }, { "epoch": 2.28264002305697, "grad_norm": 0.8920575197223954, "learning_rate": 1.6421793387626722e-06, "loss": 0.0326, "step": 5940 }, { "epoch": 2.283408588721299, "grad_norm": 0.8930469890884619, "learning_rate": 1.6388671564682095e-06, "loss": 0.0349, "step": 5942 }, { "epoch": 2.284177154385628, "grad_norm": 0.9294556584117756, "learning_rate": 1.6355576629380132e-06, "loss": 0.0355, "step": 5944 }, { "epoch": 2.284945720049957, "grad_norm": 1.007325927878941, "learning_rate": 1.632250860819543e-06, "loss": 0.0293, "step": 5946 }, { "epoch": 2.2857142857142856, "grad_norm": 0.859396312611142, "learning_rate": 1.6289467527580993e-06, "loss": 0.031, "step": 5948 }, { "epoch": 2.2864828513786146, "grad_norm": 1.30968058360277, "learning_rate": 1.625645341396827e-06, "loss": 0.0365, "step": 5950 }, { "epoch": 2.2872514170429437, "grad_norm": 1.0224110442050347, "learning_rate": 1.6223466293767198e-06, "loss": 0.0363, "step": 5952 }, { "epoch": 2.2880199827072727, "grad_norm": 0.9713452485815901, "learning_rate": 1.6190506193366052e-06, "loss": 0.032, "step": 5954 }, { "epoch": 2.2887885483716017, "grad_norm": 0.9164545431248516, "learning_rate": 1.6157573139131527e-06, "loss": 0.0315, "step": 5956 }, { "epoch": 2.2895571140359303, "grad_norm": 0.9258621328935093, "learning_rate": 1.6124667157408674e-06, "loss": 0.0297, "step": 5958 }, { "epoch": 2.2903256797002594, "grad_norm": 0.8327114613632633, "learning_rate": 1.6091788274520887e-06, "loss": 0.0327, "step": 5960 }, { "epoch": 2.2910942453645884, "grad_norm": 0.9386636054473388, "learning_rate": 1.6058936516769863e-06, "loss": 0.0339, "step": 5962 }, { "epoch": 2.2918628110289174, "grad_norm": 0.7653021512194486, "learning_rate": 1.6026111910435665e-06, "loss": 0.0312, "step": 5964 }, { "epoch": 2.2926313766932465, "grad_norm": 1.2467430825260393, "learning_rate": 1.599331448177656e-06, "loss": 0.0334, "step": 5966 }, { "epoch": 2.293399942357575, "grad_norm": 0.9104087380003765, "learning_rate": 1.5960544257029116e-06, "loss": 0.0302, "step": 5968 }, { "epoch": 2.294168508021904, "grad_norm": 1.0016066097341374, "learning_rate": 1.592780126240812e-06, "loss": 0.0356, "step": 5970 }, { "epoch": 2.294937073686233, "grad_norm": 0.9155683411381158, "learning_rate": 1.5895085524106597e-06, "loss": 0.0297, "step": 5972 }, { "epoch": 2.295705639350562, "grad_norm": 0.9013291075494171, "learning_rate": 1.5862397068295726e-06, "loss": 0.0293, "step": 5974 }, { "epoch": 2.296474205014891, "grad_norm": 0.8895674532558653, "learning_rate": 1.5829735921124928e-06, "loss": 0.0299, "step": 5976 }, { "epoch": 2.29724277067922, "grad_norm": 0.7611225572727243, "learning_rate": 1.5797102108721723e-06, "loss": 0.0305, "step": 5978 }, { "epoch": 2.298011336343549, "grad_norm": 1.124822620740705, "learning_rate": 1.5764495657191769e-06, "loss": 0.0334, "step": 5980 }, { "epoch": 2.298779902007878, "grad_norm": 1.0110017715550792, "learning_rate": 1.57319165926189e-06, "loss": 0.0306, "step": 5982 }, { "epoch": 2.299548467672207, "grad_norm": 0.9937275524629052, "learning_rate": 1.5699364941064937e-06, "loss": 0.0339, "step": 5984 }, { "epoch": 2.300317033336536, "grad_norm": 1.0014940944647008, "learning_rate": 1.5666840728569833e-06, "loss": 0.0316, "step": 5986 }, { "epoch": 2.3010855990008645, "grad_norm": 0.9753161261647969, "learning_rate": 1.5634343981151607e-06, "loss": 0.0333, "step": 5988 }, { "epoch": 2.3018541646651935, "grad_norm": 0.9578867231271385, "learning_rate": 1.560187472480627e-06, "loss": 0.034, "step": 5990 }, { "epoch": 2.3026227303295226, "grad_norm": 1.1344775182093358, "learning_rate": 1.556943298550786e-06, "loss": 0.0273, "step": 5992 }, { "epoch": 2.3033912959938516, "grad_norm": 0.8819870697508558, "learning_rate": 1.5537018789208397e-06, "loss": 0.0317, "step": 5994 }, { "epoch": 2.3041598616581807, "grad_norm": 0.8593198951558769, "learning_rate": 1.5504632161837869e-06, "loss": 0.0318, "step": 5996 }, { "epoch": 2.3049284273225092, "grad_norm": 1.2328139730210155, "learning_rate": 1.5472273129304188e-06, "loss": 0.0344, "step": 5998 }, { "epoch": 2.3056969929868383, "grad_norm": 1.0498506803601948, "learning_rate": 1.543994171749325e-06, "loss": 0.0313, "step": 6000 }, { "epoch": 2.3056969929868383, "eval_loss": 0.1776677817106247, "eval_runtime": 391.5531, "eval_samples_per_second": 47.261, "eval_steps_per_second": 5.91, "step": 6000 }, { "epoch": 2.3064655586511673, "grad_norm": 0.9550869220948441, "learning_rate": 1.54076379522688e-06, "loss": 0.0338, "step": 6002 }, { "epoch": 2.3072341243154963, "grad_norm": 0.8848815653070123, "learning_rate": 1.5375361859472492e-06, "loss": 0.0343, "step": 6004 }, { "epoch": 2.3080026899798254, "grad_norm": 0.9571600263206651, "learning_rate": 1.5343113464923808e-06, "loss": 0.0294, "step": 6006 }, { "epoch": 2.308771255644154, "grad_norm": 0.8433374017942371, "learning_rate": 1.5310892794420163e-06, "loss": 0.0315, "step": 6008 }, { "epoch": 2.309539821308483, "grad_norm": 0.8907972963058427, "learning_rate": 1.5278699873736653e-06, "loss": 0.03, "step": 6010 }, { "epoch": 2.310308386972812, "grad_norm": 0.8983267282745284, "learning_rate": 1.5246534728626317e-06, "loss": 0.0312, "step": 6012 }, { "epoch": 2.311076952637141, "grad_norm": 0.934534781861702, "learning_rate": 1.521439738481989e-06, "loss": 0.0275, "step": 6014 }, { "epoch": 2.31184551830147, "grad_norm": 0.8706654289048824, "learning_rate": 1.5182287868025887e-06, "loss": 0.0313, "step": 6016 }, { "epoch": 2.3126140839657987, "grad_norm": 0.8580595595357888, "learning_rate": 1.515020620393055e-06, "loss": 0.0294, "step": 6018 }, { "epoch": 2.3133826496301277, "grad_norm": 0.9369958749182182, "learning_rate": 1.511815241819789e-06, "loss": 0.0298, "step": 6020 }, { "epoch": 2.3141512152944568, "grad_norm": 0.8381358867201331, "learning_rate": 1.5086126536469558e-06, "loss": 0.027, "step": 6022 }, { "epoch": 2.314919780958786, "grad_norm": 0.900591944623819, "learning_rate": 1.5054128584364918e-06, "loss": 0.0284, "step": 6024 }, { "epoch": 2.315688346623115, "grad_norm": 0.8473738464214834, "learning_rate": 1.5022158587480973e-06, "loss": 0.0334, "step": 6026 }, { "epoch": 2.3164569122874434, "grad_norm": 0.9664924420820546, "learning_rate": 1.4990216571392368e-06, "loss": 0.0352, "step": 6028 }, { "epoch": 2.3172254779517725, "grad_norm": 0.9574107759282299, "learning_rate": 1.495830256165135e-06, "loss": 0.0316, "step": 6030 }, { "epoch": 2.3179940436161015, "grad_norm": 1.0550121921188538, "learning_rate": 1.4926416583787817e-06, "loss": 0.0291, "step": 6032 }, { "epoch": 2.3187626092804305, "grad_norm": 0.97610648236116, "learning_rate": 1.4894558663309183e-06, "loss": 0.0338, "step": 6034 }, { "epoch": 2.3195311749447596, "grad_norm": 0.9382064019120636, "learning_rate": 1.486272882570044e-06, "loss": 0.0324, "step": 6036 }, { "epoch": 2.320299740609088, "grad_norm": 1.1781703595263988, "learning_rate": 1.4830927096424107e-06, "loss": 0.0333, "step": 6038 }, { "epoch": 2.321068306273417, "grad_norm": 0.9710599242702864, "learning_rate": 1.4799153500920238e-06, "loss": 0.0279, "step": 6040 }, { "epoch": 2.3218368719377462, "grad_norm": 0.8312576484288945, "learning_rate": 1.4767408064606337e-06, "loss": 0.0258, "step": 6042 }, { "epoch": 2.3226054376020753, "grad_norm": 1.0843740451065498, "learning_rate": 1.473569081287745e-06, "loss": 0.0328, "step": 6044 }, { "epoch": 2.3233740032664043, "grad_norm": 0.9706583080900374, "learning_rate": 1.4704001771106024e-06, "loss": 0.0324, "step": 6046 }, { "epoch": 2.324142568930733, "grad_norm": 0.9099949074854775, "learning_rate": 1.4672340964641952e-06, "loss": 0.0304, "step": 6048 }, { "epoch": 2.324911134595062, "grad_norm": 0.9764464232563469, "learning_rate": 1.4640708418812538e-06, "loss": 0.0335, "step": 6050 }, { "epoch": 2.325679700259391, "grad_norm": 1.0308956863534215, "learning_rate": 1.4609104158922489e-06, "loss": 0.0337, "step": 6052 }, { "epoch": 2.32644826592372, "grad_norm": 0.9962707087875775, "learning_rate": 1.457752821025385e-06, "loss": 0.0351, "step": 6054 }, { "epoch": 2.327216831588049, "grad_norm": 1.1520748574196924, "learning_rate": 1.454598059806609e-06, "loss": 0.035, "step": 6056 }, { "epoch": 2.3279853972523776, "grad_norm": 1.0740332200986065, "learning_rate": 1.4514461347595948e-06, "loss": 0.0327, "step": 6058 }, { "epoch": 2.3287539629167067, "grad_norm": 0.7905211165568543, "learning_rate": 1.4482970484057485e-06, "loss": 0.0285, "step": 6060 }, { "epoch": 2.3295225285810357, "grad_norm": 0.8885643155661286, "learning_rate": 1.445150803264207e-06, "loss": 0.032, "step": 6062 }, { "epoch": 2.3302910942453647, "grad_norm": 1.0109173283682658, "learning_rate": 1.4420074018518333e-06, "loss": 0.0364, "step": 6064 }, { "epoch": 2.3310596599096938, "grad_norm": 0.9924103291565507, "learning_rate": 1.438866846683214e-06, "loss": 0.0297, "step": 6066 }, { "epoch": 2.3318282255740224, "grad_norm": 1.010064447079299, "learning_rate": 1.435729140270663e-06, "loss": 0.0338, "step": 6068 }, { "epoch": 2.3325967912383514, "grad_norm": 0.938776186988429, "learning_rate": 1.4325942851242125e-06, "loss": 0.0251, "step": 6070 }, { "epoch": 2.3333653569026804, "grad_norm": 0.8073748032043968, "learning_rate": 1.429462283751612e-06, "loss": 0.025, "step": 6072 }, { "epoch": 2.3341339225670095, "grad_norm": 0.8405923417007354, "learning_rate": 1.4263331386583351e-06, "loss": 0.0324, "step": 6074 }, { "epoch": 2.3349024882313385, "grad_norm": 0.9548597637827023, "learning_rate": 1.4232068523475612e-06, "loss": 0.0312, "step": 6076 }, { "epoch": 2.335671053895667, "grad_norm": 1.1408567724782988, "learning_rate": 1.4200834273201864e-06, "loss": 0.036, "step": 6078 }, { "epoch": 2.336439619559996, "grad_norm": 0.8985909397767691, "learning_rate": 1.416962866074823e-06, "loss": 0.0268, "step": 6080 }, { "epoch": 2.337208185224325, "grad_norm": 0.9459532482637584, "learning_rate": 1.4138451711077856e-06, "loss": 0.0303, "step": 6082 }, { "epoch": 2.337976750888654, "grad_norm": 0.8178177371591966, "learning_rate": 1.4107303449130978e-06, "loss": 0.0288, "step": 6084 }, { "epoch": 2.3387453165529832, "grad_norm": 0.9464512167105799, "learning_rate": 1.407618389982493e-06, "loss": 0.0288, "step": 6086 }, { "epoch": 2.339513882217312, "grad_norm": 0.957242280255979, "learning_rate": 1.4045093088053991e-06, "loss": 0.0299, "step": 6088 }, { "epoch": 2.340282447881641, "grad_norm": 0.7330127476040568, "learning_rate": 1.4014031038689497e-06, "loss": 0.0242, "step": 6090 }, { "epoch": 2.34105101354597, "grad_norm": 1.0321834622589827, "learning_rate": 1.398299777657981e-06, "loss": 0.031, "step": 6092 }, { "epoch": 2.341819579210299, "grad_norm": 0.926004183552732, "learning_rate": 1.3951993326550206e-06, "loss": 0.0308, "step": 6094 }, { "epoch": 2.342588144874628, "grad_norm": 0.836302002040745, "learning_rate": 1.3921017713402929e-06, "loss": 0.0316, "step": 6096 }, { "epoch": 2.3433567105389566, "grad_norm": 0.9217230675493828, "learning_rate": 1.3890070961917186e-06, "loss": 0.0334, "step": 6098 }, { "epoch": 2.3441252762032856, "grad_norm": 0.8932991768570027, "learning_rate": 1.3859153096849065e-06, "loss": 0.031, "step": 6100 }, { "epoch": 2.3448938418676146, "grad_norm": 0.9494827768088175, "learning_rate": 1.382826414293152e-06, "loss": 0.0362, "step": 6102 }, { "epoch": 2.3456624075319437, "grad_norm": 0.9853746373902722, "learning_rate": 1.3797404124874441e-06, "loss": 0.0307, "step": 6104 }, { "epoch": 2.3464309731962727, "grad_norm": 1.1503838015185683, "learning_rate": 1.376657306736453e-06, "loss": 0.0331, "step": 6106 }, { "epoch": 2.3471995388606013, "grad_norm": 0.836977103702859, "learning_rate": 1.3735770995065328e-06, "loss": 0.0283, "step": 6108 }, { "epoch": 2.3479681045249303, "grad_norm": 0.9708813995394042, "learning_rate": 1.3704997932617182e-06, "loss": 0.0311, "step": 6110 }, { "epoch": 2.3487366701892594, "grad_norm": 0.9771685459123309, "learning_rate": 1.3674253904637268e-06, "loss": 0.0357, "step": 6112 }, { "epoch": 2.3495052358535884, "grad_norm": 1.0573301494203524, "learning_rate": 1.3643538935719492e-06, "loss": 0.0337, "step": 6114 }, { "epoch": 2.3502738015179174, "grad_norm": 0.8412100295170292, "learning_rate": 1.3612853050434532e-06, "loss": 0.0295, "step": 6116 }, { "epoch": 2.351042367182246, "grad_norm": 0.8763698997257661, "learning_rate": 1.3582196273329806e-06, "loss": 0.027, "step": 6118 }, { "epoch": 2.351810932846575, "grad_norm": 0.8805188261585938, "learning_rate": 1.3551568628929434e-06, "loss": 0.0281, "step": 6120 }, { "epoch": 2.352579498510904, "grad_norm": 0.9552313513838278, "learning_rate": 1.3520970141734225e-06, "loss": 0.0334, "step": 6122 }, { "epoch": 2.353348064175233, "grad_norm": 1.0350951403140671, "learning_rate": 1.3490400836221706e-06, "loss": 0.0313, "step": 6124 }, { "epoch": 2.354116629839562, "grad_norm": 1.009472037141673, "learning_rate": 1.3459860736846013e-06, "loss": 0.0323, "step": 6126 }, { "epoch": 2.3548851955038907, "grad_norm": 0.935989010446079, "learning_rate": 1.3429349868037938e-06, "loss": 0.0322, "step": 6128 }, { "epoch": 2.35565376116822, "grad_norm": 1.042542041056386, "learning_rate": 1.3398868254204884e-06, "loss": 0.0362, "step": 6130 }, { "epoch": 2.356422326832549, "grad_norm": 0.8588543093545278, "learning_rate": 1.336841591973086e-06, "loss": 0.033, "step": 6132 }, { "epoch": 2.357190892496878, "grad_norm": 1.2289833227771632, "learning_rate": 1.3337992888976425e-06, "loss": 0.0388, "step": 6134 }, { "epoch": 2.357959458161207, "grad_norm": 0.9056766688836796, "learning_rate": 1.3307599186278747e-06, "loss": 0.0268, "step": 6136 }, { "epoch": 2.3587280238255355, "grad_norm": 0.8493164597006577, "learning_rate": 1.3277234835951503e-06, "loss": 0.0355, "step": 6138 }, { "epoch": 2.3594965894898645, "grad_norm": 1.0821708331236752, "learning_rate": 1.3246899862284872e-06, "loss": 0.0306, "step": 6140 }, { "epoch": 2.3602651551541936, "grad_norm": 0.8764877271891676, "learning_rate": 1.3216594289545566e-06, "loss": 0.0299, "step": 6142 }, { "epoch": 2.3610337208185226, "grad_norm": 0.8990389250576687, "learning_rate": 1.318631814197675e-06, "loss": 0.0351, "step": 6144 }, { "epoch": 2.3618022864828516, "grad_norm": 0.8628167822118563, "learning_rate": 1.3156071443798058e-06, "loss": 0.0287, "step": 6146 }, { "epoch": 2.36257085214718, "grad_norm": 1.3328414652242286, "learning_rate": 1.3125854219205597e-06, "loss": 0.0324, "step": 6148 }, { "epoch": 2.3633394178115092, "grad_norm": 0.9828057920669566, "learning_rate": 1.3095666492371855e-06, "loss": 0.0335, "step": 6150 }, { "epoch": 2.3641079834758383, "grad_norm": 0.9324116412785937, "learning_rate": 1.3065508287445738e-06, "loss": 0.0296, "step": 6152 }, { "epoch": 2.3648765491401673, "grad_norm": 1.0243807380221257, "learning_rate": 1.3035379628552542e-06, "loss": 0.0335, "step": 6154 }, { "epoch": 2.3656451148044964, "grad_norm": 0.8055220518220095, "learning_rate": 1.3005280539793908e-06, "loss": 0.0314, "step": 6156 }, { "epoch": 2.366413680468825, "grad_norm": 0.9577237968090377, "learning_rate": 1.2975211045247826e-06, "loss": 0.0301, "step": 6158 }, { "epoch": 2.367182246133154, "grad_norm": 0.9528718126542017, "learning_rate": 1.2945171168968661e-06, "loss": 0.0348, "step": 6160 }, { "epoch": 2.367950811797483, "grad_norm": 1.1374230461315007, "learning_rate": 1.2915160934987015e-06, "loss": 0.0333, "step": 6162 }, { "epoch": 2.368719377461812, "grad_norm": 0.9469037562895174, "learning_rate": 1.2885180367309801e-06, "loss": 0.0327, "step": 6164 }, { "epoch": 2.369487943126141, "grad_norm": 0.9562475898168009, "learning_rate": 1.2855229489920251e-06, "loss": 0.0305, "step": 6166 }, { "epoch": 2.3702565087904697, "grad_norm": 0.6900061001472727, "learning_rate": 1.2825308326777753e-06, "loss": 0.0267, "step": 6168 }, { "epoch": 2.3710250744547987, "grad_norm": 0.8833765011106044, "learning_rate": 1.2795416901817974e-06, "loss": 0.0271, "step": 6170 }, { "epoch": 2.3717936401191277, "grad_norm": 1.0340869204388614, "learning_rate": 1.2765555238952826e-06, "loss": 0.0349, "step": 6172 }, { "epoch": 2.372562205783457, "grad_norm": 0.9736850784587424, "learning_rate": 1.2735723362070362e-06, "loss": 0.0334, "step": 6174 }, { "epoch": 2.373330771447786, "grad_norm": 0.8411793139462941, "learning_rate": 1.27059212950348e-06, "loss": 0.0311, "step": 6176 }, { "epoch": 2.3740993371121144, "grad_norm": 0.9085165712286899, "learning_rate": 1.2676149061686572e-06, "loss": 0.0313, "step": 6178 }, { "epoch": 2.3748679027764434, "grad_norm": 1.044676411556788, "learning_rate": 1.2646406685842199e-06, "loss": 0.0383, "step": 6180 }, { "epoch": 2.3756364684407725, "grad_norm": 1.0905040776052208, "learning_rate": 1.2616694191294292e-06, "loss": 0.0386, "step": 6182 }, { "epoch": 2.3764050341051015, "grad_norm": 0.9377100129214377, "learning_rate": 1.2587011601811627e-06, "loss": 0.031, "step": 6184 }, { "epoch": 2.3771735997694305, "grad_norm": 1.2570643746921568, "learning_rate": 1.2557358941139014e-06, "loss": 0.0339, "step": 6186 }, { "epoch": 2.377942165433759, "grad_norm": 0.9410784477793802, "learning_rate": 1.25277362329973e-06, "loss": 0.0334, "step": 6188 }, { "epoch": 2.378710731098088, "grad_norm": 0.8152947036192679, "learning_rate": 1.2498143501083449e-06, "loss": 0.0331, "step": 6190 }, { "epoch": 2.379479296762417, "grad_norm": 0.837217664545045, "learning_rate": 1.2468580769070383e-06, "loss": 0.0259, "step": 6192 }, { "epoch": 2.3802478624267462, "grad_norm": 0.8891272786568134, "learning_rate": 1.2439048060606996e-06, "loss": 0.034, "step": 6194 }, { "epoch": 2.3810164280910753, "grad_norm": 0.9260245477664959, "learning_rate": 1.2409545399318252e-06, "loss": 0.0286, "step": 6196 }, { "epoch": 2.381784993755404, "grad_norm": 1.1167281973981114, "learning_rate": 1.2380072808805022e-06, "loss": 0.0336, "step": 6198 }, { "epoch": 2.382553559419733, "grad_norm": 0.899917770427177, "learning_rate": 1.2350630312644114e-06, "loss": 0.0274, "step": 6200 }, { "epoch": 2.383322125084062, "grad_norm": 0.9551459664982198, "learning_rate": 1.2321217934388312e-06, "loss": 0.0307, "step": 6202 }, { "epoch": 2.384090690748391, "grad_norm": 0.9087670810967209, "learning_rate": 1.2291835697566262e-06, "loss": 0.0307, "step": 6204 }, { "epoch": 2.38485925641272, "grad_norm": 1.0029901425050405, "learning_rate": 1.2262483625682514e-06, "loss": 0.0314, "step": 6206 }, { "epoch": 2.3856278220770486, "grad_norm": 0.8751709875772684, "learning_rate": 1.2233161742217486e-06, "loss": 0.0291, "step": 6208 }, { "epoch": 2.3863963877413776, "grad_norm": 0.985000550188052, "learning_rate": 1.2203870070627437e-06, "loss": 0.0319, "step": 6210 }, { "epoch": 2.3871649534057067, "grad_norm": 0.9192269491537859, "learning_rate": 1.2174608634344464e-06, "loss": 0.0335, "step": 6212 }, { "epoch": 2.3879335190700357, "grad_norm": 0.8911203431298392, "learning_rate": 1.2145377456776496e-06, "loss": 0.0331, "step": 6214 }, { "epoch": 2.3887020847343647, "grad_norm": 0.9196027046856164, "learning_rate": 1.2116176561307241e-06, "loss": 0.0334, "step": 6216 }, { "epoch": 2.3894706503986933, "grad_norm": 0.8436877524548119, "learning_rate": 1.2087005971296167e-06, "loss": 0.0293, "step": 6218 }, { "epoch": 2.3902392160630224, "grad_norm": 0.8749152040546565, "learning_rate": 1.2057865710078525e-06, "loss": 0.0318, "step": 6220 }, { "epoch": 2.3910077817273514, "grad_norm": 0.8973647286111471, "learning_rate": 1.2028755800965286e-06, "loss": 0.0293, "step": 6222 }, { "epoch": 2.3917763473916804, "grad_norm": 0.8752625975458487, "learning_rate": 1.1999676267243148e-06, "loss": 0.0345, "step": 6224 }, { "epoch": 2.3925449130560095, "grad_norm": 0.835874654663785, "learning_rate": 1.19706271321745e-06, "loss": 0.0284, "step": 6226 }, { "epoch": 2.393313478720338, "grad_norm": 0.8563919886022364, "learning_rate": 1.194160841899744e-06, "loss": 0.0313, "step": 6228 }, { "epoch": 2.394082044384667, "grad_norm": 0.8172582072786218, "learning_rate": 1.1912620150925713e-06, "loss": 0.0273, "step": 6230 }, { "epoch": 2.394850610048996, "grad_norm": 0.9941735132989908, "learning_rate": 1.18836623511487e-06, "loss": 0.0266, "step": 6232 }, { "epoch": 2.395619175713325, "grad_norm": 0.8982319586660505, "learning_rate": 1.1854735042831417e-06, "loss": 0.0283, "step": 6234 }, { "epoch": 2.396387741377654, "grad_norm": 0.9228501648048444, "learning_rate": 1.1825838249114497e-06, "loss": 0.0289, "step": 6236 }, { "epoch": 2.397156307041983, "grad_norm": 0.9458628073037663, "learning_rate": 1.1796971993114125e-06, "loss": 0.0319, "step": 6238 }, { "epoch": 2.397924872706312, "grad_norm": 0.9827843804213633, "learning_rate": 1.1768136297922129e-06, "loss": 0.0272, "step": 6240 }, { "epoch": 2.398693438370641, "grad_norm": 0.9718104205233962, "learning_rate": 1.1739331186605823e-06, "loss": 0.0304, "step": 6242 }, { "epoch": 2.39946200403497, "grad_norm": 0.8821257625947041, "learning_rate": 1.1710556682208069e-06, "loss": 0.028, "step": 6244 }, { "epoch": 2.400230569699299, "grad_norm": 1.0053625788437306, "learning_rate": 1.1681812807747295e-06, "loss": 0.0286, "step": 6246 }, { "epoch": 2.4009991353636275, "grad_norm": 0.8124221487823499, "learning_rate": 1.1653099586217349e-06, "loss": 0.0256, "step": 6248 }, { "epoch": 2.4017677010279566, "grad_norm": 0.9231334779589119, "learning_rate": 1.1624417040587593e-06, "loss": 0.0327, "step": 6250 }, { "epoch": 2.4025362666922856, "grad_norm": 1.127976940846233, "learning_rate": 1.1595765193802877e-06, "loss": 0.03, "step": 6252 }, { "epoch": 2.4033048323566146, "grad_norm": 1.0263002809765875, "learning_rate": 1.1567144068783452e-06, "loss": 0.0316, "step": 6254 }, { "epoch": 2.4040733980209437, "grad_norm": 1.1540884029276097, "learning_rate": 1.1538553688425002e-06, "loss": 0.0342, "step": 6256 }, { "epoch": 2.4048419636852723, "grad_norm": 0.8808308742189771, "learning_rate": 1.1509994075598663e-06, "loss": 0.0298, "step": 6258 }, { "epoch": 2.4056105293496013, "grad_norm": 0.8684330600053505, "learning_rate": 1.148146525315087e-06, "loss": 0.0301, "step": 6260 }, { "epoch": 2.4063790950139303, "grad_norm": 1.001127868344964, "learning_rate": 1.1452967243903484e-06, "loss": 0.0347, "step": 6262 }, { "epoch": 2.4071476606782594, "grad_norm": 1.029534572348146, "learning_rate": 1.1424500070653733e-06, "loss": 0.0304, "step": 6264 }, { "epoch": 2.407916226342588, "grad_norm": 0.9587719405114409, "learning_rate": 1.1396063756174142e-06, "loss": 0.0293, "step": 6266 }, { "epoch": 2.408684792006917, "grad_norm": 0.9048569928971546, "learning_rate": 1.1367658323212538e-06, "loss": 0.0315, "step": 6268 }, { "epoch": 2.409453357671246, "grad_norm": 0.8665846568934097, "learning_rate": 1.1339283794492106e-06, "loss": 0.0296, "step": 6270 }, { "epoch": 2.410221923335575, "grad_norm": 0.8758730043300434, "learning_rate": 1.1310940192711266e-06, "loss": 0.0281, "step": 6272 }, { "epoch": 2.410990488999904, "grad_norm": 0.8634237448421659, "learning_rate": 1.1282627540543662e-06, "loss": 0.0333, "step": 6274 }, { "epoch": 2.4117590546642327, "grad_norm": 0.8369011750227572, "learning_rate": 1.1254345860638256e-06, "loss": 0.0347, "step": 6276 }, { "epoch": 2.4125276203285617, "grad_norm": 0.9853739767593754, "learning_rate": 1.1226095175619184e-06, "loss": 0.032, "step": 6278 }, { "epoch": 2.4132961859928908, "grad_norm": 0.8157800413217577, "learning_rate": 1.1197875508085793e-06, "loss": 0.0295, "step": 6280 }, { "epoch": 2.41406475165722, "grad_norm": 1.00985002235359, "learning_rate": 1.1169686880612646e-06, "loss": 0.0324, "step": 6282 }, { "epoch": 2.414833317321549, "grad_norm": 1.0423647907230025, "learning_rate": 1.1141529315749455e-06, "loss": 0.0379, "step": 6284 }, { "epoch": 2.4156018829858774, "grad_norm": 0.9593595386541955, "learning_rate": 1.1113402836021042e-06, "loss": 0.0299, "step": 6286 }, { "epoch": 2.4163704486502064, "grad_norm": 1.0091361232313747, "learning_rate": 1.1085307463927448e-06, "loss": 0.0282, "step": 6288 }, { "epoch": 2.4171390143145355, "grad_norm": 0.8305235323877965, "learning_rate": 1.1057243221943765e-06, "loss": 0.03, "step": 6290 }, { "epoch": 2.4179075799788645, "grad_norm": 0.8538055750822274, "learning_rate": 1.1029210132520185e-06, "loss": 0.0252, "step": 6292 }, { "epoch": 2.4186761456431936, "grad_norm": 1.0477843288284823, "learning_rate": 1.1001208218082027e-06, "loss": 0.0312, "step": 6294 }, { "epoch": 2.419444711307522, "grad_norm": 1.005609491798349, "learning_rate": 1.0973237501029626e-06, "loss": 0.0341, "step": 6296 }, { "epoch": 2.420213276971851, "grad_norm": 0.9566963996129869, "learning_rate": 1.0945298003738369e-06, "loss": 0.0306, "step": 6298 }, { "epoch": 2.42098184263618, "grad_norm": 1.0493211652527417, "learning_rate": 1.0917389748558681e-06, "loss": 0.0375, "step": 6300 }, { "epoch": 2.4217504083005092, "grad_norm": 0.8605501340733691, "learning_rate": 1.0889512757815978e-06, "loss": 0.0275, "step": 6302 }, { "epoch": 2.4225189739648383, "grad_norm": 1.1256048234732654, "learning_rate": 1.086166705381067e-06, "loss": 0.0355, "step": 6304 }, { "epoch": 2.423287539629167, "grad_norm": 0.9035960583575421, "learning_rate": 1.0833852658818167e-06, "loss": 0.031, "step": 6306 }, { "epoch": 2.424056105293496, "grad_norm": 1.0478514760863735, "learning_rate": 1.0806069595088792e-06, "loss": 0.0386, "step": 6308 }, { "epoch": 2.424824670957825, "grad_norm": 0.8719138065288091, "learning_rate": 1.0778317884847834e-06, "loss": 0.0294, "step": 6310 }, { "epoch": 2.425593236622154, "grad_norm": 0.8592317227746658, "learning_rate": 1.0750597550295472e-06, "loss": 0.0278, "step": 6312 }, { "epoch": 2.426361802286483, "grad_norm": 0.9292348369279388, "learning_rate": 1.0722908613606813e-06, "loss": 0.0348, "step": 6314 }, { "epoch": 2.4271303679508116, "grad_norm": 0.8469021249847731, "learning_rate": 1.069525109693183e-06, "loss": 0.0338, "step": 6316 }, { "epoch": 2.4278989336151406, "grad_norm": 0.9496527951983709, "learning_rate": 1.0667625022395378e-06, "loss": 0.0314, "step": 6318 }, { "epoch": 2.4286674992794697, "grad_norm": 0.9790001434795267, "learning_rate": 1.0640030412097146e-06, "loss": 0.0332, "step": 6320 }, { "epoch": 2.4294360649437987, "grad_norm": 0.8832295505029365, "learning_rate": 1.0612467288111655e-06, "loss": 0.0299, "step": 6322 }, { "epoch": 2.4302046306081277, "grad_norm": 1.0693089898488894, "learning_rate": 1.0584935672488244e-06, "loss": 0.0299, "step": 6324 }, { "epoch": 2.4309731962724563, "grad_norm": 0.9259513003266131, "learning_rate": 1.0557435587251031e-06, "loss": 0.033, "step": 6326 }, { "epoch": 2.4317417619367854, "grad_norm": 1.0511269499949598, "learning_rate": 1.052996705439892e-06, "loss": 0.0343, "step": 6328 }, { "epoch": 2.4325103276011144, "grad_norm": 1.1346498538833438, "learning_rate": 1.0502530095905599e-06, "loss": 0.0358, "step": 6330 }, { "epoch": 2.4332788932654434, "grad_norm": 0.9390159874584031, "learning_rate": 1.047512473371946e-06, "loss": 0.0314, "step": 6332 }, { "epoch": 2.4340474589297725, "grad_norm": 1.0500129816752868, "learning_rate": 1.0447750989763639e-06, "loss": 0.0317, "step": 6334 }, { "epoch": 2.434816024594101, "grad_norm": 0.9681786719353576, "learning_rate": 1.0420408885935956e-06, "loss": 0.033, "step": 6336 }, { "epoch": 2.43558459025843, "grad_norm": 0.9720884793650917, "learning_rate": 1.0393098444108985e-06, "loss": 0.0329, "step": 6338 }, { "epoch": 2.436353155922759, "grad_norm": 0.8999191634537679, "learning_rate": 1.0365819686129886e-06, "loss": 0.0278, "step": 6340 }, { "epoch": 2.437121721587088, "grad_norm": 1.0805618360378626, "learning_rate": 1.0338572633820504e-06, "loss": 0.0332, "step": 6342 }, { "epoch": 2.437890287251417, "grad_norm": 0.9325816697435729, "learning_rate": 1.0311357308977372e-06, "loss": 0.0337, "step": 6344 }, { "epoch": 2.438658852915746, "grad_norm": 0.8986185677783991, "learning_rate": 1.0284173733371566e-06, "loss": 0.0289, "step": 6346 }, { "epoch": 2.439427418580075, "grad_norm": 0.8790368666347483, "learning_rate": 1.02570219287488e-06, "loss": 0.0274, "step": 6348 }, { "epoch": 2.440195984244404, "grad_norm": 1.035253744058864, "learning_rate": 1.022990191682941e-06, "loss": 0.0334, "step": 6350 }, { "epoch": 2.440964549908733, "grad_norm": 0.8814859304747412, "learning_rate": 1.0202813719308213e-06, "loss": 0.0319, "step": 6352 }, { "epoch": 2.441733115573062, "grad_norm": 0.9475133837660361, "learning_rate": 1.017575735785462e-06, "loss": 0.0282, "step": 6354 }, { "epoch": 2.4425016812373905, "grad_norm": 1.0473899730376843, "learning_rate": 1.014873285411262e-06, "loss": 0.0304, "step": 6356 }, { "epoch": 2.4432702469017196, "grad_norm": 0.7984933531252377, "learning_rate": 1.0121740229700643e-06, "loss": 0.0273, "step": 6358 }, { "epoch": 2.4440388125660486, "grad_norm": 0.8359180247082129, "learning_rate": 1.0094779506211643e-06, "loss": 0.0294, "step": 6360 }, { "epoch": 2.4448073782303776, "grad_norm": 0.8714741338620526, "learning_rate": 1.0067850705213084e-06, "loss": 0.0284, "step": 6362 }, { "epoch": 2.4455759438947067, "grad_norm": 0.860172913539049, "learning_rate": 1.0040953848246876e-06, "loss": 0.0265, "step": 6364 }, { "epoch": 2.4463445095590353, "grad_norm": 0.8698034983950517, "learning_rate": 1.0014088956829316e-06, "loss": 0.0324, "step": 6366 }, { "epoch": 2.4471130752233643, "grad_norm": 0.8740541787576065, "learning_rate": 9.987256052451234e-07, "loss": 0.0287, "step": 6368 }, { "epoch": 2.4478816408876933, "grad_norm": 0.996323509441302, "learning_rate": 9.960455156577796e-07, "loss": 0.0325, "step": 6370 }, { "epoch": 2.4486502065520224, "grad_norm": 1.0801089646988793, "learning_rate": 9.93368629064858e-07, "loss": 0.0391, "step": 6372 }, { "epoch": 2.4494187722163514, "grad_norm": 1.010911764580045, "learning_rate": 9.906949476077577e-07, "loss": 0.0302, "step": 6374 }, { "epoch": 2.45018733788068, "grad_norm": 0.9750585777246091, "learning_rate": 9.880244734253103e-07, "loss": 0.0317, "step": 6376 }, { "epoch": 2.450955903545009, "grad_norm": 1.113152440978864, "learning_rate": 9.853572086537783e-07, "loss": 0.0332, "step": 6378 }, { "epoch": 2.451724469209338, "grad_norm": 0.8700401393315036, "learning_rate": 9.826931554268648e-07, "loss": 0.0335, "step": 6380 }, { "epoch": 2.452493034873667, "grad_norm": 1.0281965275769758, "learning_rate": 9.80032315875699e-07, "loss": 0.0315, "step": 6382 }, { "epoch": 2.453261600537996, "grad_norm": 0.9820199369099935, "learning_rate": 9.773746921288384e-07, "loss": 0.0287, "step": 6384 }, { "epoch": 2.4540301662023247, "grad_norm": 0.9521565577585847, "learning_rate": 9.74720286312273e-07, "loss": 0.0283, "step": 6386 }, { "epoch": 2.4547987318666538, "grad_norm": 0.9861478877322565, "learning_rate": 9.720691005494126e-07, "loss": 0.0322, "step": 6388 }, { "epoch": 2.455567297530983, "grad_norm": 0.8795087597753082, "learning_rate": 9.69421136961095e-07, "loss": 0.0284, "step": 6390 }, { "epoch": 2.456335863195312, "grad_norm": 0.7103322625795797, "learning_rate": 9.667763976655793e-07, "loss": 0.0272, "step": 6392 }, { "epoch": 2.457104428859641, "grad_norm": 0.8321709142906215, "learning_rate": 9.641348847785443e-07, "loss": 0.0287, "step": 6394 }, { "epoch": 2.4578729945239695, "grad_norm": 0.9315194522209213, "learning_rate": 9.614966004130878e-07, "loss": 0.0323, "step": 6396 }, { "epoch": 2.4586415601882985, "grad_norm": 0.9905133290529423, "learning_rate": 9.588615466797286e-07, "loss": 0.0337, "step": 6398 }, { "epoch": 2.4594101258526275, "grad_norm": 1.077256841345869, "learning_rate": 9.562297256863974e-07, "loss": 0.0273, "step": 6400 }, { "epoch": 2.4601786915169566, "grad_norm": 0.8991387028668573, "learning_rate": 9.536011395384387e-07, "loss": 0.0284, "step": 6402 }, { "epoch": 2.4609472571812856, "grad_norm": 1.1753603912008075, "learning_rate": 9.509757903386113e-07, "loss": 0.0307, "step": 6404 }, { "epoch": 2.461715822845614, "grad_norm": 0.8432435204808522, "learning_rate": 9.483536801870835e-07, "loss": 0.0293, "step": 6406 }, { "epoch": 2.4624843885099432, "grad_norm": 0.9644809463419595, "learning_rate": 9.457348111814307e-07, "loss": 0.0264, "step": 6408 }, { "epoch": 2.4632529541742723, "grad_norm": 0.794124393306731, "learning_rate": 9.431191854166411e-07, "loss": 0.0249, "step": 6410 }, { "epoch": 2.4640215198386013, "grad_norm": 0.9654799991693823, "learning_rate": 9.405068049851023e-07, "loss": 0.033, "step": 6412 }, { "epoch": 2.4647900855029303, "grad_norm": 0.9629884955407199, "learning_rate": 9.378976719766098e-07, "loss": 0.033, "step": 6414 }, { "epoch": 2.465558651167259, "grad_norm": 1.187954921295432, "learning_rate": 9.35291788478358e-07, "loss": 0.0351, "step": 6416 }, { "epoch": 2.466327216831588, "grad_norm": 0.8623272281384817, "learning_rate": 9.326891565749451e-07, "loss": 0.0326, "step": 6418 }, { "epoch": 2.467095782495917, "grad_norm": 1.151048546916539, "learning_rate": 9.300897783483648e-07, "loss": 0.0317, "step": 6420 }, { "epoch": 2.467864348160246, "grad_norm": 1.0165345836230697, "learning_rate": 9.274936558780118e-07, "loss": 0.0271, "step": 6422 }, { "epoch": 2.468632913824575, "grad_norm": 0.9318925998225205, "learning_rate": 9.249007912406743e-07, "loss": 0.0328, "step": 6424 }, { "epoch": 2.4694014794889037, "grad_norm": 1.0155726588332452, "learning_rate": 9.223111865105338e-07, "loss": 0.0302, "step": 6426 }, { "epoch": 2.4701700451532327, "grad_norm": 0.9523850454533453, "learning_rate": 9.197248437591633e-07, "loss": 0.0307, "step": 6428 }, { "epoch": 2.4709386108175617, "grad_norm": 1.1180789337665311, "learning_rate": 9.17141765055532e-07, "loss": 0.0349, "step": 6430 }, { "epoch": 2.4717071764818908, "grad_norm": 0.9542116936856964, "learning_rate": 9.145619524659882e-07, "loss": 0.0323, "step": 6432 }, { "epoch": 2.47247574214622, "grad_norm": 1.0227058983900146, "learning_rate": 9.119854080542767e-07, "loss": 0.0286, "step": 6434 }, { "epoch": 2.4732443078105484, "grad_norm": 0.9090758596400507, "learning_rate": 9.094121338815237e-07, "loss": 0.0316, "step": 6436 }, { "epoch": 2.4740128734748774, "grad_norm": 1.0305780058876846, "learning_rate": 9.068421320062387e-07, "loss": 0.0337, "step": 6438 }, { "epoch": 2.4747814391392065, "grad_norm": 0.9687581461600948, "learning_rate": 9.042754044843144e-07, "loss": 0.0298, "step": 6440 }, { "epoch": 2.4755500048035355, "grad_norm": 1.1394680692567873, "learning_rate": 9.017119533690277e-07, "loss": 0.0292, "step": 6442 }, { "epoch": 2.4763185704678645, "grad_norm": 1.0099754386599544, "learning_rate": 8.991517807110273e-07, "loss": 0.0296, "step": 6444 }, { "epoch": 2.477087136132193, "grad_norm": 1.0143966372551878, "learning_rate": 8.965948885583431e-07, "loss": 0.0321, "step": 6446 }, { "epoch": 2.477855701796522, "grad_norm": 0.932701338929102, "learning_rate": 8.940412789563835e-07, "loss": 0.0272, "step": 6448 }, { "epoch": 2.478624267460851, "grad_norm": 0.825162656560881, "learning_rate": 8.914909539479271e-07, "loss": 0.0278, "step": 6450 }, { "epoch": 2.47939283312518, "grad_norm": 0.8983443612061126, "learning_rate": 8.88943915573125e-07, "loss": 0.0275, "step": 6452 }, { "epoch": 2.4801613987895093, "grad_norm": 0.9000479894141066, "learning_rate": 8.864001658695026e-07, "loss": 0.0306, "step": 6454 }, { "epoch": 2.480929964453838, "grad_norm": 0.916763241733241, "learning_rate": 8.838597068719518e-07, "loss": 0.0284, "step": 6456 }, { "epoch": 2.481698530118167, "grad_norm": 0.9566849116155112, "learning_rate": 8.813225406127296e-07, "loss": 0.0311, "step": 6458 }, { "epoch": 2.482467095782496, "grad_norm": 0.914229717343525, "learning_rate": 8.787886691214648e-07, "loss": 0.0282, "step": 6460 }, { "epoch": 2.483235661446825, "grad_norm": 0.9699754366122999, "learning_rate": 8.762580944251464e-07, "loss": 0.0284, "step": 6462 }, { "epoch": 2.484004227111154, "grad_norm": 1.035279703853517, "learning_rate": 8.737308185481263e-07, "loss": 0.0359, "step": 6464 }, { "epoch": 2.4847727927754826, "grad_norm": 0.9694333253614933, "learning_rate": 8.712068435121213e-07, "loss": 0.0278, "step": 6466 }, { "epoch": 2.4855413584398116, "grad_norm": 0.9439107526298549, "learning_rate": 8.686861713362027e-07, "loss": 0.0288, "step": 6468 }, { "epoch": 2.4863099241041406, "grad_norm": 1.0032778866504386, "learning_rate": 8.661688040368016e-07, "loss": 0.0317, "step": 6470 }, { "epoch": 2.4870784897684697, "grad_norm": 0.955977679987182, "learning_rate": 8.636547436277065e-07, "loss": 0.0283, "step": 6472 }, { "epoch": 2.4878470554327987, "grad_norm": 0.9493260130834834, "learning_rate": 8.611439921200587e-07, "loss": 0.0333, "step": 6474 }, { "epoch": 2.4886156210971273, "grad_norm": 1.0880414348480019, "learning_rate": 8.586365515223527e-07, "loss": 0.0286, "step": 6476 }, { "epoch": 2.4893841867614563, "grad_norm": 1.2206884937303721, "learning_rate": 8.561324238404367e-07, "loss": 0.0338, "step": 6478 }, { "epoch": 2.4901527524257854, "grad_norm": 0.8894821487863854, "learning_rate": 8.536316110775067e-07, "loss": 0.0296, "step": 6480 }, { "epoch": 2.4909213180901144, "grad_norm": 0.9662551701023767, "learning_rate": 8.511341152341069e-07, "loss": 0.0293, "step": 6482 }, { "epoch": 2.4916898837544434, "grad_norm": 1.1069399810378926, "learning_rate": 8.486399383081279e-07, "loss": 0.0279, "step": 6484 }, { "epoch": 2.492458449418772, "grad_norm": 0.8822729509547681, "learning_rate": 8.461490822948071e-07, "loss": 0.0279, "step": 6486 }, { "epoch": 2.493227015083101, "grad_norm": 1.1000632895831535, "learning_rate": 8.436615491867211e-07, "loss": 0.0313, "step": 6488 }, { "epoch": 2.49399558074743, "grad_norm": 0.9750212876161696, "learning_rate": 8.411773409737955e-07, "loss": 0.0315, "step": 6490 }, { "epoch": 2.494764146411759, "grad_norm": 1.07376593780407, "learning_rate": 8.386964596432901e-07, "loss": 0.0352, "step": 6492 }, { "epoch": 2.495532712076088, "grad_norm": 1.0945953886115516, "learning_rate": 8.362189071798044e-07, "loss": 0.0281, "step": 6494 }, { "epoch": 2.4963012777404168, "grad_norm": 1.0335064843373143, "learning_rate": 8.337446855652764e-07, "loss": 0.0348, "step": 6496 }, { "epoch": 2.497069843404746, "grad_norm": 0.7878526944077424, "learning_rate": 8.312737967789792e-07, "loss": 0.0305, "step": 6498 }, { "epoch": 2.497838409069075, "grad_norm": 1.0561839534343518, "learning_rate": 8.288062427975174e-07, "loss": 0.0323, "step": 6500 }, { "epoch": 2.497838409069075, "eval_loss": 0.1765473186969757, "eval_runtime": 390.3999, "eval_samples_per_second": 47.4, "eval_steps_per_second": 5.927, "step": 6500 }, { "epoch": 2.498606974733404, "grad_norm": 0.9427758575605987, "learning_rate": 8.263420255948329e-07, "loss": 0.0265, "step": 6502 }, { "epoch": 2.499375540397733, "grad_norm": 1.0493645972910841, "learning_rate": 8.238811471421937e-07, "loss": 0.0354, "step": 6504 }, { "epoch": 2.500144106062062, "grad_norm": 1.2514553545238305, "learning_rate": 8.21423609408199e-07, "loss": 0.0354, "step": 6506 }, { "epoch": 2.5009126717263905, "grad_norm": 0.8854489169222982, "learning_rate": 8.189694143587751e-07, "loss": 0.0289, "step": 6508 }, { "epoch": 2.5016812373907196, "grad_norm": 1.122975807263549, "learning_rate": 8.165185639571737e-07, "loss": 0.0325, "step": 6510 }, { "epoch": 2.5024498030550486, "grad_norm": 1.0073530561948987, "learning_rate": 8.140710601639717e-07, "loss": 0.0333, "step": 6512 }, { "epoch": 2.503218368719377, "grad_norm": 1.0434956727197784, "learning_rate": 8.116269049370701e-07, "loss": 0.0298, "step": 6514 }, { "epoch": 2.5039869343837067, "grad_norm": 0.9792959769064291, "learning_rate": 8.091861002316886e-07, "loss": 0.0291, "step": 6516 }, { "epoch": 2.5047555000480353, "grad_norm": 1.009465714787063, "learning_rate": 8.067486480003684e-07, "loss": 0.0317, "step": 6518 }, { "epoch": 2.5055240657123643, "grad_norm": 0.9190226321278292, "learning_rate": 8.043145501929666e-07, "loss": 0.025, "step": 6520 }, { "epoch": 2.5062926313766933, "grad_norm": 0.9222158800658974, "learning_rate": 8.018838087566632e-07, "loss": 0.0239, "step": 6522 }, { "epoch": 2.507061197041022, "grad_norm": 0.944135257209119, "learning_rate": 7.994564256359422e-07, "loss": 0.027, "step": 6524 }, { "epoch": 2.5078297627053514, "grad_norm": 1.2290920923875388, "learning_rate": 7.970324027726123e-07, "loss": 0.0322, "step": 6526 }, { "epoch": 2.50859832836968, "grad_norm": 1.1170359346892142, "learning_rate": 7.946117421057881e-07, "loss": 0.0347, "step": 6528 }, { "epoch": 2.509366894034009, "grad_norm": 0.995731057268171, "learning_rate": 7.921944455718956e-07, "loss": 0.03, "step": 6530 }, { "epoch": 2.510135459698338, "grad_norm": 1.3367666307944026, "learning_rate": 7.897805151046689e-07, "loss": 0.0308, "step": 6532 }, { "epoch": 2.5109040253626667, "grad_norm": 1.1137730959037395, "learning_rate": 7.873699526351524e-07, "loss": 0.0325, "step": 6534 }, { "epoch": 2.511672591026996, "grad_norm": 0.9395159540850272, "learning_rate": 7.849627600916931e-07, "loss": 0.0307, "step": 6536 }, { "epoch": 2.5124411566913247, "grad_norm": 1.0450667539343295, "learning_rate": 7.825589393999439e-07, "loss": 0.0319, "step": 6538 }, { "epoch": 2.5132097223556538, "grad_norm": 0.9575092597415306, "learning_rate": 7.801584924828592e-07, "loss": 0.0292, "step": 6540 }, { "epoch": 2.513978288019983, "grad_norm": 0.910404559206983, "learning_rate": 7.777614212606955e-07, "loss": 0.0277, "step": 6542 }, { "epoch": 2.5147468536843114, "grad_norm": 0.9957010216509086, "learning_rate": 7.753677276510074e-07, "loss": 0.0311, "step": 6544 }, { "epoch": 2.515515419348641, "grad_norm": 1.1297578770859453, "learning_rate": 7.729774135686502e-07, "loss": 0.0308, "step": 6546 }, { "epoch": 2.5162839850129695, "grad_norm": 0.8838486058944544, "learning_rate": 7.705904809257742e-07, "loss": 0.0303, "step": 6548 }, { "epoch": 2.5170525506772985, "grad_norm": 1.043754971624148, "learning_rate": 7.682069316318235e-07, "loss": 0.0341, "step": 6550 }, { "epoch": 2.5178211163416275, "grad_norm": 1.0591965499249727, "learning_rate": 7.658267675935377e-07, "loss": 0.0293, "step": 6552 }, { "epoch": 2.518589682005956, "grad_norm": 1.151425182020231, "learning_rate": 7.634499907149468e-07, "loss": 0.0346, "step": 6554 }, { "epoch": 2.5193582476702856, "grad_norm": 0.9256789781488137, "learning_rate": 7.61076602897371e-07, "loss": 0.0252, "step": 6556 }, { "epoch": 2.520126813334614, "grad_norm": 0.928858338263365, "learning_rate": 7.587066060394232e-07, "loss": 0.0308, "step": 6558 }, { "epoch": 2.5208953789989432, "grad_norm": 0.8448011132551414, "learning_rate": 7.563400020369987e-07, "loss": 0.0285, "step": 6560 }, { "epoch": 2.5216639446632723, "grad_norm": 0.9542015965466251, "learning_rate": 7.539767927832808e-07, "loss": 0.0297, "step": 6562 }, { "epoch": 2.522432510327601, "grad_norm": 0.8273696143407415, "learning_rate": 7.516169801687379e-07, "loss": 0.0292, "step": 6564 }, { "epoch": 2.5232010759919303, "grad_norm": 0.9088682523327773, "learning_rate": 7.492605660811198e-07, "loss": 0.0304, "step": 6566 }, { "epoch": 2.523969641656259, "grad_norm": 0.973990794955316, "learning_rate": 7.469075524054564e-07, "loss": 0.0306, "step": 6568 }, { "epoch": 2.524738207320588, "grad_norm": 0.8756734178316165, "learning_rate": 7.445579410240628e-07, "loss": 0.0281, "step": 6570 }, { "epoch": 2.525506772984917, "grad_norm": 0.8831636343416575, "learning_rate": 7.422117338165274e-07, "loss": 0.0273, "step": 6572 }, { "epoch": 2.5262753386492456, "grad_norm": 0.8862156309133052, "learning_rate": 7.398689326597158e-07, "loss": 0.0258, "step": 6574 }, { "epoch": 2.527043904313575, "grad_norm": 0.969815737928241, "learning_rate": 7.375295394277709e-07, "loss": 0.0285, "step": 6576 }, { "epoch": 2.5278124699779037, "grad_norm": 1.2667376595722641, "learning_rate": 7.351935559921086e-07, "loss": 0.0331, "step": 6578 }, { "epoch": 2.5285810356422327, "grad_norm": 0.766762570878954, "learning_rate": 7.328609842214152e-07, "loss": 0.0239, "step": 6580 }, { "epoch": 2.5293496013065617, "grad_norm": 0.8644982748317027, "learning_rate": 7.305318259816524e-07, "loss": 0.0345, "step": 6582 }, { "epoch": 2.5301181669708903, "grad_norm": 0.9826198645184945, "learning_rate": 7.282060831360465e-07, "loss": 0.0299, "step": 6584 }, { "epoch": 2.53088673263522, "grad_norm": 0.846215131682695, "learning_rate": 7.258837575450928e-07, "loss": 0.0258, "step": 6586 }, { "epoch": 2.5316552982995484, "grad_norm": 0.9083744218394106, "learning_rate": 7.235648510665571e-07, "loss": 0.0305, "step": 6588 }, { "epoch": 2.5324238639638774, "grad_norm": 0.7965237370222513, "learning_rate": 7.212493655554637e-07, "loss": 0.0253, "step": 6590 }, { "epoch": 2.5331924296282065, "grad_norm": 0.8900241609870997, "learning_rate": 7.189373028641023e-07, "loss": 0.0284, "step": 6592 }, { "epoch": 2.533960995292535, "grad_norm": 0.8130781748337983, "learning_rate": 7.166286648420279e-07, "loss": 0.0279, "step": 6594 }, { "epoch": 2.5347295609568645, "grad_norm": 0.8419731017201173, "learning_rate": 7.143234533360527e-07, "loss": 0.0275, "step": 6596 }, { "epoch": 2.535498126621193, "grad_norm": 0.9263275802282434, "learning_rate": 7.120216701902466e-07, "loss": 0.0286, "step": 6598 }, { "epoch": 2.536266692285522, "grad_norm": 0.9788697997310358, "learning_rate": 7.097233172459417e-07, "loss": 0.0271, "step": 6600 }, { "epoch": 2.537035257949851, "grad_norm": 0.8181894396582015, "learning_rate": 7.074283963417234e-07, "loss": 0.028, "step": 6602 }, { "epoch": 2.5378038236141798, "grad_norm": 0.8946054519195439, "learning_rate": 7.051369093134275e-07, "loss": 0.0275, "step": 6604 }, { "epoch": 2.5385723892785093, "grad_norm": 0.9453386885826738, "learning_rate": 7.028488579941506e-07, "loss": 0.0255, "step": 6606 }, { "epoch": 2.539340954942838, "grad_norm": 1.0008131983751647, "learning_rate": 7.005642442142357e-07, "loss": 0.0305, "step": 6608 }, { "epoch": 2.540109520607167, "grad_norm": 1.0510996272002566, "learning_rate": 6.982830698012776e-07, "loss": 0.0321, "step": 6610 }, { "epoch": 2.540878086271496, "grad_norm": 1.0347407408817055, "learning_rate": 6.960053365801183e-07, "loss": 0.0324, "step": 6612 }, { "epoch": 2.5416466519358245, "grad_norm": 0.8190106139300402, "learning_rate": 6.937310463728514e-07, "loss": 0.0282, "step": 6614 }, { "epoch": 2.542415217600154, "grad_norm": 0.9964319411149566, "learning_rate": 6.91460200998808e-07, "loss": 0.0275, "step": 6616 }, { "epoch": 2.5431837832644826, "grad_norm": 0.925596210736709, "learning_rate": 6.891928022745726e-07, "loss": 0.0304, "step": 6618 }, { "epoch": 2.5439523489288116, "grad_norm": 1.092868335765285, "learning_rate": 6.869288520139672e-07, "loss": 0.0326, "step": 6620 }, { "epoch": 2.5447209145931406, "grad_norm": 1.0677034717486524, "learning_rate": 6.846683520280556e-07, "loss": 0.0318, "step": 6622 }, { "epoch": 2.5454894802574692, "grad_norm": 1.0845505140974963, "learning_rate": 6.824113041251423e-07, "loss": 0.0331, "step": 6624 }, { "epoch": 2.5462580459217987, "grad_norm": 1.0023001049695333, "learning_rate": 6.801577101107715e-07, "loss": 0.0324, "step": 6626 }, { "epoch": 2.5470266115861273, "grad_norm": 0.8539406526212486, "learning_rate": 6.779075717877226e-07, "loss": 0.027, "step": 6628 }, { "epoch": 2.5477951772504563, "grad_norm": 0.9170335081922412, "learning_rate": 6.756608909560109e-07, "loss": 0.0267, "step": 6630 }, { "epoch": 2.5485637429147854, "grad_norm": 0.9607645060000095, "learning_rate": 6.734176694128863e-07, "loss": 0.026, "step": 6632 }, { "epoch": 2.549332308579114, "grad_norm": 0.9519722336404509, "learning_rate": 6.71177908952832e-07, "loss": 0.0288, "step": 6634 }, { "epoch": 2.5501008742434434, "grad_norm": 0.9823559559997088, "learning_rate": 6.68941611367559e-07, "loss": 0.0306, "step": 6636 }, { "epoch": 2.550869439907772, "grad_norm": 0.8907340666303374, "learning_rate": 6.667087784460141e-07, "loss": 0.0275, "step": 6638 }, { "epoch": 2.551638005572101, "grad_norm": 0.9497743945721757, "learning_rate": 6.644794119743686e-07, "loss": 0.0312, "step": 6640 }, { "epoch": 2.55240657123643, "grad_norm": 0.822830496291402, "learning_rate": 6.622535137360208e-07, "loss": 0.029, "step": 6642 }, { "epoch": 2.5531751369007587, "grad_norm": 1.0457942990079996, "learning_rate": 6.600310855115949e-07, "loss": 0.0308, "step": 6644 }, { "epoch": 2.553943702565088, "grad_norm": 0.8508877945511636, "learning_rate": 6.578121290789408e-07, "loss": 0.0265, "step": 6646 }, { "epoch": 2.5547122682294168, "grad_norm": 0.8076822059126533, "learning_rate": 6.55596646213128e-07, "loss": 0.0281, "step": 6648 }, { "epoch": 2.555480833893746, "grad_norm": 0.8766641177567035, "learning_rate": 6.533846386864512e-07, "loss": 0.0276, "step": 6650 }, { "epoch": 2.556249399558075, "grad_norm": 0.9633644365460585, "learning_rate": 6.51176108268422e-07, "loss": 0.0291, "step": 6652 }, { "epoch": 2.5570179652224034, "grad_norm": 0.9852988417961691, "learning_rate": 6.489710567257723e-07, "loss": 0.0285, "step": 6654 }, { "epoch": 2.557786530886733, "grad_norm": 0.9935364458723627, "learning_rate": 6.467694858224488e-07, "loss": 0.0301, "step": 6656 }, { "epoch": 2.5585550965510615, "grad_norm": 0.8301407829303622, "learning_rate": 6.445713973196161e-07, "loss": 0.0303, "step": 6658 }, { "epoch": 2.5593236622153905, "grad_norm": 0.9588132240402389, "learning_rate": 6.423767929756508e-07, "loss": 0.0283, "step": 6660 }, { "epoch": 2.5600922278797196, "grad_norm": 1.1184662350204437, "learning_rate": 6.40185674546146e-07, "loss": 0.0297, "step": 6662 }, { "epoch": 2.560860793544048, "grad_norm": 0.8590173645771502, "learning_rate": 6.379980437839028e-07, "loss": 0.0306, "step": 6664 }, { "epoch": 2.5616293592083776, "grad_norm": 0.9286387282862157, "learning_rate": 6.358139024389326e-07, "loss": 0.0303, "step": 6666 }, { "epoch": 2.5623979248727062, "grad_norm": 0.830794758392921, "learning_rate": 6.33633252258457e-07, "loss": 0.0277, "step": 6668 }, { "epoch": 2.5631664905370353, "grad_norm": 0.8586084592745605, "learning_rate": 6.314560949869042e-07, "loss": 0.0276, "step": 6670 }, { "epoch": 2.5639350562013643, "grad_norm": 0.7900962972627349, "learning_rate": 6.292824323659063e-07, "loss": 0.0279, "step": 6672 }, { "epoch": 2.564703621865693, "grad_norm": 0.9230305615191853, "learning_rate": 6.271122661343043e-07, "loss": 0.0328, "step": 6674 }, { "epoch": 2.565472187530022, "grad_norm": 0.9607899350346868, "learning_rate": 6.249455980281377e-07, "loss": 0.0308, "step": 6676 }, { "epoch": 2.566240753194351, "grad_norm": 1.0720797905417259, "learning_rate": 6.22782429780649e-07, "loss": 0.0332, "step": 6678 }, { "epoch": 2.56700931885868, "grad_norm": 0.8246637073204528, "learning_rate": 6.206227631222844e-07, "loss": 0.0258, "step": 6680 }, { "epoch": 2.567777884523009, "grad_norm": 0.8919803821967035, "learning_rate": 6.184665997806832e-07, "loss": 0.031, "step": 6682 }, { "epoch": 2.5685464501873376, "grad_norm": 0.9205788751689733, "learning_rate": 6.16313941480684e-07, "loss": 0.027, "step": 6684 }, { "epoch": 2.5693150158516667, "grad_norm": 1.0058557839657256, "learning_rate": 6.141647899443254e-07, "loss": 0.0294, "step": 6686 }, { "epoch": 2.5700835815159957, "grad_norm": 0.9133522169589798, "learning_rate": 6.120191468908365e-07, "loss": 0.0291, "step": 6688 }, { "epoch": 2.5708521471803247, "grad_norm": 0.8914499416645533, "learning_rate": 6.0987701403664e-07, "loss": 0.0284, "step": 6690 }, { "epoch": 2.5716207128446538, "grad_norm": 0.9855346878128268, "learning_rate": 6.07738393095354e-07, "loss": 0.029, "step": 6692 }, { "epoch": 2.5723892785089824, "grad_norm": 0.8378890965302574, "learning_rate": 6.05603285777785e-07, "loss": 0.0284, "step": 6694 }, { "epoch": 2.5731578441733114, "grad_norm": 0.8304927520548033, "learning_rate": 6.034716937919249e-07, "loss": 0.0264, "step": 6696 }, { "epoch": 2.5739264098376404, "grad_norm": 1.052462457756081, "learning_rate": 6.013436188429605e-07, "loss": 0.0305, "step": 6698 }, { "epoch": 2.5746949755019695, "grad_norm": 0.9987590289751547, "learning_rate": 5.992190626332617e-07, "loss": 0.0277, "step": 6700 }, { "epoch": 2.5754635411662985, "grad_norm": 0.9729130060194069, "learning_rate": 5.970980268623816e-07, "loss": 0.0301, "step": 6702 }, { "epoch": 2.576232106830627, "grad_norm": 0.9450355878143422, "learning_rate": 5.94980513227062e-07, "loss": 0.0279, "step": 6704 }, { "epoch": 2.577000672494956, "grad_norm": 0.9941077516896506, "learning_rate": 5.928665234212233e-07, "loss": 0.0289, "step": 6706 }, { "epoch": 2.577769238159285, "grad_norm": 1.1004478429580822, "learning_rate": 5.907560591359662e-07, "loss": 0.0282, "step": 6708 }, { "epoch": 2.578537803823614, "grad_norm": 1.0187167725245834, "learning_rate": 5.886491220595758e-07, "loss": 0.0325, "step": 6710 }, { "epoch": 2.5793063694879432, "grad_norm": 0.9736694938057174, "learning_rate": 5.86545713877511e-07, "loss": 0.0264, "step": 6712 }, { "epoch": 2.580074935152272, "grad_norm": 0.8866545369874569, "learning_rate": 5.844458362724093e-07, "loss": 0.0309, "step": 6714 }, { "epoch": 2.580843500816601, "grad_norm": 0.7530678681556913, "learning_rate": 5.823494909240857e-07, "loss": 0.024, "step": 6716 }, { "epoch": 2.58161206648093, "grad_norm": 0.9265973798264546, "learning_rate": 5.802566795095266e-07, "loss": 0.0275, "step": 6718 }, { "epoch": 2.582380632145259, "grad_norm": 1.0099157905058127, "learning_rate": 5.781674037028928e-07, "loss": 0.0297, "step": 6720 }, { "epoch": 2.583149197809588, "grad_norm": 0.8713887661676516, "learning_rate": 5.760816651755175e-07, "loss": 0.0299, "step": 6722 }, { "epoch": 2.5839177634739166, "grad_norm": 0.8470105161776056, "learning_rate": 5.73999465595902e-07, "loss": 0.0286, "step": 6724 }, { "epoch": 2.5846863291382456, "grad_norm": 0.9027597315961708, "learning_rate": 5.719208066297194e-07, "loss": 0.0283, "step": 6726 }, { "epoch": 2.5854548948025746, "grad_norm": 0.9392290907761559, "learning_rate": 5.698456899398075e-07, "loss": 0.029, "step": 6728 }, { "epoch": 2.5862234604669037, "grad_norm": 1.0010781225129903, "learning_rate": 5.677741171861734e-07, "loss": 0.0331, "step": 6730 }, { "epoch": 2.5869920261312327, "grad_norm": 1.1346515793592427, "learning_rate": 5.657060900259875e-07, "loss": 0.0301, "step": 6732 }, { "epoch": 2.5877605917955613, "grad_norm": 0.9443209162171894, "learning_rate": 5.636416101135839e-07, "loss": 0.0291, "step": 6734 }, { "epoch": 2.5885291574598903, "grad_norm": 0.9496876015906952, "learning_rate": 5.615806791004596e-07, "loss": 0.0316, "step": 6736 }, { "epoch": 2.5892977231242194, "grad_norm": 0.9064816684908044, "learning_rate": 5.595232986352717e-07, "loss": 0.0306, "step": 6738 }, { "epoch": 2.5900662887885484, "grad_norm": 0.8346638962172653, "learning_rate": 5.574694703638372e-07, "loss": 0.0288, "step": 6740 }, { "epoch": 2.5908348544528774, "grad_norm": 0.8943143701117217, "learning_rate": 5.554191959291338e-07, "loss": 0.0258, "step": 6742 }, { "epoch": 2.591603420117206, "grad_norm": 1.0347910720964864, "learning_rate": 5.533724769712934e-07, "loss": 0.0284, "step": 6744 }, { "epoch": 2.592371985781535, "grad_norm": 1.066651136344854, "learning_rate": 5.51329315127605e-07, "loss": 0.0298, "step": 6746 }, { "epoch": 2.593140551445864, "grad_norm": 0.8060911535984224, "learning_rate": 5.492897120325114e-07, "loss": 0.0246, "step": 6748 }, { "epoch": 2.593909117110193, "grad_norm": 0.8943235403719966, "learning_rate": 5.472536693176083e-07, "loss": 0.0248, "step": 6750 }, { "epoch": 2.594677682774522, "grad_norm": 0.9302001153691233, "learning_rate": 5.452211886116443e-07, "loss": 0.028, "step": 6752 }, { "epoch": 2.5954462484388507, "grad_norm": 1.0129943868648674, "learning_rate": 5.431922715405186e-07, "loss": 0.03, "step": 6754 }, { "epoch": 2.59621481410318, "grad_norm": 0.9642730481929334, "learning_rate": 5.411669197272795e-07, "loss": 0.0259, "step": 6756 }, { "epoch": 2.596983379767509, "grad_norm": 0.9836885716269179, "learning_rate": 5.391451347921206e-07, "loss": 0.0305, "step": 6758 }, { "epoch": 2.597751945431838, "grad_norm": 0.9005879614918434, "learning_rate": 5.371269183523886e-07, "loss": 0.0271, "step": 6760 }, { "epoch": 2.598520511096167, "grad_norm": 0.8666386567614242, "learning_rate": 5.351122720225676e-07, "loss": 0.0287, "step": 6762 }, { "epoch": 2.5992890767604955, "grad_norm": 0.9464825384895261, "learning_rate": 5.331011974142897e-07, "loss": 0.0289, "step": 6764 }, { "epoch": 2.6000576424248245, "grad_norm": 1.0181127581008251, "learning_rate": 5.310936961363316e-07, "loss": 0.0283, "step": 6766 }, { "epoch": 2.6008262080891535, "grad_norm": 0.9217849367679527, "learning_rate": 5.29089769794609e-07, "loss": 0.0323, "step": 6768 }, { "epoch": 2.6015947737534826, "grad_norm": 0.882945592062152, "learning_rate": 5.270894199921766e-07, "loss": 0.0259, "step": 6770 }, { "epoch": 2.6023633394178116, "grad_norm": 0.8824380523627711, "learning_rate": 5.250926483292334e-07, "loss": 0.0295, "step": 6772 }, { "epoch": 2.60313190508214, "grad_norm": 1.038284335994929, "learning_rate": 5.230994564031083e-07, "loss": 0.0313, "step": 6774 }, { "epoch": 2.6039004707464692, "grad_norm": 0.9837591105041108, "learning_rate": 5.21109845808272e-07, "loss": 0.0265, "step": 6776 }, { "epoch": 2.6046690364107983, "grad_norm": 1.023964503512175, "learning_rate": 5.191238181363301e-07, "loss": 0.0322, "step": 6778 }, { "epoch": 2.6054376020751273, "grad_norm": 0.909060876990697, "learning_rate": 5.171413749760201e-07, "loss": 0.0266, "step": 6780 }, { "epoch": 2.6062061677394563, "grad_norm": 1.004353448625073, "learning_rate": 5.151625179132114e-07, "loss": 0.0313, "step": 6782 }, { "epoch": 2.606974733403785, "grad_norm": 0.9014472261229421, "learning_rate": 5.131872485309081e-07, "loss": 0.0285, "step": 6784 }, { "epoch": 2.607743299068114, "grad_norm": 1.002623456894944, "learning_rate": 5.112155684092429e-07, "loss": 0.0329, "step": 6786 }, { "epoch": 2.608511864732443, "grad_norm": 0.940460812373492, "learning_rate": 5.092474791254731e-07, "loss": 0.0276, "step": 6788 }, { "epoch": 2.609280430396772, "grad_norm": 0.8792950438592977, "learning_rate": 5.072829822539899e-07, "loss": 0.0271, "step": 6790 }, { "epoch": 2.610048996061101, "grad_norm": 1.0677574896310553, "learning_rate": 5.053220793663066e-07, "loss": 0.0315, "step": 6792 }, { "epoch": 2.6108175617254297, "grad_norm": 1.021585207425933, "learning_rate": 5.033647720310625e-07, "loss": 0.0266, "step": 6794 }, { "epoch": 2.6115861273897587, "grad_norm": 0.8313290217449991, "learning_rate": 5.014110618140222e-07, "loss": 0.0268, "step": 6796 }, { "epoch": 2.6123546930540877, "grad_norm": 0.90757702128552, "learning_rate": 4.994609502780711e-07, "loss": 0.0276, "step": 6798 }, { "epoch": 2.6131232587184168, "grad_norm": 0.9641017525618332, "learning_rate": 4.975144389832137e-07, "loss": 0.0259, "step": 6800 }, { "epoch": 2.613891824382746, "grad_norm": 0.8816179191020651, "learning_rate": 4.955715294865793e-07, "loss": 0.0298, "step": 6802 }, { "epoch": 2.6146603900470744, "grad_norm": 0.9287763239550864, "learning_rate": 4.936322233424124e-07, "loss": 0.0249, "step": 6804 }, { "epoch": 2.6154289557114034, "grad_norm": 0.8323255674644553, "learning_rate": 4.916965221020753e-07, "loss": 0.0244, "step": 6806 }, { "epoch": 2.6161975213757325, "grad_norm": 0.8886121898579676, "learning_rate": 4.89764427314049e-07, "loss": 0.025, "step": 6808 }, { "epoch": 2.6169660870400615, "grad_norm": 0.9514119178363482, "learning_rate": 4.878359405239269e-07, "loss": 0.0255, "step": 6810 }, { "epoch": 2.6177346527043905, "grad_norm": 0.8985832446418481, "learning_rate": 4.859110632744163e-07, "loss": 0.0293, "step": 6812 }, { "epoch": 2.618503218368719, "grad_norm": 1.073884085512015, "learning_rate": 4.839897971053387e-07, "loss": 0.0323, "step": 6814 }, { "epoch": 2.619271784033048, "grad_norm": 0.7735710225779577, "learning_rate": 4.820721435536246e-07, "loss": 0.0259, "step": 6816 }, { "epoch": 2.620040349697377, "grad_norm": 1.0327025390843008, "learning_rate": 4.80158104153316e-07, "loss": 0.0296, "step": 6818 }, { "epoch": 2.6208089153617062, "grad_norm": 0.875337731347626, "learning_rate": 4.782476804355651e-07, "loss": 0.0279, "step": 6820 }, { "epoch": 2.6215774810260353, "grad_norm": 0.7951917744430118, "learning_rate": 4.763408739286296e-07, "loss": 0.0261, "step": 6822 }, { "epoch": 2.622346046690364, "grad_norm": 0.89039505421469, "learning_rate": 4.744376861578731e-07, "loss": 0.0294, "step": 6824 }, { "epoch": 2.623114612354693, "grad_norm": 0.8830263025190016, "learning_rate": 4.7253811864576614e-07, "loss": 0.0256, "step": 6826 }, { "epoch": 2.623883178019022, "grad_norm": 0.9465907668796639, "learning_rate": 4.706421729118826e-07, "loss": 0.0279, "step": 6828 }, { "epoch": 2.624651743683351, "grad_norm": 0.8220197158715643, "learning_rate": 4.687498504728971e-07, "loss": 0.0272, "step": 6830 }, { "epoch": 2.62542030934768, "grad_norm": 0.9394091647882332, "learning_rate": 4.6686115284259093e-07, "loss": 0.0277, "step": 6832 }, { "epoch": 2.6261888750120086, "grad_norm": 0.9702694181037325, "learning_rate": 4.649760815318405e-07, "loss": 0.0292, "step": 6834 }, { "epoch": 2.6269574406763376, "grad_norm": 1.0695086670675544, "learning_rate": 4.6309463804862344e-07, "loss": 0.033, "step": 6836 }, { "epoch": 2.6277260063406667, "grad_norm": 0.8625174214791087, "learning_rate": 4.612168238980147e-07, "loss": 0.0265, "step": 6838 }, { "epoch": 2.6284945720049957, "grad_norm": 0.7956579110071843, "learning_rate": 4.5934264058218716e-07, "loss": 0.0287, "step": 6840 }, { "epoch": 2.6292631376693247, "grad_norm": 1.034472107102523, "learning_rate": 4.5747208960040754e-07, "loss": 0.0329, "step": 6842 }, { "epoch": 2.6300317033336533, "grad_norm": 0.7748922059751921, "learning_rate": 4.556051724490368e-07, "loss": 0.0292, "step": 6844 }, { "epoch": 2.6308002689979824, "grad_norm": 0.9980555162996639, "learning_rate": 4.537418906215313e-07, "loss": 0.0326, "step": 6846 }, { "epoch": 2.6315688346623114, "grad_norm": 0.9475423744183595, "learning_rate": 4.518822456084382e-07, "loss": 0.0319, "step": 6848 }, { "epoch": 2.6323374003266404, "grad_norm": 0.8961892997003303, "learning_rate": 4.500262388973925e-07, "loss": 0.0258, "step": 6850 }, { "epoch": 2.6331059659909695, "grad_norm": 0.8816213030864392, "learning_rate": 4.481738719731243e-07, "loss": 0.0246, "step": 6852 }, { "epoch": 2.633874531655298, "grad_norm": 0.9558253834799704, "learning_rate": 4.4632514631744674e-07, "loss": 0.0256, "step": 6854 }, { "epoch": 2.634643097319627, "grad_norm": 0.8838693449546204, "learning_rate": 4.4448006340926163e-07, "loss": 0.0257, "step": 6856 }, { "epoch": 2.635411662983956, "grad_norm": 0.9741146538006822, "learning_rate": 4.4263862472455954e-07, "loss": 0.0288, "step": 6858 }, { "epoch": 2.636180228648285, "grad_norm": 1.1558673617385904, "learning_rate": 4.4080083173641206e-07, "loss": 0.0327, "step": 6860 }, { "epoch": 2.636948794312614, "grad_norm": 0.8821143619474989, "learning_rate": 4.3896668591497607e-07, "loss": 0.0278, "step": 6862 }, { "epoch": 2.637717359976943, "grad_norm": 0.9241614604753692, "learning_rate": 4.371361887274922e-07, "loss": 0.0251, "step": 6864 }, { "epoch": 2.638485925641272, "grad_norm": 1.0311021484265055, "learning_rate": 4.353093416382792e-07, "loss": 0.0292, "step": 6866 }, { "epoch": 2.639254491305601, "grad_norm": 1.0377316026281078, "learning_rate": 4.3348614610873753e-07, "loss": 0.0309, "step": 6868 }, { "epoch": 2.64002305696993, "grad_norm": 1.2580891347205014, "learning_rate": 4.316666035973477e-07, "loss": 0.034, "step": 6870 }, { "epoch": 2.640791622634259, "grad_norm": 0.8835443652615319, "learning_rate": 4.2985071555966596e-07, "loss": 0.0305, "step": 6872 }, { "epoch": 2.6415601882985875, "grad_norm": 1.0108280641162715, "learning_rate": 4.2803848344832587e-07, "loss": 0.0319, "step": 6874 }, { "epoch": 2.6423287539629166, "grad_norm": 0.9666135525371793, "learning_rate": 4.262299087130378e-07, "loss": 0.0263, "step": 6876 }, { "epoch": 2.6430973196272456, "grad_norm": 1.0577294076930879, "learning_rate": 4.2442499280058557e-07, "loss": 0.0332, "step": 6878 }, { "epoch": 2.6438658852915746, "grad_norm": 0.9067364417164361, "learning_rate": 4.2262373715482255e-07, "loss": 0.0263, "step": 6880 }, { "epoch": 2.6446344509559037, "grad_norm": 0.9347377741158807, "learning_rate": 4.2082614321667937e-07, "loss": 0.0249, "step": 6882 }, { "epoch": 2.6454030166202323, "grad_norm": 0.9775149141221233, "learning_rate": 4.190322124241553e-07, "loss": 0.0309, "step": 6884 }, { "epoch": 2.6461715822845613, "grad_norm": 0.8234406811200848, "learning_rate": 4.172419462123178e-07, "loss": 0.0267, "step": 6886 }, { "epoch": 2.6469401479488903, "grad_norm": 1.1113380475466295, "learning_rate": 4.154553460133065e-07, "loss": 0.0301, "step": 6888 }, { "epoch": 2.6477087136132194, "grad_norm": 1.0436530413087235, "learning_rate": 4.1367241325632524e-07, "loss": 0.0292, "step": 6890 }, { "epoch": 2.6484772792775484, "grad_norm": 0.8741917831950721, "learning_rate": 4.118931493676426e-07, "loss": 0.0257, "step": 6892 }, { "epoch": 2.649245844941877, "grad_norm": 0.9462531728238378, "learning_rate": 4.1011755577059775e-07, "loss": 0.03, "step": 6894 }, { "epoch": 2.650014410606206, "grad_norm": 0.9402714079084219, "learning_rate": 4.0834563388558847e-07, "loss": 0.0254, "step": 6896 }, { "epoch": 2.650782976270535, "grad_norm": 1.0387991405987609, "learning_rate": 4.065773851300775e-07, "loss": 0.0322, "step": 6898 }, { "epoch": 2.651551541934864, "grad_norm": 1.0479023054185004, "learning_rate": 4.0481281091859057e-07, "loss": 0.03, "step": 6900 }, { "epoch": 2.652320107599193, "grad_norm": 0.9185704910986191, "learning_rate": 4.030519126627119e-07, "loss": 0.0261, "step": 6902 }, { "epoch": 2.6530886732635217, "grad_norm": 0.8899571851833423, "learning_rate": 4.0129469177108526e-07, "loss": 0.0302, "step": 6904 }, { "epoch": 2.6538572389278507, "grad_norm": 0.8530124846356638, "learning_rate": 3.9954114964941336e-07, "loss": 0.0282, "step": 6906 }, { "epoch": 2.65462580459218, "grad_norm": 0.8630026135501464, "learning_rate": 3.977912877004553e-07, "loss": 0.0269, "step": 6908 }, { "epoch": 2.655394370256509, "grad_norm": 0.8700602181834501, "learning_rate": 3.960451073240268e-07, "loss": 0.0267, "step": 6910 }, { "epoch": 2.656162935920838, "grad_norm": 0.8712866026809296, "learning_rate": 3.943026099169994e-07, "loss": 0.0285, "step": 6912 }, { "epoch": 2.6569315015851664, "grad_norm": 0.8719547510203683, "learning_rate": 3.92563796873297e-07, "loss": 0.0317, "step": 6914 }, { "epoch": 2.6577000672494955, "grad_norm": 1.0826502625952683, "learning_rate": 3.9082866958389645e-07, "loss": 0.029, "step": 6916 }, { "epoch": 2.6584686329138245, "grad_norm": 1.1543123010938026, "learning_rate": 3.890972294368256e-07, "loss": 0.0315, "step": 6918 }, { "epoch": 2.6592371985781535, "grad_norm": 0.9434159863196044, "learning_rate": 3.873694778171644e-07, "loss": 0.0294, "step": 6920 }, { "epoch": 2.6600057642424826, "grad_norm": 1.0299439426965562, "learning_rate": 3.856454161070394e-07, "loss": 0.0297, "step": 6922 }, { "epoch": 2.660774329906811, "grad_norm": 0.9379693415293303, "learning_rate": 3.839250456856297e-07, "loss": 0.0273, "step": 6924 }, { "epoch": 2.66154289557114, "grad_norm": 0.8270192194396714, "learning_rate": 3.822083679291577e-07, "loss": 0.0225, "step": 6926 }, { "epoch": 2.6623114612354692, "grad_norm": 0.967595470553159, "learning_rate": 3.804953842108933e-07, "loss": 0.0308, "step": 6928 }, { "epoch": 2.6630800268997983, "grad_norm": 0.9204398101736175, "learning_rate": 3.7878609590115024e-07, "loss": 0.0303, "step": 6930 }, { "epoch": 2.6638485925641273, "grad_norm": 1.0463464061469847, "learning_rate": 3.7708050436728816e-07, "loss": 0.0253, "step": 6932 }, { "epoch": 2.664617158228456, "grad_norm": 0.9603056967322672, "learning_rate": 3.753786109737062e-07, "loss": 0.0295, "step": 6934 }, { "epoch": 2.665385723892785, "grad_norm": 0.9228508376119434, "learning_rate": 3.736804170818503e-07, "loss": 0.0271, "step": 6936 }, { "epoch": 2.666154289557114, "grad_norm": 0.9675031681430767, "learning_rate": 3.719859240502016e-07, "loss": 0.0249, "step": 6938 }, { "epoch": 2.666922855221443, "grad_norm": 0.9264764713871029, "learning_rate": 3.7029513323428354e-07, "loss": 0.0296, "step": 6940 }, { "epoch": 2.667691420885772, "grad_norm": 0.7571673967357256, "learning_rate": 3.6860804598665645e-07, "loss": 0.0269, "step": 6942 }, { "epoch": 2.6684599865501006, "grad_norm": 1.01016116370162, "learning_rate": 3.6692466365692125e-07, "loss": 0.0279, "step": 6944 }, { "epoch": 2.6692285522144297, "grad_norm": 0.9628391025652601, "learning_rate": 3.652449875917097e-07, "loss": 0.0302, "step": 6946 }, { "epoch": 2.6699971178787587, "grad_norm": 1.1151941772804874, "learning_rate": 3.6356901913469413e-07, "loss": 0.0303, "step": 6948 }, { "epoch": 2.6707656835430877, "grad_norm": 0.7950923868536178, "learning_rate": 3.618967596265771e-07, "loss": 0.024, "step": 6950 }, { "epoch": 2.6715342492074168, "grad_norm": 1.0413636027664683, "learning_rate": 3.602282104050958e-07, "loss": 0.0254, "step": 6952 }, { "epoch": 2.6723028148717454, "grad_norm": 1.0283464216235099, "learning_rate": 3.5856337280501864e-07, "loss": 0.0282, "step": 6954 }, { "epoch": 2.6730713805360744, "grad_norm": 0.9962579044205998, "learning_rate": 3.56902248158148e-07, "loss": 0.0299, "step": 6956 }, { "epoch": 2.6738399462004034, "grad_norm": 0.7972807512283205, "learning_rate": 3.5524483779330985e-07, "loss": 0.024, "step": 6958 }, { "epoch": 2.6746085118647325, "grad_norm": 1.0229978875211563, "learning_rate": 3.5359114303636365e-07, "loss": 0.0321, "step": 6960 }, { "epoch": 2.6753770775290615, "grad_norm": 1.0166995187492, "learning_rate": 3.5194116521019615e-07, "loss": 0.0283, "step": 6962 }, { "epoch": 2.67614564319339, "grad_norm": 1.0113721424312483, "learning_rate": 3.502949056347199e-07, "loss": 0.0292, "step": 6964 }, { "epoch": 2.676914208857719, "grad_norm": 0.9783961116977168, "learning_rate": 3.4865236562687145e-07, "loss": 0.0318, "step": 6966 }, { "epoch": 2.677682774522048, "grad_norm": 0.8880296921798623, "learning_rate": 3.4701354650061525e-07, "loss": 0.0245, "step": 6968 }, { "epoch": 2.678451340186377, "grad_norm": 0.882996618352691, "learning_rate": 3.4537844956693765e-07, "loss": 0.0322, "step": 6970 }, { "epoch": 2.6792199058507062, "grad_norm": 0.9644696881926232, "learning_rate": 3.4374707613384404e-07, "loss": 0.0287, "step": 6972 }, { "epoch": 2.679988471515035, "grad_norm": 0.8088382478850411, "learning_rate": 3.421194275063677e-07, "loss": 0.0225, "step": 6974 }, { "epoch": 2.680757037179364, "grad_norm": 0.8776328535929361, "learning_rate": 3.404955049865571e-07, "loss": 0.028, "step": 6976 }, { "epoch": 2.681525602843693, "grad_norm": 0.9563466290976181, "learning_rate": 3.388753098734809e-07, "loss": 0.0295, "step": 6978 }, { "epoch": 2.682294168508022, "grad_norm": 0.8648017862811032, "learning_rate": 3.372588434632285e-07, "loss": 0.0284, "step": 6980 }, { "epoch": 2.683062734172351, "grad_norm": 0.9738043758758947, "learning_rate": 3.356461070489042e-07, "loss": 0.0267, "step": 6982 }, { "epoch": 2.6838312998366796, "grad_norm": 0.8600348985649445, "learning_rate": 3.34037101920629e-07, "loss": 0.029, "step": 6984 }, { "epoch": 2.6845998655010086, "grad_norm": 1.0310627916982993, "learning_rate": 3.324318293655382e-07, "loss": 0.0331, "step": 6986 }, { "epoch": 2.6853684311653376, "grad_norm": 1.080999790777787, "learning_rate": 3.3083029066778293e-07, "loss": 0.0304, "step": 6988 }, { "epoch": 2.6861369968296667, "grad_norm": 1.11941194029954, "learning_rate": 3.2923248710852505e-07, "loss": 0.0336, "step": 6990 }, { "epoch": 2.6869055624939957, "grad_norm": 0.8992045125898469, "learning_rate": 3.2763841996594216e-07, "loss": 0.0305, "step": 6992 }, { "epoch": 2.6876741281583243, "grad_norm": 0.9520637470348735, "learning_rate": 3.260480905152197e-07, "loss": 0.0292, "step": 6994 }, { "epoch": 2.6884426938226533, "grad_norm": 0.8308471334978264, "learning_rate": 3.2446150002855335e-07, "loss": 0.0264, "step": 6996 }, { "epoch": 2.6892112594869824, "grad_norm": 0.8553276776977203, "learning_rate": 3.2287864977514903e-07, "loss": 0.0252, "step": 6998 }, { "epoch": 2.6899798251513114, "grad_norm": 1.0037010971332354, "learning_rate": 3.2129954102121995e-07, "loss": 0.0311, "step": 7000 }, { "epoch": 2.6899798251513114, "eval_loss": 0.17805364727973938, "eval_runtime": 389.1027, "eval_samples_per_second": 47.558, "eval_steps_per_second": 5.947, "step": 7000 }, { "epoch": 2.6907483908156404, "grad_norm": 0.9053186601616398, "learning_rate": 3.1972417502998567e-07, "loss": 0.0245, "step": 7002 }, { "epoch": 2.691516956479969, "grad_norm": 0.9959541583707945, "learning_rate": 3.1815255306167413e-07, "loss": 0.0278, "step": 7004 }, { "epoch": 2.692285522144298, "grad_norm": 0.9491849606669472, "learning_rate": 3.165846763735153e-07, "loss": 0.027, "step": 7006 }, { "epoch": 2.693054087808627, "grad_norm": 0.9718874499141456, "learning_rate": 3.1502054621974474e-07, "loss": 0.029, "step": 7008 }, { "epoch": 2.693822653472956, "grad_norm": 0.8996876460202351, "learning_rate": 3.1346016385159995e-07, "loss": 0.0265, "step": 7010 }, { "epoch": 2.694591219137285, "grad_norm": 0.8766807476041425, "learning_rate": 3.119035305173218e-07, "loss": 0.0295, "step": 7012 }, { "epoch": 2.6953597848016138, "grad_norm": 1.21995064847055, "learning_rate": 3.1035064746214926e-07, "loss": 0.0356, "step": 7014 }, { "epoch": 2.696128350465943, "grad_norm": 0.9856940993110584, "learning_rate": 3.088015159283259e-07, "loss": 0.026, "step": 7016 }, { "epoch": 2.696896916130272, "grad_norm": 0.9889681202119095, "learning_rate": 3.0725613715508975e-07, "loss": 0.0279, "step": 7018 }, { "epoch": 2.697665481794601, "grad_norm": 0.9269678437321852, "learning_rate": 3.0571451237867864e-07, "loss": 0.0284, "step": 7020 }, { "epoch": 2.69843404745893, "grad_norm": 1.0028536208411443, "learning_rate": 3.041766428323284e-07, "loss": 0.0291, "step": 7022 }, { "epoch": 2.6992026131232585, "grad_norm": 0.850498016319793, "learning_rate": 3.026425297462682e-07, "loss": 0.0245, "step": 7024 }, { "epoch": 2.6999711787875875, "grad_norm": 0.8188753255612572, "learning_rate": 3.0111217434772357e-07, "loss": 0.0251, "step": 7026 }, { "epoch": 2.7007397444519166, "grad_norm": 0.9961375940047459, "learning_rate": 2.995855778609158e-07, "loss": 0.0259, "step": 7028 }, { "epoch": 2.7015083101162456, "grad_norm": 1.0231454600880123, "learning_rate": 2.9806274150705695e-07, "loss": 0.0312, "step": 7030 }, { "epoch": 2.7022768757805746, "grad_norm": 0.8642616234855947, "learning_rate": 2.9654366650435083e-07, "loss": 0.0236, "step": 7032 }, { "epoch": 2.703045441444903, "grad_norm": 0.9692576280189156, "learning_rate": 2.950283540679938e-07, "loss": 0.0289, "step": 7034 }, { "epoch": 2.7038140071092323, "grad_norm": 0.9974491845327559, "learning_rate": 2.935168054101734e-07, "loss": 0.027, "step": 7036 }, { "epoch": 2.7045825727735613, "grad_norm": 0.9616529787462555, "learning_rate": 2.920090217400617e-07, "loss": 0.0268, "step": 7038 }, { "epoch": 2.7053511384378903, "grad_norm": 0.8947514964732416, "learning_rate": 2.905050042638241e-07, "loss": 0.024, "step": 7040 }, { "epoch": 2.7061197041022194, "grad_norm": 0.9620532034586888, "learning_rate": 2.890047541846103e-07, "loss": 0.0273, "step": 7042 }, { "epoch": 2.706888269766548, "grad_norm": 0.8968968000282507, "learning_rate": 2.8750827270255744e-07, "loss": 0.0267, "step": 7044 }, { "epoch": 2.707656835430877, "grad_norm": 0.9758450868730401, "learning_rate": 2.8601556101478634e-07, "loss": 0.0339, "step": 7046 }, { "epoch": 2.708425401095206, "grad_norm": 0.861822916570591, "learning_rate": 2.8452662031540457e-07, "loss": 0.0295, "step": 7048 }, { "epoch": 2.709193966759535, "grad_norm": 0.961777064539799, "learning_rate": 2.8304145179550105e-07, "loss": 0.0295, "step": 7050 }, { "epoch": 2.709962532423864, "grad_norm": 1.0852781697754554, "learning_rate": 2.8156005664314866e-07, "loss": 0.0276, "step": 7052 }, { "epoch": 2.7107310980881927, "grad_norm": 1.0547037736558658, "learning_rate": 2.800824360434001e-07, "loss": 0.0303, "step": 7054 }, { "epoch": 2.7114996637525217, "grad_norm": 0.8811259474027515, "learning_rate": 2.7860859117828985e-07, "loss": 0.025, "step": 7056 }, { "epoch": 2.7122682294168508, "grad_norm": 0.9444885064175045, "learning_rate": 2.7713852322683034e-07, "loss": 0.0299, "step": 7058 }, { "epoch": 2.71303679508118, "grad_norm": 0.8968493154584719, "learning_rate": 2.756722333650158e-07, "loss": 0.0299, "step": 7060 }, { "epoch": 2.713805360745509, "grad_norm": 0.9379486372993309, "learning_rate": 2.7420972276581526e-07, "loss": 0.0286, "step": 7062 }, { "epoch": 2.7145739264098374, "grad_norm": 1.1348358759721375, "learning_rate": 2.72750992599175e-07, "loss": 0.0289, "step": 7064 }, { "epoch": 2.7153424920741664, "grad_norm": 0.8579247438312085, "learning_rate": 2.7129604403201837e-07, "loss": 0.0255, "step": 7066 }, { "epoch": 2.7161110577384955, "grad_norm": 0.7927701805744188, "learning_rate": 2.6984487822824203e-07, "loss": 0.0267, "step": 7068 }, { "epoch": 2.7168796234028245, "grad_norm": 0.9403474610120129, "learning_rate": 2.683974963487168e-07, "loss": 0.028, "step": 7070 }, { "epoch": 2.7176481890671536, "grad_norm": 1.1221114621080157, "learning_rate": 2.6695389955128827e-07, "loss": 0.0321, "step": 7072 }, { "epoch": 2.718416754731482, "grad_norm": 0.9608314073103282, "learning_rate": 2.655140889907726e-07, "loss": 0.0286, "step": 7074 }, { "epoch": 2.719185320395811, "grad_norm": 0.9455073573836122, "learning_rate": 2.6407806581895734e-07, "loss": 0.0269, "step": 7076 }, { "epoch": 2.71995388606014, "grad_norm": 0.9519877912618541, "learning_rate": 2.626458311846003e-07, "loss": 0.0246, "step": 7078 }, { "epoch": 2.7207224517244692, "grad_norm": 0.9963571422785878, "learning_rate": 2.612173862334283e-07, "loss": 0.0285, "step": 7080 }, { "epoch": 2.7214910173887983, "grad_norm": 0.9481899407189497, "learning_rate": 2.597927321081367e-07, "loss": 0.029, "step": 7082 }, { "epoch": 2.722259583053127, "grad_norm": 0.9429287819232082, "learning_rate": 2.5837186994839003e-07, "loss": 0.027, "step": 7084 }, { "epoch": 2.723028148717456, "grad_norm": 0.8150030346696628, "learning_rate": 2.569548008908168e-07, "loss": 0.0303, "step": 7086 }, { "epoch": 2.723796714381785, "grad_norm": 1.0107101691724933, "learning_rate": 2.5554152606901307e-07, "loss": 0.0305, "step": 7088 }, { "epoch": 2.724565280046114, "grad_norm": 0.8926284265255755, "learning_rate": 2.541320466135383e-07, "loss": 0.024, "step": 7090 }, { "epoch": 2.725333845710443, "grad_norm": 0.9952976187463988, "learning_rate": 2.5272636365191727e-07, "loss": 0.0279, "step": 7092 }, { "epoch": 2.7261024113747716, "grad_norm": 1.1422483973669504, "learning_rate": 2.513244783086355e-07, "loss": 0.028, "step": 7094 }, { "epoch": 2.7268709770391006, "grad_norm": 0.702370362039485, "learning_rate": 2.4992639170514365e-07, "loss": 0.0251, "step": 7096 }, { "epoch": 2.7276395427034297, "grad_norm": 0.9264949522471603, "learning_rate": 2.4853210495985157e-07, "loss": 0.0304, "step": 7098 }, { "epoch": 2.7284081083677587, "grad_norm": 0.9649235131239939, "learning_rate": 2.4714161918812874e-07, "loss": 0.025, "step": 7100 }, { "epoch": 2.7291766740320877, "grad_norm": 1.091337683515383, "learning_rate": 2.45754935502307e-07, "loss": 0.0301, "step": 7102 }, { "epoch": 2.7299452396964163, "grad_norm": 1.0681569387361614, "learning_rate": 2.4437205501167296e-07, "loss": 0.0306, "step": 7104 }, { "epoch": 2.7307138053607454, "grad_norm": 1.0045214419386717, "learning_rate": 2.429929788224722e-07, "loss": 0.0266, "step": 7106 }, { "epoch": 2.7314823710250744, "grad_norm": 1.09342409643245, "learning_rate": 2.416177080379095e-07, "loss": 0.03, "step": 7108 }, { "epoch": 2.7322509366894034, "grad_norm": 0.9602180526732745, "learning_rate": 2.402462437581421e-07, "loss": 0.0284, "step": 7110 }, { "epoch": 2.7330195023537325, "grad_norm": 0.9622862180313019, "learning_rate": 2.388785870802829e-07, "loss": 0.0301, "step": 7112 }, { "epoch": 2.733788068018061, "grad_norm": 0.8755144847078542, "learning_rate": 2.3751473909840117e-07, "loss": 0.0274, "step": 7114 }, { "epoch": 2.73455663368239, "grad_norm": 1.0269421765730233, "learning_rate": 2.36154700903517e-07, "loss": 0.0285, "step": 7116 }, { "epoch": 2.735325199346719, "grad_norm": 1.0981944918922373, "learning_rate": 2.3479847358360176e-07, "loss": 0.0299, "step": 7118 }, { "epoch": 2.736093765011048, "grad_norm": 0.8697142610647, "learning_rate": 2.33446058223582e-07, "loss": 0.0249, "step": 7120 }, { "epoch": 2.736862330675377, "grad_norm": 0.9320813281587921, "learning_rate": 2.320974559053324e-07, "loss": 0.0232, "step": 7122 }, { "epoch": 2.737630896339706, "grad_norm": 0.9120260985885089, "learning_rate": 2.3075266770767822e-07, "loss": 0.0234, "step": 7124 }, { "epoch": 2.738399462004035, "grad_norm": 0.9822354480474521, "learning_rate": 2.2941169470639124e-07, "loss": 0.0254, "step": 7126 }, { "epoch": 2.739168027668364, "grad_norm": 1.047166399171422, "learning_rate": 2.2807453797419665e-07, "loss": 0.0356, "step": 7128 }, { "epoch": 2.739936593332693, "grad_norm": 1.1966206005272495, "learning_rate": 2.267411985807594e-07, "loss": 0.029, "step": 7130 }, { "epoch": 2.740705158997022, "grad_norm": 0.9584819187918605, "learning_rate": 2.2541167759269788e-07, "loss": 0.0257, "step": 7132 }, { "epoch": 2.7414737246613505, "grad_norm": 0.9599716988006695, "learning_rate": 2.240859760735714e-07, "loss": 0.03, "step": 7134 }, { "epoch": 2.7422422903256796, "grad_norm": 1.014794955450862, "learning_rate": 2.2276409508388498e-07, "loss": 0.0262, "step": 7136 }, { "epoch": 2.7430108559900086, "grad_norm": 0.9163813929176028, "learning_rate": 2.2144603568108713e-07, "loss": 0.0252, "step": 7138 }, { "epoch": 2.7437794216543376, "grad_norm": 1.086633594263399, "learning_rate": 2.2013179891957114e-07, "loss": 0.0289, "step": 7140 }, { "epoch": 2.7445479873186667, "grad_norm": 1.0463568433496606, "learning_rate": 2.188213858506699e-07, "loss": 0.0291, "step": 7142 }, { "epoch": 2.7453165529829953, "grad_norm": 0.9597491430032759, "learning_rate": 2.1751479752265935e-07, "loss": 0.0254, "step": 7144 }, { "epoch": 2.7460851186473243, "grad_norm": 0.9216464538383944, "learning_rate": 2.1621203498075337e-07, "loss": 0.0251, "step": 7146 }, { "epoch": 2.7468536843116533, "grad_norm": 0.8603638409894848, "learning_rate": 2.149130992671089e-07, "loss": 0.0259, "step": 7148 }, { "epoch": 2.7476222499759824, "grad_norm": 0.8538701869664023, "learning_rate": 2.136179914208175e-07, "loss": 0.0293, "step": 7150 }, { "epoch": 2.7483908156403114, "grad_norm": 0.8635091426469574, "learning_rate": 2.1232671247791214e-07, "loss": 0.029, "step": 7152 }, { "epoch": 2.74915938130464, "grad_norm": 0.8728821497039865, "learning_rate": 2.110392634713615e-07, "loss": 0.027, "step": 7154 }, { "epoch": 2.749927946968969, "grad_norm": 1.1160156599248263, "learning_rate": 2.0975564543107007e-07, "loss": 0.0323, "step": 7156 }, { "epoch": 2.750696512633298, "grad_norm": 1.023329036290296, "learning_rate": 2.0847585938387815e-07, "loss": 0.0281, "step": 7158 }, { "epoch": 2.751465078297627, "grad_norm": 0.8902641228048, "learning_rate": 2.0719990635356013e-07, "loss": 0.026, "step": 7160 }, { "epoch": 2.752233643961956, "grad_norm": 0.749679478481006, "learning_rate": 2.0592778736082453e-07, "loss": 0.0279, "step": 7162 }, { "epoch": 2.7530022096262847, "grad_norm": 0.8486319927697024, "learning_rate": 2.0465950342331343e-07, "loss": 0.0291, "step": 7164 }, { "epoch": 2.7537707752906138, "grad_norm": 0.8326615980203376, "learning_rate": 2.0339505555559968e-07, "loss": 0.0224, "step": 7166 }, { "epoch": 2.754539340954943, "grad_norm": 0.9110378966253497, "learning_rate": 2.0213444476918863e-07, "loss": 0.027, "step": 7168 }, { "epoch": 2.755307906619272, "grad_norm": 1.0513869257498392, "learning_rate": 2.0087767207251585e-07, "loss": 0.0282, "step": 7170 }, { "epoch": 2.756076472283601, "grad_norm": 0.9172530741455072, "learning_rate": 1.99624738470946e-07, "loss": 0.0272, "step": 7172 }, { "epoch": 2.7568450379479295, "grad_norm": 0.8787548144712968, "learning_rate": 1.9837564496677176e-07, "loss": 0.0259, "step": 7174 }, { "epoch": 2.7576136036122585, "grad_norm": 0.8087109979691854, "learning_rate": 1.9713039255921773e-07, "loss": 0.0269, "step": 7176 }, { "epoch": 2.7583821692765875, "grad_norm": 0.9370209112062998, "learning_rate": 1.9588898224443208e-07, "loss": 0.0271, "step": 7178 }, { "epoch": 2.7591507349409166, "grad_norm": 0.9481401624791916, "learning_rate": 1.946514150154899e-07, "loss": 0.0281, "step": 7180 }, { "epoch": 2.7599193006052456, "grad_norm": 0.8460579581938166, "learning_rate": 1.934176918623937e-07, "loss": 0.0266, "step": 7182 }, { "epoch": 2.760687866269574, "grad_norm": 1.0475433217647696, "learning_rate": 1.921878137720695e-07, "loss": 0.0298, "step": 7184 }, { "epoch": 2.761456431933903, "grad_norm": 1.0534005833209754, "learning_rate": 1.909617817283671e-07, "loss": 0.0313, "step": 7186 }, { "epoch": 2.7622249975982323, "grad_norm": 0.8533278580172962, "learning_rate": 1.897395967120613e-07, "loss": 0.0305, "step": 7188 }, { "epoch": 2.7629935632625613, "grad_norm": 1.039758576766991, "learning_rate": 1.8852125970084844e-07, "loss": 0.033, "step": 7190 }, { "epoch": 2.7637621289268903, "grad_norm": 1.0384117722919406, "learning_rate": 1.8730677166934607e-07, "loss": 0.0291, "step": 7192 }, { "epoch": 2.764530694591219, "grad_norm": 0.8566841166550301, "learning_rate": 1.860961335890943e-07, "loss": 0.0265, "step": 7194 }, { "epoch": 2.765299260255548, "grad_norm": 1.1218643217588804, "learning_rate": 1.848893464285517e-07, "loss": 0.0287, "step": 7196 }, { "epoch": 2.766067825919877, "grad_norm": 0.8591595414856612, "learning_rate": 1.8368641115309615e-07, "loss": 0.0286, "step": 7198 }, { "epoch": 2.766836391584206, "grad_norm": 1.0217130105055119, "learning_rate": 1.8248732872502673e-07, "loss": 0.0304, "step": 7200 }, { "epoch": 2.767604957248535, "grad_norm": 0.9810419394661105, "learning_rate": 1.812921001035578e-07, "loss": 0.0299, "step": 7202 }, { "epoch": 2.7683735229128636, "grad_norm": 0.9397564375902612, "learning_rate": 1.801007262448218e-07, "loss": 0.0308, "step": 7204 }, { "epoch": 2.7691420885771927, "grad_norm": 0.9517934506815247, "learning_rate": 1.789132081018674e-07, "loss": 0.0257, "step": 7206 }, { "epoch": 2.7699106542415217, "grad_norm": 0.9076945579324281, "learning_rate": 1.7772954662466037e-07, "loss": 0.0271, "step": 7208 }, { "epoch": 2.7706792199058508, "grad_norm": 0.9896122559502295, "learning_rate": 1.7654974276007762e-07, "loss": 0.0262, "step": 7210 }, { "epoch": 2.77144778557018, "grad_norm": 0.9476334562582641, "learning_rate": 1.7537379745191375e-07, "loss": 0.0293, "step": 7212 }, { "epoch": 2.7722163512345084, "grad_norm": 0.9943228920154169, "learning_rate": 1.7420171164087508e-07, "loss": 0.0276, "step": 7214 }, { "epoch": 2.7729849168988374, "grad_norm": 1.0943469228864187, "learning_rate": 1.7303348626457995e-07, "loss": 0.03, "step": 7216 }, { "epoch": 2.7737534825631665, "grad_norm": 0.8543897800097712, "learning_rate": 1.718691222575608e-07, "loss": 0.0265, "step": 7218 }, { "epoch": 2.7745220482274955, "grad_norm": 1.0288346421252794, "learning_rate": 1.707086205512598e-07, "loss": 0.0296, "step": 7220 }, { "epoch": 2.7752906138918245, "grad_norm": 0.8959743799885651, "learning_rate": 1.6955198207402657e-07, "loss": 0.0288, "step": 7222 }, { "epoch": 2.776059179556153, "grad_norm": 0.938617670043059, "learning_rate": 1.6839920775112596e-07, "loss": 0.0297, "step": 7224 }, { "epoch": 2.776827745220482, "grad_norm": 0.9717798378287463, "learning_rate": 1.6725029850472752e-07, "loss": 0.0256, "step": 7226 }, { "epoch": 2.777596310884811, "grad_norm": 0.9139397169447304, "learning_rate": 1.6610525525390942e-07, "loss": 0.0294, "step": 7228 }, { "epoch": 2.77836487654914, "grad_norm": 0.9464528534682193, "learning_rate": 1.6496407891465893e-07, "loss": 0.027, "step": 7230 }, { "epoch": 2.7791334422134693, "grad_norm": 0.8859689650390119, "learning_rate": 1.6382677039986915e-07, "loss": 0.0257, "step": 7232 }, { "epoch": 2.779902007877798, "grad_norm": 0.9253581666228891, "learning_rate": 1.6269333061933788e-07, "loss": 0.027, "step": 7234 }, { "epoch": 2.780670573542127, "grad_norm": 0.7960546196753395, "learning_rate": 1.6156376047976984e-07, "loss": 0.0254, "step": 7236 }, { "epoch": 2.781439139206456, "grad_norm": 0.8557254588102623, "learning_rate": 1.6043806088477277e-07, "loss": 0.0273, "step": 7238 }, { "epoch": 2.782207704870785, "grad_norm": 0.8816446144039397, "learning_rate": 1.5931623273485907e-07, "loss": 0.028, "step": 7240 }, { "epoch": 2.782976270535114, "grad_norm": 1.1762572094785901, "learning_rate": 1.581982769274437e-07, "loss": 0.0319, "step": 7242 }, { "epoch": 2.7837448361994426, "grad_norm": 0.8799117790512527, "learning_rate": 1.5708419435684463e-07, "loss": 0.0275, "step": 7244 }, { "epoch": 2.7845134018637716, "grad_norm": 0.8428080002096288, "learning_rate": 1.5597398591428005e-07, "loss": 0.0277, "step": 7246 }, { "epoch": 2.7852819675281006, "grad_norm": 0.9541707583800543, "learning_rate": 1.548676524878706e-07, "loss": 0.0312, "step": 7248 }, { "epoch": 2.7860505331924297, "grad_norm": 1.0025761176751091, "learning_rate": 1.537651949626362e-07, "loss": 0.0283, "step": 7250 }, { "epoch": 2.7868190988567587, "grad_norm": 0.9778120429776909, "learning_rate": 1.5266661422049523e-07, "loss": 0.0307, "step": 7252 }, { "epoch": 2.7875876645210873, "grad_norm": 1.0085669398663137, "learning_rate": 1.5157191114026693e-07, "loss": 0.03, "step": 7254 }, { "epoch": 2.7883562301854163, "grad_norm": 0.9202664109508717, "learning_rate": 1.5048108659766693e-07, "loss": 0.0264, "step": 7256 }, { "epoch": 2.7891247958497454, "grad_norm": 0.9852785394486864, "learning_rate": 1.4939414146530996e-07, "loss": 0.0261, "step": 7258 }, { "epoch": 2.7898933615140744, "grad_norm": 0.8152316895943101, "learning_rate": 1.4831107661270437e-07, "loss": 0.0213, "step": 7260 }, { "epoch": 2.7906619271784034, "grad_norm": 0.903578195274129, "learning_rate": 1.472318929062577e-07, "loss": 0.0267, "step": 7262 }, { "epoch": 2.791430492842732, "grad_norm": 0.9987373217178226, "learning_rate": 1.4615659120927106e-07, "loss": 0.0272, "step": 7264 }, { "epoch": 2.792199058507061, "grad_norm": 0.8567973574668626, "learning_rate": 1.4508517238193964e-07, "loss": 0.0262, "step": 7266 }, { "epoch": 2.79296762417139, "grad_norm": 0.9728894114093125, "learning_rate": 1.4401763728135398e-07, "loss": 0.0268, "step": 7268 }, { "epoch": 2.793736189835719, "grad_norm": 1.0483144576950547, "learning_rate": 1.4295398676149652e-07, "loss": 0.029, "step": 7270 }, { "epoch": 2.794504755500048, "grad_norm": 1.043309415345882, "learning_rate": 1.418942216732433e-07, "loss": 0.0252, "step": 7272 }, { "epoch": 2.7952733211643768, "grad_norm": 0.8113642776716711, "learning_rate": 1.408383428643617e-07, "loss": 0.0233, "step": 7274 }, { "epoch": 2.796041886828706, "grad_norm": 0.9939542627480664, "learning_rate": 1.3978635117950945e-07, "loss": 0.0282, "step": 7276 }, { "epoch": 2.796810452493035, "grad_norm": 0.9182777776573938, "learning_rate": 1.3873824746023557e-07, "loss": 0.0258, "step": 7278 }, { "epoch": 2.797579018157364, "grad_norm": 1.0105223897428088, "learning_rate": 1.3769403254497938e-07, "loss": 0.0302, "step": 7280 }, { "epoch": 2.798347583821693, "grad_norm": 0.8884893623331457, "learning_rate": 1.366537072690688e-07, "loss": 0.0255, "step": 7282 }, { "epoch": 2.7991161494860215, "grad_norm": 0.9665565730376303, "learning_rate": 1.3561727246471867e-07, "loss": 0.0306, "step": 7284 }, { "epoch": 2.7998847151503505, "grad_norm": 1.0475129550487072, "learning_rate": 1.345847289610347e-07, "loss": 0.0271, "step": 7286 }, { "epoch": 2.8006532808146796, "grad_norm": 0.8176029639975629, "learning_rate": 1.3355607758400724e-07, "loss": 0.0268, "step": 7288 }, { "epoch": 2.8014218464790086, "grad_norm": 0.8814614017412566, "learning_rate": 1.3253131915651364e-07, "loss": 0.0252, "step": 7290 }, { "epoch": 2.8021904121433376, "grad_norm": 0.8629427485693584, "learning_rate": 1.315104544983181e-07, "loss": 0.0275, "step": 7292 }, { "epoch": 2.8029589778076662, "grad_norm": 0.8250292382453978, "learning_rate": 1.3049348442606857e-07, "loss": 0.0221, "step": 7294 }, { "epoch": 2.8037275434719953, "grad_norm": 0.7258041226811235, "learning_rate": 1.2948040975329869e-07, "loss": 0.0267, "step": 7296 }, { "epoch": 2.8044961091363243, "grad_norm": 0.7363806388421352, "learning_rate": 1.2847123129042515e-07, "loss": 0.0237, "step": 7298 }, { "epoch": 2.8052646748006533, "grad_norm": 0.8623516898115705, "learning_rate": 1.2746594984474835e-07, "loss": 0.0243, "step": 7300 }, { "epoch": 2.8060332404649824, "grad_norm": 1.0751193092769749, "learning_rate": 1.2646456622045057e-07, "loss": 0.0307, "step": 7302 }, { "epoch": 2.806801806129311, "grad_norm": 0.902120455389303, "learning_rate": 1.2546708121859652e-07, "loss": 0.0257, "step": 7304 }, { "epoch": 2.80757037179364, "grad_norm": 0.8207999379379269, "learning_rate": 1.2447349563713186e-07, "loss": 0.0303, "step": 7306 }, { "epoch": 2.808338937457969, "grad_norm": 0.9199609383027654, "learning_rate": 1.2348381027088296e-07, "loss": 0.0257, "step": 7308 }, { "epoch": 2.809107503122298, "grad_norm": 0.9027685949482191, "learning_rate": 1.2249802591155714e-07, "loss": 0.0266, "step": 7310 }, { "epoch": 2.809876068786627, "grad_norm": 1.018546767738803, "learning_rate": 1.215161433477402e-07, "loss": 0.0309, "step": 7312 }, { "epoch": 2.8106446344509557, "grad_norm": 0.8560479243422576, "learning_rate": 1.2053816336489555e-07, "loss": 0.0254, "step": 7314 }, { "epoch": 2.8114132001152847, "grad_norm": 0.8335477312586327, "learning_rate": 1.195640867453668e-07, "loss": 0.0272, "step": 7316 }, { "epoch": 2.8121817657796138, "grad_norm": 0.759483144093165, "learning_rate": 1.1859391426837396e-07, "loss": 0.0235, "step": 7318 }, { "epoch": 2.812950331443943, "grad_norm": 0.8464706523979441, "learning_rate": 1.1762764671001348e-07, "loss": 0.0228, "step": 7320 }, { "epoch": 2.813718897108272, "grad_norm": 1.0429707916032696, "learning_rate": 1.1666528484325923e-07, "loss": 0.0309, "step": 7322 }, { "epoch": 2.8144874627726004, "grad_norm": 0.9206560548609958, "learning_rate": 1.1570682943796042e-07, "loss": 0.0273, "step": 7324 }, { "epoch": 2.8152560284369295, "grad_norm": 0.8806360953990707, "learning_rate": 1.1475228126083982e-07, "loss": 0.0266, "step": 7326 }, { "epoch": 2.8160245941012585, "grad_norm": 0.9438934885972436, "learning_rate": 1.1380164107549607e-07, "loss": 0.0268, "step": 7328 }, { "epoch": 2.8167931597655875, "grad_norm": 0.8397910539771488, "learning_rate": 1.1285490964240142e-07, "loss": 0.0309, "step": 7330 }, { "epoch": 2.8175617254299166, "grad_norm": 0.9226185036793075, "learning_rate": 1.1191208771890005e-07, "loss": 0.0287, "step": 7332 }, { "epoch": 2.818330291094245, "grad_norm": 0.8100764395043435, "learning_rate": 1.1097317605921087e-07, "loss": 0.0264, "step": 7334 }, { "epoch": 2.819098856758574, "grad_norm": 0.9673297128637642, "learning_rate": 1.1003817541442308e-07, "loss": 0.0283, "step": 7336 }, { "epoch": 2.8198674224229032, "grad_norm": 0.934963775705795, "learning_rate": 1.0910708653249779e-07, "loss": 0.0289, "step": 7338 }, { "epoch": 2.8206359880872323, "grad_norm": 1.1767095780226338, "learning_rate": 1.08179910158267e-07, "loss": 0.0293, "step": 7340 }, { "epoch": 2.8214045537515613, "grad_norm": 0.8843689345301603, "learning_rate": 1.0725664703343186e-07, "loss": 0.0272, "step": 7342 }, { "epoch": 2.82217311941589, "grad_norm": 1.0334450812829803, "learning_rate": 1.0633729789656377e-07, "loss": 0.0287, "step": 7344 }, { "epoch": 2.822941685080219, "grad_norm": 1.014775474632912, "learning_rate": 1.0542186348310446e-07, "loss": 0.0264, "step": 7346 }, { "epoch": 2.823710250744548, "grad_norm": 0.9310079044996011, "learning_rate": 1.0451034452536201e-07, "loss": 0.027, "step": 7348 }, { "epoch": 2.824478816408877, "grad_norm": 0.9605365839738166, "learning_rate": 1.0360274175251317e-07, "loss": 0.0271, "step": 7350 }, { "epoch": 2.825247382073206, "grad_norm": 0.9891195370841307, "learning_rate": 1.0269905589060158e-07, "loss": 0.0287, "step": 7352 }, { "epoch": 2.8260159477375346, "grad_norm": 0.874055581854697, "learning_rate": 1.0179928766253844e-07, "loss": 0.0236, "step": 7354 }, { "epoch": 2.8267845134018637, "grad_norm": 0.9182239456639025, "learning_rate": 1.0090343778809908e-07, "loss": 0.0289, "step": 7356 }, { "epoch": 2.8275530790661927, "grad_norm": 0.9157979447925776, "learning_rate": 1.0001150698392637e-07, "loss": 0.0331, "step": 7358 }, { "epoch": 2.8283216447305217, "grad_norm": 0.8501329041554059, "learning_rate": 9.912349596352732e-08, "loss": 0.0254, "step": 7360 }, { "epoch": 2.8290902103948508, "grad_norm": 1.10113164155731, "learning_rate": 9.823940543727317e-08, "loss": 0.032, "step": 7362 }, { "epoch": 2.8298587760591793, "grad_norm": 0.8608845983996287, "learning_rate": 9.735923611239872e-08, "loss": 0.0248, "step": 7364 }, { "epoch": 2.8306273417235084, "grad_norm": 1.1061761330024649, "learning_rate": 9.648298869300298e-08, "loss": 0.0297, "step": 7366 }, { "epoch": 2.8313959073878374, "grad_norm": 1.1306379590050064, "learning_rate": 9.561066388004636e-08, "loss": 0.0316, "step": 7368 }, { "epoch": 2.8321644730521665, "grad_norm": 0.8855885832674008, "learning_rate": 9.474226237135065e-08, "loss": 0.0293, "step": 7370 }, { "epoch": 2.8329330387164955, "grad_norm": 1.151825447166723, "learning_rate": 9.387778486160238e-08, "loss": 0.0297, "step": 7372 }, { "epoch": 2.833701604380824, "grad_norm": 0.964650006691683, "learning_rate": 9.301723204234614e-08, "loss": 0.0251, "step": 7374 }, { "epoch": 2.834470170045153, "grad_norm": 0.8464806263017828, "learning_rate": 9.216060460198795e-08, "loss": 0.0227, "step": 7376 }, { "epoch": 2.835238735709482, "grad_norm": 0.9630043196542051, "learning_rate": 9.130790322579352e-08, "loss": 0.025, "step": 7378 }, { "epoch": 2.836007301373811, "grad_norm": 0.8734692947376731, "learning_rate": 9.045912859588779e-08, "loss": 0.028, "step": 7380 }, { "epoch": 2.83677586703814, "grad_norm": 0.9633082963649481, "learning_rate": 8.961428139125427e-08, "loss": 0.0268, "step": 7382 }, { "epoch": 2.837544432702469, "grad_norm": 0.9807255825808232, "learning_rate": 8.877336228773626e-08, "loss": 0.0265, "step": 7384 }, { "epoch": 2.838312998366798, "grad_norm": 0.8332942566587281, "learning_rate": 8.793637195803228e-08, "loss": 0.0244, "step": 7386 }, { "epoch": 2.839081564031127, "grad_norm": 1.0707532577207919, "learning_rate": 8.710331107169956e-08, "loss": 0.0298, "step": 7388 }, { "epoch": 2.839850129695456, "grad_norm": 1.0201632561220813, "learning_rate": 8.627418029515166e-08, "loss": 0.0281, "step": 7390 }, { "epoch": 2.840618695359785, "grad_norm": 0.8157440532822474, "learning_rate": 8.544898029165915e-08, "loss": 0.0285, "step": 7392 }, { "epoch": 2.8413872610241135, "grad_norm": 0.997894695860624, "learning_rate": 8.46277117213451e-08, "loss": 0.0288, "step": 7394 }, { "epoch": 2.8421558266884426, "grad_norm": 0.9786348027235051, "learning_rate": 8.381037524119174e-08, "loss": 0.0284, "step": 7396 }, { "epoch": 2.8429243923527716, "grad_norm": 0.9089591428381713, "learning_rate": 8.299697150503217e-08, "loss": 0.0278, "step": 7398 }, { "epoch": 2.8436929580171006, "grad_norm": 1.0090800746660153, "learning_rate": 8.218750116355589e-08, "loss": 0.0255, "step": 7400 }, { "epoch": 2.8444615236814297, "grad_norm": 1.0279360049305237, "learning_rate": 8.138196486430438e-08, "loss": 0.0274, "step": 7402 }, { "epoch": 2.8452300893457583, "grad_norm": 1.0540526782203794, "learning_rate": 8.058036325167385e-08, "loss": 0.0284, "step": 7404 }, { "epoch": 2.8459986550100873, "grad_norm": 1.0277005176586937, "learning_rate": 7.978269696691021e-08, "loss": 0.028, "step": 7406 }, { "epoch": 2.8467672206744163, "grad_norm": 0.911409918931573, "learning_rate": 7.898896664811361e-08, "loss": 0.0297, "step": 7408 }, { "epoch": 2.8475357863387454, "grad_norm": 0.8656302917300289, "learning_rate": 7.819917293023505e-08, "loss": 0.0216, "step": 7410 }, { "epoch": 2.8483043520030744, "grad_norm": 0.9688050061817669, "learning_rate": 7.741331644507465e-08, "loss": 0.0265, "step": 7412 }, { "epoch": 2.849072917667403, "grad_norm": 0.8992384442206116, "learning_rate": 7.663139782128626e-08, "loss": 0.0307, "step": 7414 }, { "epoch": 2.849841483331732, "grad_norm": 0.9670916876214349, "learning_rate": 7.585341768437115e-08, "loss": 0.0301, "step": 7416 }, { "epoch": 2.850610048996061, "grad_norm": 0.9304421913114236, "learning_rate": 7.507937665667986e-08, "loss": 0.026, "step": 7418 }, { "epoch": 2.85137861466039, "grad_norm": 0.878406030798763, "learning_rate": 7.430927535741317e-08, "loss": 0.0311, "step": 7420 }, { "epoch": 2.852147180324719, "grad_norm": 0.8470037212212715, "learning_rate": 7.35431144026194e-08, "loss": 0.0259, "step": 7422 }, { "epoch": 2.8529157459890477, "grad_norm": 1.1322477290846025, "learning_rate": 7.278089440519443e-08, "loss": 0.0279, "step": 7424 }, { "epoch": 2.8536843116533768, "grad_norm": 0.9616657122092438, "learning_rate": 7.202261597488324e-08, "loss": 0.0324, "step": 7426 }, { "epoch": 2.854452877317706, "grad_norm": 0.9897110379778411, "learning_rate": 7.126827971827566e-08, "loss": 0.0295, "step": 7428 }, { "epoch": 2.855221442982035, "grad_norm": 1.0066932632911507, "learning_rate": 7.051788623880896e-08, "loss": 0.0299, "step": 7430 }, { "epoch": 2.855990008646364, "grad_norm": 0.8130927938002929, "learning_rate": 6.97714361367663e-08, "loss": 0.0262, "step": 7432 }, { "epoch": 2.8567585743106925, "grad_norm": 0.8204530226448732, "learning_rate": 6.902893000927668e-08, "loss": 0.0252, "step": 7434 }, { "epoch": 2.8575271399750215, "grad_norm": 0.9458267863252886, "learning_rate": 6.829036845031222e-08, "loss": 0.0278, "step": 7436 }, { "epoch": 2.8582957056393505, "grad_norm": 0.9452272569233351, "learning_rate": 6.755575205069254e-08, "loss": 0.0305, "step": 7438 }, { "epoch": 2.8590642713036796, "grad_norm": 0.9362125919222387, "learning_rate": 6.68250813980792e-08, "loss": 0.028, "step": 7440 }, { "epoch": 2.8598328369680086, "grad_norm": 1.0460352959770138, "learning_rate": 6.609835707697743e-08, "loss": 0.0332, "step": 7442 }, { "epoch": 2.860601402632337, "grad_norm": 0.8525691442567789, "learning_rate": 6.537557966873664e-08, "loss": 0.0257, "step": 7444 }, { "epoch": 2.8613699682966662, "grad_norm": 0.9005764184955649, "learning_rate": 6.465674975154767e-08, "loss": 0.0251, "step": 7446 }, { "epoch": 2.8621385339609953, "grad_norm": 0.9818918751068753, "learning_rate": 6.394186790044443e-08, "loss": 0.0309, "step": 7448 }, { "epoch": 2.8629070996253243, "grad_norm": 0.9495523185925938, "learning_rate": 6.323093468730168e-08, "loss": 0.0266, "step": 7450 }, { "epoch": 2.8636756652896533, "grad_norm": 0.8727728496735719, "learning_rate": 6.252395068083672e-08, "loss": 0.0256, "step": 7452 }, { "epoch": 2.864444230953982, "grad_norm": 0.9972053212088162, "learning_rate": 6.18209164466066e-08, "loss": 0.027, "step": 7454 }, { "epoch": 2.865212796618311, "grad_norm": 0.9301867693602383, "learning_rate": 6.112183254700866e-08, "loss": 0.031, "step": 7456 }, { "epoch": 2.86598136228264, "grad_norm": 0.7446331022643888, "learning_rate": 6.042669954128111e-08, "loss": 0.024, "step": 7458 }, { "epoch": 2.866749927946969, "grad_norm": 1.0510131466940011, "learning_rate": 5.973551798549971e-08, "loss": 0.0287, "step": 7460 }, { "epoch": 2.867518493611298, "grad_norm": 0.9719091618516881, "learning_rate": 5.904828843258215e-08, "loss": 0.026, "step": 7462 }, { "epoch": 2.8682870592756267, "grad_norm": 0.9474061605831551, "learning_rate": 5.8365011432282014e-08, "loss": 0.03, "step": 7464 }, { "epoch": 2.8690556249399557, "grad_norm": 1.0210743457943339, "learning_rate": 5.768568753119263e-08, "loss": 0.0281, "step": 7466 }, { "epoch": 2.8698241906042847, "grad_norm": 1.1022300484901781, "learning_rate": 5.701031727274375e-08, "loss": 0.0318, "step": 7468 }, { "epoch": 2.8705927562686138, "grad_norm": 1.154558702850704, "learning_rate": 5.633890119720375e-08, "loss": 0.0333, "step": 7470 }, { "epoch": 2.871361321932943, "grad_norm": 0.8932512718886566, "learning_rate": 5.567143984167689e-08, "loss": 0.0262, "step": 7472 }, { "epoch": 2.8721298875972714, "grad_norm": 0.9201965085930025, "learning_rate": 5.500793374010327e-08, "loss": 0.0244, "step": 7474 }, { "epoch": 2.8728984532616004, "grad_norm": 0.8924324847871282, "learning_rate": 5.434838342326054e-08, "loss": 0.0271, "step": 7476 }, { "epoch": 2.8736670189259295, "grad_norm": 0.944056497693951, "learning_rate": 5.3692789418761104e-08, "loss": 0.0247, "step": 7478 }, { "epoch": 2.8744355845902585, "grad_norm": 1.0354859851733738, "learning_rate": 5.304115225105211e-08, "loss": 0.0289, "step": 7480 }, { "epoch": 2.8752041502545875, "grad_norm": 0.8103550430926265, "learning_rate": 5.239347244141657e-08, "loss": 0.0209, "step": 7482 }, { "epoch": 2.875972715918916, "grad_norm": 0.913916389766087, "learning_rate": 5.174975050797004e-08, "loss": 0.0236, "step": 7484 }, { "epoch": 2.876741281583245, "grad_norm": 0.8833685073065882, "learning_rate": 5.110998696566283e-08, "loss": 0.0269, "step": 7486 }, { "epoch": 2.877509847247574, "grad_norm": 0.9503860204499671, "learning_rate": 5.047418232627943e-08, "loss": 0.0282, "step": 7488 }, { "epoch": 2.8782784129119032, "grad_norm": 1.0066050032067249, "learning_rate": 4.984233709843689e-08, "loss": 0.0306, "step": 7490 }, { "epoch": 2.8790469785762323, "grad_norm": 1.0132898238017118, "learning_rate": 4.921445178758311e-08, "loss": 0.0273, "step": 7492 }, { "epoch": 2.879815544240561, "grad_norm": 1.0449313464349879, "learning_rate": 4.85905268960013e-08, "loss": 0.0268, "step": 7494 }, { "epoch": 2.88058410990489, "grad_norm": 0.8563279731707988, "learning_rate": 4.7970562922805e-08, "loss": 0.0264, "step": 7496 }, { "epoch": 2.881352675569219, "grad_norm": 1.0302664226247722, "learning_rate": 4.735456036393859e-08, "loss": 0.0264, "step": 7498 }, { "epoch": 2.882121241233548, "grad_norm": 0.9474800073626922, "learning_rate": 4.6742519712177916e-08, "loss": 0.0277, "step": 7500 }, { "epoch": 2.882121241233548, "eval_loss": 0.178667813539505, "eval_runtime": 390.2942, "eval_samples_per_second": 47.413, "eval_steps_per_second": 5.929, "step": 7500 }, { "epoch": 2.882889806897877, "grad_norm": 0.8630910689097392, "learning_rate": 4.6134441457130195e-08, "loss": 0.0308, "step": 7502 }, { "epoch": 2.8836583725622056, "grad_norm": 0.9696707124076421, "learning_rate": 4.553032608523189e-08, "loss": 0.0262, "step": 7504 }, { "epoch": 2.8844269382265346, "grad_norm": 0.8097816986642715, "learning_rate": 4.493017407975087e-08, "loss": 0.0257, "step": 7506 }, { "epoch": 2.8851955038908637, "grad_norm": 0.793281567178151, "learning_rate": 4.433398592078197e-08, "loss": 0.028, "step": 7508 }, { "epoch": 2.8859640695551927, "grad_norm": 0.8893758717528171, "learning_rate": 4.374176208525094e-08, "loss": 0.0285, "step": 7510 }, { "epoch": 2.8867326352195217, "grad_norm": 1.1774068897283696, "learning_rate": 4.315350304691268e-08, "loss": 0.0333, "step": 7512 }, { "epoch": 2.8875012008838503, "grad_norm": 0.8782418266315122, "learning_rate": 4.2569209276348e-08, "loss": 0.0245, "step": 7514 }, { "epoch": 2.8882697665481794, "grad_norm": 0.9482747492175474, "learning_rate": 4.1988881240968557e-08, "loss": 0.0302, "step": 7516 }, { "epoch": 2.8890383322125084, "grad_norm": 1.412400418503253, "learning_rate": 4.14125194050119e-08, "loss": 0.0305, "step": 7518 }, { "epoch": 2.8898068978768374, "grad_norm": 0.8692409446665603, "learning_rate": 4.084012422954309e-08, "loss": 0.0268, "step": 7520 }, { "epoch": 2.8905754635411665, "grad_norm": 1.1122406150228263, "learning_rate": 4.027169617245363e-08, "loss": 0.0316, "step": 7522 }, { "epoch": 2.891344029205495, "grad_norm": 1.0228827642228595, "learning_rate": 3.970723568846313e-08, "loss": 0.0286, "step": 7524 }, { "epoch": 2.892112594869824, "grad_norm": 0.9549454881266057, "learning_rate": 3.914674322911482e-08, "loss": 0.029, "step": 7526 }, { "epoch": 2.892881160534153, "grad_norm": 1.191666614934801, "learning_rate": 3.8590219242779505e-08, "loss": 0.0307, "step": 7528 }, { "epoch": 2.893649726198482, "grad_norm": 0.9848993947898935, "learning_rate": 3.803766417465327e-08, "loss": 0.0268, "step": 7530 }, { "epoch": 2.894418291862811, "grad_norm": 0.8179077473240036, "learning_rate": 3.748907846675587e-08, "loss": 0.0246, "step": 7532 }, { "epoch": 2.89518685752714, "grad_norm": 0.9412417615339049, "learning_rate": 3.694446255793405e-08, "loss": 0.0281, "step": 7534 }, { "epoch": 2.895955423191469, "grad_norm": 0.9712369151274626, "learning_rate": 3.640381688385653e-08, "loss": 0.0271, "step": 7536 }, { "epoch": 2.896723988855798, "grad_norm": 0.9283109297685763, "learning_rate": 3.5867141877016783e-08, "loss": 0.0254, "step": 7538 }, { "epoch": 2.897492554520127, "grad_norm": 1.0915364505174352, "learning_rate": 3.5334437966733083e-08, "loss": 0.0316, "step": 7540 }, { "epoch": 2.898261120184456, "grad_norm": 0.9527345992858091, "learning_rate": 3.480570557914509e-08, "loss": 0.0242, "step": 7542 }, { "epoch": 2.8990296858487845, "grad_norm": 0.9830376409403826, "learning_rate": 3.4280945137217246e-08, "loss": 0.0298, "step": 7544 }, { "epoch": 2.8997982515131135, "grad_norm": 0.7762955119290066, "learning_rate": 3.376015706073543e-08, "loss": 0.0229, "step": 7546 }, { "epoch": 2.9005668171774426, "grad_norm": 1.098424489268818, "learning_rate": 3.32433417663075e-08, "loss": 0.0263, "step": 7548 }, { "epoch": 2.9013353828417716, "grad_norm": 0.9148948781971622, "learning_rate": 3.273049966736497e-08, "loss": 0.0311, "step": 7550 }, { "epoch": 2.9021039485061007, "grad_norm": 1.0506282508963107, "learning_rate": 3.222163117415855e-08, "loss": 0.0302, "step": 7552 }, { "epoch": 2.9028725141704292, "grad_norm": 0.8644224429987766, "learning_rate": 3.171673669376263e-08, "loss": 0.025, "step": 7554 }, { "epoch": 2.9036410798347583, "grad_norm": 0.8596325629025294, "learning_rate": 3.121581663007134e-08, "loss": 0.0256, "step": 7556 }, { "epoch": 2.9044096454990873, "grad_norm": 0.8703794523473042, "learning_rate": 3.0718871383799146e-08, "loss": 0.0293, "step": 7558 }, { "epoch": 2.9051782111634163, "grad_norm": 0.8946670419497575, "learning_rate": 3.0225901352480823e-08, "loss": 0.0251, "step": 7560 }, { "epoch": 2.9059467768277454, "grad_norm": 0.9027028660650444, "learning_rate": 2.9736906930472598e-08, "loss": 0.0269, "step": 7562 }, { "epoch": 2.906715342492074, "grad_norm": 0.9097090282988143, "learning_rate": 2.925188850894878e-08, "loss": 0.0307, "step": 7564 }, { "epoch": 2.907483908156403, "grad_norm": 1.0306792434818786, "learning_rate": 2.8770846475904003e-08, "loss": 0.029, "step": 7566 }, { "epoch": 2.908252473820732, "grad_norm": 0.9549262284597876, "learning_rate": 2.8293781216151007e-08, "loss": 0.0272, "step": 7568 }, { "epoch": 2.909021039485061, "grad_norm": 0.7837125181489196, "learning_rate": 2.7820693111322294e-08, "loss": 0.0296, "step": 7570 }, { "epoch": 2.90978960514939, "grad_norm": 0.9363535299706386, "learning_rate": 2.7351582539867915e-08, "loss": 0.0327, "step": 7572 }, { "epoch": 2.9105581708137187, "grad_norm": 1.0247068511221704, "learning_rate": 2.688644987705713e-08, "loss": 0.0252, "step": 7574 }, { "epoch": 2.9113267364780477, "grad_norm": 1.0243383845181557, "learning_rate": 2.6425295494975632e-08, "loss": 0.0311, "step": 7576 }, { "epoch": 2.9120953021423768, "grad_norm": 0.9652509444087541, "learning_rate": 2.596811976252833e-08, "loss": 0.0307, "step": 7578 }, { "epoch": 2.912863867806706, "grad_norm": 0.9739213660325812, "learning_rate": 2.5514923045436014e-08, "loss": 0.0322, "step": 7580 }, { "epoch": 2.913632433471035, "grad_norm": 0.7974975643342606, "learning_rate": 2.506570570623701e-08, "loss": 0.0271, "step": 7582 }, { "epoch": 2.9144009991353634, "grad_norm": 0.845847074746792, "learning_rate": 2.4620468104285534e-08, "loss": 0.0284, "step": 7584 }, { "epoch": 2.9151695647996925, "grad_norm": 0.9637964908861171, "learning_rate": 2.41792105957539e-08, "loss": 0.0281, "step": 7586 }, { "epoch": 2.9159381304640215, "grad_norm": 0.9847987125383134, "learning_rate": 2.3741933533629192e-08, "loss": 0.0276, "step": 7588 }, { "epoch": 2.9167066961283505, "grad_norm": 0.8900766719141754, "learning_rate": 2.3308637267714372e-08, "loss": 0.0245, "step": 7590 }, { "epoch": 2.9174752617926796, "grad_norm": 0.8622366256931285, "learning_rate": 2.287932214462829e-08, "loss": 0.0273, "step": 7592 }, { "epoch": 2.918243827457008, "grad_norm": 1.0980414605854183, "learning_rate": 2.2453988507804557e-08, "loss": 0.032, "step": 7594 }, { "epoch": 2.919012393121337, "grad_norm": 0.8603145622892301, "learning_rate": 2.2032636697492115e-08, "loss": 0.0269, "step": 7596 }, { "epoch": 2.9197809587856662, "grad_norm": 0.8891757362757595, "learning_rate": 2.1615267050755784e-08, "loss": 0.0315, "step": 7598 }, { "epoch": 2.9205495244499953, "grad_norm": 0.8381170152836135, "learning_rate": 2.1201879901471822e-08, "loss": 0.0274, "step": 7600 }, { "epoch": 2.9213180901143243, "grad_norm": 1.2015102011235899, "learning_rate": 2.0792475580334037e-08, "loss": 0.0267, "step": 7602 }, { "epoch": 2.922086655778653, "grad_norm": 0.8277145119402302, "learning_rate": 2.0387054414847674e-08, "loss": 0.0299, "step": 7604 }, { "epoch": 2.922855221442982, "grad_norm": 0.8911160701348961, "learning_rate": 1.9985616729332747e-08, "loss": 0.0282, "step": 7606 }, { "epoch": 2.923623787107311, "grad_norm": 0.8899318047452413, "learning_rate": 1.9588162844922375e-08, "loss": 0.0253, "step": 7608 }, { "epoch": 2.92439235277164, "grad_norm": 1.0447393392651048, "learning_rate": 1.9194693079563342e-08, "loss": 0.0338, "step": 7610 }, { "epoch": 2.925160918435969, "grad_norm": 0.9959800964885274, "learning_rate": 1.880520774801442e-08, "loss": 0.028, "step": 7612 }, { "epoch": 2.9259294841002976, "grad_norm": 0.8759661506230436, "learning_rate": 1.841970716184749e-08, "loss": 0.027, "step": 7614 }, { "epoch": 2.9266980497646267, "grad_norm": 0.8908033092188583, "learning_rate": 1.803819162944698e-08, "loss": 0.026, "step": 7616 }, { "epoch": 2.9274666154289557, "grad_norm": 0.9297779941572804, "learning_rate": 1.7660661456008753e-08, "loss": 0.0262, "step": 7618 }, { "epoch": 2.9282351810932847, "grad_norm": 0.7788662115280711, "learning_rate": 1.728711694354124e-08, "loss": 0.0213, "step": 7620 }, { "epoch": 2.9290037467576138, "grad_norm": 0.9951671553828053, "learning_rate": 1.6917558390864284e-08, "loss": 0.0288, "step": 7622 }, { "epoch": 2.9297723124219424, "grad_norm": 1.0699281649671744, "learning_rate": 1.6551986093608642e-08, "loss": 0.0297, "step": 7624 }, { "epoch": 2.9305408780862714, "grad_norm": 0.9511679636340409, "learning_rate": 1.6190400344217593e-08, "loss": 0.027, "step": 7626 }, { "epoch": 2.9313094437506004, "grad_norm": 0.9476324172739506, "learning_rate": 1.5832801431943657e-08, "loss": 0.0281, "step": 7628 }, { "epoch": 2.9320780094149295, "grad_norm": 0.8459658981059867, "learning_rate": 1.547918964285189e-08, "loss": 0.0281, "step": 7630 }, { "epoch": 2.9328465750792585, "grad_norm": 0.9568319274750331, "learning_rate": 1.512956525981546e-08, "loss": 0.0261, "step": 7632 }, { "epoch": 2.933615140743587, "grad_norm": 1.267508785507199, "learning_rate": 1.4783928562519534e-08, "loss": 0.0291, "step": 7634 }, { "epoch": 2.934383706407916, "grad_norm": 1.0177800213961141, "learning_rate": 1.4442279827459605e-08, "loss": 0.0288, "step": 7636 }, { "epoch": 2.935152272072245, "grad_norm": 1.234758108963972, "learning_rate": 1.4104619327939827e-08, "loss": 0.0289, "step": 7638 }, { "epoch": 2.935920837736574, "grad_norm": 1.0198632696361511, "learning_rate": 1.377094733407358e-08, "loss": 0.0281, "step": 7640 }, { "epoch": 2.9366894034009032, "grad_norm": 0.8943099997957515, "learning_rate": 1.3441264112785124e-08, "loss": 0.0258, "step": 7642 }, { "epoch": 2.937457969065232, "grad_norm": 0.9233044917800737, "learning_rate": 1.3115569927807382e-08, "loss": 0.0306, "step": 7644 }, { "epoch": 2.938226534729561, "grad_norm": 1.1023437183678146, "learning_rate": 1.2793865039681386e-08, "loss": 0.0314, "step": 7646 }, { "epoch": 2.93899510039389, "grad_norm": 0.8367158750592715, "learning_rate": 1.247614970575739e-08, "loss": 0.0264, "step": 7648 }, { "epoch": 2.939763666058219, "grad_norm": 0.920369900040142, "learning_rate": 1.216242418019431e-08, "loss": 0.0292, "step": 7650 }, { "epoch": 2.940532231722548, "grad_norm": 1.1297199679941479, "learning_rate": 1.1852688713959725e-08, "loss": 0.0346, "step": 7652 }, { "epoch": 2.9413007973868766, "grad_norm": 0.9905013325851304, "learning_rate": 1.1546943554828215e-08, "loss": 0.0316, "step": 7654 }, { "epoch": 2.9420693630512056, "grad_norm": 0.8449033680634367, "learning_rate": 1.1245188947384133e-08, "loss": 0.0255, "step": 7656 }, { "epoch": 2.9428379287155346, "grad_norm": 0.9781017456071339, "learning_rate": 1.0947425133017164e-08, "loss": 0.029, "step": 7658 }, { "epoch": 2.9436064943798637, "grad_norm": 0.916415783025838, "learning_rate": 1.0653652349926214e-08, "loss": 0.0275, "step": 7660 }, { "epoch": 2.9443750600441927, "grad_norm": 0.8882859930133875, "learning_rate": 1.0363870833117739e-08, "loss": 0.0274, "step": 7662 }, { "epoch": 2.9451436257085213, "grad_norm": 1.0567710583388108, "learning_rate": 1.0078080814403534e-08, "loss": 0.0317, "step": 7664 }, { "epoch": 2.9459121913728503, "grad_norm": 1.076880589066171, "learning_rate": 9.79628252240461e-09, "loss": 0.0273, "step": 7666 }, { "epoch": 2.9466807570371794, "grad_norm": 0.898915257083523, "learning_rate": 9.518476182547309e-09, "loss": 0.0289, "step": 7668 }, { "epoch": 2.9474493227015084, "grad_norm": 0.8861677221595257, "learning_rate": 9.244662017064421e-09, "loss": 0.0275, "step": 7670 }, { "epoch": 2.9482178883658374, "grad_norm": 1.1331518802825407, "learning_rate": 8.974840244996842e-09, "loss": 0.029, "step": 7672 }, { "epoch": 2.948986454030166, "grad_norm": 0.7814118498230102, "learning_rate": 8.709011082189134e-09, "loss": 0.0247, "step": 7674 }, { "epoch": 2.949755019694495, "grad_norm": 1.0212291349354252, "learning_rate": 8.447174741294528e-09, "loss": 0.0281, "step": 7676 }, { "epoch": 2.950523585358824, "grad_norm": 0.9949008609674989, "learning_rate": 8.18933143177103e-09, "loss": 0.0303, "step": 7678 }, { "epoch": 2.951292151023153, "grad_norm": 0.8873467834434147, "learning_rate": 7.935481359881981e-09, "loss": 0.0249, "step": 7680 }, { "epoch": 2.952060716687482, "grad_norm": 0.9246842444426915, "learning_rate": 7.685624728696606e-09, "loss": 0.0271, "step": 7682 }, { "epoch": 2.9528292823518107, "grad_norm": 1.251141332314057, "learning_rate": 7.439761738090024e-09, "loss": 0.0298, "step": 7684 }, { "epoch": 2.95359784801614, "grad_norm": 0.9277966984672557, "learning_rate": 7.197892584742127e-09, "loss": 0.0297, "step": 7686 }, { "epoch": 2.954366413680469, "grad_norm": 0.8487404475556988, "learning_rate": 6.9600174621375914e-09, "loss": 0.0257, "step": 7688 }, { "epoch": 2.955134979344798, "grad_norm": 0.9089367262346528, "learning_rate": 6.726136560567531e-09, "loss": 0.0285, "step": 7690 }, { "epoch": 2.955903545009127, "grad_norm": 0.8127136176615201, "learning_rate": 6.49625006712562e-09, "loss": 0.0257, "step": 7692 }, { "epoch": 2.9566721106734555, "grad_norm": 0.9782748118805716, "learning_rate": 6.270358165711976e-09, "loss": 0.0286, "step": 7694 }, { "epoch": 2.9574406763377845, "grad_norm": 0.8751615644340816, "learning_rate": 6.048461037030384e-09, "loss": 0.0293, "step": 7696 }, { "epoch": 2.9582092420021135, "grad_norm": 0.7990022022103657, "learning_rate": 5.830558858588853e-09, "loss": 0.0267, "step": 7698 }, { "epoch": 2.9589778076664426, "grad_norm": 0.7959135704655069, "learning_rate": 5.616651804700723e-09, "loss": 0.0252, "step": 7700 }, { "epoch": 2.9597463733307716, "grad_norm": 1.0467935839441156, "learning_rate": 5.406740046481896e-09, "loss": 0.0257, "step": 7702 }, { "epoch": 2.9605149389951, "grad_norm": 0.8147735781832639, "learning_rate": 5.200823751853046e-09, "loss": 0.0242, "step": 7704 }, { "epoch": 2.9612835046594292, "grad_norm": 0.7561397877712323, "learning_rate": 4.998903085539075e-09, "loss": 0.0251, "step": 7706 }, { "epoch": 2.9620520703237583, "grad_norm": 0.994729294284005, "learning_rate": 4.800978209067441e-09, "loss": 0.0302, "step": 7708 }, { "epoch": 2.9628206359880873, "grad_norm": 1.0247356085040544, "learning_rate": 4.607049280769271e-09, "loss": 0.0267, "step": 7710 }, { "epoch": 2.9635892016524163, "grad_norm": 0.9499290082968285, "learning_rate": 4.417116455780468e-09, "loss": 0.0294, "step": 7712 }, { "epoch": 2.964357767316745, "grad_norm": 0.9516768882996094, "learning_rate": 4.2311798860389385e-09, "loss": 0.0261, "step": 7714 }, { "epoch": 2.965126332981074, "grad_norm": 0.7415071124711834, "learning_rate": 4.049239720285703e-09, "loss": 0.0228, "step": 7716 }, { "epoch": 2.965894898645403, "grad_norm": 0.8403487580394559, "learning_rate": 3.871296104065448e-09, "loss": 0.0267, "step": 7718 }, { "epoch": 2.966663464309732, "grad_norm": 0.9170945491380734, "learning_rate": 3.697349179725418e-09, "loss": 0.0254, "step": 7720 }, { "epoch": 2.967432029974061, "grad_norm": 0.866529344053199, "learning_rate": 3.5273990864165274e-09, "loss": 0.0238, "step": 7722 }, { "epoch": 2.9682005956383897, "grad_norm": 0.8868689888485576, "learning_rate": 3.3614459600905812e-09, "loss": 0.0302, "step": 7724 }, { "epoch": 2.9689691613027187, "grad_norm": 0.9352844558419862, "learning_rate": 3.199489933503608e-09, "loss": 0.0294, "step": 7726 }, { "epoch": 2.9697377269670477, "grad_norm": 1.1396033913474968, "learning_rate": 3.041531136213638e-09, "loss": 0.0317, "step": 7728 }, { "epoch": 2.9705062926313768, "grad_norm": 1.0513378167596514, "learning_rate": 2.8875696945812603e-09, "loss": 0.0268, "step": 7730 }, { "epoch": 2.971274858295706, "grad_norm": 0.806076849943114, "learning_rate": 2.737605731768511e-09, "loss": 0.0222, "step": 7732 }, { "epoch": 2.9720434239600344, "grad_norm": 0.9268975344015234, "learning_rate": 2.5916393677405395e-09, "loss": 0.0289, "step": 7734 }, { "epoch": 2.9728119896243634, "grad_norm": 0.940353223928189, "learning_rate": 2.4496707192644964e-09, "loss": 0.0242, "step": 7736 }, { "epoch": 2.9735805552886925, "grad_norm": 0.9494425509655721, "learning_rate": 2.3116998999089813e-09, "loss": 0.0282, "step": 7738 }, { "epoch": 2.9743491209530215, "grad_norm": 0.8622081045696698, "learning_rate": 2.1777270200445954e-09, "loss": 0.0273, "step": 7740 }, { "epoch": 2.9751176866173505, "grad_norm": 0.9935475044353351, "learning_rate": 2.0477521868444984e-09, "loss": 0.0275, "step": 7742 }, { "epoch": 2.975886252281679, "grad_norm": 0.8928678562687371, "learning_rate": 1.9217755042832965e-09, "loss": 0.0304, "step": 7744 }, { "epoch": 2.976654817946008, "grad_norm": 0.8548532621379579, "learning_rate": 1.7997970731359338e-09, "loss": 0.0235, "step": 7746 }, { "epoch": 2.977423383610337, "grad_norm": 0.8128519014823441, "learning_rate": 1.6818169909810223e-09, "loss": 0.0267, "step": 7748 }, { "epoch": 2.9781919492746662, "grad_norm": 0.9737934541961301, "learning_rate": 1.5678353521969557e-09, "loss": 0.0271, "step": 7750 }, { "epoch": 2.9789605149389953, "grad_norm": 0.9964058385285809, "learning_rate": 1.457852247964686e-09, "loss": 0.0302, "step": 7752 }, { "epoch": 2.979729080603324, "grad_norm": 0.9008605046113106, "learning_rate": 1.3518677662660573e-09, "loss": 0.026, "step": 7754 }, { "epoch": 2.980497646267653, "grad_norm": 1.05866895560724, "learning_rate": 1.2498819918843609e-09, "loss": 0.0311, "step": 7756 }, { "epoch": 2.981266211931982, "grad_norm": 1.0031447680188395, "learning_rate": 1.1518950064026701e-09, "loss": 0.0269, "step": 7758 }, { "epoch": 2.982034777596311, "grad_norm": 0.9597091725639213, "learning_rate": 1.057906888208282e-09, "loss": 0.0285, "step": 7760 }, { "epoch": 2.98280334326064, "grad_norm": 0.947606351764002, "learning_rate": 9.67917712486055e-10, "loss": 0.0281, "step": 7762 }, { "epoch": 2.9835719089249686, "grad_norm": 1.1260331326143445, "learning_rate": 8.819275512245151e-10, "loss": 0.0324, "step": 7764 }, { "epoch": 2.9843404745892976, "grad_norm": 0.9855538847125018, "learning_rate": 7.999364732119708e-10, "loss": 0.0278, "step": 7766 }, { "epoch": 2.9851090402536267, "grad_norm": 0.8226644716521432, "learning_rate": 7.219445440376227e-10, "loss": 0.0249, "step": 7768 }, { "epoch": 2.9858776059179557, "grad_norm": 1.050269375419271, "learning_rate": 6.479518260921191e-10, "loss": 0.0288, "step": 7770 }, { "epoch": 2.9866461715822847, "grad_norm": 0.9819811169719662, "learning_rate": 5.779583785658905e-10, "loss": 0.0291, "step": 7772 }, { "epoch": 2.9874147372466133, "grad_norm": 0.8572531589730571, "learning_rate": 5.119642574513695e-10, "loss": 0.0256, "step": 7774 }, { "epoch": 2.9881833029109424, "grad_norm": 0.8965464607434339, "learning_rate": 4.499695155407713e-10, "loss": 0.0283, "step": 7776 }, { "epoch": 2.9889518685752714, "grad_norm": 1.0035217661768527, "learning_rate": 3.919742024266482e-10, "loss": 0.0285, "step": 7778 }, { "epoch": 2.9897204342396004, "grad_norm": 0.7733054547913483, "learning_rate": 3.3797836450411015e-10, "loss": 0.0233, "step": 7780 }, { "epoch": 2.9904889999039295, "grad_norm": 0.9543864178382439, "learning_rate": 2.8798204496638393e-10, "loss": 0.0294, "step": 7782 }, { "epoch": 2.991257565568258, "grad_norm": 0.8632486528014632, "learning_rate": 2.41985283809254e-10, "loss": 0.0257, "step": 7784 }, { "epoch": 2.992026131232587, "grad_norm": 0.9443651221586652, "learning_rate": 1.9998811782773187e-10, "loss": 0.0285, "step": 7786 }, { "epoch": 2.992794696896916, "grad_norm": 0.8974918267283917, "learning_rate": 1.6199058061772132e-10, "loss": 0.023, "step": 7788 }, { "epoch": 2.993563262561245, "grad_norm": 0.9418962561844657, "learning_rate": 1.2799270257601858e-10, "loss": 0.027, "step": 7790 }, { "epoch": 2.994331828225574, "grad_norm": 0.8699472284595392, "learning_rate": 9.799451089920198e-11, "loss": 0.027, "step": 7792 }, { "epoch": 2.995100393889903, "grad_norm": 1.0327488918767198, "learning_rate": 7.199602958474217e-11, "loss": 0.0272, "step": 7794 }, { "epoch": 2.995868959554232, "grad_norm": 0.7725168445925498, "learning_rate": 4.999727942989196e-11, "loss": 0.0258, "step": 7796 }, { "epoch": 2.996637525218561, "grad_norm": 0.936954829608146, "learning_rate": 3.1998278032796536e-11, "loss": 0.0325, "step": 7798 }, { "epoch": 2.99740609088289, "grad_norm": 1.0845629650243849, "learning_rate": 1.79990397924934e-11, "loss": 0.0343, "step": 7800 }, { "epoch": 2.998174656547219, "grad_norm": 0.8885405926153468, "learning_rate": 7.999575907802204e-12, "loss": 0.0293, "step": 7802 }, { "epoch": 2.9989432222115475, "grad_norm": 0.7700894927516254, "learning_rate": 1.9998943767696176e-12, "loss": 0.0264, "step": 7804 }, { "epoch": 2.9997117878758766, "grad_norm": 1.0878450097278667, "learning_rate": 0.0, "loss": 0.0308, "step": 7806 }, { "epoch": 2.9997117878758766, "step": 7806, "total_flos": 769047897047040.0, "train_loss": 0.11931899001966477, "train_runtime": 29289.2649, "train_samples_per_second": 17.058, "train_steps_per_second": 0.267 } ], "logging_steps": 2, "max_steps": 7806, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 769047897047040.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }