{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045495905368516835, "grad_norm": 11.705517638294971, "learning_rate": 5e-05, "loss": 0.3487, "step": 1 }, { "epoch": 0.0009099181073703367, "grad_norm": 5.625533664291238, "learning_rate": 4.999999897855645e-05, "loss": 0.3475, "step": 2 }, { "epoch": 0.001364877161055505, "grad_norm": 3.050231759218471, "learning_rate": 4.9999995914225884e-05, "loss": 0.2509, "step": 3 }, { "epoch": 0.0018198362147406734, "grad_norm": 2.8736287685120083, "learning_rate": 4.999999080700855e-05, "loss": 0.2033, "step": 4 }, { "epoch": 0.0022747952684258415, "grad_norm": 5.641734287312084, "learning_rate": 4.9999983656904865e-05, "loss": 0.2507, "step": 5 }, { "epoch": 0.00272975432211101, "grad_norm": 21.95717145768768, "learning_rate": 4.999997446391541e-05, "loss": 0.272, "step": 6 }, { "epoch": 0.0031847133757961785, "grad_norm": 1.7784000344733917, "learning_rate": 4.999996322804095e-05, "loss": 0.2085, "step": 7 }, { "epoch": 0.003639672429481347, "grad_norm": 1.5833500335813995, "learning_rate": 4.999994994928239e-05, "loss": 0.192, "step": 8 }, { "epoch": 0.004094631483166515, "grad_norm": 2.130403672123271, "learning_rate": 4.999993462764082e-05, "loss": 0.2674, "step": 9 }, { "epoch": 0.004549590536851683, "grad_norm": 1.496781741273883, "learning_rate": 4.9999917263117485e-05, "loss": 0.1765, "step": 10 }, { "epoch": 0.005004549590536852, "grad_norm": 2.0423071988934147, "learning_rate": 4.9999897855713816e-05, "loss": 0.2361, "step": 11 }, { "epoch": 0.00545950864422202, "grad_norm": 2.451764386913058, "learning_rate": 4.999987640543139e-05, "loss": 0.2698, "step": 12 }, { "epoch": 0.005914467697907188, "grad_norm": 3.972515109472195, "learning_rate": 4.999985291227196e-05, "loss": 0.2393, "step": 13 }, { "epoch": 0.006369426751592357, "grad_norm": 2.0773905159558765, "learning_rate": 4.999982737623745e-05, "loss": 0.3335, "step": 14 }, { "epoch": 0.006824385805277525, "grad_norm": 1.5506423565582652, "learning_rate": 4.999979979732995e-05, "loss": 0.2581, "step": 15 }, { "epoch": 0.007279344858962694, "grad_norm": 1.2950827876188111, "learning_rate": 4.9999770175551705e-05, "loss": 0.175, "step": 16 }, { "epoch": 0.0077343039126478615, "grad_norm": 1.2558416571692808, "learning_rate": 4.999973851090514e-05, "loss": 0.2674, "step": 17 }, { "epoch": 0.00818926296633303, "grad_norm": 1.1485865015052659, "learning_rate": 4.999970480339285e-05, "loss": 0.2361, "step": 18 }, { "epoch": 0.008644222020018199, "grad_norm": 2.1071510228354704, "learning_rate": 4.9999669053017564e-05, "loss": 0.2395, "step": 19 }, { "epoch": 0.009099181073703366, "grad_norm": 1.7585226852484823, "learning_rate": 4.999963125978223e-05, "loss": 0.1951, "step": 20 }, { "epoch": 0.009554140127388535, "grad_norm": 1.619518025852925, "learning_rate": 4.999959142368992e-05, "loss": 0.2135, "step": 21 }, { "epoch": 0.010009099181073703, "grad_norm": 7.23106820979905, "learning_rate": 4.9999549544743906e-05, "loss": 0.2357, "step": 22 }, { "epoch": 0.010464058234758872, "grad_norm": 1.0407840606007694, "learning_rate": 4.99995056229476e-05, "loss": 0.1978, "step": 23 }, { "epoch": 0.01091901728844404, "grad_norm": 1.2281492084048224, "learning_rate": 4.999945965830458e-05, "loss": 0.236, "step": 24 }, { "epoch": 0.011373976342129208, "grad_norm": 1.3164987460279216, "learning_rate": 4.9999411650818623e-05, "loss": 0.2037, "step": 25 }, { "epoch": 0.011828935395814377, "grad_norm": 1.0633028261158373, "learning_rate": 4.999936160049364e-05, "loss": 0.219, "step": 26 }, { "epoch": 0.012283894449499545, "grad_norm": 1.2941998128360934, "learning_rate": 4.999930950733372e-05, "loss": 0.2755, "step": 27 }, { "epoch": 0.012738853503184714, "grad_norm": 1.1685438468523945, "learning_rate": 4.9999255371343125e-05, "loss": 0.2103, "step": 28 }, { "epoch": 0.013193812556869881, "grad_norm": 1.1942326366823812, "learning_rate": 4.999919919252628e-05, "loss": 0.2538, "step": 29 }, { "epoch": 0.01364877161055505, "grad_norm": 1.1062150278612115, "learning_rate": 4.999914097088777e-05, "loss": 0.2381, "step": 30 }, { "epoch": 0.014103730664240218, "grad_norm": 1.37690938622929, "learning_rate": 4.9999080706432355e-05, "loss": 0.2374, "step": 31 }, { "epoch": 0.014558689717925387, "grad_norm": 1.5049840020802696, "learning_rate": 4.999901839916496e-05, "loss": 0.2026, "step": 32 }, { "epoch": 0.015013648771610554, "grad_norm": 1.269006405808121, "learning_rate": 4.9998954049090676e-05, "loss": 0.2178, "step": 33 }, { "epoch": 0.015468607825295723, "grad_norm": 1.3598431657906636, "learning_rate": 4.999888765621475e-05, "loss": 0.2078, "step": 34 }, { "epoch": 0.01592356687898089, "grad_norm": 1.2486022776853285, "learning_rate": 4.999881922054264e-05, "loss": 0.1988, "step": 35 }, { "epoch": 0.01637852593266606, "grad_norm": 1.0168849023345046, "learning_rate": 4.9998748742079904e-05, "loss": 0.17, "step": 36 }, { "epoch": 0.01683348498635123, "grad_norm": 1.3378814176122384, "learning_rate": 4.999867622083232e-05, "loss": 0.2478, "step": 37 }, { "epoch": 0.017288444040036398, "grad_norm": 1.0143269949823155, "learning_rate": 4.99986016568058e-05, "loss": 0.2275, "step": 38 }, { "epoch": 0.017743403093721567, "grad_norm": 1.5776510807323647, "learning_rate": 4.999852505000645e-05, "loss": 0.2522, "step": 39 }, { "epoch": 0.018198362147406732, "grad_norm": 0.9272338649997853, "learning_rate": 4.999844640044052e-05, "loss": 0.1526, "step": 40 }, { "epoch": 0.0186533212010919, "grad_norm": 1.3110035115076597, "learning_rate": 4.999836570811445e-05, "loss": 0.2594, "step": 41 }, { "epoch": 0.01910828025477707, "grad_norm": 1.102238378713438, "learning_rate": 4.999828297303483e-05, "loss": 0.2455, "step": 42 }, { "epoch": 0.019563239308462238, "grad_norm": 1.2400668175759613, "learning_rate": 4.9998198195208405e-05, "loss": 0.234, "step": 43 }, { "epoch": 0.020018198362147407, "grad_norm": 1.2706427374996991, "learning_rate": 4.999811137464212e-05, "loss": 0.2333, "step": 44 }, { "epoch": 0.020473157415832575, "grad_norm": 1.0227954451905796, "learning_rate": 4.999802251134307e-05, "loss": 0.2718, "step": 45 }, { "epoch": 0.020928116469517744, "grad_norm": 1.129854136221206, "learning_rate": 4.99979316053185e-05, "loss": 0.1993, "step": 46 }, { "epoch": 0.021383075523202913, "grad_norm": 1.0566869480663346, "learning_rate": 4.999783865657585e-05, "loss": 0.3201, "step": 47 }, { "epoch": 0.02183803457688808, "grad_norm": 1.3644671171523401, "learning_rate": 4.9997743665122723e-05, "loss": 0.2672, "step": 48 }, { "epoch": 0.022292993630573247, "grad_norm": 1.3371621386186787, "learning_rate": 4.999764663096686e-05, "loss": 0.2982, "step": 49 }, { "epoch": 0.022747952684258416, "grad_norm": 0.9819251723210562, "learning_rate": 4.999754755411621e-05, "loss": 0.1932, "step": 50 }, { "epoch": 0.023202911737943584, "grad_norm": 1.272139842264722, "learning_rate": 4.999744643457886e-05, "loss": 0.2352, "step": 51 }, { "epoch": 0.023657870791628753, "grad_norm": 1.4633848011272832, "learning_rate": 4.999734327236307e-05, "loss": 0.2331, "step": 52 }, { "epoch": 0.024112829845313922, "grad_norm": 0.8534385316122761, "learning_rate": 4.999723806747729e-05, "loss": 0.1465, "step": 53 }, { "epoch": 0.02456778889899909, "grad_norm": 1.0949860395099258, "learning_rate": 4.999713081993009e-05, "loss": 0.1731, "step": 54 }, { "epoch": 0.02502274795268426, "grad_norm": 1.1829726026120988, "learning_rate": 4.999702152973025e-05, "loss": 0.1986, "step": 55 }, { "epoch": 0.025477707006369428, "grad_norm": 1.4588831566316596, "learning_rate": 4.999691019688669e-05, "loss": 0.2027, "step": 56 }, { "epoch": 0.025932666060054597, "grad_norm": 0.9041150159503183, "learning_rate": 4.999679682140852e-05, "loss": 0.136, "step": 57 }, { "epoch": 0.026387625113739762, "grad_norm": 1.184894654186384, "learning_rate": 4.9996681403305e-05, "loss": 0.2959, "step": 58 }, { "epoch": 0.02684258416742493, "grad_norm": 1.7698815697030899, "learning_rate": 4.999656394258555e-05, "loss": 0.2511, "step": 59 }, { "epoch": 0.0272975432211101, "grad_norm": 0.9092461242140687, "learning_rate": 4.999644443925978e-05, "loss": 0.2103, "step": 60 }, { "epoch": 0.027752502274795268, "grad_norm": 1.1062857329941562, "learning_rate": 4.999632289333746e-05, "loss": 0.23, "step": 61 }, { "epoch": 0.028207461328480437, "grad_norm": 1.4212490673236147, "learning_rate": 4.9996199304828514e-05, "loss": 0.2075, "step": 62 }, { "epoch": 0.028662420382165606, "grad_norm": 1.5285991392407892, "learning_rate": 4.999607367374304e-05, "loss": 0.2937, "step": 63 }, { "epoch": 0.029117379435850774, "grad_norm": 1.535895466709353, "learning_rate": 4.999594600009131e-05, "loss": 0.2176, "step": 64 }, { "epoch": 0.029572338489535943, "grad_norm": 1.3542478803064237, "learning_rate": 4.999581628388375e-05, "loss": 0.2478, "step": 65 }, { "epoch": 0.03002729754322111, "grad_norm": 1.005104821194542, "learning_rate": 4.999568452513097e-05, "loss": 0.2768, "step": 66 }, { "epoch": 0.030482256596906277, "grad_norm": 1.0446720362047501, "learning_rate": 4.999555072384372e-05, "loss": 0.2457, "step": 67 }, { "epoch": 0.030937215650591446, "grad_norm": 1.2903168733267987, "learning_rate": 4.999541488003295e-05, "loss": 0.2146, "step": 68 }, { "epoch": 0.03139217470427662, "grad_norm": 0.9466626958786747, "learning_rate": 4.999527699370975e-05, "loss": 0.2266, "step": 69 }, { "epoch": 0.03184713375796178, "grad_norm": 1.0943470369291173, "learning_rate": 4.99951370648854e-05, "loss": 0.216, "step": 70 }, { "epoch": 0.03230209281164695, "grad_norm": 1.0584653377465871, "learning_rate": 4.9994995093571314e-05, "loss": 0.2074, "step": 71 }, { "epoch": 0.03275705186533212, "grad_norm": 1.3190163096713692, "learning_rate": 4.999485107977912e-05, "loss": 0.2571, "step": 72 }, { "epoch": 0.033212010919017286, "grad_norm": 1.0710531576617517, "learning_rate": 4.999470502352056e-05, "loss": 0.2183, "step": 73 }, { "epoch": 0.03366696997270246, "grad_norm": 1.171898418000334, "learning_rate": 4.9994556924807584e-05, "loss": 0.257, "step": 74 }, { "epoch": 0.034121929026387623, "grad_norm": 0.9318333327642333, "learning_rate": 4.999440678365229e-05, "loss": 0.2151, "step": 75 }, { "epoch": 0.034576888080072796, "grad_norm": 1.0540431316354797, "learning_rate": 4.999425460006695e-05, "loss": 0.2168, "step": 76 }, { "epoch": 0.03503184713375796, "grad_norm": 0.9359829647848551, "learning_rate": 4.999410037406399e-05, "loss": 0.2089, "step": 77 }, { "epoch": 0.03548680618744313, "grad_norm": 1.6371641130606778, "learning_rate": 4.999394410565603e-05, "loss": 0.3167, "step": 78 }, { "epoch": 0.0359417652411283, "grad_norm": 1.6259583096921861, "learning_rate": 4.999378579485583e-05, "loss": 0.2061, "step": 79 }, { "epoch": 0.036396724294813464, "grad_norm": 1.0290562777993864, "learning_rate": 4.9993625441676315e-05, "loss": 0.2324, "step": 80 }, { "epoch": 0.036851683348498636, "grad_norm": 0.8318367643913458, "learning_rate": 4.999346304613061e-05, "loss": 0.2347, "step": 81 }, { "epoch": 0.0373066424021838, "grad_norm": 1.4118508193951111, "learning_rate": 4.9993298608231966e-05, "loss": 0.2771, "step": 82 }, { "epoch": 0.03776160145586897, "grad_norm": 1.058471337149328, "learning_rate": 4.999313212799382e-05, "loss": 0.2237, "step": 83 }, { "epoch": 0.03821656050955414, "grad_norm": 1.4253258272976375, "learning_rate": 4.99929636054298e-05, "loss": 0.2862, "step": 84 }, { "epoch": 0.03867151956323931, "grad_norm": 1.0894164736230274, "learning_rate": 4.999279304055366e-05, "loss": 0.2713, "step": 85 }, { "epoch": 0.039126478616924476, "grad_norm": 0.93733361977436, "learning_rate": 4.999262043337933e-05, "loss": 0.2419, "step": 86 }, { "epoch": 0.03958143767060965, "grad_norm": 1.0408412273845644, "learning_rate": 4.999244578392094e-05, "loss": 0.2187, "step": 87 }, { "epoch": 0.040036396724294813, "grad_norm": 0.869634294861204, "learning_rate": 4.9992269092192734e-05, "loss": 0.2132, "step": 88 }, { "epoch": 0.04049135577797998, "grad_norm": 1.052994124344848, "learning_rate": 4.999209035820917e-05, "loss": 0.2339, "step": 89 }, { "epoch": 0.04094631483166515, "grad_norm": 0.8888036175609889, "learning_rate": 4.999190958198483e-05, "loss": 0.1959, "step": 90 }, { "epoch": 0.041401273885350316, "grad_norm": 1.1422410696009744, "learning_rate": 4.999172676353451e-05, "loss": 0.1869, "step": 91 }, { "epoch": 0.04185623293903549, "grad_norm": 0.8567024180556292, "learning_rate": 4.999154190287314e-05, "loss": 0.2041, "step": 92 }, { "epoch": 0.042311191992720654, "grad_norm": 1.1583793253948331, "learning_rate": 4.999135500001583e-05, "loss": 0.2838, "step": 93 }, { "epoch": 0.042766151046405826, "grad_norm": 0.8971093580462918, "learning_rate": 4.999116605497784e-05, "loss": 0.1783, "step": 94 }, { "epoch": 0.04322111010009099, "grad_norm": 1.1454812103959264, "learning_rate": 4.999097506777463e-05, "loss": 0.2448, "step": 95 }, { "epoch": 0.04367606915377616, "grad_norm": 1.1673198678019097, "learning_rate": 4.9990782038421794e-05, "loss": 0.2797, "step": 96 }, { "epoch": 0.04413102820746133, "grad_norm": 0.9411242321621639, "learning_rate": 4.9990586966935107e-05, "loss": 0.1617, "step": 97 }, { "epoch": 0.044585987261146494, "grad_norm": 1.1181547368420937, "learning_rate": 4.99903898533305e-05, "loss": 0.237, "step": 98 }, { "epoch": 0.045040946314831666, "grad_norm": 1.1224266668961962, "learning_rate": 4.9990190697624095e-05, "loss": 0.2244, "step": 99 }, { "epoch": 0.04549590536851683, "grad_norm": 0.9107011406834199, "learning_rate": 4.998998949983217e-05, "loss": 0.1685, "step": 100 }, { "epoch": 0.045950864422202004, "grad_norm": 1.1715429796165449, "learning_rate": 4.998978625997115e-05, "loss": 0.2729, "step": 101 }, { "epoch": 0.04640582347588717, "grad_norm": 1.1980256314734712, "learning_rate": 4.998958097805765e-05, "loss": 0.2302, "step": 102 }, { "epoch": 0.04686078252957234, "grad_norm": 1.1482917301959503, "learning_rate": 4.998937365410844e-05, "loss": 0.1985, "step": 103 }, { "epoch": 0.047315741583257506, "grad_norm": 1.1228041619613163, "learning_rate": 4.9989164288140463e-05, "loss": 0.1813, "step": 104 }, { "epoch": 0.04777070063694268, "grad_norm": 1.1989246971355767, "learning_rate": 4.998895288017085e-05, "loss": 0.2369, "step": 105 }, { "epoch": 0.048225659690627844, "grad_norm": 1.2592548396604681, "learning_rate": 4.9988739430216834e-05, "loss": 0.2173, "step": 106 }, { "epoch": 0.04868061874431301, "grad_norm": 1.087082920682345, "learning_rate": 4.9988523938295896e-05, "loss": 0.2001, "step": 107 }, { "epoch": 0.04913557779799818, "grad_norm": 1.3222813789625618, "learning_rate": 4.9988306404425625e-05, "loss": 0.2867, "step": 108 }, { "epoch": 0.049590536851683346, "grad_norm": 0.8632279272612142, "learning_rate": 4.9988086828623796e-05, "loss": 0.2089, "step": 109 }, { "epoch": 0.05004549590536852, "grad_norm": 1.5618851887465837, "learning_rate": 4.998786521090836e-05, "loss": 0.2855, "step": 110 }, { "epoch": 0.050500454959053684, "grad_norm": 1.0324519425398178, "learning_rate": 4.998764155129742e-05, "loss": 0.2673, "step": 111 }, { "epoch": 0.050955414012738856, "grad_norm": 1.6911595693690926, "learning_rate": 4.998741584980926e-05, "loss": 0.26, "step": 112 }, { "epoch": 0.05141037306642402, "grad_norm": 1.2144334923385869, "learning_rate": 4.998718810646231e-05, "loss": 0.2418, "step": 113 }, { "epoch": 0.051865332120109194, "grad_norm": 0.6311838497342976, "learning_rate": 4.99869583212752e-05, "loss": 0.1868, "step": 114 }, { "epoch": 0.05232029117379436, "grad_norm": 1.0223030232681207, "learning_rate": 4.998672649426669e-05, "loss": 0.3238, "step": 115 }, { "epoch": 0.052775250227479524, "grad_norm": 1.0506982443808877, "learning_rate": 4.998649262545574e-05, "loss": 0.2441, "step": 116 }, { "epoch": 0.053230209281164696, "grad_norm": 1.3332445373041606, "learning_rate": 4.998625671486144e-05, "loss": 0.2522, "step": 117 }, { "epoch": 0.05368516833484986, "grad_norm": 1.6324668993126747, "learning_rate": 4.998601876250308e-05, "loss": 0.236, "step": 118 }, { "epoch": 0.054140127388535034, "grad_norm": 0.9485734961244265, "learning_rate": 4.9985778768400105e-05, "loss": 0.1682, "step": 119 }, { "epoch": 0.0545950864422202, "grad_norm": 1.0098569259084629, "learning_rate": 4.998553673257212e-05, "loss": 0.2621, "step": 120 }, { "epoch": 0.05505004549590537, "grad_norm": 1.022249078413692, "learning_rate": 4.9985292655038905e-05, "loss": 0.2101, "step": 121 }, { "epoch": 0.055505004549590536, "grad_norm": 0.8802940837276332, "learning_rate": 4.9985046535820414e-05, "loss": 0.261, "step": 122 }, { "epoch": 0.05595996360327571, "grad_norm": 0.7428228951488147, "learning_rate": 4.9984798374936746e-05, "loss": 0.1769, "step": 123 }, { "epoch": 0.056414922656960874, "grad_norm": 1.26166207922868, "learning_rate": 4.998454817240819e-05, "loss": 0.228, "step": 124 }, { "epoch": 0.05686988171064604, "grad_norm": 1.0488702039329023, "learning_rate": 4.998429592825519e-05, "loss": 0.2232, "step": 125 }, { "epoch": 0.05732484076433121, "grad_norm": 1.1303588453646947, "learning_rate": 4.998404164249835e-05, "loss": 0.2916, "step": 126 }, { "epoch": 0.05777979981801638, "grad_norm": 0.9768207513960141, "learning_rate": 4.998378531515845e-05, "loss": 0.2007, "step": 127 }, { "epoch": 0.05823475887170155, "grad_norm": 1.3353829960148134, "learning_rate": 4.9983526946256445e-05, "loss": 0.2173, "step": 128 }, { "epoch": 0.058689717925386714, "grad_norm": 0.7001213335640923, "learning_rate": 4.998326653581343e-05, "loss": 0.166, "step": 129 }, { "epoch": 0.059144676979071886, "grad_norm": 1.2238624903386932, "learning_rate": 4.9983004083850715e-05, "loss": 0.2569, "step": 130 }, { "epoch": 0.05959963603275705, "grad_norm": 1.3684464189312067, "learning_rate": 4.9982739590389715e-05, "loss": 0.3189, "step": 131 }, { "epoch": 0.06005459508644222, "grad_norm": 0.7768254773526782, "learning_rate": 4.9982473055452066e-05, "loss": 0.1713, "step": 132 }, { "epoch": 0.06050955414012739, "grad_norm": 0.6728637449638813, "learning_rate": 4.9982204479059536e-05, "loss": 0.2166, "step": 133 }, { "epoch": 0.060964513193812554, "grad_norm": 1.6115148715610486, "learning_rate": 4.998193386123408e-05, "loss": 0.2796, "step": 134 }, { "epoch": 0.061419472247497726, "grad_norm": 1.108628942130757, "learning_rate": 4.99816612019978e-05, "loss": 0.2579, "step": 135 }, { "epoch": 0.06187443130118289, "grad_norm": 1.1878551775987127, "learning_rate": 4.998138650137298e-05, "loss": 0.2313, "step": 136 }, { "epoch": 0.062329390354868064, "grad_norm": 1.0455316421319232, "learning_rate": 4.998110975938207e-05, "loss": 0.2745, "step": 137 }, { "epoch": 0.06278434940855324, "grad_norm": 1.0745831940051274, "learning_rate": 4.998083097604769e-05, "loss": 0.2655, "step": 138 }, { "epoch": 0.0632393084622384, "grad_norm": 1.087020436890146, "learning_rate": 4.9980550151392615e-05, "loss": 0.1969, "step": 139 }, { "epoch": 0.06369426751592357, "grad_norm": 1.2353262431194252, "learning_rate": 4.9980267285439786e-05, "loss": 0.2487, "step": 140 }, { "epoch": 0.06414922656960874, "grad_norm": 1.004527220834801, "learning_rate": 4.997998237821233e-05, "loss": 0.2572, "step": 141 }, { "epoch": 0.0646041856232939, "grad_norm": 0.6481729806282698, "learning_rate": 4.997969542973352e-05, "loss": 0.1804, "step": 142 }, { "epoch": 0.06505914467697907, "grad_norm": 0.9165848395572888, "learning_rate": 4.997940644002681e-05, "loss": 0.2285, "step": 143 }, { "epoch": 0.06551410373066424, "grad_norm": 1.3735320351537703, "learning_rate": 4.997911540911581e-05, "loss": 0.3031, "step": 144 }, { "epoch": 0.06596906278434941, "grad_norm": 0.8961782777502066, "learning_rate": 4.99788223370243e-05, "loss": 0.1931, "step": 145 }, { "epoch": 0.06642402183803457, "grad_norm": 0.9534806680319905, "learning_rate": 4.997852722377624e-05, "loss": 0.2524, "step": 146 }, { "epoch": 0.06687898089171974, "grad_norm": 0.8781945527977916, "learning_rate": 4.997823006939573e-05, "loss": 0.2098, "step": 147 }, { "epoch": 0.06733393994540492, "grad_norm": 1.0789792103175795, "learning_rate": 4.997793087390706e-05, "loss": 0.2162, "step": 148 }, { "epoch": 0.06778889899909009, "grad_norm": 0.9657175690419708, "learning_rate": 4.997762963733468e-05, "loss": 0.2094, "step": 149 }, { "epoch": 0.06824385805277525, "grad_norm": 1.2297473059622832, "learning_rate": 4.997732635970321e-05, "loss": 0.2803, "step": 150 }, { "epoch": 0.06869881710646042, "grad_norm": 0.6893752587378985, "learning_rate": 4.997702104103742e-05, "loss": 0.2118, "step": 151 }, { "epoch": 0.06915377616014559, "grad_norm": 1.1015130942838443, "learning_rate": 4.997671368136226e-05, "loss": 0.3035, "step": 152 }, { "epoch": 0.06960873521383075, "grad_norm": 1.0061561004814914, "learning_rate": 4.997640428070286e-05, "loss": 0.2895, "step": 153 }, { "epoch": 0.07006369426751592, "grad_norm": 0.9743072677983932, "learning_rate": 4.99760928390845e-05, "loss": 0.1877, "step": 154 }, { "epoch": 0.0705186533212011, "grad_norm": 1.0058819296060422, "learning_rate": 4.997577935653261e-05, "loss": 0.2035, "step": 155 }, { "epoch": 0.07097361237488627, "grad_norm": 1.1103663912790263, "learning_rate": 4.9975463833072836e-05, "loss": 0.2053, "step": 156 }, { "epoch": 0.07142857142857142, "grad_norm": 0.7832277046257746, "learning_rate": 4.9975146268730934e-05, "loss": 0.1901, "step": 157 }, { "epoch": 0.0718835304822566, "grad_norm": 0.9268069397893121, "learning_rate": 4.997482666353287e-05, "loss": 0.1847, "step": 158 }, { "epoch": 0.07233848953594177, "grad_norm": 0.9868441187983981, "learning_rate": 4.997450501750476e-05, "loss": 0.2229, "step": 159 }, { "epoch": 0.07279344858962693, "grad_norm": 0.8267098621379654, "learning_rate": 4.9974181330672875e-05, "loss": 0.1871, "step": 160 }, { "epoch": 0.0732484076433121, "grad_norm": 1.2522404547410633, "learning_rate": 4.997385560306368e-05, "loss": 0.2523, "step": 161 }, { "epoch": 0.07370336669699727, "grad_norm": 0.918397795660698, "learning_rate": 4.997352783470378e-05, "loss": 0.216, "step": 162 }, { "epoch": 0.07415832575068244, "grad_norm": 0.9403402374937561, "learning_rate": 4.9973198025619974e-05, "loss": 0.1893, "step": 163 }, { "epoch": 0.0746132848043676, "grad_norm": 0.8198319378738574, "learning_rate": 4.997286617583919e-05, "loss": 0.1564, "step": 164 }, { "epoch": 0.07506824385805277, "grad_norm": 0.949925121676635, "learning_rate": 4.997253228538857e-05, "loss": 0.2288, "step": 165 }, { "epoch": 0.07552320291173795, "grad_norm": 1.1504182808013323, "learning_rate": 4.997219635429539e-05, "loss": 0.2445, "step": 166 }, { "epoch": 0.07597816196542312, "grad_norm": 1.0240326241437807, "learning_rate": 4.997185838258709e-05, "loss": 0.2218, "step": 167 }, { "epoch": 0.07643312101910828, "grad_norm": 1.2834368493772934, "learning_rate": 4.997151837029129e-05, "loss": 0.2349, "step": 168 }, { "epoch": 0.07688808007279345, "grad_norm": 0.8593772097880827, "learning_rate": 4.997117631743579e-05, "loss": 0.195, "step": 169 }, { "epoch": 0.07734303912647862, "grad_norm": 1.1249976489970943, "learning_rate": 4.997083222404852e-05, "loss": 0.229, "step": 170 }, { "epoch": 0.07779799818016378, "grad_norm": 0.9365739267800142, "learning_rate": 4.997048609015762e-05, "loss": 0.1726, "step": 171 }, { "epoch": 0.07825295723384895, "grad_norm": 1.1274650771481407, "learning_rate": 4.997013791579136e-05, "loss": 0.2207, "step": 172 }, { "epoch": 0.07870791628753412, "grad_norm": 1.0674007035527455, "learning_rate": 4.996978770097819e-05, "loss": 0.2367, "step": 173 }, { "epoch": 0.0791628753412193, "grad_norm": 0.98369179113072, "learning_rate": 4.996943544574672e-05, "loss": 0.2159, "step": 174 }, { "epoch": 0.07961783439490445, "grad_norm": 0.9766441420396662, "learning_rate": 4.9969081150125765e-05, "loss": 0.2068, "step": 175 }, { "epoch": 0.08007279344858963, "grad_norm": 1.0882037334216006, "learning_rate": 4.996872481414425e-05, "loss": 0.2406, "step": 176 }, { "epoch": 0.0805277525022748, "grad_norm": 1.3698592883299627, "learning_rate": 4.9968366437831305e-05, "loss": 0.2928, "step": 177 }, { "epoch": 0.08098271155595996, "grad_norm": 0.6987520632973869, "learning_rate": 4.99680060212162e-05, "loss": 0.1764, "step": 178 }, { "epoch": 0.08143767060964513, "grad_norm": 1.3027058622416998, "learning_rate": 4.9967643564328407e-05, "loss": 0.2724, "step": 179 }, { "epoch": 0.0818926296633303, "grad_norm": 0.8947764994248179, "learning_rate": 4.996727906719754e-05, "loss": 0.2063, "step": 180 }, { "epoch": 0.08234758871701547, "grad_norm": 0.9763597484914425, "learning_rate": 4.996691252985336e-05, "loss": 0.2028, "step": 181 }, { "epoch": 0.08280254777070063, "grad_norm": 0.9150363076013049, "learning_rate": 4.996654395232585e-05, "loss": 0.2276, "step": 182 }, { "epoch": 0.0832575068243858, "grad_norm": 0.9103497357752012, "learning_rate": 4.9966173334645115e-05, "loss": 0.2185, "step": 183 }, { "epoch": 0.08371246587807098, "grad_norm": 0.8247430263974811, "learning_rate": 4.9965800676841445e-05, "loss": 0.2071, "step": 184 }, { "epoch": 0.08416742493175614, "grad_norm": 0.7480935933799238, "learning_rate": 4.996542597894529e-05, "loss": 0.235, "step": 185 }, { "epoch": 0.08462238398544131, "grad_norm": 0.7351885920182925, "learning_rate": 4.996504924098726e-05, "loss": 0.2277, "step": 186 }, { "epoch": 0.08507734303912648, "grad_norm": 0.6852087113550299, "learning_rate": 4.996467046299814e-05, "loss": 0.1962, "step": 187 }, { "epoch": 0.08553230209281165, "grad_norm": 0.7911136395423962, "learning_rate": 4.9964289645008896e-05, "loss": 0.2893, "step": 188 }, { "epoch": 0.08598726114649681, "grad_norm": 1.0822147942792688, "learning_rate": 4.996390678705065e-05, "loss": 0.3031, "step": 189 }, { "epoch": 0.08644222020018198, "grad_norm": 0.8503276338672248, "learning_rate": 4.996352188915467e-05, "loss": 0.2383, "step": 190 }, { "epoch": 0.08689717925386715, "grad_norm": 1.2786178320902941, "learning_rate": 4.9963134951352416e-05, "loss": 0.312, "step": 191 }, { "epoch": 0.08735213830755233, "grad_norm": 1.0179661012144667, "learning_rate": 4.99627459736755e-05, "loss": 0.2831, "step": 192 }, { "epoch": 0.08780709736123748, "grad_norm": 1.0898701286404884, "learning_rate": 4.996235495615572e-05, "loss": 0.2295, "step": 193 }, { "epoch": 0.08826205641492266, "grad_norm": 1.3217944667311619, "learning_rate": 4.996196189882502e-05, "loss": 0.2423, "step": 194 }, { "epoch": 0.08871701546860783, "grad_norm": 1.4153844083331932, "learning_rate": 4.996156680171552e-05, "loss": 0.3296, "step": 195 }, { "epoch": 0.08917197452229299, "grad_norm": 1.0161813818526169, "learning_rate": 4.996116966485951e-05, "loss": 0.242, "step": 196 }, { "epoch": 0.08962693357597816, "grad_norm": 1.1502736682331154, "learning_rate": 4.996077048828944e-05, "loss": 0.3, "step": 197 }, { "epoch": 0.09008189262966333, "grad_norm": 0.8340955562868464, "learning_rate": 4.996036927203792e-05, "loss": 0.2298, "step": 198 }, { "epoch": 0.0905368516833485, "grad_norm": 1.178331791996245, "learning_rate": 4.995996601613775e-05, "loss": 0.2546, "step": 199 }, { "epoch": 0.09099181073703366, "grad_norm": 0.9820106506239689, "learning_rate": 4.995956072062187e-05, "loss": 0.2085, "step": 200 }, { "epoch": 0.09144676979071883, "grad_norm": 1.524177968916044, "learning_rate": 4.995915338552341e-05, "loss": 0.2749, "step": 201 }, { "epoch": 0.09190172884440401, "grad_norm": 2.7908020909742537, "learning_rate": 4.9958744010875646e-05, "loss": 0.2282, "step": 202 }, { "epoch": 0.09235668789808917, "grad_norm": 1.161533853858501, "learning_rate": 4.995833259671203e-05, "loss": 0.272, "step": 203 }, { "epoch": 0.09281164695177434, "grad_norm": 1.3305550414451668, "learning_rate": 4.995791914306619e-05, "loss": 0.2615, "step": 204 }, { "epoch": 0.09326660600545951, "grad_norm": 0.9446324909125462, "learning_rate": 4.995750364997192e-05, "loss": 0.1978, "step": 205 }, { "epoch": 0.09372156505914468, "grad_norm": 2.025455569978896, "learning_rate": 4.995708611746314e-05, "loss": 0.2749, "step": 206 }, { "epoch": 0.09417652411282984, "grad_norm": 1.0190667998091618, "learning_rate": 4.995666654557399e-05, "loss": 0.2572, "step": 207 }, { "epoch": 0.09463148316651501, "grad_norm": 0.9876461146649904, "learning_rate": 4.9956244934338756e-05, "loss": 0.267, "step": 208 }, { "epoch": 0.09508644222020018, "grad_norm": 0.9085146041356152, "learning_rate": 4.9955821283791895e-05, "loss": 0.2624, "step": 209 }, { "epoch": 0.09554140127388536, "grad_norm": 1.0259809279791963, "learning_rate": 4.9955395593968e-05, "loss": 0.3108, "step": 210 }, { "epoch": 0.09599636032757052, "grad_norm": 1.2613005448897872, "learning_rate": 4.995496786490189e-05, "loss": 0.284, "step": 211 }, { "epoch": 0.09645131938125569, "grad_norm": 0.9490464963042589, "learning_rate": 4.9954538096628504e-05, "loss": 0.2577, "step": 212 }, { "epoch": 0.09690627843494086, "grad_norm": 1.094140068752516, "learning_rate": 4.995410628918294e-05, "loss": 0.2277, "step": 213 }, { "epoch": 0.09736123748862602, "grad_norm": 0.9564858342611784, "learning_rate": 4.995367244260052e-05, "loss": 0.252, "step": 214 }, { "epoch": 0.09781619654231119, "grad_norm": 0.9326203499108876, "learning_rate": 4.9953236556916675e-05, "loss": 0.1728, "step": 215 }, { "epoch": 0.09827115559599636, "grad_norm": 1.13491356573659, "learning_rate": 4.9952798632167016e-05, "loss": 0.2961, "step": 216 }, { "epoch": 0.09872611464968153, "grad_norm": 0.9866062616028243, "learning_rate": 4.995235866838735e-05, "loss": 0.2416, "step": 217 }, { "epoch": 0.09918107370336669, "grad_norm": 0.9499384498938079, "learning_rate": 4.995191666561361e-05, "loss": 0.2232, "step": 218 }, { "epoch": 0.09963603275705187, "grad_norm": 0.8667087934433083, "learning_rate": 4.995147262388192e-05, "loss": 0.166, "step": 219 }, { "epoch": 0.10009099181073704, "grad_norm": 1.2643430421929842, "learning_rate": 4.9951026543228576e-05, "loss": 0.2608, "step": 220 }, { "epoch": 0.1005459508644222, "grad_norm": 0.8444091468054182, "learning_rate": 4.995057842369002e-05, "loss": 0.1427, "step": 221 }, { "epoch": 0.10100090991810737, "grad_norm": 1.0191236814021594, "learning_rate": 4.995012826530287e-05, "loss": 0.2715, "step": 222 }, { "epoch": 0.10145586897179254, "grad_norm": 0.9113183017750295, "learning_rate": 4.9949676068103904e-05, "loss": 0.2562, "step": 223 }, { "epoch": 0.10191082802547771, "grad_norm": 0.9633172136168865, "learning_rate": 4.994922183213009e-05, "loss": 0.2465, "step": 224 }, { "epoch": 0.10236578707916287, "grad_norm": 0.8031972085872137, "learning_rate": 4.994876555741853e-05, "loss": 0.1843, "step": 225 }, { "epoch": 0.10282074613284804, "grad_norm": 1.223758887068779, "learning_rate": 4.994830724400652e-05, "loss": 0.3388, "step": 226 }, { "epoch": 0.10327570518653321, "grad_norm": 0.8627575799289282, "learning_rate": 4.9947846891931517e-05, "loss": 0.1879, "step": 227 }, { "epoch": 0.10373066424021839, "grad_norm": 0.9083456008163456, "learning_rate": 4.9947384501231115e-05, "loss": 0.2431, "step": 228 }, { "epoch": 0.10418562329390355, "grad_norm": 0.967227861205465, "learning_rate": 4.994692007194312e-05, "loss": 0.2291, "step": 229 }, { "epoch": 0.10464058234758872, "grad_norm": 0.6397850394521112, "learning_rate": 4.9946453604105475e-05, "loss": 0.172, "step": 230 }, { "epoch": 0.10509554140127389, "grad_norm": 0.8612207427346141, "learning_rate": 4.99459850977563e-05, "loss": 0.193, "step": 231 }, { "epoch": 0.10555050045495905, "grad_norm": 1.2587193005638013, "learning_rate": 4.9945514552933875e-05, "loss": 0.2351, "step": 232 }, { "epoch": 0.10600545950864422, "grad_norm": 0.8448212904705226, "learning_rate": 4.994504196967665e-05, "loss": 0.2292, "step": 233 }, { "epoch": 0.10646041856232939, "grad_norm": 0.9479092781594234, "learning_rate": 4.994456734802325e-05, "loss": 0.2009, "step": 234 }, { "epoch": 0.10691537761601456, "grad_norm": 0.7949133571568955, "learning_rate": 4.994409068801246e-05, "loss": 0.2409, "step": 235 }, { "epoch": 0.10737033666969972, "grad_norm": 0.6674087296660387, "learning_rate": 4.994361198968323e-05, "loss": 0.1628, "step": 236 }, { "epoch": 0.1078252957233849, "grad_norm": 1.201564983604896, "learning_rate": 4.994313125307466e-05, "loss": 0.2677, "step": 237 }, { "epoch": 0.10828025477707007, "grad_norm": 1.0431680248749846, "learning_rate": 4.994264847822605e-05, "loss": 0.289, "step": 238 }, { "epoch": 0.10873521383075523, "grad_norm": 0.9653424594034078, "learning_rate": 4.994216366517685e-05, "loss": 0.1968, "step": 239 }, { "epoch": 0.1091901728844404, "grad_norm": 0.9125387759488122, "learning_rate": 4.994167681396666e-05, "loss": 0.2617, "step": 240 }, { "epoch": 0.10964513193812557, "grad_norm": 0.9352652381091711, "learning_rate": 4.9941187924635294e-05, "loss": 0.2033, "step": 241 }, { "epoch": 0.11010009099181074, "grad_norm": 0.8486085196314103, "learning_rate": 4.9940696997222667e-05, "loss": 0.2521, "step": 242 }, { "epoch": 0.1105550500454959, "grad_norm": 0.7169086937032756, "learning_rate": 4.994020403176893e-05, "loss": 0.1947, "step": 243 }, { "epoch": 0.11101000909918107, "grad_norm": 0.7848172375628268, "learning_rate": 4.993970902831434e-05, "loss": 0.2079, "step": 244 }, { "epoch": 0.11146496815286625, "grad_norm": 1.1215727046768524, "learning_rate": 4.993921198689935e-05, "loss": 0.1722, "step": 245 }, { "epoch": 0.11191992720655142, "grad_norm": 0.8394128217152594, "learning_rate": 4.993871290756459e-05, "loss": 0.2277, "step": 246 }, { "epoch": 0.11237488626023658, "grad_norm": 1.0300622720949144, "learning_rate": 4.993821179035083e-05, "loss": 0.2363, "step": 247 }, { "epoch": 0.11282984531392175, "grad_norm": 1.0103941806691277, "learning_rate": 4.993770863529902e-05, "loss": 0.2604, "step": 248 }, { "epoch": 0.11328480436760692, "grad_norm": 1.5350533138761926, "learning_rate": 4.9937203442450284e-05, "loss": 0.2361, "step": 249 }, { "epoch": 0.11373976342129208, "grad_norm": 0.8993849316410959, "learning_rate": 4.993669621184589e-05, "loss": 0.2132, "step": 250 }, { "epoch": 0.11419472247497725, "grad_norm": 1.0288784473172377, "learning_rate": 4.99361869435273e-05, "loss": 0.1867, "step": 251 }, { "epoch": 0.11464968152866242, "grad_norm": 0.9642598487133237, "learning_rate": 4.993567563753613e-05, "loss": 0.2327, "step": 252 }, { "epoch": 0.1151046405823476, "grad_norm": 0.9627848388659644, "learning_rate": 4.9935162293914136e-05, "loss": 0.1813, "step": 253 }, { "epoch": 0.11555959963603275, "grad_norm": 1.1969772124808065, "learning_rate": 4.993464691270331e-05, "loss": 0.2246, "step": 254 }, { "epoch": 0.11601455868971793, "grad_norm": 0.8925549446282988, "learning_rate": 4.9934129493945724e-05, "loss": 0.2193, "step": 255 }, { "epoch": 0.1164695177434031, "grad_norm": 0.9009083702443086, "learning_rate": 4.993361003768369e-05, "loss": 0.1619, "step": 256 }, { "epoch": 0.11692447679708826, "grad_norm": 0.926318675344456, "learning_rate": 4.9933088543959624e-05, "loss": 0.2685, "step": 257 }, { "epoch": 0.11737943585077343, "grad_norm": 0.9816855771807739, "learning_rate": 4.993256501281618e-05, "loss": 0.2272, "step": 258 }, { "epoch": 0.1178343949044586, "grad_norm": 0.8392667690733058, "learning_rate": 4.9932039444296105e-05, "loss": 0.2492, "step": 259 }, { "epoch": 0.11828935395814377, "grad_norm": 1.189859214740981, "learning_rate": 4.9931511838442364e-05, "loss": 0.2537, "step": 260 }, { "epoch": 0.11874431301182893, "grad_norm": 0.7328258163767538, "learning_rate": 4.993098219529807e-05, "loss": 0.2301, "step": 261 }, { "epoch": 0.1191992720655141, "grad_norm": 0.9123068157116897, "learning_rate": 4.9930450514906486e-05, "loss": 0.2432, "step": 262 }, { "epoch": 0.11965423111919928, "grad_norm": 1.0391414474764322, "learning_rate": 4.992991679731108e-05, "loss": 0.2477, "step": 263 }, { "epoch": 0.12010919017288443, "grad_norm": 0.9268303743602482, "learning_rate": 4.992938104255545e-05, "loss": 0.215, "step": 264 }, { "epoch": 0.1205641492265696, "grad_norm": 1.013893387467843, "learning_rate": 4.9928843250683385e-05, "loss": 0.2857, "step": 265 }, { "epoch": 0.12101910828025478, "grad_norm": 0.8602676777999626, "learning_rate": 4.9928303421738825e-05, "loss": 0.2401, "step": 266 }, { "epoch": 0.12147406733393995, "grad_norm": 0.6803832120130948, "learning_rate": 4.992776155576588e-05, "loss": 0.1871, "step": 267 }, { "epoch": 0.12192902638762511, "grad_norm": 0.6598469160631033, "learning_rate": 4.9927217652808847e-05, "loss": 0.174, "step": 268 }, { "epoch": 0.12238398544131028, "grad_norm": 1.2973520906729556, "learning_rate": 4.992667171291215e-05, "loss": 0.2946, "step": 269 }, { "epoch": 0.12283894449499545, "grad_norm": 1.2009193015365256, "learning_rate": 4.992612373612041e-05, "loss": 0.245, "step": 270 }, { "epoch": 0.12329390354868063, "grad_norm": 1.1244329576372483, "learning_rate": 4.99255737224784e-05, "loss": 0.2643, "step": 271 }, { "epoch": 0.12374886260236578, "grad_norm": 0.6441013893547167, "learning_rate": 4.9925021672031066e-05, "loss": 0.1954, "step": 272 }, { "epoch": 0.12420382165605096, "grad_norm": 1.0576659351271296, "learning_rate": 4.992446758482353e-05, "loss": 0.2442, "step": 273 }, { "epoch": 0.12465878070973613, "grad_norm": 0.9585929099664303, "learning_rate": 4.992391146090106e-05, "loss": 0.2368, "step": 274 }, { "epoch": 0.1251137397634213, "grad_norm": 0.7049091722308071, "learning_rate": 4.9923353300309096e-05, "loss": 0.1842, "step": 275 }, { "epoch": 0.12556869881710647, "grad_norm": 0.7320434028517127, "learning_rate": 4.992279310309326e-05, "loss": 0.2441, "step": 276 }, { "epoch": 0.12602365787079162, "grad_norm": 1.0894014477962464, "learning_rate": 4.992223086929931e-05, "loss": 0.3448, "step": 277 }, { "epoch": 0.1264786169244768, "grad_norm": 0.9204993506670345, "learning_rate": 4.99216665989732e-05, "loss": 0.242, "step": 278 }, { "epoch": 0.12693357597816196, "grad_norm": 0.940564609463127, "learning_rate": 4.992110029216106e-05, "loss": 0.2307, "step": 279 }, { "epoch": 0.12738853503184713, "grad_norm": 0.7446733630340908, "learning_rate": 4.992053194890913e-05, "loss": 0.1732, "step": 280 }, { "epoch": 0.1278434940855323, "grad_norm": 0.9067882861735025, "learning_rate": 4.991996156926387e-05, "loss": 0.2074, "step": 281 }, { "epoch": 0.12829845313921748, "grad_norm": 1.1603498037062319, "learning_rate": 4.99193891532719e-05, "loss": 0.2815, "step": 282 }, { "epoch": 0.12875341219290265, "grad_norm": 0.609720699641334, "learning_rate": 4.9918814700979977e-05, "loss": 0.25, "step": 283 }, { "epoch": 0.1292083712465878, "grad_norm": 0.9279501297825673, "learning_rate": 4.9918238212435046e-05, "loss": 0.2151, "step": 284 }, { "epoch": 0.12966333030027297, "grad_norm": 1.145147710706986, "learning_rate": 4.991765968768422e-05, "loss": 0.2294, "step": 285 }, { "epoch": 0.13011828935395814, "grad_norm": 1.0963594661629925, "learning_rate": 4.991707912677478e-05, "loss": 0.2154, "step": 286 }, { "epoch": 0.1305732484076433, "grad_norm": 0.8368215676876812, "learning_rate": 4.991649652975414e-05, "loss": 0.1895, "step": 287 }, { "epoch": 0.13102820746132848, "grad_norm": 1.2728391512581174, "learning_rate": 4.991591189666994e-05, "loss": 0.336, "step": 288 }, { "epoch": 0.13148316651501366, "grad_norm": 0.9217489919301896, "learning_rate": 4.991532522756993e-05, "loss": 0.2335, "step": 289 }, { "epoch": 0.13193812556869883, "grad_norm": 0.7733762857858478, "learning_rate": 4.991473652250207e-05, "loss": 0.2109, "step": 290 }, { "epoch": 0.13239308462238397, "grad_norm": 0.9885925779386018, "learning_rate": 4.991414578151444e-05, "loss": 0.1912, "step": 291 }, { "epoch": 0.13284804367606914, "grad_norm": 1.1103839302767298, "learning_rate": 4.991355300465534e-05, "loss": 0.2717, "step": 292 }, { "epoch": 0.13330300272975432, "grad_norm": 1.115391456880468, "learning_rate": 4.99129581919732e-05, "loss": 0.2988, "step": 293 }, { "epoch": 0.1337579617834395, "grad_norm": 1.0310643412765783, "learning_rate": 4.991236134351661e-05, "loss": 0.1864, "step": 294 }, { "epoch": 0.13421292083712466, "grad_norm": 0.9946918597809171, "learning_rate": 4.991176245933437e-05, "loss": 0.2637, "step": 295 }, { "epoch": 0.13466787989080983, "grad_norm": 0.9411380293671334, "learning_rate": 4.991116153947539e-05, "loss": 0.2349, "step": 296 }, { "epoch": 0.135122838944495, "grad_norm": 1.0955222841539587, "learning_rate": 4.9910558583988784e-05, "loss": 0.2716, "step": 297 }, { "epoch": 0.13557779799818018, "grad_norm": 1.1869760751324405, "learning_rate": 4.9909953592923835e-05, "loss": 0.2991, "step": 298 }, { "epoch": 0.13603275705186532, "grad_norm": 0.8305527348011584, "learning_rate": 4.990934656632997e-05, "loss": 0.2322, "step": 299 }, { "epoch": 0.1364877161055505, "grad_norm": 1.0771681383539558, "learning_rate": 4.9908737504256786e-05, "loss": 0.282, "step": 300 }, { "epoch": 0.13694267515923567, "grad_norm": 1.0219579324950732, "learning_rate": 4.9908126406754066e-05, "loss": 0.1965, "step": 301 }, { "epoch": 0.13739763421292084, "grad_norm": 1.0882518015247715, "learning_rate": 4.9907513273871744e-05, "loss": 0.2154, "step": 302 }, { "epoch": 0.137852593266606, "grad_norm": 1.0267367119820836, "learning_rate": 4.99068981056599e-05, "loss": 0.2081, "step": 303 }, { "epoch": 0.13830755232029118, "grad_norm": 1.0586655405854108, "learning_rate": 4.990628090216885e-05, "loss": 0.2764, "step": 304 }, { "epoch": 0.13876251137397635, "grad_norm": 1.1488116625073979, "learning_rate": 4.990566166344898e-05, "loss": 0.2099, "step": 305 }, { "epoch": 0.1392174704276615, "grad_norm": 1.0108872679147511, "learning_rate": 4.9905040389550913e-05, "loss": 0.219, "step": 306 }, { "epoch": 0.13967242948134667, "grad_norm": 1.128979049955556, "learning_rate": 4.9904417080525426e-05, "loss": 0.2271, "step": 307 }, { "epoch": 0.14012738853503184, "grad_norm": 0.9184211601572116, "learning_rate": 4.990379173642343e-05, "loss": 0.2153, "step": 308 }, { "epoch": 0.14058234758871702, "grad_norm": 0.7498191369944927, "learning_rate": 4.990316435729604e-05, "loss": 0.1545, "step": 309 }, { "epoch": 0.1410373066424022, "grad_norm": 0.7359338352556679, "learning_rate": 4.990253494319453e-05, "loss": 0.2569, "step": 310 }, { "epoch": 0.14149226569608736, "grad_norm": 1.1586293882987138, "learning_rate": 4.990190349417032e-05, "loss": 0.3302, "step": 311 }, { "epoch": 0.14194722474977253, "grad_norm": 1.0303817169406913, "learning_rate": 4.990127001027501e-05, "loss": 0.2244, "step": 312 }, { "epoch": 0.14240218380345768, "grad_norm": 0.9208675144447142, "learning_rate": 4.9900634491560366e-05, "loss": 0.2853, "step": 313 }, { "epoch": 0.14285714285714285, "grad_norm": 1.2080650894164364, "learning_rate": 4.989999693807832e-05, "loss": 0.2598, "step": 314 }, { "epoch": 0.14331210191082802, "grad_norm": 1.3176583061924925, "learning_rate": 4.989935734988098e-05, "loss": 0.2665, "step": 315 }, { "epoch": 0.1437670609645132, "grad_norm": 1.155936755367866, "learning_rate": 4.9898715727020594e-05, "loss": 0.2152, "step": 316 }, { "epoch": 0.14422202001819837, "grad_norm": 1.244015310208092, "learning_rate": 4.9898072069549604e-05, "loss": 0.3472, "step": 317 }, { "epoch": 0.14467697907188354, "grad_norm": 0.6619110373379009, "learning_rate": 4.9897426377520605e-05, "loss": 0.1743, "step": 318 }, { "epoch": 0.1451319381255687, "grad_norm": 0.863849098825801, "learning_rate": 4.989677865098635e-05, "loss": 0.2153, "step": 319 }, { "epoch": 0.14558689717925385, "grad_norm": 0.7649715189892816, "learning_rate": 4.989612888999978e-05, "loss": 0.1691, "step": 320 }, { "epoch": 0.14604185623293903, "grad_norm": 0.8713448612177858, "learning_rate": 4.9895477094613994e-05, "loss": 0.2083, "step": 321 }, { "epoch": 0.1464968152866242, "grad_norm": 0.728985195295809, "learning_rate": 4.989482326488224e-05, "loss": 0.203, "step": 322 }, { "epoch": 0.14695177434030937, "grad_norm": 0.888181720232883, "learning_rate": 4.989416740085796e-05, "loss": 0.2295, "step": 323 }, { "epoch": 0.14740673339399454, "grad_norm": 1.0922587657288432, "learning_rate": 4.9893509502594735e-05, "loss": 0.2684, "step": 324 }, { "epoch": 0.14786169244767972, "grad_norm": 0.9971220529421283, "learning_rate": 4.989284957014633e-05, "loss": 0.2617, "step": 325 }, { "epoch": 0.1483166515013649, "grad_norm": 0.9870448743720823, "learning_rate": 4.989218760356668e-05, "loss": 0.2318, "step": 326 }, { "epoch": 0.14877161055505003, "grad_norm": 0.9018589981975008, "learning_rate": 4.9891523602909864e-05, "loss": 0.2247, "step": 327 }, { "epoch": 0.1492265696087352, "grad_norm": 0.8598483031018114, "learning_rate": 4.989085756823015e-05, "loss": 0.2632, "step": 328 }, { "epoch": 0.14968152866242038, "grad_norm": 0.8643781078050121, "learning_rate": 4.9890189499581966e-05, "loss": 0.2734, "step": 329 }, { "epoch": 0.15013648771610555, "grad_norm": 0.7687464204962443, "learning_rate": 4.9889519397019897e-05, "loss": 0.1697, "step": 330 }, { "epoch": 0.15059144676979072, "grad_norm": 1.0106162938009775, "learning_rate": 4.98888472605987e-05, "loss": 0.2288, "step": 331 }, { "epoch": 0.1510464058234759, "grad_norm": 1.0671673871836995, "learning_rate": 4.98881730903733e-05, "loss": 0.2301, "step": 332 }, { "epoch": 0.15150136487716107, "grad_norm": 0.8400240886557742, "learning_rate": 4.98874968863988e-05, "loss": 0.2399, "step": 333 }, { "epoch": 0.15195632393084624, "grad_norm": 0.9664168166027034, "learning_rate": 4.988681864873044e-05, "loss": 0.2194, "step": 334 }, { "epoch": 0.15241128298453138, "grad_norm": 0.969020664042685, "learning_rate": 4.988613837742364e-05, "loss": 0.2592, "step": 335 }, { "epoch": 0.15286624203821655, "grad_norm": 1.0823638241901714, "learning_rate": 4.988545607253401e-05, "loss": 0.2276, "step": 336 }, { "epoch": 0.15332120109190173, "grad_norm": 1.2217498650546794, "learning_rate": 4.9884771734117283e-05, "loss": 0.2991, "step": 337 }, { "epoch": 0.1537761601455869, "grad_norm": 0.7501090403731316, "learning_rate": 4.988408536222939e-05, "loss": 0.1973, "step": 338 }, { "epoch": 0.15423111919927207, "grad_norm": 0.8921758076703502, "learning_rate": 4.988339695692641e-05, "loss": 0.2525, "step": 339 }, { "epoch": 0.15468607825295724, "grad_norm": 0.9226622299381118, "learning_rate": 4.988270651826461e-05, "loss": 0.2292, "step": 340 }, { "epoch": 0.15514103730664242, "grad_norm": 1.152103512210007, "learning_rate": 4.9882014046300406e-05, "loss": 0.2679, "step": 341 }, { "epoch": 0.15559599636032756, "grad_norm": 0.9363582022115696, "learning_rate": 4.988131954109038e-05, "loss": 0.2523, "step": 342 }, { "epoch": 0.15605095541401273, "grad_norm": 0.7051604140254526, "learning_rate": 4.988062300269128e-05, "loss": 0.2668, "step": 343 }, { "epoch": 0.1565059144676979, "grad_norm": 0.784310468103235, "learning_rate": 4.987992443116003e-05, "loss": 0.1844, "step": 344 }, { "epoch": 0.15696087352138308, "grad_norm": 1.081270224193415, "learning_rate": 4.9879223826553715e-05, "loss": 0.2158, "step": 345 }, { "epoch": 0.15741583257506825, "grad_norm": 0.9309294315799115, "learning_rate": 4.987852118892957e-05, "loss": 0.3169, "step": 346 }, { "epoch": 0.15787079162875342, "grad_norm": 0.8582087279169507, "learning_rate": 4.987781651834503e-05, "loss": 0.2792, "step": 347 }, { "epoch": 0.1583257506824386, "grad_norm": 0.9222514197836991, "learning_rate": 4.9877109814857684e-05, "loss": 0.2174, "step": 348 }, { "epoch": 0.15878070973612374, "grad_norm": 1.2123517073269288, "learning_rate": 4.987640107852525e-05, "loss": 0.2949, "step": 349 }, { "epoch": 0.1592356687898089, "grad_norm": 0.8447356256741352, "learning_rate": 4.987569030940567e-05, "loss": 0.2381, "step": 350 }, { "epoch": 0.15969062784349408, "grad_norm": 0.748932841941387, "learning_rate": 4.987497750755702e-05, "loss": 0.1292, "step": 351 }, { "epoch": 0.16014558689717925, "grad_norm": 0.898610169466206, "learning_rate": 4.9874262673037534e-05, "loss": 0.2873, "step": 352 }, { "epoch": 0.16060054595086443, "grad_norm": 1.0240724236709602, "learning_rate": 4.987354580590563e-05, "loss": 0.2417, "step": 353 }, { "epoch": 0.1610555050045496, "grad_norm": 0.8622421630738083, "learning_rate": 4.98728269062199e-05, "loss": 0.2551, "step": 354 }, { "epoch": 0.16151046405823477, "grad_norm": 0.8134532647830077, "learning_rate": 4.987210597403907e-05, "loss": 0.2174, "step": 355 }, { "epoch": 0.16196542311191992, "grad_norm": 0.9534453999496592, "learning_rate": 4.987138300942208e-05, "loss": 0.2366, "step": 356 }, { "epoch": 0.1624203821656051, "grad_norm": 1.0371844945284734, "learning_rate": 4.9870658012427974e-05, "loss": 0.2335, "step": 357 }, { "epoch": 0.16287534121929026, "grad_norm": 0.9004202299256066, "learning_rate": 4.986993098311601e-05, "loss": 0.242, "step": 358 }, { "epoch": 0.16333030027297543, "grad_norm": 0.9781473802486437, "learning_rate": 4.9869201921545605e-05, "loss": 0.2204, "step": 359 }, { "epoch": 0.1637852593266606, "grad_norm": 0.8274281201466194, "learning_rate": 4.986847082777632e-05, "loss": 0.2348, "step": 360 }, { "epoch": 0.16424021838034578, "grad_norm": 0.9334603870585367, "learning_rate": 4.9867737701867904e-05, "loss": 0.2563, "step": 361 }, { "epoch": 0.16469517743403095, "grad_norm": 0.9312184132525261, "learning_rate": 4.986700254388027e-05, "loss": 0.3632, "step": 362 }, { "epoch": 0.1651501364877161, "grad_norm": 0.8232839110015855, "learning_rate": 4.9866265353873484e-05, "loss": 0.2639, "step": 363 }, { "epoch": 0.16560509554140126, "grad_norm": 1.2193684853775268, "learning_rate": 4.9865526131907794e-05, "loss": 0.2573, "step": 364 }, { "epoch": 0.16606005459508644, "grad_norm": 0.9585919856440537, "learning_rate": 4.986478487804359e-05, "loss": 0.2898, "step": 365 }, { "epoch": 0.1665150136487716, "grad_norm": 1.1391585911626447, "learning_rate": 4.986404159234146e-05, "loss": 0.2924, "step": 366 }, { "epoch": 0.16696997270245678, "grad_norm": 0.8553657475720432, "learning_rate": 4.9863296274862134e-05, "loss": 0.2228, "step": 367 }, { "epoch": 0.16742493175614195, "grad_norm": 0.7763035651416975, "learning_rate": 4.9862548925666516e-05, "loss": 0.2544, "step": 368 }, { "epoch": 0.16787989080982713, "grad_norm": 0.9598552659524475, "learning_rate": 4.986179954481568e-05, "loss": 0.2266, "step": 369 }, { "epoch": 0.16833484986351227, "grad_norm": 0.7915076916881143, "learning_rate": 4.986104813237086e-05, "loss": 0.2141, "step": 370 }, { "epoch": 0.16878980891719744, "grad_norm": 0.93062460876264, "learning_rate": 4.986029468839346e-05, "loss": 0.2217, "step": 371 }, { "epoch": 0.16924476797088261, "grad_norm": 0.9695657860712834, "learning_rate": 4.985953921294505e-05, "loss": 0.3365, "step": 372 }, { "epoch": 0.1696997270245678, "grad_norm": 1.242731787426362, "learning_rate": 4.9858781706087355e-05, "loss": 0.2818, "step": 373 }, { "epoch": 0.17015468607825296, "grad_norm": 0.8957753411556577, "learning_rate": 4.985802216788228e-05, "loss": 0.2692, "step": 374 }, { "epoch": 0.17060964513193813, "grad_norm": 1.0273281941675514, "learning_rate": 4.985726059839189e-05, "loss": 0.2713, "step": 375 }, { "epoch": 0.1710646041856233, "grad_norm": 0.5842102943211992, "learning_rate": 4.985649699767842e-05, "loss": 0.1419, "step": 376 }, { "epoch": 0.17151956323930848, "grad_norm": 0.7692282188943281, "learning_rate": 4.985573136580427e-05, "loss": 0.2284, "step": 377 }, { "epoch": 0.17197452229299362, "grad_norm": 1.4274488633088522, "learning_rate": 4.985496370283199e-05, "loss": 0.2241, "step": 378 }, { "epoch": 0.1724294813466788, "grad_norm": 0.9628915318446656, "learning_rate": 4.9854194008824326e-05, "loss": 0.2869, "step": 379 }, { "epoch": 0.17288444040036396, "grad_norm": 0.9678778944028185, "learning_rate": 4.9853422283844176e-05, "loss": 0.2386, "step": 380 }, { "epoch": 0.17333939945404914, "grad_norm": 1.2851973245888821, "learning_rate": 4.985264852795459e-05, "loss": 0.3403, "step": 381 }, { "epoch": 0.1737943585077343, "grad_norm": 1.1512549644049486, "learning_rate": 4.98518727412188e-05, "loss": 0.272, "step": 382 }, { "epoch": 0.17424931756141948, "grad_norm": 1.1034128519120743, "learning_rate": 4.9851094923700194e-05, "loss": 0.2314, "step": 383 }, { "epoch": 0.17470427661510465, "grad_norm": 1.0792900244616377, "learning_rate": 4.985031507546234e-05, "loss": 0.2241, "step": 384 }, { "epoch": 0.1751592356687898, "grad_norm": 1.1833709250710265, "learning_rate": 4.984953319656896e-05, "loss": 0.2627, "step": 385 }, { "epoch": 0.17561419472247497, "grad_norm": 1.526133028770585, "learning_rate": 4.9848749287083945e-05, "loss": 0.3124, "step": 386 }, { "epoch": 0.17606915377616014, "grad_norm": 0.8253890824154251, "learning_rate": 4.984796334707136e-05, "loss": 0.2363, "step": 387 }, { "epoch": 0.17652411282984531, "grad_norm": 1.3213065879963553, "learning_rate": 4.984717537659542e-05, "loss": 0.2883, "step": 388 }, { "epoch": 0.1769790718835305, "grad_norm": 1.2277238402287651, "learning_rate": 4.9846385375720515e-05, "loss": 0.2034, "step": 389 }, { "epoch": 0.17743403093721566, "grad_norm": 0.9029988848875732, "learning_rate": 4.984559334451121e-05, "loss": 0.1922, "step": 390 }, { "epoch": 0.17788898999090083, "grad_norm": 1.011318694572055, "learning_rate": 4.984479928303221e-05, "loss": 0.16, "step": 391 }, { "epoch": 0.17834394904458598, "grad_norm": 0.7842684083287716, "learning_rate": 4.984400319134841e-05, "loss": 0.1513, "step": 392 }, { "epoch": 0.17879890809827115, "grad_norm": 1.0047121229003204, "learning_rate": 4.984320506952487e-05, "loss": 0.2775, "step": 393 }, { "epoch": 0.17925386715195632, "grad_norm": 0.96838578798944, "learning_rate": 4.9842404917626794e-05, "loss": 0.2581, "step": 394 }, { "epoch": 0.1797088262056415, "grad_norm": 0.8552424874926928, "learning_rate": 4.984160273571958e-05, "loss": 0.2185, "step": 395 }, { "epoch": 0.18016378525932666, "grad_norm": 0.7215923747123871, "learning_rate": 4.9840798523868783e-05, "loss": 0.2648, "step": 396 }, { "epoch": 0.18061874431301184, "grad_norm": 0.8273795348852334, "learning_rate": 4.9839992282140104e-05, "loss": 0.2044, "step": 397 }, { "epoch": 0.181073703366697, "grad_norm": 0.9785918148494038, "learning_rate": 4.983918401059943e-05, "loss": 0.2477, "step": 398 }, { "epoch": 0.18152866242038215, "grad_norm": 1.107312964787855, "learning_rate": 4.983837370931282e-05, "loss": 0.281, "step": 399 }, { "epoch": 0.18198362147406733, "grad_norm": 0.9090559022978792, "learning_rate": 4.983756137834646e-05, "loss": 0.2436, "step": 400 }, { "epoch": 0.1824385805277525, "grad_norm": 1.2144867941478938, "learning_rate": 4.9836747017766765e-05, "loss": 0.2858, "step": 401 }, { "epoch": 0.18289353958143767, "grad_norm": 1.111249776533361, "learning_rate": 4.9835930627640264e-05, "loss": 0.2887, "step": 402 }, { "epoch": 0.18334849863512284, "grad_norm": 1.2328261736683936, "learning_rate": 4.983511220803367e-05, "loss": 0.3321, "step": 403 }, { "epoch": 0.18380345768880801, "grad_norm": 1.0256852027579921, "learning_rate": 4.9834291759013864e-05, "loss": 0.2447, "step": 404 }, { "epoch": 0.1842584167424932, "grad_norm": 0.8916587179445885, "learning_rate": 4.983346928064788e-05, "loss": 0.235, "step": 405 }, { "epoch": 0.18471337579617833, "grad_norm": 0.8010374639077099, "learning_rate": 4.983264477300293e-05, "loss": 0.2049, "step": 406 }, { "epoch": 0.1851683348498635, "grad_norm": 0.9259601734043552, "learning_rate": 4.98318182361464e-05, "loss": 0.2666, "step": 407 }, { "epoch": 0.18562329390354868, "grad_norm": 1.0996800951329422, "learning_rate": 4.9830989670145825e-05, "loss": 0.3126, "step": 408 }, { "epoch": 0.18607825295723385, "grad_norm": 0.9825813831593347, "learning_rate": 4.98301590750689e-05, "loss": 0.2945, "step": 409 }, { "epoch": 0.18653321201091902, "grad_norm": 0.826298037357574, "learning_rate": 4.9829326450983514e-05, "loss": 0.2436, "step": 410 }, { "epoch": 0.1869881710646042, "grad_norm": 1.0778605984812317, "learning_rate": 4.98284917979577e-05, "loss": 0.244, "step": 411 }, { "epoch": 0.18744313011828936, "grad_norm": 0.8826503961722449, "learning_rate": 4.9827655116059656e-05, "loss": 0.2285, "step": 412 }, { "epoch": 0.18789808917197454, "grad_norm": 0.6783880431833905, "learning_rate": 4.982681640535776e-05, "loss": 0.2263, "step": 413 }, { "epoch": 0.18835304822565968, "grad_norm": 0.850607062096716, "learning_rate": 4.9825975665920544e-05, "loss": 0.1964, "step": 414 }, { "epoch": 0.18880800727934485, "grad_norm": 1.2053147239277264, "learning_rate": 4.9825132897816705e-05, "loss": 0.2821, "step": 415 }, { "epoch": 0.18926296633303002, "grad_norm": 0.9124205049308536, "learning_rate": 4.982428810111512e-05, "loss": 0.2914, "step": 416 }, { "epoch": 0.1897179253867152, "grad_norm": 0.6319974506555571, "learning_rate": 4.982344127588481e-05, "loss": 0.1761, "step": 417 }, { "epoch": 0.19017288444040037, "grad_norm": 1.112189111727749, "learning_rate": 4.982259242219499e-05, "loss": 0.2411, "step": 418 }, { "epoch": 0.19062784349408554, "grad_norm": 0.8057841883773158, "learning_rate": 4.982174154011501e-05, "loss": 0.1791, "step": 419 }, { "epoch": 0.1910828025477707, "grad_norm": 0.9489628556352278, "learning_rate": 4.982088862971441e-05, "loss": 0.2822, "step": 420 }, { "epoch": 0.19153776160145586, "grad_norm": 1.2587143584633964, "learning_rate": 4.982003369106287e-05, "loss": 0.2872, "step": 421 }, { "epoch": 0.19199272065514103, "grad_norm": 1.0751564520471593, "learning_rate": 4.981917672423027e-05, "loss": 0.1679, "step": 422 }, { "epoch": 0.1924476797088262, "grad_norm": 0.8596781622427906, "learning_rate": 4.9818317729286637e-05, "loss": 0.201, "step": 423 }, { "epoch": 0.19290263876251137, "grad_norm": 0.800295294104614, "learning_rate": 4.981745670630216e-05, "loss": 0.2201, "step": 424 }, { "epoch": 0.19335759781619655, "grad_norm": 0.825800169362288, "learning_rate": 4.981659365534718e-05, "loss": 0.2608, "step": 425 }, { "epoch": 0.19381255686988172, "grad_norm": 1.2537016281438078, "learning_rate": 4.981572857649225e-05, "loss": 0.3201, "step": 426 }, { "epoch": 0.1942675159235669, "grad_norm": 0.6630773711070428, "learning_rate": 4.981486146980804e-05, "loss": 0.1721, "step": 427 }, { "epoch": 0.19472247497725204, "grad_norm": 1.0456272834524787, "learning_rate": 4.981399233536541e-05, "loss": 0.2339, "step": 428 }, { "epoch": 0.1951774340309372, "grad_norm": 0.7588258951531742, "learning_rate": 4.98131211732354e-05, "loss": 0.2164, "step": 429 }, { "epoch": 0.19563239308462238, "grad_norm": 1.1842167043892005, "learning_rate": 4.981224798348917e-05, "loss": 0.2695, "step": 430 }, { "epoch": 0.19608735213830755, "grad_norm": 0.7817668941153749, "learning_rate": 4.981137276619809e-05, "loss": 0.2618, "step": 431 }, { "epoch": 0.19654231119199272, "grad_norm": 1.6115466860711452, "learning_rate": 4.981049552143368e-05, "loss": 0.2529, "step": 432 }, { "epoch": 0.1969972702456779, "grad_norm": 4.0902956604962775, "learning_rate": 4.980961624926761e-05, "loss": 0.4753, "step": 433 }, { "epoch": 0.19745222929936307, "grad_norm": 0.7960678611320803, "learning_rate": 4.980873494977174e-05, "loss": 0.2948, "step": 434 }, { "epoch": 0.1979071883530482, "grad_norm": 1.0070958810069257, "learning_rate": 4.980785162301809e-05, "loss": 0.2567, "step": 435 }, { "epoch": 0.19836214740673339, "grad_norm": 0.8191302299252047, "learning_rate": 4.980696626907883e-05, "loss": 0.2168, "step": 436 }, { "epoch": 0.19881710646041856, "grad_norm": 0.7937387047871365, "learning_rate": 4.980607888802633e-05, "loss": 0.1955, "step": 437 }, { "epoch": 0.19927206551410373, "grad_norm": 0.8432008094692945, "learning_rate": 4.9805189479933075e-05, "loss": 0.2084, "step": 438 }, { "epoch": 0.1997270245677889, "grad_norm": 0.8073292212036548, "learning_rate": 4.9804298044871755e-05, "loss": 0.2026, "step": 439 }, { "epoch": 0.20018198362147407, "grad_norm": 1.1300035170898861, "learning_rate": 4.9803404582915216e-05, "loss": 0.289, "step": 440 }, { "epoch": 0.20063694267515925, "grad_norm": 0.8002343868088962, "learning_rate": 4.9802509094136464e-05, "loss": 0.2003, "step": 441 }, { "epoch": 0.2010919017288444, "grad_norm": 1.1332893383280338, "learning_rate": 4.980161157860868e-05, "loss": 0.2597, "step": 442 }, { "epoch": 0.20154686078252956, "grad_norm": 1.3383795894954178, "learning_rate": 4.98007120364052e-05, "loss": 0.2769, "step": 443 }, { "epoch": 0.20200181983621474, "grad_norm": 0.7103836091735324, "learning_rate": 4.9799810467599515e-05, "loss": 0.1927, "step": 444 }, { "epoch": 0.2024567788898999, "grad_norm": 0.9076048167447247, "learning_rate": 4.979890687226533e-05, "loss": 0.2232, "step": 445 }, { "epoch": 0.20291173794358508, "grad_norm": 1.0814403812455224, "learning_rate": 4.979800125047647e-05, "loss": 0.2275, "step": 446 }, { "epoch": 0.20336669699727025, "grad_norm": 0.9521195412119995, "learning_rate": 4.979709360230692e-05, "loss": 0.2505, "step": 447 }, { "epoch": 0.20382165605095542, "grad_norm": 0.8132871987985927, "learning_rate": 4.9796183927830874e-05, "loss": 0.1968, "step": 448 }, { "epoch": 0.20427661510464057, "grad_norm": 0.7688587774051666, "learning_rate": 4.979527222712266e-05, "loss": 0.1934, "step": 449 }, { "epoch": 0.20473157415832574, "grad_norm": 0.9493574268445784, "learning_rate": 4.979435850025676e-05, "loss": 0.2343, "step": 450 }, { "epoch": 0.2051865332120109, "grad_norm": 1.3577065524441854, "learning_rate": 4.979344274730786e-05, "loss": 0.2941, "step": 451 }, { "epoch": 0.20564149226569609, "grad_norm": 1.51315156565673, "learning_rate": 4.979252496835079e-05, "loss": 0.3188, "step": 452 }, { "epoch": 0.20609645131938126, "grad_norm": 1.293280483202906, "learning_rate": 4.979160516346053e-05, "loss": 0.3135, "step": 453 }, { "epoch": 0.20655141037306643, "grad_norm": 0.9037700868581328, "learning_rate": 4.979068333271227e-05, "loss": 0.234, "step": 454 }, { "epoch": 0.2070063694267516, "grad_norm": 1.573289370729168, "learning_rate": 4.9789759476181306e-05, "loss": 0.3095, "step": 455 }, { "epoch": 0.20746132848043677, "grad_norm": 0.8457915006097118, "learning_rate": 4.9788833593943166e-05, "loss": 0.2473, "step": 456 }, { "epoch": 0.20791628753412192, "grad_norm": 0.8884327578476467, "learning_rate": 4.978790568607347e-05, "loss": 0.2191, "step": 457 }, { "epoch": 0.2083712465878071, "grad_norm": 0.9155886552129602, "learning_rate": 4.9786975752648074e-05, "loss": 0.2867, "step": 458 }, { "epoch": 0.20882620564149226, "grad_norm": 0.9084469838644137, "learning_rate": 4.978604379374295e-05, "loss": 0.2251, "step": 459 }, { "epoch": 0.20928116469517744, "grad_norm": 0.9575633260624943, "learning_rate": 4.978510980943427e-05, "loss": 0.2169, "step": 460 }, { "epoch": 0.2097361237488626, "grad_norm": 1.1825466207472919, "learning_rate": 4.978417379979834e-05, "loss": 0.3653, "step": 461 }, { "epoch": 0.21019108280254778, "grad_norm": 0.8295971856442961, "learning_rate": 4.978323576491164e-05, "loss": 0.1957, "step": 462 }, { "epoch": 0.21064604185623295, "grad_norm": 0.8642494306586788, "learning_rate": 4.978229570485085e-05, "loss": 0.2874, "step": 463 }, { "epoch": 0.2111010009099181, "grad_norm": 0.726091863884625, "learning_rate": 4.978135361969276e-05, "loss": 0.2367, "step": 464 }, { "epoch": 0.21155595996360327, "grad_norm": 0.8327914763971832, "learning_rate": 4.978040950951437e-05, "loss": 0.2376, "step": 465 }, { "epoch": 0.21201091901728844, "grad_norm": 0.8801851999194134, "learning_rate": 4.9779463374392824e-05, "loss": 0.2941, "step": 466 }, { "epoch": 0.2124658780709736, "grad_norm": 0.7662817087663885, "learning_rate": 4.977851521440543e-05, "loss": 0.236, "step": 467 }, { "epoch": 0.21292083712465878, "grad_norm": 0.6744745228210125, "learning_rate": 4.977756502962967e-05, "loss": 0.1539, "step": 468 }, { "epoch": 0.21337579617834396, "grad_norm": 0.8640316984393297, "learning_rate": 4.9776612820143195e-05, "loss": 0.1792, "step": 469 }, { "epoch": 0.21383075523202913, "grad_norm": 0.8306296700939642, "learning_rate": 4.977565858602381e-05, "loss": 0.2239, "step": 470 }, { "epoch": 0.21428571428571427, "grad_norm": 0.8473860965213286, "learning_rate": 4.9774702327349484e-05, "loss": 0.1683, "step": 471 }, { "epoch": 0.21474067333939945, "grad_norm": 1.0675935292798793, "learning_rate": 4.977374404419837e-05, "loss": 0.2583, "step": 472 }, { "epoch": 0.21519563239308462, "grad_norm": 0.8722899471545442, "learning_rate": 4.977278373664877e-05, "loss": 0.2007, "step": 473 }, { "epoch": 0.2156505914467698, "grad_norm": 0.8736194584238443, "learning_rate": 4.977182140477916e-05, "loss": 0.2573, "step": 474 }, { "epoch": 0.21610555050045496, "grad_norm": 0.8929224577448658, "learning_rate": 4.9770857048668166e-05, "loss": 0.2224, "step": 475 }, { "epoch": 0.21656050955414013, "grad_norm": 1.1703430933958943, "learning_rate": 4.9769890668394605e-05, "loss": 0.2598, "step": 476 }, { "epoch": 0.2170154686078253, "grad_norm": 1.1687716722578678, "learning_rate": 4.976892226403743e-05, "loss": 0.2949, "step": 477 }, { "epoch": 0.21747042766151045, "grad_norm": 0.7851741818146389, "learning_rate": 4.976795183567579e-05, "loss": 0.1967, "step": 478 }, { "epoch": 0.21792538671519562, "grad_norm": 1.3368751218650363, "learning_rate": 4.976697938338898e-05, "loss": 0.1944, "step": 479 }, { "epoch": 0.2183803457688808, "grad_norm": 0.7804098679474321, "learning_rate": 4.976600490725645e-05, "loss": 0.2162, "step": 480 }, { "epoch": 0.21883530482256597, "grad_norm": 0.8067282580539209, "learning_rate": 4.976502840735785e-05, "loss": 0.2696, "step": 481 }, { "epoch": 0.21929026387625114, "grad_norm": 0.881636931021168, "learning_rate": 4.976404988377297e-05, "loss": 0.1693, "step": 482 }, { "epoch": 0.2197452229299363, "grad_norm": 0.9778356024485488, "learning_rate": 4.9763069336581755e-05, "loss": 0.3015, "step": 483 }, { "epoch": 0.22020018198362148, "grad_norm": 1.5158998257963476, "learning_rate": 4.976208676586435e-05, "loss": 0.2786, "step": 484 }, { "epoch": 0.22065514103730663, "grad_norm": 1.4003653927664346, "learning_rate": 4.976110217170104e-05, "loss": 0.2286, "step": 485 }, { "epoch": 0.2211101000909918, "grad_norm": 1.113968195239722, "learning_rate": 4.976011555417228e-05, "loss": 0.2918, "step": 486 }, { "epoch": 0.22156505914467697, "grad_norm": 0.6814550890931536, "learning_rate": 4.975912691335869e-05, "loss": 0.1758, "step": 487 }, { "epoch": 0.22202001819836215, "grad_norm": 1.0250197993224708, "learning_rate": 4.975813624934107e-05, "loss": 0.2103, "step": 488 }, { "epoch": 0.22247497725204732, "grad_norm": 0.8530099663867843, "learning_rate": 4.975714356220035e-05, "loss": 0.1978, "step": 489 }, { "epoch": 0.2229299363057325, "grad_norm": 0.8857383817331087, "learning_rate": 4.9756148852017656e-05, "loss": 0.214, "step": 490 }, { "epoch": 0.22338489535941766, "grad_norm": 1.0067157373964037, "learning_rate": 4.9755152118874294e-05, "loss": 0.2172, "step": 491 }, { "epoch": 0.22383985441310283, "grad_norm": 0.7700875743307447, "learning_rate": 4.975415336285168e-05, "loss": 0.1673, "step": 492 }, { "epoch": 0.22429481346678798, "grad_norm": 0.8793173662516012, "learning_rate": 4.9753152584031445e-05, "loss": 0.213, "step": 493 }, { "epoch": 0.22474977252047315, "grad_norm": 0.7806046431854715, "learning_rate": 4.975214978249537e-05, "loss": 0.2404, "step": 494 }, { "epoch": 0.22520473157415832, "grad_norm": 1.0408321729609522, "learning_rate": 4.975114495832539e-05, "loss": 0.3285, "step": 495 }, { "epoch": 0.2256596906278435, "grad_norm": 0.9763010914811852, "learning_rate": 4.975013811160362e-05, "loss": 0.2676, "step": 496 }, { "epoch": 0.22611464968152867, "grad_norm": 0.9362866491749535, "learning_rate": 4.9749129242412326e-05, "loss": 0.2318, "step": 497 }, { "epoch": 0.22656960873521384, "grad_norm": 0.9030290795617781, "learning_rate": 4.9748118350833974e-05, "loss": 0.2987, "step": 498 }, { "epoch": 0.227024567788899, "grad_norm": 1.0989605755069412, "learning_rate": 4.974710543695114e-05, "loss": 0.2956, "step": 499 }, { "epoch": 0.22747952684258416, "grad_norm": 1.1146444201421968, "learning_rate": 4.974609050084661e-05, "loss": 0.2891, "step": 500 }, { "epoch": 0.22793448589626933, "grad_norm": 0.806494567271344, "learning_rate": 4.9745073542603314e-05, "loss": 0.2719, "step": 501 }, { "epoch": 0.2283894449499545, "grad_norm": 0.9335000122264804, "learning_rate": 4.974405456230435e-05, "loss": 0.2584, "step": 502 }, { "epoch": 0.22884440400363967, "grad_norm": 1.1989868007046802, "learning_rate": 4.9743033560033e-05, "loss": 0.1895, "step": 503 }, { "epoch": 0.22929936305732485, "grad_norm": 0.8206793184670869, "learning_rate": 4.974201053587268e-05, "loss": 0.2312, "step": 504 }, { "epoch": 0.22975432211101002, "grad_norm": 0.8762870112318909, "learning_rate": 4.974098548990701e-05, "loss": 0.1712, "step": 505 }, { "epoch": 0.2302092811646952, "grad_norm": 0.8810795527204738, "learning_rate": 4.9739958422219714e-05, "loss": 0.2485, "step": 506 }, { "epoch": 0.23066424021838033, "grad_norm": 0.9131927875121051, "learning_rate": 4.9738929332894755e-05, "loss": 0.2081, "step": 507 }, { "epoch": 0.2311191992720655, "grad_norm": 1.1875363818207734, "learning_rate": 4.97378982220162e-05, "loss": 0.244, "step": 508 }, { "epoch": 0.23157415832575068, "grad_norm": 1.0859170773747107, "learning_rate": 4.973686508966832e-05, "loss": 0.2055, "step": 509 }, { "epoch": 0.23202911737943585, "grad_norm": 0.9133043813219233, "learning_rate": 4.973582993593554e-05, "loss": 0.2148, "step": 510 }, { "epoch": 0.23248407643312102, "grad_norm": 1.310737464485677, "learning_rate": 4.973479276090244e-05, "loss": 0.2494, "step": 511 }, { "epoch": 0.2329390354868062, "grad_norm": 0.8529092254297224, "learning_rate": 4.973375356465378e-05, "loss": 0.1725, "step": 512 }, { "epoch": 0.23339399454049137, "grad_norm": 0.9465212302042785, "learning_rate": 4.973271234727447e-05, "loss": 0.2329, "step": 513 }, { "epoch": 0.2338489535941765, "grad_norm": 0.7256334452449963, "learning_rate": 4.973166910884961e-05, "loss": 0.1795, "step": 514 }, { "epoch": 0.23430391264786168, "grad_norm": 1.093598727987344, "learning_rate": 4.973062384946442e-05, "loss": 0.3119, "step": 515 }, { "epoch": 0.23475887170154686, "grad_norm": 0.9015553238361032, "learning_rate": 4.9729576569204345e-05, "loss": 0.1417, "step": 516 }, { "epoch": 0.23521383075523203, "grad_norm": 0.7225604951862246, "learning_rate": 4.972852726815495e-05, "loss": 0.1946, "step": 517 }, { "epoch": 0.2356687898089172, "grad_norm": 2.0905779078889766, "learning_rate": 4.9727475946401966e-05, "loss": 0.2124, "step": 518 }, { "epoch": 0.23612374886260237, "grad_norm": 0.9099581003839937, "learning_rate": 4.972642260403133e-05, "loss": 0.2184, "step": 519 }, { "epoch": 0.23657870791628755, "grad_norm": 0.7192008189313542, "learning_rate": 4.9725367241129104e-05, "loss": 0.2284, "step": 520 }, { "epoch": 0.2370336669699727, "grad_norm": 0.6847301751652567, "learning_rate": 4.972430985778152e-05, "loss": 0.1635, "step": 521 }, { "epoch": 0.23748862602365786, "grad_norm": 1.0884703037776753, "learning_rate": 4.9723250454074985e-05, "loss": 0.2305, "step": 522 }, { "epoch": 0.23794358507734303, "grad_norm": 1.4823406398429884, "learning_rate": 4.9722189030096076e-05, "loss": 0.3141, "step": 523 }, { "epoch": 0.2383985441310282, "grad_norm": 0.9543228410916551, "learning_rate": 4.972112558593153e-05, "loss": 0.24, "step": 524 }, { "epoch": 0.23885350318471338, "grad_norm": 0.8547072683379531, "learning_rate": 4.9720060121668235e-05, "loss": 0.2701, "step": 525 }, { "epoch": 0.23930846223839855, "grad_norm": 0.9503547053850286, "learning_rate": 4.9718992637393256e-05, "loss": 0.2121, "step": 526 }, { "epoch": 0.23976342129208372, "grad_norm": 0.9098591591106474, "learning_rate": 4.971792313319384e-05, "loss": 0.2523, "step": 527 }, { "epoch": 0.24021838034576887, "grad_norm": 0.7280425034062946, "learning_rate": 4.971685160915737e-05, "loss": 0.2164, "step": 528 }, { "epoch": 0.24067333939945404, "grad_norm": 0.8961690750579513, "learning_rate": 4.9715778065371396e-05, "loss": 0.2212, "step": 529 }, { "epoch": 0.2411282984531392, "grad_norm": 0.8118127840747844, "learning_rate": 4.971470250192366e-05, "loss": 0.2166, "step": 530 }, { "epoch": 0.24158325750682438, "grad_norm": 1.318299849716252, "learning_rate": 4.971362491890205e-05, "loss": 0.3136, "step": 531 }, { "epoch": 0.24203821656050956, "grad_norm": 1.0081824604890162, "learning_rate": 4.971254531639461e-05, "loss": 0.2202, "step": 532 }, { "epoch": 0.24249317561419473, "grad_norm": 1.2126916853817178, "learning_rate": 4.971146369448957e-05, "loss": 0.2829, "step": 533 }, { "epoch": 0.2429481346678799, "grad_norm": 0.8472581042257032, "learning_rate": 4.971038005327531e-05, "loss": 0.2084, "step": 534 }, { "epoch": 0.24340309372156507, "grad_norm": 0.7594809604183035, "learning_rate": 4.970929439284039e-05, "loss": 0.2295, "step": 535 }, { "epoch": 0.24385805277525022, "grad_norm": 0.8636454287212753, "learning_rate": 4.970820671327351e-05, "loss": 0.2478, "step": 536 }, { "epoch": 0.2443130118289354, "grad_norm": 0.7813693526252882, "learning_rate": 4.970711701466357e-05, "loss": 0.1914, "step": 537 }, { "epoch": 0.24476797088262056, "grad_norm": 0.7683586186622385, "learning_rate": 4.9706025297099595e-05, "loss": 0.2627, "step": 538 }, { "epoch": 0.24522292993630573, "grad_norm": 0.9410000807935135, "learning_rate": 4.970493156067081e-05, "loss": 0.2666, "step": 539 }, { "epoch": 0.2456778889899909, "grad_norm": 0.8580159793846174, "learning_rate": 4.970383580546658e-05, "loss": 0.2128, "step": 540 }, { "epoch": 0.24613284804367608, "grad_norm": 1.1638124362464524, "learning_rate": 4.9702738031576445e-05, "loss": 0.2837, "step": 541 }, { "epoch": 0.24658780709736125, "grad_norm": 0.8867809918481854, "learning_rate": 4.970163823909013e-05, "loss": 0.2137, "step": 542 }, { "epoch": 0.2470427661510464, "grad_norm": 1.1121220889601369, "learning_rate": 4.970053642809748e-05, "loss": 0.2494, "step": 543 }, { "epoch": 0.24749772520473157, "grad_norm": 1.0107895407168916, "learning_rate": 4.969943259868853e-05, "loss": 0.2514, "step": 544 }, { "epoch": 0.24795268425841674, "grad_norm": 0.77613339632877, "learning_rate": 4.969832675095351e-05, "loss": 0.1802, "step": 545 }, { "epoch": 0.2484076433121019, "grad_norm": 0.8752967804017977, "learning_rate": 4.969721888498275e-05, "loss": 0.2978, "step": 546 }, { "epoch": 0.24886260236578708, "grad_norm": 0.8441570308900757, "learning_rate": 4.96961090008668e-05, "loss": 0.199, "step": 547 }, { "epoch": 0.24931756141947226, "grad_norm": 0.6963956824698917, "learning_rate": 4.969499709869635e-05, "loss": 0.2844, "step": 548 }, { "epoch": 0.24977252047315743, "grad_norm": 0.8791555529767203, "learning_rate": 4.969388317856225e-05, "loss": 0.2021, "step": 549 }, { "epoch": 0.2502274795268426, "grad_norm": 1.015606520913881, "learning_rate": 4.969276724055554e-05, "loss": 0.3095, "step": 550 }, { "epoch": 0.25068243858052774, "grad_norm": 0.7831490460351049, "learning_rate": 4.9691649284767406e-05, "loss": 0.1886, "step": 551 }, { "epoch": 0.25113739763421294, "grad_norm": 0.983327555515575, "learning_rate": 4.969052931128919e-05, "loss": 0.2694, "step": 552 }, { "epoch": 0.2515923566878981, "grad_norm": 0.755569830796656, "learning_rate": 4.968940732021243e-05, "loss": 0.2242, "step": 553 }, { "epoch": 0.25204731574158323, "grad_norm": 0.9138676059935472, "learning_rate": 4.968828331162879e-05, "loss": 0.2757, "step": 554 }, { "epoch": 0.25250227479526843, "grad_norm": 0.9345695141979689, "learning_rate": 4.968715728563014e-05, "loss": 0.2751, "step": 555 }, { "epoch": 0.2529572338489536, "grad_norm": 0.8352141222823103, "learning_rate": 4.968602924230848e-05, "loss": 0.197, "step": 556 }, { "epoch": 0.2534121929026388, "grad_norm": 1.3284964932869368, "learning_rate": 4.9684899181755984e-05, "loss": 0.2535, "step": 557 }, { "epoch": 0.2538671519563239, "grad_norm": 1.329649433423361, "learning_rate": 4.968376710406501e-05, "loss": 0.3074, "step": 558 }, { "epoch": 0.2543221110100091, "grad_norm": 0.8786692765515908, "learning_rate": 4.968263300932806e-05, "loss": 0.23, "step": 559 }, { "epoch": 0.25477707006369427, "grad_norm": 0.8178924999905985, "learning_rate": 4.9681496897637803e-05, "loss": 0.2031, "step": 560 }, { "epoch": 0.2552320291173794, "grad_norm": 0.8433152966553115, "learning_rate": 4.9680358769087076e-05, "loss": 0.2332, "step": 561 }, { "epoch": 0.2556869881710646, "grad_norm": 0.8752100907080377, "learning_rate": 4.967921862376889e-05, "loss": 0.3207, "step": 562 }, { "epoch": 0.25614194722474976, "grad_norm": 1.1084735558698116, "learning_rate": 4.967807646177641e-05, "loss": 0.3067, "step": 563 }, { "epoch": 0.25659690627843496, "grad_norm": 0.8629831876339815, "learning_rate": 4.9676932283202965e-05, "loss": 0.2124, "step": 564 }, { "epoch": 0.2570518653321201, "grad_norm": 1.0113479225009223, "learning_rate": 4.967578608814205e-05, "loss": 0.2768, "step": 565 }, { "epoch": 0.2575068243858053, "grad_norm": 0.8169807080761028, "learning_rate": 4.967463787668734e-05, "loss": 0.2322, "step": 566 }, { "epoch": 0.25796178343949044, "grad_norm": 0.6799959778645319, "learning_rate": 4.967348764893265e-05, "loss": 0.1587, "step": 567 }, { "epoch": 0.2584167424931756, "grad_norm": 1.0130457584730224, "learning_rate": 4.967233540497197e-05, "loss": 0.2272, "step": 568 }, { "epoch": 0.2588717015468608, "grad_norm": 0.9675037432129917, "learning_rate": 4.967118114489946e-05, "loss": 0.281, "step": 569 }, { "epoch": 0.25932666060054593, "grad_norm": 0.8192363395206572, "learning_rate": 4.967002486880944e-05, "loss": 0.209, "step": 570 }, { "epoch": 0.25978161965423113, "grad_norm": 1.1014249871401594, "learning_rate": 4.9668866576796405e-05, "loss": 0.2977, "step": 571 }, { "epoch": 0.2602365787079163, "grad_norm": 0.9684967692524976, "learning_rate": 4.9667706268954986e-05, "loss": 0.2007, "step": 572 }, { "epoch": 0.2606915377616015, "grad_norm": 0.9680429884046904, "learning_rate": 4.966654394538002e-05, "loss": 0.2436, "step": 573 }, { "epoch": 0.2611464968152866, "grad_norm": 0.6313186318862374, "learning_rate": 4.966537960616646e-05, "loss": 0.1291, "step": 574 }, { "epoch": 0.26160145586897177, "grad_norm": 0.9138135547568222, "learning_rate": 4.966421325140948e-05, "loss": 0.2562, "step": 575 }, { "epoch": 0.26205641492265697, "grad_norm": 0.9355024486900201, "learning_rate": 4.966304488120437e-05, "loss": 0.271, "step": 576 }, { "epoch": 0.2625113739763421, "grad_norm": 0.9811973799634708, "learning_rate": 4.966187449564661e-05, "loss": 0.2136, "step": 577 }, { "epoch": 0.2629663330300273, "grad_norm": 0.8230151151145665, "learning_rate": 4.9660702094831846e-05, "loss": 0.2748, "step": 578 }, { "epoch": 0.26342129208371245, "grad_norm": 1.1156420570362466, "learning_rate": 4.9659527678855865e-05, "loss": 0.2987, "step": 579 }, { "epoch": 0.26387625113739765, "grad_norm": 2.8761171447320146, "learning_rate": 4.965835124781465e-05, "loss": 0.3121, "step": 580 }, { "epoch": 0.2643312101910828, "grad_norm": 0.8642644117751058, "learning_rate": 4.965717280180432e-05, "loss": 0.2209, "step": 581 }, { "epoch": 0.26478616924476794, "grad_norm": 0.8535499421022061, "learning_rate": 4.965599234092118e-05, "loss": 0.1616, "step": 582 }, { "epoch": 0.26524112829845314, "grad_norm": 1.1409600185101643, "learning_rate": 4.9654809865261695e-05, "loss": 0.2741, "step": 583 }, { "epoch": 0.2656960873521383, "grad_norm": 0.7789400171412606, "learning_rate": 4.965362537492249e-05, "loss": 0.2352, "step": 584 }, { "epoch": 0.2661510464058235, "grad_norm": 0.6952261761352863, "learning_rate": 4.965243887000035e-05, "loss": 0.2296, "step": 585 }, { "epoch": 0.26660600545950863, "grad_norm": 0.643579754402992, "learning_rate": 4.9651250350592236e-05, "loss": 0.1537, "step": 586 }, { "epoch": 0.26706096451319383, "grad_norm": 0.9478392266404356, "learning_rate": 4.965005981679527e-05, "loss": 0.2461, "step": 587 }, { "epoch": 0.267515923566879, "grad_norm": 1.2884554995403397, "learning_rate": 4.964886726870673e-05, "loss": 0.2731, "step": 588 }, { "epoch": 0.2679708826205642, "grad_norm": 0.7515261190274288, "learning_rate": 4.964767270642407e-05, "loss": 0.2368, "step": 589 }, { "epoch": 0.2684258416742493, "grad_norm": 0.6476142246750416, "learning_rate": 4.964647613004491e-05, "loss": 0.1602, "step": 590 }, { "epoch": 0.26888080072793447, "grad_norm": 0.7340595442555482, "learning_rate": 4.964527753966701e-05, "loss": 0.2037, "step": 591 }, { "epoch": 0.26933575978161967, "grad_norm": 0.8671351258368892, "learning_rate": 4.964407693538834e-05, "loss": 0.2139, "step": 592 }, { "epoch": 0.2697907188353048, "grad_norm": 1.0286541565846778, "learning_rate": 4.964287431730699e-05, "loss": 0.3234, "step": 593 }, { "epoch": 0.27024567788899, "grad_norm": 0.6108889669092895, "learning_rate": 4.9641669685521245e-05, "loss": 0.2402, "step": 594 }, { "epoch": 0.27070063694267515, "grad_norm": 0.8612909171718697, "learning_rate": 4.964046304012952e-05, "loss": 0.2642, "step": 595 }, { "epoch": 0.27115559599636035, "grad_norm": 0.8679632414032727, "learning_rate": 4.963925438123044e-05, "loss": 0.1953, "step": 596 }, { "epoch": 0.2716105550500455, "grad_norm": 0.9272399803240488, "learning_rate": 4.9638043708922754e-05, "loss": 0.2218, "step": 597 }, { "epoch": 0.27206551410373064, "grad_norm": 0.8885436001838017, "learning_rate": 4.963683102330541e-05, "loss": 0.2286, "step": 598 }, { "epoch": 0.27252047315741584, "grad_norm": 0.7231820604348435, "learning_rate": 4.9635616324477485e-05, "loss": 0.2016, "step": 599 }, { "epoch": 0.272975432211101, "grad_norm": 0.9345703630238253, "learning_rate": 4.963439961253825e-05, "loss": 0.2456, "step": 600 }, { "epoch": 0.2734303912647862, "grad_norm": 1.2146289298430135, "learning_rate": 4.963318088758713e-05, "loss": 0.2732, "step": 601 }, { "epoch": 0.27388535031847133, "grad_norm": 1.0511057737139682, "learning_rate": 4.963196014972371e-05, "loss": 0.2288, "step": 602 }, { "epoch": 0.27434030937215653, "grad_norm": 1.2012052766204586, "learning_rate": 4.9630737399047745e-05, "loss": 0.2873, "step": 603 }, { "epoch": 0.2747952684258417, "grad_norm": 1.2402982907751752, "learning_rate": 4.962951263565915e-05, "loss": 0.2873, "step": 604 }, { "epoch": 0.2752502274795268, "grad_norm": 1.1854824252092173, "learning_rate": 4.962828585965801e-05, "loss": 0.2534, "step": 605 }, { "epoch": 0.275705186533212, "grad_norm": 0.8858245185359226, "learning_rate": 4.962705707114456e-05, "loss": 0.2088, "step": 606 }, { "epoch": 0.27616014558689717, "grad_norm": 1.1113380974782898, "learning_rate": 4.962582627021923e-05, "loss": 0.2812, "step": 607 }, { "epoch": 0.27661510464058237, "grad_norm": 1.0533185978736896, "learning_rate": 4.962459345698258e-05, "loss": 0.2513, "step": 608 }, { "epoch": 0.2770700636942675, "grad_norm": 1.2253680043021111, "learning_rate": 4.962335863153537e-05, "loss": 0.3091, "step": 609 }, { "epoch": 0.2775250227479527, "grad_norm": 0.5878838793772323, "learning_rate": 4.9622121793978474e-05, "loss": 0.1856, "step": 610 }, { "epoch": 0.27797998180163785, "grad_norm": 0.9092751982851152, "learning_rate": 4.962088294441298e-05, "loss": 0.1762, "step": 611 }, { "epoch": 0.278434940855323, "grad_norm": 0.7969498125969083, "learning_rate": 4.9619642082940135e-05, "loss": 0.3023, "step": 612 }, { "epoch": 0.2788898999090082, "grad_norm": 0.8860169445941871, "learning_rate": 4.9618399209661305e-05, "loss": 0.1939, "step": 613 }, { "epoch": 0.27934485896269334, "grad_norm": 0.9774136926180137, "learning_rate": 4.9617154324678073e-05, "loss": 0.2034, "step": 614 }, { "epoch": 0.27979981801637854, "grad_norm": 0.961695137690413, "learning_rate": 4.961590742809216e-05, "loss": 0.2699, "step": 615 }, { "epoch": 0.2802547770700637, "grad_norm": 0.755554512979983, "learning_rate": 4.961465852000545e-05, "loss": 0.2043, "step": 616 }, { "epoch": 0.2807097361237489, "grad_norm": 1.6508599118351832, "learning_rate": 4.961340760052001e-05, "loss": 0.3042, "step": 617 }, { "epoch": 0.28116469517743403, "grad_norm": 0.8914969602824668, "learning_rate": 4.9612154669738055e-05, "loss": 0.1972, "step": 618 }, { "epoch": 0.2816196542311192, "grad_norm": 0.8125093544137703, "learning_rate": 4.961089972776196e-05, "loss": 0.221, "step": 619 }, { "epoch": 0.2820746132848044, "grad_norm": 0.9802177326723582, "learning_rate": 4.960964277469429e-05, "loss": 0.2568, "step": 620 }, { "epoch": 0.2825295723384895, "grad_norm": 0.8914807828231378, "learning_rate": 4.960838381063774e-05, "loss": 0.2268, "step": 621 }, { "epoch": 0.2829845313921747, "grad_norm": 1.086778908952814, "learning_rate": 4.96071228356952e-05, "loss": 0.2695, "step": 622 }, { "epoch": 0.28343949044585987, "grad_norm": 0.8679774985658909, "learning_rate": 4.960585984996971e-05, "loss": 0.2411, "step": 623 }, { "epoch": 0.28389444949954507, "grad_norm": 0.9469823222976179, "learning_rate": 4.9604594853564465e-05, "loss": 0.3092, "step": 624 }, { "epoch": 0.2843494085532302, "grad_norm": 0.8263977863363166, "learning_rate": 4.9603327846582855e-05, "loss": 0.203, "step": 625 }, { "epoch": 0.28480436760691535, "grad_norm": 1.0305271008019032, "learning_rate": 4.960205882912839e-05, "loss": 0.2498, "step": 626 }, { "epoch": 0.28525932666060055, "grad_norm": 0.9519784589996221, "learning_rate": 4.960078780130478e-05, "loss": 0.2899, "step": 627 }, { "epoch": 0.2857142857142857, "grad_norm": 0.8560290743793322, "learning_rate": 4.9599514763215896e-05, "loss": 0.2598, "step": 628 }, { "epoch": 0.2861692447679709, "grad_norm": 0.7650744807280736, "learning_rate": 4.959823971496574e-05, "loss": 0.2298, "step": 629 }, { "epoch": 0.28662420382165604, "grad_norm": 1.008382094852365, "learning_rate": 4.959696265665853e-05, "loss": 0.2556, "step": 630 }, { "epoch": 0.28707916287534124, "grad_norm": 0.8563049072288126, "learning_rate": 4.959568358839861e-05, "loss": 0.1788, "step": 631 }, { "epoch": 0.2875341219290264, "grad_norm": 0.8511970340901663, "learning_rate": 4.95944025102905e-05, "loss": 0.208, "step": 632 }, { "epoch": 0.28798908098271153, "grad_norm": 0.9236013840057509, "learning_rate": 4.959311942243888e-05, "loss": 0.2879, "step": 633 }, { "epoch": 0.28844404003639673, "grad_norm": 0.6864653199519503, "learning_rate": 4.9591834324948606e-05, "loss": 0.1676, "step": 634 }, { "epoch": 0.2888989990900819, "grad_norm": 2.5681361585801294, "learning_rate": 4.9590547217924685e-05, "loss": 0.2849, "step": 635 }, { "epoch": 0.2893539581437671, "grad_norm": 0.892616517182535, "learning_rate": 4.9589258101472306e-05, "loss": 0.1652, "step": 636 }, { "epoch": 0.2898089171974522, "grad_norm": 0.796932945938617, "learning_rate": 4.9587966975696785e-05, "loss": 0.1733, "step": 637 }, { "epoch": 0.2902638762511374, "grad_norm": 1.0744605074333295, "learning_rate": 4.958667384070364e-05, "loss": 0.2137, "step": 638 }, { "epoch": 0.29071883530482256, "grad_norm": 0.856720110155343, "learning_rate": 4.958537869659855e-05, "loss": 0.2648, "step": 639 }, { "epoch": 0.2911737943585077, "grad_norm": 1.1698700002853544, "learning_rate": 4.958408154348734e-05, "loss": 0.2354, "step": 640 }, { "epoch": 0.2916287534121929, "grad_norm": 1.024731799423735, "learning_rate": 4.9582782381476e-05, "loss": 0.2761, "step": 641 }, { "epoch": 0.29208371246587805, "grad_norm": 1.1461053550581966, "learning_rate": 4.95814812106707e-05, "loss": 0.3863, "step": 642 }, { "epoch": 0.29253867151956325, "grad_norm": 0.9520969663286281, "learning_rate": 4.958017803117777e-05, "loss": 0.2617, "step": 643 }, { "epoch": 0.2929936305732484, "grad_norm": 0.6081925280896296, "learning_rate": 4.957887284310369e-05, "loss": 0.1831, "step": 644 }, { "epoch": 0.2934485896269336, "grad_norm": 0.6758542383389606, "learning_rate": 4.9577565646555125e-05, "loss": 0.2179, "step": 645 }, { "epoch": 0.29390354868061874, "grad_norm": 0.796075461817172, "learning_rate": 4.957625644163888e-05, "loss": 0.2529, "step": 646 }, { "epoch": 0.2943585077343039, "grad_norm": 0.9730251606841677, "learning_rate": 4.9574945228461945e-05, "loss": 0.2286, "step": 647 }, { "epoch": 0.2948134667879891, "grad_norm": 1.0725588945042845, "learning_rate": 4.9573632007131465e-05, "loss": 0.3195, "step": 648 }, { "epoch": 0.29526842584167423, "grad_norm": 0.8120647197863851, "learning_rate": 4.957231677775474e-05, "loss": 0.2501, "step": 649 }, { "epoch": 0.29572338489535943, "grad_norm": 0.9562342281259179, "learning_rate": 4.9570999540439276e-05, "loss": 0.2521, "step": 650 }, { "epoch": 0.2961783439490446, "grad_norm": 0.8531452260427399, "learning_rate": 4.956968029529268e-05, "loss": 0.2528, "step": 651 }, { "epoch": 0.2966333030027298, "grad_norm": 0.8786760090903362, "learning_rate": 4.956835904242278e-05, "loss": 0.2333, "step": 652 }, { "epoch": 0.2970882620564149, "grad_norm": 1.468463971883643, "learning_rate": 4.956703578193751e-05, "loss": 0.1994, "step": 653 }, { "epoch": 0.29754322111010006, "grad_norm": 0.9300252922546592, "learning_rate": 4.9565710513945026e-05, "loss": 0.2553, "step": 654 }, { "epoch": 0.29799818016378526, "grad_norm": 0.9014318710606085, "learning_rate": 4.956438323855361e-05, "loss": 0.2014, "step": 655 }, { "epoch": 0.2984531392174704, "grad_norm": 0.7285826405232265, "learning_rate": 4.956305395587174e-05, "loss": 0.2416, "step": 656 }, { "epoch": 0.2989080982711556, "grad_norm": 0.8787760697875467, "learning_rate": 4.956172266600802e-05, "loss": 0.1754, "step": 657 }, { "epoch": 0.29936305732484075, "grad_norm": 0.989007080375839, "learning_rate": 4.9560389369071246e-05, "loss": 0.2057, "step": 658 }, { "epoch": 0.29981801637852595, "grad_norm": 1.002339022195567, "learning_rate": 4.955905406517036e-05, "loss": 0.2301, "step": 659 }, { "epoch": 0.3002729754322111, "grad_norm": 1.0532992026149823, "learning_rate": 4.9557716754414494e-05, "loss": 0.2579, "step": 660 }, { "epoch": 0.30072793448589624, "grad_norm": 1.0659893788822008, "learning_rate": 4.955637743691291e-05, "loss": 0.2509, "step": 661 }, { "epoch": 0.30118289353958144, "grad_norm": 0.8259419243165222, "learning_rate": 4.955503611277506e-05, "loss": 0.2377, "step": 662 }, { "epoch": 0.3016378525932666, "grad_norm": 0.959130920601689, "learning_rate": 4.9553692782110546e-05, "loss": 0.2528, "step": 663 }, { "epoch": 0.3020928116469518, "grad_norm": 1.4656809421826265, "learning_rate": 4.955234744502914e-05, "loss": 0.2555, "step": 664 }, { "epoch": 0.30254777070063693, "grad_norm": 0.8688450878017016, "learning_rate": 4.955100010164078e-05, "loss": 0.2583, "step": 665 }, { "epoch": 0.30300272975432213, "grad_norm": 0.7803383908108252, "learning_rate": 4.9549650752055564e-05, "loss": 0.2032, "step": 666 }, { "epoch": 0.3034576888080073, "grad_norm": 1.0972410526886847, "learning_rate": 4.954829939638376e-05, "loss": 0.2873, "step": 667 }, { "epoch": 0.3039126478616925, "grad_norm": 0.9136362992337661, "learning_rate": 4.9546946034735775e-05, "loss": 0.2351, "step": 668 }, { "epoch": 0.3043676069153776, "grad_norm": 0.9007460892709158, "learning_rate": 4.954559066722222e-05, "loss": 0.2683, "step": 669 }, { "epoch": 0.30482256596906276, "grad_norm": 1.0224931362214231, "learning_rate": 4.954423329395384e-05, "loss": 0.2725, "step": 670 }, { "epoch": 0.30527752502274796, "grad_norm": 0.8491379728440557, "learning_rate": 4.9542873915041564e-05, "loss": 0.2612, "step": 671 }, { "epoch": 0.3057324840764331, "grad_norm": 1.0202145669024136, "learning_rate": 4.9541512530596454e-05, "loss": 0.2725, "step": 672 }, { "epoch": 0.3061874431301183, "grad_norm": 0.8767030999381581, "learning_rate": 4.954014914072978e-05, "loss": 0.2512, "step": 673 }, { "epoch": 0.30664240218380345, "grad_norm": 0.6910704535150918, "learning_rate": 4.953878374555293e-05, "loss": 0.2097, "step": 674 }, { "epoch": 0.30709736123748865, "grad_norm": 1.085874797740716, "learning_rate": 4.95374163451775e-05, "loss": 0.3119, "step": 675 }, { "epoch": 0.3075523202911738, "grad_norm": 1.0383041261404968, "learning_rate": 4.953604693971521e-05, "loss": 0.2438, "step": 676 }, { "epoch": 0.30800727934485894, "grad_norm": 1.015897020308681, "learning_rate": 4.953467552927797e-05, "loss": 0.2761, "step": 677 }, { "epoch": 0.30846223839854414, "grad_norm": 1.3552265188321118, "learning_rate": 4.953330211397784e-05, "loss": 0.355, "step": 678 }, { "epoch": 0.3089171974522293, "grad_norm": 1.225929380250296, "learning_rate": 4.9531926693927057e-05, "loss": 0.2579, "step": 679 }, { "epoch": 0.3093721565059145, "grad_norm": 1.0292788160153652, "learning_rate": 4.9530549269238005e-05, "loss": 0.257, "step": 680 }, { "epoch": 0.30982711555959963, "grad_norm": 1.0972178928221403, "learning_rate": 4.952916984002325e-05, "loss": 0.253, "step": 681 }, { "epoch": 0.31028207461328483, "grad_norm": 0.9813832276926537, "learning_rate": 4.95277884063955e-05, "loss": 0.2458, "step": 682 }, { "epoch": 0.31073703366697, "grad_norm": 0.8270338791429248, "learning_rate": 4.952640496846766e-05, "loss": 0.2634, "step": 683 }, { "epoch": 0.3111919927206551, "grad_norm": 0.9183687316500325, "learning_rate": 4.952501952635276e-05, "loss": 0.2463, "step": 684 }, { "epoch": 0.3116469517743403, "grad_norm": 0.8628773005959172, "learning_rate": 4.952363208016402e-05, "loss": 0.2425, "step": 685 }, { "epoch": 0.31210191082802546, "grad_norm": 1.0117423406539965, "learning_rate": 4.952224263001481e-05, "loss": 0.1795, "step": 686 }, { "epoch": 0.31255686988171066, "grad_norm": 0.8120316477434865, "learning_rate": 4.952085117601868e-05, "loss": 0.2473, "step": 687 }, { "epoch": 0.3130118289353958, "grad_norm": 0.9037509733332857, "learning_rate": 4.951945771828933e-05, "loss": 0.3109, "step": 688 }, { "epoch": 0.313466787989081, "grad_norm": 0.7841049933815181, "learning_rate": 4.951806225694061e-05, "loss": 0.2361, "step": 689 }, { "epoch": 0.31392174704276615, "grad_norm": 1.0663742790154704, "learning_rate": 4.951666479208658e-05, "loss": 0.2593, "step": 690 }, { "epoch": 0.3143767060964513, "grad_norm": 1.1324940598084432, "learning_rate": 4.951526532384141e-05, "loss": 0.2622, "step": 691 }, { "epoch": 0.3148316651501365, "grad_norm": 0.7861938873646999, "learning_rate": 4.951386385231946e-05, "loss": 0.22, "step": 692 }, { "epoch": 0.31528662420382164, "grad_norm": 0.6855808438385542, "learning_rate": 4.9512460377635275e-05, "loss": 0.1897, "step": 693 }, { "epoch": 0.31574158325750684, "grad_norm": 0.7512393871370366, "learning_rate": 4.951105489990352e-05, "loss": 0.2057, "step": 694 }, { "epoch": 0.316196542311192, "grad_norm": 1.0271592167038488, "learning_rate": 4.950964741923905e-05, "loss": 0.2971, "step": 695 }, { "epoch": 0.3166515013648772, "grad_norm": 0.8453697932231744, "learning_rate": 4.950823793575688e-05, "loss": 0.2028, "step": 696 }, { "epoch": 0.31710646041856233, "grad_norm": 1.014746609126079, "learning_rate": 4.950682644957218e-05, "loss": 0.2304, "step": 697 }, { "epoch": 0.3175614194722475, "grad_norm": 3.6734583853709704, "learning_rate": 4.9505412960800295e-05, "loss": 0.3345, "step": 698 }, { "epoch": 0.3180163785259327, "grad_norm": 0.9849977726666123, "learning_rate": 4.950399746955673e-05, "loss": 0.3007, "step": 699 }, { "epoch": 0.3184713375796178, "grad_norm": 1.3213677122109524, "learning_rate": 4.9502579975957154e-05, "loss": 0.2661, "step": 700 }, { "epoch": 0.318926296633303, "grad_norm": 0.9447935956425747, "learning_rate": 4.950116048011739e-05, "loss": 0.2555, "step": 701 }, { "epoch": 0.31938125568698816, "grad_norm": 0.8104973391915997, "learning_rate": 4.949973898215343e-05, "loss": 0.1986, "step": 702 }, { "epoch": 0.31983621474067336, "grad_norm": 1.1058340197415355, "learning_rate": 4.9498315482181455e-05, "loss": 0.2003, "step": 703 }, { "epoch": 0.3202911737943585, "grad_norm": 0.8175870501767695, "learning_rate": 4.949688998031776e-05, "loss": 0.2126, "step": 704 }, { "epoch": 0.32074613284804365, "grad_norm": 0.9039274292687272, "learning_rate": 4.9495462476678853e-05, "loss": 0.1878, "step": 705 }, { "epoch": 0.32120109190172885, "grad_norm": 0.7646255772499492, "learning_rate": 4.949403297138137e-05, "loss": 0.1925, "step": 706 }, { "epoch": 0.321656050955414, "grad_norm": 0.9372608149186025, "learning_rate": 4.949260146454212e-05, "loss": 0.2299, "step": 707 }, { "epoch": 0.3221110100090992, "grad_norm": 0.7036242629698544, "learning_rate": 4.9491167956278094e-05, "loss": 0.2129, "step": 708 }, { "epoch": 0.32256596906278434, "grad_norm": 0.9099053851402681, "learning_rate": 4.948973244670643e-05, "loss": 0.2101, "step": 709 }, { "epoch": 0.32302092811646954, "grad_norm": 1.0894002147766235, "learning_rate": 4.948829493594441e-05, "loss": 0.2924, "step": 710 }, { "epoch": 0.3234758871701547, "grad_norm": 0.9141588281519077, "learning_rate": 4.9486855424109524e-05, "loss": 0.2037, "step": 711 }, { "epoch": 0.32393084622383983, "grad_norm": 0.7572018653437371, "learning_rate": 4.948541391131939e-05, "loss": 0.1984, "step": 712 }, { "epoch": 0.32438580527752503, "grad_norm": 0.9641210793477122, "learning_rate": 4.948397039769181e-05, "loss": 0.224, "step": 713 }, { "epoch": 0.3248407643312102, "grad_norm": 0.8283109463411993, "learning_rate": 4.948252488334474e-05, "loss": 0.1778, "step": 714 }, { "epoch": 0.3252957233848954, "grad_norm": 0.7084242369346853, "learning_rate": 4.948107736839629e-05, "loss": 0.25, "step": 715 }, { "epoch": 0.3257506824385805, "grad_norm": 0.6370004195466552, "learning_rate": 4.947962785296476e-05, "loss": 0.2248, "step": 716 }, { "epoch": 0.3262056414922657, "grad_norm": 0.8620898149878387, "learning_rate": 4.947817633716859e-05, "loss": 0.1805, "step": 717 }, { "epoch": 0.32666060054595086, "grad_norm": 1.1178378598352234, "learning_rate": 4.9476722821126384e-05, "loss": 0.3258, "step": 718 }, { "epoch": 0.327115559599636, "grad_norm": 0.9519467235518019, "learning_rate": 4.947526730495694e-05, "loss": 0.217, "step": 719 }, { "epoch": 0.3275705186533212, "grad_norm": 1.099124089447005, "learning_rate": 4.947380978877917e-05, "loss": 0.284, "step": 720 }, { "epoch": 0.32802547770700635, "grad_norm": 0.7377753804400712, "learning_rate": 4.947235027271219e-05, "loss": 0.2248, "step": 721 }, { "epoch": 0.32848043676069155, "grad_norm": 0.8288041978531289, "learning_rate": 4.947088875687526e-05, "loss": 0.2066, "step": 722 }, { "epoch": 0.3289353958143767, "grad_norm": 0.8196879936718493, "learning_rate": 4.9469425241387815e-05, "loss": 0.2149, "step": 723 }, { "epoch": 0.3293903548680619, "grad_norm": 0.7967173187985016, "learning_rate": 4.946795972636944e-05, "loss": 0.2235, "step": 724 }, { "epoch": 0.32984531392174704, "grad_norm": 0.7445213894581975, "learning_rate": 4.94664922119399e-05, "loss": 0.2297, "step": 725 }, { "epoch": 0.3303002729754322, "grad_norm": 0.898099373303196, "learning_rate": 4.946502269821909e-05, "loss": 0.2512, "step": 726 }, { "epoch": 0.3307552320291174, "grad_norm": 0.733527119008864, "learning_rate": 4.946355118532712e-05, "loss": 0.3141, "step": 727 }, { "epoch": 0.33121019108280253, "grad_norm": 1.0406305026620009, "learning_rate": 4.946207767338422e-05, "loss": 0.2356, "step": 728 }, { "epoch": 0.33166515013648773, "grad_norm": 0.8426021737902853, "learning_rate": 4.9460602162510804e-05, "loss": 0.2455, "step": 729 }, { "epoch": 0.3321201091901729, "grad_norm": 0.9790475635162611, "learning_rate": 4.9459124652827437e-05, "loss": 0.2163, "step": 730 }, { "epoch": 0.3325750682438581, "grad_norm": 0.9466137818648156, "learning_rate": 4.9457645144454864e-05, "loss": 0.3217, "step": 731 }, { "epoch": 0.3330300272975432, "grad_norm": 1.1759817215176778, "learning_rate": 4.945616363751398e-05, "loss": 0.2851, "step": 732 }, { "epoch": 0.33348498635122836, "grad_norm": 1.1054354107152844, "learning_rate": 4.945468013212585e-05, "loss": 0.2807, "step": 733 }, { "epoch": 0.33393994540491356, "grad_norm": 0.9789043844256137, "learning_rate": 4.945319462841169e-05, "loss": 0.2419, "step": 734 }, { "epoch": 0.3343949044585987, "grad_norm": 0.858493149652979, "learning_rate": 4.94517071264929e-05, "loss": 0.2826, "step": 735 }, { "epoch": 0.3348498635122839, "grad_norm": 0.9310291483260735, "learning_rate": 4.9450217626491016e-05, "loss": 0.2151, "step": 736 }, { "epoch": 0.33530482256596905, "grad_norm": 0.7448088009341015, "learning_rate": 4.9448726128527774e-05, "loss": 0.2547, "step": 737 }, { "epoch": 0.33575978161965425, "grad_norm": 1.058811207978302, "learning_rate": 4.944723263272504e-05, "loss": 0.2935, "step": 738 }, { "epoch": 0.3362147406733394, "grad_norm": 0.8670237274413242, "learning_rate": 4.944573713920485e-05, "loss": 0.2702, "step": 739 }, { "epoch": 0.33666969972702454, "grad_norm": 0.6873597212576252, "learning_rate": 4.944423964808943e-05, "loss": 0.2548, "step": 740 }, { "epoch": 0.33712465878070974, "grad_norm": 0.7344896346819564, "learning_rate": 4.9442740159501125e-05, "loss": 0.2042, "step": 741 }, { "epoch": 0.3375796178343949, "grad_norm": 1.0312983773160056, "learning_rate": 4.9441238673562484e-05, "loss": 0.2239, "step": 742 }, { "epoch": 0.3380345768880801, "grad_norm": 1.0088502312369485, "learning_rate": 4.943973519039619e-05, "loss": 0.2668, "step": 743 }, { "epoch": 0.33848953594176523, "grad_norm": 0.9505891866541584, "learning_rate": 4.9438229710125104e-05, "loss": 0.1975, "step": 744 }, { "epoch": 0.33894449499545043, "grad_norm": 0.8492324606725229, "learning_rate": 4.9436722232872255e-05, "loss": 0.249, "step": 745 }, { "epoch": 0.3393994540491356, "grad_norm": 0.8622862276396092, "learning_rate": 4.9435212758760815e-05, "loss": 0.2502, "step": 746 }, { "epoch": 0.3398544131028208, "grad_norm": 0.5400585321039387, "learning_rate": 4.943370128791414e-05, "loss": 0.1568, "step": 747 }, { "epoch": 0.3403093721565059, "grad_norm": 0.7447889954628661, "learning_rate": 4.943218782045573e-05, "loss": 0.2284, "step": 748 }, { "epoch": 0.34076433121019106, "grad_norm": 1.0287707280357545, "learning_rate": 4.9430672356509274e-05, "loss": 0.2663, "step": 749 }, { "epoch": 0.34121929026387626, "grad_norm": 0.7681345312585995, "learning_rate": 4.9429154896198594e-05, "loss": 0.2121, "step": 750 }, { "epoch": 0.3416742493175614, "grad_norm": 1.0821309803769619, "learning_rate": 4.9427635439647704e-05, "loss": 0.2656, "step": 751 }, { "epoch": 0.3421292083712466, "grad_norm": 0.7841620951260261, "learning_rate": 4.9426113986980754e-05, "loss": 0.1981, "step": 752 }, { "epoch": 0.34258416742493175, "grad_norm": 0.6755409598106863, "learning_rate": 4.942459053832208e-05, "loss": 0.1916, "step": 753 }, { "epoch": 0.34303912647861695, "grad_norm": 2.0250058612595674, "learning_rate": 4.942306509379617e-05, "loss": 0.2316, "step": 754 }, { "epoch": 0.3434940855323021, "grad_norm": 0.7100981997796803, "learning_rate": 4.942153765352767e-05, "loss": 0.1896, "step": 755 }, { "epoch": 0.34394904458598724, "grad_norm": 0.8888438578459636, "learning_rate": 4.94200082176414e-05, "loss": 0.2197, "step": 756 }, { "epoch": 0.34440400363967244, "grad_norm": 0.9223818529228094, "learning_rate": 4.9418476786262334e-05, "loss": 0.2457, "step": 757 }, { "epoch": 0.3448589626933576, "grad_norm": 0.8534903029550724, "learning_rate": 4.941694335951563e-05, "loss": 0.2848, "step": 758 }, { "epoch": 0.3453139217470428, "grad_norm": 0.7504469599833069, "learning_rate": 4.941540793752657e-05, "loss": 0.2031, "step": 759 }, { "epoch": 0.34576888080072793, "grad_norm": 0.8627571754482262, "learning_rate": 4.941387052042064e-05, "loss": 0.291, "step": 760 }, { "epoch": 0.34622383985441313, "grad_norm": 0.9064675561705541, "learning_rate": 4.941233110832346e-05, "loss": 0.2177, "step": 761 }, { "epoch": 0.3466787989080983, "grad_norm": 1.2828225898233279, "learning_rate": 4.941078970136082e-05, "loss": 0.2307, "step": 762 }, { "epoch": 0.3471337579617834, "grad_norm": 0.704723925738874, "learning_rate": 4.940924629965869e-05, "loss": 0.1987, "step": 763 }, { "epoch": 0.3475887170154686, "grad_norm": 0.9260627240136159, "learning_rate": 4.940770090334319e-05, "loss": 0.1922, "step": 764 }, { "epoch": 0.34804367606915376, "grad_norm": 0.7463336939112161, "learning_rate": 4.9406153512540585e-05, "loss": 0.17, "step": 765 }, { "epoch": 0.34849863512283896, "grad_norm": 0.8352969720189538, "learning_rate": 4.940460412737733e-05, "loss": 0.2553, "step": 766 }, { "epoch": 0.3489535941765241, "grad_norm": 0.7942333382046778, "learning_rate": 4.940305274798005e-05, "loss": 0.2091, "step": 767 }, { "epoch": 0.3494085532302093, "grad_norm": 0.8376471160704844, "learning_rate": 4.940149937447549e-05, "loss": 0.2608, "step": 768 }, { "epoch": 0.34986351228389445, "grad_norm": 0.7834442602857384, "learning_rate": 4.9399944006990605e-05, "loss": 0.2176, "step": 769 }, { "epoch": 0.3503184713375796, "grad_norm": 0.9533590068104372, "learning_rate": 4.939838664565248e-05, "loss": 0.2334, "step": 770 }, { "epoch": 0.3507734303912648, "grad_norm": 0.765259779409494, "learning_rate": 4.939682729058839e-05, "loss": 0.1759, "step": 771 }, { "epoch": 0.35122838944494994, "grad_norm": 0.7038679499755328, "learning_rate": 4.939526594192574e-05, "loss": 0.1807, "step": 772 }, { "epoch": 0.35168334849863514, "grad_norm": 0.9641356316928706, "learning_rate": 4.939370259979213e-05, "loss": 0.1617, "step": 773 }, { "epoch": 0.3521383075523203, "grad_norm": 1.38000102013683, "learning_rate": 4.9392137264315295e-05, "loss": 0.3819, "step": 774 }, { "epoch": 0.3525932666060055, "grad_norm": 0.7924332662210166, "learning_rate": 4.9390569935623164e-05, "loss": 0.3036, "step": 775 }, { "epoch": 0.35304822565969063, "grad_norm": 1.1696538079437302, "learning_rate": 4.9389000613843805e-05, "loss": 0.195, "step": 776 }, { "epoch": 0.3535031847133758, "grad_norm": 1.1615642417440915, "learning_rate": 4.938742929910546e-05, "loss": 0.2543, "step": 777 }, { "epoch": 0.353958143767061, "grad_norm": 0.5863414759846027, "learning_rate": 4.938585599153652e-05, "loss": 0.1749, "step": 778 }, { "epoch": 0.3544131028207461, "grad_norm": 0.8960359715200923, "learning_rate": 4.938428069126555e-05, "loss": 0.2431, "step": 779 }, { "epoch": 0.3548680618744313, "grad_norm": 0.7330408071865291, "learning_rate": 4.938270339842128e-05, "loss": 0.2216, "step": 780 }, { "epoch": 0.35532302092811646, "grad_norm": 0.9915964620431511, "learning_rate": 4.938112411313261e-05, "loss": 0.2503, "step": 781 }, { "epoch": 0.35577797998180166, "grad_norm": 0.7505212338905825, "learning_rate": 4.937954283552858e-05, "loss": 0.1897, "step": 782 }, { "epoch": 0.3562329390354868, "grad_norm": 1.0037412332734268, "learning_rate": 4.9377959565738396e-05, "loss": 0.2324, "step": 783 }, { "epoch": 0.35668789808917195, "grad_norm": 0.8372273536005587, "learning_rate": 4.937637430389145e-05, "loss": 0.2369, "step": 784 }, { "epoch": 0.35714285714285715, "grad_norm": 0.9460886430032468, "learning_rate": 4.9374787050117284e-05, "loss": 0.2166, "step": 785 }, { "epoch": 0.3575978161965423, "grad_norm": 0.7626176184288666, "learning_rate": 4.937319780454559e-05, "loss": 0.2737, "step": 786 }, { "epoch": 0.3580527752502275, "grad_norm": 0.6200450483963668, "learning_rate": 4.937160656730624e-05, "loss": 0.2057, "step": 787 }, { "epoch": 0.35850773430391264, "grad_norm": 0.9514055412111241, "learning_rate": 4.937001333852927e-05, "loss": 0.2437, "step": 788 }, { "epoch": 0.35896269335759784, "grad_norm": 1.2461904187777066, "learning_rate": 4.9368418118344864e-05, "loss": 0.2553, "step": 789 }, { "epoch": 0.359417652411283, "grad_norm": 0.6132191964650525, "learning_rate": 4.9366820906883365e-05, "loss": 0.1919, "step": 790 }, { "epoch": 0.35987261146496813, "grad_norm": 0.9821143456832055, "learning_rate": 4.936522170427531e-05, "loss": 0.2384, "step": 791 }, { "epoch": 0.36032757051865333, "grad_norm": 1.1220533901205472, "learning_rate": 4.936362051065137e-05, "loss": 0.2416, "step": 792 }, { "epoch": 0.3607825295723385, "grad_norm": 0.9875124816434747, "learning_rate": 4.936201732614238e-05, "loss": 0.2229, "step": 793 }, { "epoch": 0.3612374886260237, "grad_norm": 0.7530489955775127, "learning_rate": 4.936041215087935e-05, "loss": 0.1767, "step": 794 }, { "epoch": 0.3616924476797088, "grad_norm": 0.816578911290518, "learning_rate": 4.935880498499346e-05, "loss": 0.1757, "step": 795 }, { "epoch": 0.362147406733394, "grad_norm": 1.033593251865879, "learning_rate": 4.9357195828616034e-05, "loss": 0.2823, "step": 796 }, { "epoch": 0.36260236578707916, "grad_norm": 0.7799808545723609, "learning_rate": 4.9355584681878546e-05, "loss": 0.2358, "step": 797 }, { "epoch": 0.3630573248407643, "grad_norm": 0.878858693886008, "learning_rate": 4.9353971544912676e-05, "loss": 0.2507, "step": 798 }, { "epoch": 0.3635122838944495, "grad_norm": 0.8102145032840067, "learning_rate": 4.9352356417850235e-05, "loss": 0.239, "step": 799 }, { "epoch": 0.36396724294813465, "grad_norm": 1.1015770181032158, "learning_rate": 4.935073930082319e-05, "loss": 0.2411, "step": 800 }, { "epoch": 0.36442220200181985, "grad_norm": 1.4471391654366006, "learning_rate": 4.934912019396371e-05, "loss": 0.3217, "step": 801 }, { "epoch": 0.364877161055505, "grad_norm": 1.0014035915390782, "learning_rate": 4.934749909740407e-05, "loss": 0.2726, "step": 802 }, { "epoch": 0.3653321201091902, "grad_norm": 0.8842996458030732, "learning_rate": 4.934587601127677e-05, "loss": 0.2065, "step": 803 }, { "epoch": 0.36578707916287534, "grad_norm": 0.8426534241686839, "learning_rate": 4.934425093571443e-05, "loss": 0.2713, "step": 804 }, { "epoch": 0.3662420382165605, "grad_norm": 1.2897416366624703, "learning_rate": 4.9342623870849834e-05, "loss": 0.2642, "step": 805 }, { "epoch": 0.3666969972702457, "grad_norm": 0.8252338135748168, "learning_rate": 4.9340994816815946e-05, "loss": 0.2108, "step": 806 }, { "epoch": 0.36715195632393083, "grad_norm": 0.8624719573841998, "learning_rate": 4.9339363773745884e-05, "loss": 0.1936, "step": 807 }, { "epoch": 0.36760691537761603, "grad_norm": 0.8225210326757177, "learning_rate": 4.933773074177293e-05, "loss": 0.2325, "step": 808 }, { "epoch": 0.3680618744313012, "grad_norm": 1.0917137924182418, "learning_rate": 4.933609572103053e-05, "loss": 0.2375, "step": 809 }, { "epoch": 0.3685168334849864, "grad_norm": 0.9770852542903552, "learning_rate": 4.9334458711652286e-05, "loss": 0.3171, "step": 810 }, { "epoch": 0.3689717925386715, "grad_norm": 0.9874908904305237, "learning_rate": 4.933281971377197e-05, "loss": 0.2224, "step": 811 }, { "epoch": 0.36942675159235666, "grad_norm": 0.7953975565144558, "learning_rate": 4.9331178727523516e-05, "loss": 0.2574, "step": 812 }, { "epoch": 0.36988171064604186, "grad_norm": 1.014046710687757, "learning_rate": 4.932953575304102e-05, "loss": 0.2763, "step": 813 }, { "epoch": 0.370336669699727, "grad_norm": 0.9744068617678555, "learning_rate": 4.932789079045873e-05, "loss": 0.2594, "step": 814 }, { "epoch": 0.3707916287534122, "grad_norm": 3.6666400298066786, "learning_rate": 4.932624383991106e-05, "loss": 0.2466, "step": 815 }, { "epoch": 0.37124658780709735, "grad_norm": 0.9550952967484305, "learning_rate": 4.932459490153261e-05, "loss": 0.2509, "step": 816 }, { "epoch": 0.37170154686078255, "grad_norm": 0.724330623956253, "learning_rate": 4.93229439754581e-05, "loss": 0.2025, "step": 817 }, { "epoch": 0.3721565059144677, "grad_norm": 1.1927085069954713, "learning_rate": 4.932129106182246e-05, "loss": 0.2371, "step": 818 }, { "epoch": 0.37261146496815284, "grad_norm": 1.1137424218176708, "learning_rate": 4.9319636160760754e-05, "loss": 0.2846, "step": 819 }, { "epoch": 0.37306642402183804, "grad_norm": 0.7452064014387284, "learning_rate": 4.93179792724082e-05, "loss": 0.1746, "step": 820 }, { "epoch": 0.3735213830755232, "grad_norm": 0.8610688286872649, "learning_rate": 4.9316320396900193e-05, "loss": 0.231, "step": 821 }, { "epoch": 0.3739763421292084, "grad_norm": 0.8505256804223623, "learning_rate": 4.93146595343723e-05, "loss": 0.2796, "step": 822 }, { "epoch": 0.37443130118289353, "grad_norm": 0.9369680350436137, "learning_rate": 4.9312996684960236e-05, "loss": 0.2063, "step": 823 }, { "epoch": 0.37488626023657873, "grad_norm": 0.9693070668876962, "learning_rate": 4.931133184879988e-05, "loss": 0.2438, "step": 824 }, { "epoch": 0.37534121929026387, "grad_norm": 0.7018123050615585, "learning_rate": 4.9309665026027273e-05, "loss": 0.2194, "step": 825 }, { "epoch": 0.37579617834394907, "grad_norm": 1.7534434822354055, "learning_rate": 4.930799621677863e-05, "loss": 0.2821, "step": 826 }, { "epoch": 0.3762511373976342, "grad_norm": 0.880116841053919, "learning_rate": 4.930632542119029e-05, "loss": 0.2209, "step": 827 }, { "epoch": 0.37670609645131936, "grad_norm": 1.2478995692663526, "learning_rate": 4.930465263939882e-05, "loss": 0.2156, "step": 828 }, { "epoch": 0.37716105550500456, "grad_norm": 3.1902522990466484, "learning_rate": 4.930297787154089e-05, "loss": 0.3108, "step": 829 }, { "epoch": 0.3776160145586897, "grad_norm": 0.8033071138203286, "learning_rate": 4.930130111775336e-05, "loss": 0.1716, "step": 830 }, { "epoch": 0.3780709736123749, "grad_norm": 0.9159248622508217, "learning_rate": 4.9299622378173246e-05, "loss": 0.1932, "step": 831 }, { "epoch": 0.37852593266606005, "grad_norm": 1.8154036229700532, "learning_rate": 4.9297941652937726e-05, "loss": 0.4416, "step": 832 }, { "epoch": 0.37898089171974525, "grad_norm": 0.6289367923978163, "learning_rate": 4.9296258942184145e-05, "loss": 0.1494, "step": 833 }, { "epoch": 0.3794358507734304, "grad_norm": 0.7099581380937244, "learning_rate": 4.9294574246049996e-05, "loss": 0.1643, "step": 834 }, { "epoch": 0.37989080982711554, "grad_norm": 4.327844244651128, "learning_rate": 4.9292887564672964e-05, "loss": 0.2706, "step": 835 }, { "epoch": 0.38034576888080074, "grad_norm": 1.0851512501338119, "learning_rate": 4.929119889819086e-05, "loss": 0.2712, "step": 836 }, { "epoch": 0.3808007279344859, "grad_norm": 1.1812252512026913, "learning_rate": 4.9289508246741685e-05, "loss": 0.3524, "step": 837 }, { "epoch": 0.3812556869881711, "grad_norm": 1.000489939213766, "learning_rate": 4.9287815610463584e-05, "loss": 0.2644, "step": 838 }, { "epoch": 0.3817106460418562, "grad_norm": 1.112597837658351, "learning_rate": 4.9286120989494874e-05, "loss": 0.3193, "step": 839 }, { "epoch": 0.3821656050955414, "grad_norm": 1.0711512521045894, "learning_rate": 4.9284424383974026e-05, "loss": 0.2621, "step": 840 }, { "epoch": 0.38262056414922657, "grad_norm": 0.7667959355942393, "learning_rate": 4.928272579403969e-05, "loss": 0.1792, "step": 841 }, { "epoch": 0.3830755232029117, "grad_norm": 0.7143902803716169, "learning_rate": 4.928102521983067e-05, "loss": 0.2759, "step": 842 }, { "epoch": 0.3835304822565969, "grad_norm": 1.0533561300297432, "learning_rate": 4.92793226614859e-05, "loss": 0.3263, "step": 843 }, { "epoch": 0.38398544131028206, "grad_norm": 1.1475456506593584, "learning_rate": 4.927761811914455e-05, "loss": 0.3071, "step": 844 }, { "epoch": 0.38444040036396726, "grad_norm": 0.7571507112907061, "learning_rate": 4.927591159294587e-05, "loss": 0.2185, "step": 845 }, { "epoch": 0.3848953594176524, "grad_norm": 1.3969662575247135, "learning_rate": 4.927420308302933e-05, "loss": 0.2948, "step": 846 }, { "epoch": 0.3853503184713376, "grad_norm": 0.7735055870798285, "learning_rate": 4.9272492589534536e-05, "loss": 0.2789, "step": 847 }, { "epoch": 0.38580527752502275, "grad_norm": 1.208160158685136, "learning_rate": 4.9270780112601254e-05, "loss": 0.3036, "step": 848 }, { "epoch": 0.3862602365787079, "grad_norm": 0.9674516731033433, "learning_rate": 4.926906565236943e-05, "loss": 0.2923, "step": 849 }, { "epoch": 0.3867151956323931, "grad_norm": 0.8623893212242417, "learning_rate": 4.9267349208979165e-05, "loss": 0.2151, "step": 850 }, { "epoch": 0.38717015468607824, "grad_norm": 2.342477233327166, "learning_rate": 4.9265630782570704e-05, "loss": 0.2671, "step": 851 }, { "epoch": 0.38762511373976344, "grad_norm": 0.7035695601157603, "learning_rate": 4.926391037328448e-05, "loss": 0.2436, "step": 852 }, { "epoch": 0.3880800727934486, "grad_norm": 0.8153530601206586, "learning_rate": 4.9262187981261074e-05, "loss": 0.2428, "step": 853 }, { "epoch": 0.3885350318471338, "grad_norm": 0.9485589568264835, "learning_rate": 4.926046360664124e-05, "loss": 0.1976, "step": 854 }, { "epoch": 0.3889899909008189, "grad_norm": 0.8747535385269428, "learning_rate": 4.9258737249565875e-05, "loss": 0.1862, "step": 855 }, { "epoch": 0.38944494995450407, "grad_norm": 0.8515066533543931, "learning_rate": 4.925700891017606e-05, "loss": 0.3074, "step": 856 }, { "epoch": 0.38989990900818927, "grad_norm": 1.0731034722633095, "learning_rate": 4.9255278588613016e-05, "loss": 0.2474, "step": 857 }, { "epoch": 0.3903548680618744, "grad_norm": 1.0085291384744488, "learning_rate": 4.9253546285018135e-05, "loss": 0.2143, "step": 858 }, { "epoch": 0.3908098271155596, "grad_norm": 0.8263584280282137, "learning_rate": 4.925181199953299e-05, "loss": 0.2172, "step": 859 }, { "epoch": 0.39126478616924476, "grad_norm": 0.9573405264483064, "learning_rate": 4.925007573229928e-05, "loss": 0.2285, "step": 860 }, { "epoch": 0.39171974522292996, "grad_norm": 1.0988270383605436, "learning_rate": 4.9248337483458903e-05, "loss": 0.2364, "step": 861 }, { "epoch": 0.3921747042766151, "grad_norm": 0.8735259344025906, "learning_rate": 4.9246597253153884e-05, "loss": 0.2356, "step": 862 }, { "epoch": 0.39262966333030025, "grad_norm": 0.6542892040020104, "learning_rate": 4.924485504152644e-05, "loss": 0.2121, "step": 863 }, { "epoch": 0.39308462238398545, "grad_norm": 1.2354383237771651, "learning_rate": 4.924311084871892e-05, "loss": 0.2741, "step": 864 }, { "epoch": 0.3935395814376706, "grad_norm": 0.6990265680705534, "learning_rate": 4.924136467487387e-05, "loss": 0.1764, "step": 865 }, { "epoch": 0.3939945404913558, "grad_norm": 0.7709028924557814, "learning_rate": 4.923961652013397e-05, "loss": 0.2162, "step": 866 }, { "epoch": 0.39444949954504094, "grad_norm": 0.9296977878000466, "learning_rate": 4.923786638464207e-05, "loss": 0.2843, "step": 867 }, { "epoch": 0.39490445859872614, "grad_norm": 0.9218588248560363, "learning_rate": 4.92361142685412e-05, "loss": 0.2107, "step": 868 }, { "epoch": 0.3953594176524113, "grad_norm": 0.9380438388326167, "learning_rate": 4.923436017197451e-05, "loss": 0.2559, "step": 869 }, { "epoch": 0.3958143767060964, "grad_norm": 0.9491531754287603, "learning_rate": 4.923260409508534e-05, "loss": 0.1742, "step": 870 }, { "epoch": 0.3962693357597816, "grad_norm": 0.7304177134091459, "learning_rate": 4.923084603801721e-05, "loss": 0.2564, "step": 871 }, { "epoch": 0.39672429481346677, "grad_norm": 0.8431845100741889, "learning_rate": 4.9229086000913773e-05, "loss": 0.2002, "step": 872 }, { "epoch": 0.39717925386715197, "grad_norm": 0.7576934636961222, "learning_rate": 4.922732398391883e-05, "loss": 0.2576, "step": 873 }, { "epoch": 0.3976342129208371, "grad_norm": 0.7429081150365122, "learning_rate": 4.922555998717639e-05, "loss": 0.1824, "step": 874 }, { "epoch": 0.3980891719745223, "grad_norm": 0.9386765983406252, "learning_rate": 4.922379401083058e-05, "loss": 0.2228, "step": 875 }, { "epoch": 0.39854413102820746, "grad_norm": 1.3703445244688306, "learning_rate": 4.922202605502573e-05, "loss": 0.32, "step": 876 }, { "epoch": 0.3989990900818926, "grad_norm": 0.6453955725166932, "learning_rate": 4.922025611990629e-05, "loss": 0.1995, "step": 877 }, { "epoch": 0.3994540491355778, "grad_norm": 0.6952068828627649, "learning_rate": 4.92184842056169e-05, "loss": 0.2238, "step": 878 }, { "epoch": 0.39990900818926295, "grad_norm": 0.883708178069293, "learning_rate": 4.921671031230234e-05, "loss": 0.2182, "step": 879 }, { "epoch": 0.40036396724294815, "grad_norm": 0.8954637002762277, "learning_rate": 4.921493444010759e-05, "loss": 0.2309, "step": 880 }, { "epoch": 0.4008189262966333, "grad_norm": 0.7284555211439814, "learning_rate": 4.9213156589177746e-05, "loss": 0.22, "step": 881 }, { "epoch": 0.4012738853503185, "grad_norm": 0.9502197899990069, "learning_rate": 4.921137675965809e-05, "loss": 0.2164, "step": 882 }, { "epoch": 0.40172884440400364, "grad_norm": 0.8584628557388456, "learning_rate": 4.920959495169406e-05, "loss": 0.2366, "step": 883 }, { "epoch": 0.4021838034576888, "grad_norm": 1.0371020037418903, "learning_rate": 4.9207811165431264e-05, "loss": 0.2959, "step": 884 }, { "epoch": 0.402638762511374, "grad_norm": 0.6629644220584772, "learning_rate": 4.9206025401015455e-05, "loss": 0.161, "step": 885 }, { "epoch": 0.4030937215650591, "grad_norm": 1.2497702992902513, "learning_rate": 4.920423765859257e-05, "loss": 0.2846, "step": 886 }, { "epoch": 0.4035486806187443, "grad_norm": 0.8235723161752649, "learning_rate": 4.9202447938308686e-05, "loss": 0.2356, "step": 887 }, { "epoch": 0.40400363967242947, "grad_norm": 0.8012437423279374, "learning_rate": 4.920065624031006e-05, "loss": 0.2246, "step": 888 }, { "epoch": 0.40445859872611467, "grad_norm": 0.6925516387247307, "learning_rate": 4.919886256474309e-05, "loss": 0.214, "step": 889 }, { "epoch": 0.4049135577797998, "grad_norm": 0.8249547835771969, "learning_rate": 4.919706691175435e-05, "loss": 0.2114, "step": 890 }, { "epoch": 0.40536851683348496, "grad_norm": 0.9184202193550818, "learning_rate": 4.919526928149057e-05, "loss": 0.3217, "step": 891 }, { "epoch": 0.40582347588717016, "grad_norm": 0.7328034060267359, "learning_rate": 4.9193469674098667e-05, "loss": 0.1952, "step": 892 }, { "epoch": 0.4062784349408553, "grad_norm": 0.7322631049916856, "learning_rate": 4.919166808972567e-05, "loss": 0.289, "step": 893 }, { "epoch": 0.4067333939945405, "grad_norm": 1.2559506511280694, "learning_rate": 4.91898645285188e-05, "loss": 0.2832, "step": 894 }, { "epoch": 0.40718835304822565, "grad_norm": 1.0231241024767233, "learning_rate": 4.9188058990625445e-05, "loss": 0.2574, "step": 895 }, { "epoch": 0.40764331210191085, "grad_norm": 0.598733777612073, "learning_rate": 4.918625147619315e-05, "loss": 0.2053, "step": 896 }, { "epoch": 0.408098271155596, "grad_norm": 0.7783451939467328, "learning_rate": 4.91844419853696e-05, "loss": 0.2211, "step": 897 }, { "epoch": 0.40855323020928114, "grad_norm": 0.7755447992817268, "learning_rate": 4.918263051830267e-05, "loss": 0.1658, "step": 898 }, { "epoch": 0.40900818926296634, "grad_norm": 0.9553645461143808, "learning_rate": 4.9180817075140376e-05, "loss": 0.1969, "step": 899 }, { "epoch": 0.4094631483166515, "grad_norm": 0.8406985595129353, "learning_rate": 4.917900165603091e-05, "loss": 0.1927, "step": 900 }, { "epoch": 0.4099181073703367, "grad_norm": 1.0297069097638345, "learning_rate": 4.917718426112262e-05, "loss": 0.2139, "step": 901 }, { "epoch": 0.4103730664240218, "grad_norm": 1.2981642635667683, "learning_rate": 4.9175364890564015e-05, "loss": 0.2242, "step": 902 }, { "epoch": 0.410828025477707, "grad_norm": 0.8638962009852217, "learning_rate": 4.917354354450378e-05, "loss": 0.1745, "step": 903 }, { "epoch": 0.41128298453139217, "grad_norm": 0.7881868674496092, "learning_rate": 4.9171720223090715e-05, "loss": 0.1821, "step": 904 }, { "epoch": 0.41173794358507737, "grad_norm": 0.8949031519984939, "learning_rate": 4.9169894926473845e-05, "loss": 0.1766, "step": 905 }, { "epoch": 0.4121929026387625, "grad_norm": 0.8314582477111235, "learning_rate": 4.916806765480231e-05, "loss": 0.1568, "step": 906 }, { "epoch": 0.41264786169244766, "grad_norm": 0.9866327299291593, "learning_rate": 4.9166238408225416e-05, "loss": 0.2782, "step": 907 }, { "epoch": 0.41310282074613286, "grad_norm": 1.1018092455495974, "learning_rate": 4.916440718689267e-05, "loss": 0.2232, "step": 908 }, { "epoch": 0.413557779799818, "grad_norm": 0.7954404927608166, "learning_rate": 4.9162573990953686e-05, "loss": 0.2019, "step": 909 }, { "epoch": 0.4140127388535032, "grad_norm": 0.8700015504770355, "learning_rate": 4.916073882055827e-05, "loss": 0.1952, "step": 910 }, { "epoch": 0.41446769790718835, "grad_norm": 0.9788237170102215, "learning_rate": 4.9158901675856395e-05, "loss": 0.2696, "step": 911 }, { "epoch": 0.41492265696087355, "grad_norm": 0.782156205303467, "learning_rate": 4.9157062556998166e-05, "loss": 0.2007, "step": 912 }, { "epoch": 0.4153776160145587, "grad_norm": 1.2831847539888002, "learning_rate": 4.915522146413388e-05, "loss": 0.3806, "step": 913 }, { "epoch": 0.41583257506824384, "grad_norm": 0.7920116815416954, "learning_rate": 4.9153378397413984e-05, "loss": 0.238, "step": 914 }, { "epoch": 0.41628753412192904, "grad_norm": 0.7753175241732486, "learning_rate": 4.915153335698908e-05, "loss": 0.2372, "step": 915 }, { "epoch": 0.4167424931756142, "grad_norm": 0.9224788925539529, "learning_rate": 4.914968634300994e-05, "loss": 0.3007, "step": 916 }, { "epoch": 0.4171974522292994, "grad_norm": 0.7782273800409717, "learning_rate": 4.914783735562748e-05, "loss": 0.2016, "step": 917 }, { "epoch": 0.4176524112829845, "grad_norm": 0.7337499720126937, "learning_rate": 4.914598639499281e-05, "loss": 0.2391, "step": 918 }, { "epoch": 0.4181073703366697, "grad_norm": 1.1225430318415532, "learning_rate": 4.914413346125717e-05, "loss": 0.1478, "step": 919 }, { "epoch": 0.41856232939035487, "grad_norm": 0.9471073415184693, "learning_rate": 4.9142278554571985e-05, "loss": 0.2475, "step": 920 }, { "epoch": 0.41901728844404, "grad_norm": 1.149566961485527, "learning_rate": 4.9140421675088815e-05, "loss": 0.2012, "step": 921 }, { "epoch": 0.4194722474977252, "grad_norm": 1.0176698773947965, "learning_rate": 4.913856282295941e-05, "loss": 0.2325, "step": 922 }, { "epoch": 0.41992720655141036, "grad_norm": 1.04725173254006, "learning_rate": 4.9136701998335654e-05, "loss": 0.2268, "step": 923 }, { "epoch": 0.42038216560509556, "grad_norm": 1.4335029903058476, "learning_rate": 4.913483920136961e-05, "loss": 0.2677, "step": 924 }, { "epoch": 0.4208371246587807, "grad_norm": 0.9409599556318122, "learning_rate": 4.9132974432213504e-05, "loss": 0.2149, "step": 925 }, { "epoch": 0.4212920837124659, "grad_norm": 0.8118608912136547, "learning_rate": 4.9131107691019704e-05, "loss": 0.2269, "step": 926 }, { "epoch": 0.42174704276615105, "grad_norm": 0.9674113879611551, "learning_rate": 4.9129238977940766e-05, "loss": 0.2408, "step": 927 }, { "epoch": 0.4222020018198362, "grad_norm": 0.6813432599306817, "learning_rate": 4.9127368293129384e-05, "loss": 0.2023, "step": 928 }, { "epoch": 0.4226569608735214, "grad_norm": 0.7981152360241318, "learning_rate": 4.912549563673842e-05, "loss": 0.2305, "step": 929 }, { "epoch": 0.42311191992720654, "grad_norm": 0.6409103251371974, "learning_rate": 4.9123621008920906e-05, "loss": 0.2059, "step": 930 }, { "epoch": 0.42356687898089174, "grad_norm": 0.8363004210306043, "learning_rate": 4.9121744409830015e-05, "loss": 0.2233, "step": 931 }, { "epoch": 0.4240218380345769, "grad_norm": 1.0853303928216167, "learning_rate": 4.911986583961912e-05, "loss": 0.3498, "step": 932 }, { "epoch": 0.4244767970882621, "grad_norm": 0.9985980579566411, "learning_rate": 4.91179852984417e-05, "loss": 0.2975, "step": 933 }, { "epoch": 0.4249317561419472, "grad_norm": 0.7967407154627208, "learning_rate": 4.911610278645144e-05, "loss": 0.2119, "step": 934 }, { "epoch": 0.42538671519563237, "grad_norm": 0.8646707095339231, "learning_rate": 4.911421830380217e-05, "loss": 0.1701, "step": 935 }, { "epoch": 0.42584167424931757, "grad_norm": 0.808416401539496, "learning_rate": 4.911233185064787e-05, "loss": 0.2666, "step": 936 }, { "epoch": 0.4262966333030027, "grad_norm": 0.9451839182963364, "learning_rate": 4.911044342714272e-05, "loss": 0.1655, "step": 937 }, { "epoch": 0.4267515923566879, "grad_norm": 0.9826776779404278, "learning_rate": 4.9108553033440994e-05, "loss": 0.189, "step": 938 }, { "epoch": 0.42720655141037306, "grad_norm": 0.9506190390751013, "learning_rate": 4.91066606696972e-05, "loss": 0.2321, "step": 939 }, { "epoch": 0.42766151046405826, "grad_norm": 0.9313756936819222, "learning_rate": 4.9104766336065965e-05, "loss": 0.2399, "step": 940 }, { "epoch": 0.4281164695177434, "grad_norm": 0.822212817230545, "learning_rate": 4.910287003270208e-05, "loss": 0.2265, "step": 941 }, { "epoch": 0.42857142857142855, "grad_norm": 0.7425311259858757, "learning_rate": 4.910097175976049e-05, "loss": 0.2358, "step": 942 }, { "epoch": 0.42902638762511375, "grad_norm": 1.02073524728647, "learning_rate": 4.909907151739633e-05, "loss": 0.2474, "step": 943 }, { "epoch": 0.4294813466787989, "grad_norm": 1.4117902586105977, "learning_rate": 4.909716930576489e-05, "loss": 0.2869, "step": 944 }, { "epoch": 0.4299363057324841, "grad_norm": 0.882575224129149, "learning_rate": 4.9095265125021584e-05, "loss": 0.2179, "step": 945 }, { "epoch": 0.43039126478616924, "grad_norm": 1.0188920312930905, "learning_rate": 4.909335897532202e-05, "loss": 0.2293, "step": 946 }, { "epoch": 0.43084622383985444, "grad_norm": 1.6200753244956199, "learning_rate": 4.909145085682198e-05, "loss": 0.3462, "step": 947 }, { "epoch": 0.4313011828935396, "grad_norm": 0.9963931704767716, "learning_rate": 4.9089540769677366e-05, "loss": 0.2116, "step": 948 }, { "epoch": 0.4317561419472247, "grad_norm": 1.0769829835475282, "learning_rate": 4.9087628714044266e-05, "loss": 0.3572, "step": 949 }, { "epoch": 0.4322111010009099, "grad_norm": 0.8184276922485177, "learning_rate": 4.908571469007893e-05, "loss": 0.2609, "step": 950 }, { "epoch": 0.43266606005459507, "grad_norm": 0.9142678479187198, "learning_rate": 4.9083798697937754e-05, "loss": 0.2163, "step": 951 }, { "epoch": 0.43312101910828027, "grad_norm": 0.943811274498386, "learning_rate": 4.908188073777732e-05, "loss": 0.2631, "step": 952 }, { "epoch": 0.4335759781619654, "grad_norm": 0.8948634710112429, "learning_rate": 4.907996080975433e-05, "loss": 0.2423, "step": 953 }, { "epoch": 0.4340309372156506, "grad_norm": 0.880734647041023, "learning_rate": 4.9078038914025695e-05, "loss": 0.2002, "step": 954 }, { "epoch": 0.43448589626933576, "grad_norm": 1.2362304259527994, "learning_rate": 4.907611505074846e-05, "loss": 0.2685, "step": 955 }, { "epoch": 0.4349408553230209, "grad_norm": 0.9862858823724424, "learning_rate": 4.907418922007982e-05, "loss": 0.1975, "step": 956 }, { "epoch": 0.4353958143767061, "grad_norm": 1.4025166658185078, "learning_rate": 4.907226142217716e-05, "loss": 0.2196, "step": 957 }, { "epoch": 0.43585077343039125, "grad_norm": 1.0726839381373077, "learning_rate": 4.907033165719801e-05, "loss": 0.2409, "step": 958 }, { "epoch": 0.43630573248407645, "grad_norm": 0.899798748985714, "learning_rate": 4.9068399925300055e-05, "loss": 0.2696, "step": 959 }, { "epoch": 0.4367606915377616, "grad_norm": 0.851752515774608, "learning_rate": 4.906646622664115e-05, "loss": 0.2546, "step": 960 }, { "epoch": 0.4372156505914468, "grad_norm": 1.2658378280379758, "learning_rate": 4.9064530561379305e-05, "loss": 0.1971, "step": 961 }, { "epoch": 0.43767060964513194, "grad_norm": 0.9598388956897335, "learning_rate": 4.90625929296727e-05, "loss": 0.3262, "step": 962 }, { "epoch": 0.4381255686988171, "grad_norm": 0.9976585075216508, "learning_rate": 4.9060653331679665e-05, "loss": 0.2849, "step": 963 }, { "epoch": 0.4385805277525023, "grad_norm": 0.8235457659387697, "learning_rate": 4.9058711767558694e-05, "loss": 0.2185, "step": 964 }, { "epoch": 0.4390354868061874, "grad_norm": 0.7475595452128824, "learning_rate": 4.905676823746845e-05, "loss": 0.1985, "step": 965 }, { "epoch": 0.4394904458598726, "grad_norm": 0.5974221511986432, "learning_rate": 4.905482274156774e-05, "loss": 0.1838, "step": 966 }, { "epoch": 0.43994540491355777, "grad_norm": 0.9768618560401001, "learning_rate": 4.905287528001555e-05, "loss": 0.2008, "step": 967 }, { "epoch": 0.44040036396724297, "grad_norm": 0.9214269249469357, "learning_rate": 4.905092585297102e-05, "loss": 0.2239, "step": 968 }, { "epoch": 0.4408553230209281, "grad_norm": 1.0318881985593285, "learning_rate": 4.9048974460593436e-05, "loss": 0.2221, "step": 969 }, { "epoch": 0.44131028207461326, "grad_norm": 1.7355532421943394, "learning_rate": 4.904702110304226e-05, "loss": 0.2287, "step": 970 }, { "epoch": 0.44176524112829846, "grad_norm": 1.4509482406864085, "learning_rate": 4.9045065780477117e-05, "loss": 0.2425, "step": 971 }, { "epoch": 0.4422202001819836, "grad_norm": 0.9532445419601329, "learning_rate": 4.904310849305779e-05, "loss": 0.2325, "step": 972 }, { "epoch": 0.4426751592356688, "grad_norm": 1.5730073688332673, "learning_rate": 4.904114924094421e-05, "loss": 0.26, "step": 973 }, { "epoch": 0.44313011828935395, "grad_norm": 1.0178605252750241, "learning_rate": 4.903918802429648e-05, "loss": 0.2929, "step": 974 }, { "epoch": 0.44358507734303915, "grad_norm": 0.8082167882377479, "learning_rate": 4.903722484327487e-05, "loss": 0.2644, "step": 975 }, { "epoch": 0.4440400363967243, "grad_norm": 1.2053356760378824, "learning_rate": 4.903525969803979e-05, "loss": 0.375, "step": 976 }, { "epoch": 0.44449499545040944, "grad_norm": 0.7971578003154205, "learning_rate": 4.903329258875185e-05, "loss": 0.1721, "step": 977 }, { "epoch": 0.44494995450409464, "grad_norm": 0.8844575956737264, "learning_rate": 4.9031323515571746e-05, "loss": 0.2077, "step": 978 }, { "epoch": 0.4454049135577798, "grad_norm": 8.682910536592209, "learning_rate": 4.902935247866042e-05, "loss": 0.212, "step": 979 }, { "epoch": 0.445859872611465, "grad_norm": 1.3317536077439764, "learning_rate": 4.902737947817893e-05, "loss": 0.2417, "step": 980 }, { "epoch": 0.4463148316651501, "grad_norm": 0.6626516597833116, "learning_rate": 4.902540451428849e-05, "loss": 0.1444, "step": 981 }, { "epoch": 0.4467697907188353, "grad_norm": 0.9298337685427198, "learning_rate": 4.9023427587150496e-05, "loss": 0.1908, "step": 982 }, { "epoch": 0.44722474977252047, "grad_norm": 1.191318887497276, "learning_rate": 4.902144869692649e-05, "loss": 0.2402, "step": 983 }, { "epoch": 0.44767970882620567, "grad_norm": 0.686457648674905, "learning_rate": 4.901946784377817e-05, "loss": 0.2371, "step": 984 }, { "epoch": 0.4481346678798908, "grad_norm": 0.7046291004272071, "learning_rate": 4.9017485027867406e-05, "loss": 0.2149, "step": 985 }, { "epoch": 0.44858962693357596, "grad_norm": 0.916983847557995, "learning_rate": 4.9015500249356225e-05, "loss": 0.2498, "step": 986 }, { "epoch": 0.44904458598726116, "grad_norm": 0.6071674300600297, "learning_rate": 4.901351350840683e-05, "loss": 0.2027, "step": 987 }, { "epoch": 0.4494995450409463, "grad_norm": 1.6072708968387832, "learning_rate": 4.9011524805181546e-05, "loss": 0.235, "step": 988 }, { "epoch": 0.4499545040946315, "grad_norm": 0.815255028928963, "learning_rate": 4.9009534139842895e-05, "loss": 0.2231, "step": 989 }, { "epoch": 0.45040946314831665, "grad_norm": 0.6936429185347536, "learning_rate": 4.9007541512553535e-05, "loss": 0.2531, "step": 990 }, { "epoch": 0.45086442220200185, "grad_norm": 1.0007501647579404, "learning_rate": 4.9005546923476305e-05, "loss": 0.2623, "step": 991 }, { "epoch": 0.451319381255687, "grad_norm": 0.8419869528725192, "learning_rate": 4.900355037277419e-05, "loss": 0.1976, "step": 992 }, { "epoch": 0.45177434030937214, "grad_norm": 0.9522606875436962, "learning_rate": 4.900155186061033e-05, "loss": 0.2069, "step": 993 }, { "epoch": 0.45222929936305734, "grad_norm": 0.9406215090397574, "learning_rate": 4.899955138714805e-05, "loss": 0.2469, "step": 994 }, { "epoch": 0.4526842584167425, "grad_norm": 1.0918706150749706, "learning_rate": 4.89975489525508e-05, "loss": 0.2478, "step": 995 }, { "epoch": 0.4531392174704277, "grad_norm": 0.91175491610584, "learning_rate": 4.899554455698223e-05, "loss": 0.2183, "step": 996 }, { "epoch": 0.4535941765241128, "grad_norm": 0.8848398926900624, "learning_rate": 4.899353820060612e-05, "loss": 0.241, "step": 997 }, { "epoch": 0.454049135577798, "grad_norm": 0.9692418718499057, "learning_rate": 4.899152988358642e-05, "loss": 0.3062, "step": 998 }, { "epoch": 0.45450409463148317, "grad_norm": 0.7886391507707521, "learning_rate": 4.898951960608725e-05, "loss": 0.2257, "step": 999 }, { "epoch": 0.4549590536851683, "grad_norm": 0.7807437383123544, "learning_rate": 4.898750736827287e-05, "loss": 0.2314, "step": 1000 }, { "epoch": 0.4554140127388535, "grad_norm": 0.8765275543205044, "learning_rate": 4.898549317030771e-05, "loss": 0.2382, "step": 1001 }, { "epoch": 0.45586897179253866, "grad_norm": 0.8271324592930718, "learning_rate": 4.8983477012356374e-05, "loss": 0.1944, "step": 1002 }, { "epoch": 0.45632393084622386, "grad_norm": 0.8515590062307445, "learning_rate": 4.89814588945836e-05, "loss": 0.2275, "step": 1003 }, { "epoch": 0.456778889899909, "grad_norm": 0.8445130797438481, "learning_rate": 4.89794388171543e-05, "loss": 0.2258, "step": 1004 }, { "epoch": 0.4572338489535942, "grad_norm": 1.0529013961148483, "learning_rate": 4.897741678023355e-05, "loss": 0.2628, "step": 1005 }, { "epoch": 0.45768880800727935, "grad_norm": 1.0114150037005099, "learning_rate": 4.897539278398659e-05, "loss": 0.2518, "step": 1006 }, { "epoch": 0.4581437670609645, "grad_norm": 1.0140248272602872, "learning_rate": 4.8973366828578804e-05, "loss": 0.2796, "step": 1007 }, { "epoch": 0.4585987261146497, "grad_norm": 0.8639020710867072, "learning_rate": 4.897133891417574e-05, "loss": 0.2318, "step": 1008 }, { "epoch": 0.45905368516833484, "grad_norm": 1.003180686542684, "learning_rate": 4.896930904094311e-05, "loss": 0.2264, "step": 1009 }, { "epoch": 0.45950864422202004, "grad_norm": 1.142824098926668, "learning_rate": 4.8967277209046795e-05, "loss": 0.2713, "step": 1010 }, { "epoch": 0.4599636032757052, "grad_norm": 0.5750406319173392, "learning_rate": 4.896524341865282e-05, "loss": 0.1255, "step": 1011 }, { "epoch": 0.4604185623293904, "grad_norm": 0.8800426024836312, "learning_rate": 4.896320766992737e-05, "loss": 0.2523, "step": 1012 }, { "epoch": 0.4608735213830755, "grad_norm": 0.6762602662483737, "learning_rate": 4.8961169963036816e-05, "loss": 0.1942, "step": 1013 }, { "epoch": 0.46132848043676067, "grad_norm": 1.3124625398986691, "learning_rate": 4.895913029814766e-05, "loss": 0.2366, "step": 1014 }, { "epoch": 0.46178343949044587, "grad_norm": 0.9218586110779193, "learning_rate": 4.895708867542658e-05, "loss": 0.2744, "step": 1015 }, { "epoch": 0.462238398544131, "grad_norm": 0.8629702450026177, "learning_rate": 4.895504509504039e-05, "loss": 0.231, "step": 1016 }, { "epoch": 0.4626933575978162, "grad_norm": 0.7572020149552082, "learning_rate": 4.89529995571561e-05, "loss": 0.2119, "step": 1017 }, { "epoch": 0.46314831665150136, "grad_norm": 0.7226282784642739, "learning_rate": 4.895095206194086e-05, "loss": 0.2059, "step": 1018 }, { "epoch": 0.46360327570518656, "grad_norm": 0.7234795836904321, "learning_rate": 4.8948902609561975e-05, "loss": 0.1846, "step": 1019 }, { "epoch": 0.4640582347588717, "grad_norm": 1.2367193745649907, "learning_rate": 4.894685120018692e-05, "loss": 0.2669, "step": 1020 }, { "epoch": 0.46451319381255685, "grad_norm": 1.1958012439157037, "learning_rate": 4.894479783398334e-05, "loss": 0.3107, "step": 1021 }, { "epoch": 0.46496815286624205, "grad_norm": 1.163517556730293, "learning_rate": 4.8942742511119e-05, "loss": 0.3109, "step": 1022 }, { "epoch": 0.4654231119199272, "grad_norm": 1.3388573915166198, "learning_rate": 4.894068523176187e-05, "loss": 0.3599, "step": 1023 }, { "epoch": 0.4658780709736124, "grad_norm": 0.9455743200291276, "learning_rate": 4.893862599608006e-05, "loss": 0.267, "step": 1024 }, { "epoch": 0.46633303002729753, "grad_norm": 0.8069308690970114, "learning_rate": 4.8936564804241835e-05, "loss": 0.2313, "step": 1025 }, { "epoch": 0.46678798908098273, "grad_norm": 0.9929962333380147, "learning_rate": 4.8934501656415636e-05, "loss": 0.2118, "step": 1026 }, { "epoch": 0.4672429481346679, "grad_norm": 0.5565267466800858, "learning_rate": 4.8932436552770053e-05, "loss": 0.1993, "step": 1027 }, { "epoch": 0.467697907188353, "grad_norm": 0.9786542475079227, "learning_rate": 4.893036949347383e-05, "loss": 0.2518, "step": 1028 }, { "epoch": 0.4681528662420382, "grad_norm": 0.972082549859437, "learning_rate": 4.8928300478695876e-05, "loss": 0.2599, "step": 1029 }, { "epoch": 0.46860782529572337, "grad_norm": 1.1234264043308753, "learning_rate": 4.892622950860527e-05, "loss": 0.2397, "step": 1030 }, { "epoch": 0.46906278434940857, "grad_norm": 0.6324315018222074, "learning_rate": 4.8924156583371236e-05, "loss": 0.1933, "step": 1031 }, { "epoch": 0.4695177434030937, "grad_norm": 1.0263089532195, "learning_rate": 4.8922081703163166e-05, "loss": 0.2678, "step": 1032 }, { "epoch": 0.4699727024567789, "grad_norm": 0.8208255371876227, "learning_rate": 4.8920004868150614e-05, "loss": 0.1929, "step": 1033 }, { "epoch": 0.47042766151046406, "grad_norm": 0.7931758930012907, "learning_rate": 4.8917926078503284e-05, "loss": 0.2121, "step": 1034 }, { "epoch": 0.4708826205641492, "grad_norm": 1.108839122884992, "learning_rate": 4.8915845334391043e-05, "loss": 0.3123, "step": 1035 }, { "epoch": 0.4713375796178344, "grad_norm": 0.754564954461654, "learning_rate": 4.891376263598393e-05, "loss": 0.1917, "step": 1036 }, { "epoch": 0.47179253867151955, "grad_norm": 0.8158372435947269, "learning_rate": 4.891167798345213e-05, "loss": 0.1951, "step": 1037 }, { "epoch": 0.47224749772520475, "grad_norm": 1.4093518287721674, "learning_rate": 4.890959137696598e-05, "loss": 0.3491, "step": 1038 }, { "epoch": 0.4727024567788899, "grad_norm": 0.7996698502306632, "learning_rate": 4.8907502816696006e-05, "loss": 0.2005, "step": 1039 }, { "epoch": 0.4731574158325751, "grad_norm": 0.7701069380803095, "learning_rate": 4.890541230281286e-05, "loss": 0.1761, "step": 1040 }, { "epoch": 0.47361237488626023, "grad_norm": 0.8592122837427122, "learning_rate": 4.890331983548738e-05, "loss": 0.2178, "step": 1041 }, { "epoch": 0.4740673339399454, "grad_norm": 0.7829361295730538, "learning_rate": 4.8901225414890555e-05, "loss": 0.3064, "step": 1042 }, { "epoch": 0.4745222929936306, "grad_norm": 0.6021190980233786, "learning_rate": 4.889912904119352e-05, "loss": 0.2187, "step": 1043 }, { "epoch": 0.4749772520473157, "grad_norm": 0.9589203628488044, "learning_rate": 4.889703071456759e-05, "loss": 0.2428, "step": 1044 }, { "epoch": 0.4754322111010009, "grad_norm": 0.7574665896328514, "learning_rate": 4.8894930435184224e-05, "loss": 0.2688, "step": 1045 }, { "epoch": 0.47588717015468607, "grad_norm": 0.8944574431411412, "learning_rate": 4.889282820321506e-05, "loss": 0.1718, "step": 1046 }, { "epoch": 0.47634212920837127, "grad_norm": 1.096151524381797, "learning_rate": 4.889072401883187e-05, "loss": 0.3288, "step": 1047 }, { "epoch": 0.4767970882620564, "grad_norm": 0.8050236843081586, "learning_rate": 4.88886178822066e-05, "loss": 0.2541, "step": 1048 }, { "epoch": 0.47725204731574156, "grad_norm": 0.8879431053566677, "learning_rate": 4.888650979351136e-05, "loss": 0.2641, "step": 1049 }, { "epoch": 0.47770700636942676, "grad_norm": 0.9809721909896907, "learning_rate": 4.888439975291841e-05, "loss": 0.3384, "step": 1050 }, { "epoch": 0.4781619654231119, "grad_norm": 0.7376157631994235, "learning_rate": 4.888228776060016e-05, "loss": 0.2526, "step": 1051 }, { "epoch": 0.4786169244767971, "grad_norm": 0.8008108173603419, "learning_rate": 4.8880173816729225e-05, "loss": 0.2199, "step": 1052 }, { "epoch": 0.47907188353048225, "grad_norm": 0.8075148819400902, "learning_rate": 4.887805792147832e-05, "loss": 0.2115, "step": 1053 }, { "epoch": 0.47952684258416745, "grad_norm": 1.0516017331113456, "learning_rate": 4.887594007502035e-05, "loss": 0.2386, "step": 1054 }, { "epoch": 0.4799818016378526, "grad_norm": 0.9626099881621293, "learning_rate": 4.887382027752838e-05, "loss": 0.2603, "step": 1055 }, { "epoch": 0.48043676069153773, "grad_norm": 0.8842761065099104, "learning_rate": 4.887169852917564e-05, "loss": 0.2335, "step": 1056 }, { "epoch": 0.48089171974522293, "grad_norm": 0.9991273206545239, "learning_rate": 4.8869574830135486e-05, "loss": 0.2669, "step": 1057 }, { "epoch": 0.4813466787989081, "grad_norm": 0.7423264223657465, "learning_rate": 4.886744918058148e-05, "loss": 0.2583, "step": 1058 }, { "epoch": 0.4818016378525933, "grad_norm": 1.1305515411817775, "learning_rate": 4.886532158068732e-05, "loss": 0.2662, "step": 1059 }, { "epoch": 0.4822565969062784, "grad_norm": 0.7907135365184496, "learning_rate": 4.8863192030626836e-05, "loss": 0.1951, "step": 1060 }, { "epoch": 0.4827115559599636, "grad_norm": 1.0291042841104086, "learning_rate": 4.8861060530574075e-05, "loss": 0.2248, "step": 1061 }, { "epoch": 0.48316651501364877, "grad_norm": 1.0001647639024973, "learning_rate": 4.885892708070321e-05, "loss": 0.2457, "step": 1062 }, { "epoch": 0.48362147406733397, "grad_norm": 0.7828386022327105, "learning_rate": 4.885679168118855e-05, "loss": 0.2519, "step": 1063 }, { "epoch": 0.4840764331210191, "grad_norm": 0.8528270348387023, "learning_rate": 4.885465433220463e-05, "loss": 0.2326, "step": 1064 }, { "epoch": 0.48453139217470426, "grad_norm": 0.8727303148794464, "learning_rate": 4.885251503392607e-05, "loss": 0.1849, "step": 1065 }, { "epoch": 0.48498635122838946, "grad_norm": 1.0323872563239376, "learning_rate": 4.885037378652771e-05, "loss": 0.3024, "step": 1066 }, { "epoch": 0.4854413102820746, "grad_norm": 1.0188932780661109, "learning_rate": 4.884823059018451e-05, "loss": 0.1954, "step": 1067 }, { "epoch": 0.4858962693357598, "grad_norm": 1.0151859952719173, "learning_rate": 4.88460854450716e-05, "loss": 0.2405, "step": 1068 }, { "epoch": 0.48635122838944495, "grad_norm": 1.0796458447997055, "learning_rate": 4.884393835136427e-05, "loss": 0.3113, "step": 1069 }, { "epoch": 0.48680618744313015, "grad_norm": 1.1652366283867859, "learning_rate": 4.884178930923799e-05, "loss": 0.2685, "step": 1070 }, { "epoch": 0.4872611464968153, "grad_norm": 1.0580123391746836, "learning_rate": 4.883963831886834e-05, "loss": 0.3209, "step": 1071 }, { "epoch": 0.48771610555050043, "grad_norm": 0.8056344126771602, "learning_rate": 4.883748538043111e-05, "loss": 0.229, "step": 1072 }, { "epoch": 0.48817106460418563, "grad_norm": 0.8827137737881184, "learning_rate": 4.883533049410223e-05, "loss": 0.2485, "step": 1073 }, { "epoch": 0.4886260236578708, "grad_norm": 1.532862888979465, "learning_rate": 4.883317366005778e-05, "loss": 0.3715, "step": 1074 }, { "epoch": 0.489080982711556, "grad_norm": 1.1046238964070791, "learning_rate": 4.8831014878474e-05, "loss": 0.2347, "step": 1075 }, { "epoch": 0.4895359417652411, "grad_norm": 0.9263995038682171, "learning_rate": 4.882885414952731e-05, "loss": 0.2501, "step": 1076 }, { "epoch": 0.4899909008189263, "grad_norm": 0.7911126023632093, "learning_rate": 4.882669147339427e-05, "loss": 0.2, "step": 1077 }, { "epoch": 0.49044585987261147, "grad_norm": 0.7168567268441636, "learning_rate": 4.882452685025161e-05, "loss": 0.17, "step": 1078 }, { "epoch": 0.4909008189262966, "grad_norm": 1.062689195415948, "learning_rate": 4.88223602802762e-05, "loss": 0.2508, "step": 1079 }, { "epoch": 0.4913557779799818, "grad_norm": 0.8146971490550287, "learning_rate": 4.882019176364509e-05, "loss": 0.2241, "step": 1080 }, { "epoch": 0.49181073703366696, "grad_norm": 0.7474634277259916, "learning_rate": 4.881802130053548e-05, "loss": 0.2275, "step": 1081 }, { "epoch": 0.49226569608735216, "grad_norm": 0.7943011026801956, "learning_rate": 4.8815848891124725e-05, "loss": 0.1734, "step": 1082 }, { "epoch": 0.4927206551410373, "grad_norm": 0.9183541974559059, "learning_rate": 4.8813674535590356e-05, "loss": 0.2596, "step": 1083 }, { "epoch": 0.4931756141947225, "grad_norm": 0.9036934030115955, "learning_rate": 4.881149823411004e-05, "loss": 0.2255, "step": 1084 }, { "epoch": 0.49363057324840764, "grad_norm": 0.7535349170677319, "learning_rate": 4.8809319986861626e-05, "loss": 0.1821, "step": 1085 }, { "epoch": 0.4940855323020928, "grad_norm": 1.1346390727588112, "learning_rate": 4.880713979402311e-05, "loss": 0.3807, "step": 1086 }, { "epoch": 0.494540491355778, "grad_norm": 1.3057353047788176, "learning_rate": 4.880495765577262e-05, "loss": 0.2798, "step": 1087 }, { "epoch": 0.49499545040946313, "grad_norm": 1.1095949134493601, "learning_rate": 4.880277357228851e-05, "loss": 0.1898, "step": 1088 }, { "epoch": 0.49545040946314833, "grad_norm": 0.7546580612699083, "learning_rate": 4.880058754374923e-05, "loss": 0.2065, "step": 1089 }, { "epoch": 0.4959053685168335, "grad_norm": 0.8254047235952874, "learning_rate": 4.8798399570333434e-05, "loss": 0.2327, "step": 1090 }, { "epoch": 0.4963603275705187, "grad_norm": 1.1432438891940184, "learning_rate": 4.879620965221988e-05, "loss": 0.1731, "step": 1091 }, { "epoch": 0.4968152866242038, "grad_norm": 0.5894364221949757, "learning_rate": 4.8794017789587546e-05, "loss": 0.1292, "step": 1092 }, { "epoch": 0.49727024567788897, "grad_norm": 0.8531002265847603, "learning_rate": 4.879182398261552e-05, "loss": 0.1957, "step": 1093 }, { "epoch": 0.49772520473157417, "grad_norm": 0.6269437994779095, "learning_rate": 4.878962823148309e-05, "loss": 0.1597, "step": 1094 }, { "epoch": 0.4981801637852593, "grad_norm": 1.1266526663207475, "learning_rate": 4.878743053636967e-05, "loss": 0.2748, "step": 1095 }, { "epoch": 0.4986351228389445, "grad_norm": 0.8601705509972103, "learning_rate": 4.878523089745485e-05, "loss": 0.3157, "step": 1096 }, { "epoch": 0.49909008189262966, "grad_norm": 0.809401935465149, "learning_rate": 4.878302931491837e-05, "loss": 0.2064, "step": 1097 }, { "epoch": 0.49954504094631486, "grad_norm": 0.9078472917832632, "learning_rate": 4.878082578894014e-05, "loss": 0.1775, "step": 1098 }, { "epoch": 0.5, "grad_norm": 0.6666128718139168, "learning_rate": 4.877862031970023e-05, "loss": 0.2127, "step": 1099 }, { "epoch": 0.5004549590536852, "grad_norm": 2.4075682387829276, "learning_rate": 4.877641290737884e-05, "loss": 0.286, "step": 1100 }, { "epoch": 0.5009099181073703, "grad_norm": 1.2331717369657271, "learning_rate": 4.877420355215637e-05, "loss": 0.2718, "step": 1101 }, { "epoch": 0.5013648771610555, "grad_norm": 1.0212922052271163, "learning_rate": 4.877199225421334e-05, "loss": 0.3205, "step": 1102 }, { "epoch": 0.5018198362147407, "grad_norm": 0.6302566901964723, "learning_rate": 4.8769779013730456e-05, "loss": 0.2357, "step": 1103 }, { "epoch": 0.5022747952684259, "grad_norm": 0.6506816852458047, "learning_rate": 4.8767563830888576e-05, "loss": 0.1823, "step": 1104 }, { "epoch": 0.502729754322111, "grad_norm": 0.9664508560388849, "learning_rate": 4.876534670586872e-05, "loss": 0.2195, "step": 1105 }, { "epoch": 0.5031847133757962, "grad_norm": 1.036865502480681, "learning_rate": 4.8763127638852044e-05, "loss": 0.2213, "step": 1106 }, { "epoch": 0.5036396724294814, "grad_norm": 0.7103391664873184, "learning_rate": 4.87609066300199e-05, "loss": 0.1921, "step": 1107 }, { "epoch": 0.5040946314831665, "grad_norm": 1.1090297068021537, "learning_rate": 4.875868367955376e-05, "loss": 0.256, "step": 1108 }, { "epoch": 0.5045495905368517, "grad_norm": 2.9237199241847964, "learning_rate": 4.8756458787635295e-05, "loss": 0.5057, "step": 1109 }, { "epoch": 0.5050045495905369, "grad_norm": 0.802774751017459, "learning_rate": 4.875423195444629e-05, "loss": 0.2441, "step": 1110 }, { "epoch": 0.5054595086442221, "grad_norm": 0.6834123653587366, "learning_rate": 4.8752003180168724e-05, "loss": 0.2209, "step": 1111 }, { "epoch": 0.5059144676979072, "grad_norm": 1.281849632392797, "learning_rate": 4.8749772464984736e-05, "loss": 0.2688, "step": 1112 }, { "epoch": 0.5063694267515924, "grad_norm": 0.8086573607910287, "learning_rate": 4.874753980907658e-05, "loss": 0.2549, "step": 1113 }, { "epoch": 0.5068243858052776, "grad_norm": 1.0077286222701938, "learning_rate": 4.874530521262671e-05, "loss": 0.2373, "step": 1114 }, { "epoch": 0.5072793448589626, "grad_norm": 0.7390026149681648, "learning_rate": 4.874306867581774e-05, "loss": 0.2309, "step": 1115 }, { "epoch": 0.5077343039126478, "grad_norm": 0.7425876094384148, "learning_rate": 4.874083019883242e-05, "loss": 0.1977, "step": 1116 }, { "epoch": 0.508189262966333, "grad_norm": 0.9190060408224374, "learning_rate": 4.873858978185367e-05, "loss": 0.2662, "step": 1117 }, { "epoch": 0.5086442220200182, "grad_norm": 1.0835347060698675, "learning_rate": 4.873634742506456e-05, "loss": 0.2817, "step": 1118 }, { "epoch": 0.5090991810737033, "grad_norm": 1.1654688141298462, "learning_rate": 4.873410312864833e-05, "loss": 0.2758, "step": 1119 }, { "epoch": 0.5095541401273885, "grad_norm": 1.3348004093737977, "learning_rate": 4.8731856892788386e-05, "loss": 0.3078, "step": 1120 }, { "epoch": 0.5100090991810737, "grad_norm": 0.9971204474312969, "learning_rate": 4.8729608717668265e-05, "loss": 0.2976, "step": 1121 }, { "epoch": 0.5104640582347588, "grad_norm": 0.6102224077041795, "learning_rate": 4.8727358603471675e-05, "loss": 0.2084, "step": 1122 }, { "epoch": 0.510919017288444, "grad_norm": 1.0902791919587975, "learning_rate": 4.8725106550382495e-05, "loss": 0.2414, "step": 1123 }, { "epoch": 0.5113739763421292, "grad_norm": 0.6081729698959278, "learning_rate": 4.872285255858475e-05, "loss": 0.2093, "step": 1124 }, { "epoch": 0.5118289353958144, "grad_norm": 1.175505482030395, "learning_rate": 4.8720596628262624e-05, "loss": 0.2603, "step": 1125 }, { "epoch": 0.5122838944494995, "grad_norm": 0.8171174966727413, "learning_rate": 4.871833875960046e-05, "loss": 0.2359, "step": 1126 }, { "epoch": 0.5127388535031847, "grad_norm": 0.8221921644789411, "learning_rate": 4.8716078952782774e-05, "loss": 0.2132, "step": 1127 }, { "epoch": 0.5131938125568699, "grad_norm": 0.79109033221081, "learning_rate": 4.8713817207994206e-05, "loss": 0.2056, "step": 1128 }, { "epoch": 0.513648771610555, "grad_norm": 1.1680158875176525, "learning_rate": 4.8711553525419587e-05, "loss": 0.3029, "step": 1129 }, { "epoch": 0.5141037306642402, "grad_norm": 0.6437891558768953, "learning_rate": 4.87092879052439e-05, "loss": 0.2218, "step": 1130 }, { "epoch": 0.5145586897179254, "grad_norm": 0.7945254452003422, "learning_rate": 4.8707020347652274e-05, "loss": 0.189, "step": 1131 }, { "epoch": 0.5150136487716106, "grad_norm": 0.8264107176660566, "learning_rate": 4.8704750852830005e-05, "loss": 0.2582, "step": 1132 }, { "epoch": 0.5154686078252957, "grad_norm": 0.7637171884969352, "learning_rate": 4.8702479420962547e-05, "loss": 0.1933, "step": 1133 }, { "epoch": 0.5159235668789809, "grad_norm": 0.6768026042670279, "learning_rate": 4.870020605223551e-05, "loss": 0.1804, "step": 1134 }, { "epoch": 0.5163785259326661, "grad_norm": 0.7751982177397607, "learning_rate": 4.869793074683466e-05, "loss": 0.2021, "step": 1135 }, { "epoch": 0.5168334849863512, "grad_norm": 0.9636529556156375, "learning_rate": 4.8695653504945925e-05, "loss": 0.2193, "step": 1136 }, { "epoch": 0.5172884440400364, "grad_norm": 0.717685113531743, "learning_rate": 4.86933743267554e-05, "loss": 0.1916, "step": 1137 }, { "epoch": 0.5177434030937216, "grad_norm": 0.6624842709145033, "learning_rate": 4.869109321244932e-05, "loss": 0.1568, "step": 1138 }, { "epoch": 0.5181983621474068, "grad_norm": 0.9186093029449445, "learning_rate": 4.86888101622141e-05, "loss": 0.2532, "step": 1139 }, { "epoch": 0.5186533212010919, "grad_norm": 0.8914765551826179, "learning_rate": 4.868652517623629e-05, "loss": 0.1924, "step": 1140 }, { "epoch": 0.5191082802547771, "grad_norm": 0.7364171126411557, "learning_rate": 4.86842382547026e-05, "loss": 0.1803, "step": 1141 }, { "epoch": 0.5195632393084623, "grad_norm": 1.1062341220462748, "learning_rate": 4.868194939779992e-05, "loss": 0.2137, "step": 1142 }, { "epoch": 0.5200181983621474, "grad_norm": 0.8235183867859717, "learning_rate": 4.8679658605715286e-05, "loss": 0.2101, "step": 1143 }, { "epoch": 0.5204731574158326, "grad_norm": 0.9539793894281974, "learning_rate": 4.867736587863588e-05, "loss": 0.2886, "step": 1144 }, { "epoch": 0.5209281164695178, "grad_norm": 1.0823221030765018, "learning_rate": 4.867507121674907e-05, "loss": 0.3213, "step": 1145 }, { "epoch": 0.521383075523203, "grad_norm": 0.9535277788599255, "learning_rate": 4.867277462024235e-05, "loss": 0.2352, "step": 1146 }, { "epoch": 0.521838034576888, "grad_norm": 0.9759441448371383, "learning_rate": 4.86704760893034e-05, "loss": 0.2831, "step": 1147 }, { "epoch": 0.5222929936305732, "grad_norm": 0.6743611636898612, "learning_rate": 4.8668175624120026e-05, "loss": 0.2699, "step": 1148 }, { "epoch": 0.5227479526842584, "grad_norm": 0.5962683590204526, "learning_rate": 4.866587322488023e-05, "loss": 0.2006, "step": 1149 }, { "epoch": 0.5232029117379435, "grad_norm": 1.0037423132730205, "learning_rate": 4.8663568891772155e-05, "loss": 0.2557, "step": 1150 }, { "epoch": 0.5236578707916287, "grad_norm": 0.7856298854445986, "learning_rate": 4.866126262498409e-05, "loss": 0.2229, "step": 1151 }, { "epoch": 0.5241128298453139, "grad_norm": 1.212526332576152, "learning_rate": 4.865895442470449e-05, "loss": 0.2758, "step": 1152 }, { "epoch": 0.5245677888989991, "grad_norm": 0.6876050788952495, "learning_rate": 4.865664429112199e-05, "loss": 0.1952, "step": 1153 }, { "epoch": 0.5250227479526842, "grad_norm": 0.8987951453450724, "learning_rate": 4.8654332224425344e-05, "loss": 0.2019, "step": 1154 }, { "epoch": 0.5254777070063694, "grad_norm": 1.3241048810965137, "learning_rate": 4.865201822480348e-05, "loss": 0.2904, "step": 1155 }, { "epoch": 0.5259326660600546, "grad_norm": 0.6436909403169875, "learning_rate": 4.864970229244552e-05, "loss": 0.1595, "step": 1156 }, { "epoch": 0.5263876251137397, "grad_norm": 0.8266340449396545, "learning_rate": 4.8647384427540674e-05, "loss": 0.1826, "step": 1157 }, { "epoch": 0.5268425841674249, "grad_norm": 0.8552825484803546, "learning_rate": 4.864506463027837e-05, "loss": 0.2297, "step": 1158 }, { "epoch": 0.5272975432211101, "grad_norm": 0.6821958967442053, "learning_rate": 4.864274290084816e-05, "loss": 0.2182, "step": 1159 }, { "epoch": 0.5277525022747953, "grad_norm": 1.1525813020735614, "learning_rate": 4.864041923943977e-05, "loss": 0.2528, "step": 1160 }, { "epoch": 0.5282074613284804, "grad_norm": 1.1429233763588498, "learning_rate": 4.863809364624309e-05, "loss": 0.2796, "step": 1161 }, { "epoch": 0.5286624203821656, "grad_norm": 0.7504930514544694, "learning_rate": 4.863576612144813e-05, "loss": 0.2173, "step": 1162 }, { "epoch": 0.5291173794358508, "grad_norm": 0.9684947796021939, "learning_rate": 4.8633436665245114e-05, "loss": 0.2679, "step": 1163 }, { "epoch": 0.5295723384895359, "grad_norm": 0.7162527701209747, "learning_rate": 4.863110527782437e-05, "loss": 0.216, "step": 1164 }, { "epoch": 0.5300272975432211, "grad_norm": 0.8995489126209955, "learning_rate": 4.862877195937643e-05, "loss": 0.2083, "step": 1165 }, { "epoch": 0.5304822565969063, "grad_norm": 0.8935874548388171, "learning_rate": 4.862643671009194e-05, "loss": 0.1943, "step": 1166 }, { "epoch": 0.5309372156505915, "grad_norm": 0.7134263417695643, "learning_rate": 4.862409953016175e-05, "loss": 0.2247, "step": 1167 }, { "epoch": 0.5313921747042766, "grad_norm": 0.8413770880977715, "learning_rate": 4.862176041977683e-05, "loss": 0.2498, "step": 1168 }, { "epoch": 0.5318471337579618, "grad_norm": 0.7844381717495773, "learning_rate": 4.861941937912832e-05, "loss": 0.1885, "step": 1169 }, { "epoch": 0.532302092811647, "grad_norm": 0.92165566091653, "learning_rate": 4.861707640840752e-05, "loss": 0.2166, "step": 1170 }, { "epoch": 0.5327570518653321, "grad_norm": 0.6862017506884803, "learning_rate": 4.8614731507805886e-05, "loss": 0.2226, "step": 1171 }, { "epoch": 0.5332120109190173, "grad_norm": 1.211198664012631, "learning_rate": 4.861238467751505e-05, "loss": 0.2577, "step": 1172 }, { "epoch": 0.5336669699727025, "grad_norm": 0.9586499713114898, "learning_rate": 4.8610035917726764e-05, "loss": 0.224, "step": 1173 }, { "epoch": 0.5341219290263877, "grad_norm": 0.8568827890058869, "learning_rate": 4.860768522863297e-05, "loss": 0.2142, "step": 1174 }, { "epoch": 0.5345768880800728, "grad_norm": 0.6608166515281452, "learning_rate": 4.860533261042573e-05, "loss": 0.1967, "step": 1175 }, { "epoch": 0.535031847133758, "grad_norm": 0.5829174365424206, "learning_rate": 4.860297806329733e-05, "loss": 0.2292, "step": 1176 }, { "epoch": 0.5354868061874432, "grad_norm": 1.148549584310484, "learning_rate": 4.860062158744015e-05, "loss": 0.2899, "step": 1177 }, { "epoch": 0.5359417652411284, "grad_norm": 0.8939121423398432, "learning_rate": 4.859826318304676e-05, "loss": 0.2962, "step": 1178 }, { "epoch": 0.5363967242948134, "grad_norm": 0.5477543802180151, "learning_rate": 4.859590285030986e-05, "loss": 0.1852, "step": 1179 }, { "epoch": 0.5368516833484986, "grad_norm": 0.5666927925731042, "learning_rate": 4.8593540589422334e-05, "loss": 0.1899, "step": 1180 }, { "epoch": 0.5373066424021838, "grad_norm": 0.6671247014715298, "learning_rate": 4.8591176400577227e-05, "loss": 0.2264, "step": 1181 }, { "epoch": 0.5377616014558689, "grad_norm": 0.9492228189627655, "learning_rate": 4.858881028396772e-05, "loss": 0.3462, "step": 1182 }, { "epoch": 0.5382165605095541, "grad_norm": 1.136112636255282, "learning_rate": 4.858644223978716e-05, "loss": 0.2315, "step": 1183 }, { "epoch": 0.5386715195632393, "grad_norm": 1.1571528689143449, "learning_rate": 4.858407226822906e-05, "loss": 0.3235, "step": 1184 }, { "epoch": 0.5391264786169245, "grad_norm": 0.8679500051886512, "learning_rate": 4.858170036948707e-05, "loss": 0.2267, "step": 1185 }, { "epoch": 0.5395814376706096, "grad_norm": 0.956184936343263, "learning_rate": 4.857932654375503e-05, "loss": 0.229, "step": 1186 }, { "epoch": 0.5400363967242948, "grad_norm": 1.0943158353362976, "learning_rate": 4.85769507912269e-05, "loss": 0.3795, "step": 1187 }, { "epoch": 0.54049135577798, "grad_norm": 0.7963106538080135, "learning_rate": 4.8574573112096835e-05, "loss": 0.2049, "step": 1188 }, { "epoch": 0.5409463148316651, "grad_norm": 1.0477330496692627, "learning_rate": 4.8572193506559106e-05, "loss": 0.1984, "step": 1189 }, { "epoch": 0.5414012738853503, "grad_norm": 0.7942472992054348, "learning_rate": 4.856981197480818e-05, "loss": 0.2416, "step": 1190 }, { "epoch": 0.5418562329390355, "grad_norm": 0.6446054485714459, "learning_rate": 4.856742851703866e-05, "loss": 0.2149, "step": 1191 }, { "epoch": 0.5423111919927207, "grad_norm": 0.9930229607349556, "learning_rate": 4.856504313344531e-05, "loss": 0.3591, "step": 1192 }, { "epoch": 0.5427661510464058, "grad_norm": 0.8783426488800363, "learning_rate": 4.856265582422305e-05, "loss": 0.2129, "step": 1193 }, { "epoch": 0.543221110100091, "grad_norm": 3.5879518885894277, "learning_rate": 4.856026658956697e-05, "loss": 0.2295, "step": 1194 }, { "epoch": 0.5436760691537762, "grad_norm": 0.51820912365723, "learning_rate": 4.855787542967229e-05, "loss": 0.1821, "step": 1195 }, { "epoch": 0.5441310282074613, "grad_norm": 0.7008464535620046, "learning_rate": 4.855548234473444e-05, "loss": 0.2349, "step": 1196 }, { "epoch": 0.5445859872611465, "grad_norm": 1.069591635815003, "learning_rate": 4.855308733494893e-05, "loss": 0.221, "step": 1197 }, { "epoch": 0.5450409463148317, "grad_norm": 0.8910233767830608, "learning_rate": 4.855069040051149e-05, "loss": 0.2343, "step": 1198 }, { "epoch": 0.5454959053685169, "grad_norm": 0.8494070810324226, "learning_rate": 4.8548291541617985e-05, "loss": 0.3149, "step": 1199 }, { "epoch": 0.545950864422202, "grad_norm": 0.7508673014683475, "learning_rate": 4.854589075846444e-05, "loss": 0.2538, "step": 1200 }, { "epoch": 0.5464058234758872, "grad_norm": 0.5712955200936813, "learning_rate": 4.854348805124703e-05, "loss": 0.2132, "step": 1201 }, { "epoch": 0.5468607825295724, "grad_norm": 0.741906293890281, "learning_rate": 4.85410834201621e-05, "loss": 0.1688, "step": 1202 }, { "epoch": 0.5473157415832575, "grad_norm": 1.0391338989294345, "learning_rate": 4.853867686540615e-05, "loss": 0.3389, "step": 1203 }, { "epoch": 0.5477707006369427, "grad_norm": 0.6766157043776165, "learning_rate": 4.853626838717582e-05, "loss": 0.1726, "step": 1204 }, { "epoch": 0.5482256596906279, "grad_norm": 1.5164841486966711, "learning_rate": 4.853385798566793e-05, "loss": 0.2112, "step": 1205 }, { "epoch": 0.5486806187443131, "grad_norm": 0.7127497483577858, "learning_rate": 4.8531445661079444e-05, "loss": 0.1695, "step": 1206 }, { "epoch": 0.5491355777979982, "grad_norm": 0.6478485895561551, "learning_rate": 4.852903141360748e-05, "loss": 0.1911, "step": 1207 }, { "epoch": 0.5495905368516834, "grad_norm": 0.672286299611825, "learning_rate": 4.852661524344933e-05, "loss": 0.1539, "step": 1208 }, { "epoch": 0.5500454959053686, "grad_norm": 0.9226787547626162, "learning_rate": 4.852419715080243e-05, "loss": 0.2704, "step": 1209 }, { "epoch": 0.5505004549590536, "grad_norm": 0.783649613098744, "learning_rate": 4.8521777135864367e-05, "loss": 0.2269, "step": 1210 }, { "epoch": 0.5509554140127388, "grad_norm": 0.7385104005566707, "learning_rate": 4.85193551988329e-05, "loss": 0.2217, "step": 1211 }, { "epoch": 0.551410373066424, "grad_norm": 0.8358163455046759, "learning_rate": 4.851693133990594e-05, "loss": 0.2246, "step": 1212 }, { "epoch": 0.5518653321201092, "grad_norm": 0.9639079144122397, "learning_rate": 4.8514505559281555e-05, "loss": 0.2156, "step": 1213 }, { "epoch": 0.5523202911737943, "grad_norm": 1.0669800324932461, "learning_rate": 4.851207785715797e-05, "loss": 0.3236, "step": 1214 }, { "epoch": 0.5527752502274795, "grad_norm": 0.9121607960596084, "learning_rate": 4.850964823373355e-05, "loss": 0.2833, "step": 1215 }, { "epoch": 0.5532302092811647, "grad_norm": 0.9384157343919924, "learning_rate": 4.850721668920685e-05, "loss": 0.2557, "step": 1216 }, { "epoch": 0.5536851683348498, "grad_norm": 0.774754312772182, "learning_rate": 4.850478322377656e-05, "loss": 0.2262, "step": 1217 }, { "epoch": 0.554140127388535, "grad_norm": 0.9224044463944866, "learning_rate": 4.850234783764154e-05, "loss": 0.3548, "step": 1218 }, { "epoch": 0.5545950864422202, "grad_norm": 0.7014398334959238, "learning_rate": 4.8499910531000774e-05, "loss": 0.2188, "step": 1219 }, { "epoch": 0.5550500454959054, "grad_norm": 1.1212880983692388, "learning_rate": 4.8497471304053456e-05, "loss": 0.2545, "step": 1220 }, { "epoch": 0.5555050045495905, "grad_norm": 0.8215862135090843, "learning_rate": 4.8495030156998896e-05, "loss": 0.2222, "step": 1221 }, { "epoch": 0.5559599636032757, "grad_norm": 0.9945440696414826, "learning_rate": 4.849258709003657e-05, "loss": 0.3308, "step": 1222 }, { "epoch": 0.5564149226569609, "grad_norm": 0.9453413490303284, "learning_rate": 4.849014210336612e-05, "loss": 0.2528, "step": 1223 }, { "epoch": 0.556869881710646, "grad_norm": 0.9486968915669729, "learning_rate": 4.848769519718734e-05, "loss": 0.3064, "step": 1224 }, { "epoch": 0.5573248407643312, "grad_norm": 1.269388299398648, "learning_rate": 4.8485246371700175e-05, "loss": 0.2779, "step": 1225 }, { "epoch": 0.5577797998180164, "grad_norm": 0.8611078002646142, "learning_rate": 4.848279562710474e-05, "loss": 0.2955, "step": 1226 }, { "epoch": 0.5582347588717016, "grad_norm": 0.7527674181356859, "learning_rate": 4.848034296360129e-05, "loss": 0.1842, "step": 1227 }, { "epoch": 0.5586897179253867, "grad_norm": 1.197203214806719, "learning_rate": 4.8477888381390245e-05, "loss": 0.2176, "step": 1228 }, { "epoch": 0.5591446769790719, "grad_norm": 0.7413317899699277, "learning_rate": 4.847543188067219e-05, "loss": 0.1998, "step": 1229 }, { "epoch": 0.5595996360327571, "grad_norm": 0.7771762779004044, "learning_rate": 4.847297346164786e-05, "loss": 0.2253, "step": 1230 }, { "epoch": 0.5600545950864422, "grad_norm": 0.7421797941153655, "learning_rate": 4.847051312451814e-05, "loss": 0.2305, "step": 1231 }, { "epoch": 0.5605095541401274, "grad_norm": 1.3689345326632218, "learning_rate": 4.846805086948407e-05, "loss": 0.3197, "step": 1232 }, { "epoch": 0.5609645131938126, "grad_norm": 0.8953883575762568, "learning_rate": 4.846558669674688e-05, "loss": 0.2223, "step": 1233 }, { "epoch": 0.5614194722474978, "grad_norm": 0.8628915160388261, "learning_rate": 4.846312060650791e-05, "loss": 0.2694, "step": 1234 }, { "epoch": 0.5618744313011829, "grad_norm": 1.1052026374572126, "learning_rate": 4.846065259896867e-05, "loss": 0.2893, "step": 1235 }, { "epoch": 0.5623293903548681, "grad_norm": 0.8938485215131268, "learning_rate": 4.845818267433085e-05, "loss": 0.2693, "step": 1236 }, { "epoch": 0.5627843494085533, "grad_norm": 1.0649867112707627, "learning_rate": 4.8455710832796294e-05, "loss": 0.2235, "step": 1237 }, { "epoch": 0.5632393084622384, "grad_norm": 0.5973333178793431, "learning_rate": 4.845323707456696e-05, "loss": 0.2219, "step": 1238 }, { "epoch": 0.5636942675159236, "grad_norm": 1.0909458196726627, "learning_rate": 4.8450761399845013e-05, "loss": 0.367, "step": 1239 }, { "epoch": 0.5641492265696088, "grad_norm": 0.9081091794300769, "learning_rate": 4.844828380883274e-05, "loss": 0.2263, "step": 1240 }, { "epoch": 0.564604185623294, "grad_norm": 0.8271776697377083, "learning_rate": 4.844580430173261e-05, "loss": 0.2255, "step": 1241 }, { "epoch": 0.565059144676979, "grad_norm": 0.8306906866771375, "learning_rate": 4.8443322878747234e-05, "loss": 0.1732, "step": 1242 }, { "epoch": 0.5655141037306642, "grad_norm": 0.6032071297368831, "learning_rate": 4.844083954007938e-05, "loss": 0.2516, "step": 1243 }, { "epoch": 0.5659690627843494, "grad_norm": 0.8070276094369395, "learning_rate": 4.843835428593198e-05, "loss": 0.3309, "step": 1244 }, { "epoch": 0.5664240218380345, "grad_norm": 1.019559095943495, "learning_rate": 4.8435867116508106e-05, "loss": 0.3057, "step": 1245 }, { "epoch": 0.5668789808917197, "grad_norm": 0.8116194341050816, "learning_rate": 4.843337803201101e-05, "loss": 0.2824, "step": 1246 }, { "epoch": 0.5673339399454049, "grad_norm": 0.8395355413142086, "learning_rate": 4.843088703264409e-05, "loss": 0.2577, "step": 1247 }, { "epoch": 0.5677888989990901, "grad_norm": 0.951197599467303, "learning_rate": 4.842839411861089e-05, "loss": 0.2167, "step": 1248 }, { "epoch": 0.5682438580527752, "grad_norm": 0.6547897938439432, "learning_rate": 4.842589929011513e-05, "loss": 0.1674, "step": 1249 }, { "epoch": 0.5686988171064604, "grad_norm": 0.7583148753324122, "learning_rate": 4.8423402547360666e-05, "loss": 0.2321, "step": 1250 }, { "epoch": 0.5691537761601456, "grad_norm": 1.1452655925451338, "learning_rate": 4.8420903890551526e-05, "loss": 0.2197, "step": 1251 }, { "epoch": 0.5696087352138307, "grad_norm": 0.8071672699493014, "learning_rate": 4.8418403319891885e-05, "loss": 0.2702, "step": 1252 }, { "epoch": 0.5700636942675159, "grad_norm": 0.9019789335504952, "learning_rate": 4.841590083558608e-05, "loss": 0.2538, "step": 1253 }, { "epoch": 0.5705186533212011, "grad_norm": 1.0426660776376306, "learning_rate": 4.841339643783861e-05, "loss": 0.2556, "step": 1254 }, { "epoch": 0.5709736123748863, "grad_norm": 0.7144053480741497, "learning_rate": 4.841089012685412e-05, "loss": 0.2311, "step": 1255 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7028285410789339, "learning_rate": 4.8408381902837406e-05, "loss": 0.2111, "step": 1256 }, { "epoch": 0.5718835304822566, "grad_norm": 0.9073609051825994, "learning_rate": 4.8405871765993433e-05, "loss": 0.1512, "step": 1257 }, { "epoch": 0.5723384895359418, "grad_norm": 0.8578411915304995, "learning_rate": 4.840335971652732e-05, "loss": 0.1651, "step": 1258 }, { "epoch": 0.5727934485896269, "grad_norm": 0.7253455510848343, "learning_rate": 4.840084575464434e-05, "loss": 0.2317, "step": 1259 }, { "epoch": 0.5732484076433121, "grad_norm": 0.7211245818199832, "learning_rate": 4.839832988054992e-05, "loss": 0.2245, "step": 1260 }, { "epoch": 0.5737033666969973, "grad_norm": 0.8289069089579228, "learning_rate": 4.839581209444966e-05, "loss": 0.1818, "step": 1261 }, { "epoch": 0.5741583257506825, "grad_norm": 1.0559323860908696, "learning_rate": 4.839329239654927e-05, "loss": 0.2967, "step": 1262 }, { "epoch": 0.5746132848043676, "grad_norm": 0.8934572869169254, "learning_rate": 4.839077078705468e-05, "loss": 0.1973, "step": 1263 }, { "epoch": 0.5750682438580528, "grad_norm": 0.7583555412924592, "learning_rate": 4.838824726617194e-05, "loss": 0.2128, "step": 1264 }, { "epoch": 0.575523202911738, "grad_norm": 0.7659379338325312, "learning_rate": 4.838572183410724e-05, "loss": 0.2524, "step": 1265 }, { "epoch": 0.5759781619654231, "grad_norm": 0.7066627254544321, "learning_rate": 4.838319449106697e-05, "loss": 0.2178, "step": 1266 }, { "epoch": 0.5764331210191083, "grad_norm": 0.6815780700497029, "learning_rate": 4.8380665237257635e-05, "loss": 0.1445, "step": 1267 }, { "epoch": 0.5768880800727935, "grad_norm": 1.0960340860040276, "learning_rate": 4.837813407288594e-05, "loss": 0.2086, "step": 1268 }, { "epoch": 0.5773430391264787, "grad_norm": 0.8858986395012974, "learning_rate": 4.837560099815869e-05, "loss": 0.2784, "step": 1269 }, { "epoch": 0.5777979981801638, "grad_norm": 0.8415530620031968, "learning_rate": 4.8373066013282886e-05, "loss": 0.2385, "step": 1270 }, { "epoch": 0.578252957233849, "grad_norm": 0.8439377397801023, "learning_rate": 4.837052911846569e-05, "loss": 0.2121, "step": 1271 }, { "epoch": 0.5787079162875342, "grad_norm": 0.7734422885393291, "learning_rate": 4.836799031391439e-05, "loss": 0.2363, "step": 1272 }, { "epoch": 0.5791628753412192, "grad_norm": 0.544667503730463, "learning_rate": 4.836544959983645e-05, "loss": 0.1672, "step": 1273 }, { "epoch": 0.5796178343949044, "grad_norm": 0.808958637351162, "learning_rate": 4.8362906976439484e-05, "loss": 0.2405, "step": 1274 }, { "epoch": 0.5800727934485896, "grad_norm": 0.8937704950892612, "learning_rate": 4.836036244393127e-05, "loss": 0.2303, "step": 1275 }, { "epoch": 0.5805277525022748, "grad_norm": 1.0332689969968218, "learning_rate": 4.835781600251973e-05, "loss": 0.3527, "step": 1276 }, { "epoch": 0.5809827115559599, "grad_norm": 1.735501490519224, "learning_rate": 4.835526765241295e-05, "loss": 0.2627, "step": 1277 }, { "epoch": 0.5814376706096451, "grad_norm": 0.7303608691968647, "learning_rate": 4.835271739381917e-05, "loss": 0.2206, "step": 1278 }, { "epoch": 0.5818926296633303, "grad_norm": 0.5718614380257576, "learning_rate": 4.835016522694678e-05, "loss": 0.1835, "step": 1279 }, { "epoch": 0.5823475887170154, "grad_norm": 1.0852076319833721, "learning_rate": 4.834761115200433e-05, "loss": 0.2947, "step": 1280 }, { "epoch": 0.5828025477707006, "grad_norm": 0.6989882368078278, "learning_rate": 4.8345055169200546e-05, "loss": 0.2363, "step": 1281 }, { "epoch": 0.5832575068243858, "grad_norm": 0.7940885453787274, "learning_rate": 4.8342497278744284e-05, "loss": 0.1827, "step": 1282 }, { "epoch": 0.583712465878071, "grad_norm": 0.8908511608134294, "learning_rate": 4.833993748084455e-05, "loss": 0.2312, "step": 1283 }, { "epoch": 0.5841674249317561, "grad_norm": 0.7927929773568422, "learning_rate": 4.833737577571052e-05, "loss": 0.2009, "step": 1284 }, { "epoch": 0.5846223839854413, "grad_norm": 1.1036106319724284, "learning_rate": 4.8334812163551535e-05, "loss": 0.2084, "step": 1285 }, { "epoch": 0.5850773430391265, "grad_norm": 1.3526568700950423, "learning_rate": 4.833224664457708e-05, "loss": 0.312, "step": 1286 }, { "epoch": 0.5855323020928116, "grad_norm": 1.306228642889318, "learning_rate": 4.8329679218996806e-05, "loss": 0.3054, "step": 1287 }, { "epoch": 0.5859872611464968, "grad_norm": 0.7892913603049707, "learning_rate": 4.83271098870205e-05, "loss": 0.1877, "step": 1288 }, { "epoch": 0.586442220200182, "grad_norm": 0.7397745135724605, "learning_rate": 4.832453864885811e-05, "loss": 0.1895, "step": 1289 }, { "epoch": 0.5868971792538672, "grad_norm": 0.8126291856766744, "learning_rate": 4.832196550471976e-05, "loss": 0.2387, "step": 1290 }, { "epoch": 0.5873521383075523, "grad_norm": 0.6733731148133821, "learning_rate": 4.8319390454815706e-05, "loss": 0.2191, "step": 1291 }, { "epoch": 0.5878070973612375, "grad_norm": 0.7677012920536053, "learning_rate": 4.8316813499356375e-05, "loss": 0.2206, "step": 1292 }, { "epoch": 0.5882620564149227, "grad_norm": 0.8340857958611263, "learning_rate": 4.831423463855235e-05, "loss": 0.2315, "step": 1293 }, { "epoch": 0.5887170154686078, "grad_norm": 1.1455489934001462, "learning_rate": 4.8311653872614343e-05, "loss": 0.1947, "step": 1294 }, { "epoch": 0.589171974522293, "grad_norm": 0.6675554797235319, "learning_rate": 4.830907120175326e-05, "loss": 0.1992, "step": 1295 }, { "epoch": 0.5896269335759782, "grad_norm": 0.9214500608599597, "learning_rate": 4.830648662618015e-05, "loss": 0.3201, "step": 1296 }, { "epoch": 0.5900818926296634, "grad_norm": 0.6255284298703153, "learning_rate": 4.8303900146106194e-05, "loss": 0.148, "step": 1297 }, { "epoch": 0.5905368516833485, "grad_norm": 0.7776259738808721, "learning_rate": 4.830131176174276e-05, "loss": 0.2017, "step": 1298 }, { "epoch": 0.5909918107370337, "grad_norm": 0.7309836046994429, "learning_rate": 4.829872147330136e-05, "loss": 0.2413, "step": 1299 }, { "epoch": 0.5914467697907189, "grad_norm": 6.812242635604768, "learning_rate": 4.829612928099366e-05, "loss": 0.6214, "step": 1300 }, { "epoch": 0.591901728844404, "grad_norm": 0.7011400557331887, "learning_rate": 4.829353518503147e-05, "loss": 0.2201, "step": 1301 }, { "epoch": 0.5923566878980892, "grad_norm": 0.9947437613880482, "learning_rate": 4.8290939185626785e-05, "loss": 0.215, "step": 1302 }, { "epoch": 0.5928116469517744, "grad_norm": 0.8798091346885517, "learning_rate": 4.828834128299173e-05, "loss": 0.2251, "step": 1303 }, { "epoch": 0.5932666060054596, "grad_norm": 0.743613351155558, "learning_rate": 4.828574147733859e-05, "loss": 0.2254, "step": 1304 }, { "epoch": 0.5937215650591446, "grad_norm": 0.7422496363190143, "learning_rate": 4.828313976887982e-05, "loss": 0.1828, "step": 1305 }, { "epoch": 0.5941765241128298, "grad_norm": 0.7708086547603374, "learning_rate": 4.8280536157828006e-05, "loss": 0.2261, "step": 1306 }, { "epoch": 0.594631483166515, "grad_norm": 0.9646069516473497, "learning_rate": 4.8277930644395916e-05, "loss": 0.235, "step": 1307 }, { "epoch": 0.5950864422202001, "grad_norm": 0.9756579323472088, "learning_rate": 4.827532322879645e-05, "loss": 0.2177, "step": 1308 }, { "epoch": 0.5955414012738853, "grad_norm": 0.8024846997095755, "learning_rate": 4.82727139112427e-05, "loss": 0.1699, "step": 1309 }, { "epoch": 0.5959963603275705, "grad_norm": 0.7977544406480572, "learning_rate": 4.827010269194785e-05, "loss": 0.1519, "step": 1310 }, { "epoch": 0.5964513193812557, "grad_norm": 1.0724664140777984, "learning_rate": 4.8267489571125295e-05, "loss": 0.2331, "step": 1311 }, { "epoch": 0.5969062784349408, "grad_norm": 1.1593955633960513, "learning_rate": 4.826487454898857e-05, "loss": 0.3408, "step": 1312 }, { "epoch": 0.597361237488626, "grad_norm": 0.7883603700883876, "learning_rate": 4.826225762575136e-05, "loss": 0.2349, "step": 1313 }, { "epoch": 0.5978161965423112, "grad_norm": 0.6888096976393598, "learning_rate": 4.8259638801627514e-05, "loss": 0.224, "step": 1314 }, { "epoch": 0.5982711555959963, "grad_norm": 0.6976622929000161, "learning_rate": 4.8257018076831016e-05, "loss": 0.1835, "step": 1315 }, { "epoch": 0.5987261146496815, "grad_norm": 0.8807437081075277, "learning_rate": 4.8254395451576035e-05, "loss": 0.2519, "step": 1316 }, { "epoch": 0.5991810737033667, "grad_norm": 0.9775841003281946, "learning_rate": 4.8251770926076865e-05, "loss": 0.3283, "step": 1317 }, { "epoch": 0.5996360327570519, "grad_norm": 0.6223902161588447, "learning_rate": 4.824914450054799e-05, "loss": 0.1608, "step": 1318 }, { "epoch": 0.600090991810737, "grad_norm": 0.7788129151510634, "learning_rate": 4.824651617520402e-05, "loss": 0.2236, "step": 1319 }, { "epoch": 0.6005459508644222, "grad_norm": 0.8549858575789846, "learning_rate": 4.8243885950259714e-05, "loss": 0.2349, "step": 1320 }, { "epoch": 0.6010009099181074, "grad_norm": 0.7913376230533556, "learning_rate": 4.824125382593003e-05, "loss": 0.2257, "step": 1321 }, { "epoch": 0.6014558689717925, "grad_norm": 0.8036296975975055, "learning_rate": 4.823861980243003e-05, "loss": 0.1859, "step": 1322 }, { "epoch": 0.6019108280254777, "grad_norm": 0.6888987422576993, "learning_rate": 4.823598387997497e-05, "loss": 0.1873, "step": 1323 }, { "epoch": 0.6023657870791629, "grad_norm": 0.913896144046218, "learning_rate": 4.8233346058780235e-05, "loss": 0.184, "step": 1324 }, { "epoch": 0.6028207461328481, "grad_norm": 0.8378680691351381, "learning_rate": 4.8230706339061395e-05, "loss": 0.2441, "step": 1325 }, { "epoch": 0.6032757051865332, "grad_norm": 1.1316255076413428, "learning_rate": 4.822806472103413e-05, "loss": 0.3192, "step": 1326 }, { "epoch": 0.6037306642402184, "grad_norm": 0.6414627108301011, "learning_rate": 4.822542120491431e-05, "loss": 0.2173, "step": 1327 }, { "epoch": 0.6041856232939036, "grad_norm": 0.9280712985252471, "learning_rate": 4.822277579091796e-05, "loss": 0.2618, "step": 1328 }, { "epoch": 0.6046405823475887, "grad_norm": 0.7686739315774844, "learning_rate": 4.822012847926125e-05, "loss": 0.2166, "step": 1329 }, { "epoch": 0.6050955414012739, "grad_norm": 0.7468453520301765, "learning_rate": 4.8217479270160494e-05, "loss": 0.185, "step": 1330 }, { "epoch": 0.6055505004549591, "grad_norm": 0.7411132253414962, "learning_rate": 4.821482816383218e-05, "loss": 0.2125, "step": 1331 }, { "epoch": 0.6060054595086443, "grad_norm": 0.8232672770167779, "learning_rate": 4.8212175160492954e-05, "loss": 0.2066, "step": 1332 }, { "epoch": 0.6064604185623294, "grad_norm": 0.6965342149709591, "learning_rate": 4.82095202603596e-05, "loss": 0.2097, "step": 1333 }, { "epoch": 0.6069153776160146, "grad_norm": 0.6648406661938879, "learning_rate": 4.820686346364906e-05, "loss": 0.1924, "step": 1334 }, { "epoch": 0.6073703366696998, "grad_norm": 0.8876492157414716, "learning_rate": 4.820420477057843e-05, "loss": 0.2513, "step": 1335 }, { "epoch": 0.607825295723385, "grad_norm": 0.8086867314661342, "learning_rate": 4.820154418136498e-05, "loss": 0.1967, "step": 1336 }, { "epoch": 0.60828025477707, "grad_norm": 0.753672554784404, "learning_rate": 4.819888169622612e-05, "loss": 0.2506, "step": 1337 }, { "epoch": 0.6087352138307552, "grad_norm": 1.0850047253782569, "learning_rate": 4.8196217315379414e-05, "loss": 0.312, "step": 1338 }, { "epoch": 0.6091901728844404, "grad_norm": 0.6896899702331832, "learning_rate": 4.8193551039042586e-05, "loss": 0.194, "step": 1339 }, { "epoch": 0.6096451319381255, "grad_norm": 0.7415761323967038, "learning_rate": 4.81908828674335e-05, "loss": 0.2283, "step": 1340 }, { "epoch": 0.6101000909918107, "grad_norm": 0.7054547741016202, "learning_rate": 4.8188212800770196e-05, "loss": 0.1918, "step": 1341 }, { "epoch": 0.6105550500454959, "grad_norm": 0.8255564083279657, "learning_rate": 4.818554083927086e-05, "loss": 0.2266, "step": 1342 }, { "epoch": 0.6110100090991811, "grad_norm": 0.6653877406803654, "learning_rate": 4.8182866983153835e-05, "loss": 0.1944, "step": 1343 }, { "epoch": 0.6114649681528662, "grad_norm": 0.734928903748162, "learning_rate": 4.818019123263761e-05, "loss": 0.2211, "step": 1344 }, { "epoch": 0.6119199272065514, "grad_norm": 0.743731622384506, "learning_rate": 4.8177513587940835e-05, "loss": 0.216, "step": 1345 }, { "epoch": 0.6123748862602366, "grad_norm": 0.8524277768897053, "learning_rate": 4.8174834049282326e-05, "loss": 0.2028, "step": 1346 }, { "epoch": 0.6128298453139217, "grad_norm": 0.8477360892509624, "learning_rate": 4.817215261688103e-05, "loss": 0.2312, "step": 1347 }, { "epoch": 0.6132848043676069, "grad_norm": 0.861460742836237, "learning_rate": 4.816946929095607e-05, "loss": 0.2104, "step": 1348 }, { "epoch": 0.6137397634212921, "grad_norm": 0.7885606447415434, "learning_rate": 4.816678407172671e-05, "loss": 0.2222, "step": 1349 }, { "epoch": 0.6141947224749773, "grad_norm": 0.7034342508904212, "learning_rate": 4.816409695941238e-05, "loss": 0.191, "step": 1350 }, { "epoch": 0.6146496815286624, "grad_norm": 1.0893195646540481, "learning_rate": 4.816140795423265e-05, "loss": 0.221, "step": 1351 }, { "epoch": 0.6151046405823476, "grad_norm": 0.8927433263164638, "learning_rate": 4.8158717056407255e-05, "loss": 0.2772, "step": 1352 }, { "epoch": 0.6155595996360328, "grad_norm": 0.9203828708075085, "learning_rate": 4.815602426615609e-05, "loss": 0.2543, "step": 1353 }, { "epoch": 0.6160145586897179, "grad_norm": 0.8598684142318657, "learning_rate": 4.815332958369919e-05, "loss": 0.2499, "step": 1354 }, { "epoch": 0.6164695177434031, "grad_norm": 0.9285763596320077, "learning_rate": 4.8150633009256765e-05, "loss": 0.2339, "step": 1355 }, { "epoch": 0.6169244767970883, "grad_norm": 0.7169931124995436, "learning_rate": 4.814793454304915e-05, "loss": 0.2541, "step": 1356 }, { "epoch": 0.6173794358507735, "grad_norm": 0.5628814139449309, "learning_rate": 4.814523418529686e-05, "loss": 0.1757, "step": 1357 }, { "epoch": 0.6178343949044586, "grad_norm": 0.7195343244094972, "learning_rate": 4.8142531936220556e-05, "loss": 0.1872, "step": 1358 }, { "epoch": 0.6182893539581438, "grad_norm": 0.6037841425521502, "learning_rate": 4.813982779604106e-05, "loss": 0.1554, "step": 1359 }, { "epoch": 0.618744313011829, "grad_norm": 0.886933639963298, "learning_rate": 4.813712176497933e-05, "loss": 0.1993, "step": 1360 }, { "epoch": 0.6191992720655141, "grad_norm": 0.699443168218115, "learning_rate": 4.813441384325649e-05, "loss": 0.1652, "step": 1361 }, { "epoch": 0.6196542311191993, "grad_norm": 0.8235501348163379, "learning_rate": 4.813170403109383e-05, "loss": 0.202, "step": 1362 }, { "epoch": 0.6201091901728845, "grad_norm": 0.8228440451825022, "learning_rate": 4.8128992328712776e-05, "loss": 0.1738, "step": 1363 }, { "epoch": 0.6205641492265697, "grad_norm": 0.6980337570817791, "learning_rate": 4.8126278736334916e-05, "loss": 0.1958, "step": 1364 }, { "epoch": 0.6210191082802548, "grad_norm": 0.7883325948822482, "learning_rate": 4.8123563254182004e-05, "loss": 0.2148, "step": 1365 }, { "epoch": 0.62147406733394, "grad_norm": 0.8402383471769966, "learning_rate": 4.812084588247592e-05, "loss": 0.2506, "step": 1366 }, { "epoch": 0.6219290263876252, "grad_norm": 1.0616043840984681, "learning_rate": 4.811812662143873e-05, "loss": 0.306, "step": 1367 }, { "epoch": 0.6223839854413102, "grad_norm": 0.7563771382538562, "learning_rate": 4.811540547129263e-05, "loss": 0.3048, "step": 1368 }, { "epoch": 0.6228389444949954, "grad_norm": 0.8999567978531593, "learning_rate": 4.811268243225998e-05, "loss": 0.2465, "step": 1369 }, { "epoch": 0.6232939035486806, "grad_norm": 1.0553686363921313, "learning_rate": 4.8109957504563304e-05, "loss": 0.2519, "step": 1370 }, { "epoch": 0.6237488626023658, "grad_norm": 0.9489801410609198, "learning_rate": 4.8107230688425257e-05, "loss": 0.2497, "step": 1371 }, { "epoch": 0.6242038216560509, "grad_norm": 0.8567509951796937, "learning_rate": 4.810450198406867e-05, "loss": 0.2847, "step": 1372 }, { "epoch": 0.6246587807097361, "grad_norm": 1.027322203159552, "learning_rate": 4.810177139171653e-05, "loss": 0.2728, "step": 1373 }, { "epoch": 0.6251137397634213, "grad_norm": 0.7914624094616435, "learning_rate": 4.809903891159195e-05, "loss": 0.1795, "step": 1374 }, { "epoch": 0.6255686988171064, "grad_norm": 1.0715704002833262, "learning_rate": 4.809630454391822e-05, "loss": 0.2251, "step": 1375 }, { "epoch": 0.6260236578707916, "grad_norm": 0.7971812356074832, "learning_rate": 4.8093568288918794e-05, "loss": 0.2538, "step": 1376 }, { "epoch": 0.6264786169244768, "grad_norm": 0.7996403225141987, "learning_rate": 4.809083014681726e-05, "loss": 0.3104, "step": 1377 }, { "epoch": 0.626933575978162, "grad_norm": 0.9319158148541462, "learning_rate": 4.808809011783735e-05, "loss": 0.2698, "step": 1378 }, { "epoch": 0.6273885350318471, "grad_norm": 2.8247009472929525, "learning_rate": 4.808534820220299e-05, "loss": 0.1976, "step": 1379 }, { "epoch": 0.6278434940855323, "grad_norm": 0.9665853683424284, "learning_rate": 4.8082604400138226e-05, "loss": 0.27, "step": 1380 }, { "epoch": 0.6282984531392175, "grad_norm": 0.6653840432135856, "learning_rate": 4.8079858711867264e-05, "loss": 0.1752, "step": 1381 }, { "epoch": 0.6287534121929026, "grad_norm": 0.7894745781821271, "learning_rate": 4.8077111137614486e-05, "loss": 0.1958, "step": 1382 }, { "epoch": 0.6292083712465878, "grad_norm": 1.0480178045543644, "learning_rate": 4.80743616776044e-05, "loss": 0.2757, "step": 1383 }, { "epoch": 0.629663330300273, "grad_norm": 0.8951374120103949, "learning_rate": 4.807161033206168e-05, "loss": 0.2328, "step": 1384 }, { "epoch": 0.6301182893539582, "grad_norm": 0.7857577735262563, "learning_rate": 4.8068857101211146e-05, "loss": 0.2229, "step": 1385 }, { "epoch": 0.6305732484076433, "grad_norm": 0.8741091288284604, "learning_rate": 4.806610198527779e-05, "loss": 0.2345, "step": 1386 }, { "epoch": 0.6310282074613285, "grad_norm": 0.9921539325460732, "learning_rate": 4.8063344984486755e-05, "loss": 0.2892, "step": 1387 }, { "epoch": 0.6314831665150137, "grad_norm": 1.1096204927875453, "learning_rate": 4.806058609906331e-05, "loss": 0.2371, "step": 1388 }, { "epoch": 0.6319381255686988, "grad_norm": 1.1516711499966765, "learning_rate": 4.805782532923291e-05, "loss": 0.3128, "step": 1389 }, { "epoch": 0.632393084622384, "grad_norm": 0.7294253399006744, "learning_rate": 4.805506267522116e-05, "loss": 0.2859, "step": 1390 }, { "epoch": 0.6328480436760692, "grad_norm": 1.4680742493862398, "learning_rate": 4.8052298137253803e-05, "loss": 0.3547, "step": 1391 }, { "epoch": 0.6333030027297544, "grad_norm": 0.709043766963109, "learning_rate": 4.8049531715556737e-05, "loss": 0.2101, "step": 1392 }, { "epoch": 0.6337579617834395, "grad_norm": 1.84890272845285, "learning_rate": 4.804676341035604e-05, "loss": 0.3467, "step": 1393 }, { "epoch": 0.6342129208371247, "grad_norm": 0.8125487017057067, "learning_rate": 4.804399322187791e-05, "loss": 0.2559, "step": 1394 }, { "epoch": 0.6346678798908099, "grad_norm": 0.9486732684203032, "learning_rate": 4.804122115034873e-05, "loss": 0.3164, "step": 1395 }, { "epoch": 0.635122838944495, "grad_norm": 0.7791291004150325, "learning_rate": 4.8038447195994994e-05, "loss": 0.2635, "step": 1396 }, { "epoch": 0.6355777979981801, "grad_norm": 1.323563522328408, "learning_rate": 4.80356713590434e-05, "loss": 0.2314, "step": 1397 }, { "epoch": 0.6360327570518653, "grad_norm": 0.6731745532360786, "learning_rate": 4.803289363972078e-05, "loss": 0.2416, "step": 1398 }, { "epoch": 0.6364877161055505, "grad_norm": 1.335210446845052, "learning_rate": 4.803011403825409e-05, "loss": 0.2176, "step": 1399 }, { "epoch": 0.6369426751592356, "grad_norm": 0.724595702369653, "learning_rate": 4.8027332554870495e-05, "loss": 0.2383, "step": 1400 }, { "epoch": 0.6373976342129208, "grad_norm": 0.9762291134076576, "learning_rate": 4.8024549189797276e-05, "loss": 0.2268, "step": 1401 }, { "epoch": 0.637852593266606, "grad_norm": 1.1786250179326672, "learning_rate": 4.8021763943261874e-05, "loss": 0.2996, "step": 1402 }, { "epoch": 0.6383075523202911, "grad_norm": 0.5955825071601878, "learning_rate": 4.801897681549188e-05, "loss": 0.1666, "step": 1403 }, { "epoch": 0.6387625113739763, "grad_norm": 0.8360706200169066, "learning_rate": 4.801618780671506e-05, "loss": 0.2184, "step": 1404 }, { "epoch": 0.6392174704276615, "grad_norm": 0.72301677266375, "learning_rate": 4.801339691715931e-05, "loss": 0.2104, "step": 1405 }, { "epoch": 0.6396724294813467, "grad_norm": 1.2405900402033225, "learning_rate": 4.8010604147052694e-05, "loss": 0.2409, "step": 1406 }, { "epoch": 0.6401273885350318, "grad_norm": 1.1441884149135042, "learning_rate": 4.800780949662342e-05, "loss": 0.2492, "step": 1407 }, { "epoch": 0.640582347588717, "grad_norm": 0.8087406981270053, "learning_rate": 4.800501296609986e-05, "loss": 0.1584, "step": 1408 }, { "epoch": 0.6410373066424022, "grad_norm": 0.7408364000353893, "learning_rate": 4.800221455571053e-05, "loss": 0.199, "step": 1409 }, { "epoch": 0.6414922656960873, "grad_norm": 0.7036425353728212, "learning_rate": 4.7999414265684106e-05, "loss": 0.2041, "step": 1410 }, { "epoch": 0.6419472247497725, "grad_norm": 0.863176111680568, "learning_rate": 4.7996612096249404e-05, "loss": 0.2123, "step": 1411 }, { "epoch": 0.6424021838034577, "grad_norm": 1.0046365889333297, "learning_rate": 4.799380804763542e-05, "loss": 0.2313, "step": 1412 }, { "epoch": 0.6428571428571429, "grad_norm": 0.7433880184766053, "learning_rate": 4.799100212007128e-05, "loss": 0.2143, "step": 1413 }, { "epoch": 0.643312101910828, "grad_norm": 0.8860978853171779, "learning_rate": 4.7988194313786275e-05, "loss": 0.2609, "step": 1414 }, { "epoch": 0.6437670609645132, "grad_norm": 0.7727449728978378, "learning_rate": 4.798538462900983e-05, "loss": 0.2208, "step": 1415 }, { "epoch": 0.6442220200181984, "grad_norm": 1.1262950203929436, "learning_rate": 4.798257306597157e-05, "loss": 0.2736, "step": 1416 }, { "epoch": 0.6446769790718835, "grad_norm": 0.8407421790947961, "learning_rate": 4.797975962490122e-05, "loss": 0.2118, "step": 1417 }, { "epoch": 0.6451319381255687, "grad_norm": 0.6107583672339236, "learning_rate": 4.797694430602869e-05, "loss": 0.1307, "step": 1418 }, { "epoch": 0.6455868971792539, "grad_norm": 1.0828157310873419, "learning_rate": 4.7974127109584046e-05, "loss": 0.2738, "step": 1419 }, { "epoch": 0.6460418562329391, "grad_norm": 0.9289474013426606, "learning_rate": 4.797130803579747e-05, "loss": 0.2216, "step": 1420 }, { "epoch": 0.6464968152866242, "grad_norm": 0.93350769490831, "learning_rate": 4.796848708489935e-05, "loss": 0.3121, "step": 1421 }, { "epoch": 0.6469517743403094, "grad_norm": 0.7545738716037698, "learning_rate": 4.796566425712018e-05, "loss": 0.2051, "step": 1422 }, { "epoch": 0.6474067333939946, "grad_norm": 0.812986944859845, "learning_rate": 4.796283955269064e-05, "loss": 0.2476, "step": 1423 }, { "epoch": 0.6478616924476797, "grad_norm": 0.7437284959557895, "learning_rate": 4.796001297184156e-05, "loss": 0.223, "step": 1424 }, { "epoch": 0.6483166515013649, "grad_norm": 0.9446368520141168, "learning_rate": 4.7957184514803896e-05, "loss": 0.3131, "step": 1425 }, { "epoch": 0.6487716105550501, "grad_norm": 0.47109488103668673, "learning_rate": 4.795435418180879e-05, "loss": 0.129, "step": 1426 }, { "epoch": 0.6492265696087353, "grad_norm": 0.7392654423961034, "learning_rate": 4.7951521973087524e-05, "loss": 0.2057, "step": 1427 }, { "epoch": 0.6496815286624203, "grad_norm": 0.784381003544401, "learning_rate": 4.794868788887154e-05, "loss": 0.1532, "step": 1428 }, { "epoch": 0.6501364877161055, "grad_norm": 0.5913731578310638, "learning_rate": 4.79458519293924e-05, "loss": 0.1917, "step": 1429 }, { "epoch": 0.6505914467697907, "grad_norm": 0.739309981176162, "learning_rate": 4.794301409488187e-05, "loss": 0.2004, "step": 1430 }, { "epoch": 0.6510464058234758, "grad_norm": 0.6672491539937117, "learning_rate": 4.794017438557183e-05, "loss": 0.2111, "step": 1431 }, { "epoch": 0.651501364877161, "grad_norm": 0.9579411655006415, "learning_rate": 4.793733280169435e-05, "loss": 0.2795, "step": 1432 }, { "epoch": 0.6519563239308462, "grad_norm": 1.1093659305219192, "learning_rate": 4.7934489343481614e-05, "loss": 0.2936, "step": 1433 }, { "epoch": 0.6524112829845314, "grad_norm": 0.9160777666511662, "learning_rate": 4.7931644011165975e-05, "loss": 0.2234, "step": 1434 }, { "epoch": 0.6528662420382165, "grad_norm": 0.8782753985284759, "learning_rate": 4.792879680497995e-05, "loss": 0.2229, "step": 1435 }, { "epoch": 0.6533212010919017, "grad_norm": 1.0671175047830812, "learning_rate": 4.792594772515619e-05, "loss": 0.2936, "step": 1436 }, { "epoch": 0.6537761601455869, "grad_norm": 1.1033079811377373, "learning_rate": 4.792309677192752e-05, "loss": 0.2952, "step": 1437 }, { "epoch": 0.654231119199272, "grad_norm": 0.8801188712868003, "learning_rate": 4.79202439455269e-05, "loss": 0.204, "step": 1438 }, { "epoch": 0.6546860782529572, "grad_norm": 0.6041392217309238, "learning_rate": 4.791738924618745e-05, "loss": 0.1675, "step": 1439 }, { "epoch": 0.6551410373066424, "grad_norm": 1.085143221901585, "learning_rate": 4.791453267414245e-05, "loss": 0.2421, "step": 1440 }, { "epoch": 0.6555959963603276, "grad_norm": 0.8396060883283153, "learning_rate": 4.791167422962531e-05, "loss": 0.2056, "step": 1441 }, { "epoch": 0.6560509554140127, "grad_norm": 0.8408283469581144, "learning_rate": 4.7908813912869633e-05, "loss": 0.2112, "step": 1442 }, { "epoch": 0.6565059144676979, "grad_norm": 0.93541212117855, "learning_rate": 4.790595172410913e-05, "loss": 0.225, "step": 1443 }, { "epoch": 0.6569608735213831, "grad_norm": 1.0564169981084905, "learning_rate": 4.79030876635777e-05, "loss": 0.252, "step": 1444 }, { "epoch": 0.6574158325750682, "grad_norm": 0.8023853711692919, "learning_rate": 4.790022173150937e-05, "loss": 0.2381, "step": 1445 }, { "epoch": 0.6578707916287534, "grad_norm": 0.6197467711979955, "learning_rate": 4.789735392813834e-05, "loss": 0.1982, "step": 1446 }, { "epoch": 0.6583257506824386, "grad_norm": 1.1953941235303345, "learning_rate": 4.7894484253698954e-05, "loss": 0.2787, "step": 1447 }, { "epoch": 0.6587807097361238, "grad_norm": 0.8620097300166194, "learning_rate": 4.78916127084257e-05, "loss": 0.1935, "step": 1448 }, { "epoch": 0.6592356687898089, "grad_norm": 0.8148614409672897, "learning_rate": 4.7888739292553235e-05, "loss": 0.2554, "step": 1449 }, { "epoch": 0.6596906278434941, "grad_norm": 0.7853988255073447, "learning_rate": 4.788586400631636e-05, "loss": 0.2831, "step": 1450 }, { "epoch": 0.6601455868971793, "grad_norm": 0.5234549519482684, "learning_rate": 4.788298684995003e-05, "loss": 0.1749, "step": 1451 }, { "epoch": 0.6606005459508644, "grad_norm": 0.7833249790115708, "learning_rate": 4.788010782368935e-05, "loss": 0.229, "step": 1452 }, { "epoch": 0.6610555050045496, "grad_norm": 0.9888451854357717, "learning_rate": 4.787722692776959e-05, "loss": 0.2874, "step": 1453 }, { "epoch": 0.6615104640582348, "grad_norm": 0.7087265736539009, "learning_rate": 4.7874344162426154e-05, "loss": 0.2421, "step": 1454 }, { "epoch": 0.66196542311192, "grad_norm": 0.8956138281046879, "learning_rate": 4.7871459527894614e-05, "loss": 0.2501, "step": 1455 }, { "epoch": 0.6624203821656051, "grad_norm": 0.9523892652185957, "learning_rate": 4.786857302441069e-05, "loss": 0.2251, "step": 1456 }, { "epoch": 0.6628753412192903, "grad_norm": 0.8711956950362809, "learning_rate": 4.786568465221025e-05, "loss": 0.1964, "step": 1457 }, { "epoch": 0.6633303002729755, "grad_norm": 0.9830567168594708, "learning_rate": 4.786279441152931e-05, "loss": 0.2788, "step": 1458 }, { "epoch": 0.6637852593266605, "grad_norm": 0.791196618747227, "learning_rate": 4.785990230260408e-05, "loss": 0.2415, "step": 1459 }, { "epoch": 0.6642402183803457, "grad_norm": 1.2474203549383065, "learning_rate": 4.785700832567085e-05, "loss": 0.3125, "step": 1460 }, { "epoch": 0.664695177434031, "grad_norm": 0.6521474040997438, "learning_rate": 4.785411248096613e-05, "loss": 0.2101, "step": 1461 }, { "epoch": 0.6651501364877161, "grad_norm": 0.9528843335872199, "learning_rate": 4.785121476872654e-05, "loss": 0.2595, "step": 1462 }, { "epoch": 0.6656050955414012, "grad_norm": 0.5993696328555543, "learning_rate": 4.784831518918888e-05, "loss": 0.2275, "step": 1463 }, { "epoch": 0.6660600545950864, "grad_norm": 0.8089530016649561, "learning_rate": 4.784541374259008e-05, "loss": 0.2102, "step": 1464 }, { "epoch": 0.6665150136487716, "grad_norm": 0.7370370077929486, "learning_rate": 4.784251042916724e-05, "loss": 0.2354, "step": 1465 }, { "epoch": 0.6669699727024567, "grad_norm": 0.9355440213182629, "learning_rate": 4.783960524915761e-05, "loss": 0.2202, "step": 1466 }, { "epoch": 0.6674249317561419, "grad_norm": 1.2225948937622102, "learning_rate": 4.7836698202798577e-05, "loss": 0.2635, "step": 1467 }, { "epoch": 0.6678798908098271, "grad_norm": 0.7534075937760788, "learning_rate": 4.783378929032769e-05, "loss": 0.2127, "step": 1468 }, { "epoch": 0.6683348498635123, "grad_norm": 0.9454834097715902, "learning_rate": 4.783087851198267e-05, "loss": 0.2356, "step": 1469 }, { "epoch": 0.6687898089171974, "grad_norm": 0.9130698018751382, "learning_rate": 4.782796586800136e-05, "loss": 0.2164, "step": 1470 }, { "epoch": 0.6692447679708826, "grad_norm": 0.6077242726361439, "learning_rate": 4.782505135862176e-05, "loss": 0.2385, "step": 1471 }, { "epoch": 0.6696997270245678, "grad_norm": 0.7380553114333529, "learning_rate": 4.782213498408205e-05, "loss": 0.2062, "step": 1472 }, { "epoch": 0.6701546860782529, "grad_norm": 0.7617426877765652, "learning_rate": 4.781921674462052e-05, "loss": 0.1775, "step": 1473 }, { "epoch": 0.6706096451319381, "grad_norm": 1.0331725532619822, "learning_rate": 4.781629664047566e-05, "loss": 0.2964, "step": 1474 }, { "epoch": 0.6710646041856233, "grad_norm": 1.1683484091751883, "learning_rate": 4.781337467188607e-05, "loss": 0.2226, "step": 1475 }, { "epoch": 0.6715195632393085, "grad_norm": 0.8044764641851986, "learning_rate": 4.781045083909053e-05, "loss": 0.2414, "step": 1476 }, { "epoch": 0.6719745222929936, "grad_norm": 1.0186928855151918, "learning_rate": 4.780752514232796e-05, "loss": 0.2473, "step": 1477 }, { "epoch": 0.6724294813466788, "grad_norm": 0.8069274257194631, "learning_rate": 4.7804597581837426e-05, "loss": 0.2062, "step": 1478 }, { "epoch": 0.672884440400364, "grad_norm": 0.7031662825315823, "learning_rate": 4.780166815785817e-05, "loss": 0.2072, "step": 1479 }, { "epoch": 0.6733393994540491, "grad_norm": 0.8558826502986352, "learning_rate": 4.7798736870629556e-05, "loss": 0.2307, "step": 1480 }, { "epoch": 0.6737943585077343, "grad_norm": 0.7108482040536147, "learning_rate": 4.779580372039113e-05, "loss": 0.2075, "step": 1481 }, { "epoch": 0.6742493175614195, "grad_norm": 0.826342256850621, "learning_rate": 4.7792868707382564e-05, "loss": 0.1898, "step": 1482 }, { "epoch": 0.6747042766151047, "grad_norm": 1.1132476297389478, "learning_rate": 4.77899318318437e-05, "loss": 0.2583, "step": 1483 }, { "epoch": 0.6751592356687898, "grad_norm": 0.9197836464347385, "learning_rate": 4.7786993094014534e-05, "loss": 0.2854, "step": 1484 }, { "epoch": 0.675614194722475, "grad_norm": 0.7752364654855942, "learning_rate": 4.778405249413519e-05, "loss": 0.2258, "step": 1485 }, { "epoch": 0.6760691537761602, "grad_norm": 0.6469829616097981, "learning_rate": 4.7781110032445966e-05, "loss": 0.1994, "step": 1486 }, { "epoch": 0.6765241128298453, "grad_norm": 0.5778627958342278, "learning_rate": 4.777816570918731e-05, "loss": 0.1869, "step": 1487 }, { "epoch": 0.6769790718835305, "grad_norm": 0.8755002174547356, "learning_rate": 4.7775219524599815e-05, "loss": 0.2589, "step": 1488 }, { "epoch": 0.6774340309372157, "grad_norm": 1.0328859567806488, "learning_rate": 4.777227147892424e-05, "loss": 0.2695, "step": 1489 }, { "epoch": 0.6778889899909009, "grad_norm": 0.8385994778943285, "learning_rate": 4.776932157240147e-05, "loss": 0.2113, "step": 1490 }, { "epoch": 0.678343949044586, "grad_norm": 0.9442057902313798, "learning_rate": 4.776636980527257e-05, "loss": 0.2028, "step": 1491 }, { "epoch": 0.6787989080982711, "grad_norm": 0.8560645177405655, "learning_rate": 4.776341617777874e-05, "loss": 0.2708, "step": 1492 }, { "epoch": 0.6792538671519563, "grad_norm": 0.6980467007990466, "learning_rate": 4.776046069016134e-05, "loss": 0.2223, "step": 1493 }, { "epoch": 0.6797088262056415, "grad_norm": 1.154469135277789, "learning_rate": 4.775750334266188e-05, "loss": 0.3002, "step": 1494 }, { "epoch": 0.6801637852593266, "grad_norm": 0.5109504514221442, "learning_rate": 4.775454413552202e-05, "loss": 0.1336, "step": 1495 }, { "epoch": 0.6806187443130118, "grad_norm": 0.8714535508428582, "learning_rate": 4.7751583068983575e-05, "loss": 0.1681, "step": 1496 }, { "epoch": 0.681073703366697, "grad_norm": 0.733324857125982, "learning_rate": 4.774862014328849e-05, "loss": 0.2155, "step": 1497 }, { "epoch": 0.6815286624203821, "grad_norm": 0.7006438434350538, "learning_rate": 4.7745655358678917e-05, "loss": 0.2228, "step": 1498 }, { "epoch": 0.6819836214740673, "grad_norm": 0.9117256849429939, "learning_rate": 4.77426887153971e-05, "loss": 0.2241, "step": 1499 }, { "epoch": 0.6824385805277525, "grad_norm": 0.9283494108146093, "learning_rate": 4.773972021368546e-05, "loss": 0.2755, "step": 1500 }, { "epoch": 0.6828935395814377, "grad_norm": 0.9536751565412336, "learning_rate": 4.7736749853786585e-05, "loss": 0.2848, "step": 1501 }, { "epoch": 0.6833484986351228, "grad_norm": 1.2547958133895527, "learning_rate": 4.7733777635943186e-05, "loss": 0.3136, "step": 1502 }, { "epoch": 0.683803457688808, "grad_norm": 0.9457638122070287, "learning_rate": 4.773080356039814e-05, "loss": 0.2568, "step": 1503 }, { "epoch": 0.6842584167424932, "grad_norm": 0.7463194458239247, "learning_rate": 4.7727827627394486e-05, "loss": 0.2432, "step": 1504 }, { "epoch": 0.6847133757961783, "grad_norm": 0.8570827984267938, "learning_rate": 4.772484983717539e-05, "loss": 0.2815, "step": 1505 }, { "epoch": 0.6851683348498635, "grad_norm": 1.014788670397018, "learning_rate": 4.77218701899842e-05, "loss": 0.299, "step": 1506 }, { "epoch": 0.6856232939035487, "grad_norm": 0.8925949057872121, "learning_rate": 4.771888868606438e-05, "loss": 0.2914, "step": 1507 }, { "epoch": 0.6860782529572339, "grad_norm": 0.9592518796348762, "learning_rate": 4.771590532565957e-05, "loss": 0.1948, "step": 1508 }, { "epoch": 0.686533212010919, "grad_norm": 1.0699122735892388, "learning_rate": 4.771292010901357e-05, "loss": 0.1663, "step": 1509 }, { "epoch": 0.6869881710646042, "grad_norm": 0.8958155470904243, "learning_rate": 4.77099330363703e-05, "loss": 0.2228, "step": 1510 }, { "epoch": 0.6874431301182894, "grad_norm": 0.5921274575089706, "learning_rate": 4.7706944107973874e-05, "loss": 0.1714, "step": 1511 }, { "epoch": 0.6878980891719745, "grad_norm": 0.8017588574860767, "learning_rate": 4.7703953324068504e-05, "loss": 0.2139, "step": 1512 }, { "epoch": 0.6883530482256597, "grad_norm": 0.7565199317679644, "learning_rate": 4.770096068489861e-05, "loss": 0.2362, "step": 1513 }, { "epoch": 0.6888080072793449, "grad_norm": 0.5841968032702605, "learning_rate": 4.769796619070872e-05, "loss": 0.2029, "step": 1514 }, { "epoch": 0.6892629663330301, "grad_norm": 0.6848103274008948, "learning_rate": 4.769496984174353e-05, "loss": 0.1563, "step": 1515 }, { "epoch": 0.6897179253867152, "grad_norm": 0.6758639223104627, "learning_rate": 4.769197163824791e-05, "loss": 0.1913, "step": 1516 }, { "epoch": 0.6901728844404004, "grad_norm": 0.9285535107347506, "learning_rate": 4.768897158046683e-05, "loss": 0.2776, "step": 1517 }, { "epoch": 0.6906278434940856, "grad_norm": 1.3422000059159473, "learning_rate": 4.7685969668645456e-05, "loss": 0.2279, "step": 1518 }, { "epoch": 0.6910828025477707, "grad_norm": 0.7879197682474613, "learning_rate": 4.76829659030291e-05, "loss": 0.282, "step": 1519 }, { "epoch": 0.6915377616014559, "grad_norm": 0.6527337724318661, "learning_rate": 4.76799602838632e-05, "loss": 0.2012, "step": 1520 }, { "epoch": 0.6919927206551411, "grad_norm": 1.2873248313977839, "learning_rate": 4.767695281139336e-05, "loss": 0.3201, "step": 1521 }, { "epoch": 0.6924476797088263, "grad_norm": 1.0755937169741605, "learning_rate": 4.767394348586535e-05, "loss": 0.2451, "step": 1522 }, { "epoch": 0.6929026387625113, "grad_norm": 0.9619852926606365, "learning_rate": 4.7670932307525075e-05, "loss": 0.2572, "step": 1523 }, { "epoch": 0.6933575978161965, "grad_norm": 0.7110241322244775, "learning_rate": 4.766791927661859e-05, "loss": 0.2032, "step": 1524 }, { "epoch": 0.6938125568698817, "grad_norm": 0.7472515907285855, "learning_rate": 4.7664904393392115e-05, "loss": 0.1782, "step": 1525 }, { "epoch": 0.6942675159235668, "grad_norm": 0.7106607886130152, "learning_rate": 4.7661887658092e-05, "loss": 0.226, "step": 1526 }, { "epoch": 0.694722474977252, "grad_norm": 1.1337976217386991, "learning_rate": 4.7658869070964766e-05, "loss": 0.31, "step": 1527 }, { "epoch": 0.6951774340309372, "grad_norm": 0.7981124653663301, "learning_rate": 4.765584863225708e-05, "loss": 0.1937, "step": 1528 }, { "epoch": 0.6956323930846224, "grad_norm": 0.940612162899207, "learning_rate": 4.765282634221576e-05, "loss": 0.2958, "step": 1529 }, { "epoch": 0.6960873521383075, "grad_norm": 0.7540080337990112, "learning_rate": 4.764980220108777e-05, "loss": 0.2583, "step": 1530 }, { "epoch": 0.6965423111919927, "grad_norm": 0.923792734466249, "learning_rate": 4.7646776209120224e-05, "loss": 0.2786, "step": 1531 }, { "epoch": 0.6969972702456779, "grad_norm": 0.7763317221100198, "learning_rate": 4.76437483665604e-05, "loss": 0.1954, "step": 1532 }, { "epoch": 0.697452229299363, "grad_norm": 2.3772743831769394, "learning_rate": 4.764071867365571e-05, "loss": 0.236, "step": 1533 }, { "epoch": 0.6979071883530482, "grad_norm": 0.7440195850860047, "learning_rate": 4.763768713065375e-05, "loss": 0.2176, "step": 1534 }, { "epoch": 0.6983621474067334, "grad_norm": 0.7378211539399473, "learning_rate": 4.7634653737802226e-05, "loss": 0.1579, "step": 1535 }, { "epoch": 0.6988171064604186, "grad_norm": 1.183179184331495, "learning_rate": 4.763161849534901e-05, "loss": 0.2704, "step": 1536 }, { "epoch": 0.6992720655141037, "grad_norm": 0.711596873917575, "learning_rate": 4.762858140354214e-05, "loss": 0.2155, "step": 1537 }, { "epoch": 0.6997270245677889, "grad_norm": 0.6677862093785281, "learning_rate": 4.762554246262978e-05, "loss": 0.2335, "step": 1538 }, { "epoch": 0.7001819836214741, "grad_norm": 0.7025745792991753, "learning_rate": 4.762250167286027e-05, "loss": 0.1528, "step": 1539 }, { "epoch": 0.7006369426751592, "grad_norm": 0.8102227383314293, "learning_rate": 4.761945903448208e-05, "loss": 0.2789, "step": 1540 }, { "epoch": 0.7010919017288444, "grad_norm": 0.5548793515834874, "learning_rate": 4.761641454774386e-05, "loss": 0.1546, "step": 1541 }, { "epoch": 0.7015468607825296, "grad_norm": 1.0360300357946346, "learning_rate": 4.7613368212894365e-05, "loss": 0.4014, "step": 1542 }, { "epoch": 0.7020018198362148, "grad_norm": 0.7484697899350387, "learning_rate": 4.7610320030182544e-05, "loss": 0.1967, "step": 1543 }, { "epoch": 0.7024567788898999, "grad_norm": 0.7188290635115757, "learning_rate": 4.760726999985748e-05, "loss": 0.2089, "step": 1544 }, { "epoch": 0.7029117379435851, "grad_norm": 0.6937697621389333, "learning_rate": 4.7604218122168406e-05, "loss": 0.1772, "step": 1545 }, { "epoch": 0.7033666969972703, "grad_norm": 0.8150112824314043, "learning_rate": 4.7601164397364707e-05, "loss": 0.3238, "step": 1546 }, { "epoch": 0.7038216560509554, "grad_norm": 0.7409939976166345, "learning_rate": 4.7598108825695906e-05, "loss": 0.2425, "step": 1547 }, { "epoch": 0.7042766151046406, "grad_norm": 0.7838940217432695, "learning_rate": 4.7595051407411715e-05, "loss": 0.1504, "step": 1548 }, { "epoch": 0.7047315741583258, "grad_norm": 1.211488496628736, "learning_rate": 4.759199214276197e-05, "loss": 0.3731, "step": 1549 }, { "epoch": 0.705186533212011, "grad_norm": 0.816575130356372, "learning_rate": 4.758893103199664e-05, "loss": 0.2299, "step": 1550 }, { "epoch": 0.7056414922656961, "grad_norm": 0.7037241551840089, "learning_rate": 4.758586807536588e-05, "loss": 0.2022, "step": 1551 }, { "epoch": 0.7060964513193813, "grad_norm": 0.7614330809480855, "learning_rate": 4.758280327311998e-05, "loss": 0.1652, "step": 1552 }, { "epoch": 0.7065514103730665, "grad_norm": 0.9101571284590226, "learning_rate": 4.7579736625509376e-05, "loss": 0.2717, "step": 1553 }, { "epoch": 0.7070063694267515, "grad_norm": 0.7580783014457948, "learning_rate": 4.757666813278466e-05, "loss": 0.2374, "step": 1554 }, { "epoch": 0.7074613284804367, "grad_norm": 0.809555929155453, "learning_rate": 4.757359779519659e-05, "loss": 0.3084, "step": 1555 }, { "epoch": 0.707916287534122, "grad_norm": 0.844907202704376, "learning_rate": 4.757052561299604e-05, "loss": 0.3078, "step": 1556 }, { "epoch": 0.7083712465878071, "grad_norm": 0.6631670267399067, "learning_rate": 4.7567451586434066e-05, "loss": 0.268, "step": 1557 }, { "epoch": 0.7088262056414922, "grad_norm": 0.7954404630303853, "learning_rate": 4.756437571576187e-05, "loss": 0.2976, "step": 1558 }, { "epoch": 0.7092811646951774, "grad_norm": 1.0754447469361523, "learning_rate": 4.756129800123078e-05, "loss": 0.1776, "step": 1559 }, { "epoch": 0.7097361237488626, "grad_norm": 0.6905355067243765, "learning_rate": 4.755821844309231e-05, "loss": 0.2287, "step": 1560 }, { "epoch": 0.7101910828025477, "grad_norm": 0.7306534123029486, "learning_rate": 4.7555137041598096e-05, "loss": 0.2062, "step": 1561 }, { "epoch": 0.7106460418562329, "grad_norm": 1.071862023263919, "learning_rate": 4.755205379699995e-05, "loss": 0.2798, "step": 1562 }, { "epoch": 0.7111010009099181, "grad_norm": 0.9530913691265211, "learning_rate": 4.7548968709549804e-05, "loss": 0.2647, "step": 1563 }, { "epoch": 0.7115559599636033, "grad_norm": 0.9436518901516885, "learning_rate": 4.7545881779499766e-05, "loss": 0.2021, "step": 1564 }, { "epoch": 0.7120109190172884, "grad_norm": 0.8623384683172712, "learning_rate": 4.754279300710209e-05, "loss": 0.2461, "step": 1565 }, { "epoch": 0.7124658780709736, "grad_norm": 0.5463598062300931, "learning_rate": 4.753970239260916e-05, "loss": 0.2109, "step": 1566 }, { "epoch": 0.7129208371246588, "grad_norm": 0.8238205218623695, "learning_rate": 4.753660993627356e-05, "loss": 0.1434, "step": 1567 }, { "epoch": 0.7133757961783439, "grad_norm": 0.6985202785856633, "learning_rate": 4.7533515638347945e-05, "loss": 0.227, "step": 1568 }, { "epoch": 0.7138307552320291, "grad_norm": 1.2191695417095496, "learning_rate": 4.753041949908521e-05, "loss": 0.2549, "step": 1569 }, { "epoch": 0.7142857142857143, "grad_norm": 0.9115213096198047, "learning_rate": 4.752732151873834e-05, "loss": 0.2839, "step": 1570 }, { "epoch": 0.7147406733393995, "grad_norm": 0.7291347284737063, "learning_rate": 4.752422169756048e-05, "loss": 0.2124, "step": 1571 }, { "epoch": 0.7151956323930846, "grad_norm": 0.8535336220507284, "learning_rate": 4.7521120035804946e-05, "loss": 0.2148, "step": 1572 }, { "epoch": 0.7156505914467698, "grad_norm": 0.9633959255079565, "learning_rate": 4.751801653372518e-05, "loss": 0.2371, "step": 1573 }, { "epoch": 0.716105550500455, "grad_norm": 0.9579580609387999, "learning_rate": 4.751491119157481e-05, "loss": 0.2434, "step": 1574 }, { "epoch": 0.7165605095541401, "grad_norm": 0.6905426118294363, "learning_rate": 4.7511804009607555e-05, "loss": 0.2313, "step": 1575 }, { "epoch": 0.7170154686078253, "grad_norm": 0.7338973043130227, "learning_rate": 4.750869498807735e-05, "loss": 0.2167, "step": 1576 }, { "epoch": 0.7174704276615105, "grad_norm": 1.08530862997925, "learning_rate": 4.750558412723823e-05, "loss": 0.2767, "step": 1577 }, { "epoch": 0.7179253867151957, "grad_norm": 0.8042058272523862, "learning_rate": 4.750247142734442e-05, "loss": 0.2226, "step": 1578 }, { "epoch": 0.7183803457688808, "grad_norm": 0.8953285518451788, "learning_rate": 4.749935688865026e-05, "loss": 0.2267, "step": 1579 }, { "epoch": 0.718835304822566, "grad_norm": 1.012137732170053, "learning_rate": 4.749624051141026e-05, "loss": 0.1532, "step": 1580 }, { "epoch": 0.7192902638762512, "grad_norm": 0.6323182777808519, "learning_rate": 4.749312229587908e-05, "loss": 0.1569, "step": 1581 }, { "epoch": 0.7197452229299363, "grad_norm": 1.031260098915168, "learning_rate": 4.749000224231153e-05, "loss": 0.2814, "step": 1582 }, { "epoch": 0.7202001819836215, "grad_norm": 0.9592959956185382, "learning_rate": 4.748688035096255e-05, "loss": 0.2788, "step": 1583 }, { "epoch": 0.7206551410373067, "grad_norm": 1.014425757197947, "learning_rate": 4.748375662208726e-05, "loss": 0.2151, "step": 1584 }, { "epoch": 0.7211101000909919, "grad_norm": 0.8688494448348922, "learning_rate": 4.7480631055940914e-05, "loss": 0.3037, "step": 1585 }, { "epoch": 0.721565059144677, "grad_norm": 0.8063718239714964, "learning_rate": 4.747750365277892e-05, "loss": 0.2314, "step": 1586 }, { "epoch": 0.7220200181983621, "grad_norm": 0.7766241568754442, "learning_rate": 4.747437441285684e-05, "loss": 0.2047, "step": 1587 }, { "epoch": 0.7224749772520473, "grad_norm": 0.8367252649640058, "learning_rate": 4.747124333643037e-05, "loss": 0.2593, "step": 1588 }, { "epoch": 0.7229299363057324, "grad_norm": 0.6985323096526038, "learning_rate": 4.746811042375538e-05, "loss": 0.177, "step": 1589 }, { "epoch": 0.7233848953594176, "grad_norm": 2.0071251432023876, "learning_rate": 4.7464975675087866e-05, "loss": 0.2418, "step": 1590 }, { "epoch": 0.7238398544131028, "grad_norm": 0.8837569896957833, "learning_rate": 4.7461839090684e-05, "loss": 0.2463, "step": 1591 }, { "epoch": 0.724294813466788, "grad_norm": 0.9273153949576584, "learning_rate": 4.745870067080007e-05, "loss": 0.2598, "step": 1592 }, { "epoch": 0.7247497725204731, "grad_norm": 1.5322758378823882, "learning_rate": 4.745556041569254e-05, "loss": 0.3212, "step": 1593 }, { "epoch": 0.7252047315741583, "grad_norm": 0.9890410708529028, "learning_rate": 4.7452418325618034e-05, "loss": 0.2109, "step": 1594 }, { "epoch": 0.7256596906278435, "grad_norm": 0.8442974141895655, "learning_rate": 4.7449274400833286e-05, "loss": 0.2365, "step": 1595 }, { "epoch": 0.7261146496815286, "grad_norm": 0.7835568760198466, "learning_rate": 4.744612864159522e-05, "loss": 0.2579, "step": 1596 }, { "epoch": 0.7265696087352138, "grad_norm": 0.8274169620772465, "learning_rate": 4.744298104816089e-05, "loss": 0.2279, "step": 1597 }, { "epoch": 0.727024567788899, "grad_norm": 1.3035047188930373, "learning_rate": 4.74398316207875e-05, "loss": 0.2468, "step": 1598 }, { "epoch": 0.7274795268425842, "grad_norm": 1.1550287203699308, "learning_rate": 4.74366803597324e-05, "loss": 0.4014, "step": 1599 }, { "epoch": 0.7279344858962693, "grad_norm": 0.6642116014233423, "learning_rate": 4.743352726525311e-05, "loss": 0.1574, "step": 1600 }, { "epoch": 0.7283894449499545, "grad_norm": 0.6394037284542411, "learning_rate": 4.743037233760728e-05, "loss": 0.2039, "step": 1601 }, { "epoch": 0.7288444040036397, "grad_norm": 0.834289057373217, "learning_rate": 4.742721557705271e-05, "loss": 0.2589, "step": 1602 }, { "epoch": 0.7292993630573248, "grad_norm": 0.8947888746659172, "learning_rate": 4.742405698384737e-05, "loss": 0.2568, "step": 1603 }, { "epoch": 0.72975432211101, "grad_norm": 0.8565443654350698, "learning_rate": 4.7420896558249364e-05, "loss": 0.188, "step": 1604 }, { "epoch": 0.7302092811646952, "grad_norm": 0.7935722289784024, "learning_rate": 4.7417734300516934e-05, "loss": 0.2067, "step": 1605 }, { "epoch": 0.7306642402183804, "grad_norm": 0.6861743889598044, "learning_rate": 4.7414570210908496e-05, "loss": 0.1704, "step": 1606 }, { "epoch": 0.7311191992720655, "grad_norm": 1.136992461834501, "learning_rate": 4.741140428968261e-05, "loss": 0.3155, "step": 1607 }, { "epoch": 0.7315741583257507, "grad_norm": 0.7998460175991096, "learning_rate": 4.740823653709797e-05, "loss": 0.1427, "step": 1608 }, { "epoch": 0.7320291173794359, "grad_norm": 2.2721722163557723, "learning_rate": 4.740506695341343e-05, "loss": 0.2167, "step": 1609 }, { "epoch": 0.732484076433121, "grad_norm": 0.8713711142583378, "learning_rate": 4.740189553888801e-05, "loss": 0.3301, "step": 1610 }, { "epoch": 0.7329390354868062, "grad_norm": 0.7579945673322844, "learning_rate": 4.739872229378085e-05, "loss": 0.1837, "step": 1611 }, { "epoch": 0.7333939945404914, "grad_norm": 0.7066761607511316, "learning_rate": 4.739554721835125e-05, "loss": 0.1858, "step": 1612 }, { "epoch": 0.7338489535941766, "grad_norm": 0.7237471307069353, "learning_rate": 4.739237031285867e-05, "loss": 0.2454, "step": 1613 }, { "epoch": 0.7343039126478617, "grad_norm": 0.8857264048308667, "learning_rate": 4.738919157756272e-05, "loss": 0.218, "step": 1614 }, { "epoch": 0.7347588717015469, "grad_norm": 0.7961058727334365, "learning_rate": 4.738601101272313e-05, "loss": 0.2424, "step": 1615 }, { "epoch": 0.7352138307552321, "grad_norm": 1.8432899047084668, "learning_rate": 4.738282861859982e-05, "loss": 0.3088, "step": 1616 }, { "epoch": 0.7356687898089171, "grad_norm": 0.6919961427565404, "learning_rate": 4.737964439545284e-05, "loss": 0.2014, "step": 1617 }, { "epoch": 0.7361237488626023, "grad_norm": 0.9163182435824033, "learning_rate": 4.737645834354238e-05, "loss": 0.2623, "step": 1618 }, { "epoch": 0.7365787079162875, "grad_norm": 0.6542636327876111, "learning_rate": 4.73732704631288e-05, "loss": 0.1609, "step": 1619 }, { "epoch": 0.7370336669699727, "grad_norm": 0.7579772480323497, "learning_rate": 4.737008075447259e-05, "loss": 0.1631, "step": 1620 }, { "epoch": 0.7374886260236578, "grad_norm": 0.8547055558606551, "learning_rate": 4.7366889217834396e-05, "loss": 0.2373, "step": 1621 }, { "epoch": 0.737943585077343, "grad_norm": 0.9635216296685644, "learning_rate": 4.7363695853475034e-05, "loss": 0.2517, "step": 1622 }, { "epoch": 0.7383985441310282, "grad_norm": 1.105971799609247, "learning_rate": 4.736050066165544e-05, "loss": 0.2496, "step": 1623 }, { "epoch": 0.7388535031847133, "grad_norm": 1.0501633540671214, "learning_rate": 4.735730364263671e-05, "loss": 0.3261, "step": 1624 }, { "epoch": 0.7393084622383985, "grad_norm": 0.7374660413621993, "learning_rate": 4.735410479668009e-05, "loss": 0.1924, "step": 1625 }, { "epoch": 0.7397634212920837, "grad_norm": 0.9791015452806593, "learning_rate": 4.735090412404697e-05, "loss": 0.2714, "step": 1626 }, { "epoch": 0.7402183803457689, "grad_norm": 0.9092759243894789, "learning_rate": 4.734770162499891e-05, "loss": 0.1822, "step": 1627 }, { "epoch": 0.740673339399454, "grad_norm": 0.8469345439556404, "learning_rate": 4.7344497299797593e-05, "loss": 0.239, "step": 1628 }, { "epoch": 0.7411282984531392, "grad_norm": 0.7760022826400782, "learning_rate": 4.7341291148704856e-05, "loss": 0.2317, "step": 1629 }, { "epoch": 0.7415832575068244, "grad_norm": 1.866309610803659, "learning_rate": 4.733808317198271e-05, "loss": 0.2698, "step": 1630 }, { "epoch": 0.7420382165605095, "grad_norm": 0.8999325333475687, "learning_rate": 4.733487336989328e-05, "loss": 0.2129, "step": 1631 }, { "epoch": 0.7424931756141947, "grad_norm": 0.9800878807203892, "learning_rate": 4.7331661742698864e-05, "loss": 0.2366, "step": 1632 }, { "epoch": 0.7429481346678799, "grad_norm": 0.9634648037605466, "learning_rate": 4.7328448290661894e-05, "loss": 0.2976, "step": 1633 }, { "epoch": 0.7434030937215651, "grad_norm": 0.9768121627390086, "learning_rate": 4.732523301404497e-05, "loss": 0.2751, "step": 1634 }, { "epoch": 0.7438580527752502, "grad_norm": 0.9318509516932899, "learning_rate": 4.732201591311082e-05, "loss": 0.2817, "step": 1635 }, { "epoch": 0.7443130118289354, "grad_norm": 0.6554077397224386, "learning_rate": 4.7318796988122325e-05, "loss": 0.2203, "step": 1636 }, { "epoch": 0.7447679708826206, "grad_norm": 0.7265763165359881, "learning_rate": 4.731557623934255e-05, "loss": 0.1985, "step": 1637 }, { "epoch": 0.7452229299363057, "grad_norm": 1.2971439910004419, "learning_rate": 4.731235366703465e-05, "loss": 0.3797, "step": 1638 }, { "epoch": 0.7456778889899909, "grad_norm": 1.1431313312936877, "learning_rate": 4.7309129271461974e-05, "loss": 0.2988, "step": 1639 }, { "epoch": 0.7461328480436761, "grad_norm": 0.9365747688761388, "learning_rate": 4.7305903052888e-05, "loss": 0.2238, "step": 1640 }, { "epoch": 0.7465878070973613, "grad_norm": 1.0149904881792813, "learning_rate": 4.730267501157636e-05, "loss": 0.2689, "step": 1641 }, { "epoch": 0.7470427661510464, "grad_norm": 0.7986117309008063, "learning_rate": 4.729944514779083e-05, "loss": 0.225, "step": 1642 }, { "epoch": 0.7474977252047316, "grad_norm": 0.58849312747805, "learning_rate": 4.7296213461795355e-05, "loss": 0.1917, "step": 1643 }, { "epoch": 0.7479526842584168, "grad_norm": 0.7294089537223724, "learning_rate": 4.7292979953854e-05, "loss": 0.1662, "step": 1644 }, { "epoch": 0.7484076433121019, "grad_norm": 0.8089739077956907, "learning_rate": 4.7289744624231e-05, "loss": 0.2325, "step": 1645 }, { "epoch": 0.7488626023657871, "grad_norm": 0.8469271769641282, "learning_rate": 4.7286507473190736e-05, "loss": 0.2475, "step": 1646 }, { "epoch": 0.7493175614194723, "grad_norm": 0.8992398904335263, "learning_rate": 4.7283268500997716e-05, "loss": 0.2587, "step": 1647 }, { "epoch": 0.7497725204731575, "grad_norm": 0.8857828348524657, "learning_rate": 4.728002770791663e-05, "loss": 0.2223, "step": 1648 }, { "epoch": 0.7502274795268425, "grad_norm": 0.8226788547341873, "learning_rate": 4.727678509421229e-05, "loss": 0.2288, "step": 1649 }, { "epoch": 0.7506824385805277, "grad_norm": 0.6620083715283182, "learning_rate": 4.727354066014967e-05, "loss": 0.1801, "step": 1650 }, { "epoch": 0.7511373976342129, "grad_norm": 0.618937637720396, "learning_rate": 4.72702944059939e-05, "loss": 0.179, "step": 1651 }, { "epoch": 0.7515923566878981, "grad_norm": 0.8967583843223165, "learning_rate": 4.726704633201024e-05, "loss": 0.1999, "step": 1652 }, { "epoch": 0.7520473157415832, "grad_norm": 0.7366134919304225, "learning_rate": 4.726379643846412e-05, "loss": 0.2368, "step": 1653 }, { "epoch": 0.7525022747952684, "grad_norm": 0.9829037802585676, "learning_rate": 4.7260544725621094e-05, "loss": 0.2747, "step": 1654 }, { "epoch": 0.7529572338489536, "grad_norm": 0.7920560807409885, "learning_rate": 4.7257291193746875e-05, "loss": 0.1994, "step": 1655 }, { "epoch": 0.7534121929026387, "grad_norm": 0.7848067584096017, "learning_rate": 4.725403584310734e-05, "loss": 0.1418, "step": 1656 }, { "epoch": 0.7538671519563239, "grad_norm": 0.89865186655351, "learning_rate": 4.725077867396848e-05, "loss": 0.2224, "step": 1657 }, { "epoch": 0.7543221110100091, "grad_norm": 0.7846803793814601, "learning_rate": 4.724751968659649e-05, "loss": 0.2184, "step": 1658 }, { "epoch": 0.7547770700636943, "grad_norm": 1.1352368621937672, "learning_rate": 4.724425888125764e-05, "loss": 0.3331, "step": 1659 }, { "epoch": 0.7552320291173794, "grad_norm": 0.9396652457648224, "learning_rate": 4.724099625821842e-05, "loss": 0.2739, "step": 1660 }, { "epoch": 0.7556869881710646, "grad_norm": 0.7948173232499403, "learning_rate": 4.723773181774542e-05, "loss": 0.2079, "step": 1661 }, { "epoch": 0.7561419472247498, "grad_norm": 0.9222260699456961, "learning_rate": 4.723446556010541e-05, "loss": 0.2524, "step": 1662 }, { "epoch": 0.7565969062784349, "grad_norm": 0.6413934787842581, "learning_rate": 4.723119748556528e-05, "loss": 0.2091, "step": 1663 }, { "epoch": 0.7570518653321201, "grad_norm": 0.79467233379209, "learning_rate": 4.7227927594392085e-05, "loss": 0.1834, "step": 1664 }, { "epoch": 0.7575068243858053, "grad_norm": 0.994306059877791, "learning_rate": 4.722465588685302e-05, "loss": 0.2954, "step": 1665 }, { "epoch": 0.7579617834394905, "grad_norm": 0.7497774201096754, "learning_rate": 4.722138236321545e-05, "loss": 0.1595, "step": 1666 }, { "epoch": 0.7584167424931756, "grad_norm": 1.0513587832849938, "learning_rate": 4.721810702374687e-05, "loss": 0.2128, "step": 1667 }, { "epoch": 0.7588717015468608, "grad_norm": 0.8072068992543614, "learning_rate": 4.721482986871491e-05, "loss": 0.2007, "step": 1668 }, { "epoch": 0.759326660600546, "grad_norm": 1.3362074147529164, "learning_rate": 4.721155089838738e-05, "loss": 0.2896, "step": 1669 }, { "epoch": 0.7597816196542311, "grad_norm": 0.9430948039874072, "learning_rate": 4.7208270113032215e-05, "loss": 0.211, "step": 1670 }, { "epoch": 0.7602365787079163, "grad_norm": 0.8540660438225566, "learning_rate": 4.720498751291751e-05, "loss": 0.2264, "step": 1671 }, { "epoch": 0.7606915377616015, "grad_norm": 0.8341243884283956, "learning_rate": 4.72017030983115e-05, "loss": 0.2809, "step": 1672 }, { "epoch": 0.7611464968152867, "grad_norm": 0.8132587599868146, "learning_rate": 4.7198416869482575e-05, "loss": 0.2294, "step": 1673 }, { "epoch": 0.7616014558689718, "grad_norm": 0.9763322409516897, "learning_rate": 4.7195128826699266e-05, "loss": 0.2499, "step": 1674 }, { "epoch": 0.762056414922657, "grad_norm": 0.8918782644922225, "learning_rate": 4.7191838970230266e-05, "loss": 0.2345, "step": 1675 }, { "epoch": 0.7625113739763422, "grad_norm": 1.0250092577523153, "learning_rate": 4.7188547300344404e-05, "loss": 0.2796, "step": 1676 }, { "epoch": 0.7629663330300273, "grad_norm": 0.9285035166534873, "learning_rate": 4.718525381731066e-05, "loss": 0.216, "step": 1677 }, { "epoch": 0.7634212920837125, "grad_norm": 0.7238172521541588, "learning_rate": 4.718195852139816e-05, "loss": 0.2494, "step": 1678 }, { "epoch": 0.7638762511373977, "grad_norm": 1.2145002915744996, "learning_rate": 4.717866141287618e-05, "loss": 0.385, "step": 1679 }, { "epoch": 0.7643312101910829, "grad_norm": 1.2831904461077772, "learning_rate": 4.717536249201415e-05, "loss": 0.2755, "step": 1680 }, { "epoch": 0.7647861692447679, "grad_norm": 0.7831485186938991, "learning_rate": 4.717206175908164e-05, "loss": 0.2067, "step": 1681 }, { "epoch": 0.7652411282984531, "grad_norm": 0.8696257225256196, "learning_rate": 4.716875921434837e-05, "loss": 0.2119, "step": 1682 }, { "epoch": 0.7656960873521383, "grad_norm": 0.8290169406888904, "learning_rate": 4.7165454858084215e-05, "loss": 0.2005, "step": 1683 }, { "epoch": 0.7661510464058234, "grad_norm": 1.0685296701874785, "learning_rate": 4.716214869055918e-05, "loss": 0.2514, "step": 1684 }, { "epoch": 0.7666060054595086, "grad_norm": 0.6795258401743371, "learning_rate": 4.715884071204344e-05, "loss": 0.1532, "step": 1685 }, { "epoch": 0.7670609645131938, "grad_norm": 0.8355506325741213, "learning_rate": 4.715553092280731e-05, "loss": 0.2768, "step": 1686 }, { "epoch": 0.767515923566879, "grad_norm": 0.8955321360946634, "learning_rate": 4.715221932312124e-05, "loss": 0.2418, "step": 1687 }, { "epoch": 0.7679708826205641, "grad_norm": 1.1054245421643807, "learning_rate": 4.714890591325585e-05, "loss": 0.2982, "step": 1688 }, { "epoch": 0.7684258416742493, "grad_norm": 0.8252874713237395, "learning_rate": 4.714559069348189e-05, "loss": 0.2232, "step": 1689 }, { "epoch": 0.7688808007279345, "grad_norm": 0.99760910475362, "learning_rate": 4.714227366407027e-05, "loss": 0.2593, "step": 1690 }, { "epoch": 0.7693357597816196, "grad_norm": 0.9649299662025591, "learning_rate": 4.7138954825292035e-05, "loss": 0.2197, "step": 1691 }, { "epoch": 0.7697907188353048, "grad_norm": 0.8869453896124802, "learning_rate": 4.71356341774184e-05, "loss": 0.2663, "step": 1692 }, { "epoch": 0.77024567788899, "grad_norm": 0.868497977498269, "learning_rate": 4.713231172072069e-05, "loss": 0.2432, "step": 1693 }, { "epoch": 0.7707006369426752, "grad_norm": 0.8478480737309461, "learning_rate": 4.7128987455470426e-05, "loss": 0.2586, "step": 1694 }, { "epoch": 0.7711555959963603, "grad_norm": 1.0864571871567013, "learning_rate": 4.712566138193923e-05, "loss": 0.3259, "step": 1695 }, { "epoch": 0.7716105550500455, "grad_norm": 0.9311794636572722, "learning_rate": 4.712233350039892e-05, "loss": 0.2178, "step": 1696 }, { "epoch": 0.7720655141037307, "grad_norm": 0.7041623841936461, "learning_rate": 4.711900381112141e-05, "loss": 0.1928, "step": 1697 }, { "epoch": 0.7725204731574158, "grad_norm": 0.8735613748755046, "learning_rate": 4.7115672314378796e-05, "loss": 0.2201, "step": 1698 }, { "epoch": 0.772975432211101, "grad_norm": 0.804743315813295, "learning_rate": 4.711233901044332e-05, "loss": 0.2506, "step": 1699 }, { "epoch": 0.7734303912647862, "grad_norm": 0.8034807788717324, "learning_rate": 4.710900389958735e-05, "loss": 0.2116, "step": 1700 }, { "epoch": 0.7738853503184714, "grad_norm": 0.9483505337669819, "learning_rate": 4.710566698208343e-05, "loss": 0.3308, "step": 1701 }, { "epoch": 0.7743403093721565, "grad_norm": 0.7138702619208577, "learning_rate": 4.710232825820423e-05, "loss": 0.192, "step": 1702 }, { "epoch": 0.7747952684258417, "grad_norm": 0.6090333039531137, "learning_rate": 4.709898772822258e-05, "loss": 0.1467, "step": 1703 }, { "epoch": 0.7752502274795269, "grad_norm": 0.8125331688901821, "learning_rate": 4.7095645392411445e-05, "loss": 0.17, "step": 1704 }, { "epoch": 0.775705186533212, "grad_norm": 0.7182009775147873, "learning_rate": 4.7092301251043956e-05, "loss": 0.1868, "step": 1705 }, { "epoch": 0.7761601455868972, "grad_norm": 0.6444548966536022, "learning_rate": 4.7088955304393385e-05, "loss": 0.1732, "step": 1706 }, { "epoch": 0.7766151046405824, "grad_norm": 1.1501997551344432, "learning_rate": 4.708560755273313e-05, "loss": 0.1945, "step": 1707 }, { "epoch": 0.7770700636942676, "grad_norm": 2.719927352048504, "learning_rate": 4.7082257996336765e-05, "loss": 0.421, "step": 1708 }, { "epoch": 0.7775250227479527, "grad_norm": 0.6422815837858196, "learning_rate": 4.7078906635478e-05, "loss": 0.2567, "step": 1709 }, { "epoch": 0.7779799818016379, "grad_norm": 0.6199289553909232, "learning_rate": 4.707555347043069e-05, "loss": 0.2109, "step": 1710 }, { "epoch": 0.778434940855323, "grad_norm": 1.0667772877068755, "learning_rate": 4.707219850146884e-05, "loss": 0.2592, "step": 1711 }, { "epoch": 0.7788898999090081, "grad_norm": 0.8704529537892721, "learning_rate": 4.706884172886662e-05, "loss": 0.2179, "step": 1712 }, { "epoch": 0.7793448589626933, "grad_norm": 0.6035245621606724, "learning_rate": 4.70654831528983e-05, "loss": 0.201, "step": 1713 }, { "epoch": 0.7797998180163785, "grad_norm": 0.8312592358518076, "learning_rate": 4.7062122773838355e-05, "loss": 0.1982, "step": 1714 }, { "epoch": 0.7802547770700637, "grad_norm": 0.7631774531017866, "learning_rate": 4.705876059196136e-05, "loss": 0.2418, "step": 1715 }, { "epoch": 0.7807097361237488, "grad_norm": 0.7835832155804697, "learning_rate": 4.705539660754207e-05, "loss": 0.1775, "step": 1716 }, { "epoch": 0.781164695177434, "grad_norm": 0.7792922380196013, "learning_rate": 4.705203082085537e-05, "loss": 0.191, "step": 1717 }, { "epoch": 0.7816196542311192, "grad_norm": 0.7934712912954754, "learning_rate": 4.7048663232176306e-05, "loss": 0.2931, "step": 1718 }, { "epoch": 0.7820746132848043, "grad_norm": 1.0152101381659828, "learning_rate": 4.704529384178004e-05, "loss": 0.1711, "step": 1719 }, { "epoch": 0.7825295723384895, "grad_norm": 0.7119741906406033, "learning_rate": 4.704192264994192e-05, "loss": 0.1523, "step": 1720 }, { "epoch": 0.7829845313921747, "grad_norm": 0.7881346497230108, "learning_rate": 4.703854965693742e-05, "loss": 0.1958, "step": 1721 }, { "epoch": 0.7834394904458599, "grad_norm": 1.3068621874102124, "learning_rate": 4.703517486304218e-05, "loss": 0.2952, "step": 1722 }, { "epoch": 0.783894449499545, "grad_norm": 0.7712898732287292, "learning_rate": 4.703179826853195e-05, "loss": 0.2553, "step": 1723 }, { "epoch": 0.7843494085532302, "grad_norm": 0.9550466726202117, "learning_rate": 4.7028419873682654e-05, "loss": 0.2193, "step": 1724 }, { "epoch": 0.7848043676069154, "grad_norm": 2.6061751616129594, "learning_rate": 4.702503967877038e-05, "loss": 0.2843, "step": 1725 }, { "epoch": 0.7852593266606005, "grad_norm": 0.8586440389264631, "learning_rate": 4.7021657684071316e-05, "loss": 0.2679, "step": 1726 }, { "epoch": 0.7857142857142857, "grad_norm": 0.8647292991680374, "learning_rate": 4.701827388986184e-05, "loss": 0.2624, "step": 1727 }, { "epoch": 0.7861692447679709, "grad_norm": 1.1240073803255768, "learning_rate": 4.701488829641845e-05, "loss": 0.1868, "step": 1728 }, { "epoch": 0.7866242038216561, "grad_norm": 2.2759847106858513, "learning_rate": 4.7011500904017816e-05, "loss": 0.1822, "step": 1729 }, { "epoch": 0.7870791628753412, "grad_norm": 0.7819237664798718, "learning_rate": 4.700811171293673e-05, "loss": 0.1709, "step": 1730 }, { "epoch": 0.7875341219290264, "grad_norm": 0.7552318719641333, "learning_rate": 4.700472072345214e-05, "loss": 0.1927, "step": 1731 }, { "epoch": 0.7879890809827116, "grad_norm": 0.723993926702435, "learning_rate": 4.7001327935841134e-05, "loss": 0.2282, "step": 1732 }, { "epoch": 0.7884440400363967, "grad_norm": 0.9467086263729142, "learning_rate": 4.699793335038098e-05, "loss": 0.2678, "step": 1733 }, { "epoch": 0.7888989990900819, "grad_norm": 0.8200190085909438, "learning_rate": 4.699453696734905e-05, "loss": 0.1638, "step": 1734 }, { "epoch": 0.7893539581437671, "grad_norm": 0.8702046451434459, "learning_rate": 4.699113878702288e-05, "loss": 0.2526, "step": 1735 }, { "epoch": 0.7898089171974523, "grad_norm": 0.9091249122929689, "learning_rate": 4.698773880968017e-05, "loss": 0.1918, "step": 1736 }, { "epoch": 0.7902638762511374, "grad_norm": 0.5815528611075529, "learning_rate": 4.698433703559873e-05, "loss": 0.2196, "step": 1737 }, { "epoch": 0.7907188353048226, "grad_norm": 0.7052950360750093, "learning_rate": 4.698093346505656e-05, "loss": 0.1839, "step": 1738 }, { "epoch": 0.7911737943585078, "grad_norm": 1.3174621585861392, "learning_rate": 4.697752809833177e-05, "loss": 0.2331, "step": 1739 }, { "epoch": 0.7916287534121929, "grad_norm": 1.031028234921381, "learning_rate": 4.697412093570263e-05, "loss": 0.293, "step": 1740 }, { "epoch": 0.792083712465878, "grad_norm": 0.7629695082437841, "learning_rate": 4.697071197744756e-05, "loss": 0.2213, "step": 1741 }, { "epoch": 0.7925386715195633, "grad_norm": 0.9173487757979405, "learning_rate": 4.6967301223845115e-05, "loss": 0.2296, "step": 1742 }, { "epoch": 0.7929936305732485, "grad_norm": 1.3652176372424112, "learning_rate": 4.6963888675174035e-05, "loss": 0.2227, "step": 1743 }, { "epoch": 0.7934485896269335, "grad_norm": 1.184968535576391, "learning_rate": 4.696047433171315e-05, "loss": 0.1597, "step": 1744 }, { "epoch": 0.7939035486806187, "grad_norm": 0.8218758407883989, "learning_rate": 4.695705819374149e-05, "loss": 0.216, "step": 1745 }, { "epoch": 0.7943585077343039, "grad_norm": 0.7539943247182409, "learning_rate": 4.695364026153818e-05, "loss": 0.1876, "step": 1746 }, { "epoch": 0.794813466787989, "grad_norm": 0.6315072600266541, "learning_rate": 4.695022053538253e-05, "loss": 0.1805, "step": 1747 }, { "epoch": 0.7952684258416742, "grad_norm": 1.1880960169575603, "learning_rate": 4.694679901555398e-05, "loss": 0.2914, "step": 1748 }, { "epoch": 0.7957233848953594, "grad_norm": 0.9580232339647915, "learning_rate": 4.6943375702332134e-05, "loss": 0.1853, "step": 1749 }, { "epoch": 0.7961783439490446, "grad_norm": 0.8368753052584226, "learning_rate": 4.693995059599672e-05, "loss": 0.2807, "step": 1750 }, { "epoch": 0.7966333030027297, "grad_norm": 2.7234172584490857, "learning_rate": 4.6936523696827615e-05, "loss": 0.3155, "step": 1751 }, { "epoch": 0.7970882620564149, "grad_norm": 0.8482090189542567, "learning_rate": 4.6933095005104864e-05, "loss": 0.2246, "step": 1752 }, { "epoch": 0.7975432211101001, "grad_norm": 0.9432969253866593, "learning_rate": 4.692966452110864e-05, "loss": 0.2218, "step": 1753 }, { "epoch": 0.7979981801637852, "grad_norm": 0.727703395688791, "learning_rate": 4.692623224511926e-05, "loss": 0.2863, "step": 1754 }, { "epoch": 0.7984531392174704, "grad_norm": 1.2112664695993531, "learning_rate": 4.69227981774172e-05, "loss": 0.2844, "step": 1755 }, { "epoch": 0.7989080982711556, "grad_norm": 1.0357919781218323, "learning_rate": 4.691936231828308e-05, "loss": 0.2173, "step": 1756 }, { "epoch": 0.7993630573248408, "grad_norm": 1.9720044502782152, "learning_rate": 4.6915924667997655e-05, "loss": 0.2233, "step": 1757 }, { "epoch": 0.7998180163785259, "grad_norm": 0.7021275448990074, "learning_rate": 4.691248522684184e-05, "loss": 0.1679, "step": 1758 }, { "epoch": 0.8002729754322111, "grad_norm": 0.7624014501793099, "learning_rate": 4.690904399509668e-05, "loss": 0.2, "step": 1759 }, { "epoch": 0.8007279344858963, "grad_norm": 1.1717077926459623, "learning_rate": 4.6905600973043396e-05, "loss": 0.3625, "step": 1760 }, { "epoch": 0.8011828935395814, "grad_norm": 1.0553191097212828, "learning_rate": 4.690215616096332e-05, "loss": 0.2519, "step": 1761 }, { "epoch": 0.8016378525932666, "grad_norm": 0.9746733212532771, "learning_rate": 4.689870955913795e-05, "loss": 0.1901, "step": 1762 }, { "epoch": 0.8020928116469518, "grad_norm": 0.9347714220841017, "learning_rate": 4.6895261167848935e-05, "loss": 0.2475, "step": 1763 }, { "epoch": 0.802547770700637, "grad_norm": 1.3331534535482115, "learning_rate": 4.689181098737805e-05, "loss": 0.3242, "step": 1764 }, { "epoch": 0.8030027297543221, "grad_norm": 0.8255715004638128, "learning_rate": 4.6888359018007234e-05, "loss": 0.1884, "step": 1765 }, { "epoch": 0.8034576888080073, "grad_norm": 1.0230525077095158, "learning_rate": 4.6884905260018565e-05, "loss": 0.2398, "step": 1766 }, { "epoch": 0.8039126478616925, "grad_norm": 0.9511158357801189, "learning_rate": 4.688144971369427e-05, "loss": 0.2826, "step": 1767 }, { "epoch": 0.8043676069153776, "grad_norm": 0.7860063163154657, "learning_rate": 4.687799237931673e-05, "loss": 0.2094, "step": 1768 }, { "epoch": 0.8048225659690628, "grad_norm": 0.7094904463487425, "learning_rate": 4.687453325716844e-05, "loss": 0.2188, "step": 1769 }, { "epoch": 0.805277525022748, "grad_norm": 1.1648360440471097, "learning_rate": 4.687107234753208e-05, "loss": 0.2476, "step": 1770 }, { "epoch": 0.8057324840764332, "grad_norm": 1.1928285313716782, "learning_rate": 4.6867609650690456e-05, "loss": 0.2402, "step": 1771 }, { "epoch": 0.8061874431301183, "grad_norm": 0.826605473133501, "learning_rate": 4.686414516692653e-05, "loss": 0.2932, "step": 1772 }, { "epoch": 0.8066424021838035, "grad_norm": 0.704107605735714, "learning_rate": 4.686067889652339e-05, "loss": 0.2218, "step": 1773 }, { "epoch": 0.8070973612374887, "grad_norm": 1.0635871797013954, "learning_rate": 4.6857210839764297e-05, "loss": 0.2596, "step": 1774 }, { "epoch": 0.8075523202911737, "grad_norm": 1.0149567532357107, "learning_rate": 4.6853740996932643e-05, "loss": 0.2246, "step": 1775 }, { "epoch": 0.8080072793448589, "grad_norm": 0.6951770211988767, "learning_rate": 4.685026936831196e-05, "loss": 0.2023, "step": 1776 }, { "epoch": 0.8084622383985441, "grad_norm": 0.8908819339186815, "learning_rate": 4.684679595418595e-05, "loss": 0.2652, "step": 1777 }, { "epoch": 0.8089171974522293, "grad_norm": 0.6947407002132683, "learning_rate": 4.6843320754838426e-05, "loss": 0.2345, "step": 1778 }, { "epoch": 0.8093721565059144, "grad_norm": 0.8712306666252957, "learning_rate": 4.683984377055337e-05, "loss": 0.2774, "step": 1779 }, { "epoch": 0.8098271155595996, "grad_norm": 0.6898760130415076, "learning_rate": 4.6836365001614914e-05, "loss": 0.187, "step": 1780 }, { "epoch": 0.8102820746132848, "grad_norm": 0.7425993080060801, "learning_rate": 4.6832884448307325e-05, "loss": 0.2335, "step": 1781 }, { "epoch": 0.8107370336669699, "grad_norm": 1.0065425276790676, "learning_rate": 4.682940211091501e-05, "loss": 0.2135, "step": 1782 }, { "epoch": 0.8111919927206551, "grad_norm": 0.7639819462077564, "learning_rate": 4.6825917989722533e-05, "loss": 0.2392, "step": 1783 }, { "epoch": 0.8116469517743403, "grad_norm": 0.8051947589541039, "learning_rate": 4.682243208501461e-05, "loss": 0.2062, "step": 1784 }, { "epoch": 0.8121019108280255, "grad_norm": 1.026077877147304, "learning_rate": 4.681894439707608e-05, "loss": 0.2519, "step": 1785 }, { "epoch": 0.8125568698817106, "grad_norm": 0.6490356447271867, "learning_rate": 4.681545492619195e-05, "loss": 0.1605, "step": 1786 }, { "epoch": 0.8130118289353958, "grad_norm": 1.0774911990052742, "learning_rate": 4.6811963672647367e-05, "loss": 0.2576, "step": 1787 }, { "epoch": 0.813466787989081, "grad_norm": 1.2572525330640067, "learning_rate": 4.680847063672761e-05, "loss": 0.2032, "step": 1788 }, { "epoch": 0.8139217470427661, "grad_norm": 0.6888614732968641, "learning_rate": 4.680497581871811e-05, "loss": 0.2183, "step": 1789 }, { "epoch": 0.8143767060964513, "grad_norm": 1.0012704901002916, "learning_rate": 4.680147921890446e-05, "loss": 0.2532, "step": 1790 }, { "epoch": 0.8148316651501365, "grad_norm": 1.1145134589812051, "learning_rate": 4.6797980837572396e-05, "loss": 0.2516, "step": 1791 }, { "epoch": 0.8152866242038217, "grad_norm": 0.9211514703691216, "learning_rate": 4.6794480675007765e-05, "loss": 0.235, "step": 1792 }, { "epoch": 0.8157415832575068, "grad_norm": 1.014969710630086, "learning_rate": 4.6790978731496596e-05, "loss": 0.2372, "step": 1793 }, { "epoch": 0.816196542311192, "grad_norm": 0.7304816583693594, "learning_rate": 4.678747500732506e-05, "loss": 0.2905, "step": 1794 }, { "epoch": 0.8166515013648772, "grad_norm": 1.0487453713249915, "learning_rate": 4.678396950277945e-05, "loss": 0.2695, "step": 1795 }, { "epoch": 0.8171064604185623, "grad_norm": 0.7404427893741102, "learning_rate": 4.6780462218146234e-05, "loss": 0.2214, "step": 1796 }, { "epoch": 0.8175614194722475, "grad_norm": 0.7783284480432697, "learning_rate": 4.6776953153712e-05, "loss": 0.2737, "step": 1797 }, { "epoch": 0.8180163785259327, "grad_norm": 0.847330041394511, "learning_rate": 4.6773442309763496e-05, "loss": 0.2211, "step": 1798 }, { "epoch": 0.8184713375796179, "grad_norm": 0.6199545558134602, "learning_rate": 4.676992968658762e-05, "loss": 0.2165, "step": 1799 }, { "epoch": 0.818926296633303, "grad_norm": 0.7478161764671385, "learning_rate": 4.67664152844714e-05, "loss": 0.1576, "step": 1800 }, { "epoch": 0.8193812556869882, "grad_norm": 1.0055881619478166, "learning_rate": 4.676289910370202e-05, "loss": 0.2622, "step": 1801 }, { "epoch": 0.8198362147406734, "grad_norm": 0.9992937761171394, "learning_rate": 4.675938114456681e-05, "loss": 0.2773, "step": 1802 }, { "epoch": 0.8202911737943585, "grad_norm": 0.8710061340668593, "learning_rate": 4.675586140735323e-05, "loss": 0.2963, "step": 1803 }, { "epoch": 0.8207461328480437, "grad_norm": 0.9978593717421264, "learning_rate": 4.675233989234891e-05, "loss": 0.2565, "step": 1804 }, { "epoch": 0.8212010919017289, "grad_norm": 0.7703154365955148, "learning_rate": 4.674881659984161e-05, "loss": 0.1821, "step": 1805 }, { "epoch": 0.821656050955414, "grad_norm": 0.7561245616672031, "learning_rate": 4.674529153011922e-05, "loss": 0.2224, "step": 1806 }, { "epoch": 0.8221110100090991, "grad_norm": 1.004749675749536, "learning_rate": 4.6741764683469816e-05, "loss": 0.2801, "step": 1807 }, { "epoch": 0.8225659690627843, "grad_norm": 0.801523227927576, "learning_rate": 4.673823606018159e-05, "loss": 0.2758, "step": 1808 }, { "epoch": 0.8230209281164695, "grad_norm": 0.9895389550349256, "learning_rate": 4.6734705660542874e-05, "loss": 0.2095, "step": 1809 }, { "epoch": 0.8234758871701547, "grad_norm": 0.8098062024985617, "learning_rate": 4.673117348484217e-05, "loss": 0.2464, "step": 1810 }, { "epoch": 0.8239308462238398, "grad_norm": 0.6163778915836193, "learning_rate": 4.6727639533368105e-05, "loss": 0.1872, "step": 1811 }, { "epoch": 0.824385805277525, "grad_norm": 1.1769287985894705, "learning_rate": 4.672410380640946e-05, "loss": 0.333, "step": 1812 }, { "epoch": 0.8248407643312102, "grad_norm": 0.8022100311045284, "learning_rate": 4.672056630425515e-05, "loss": 0.1783, "step": 1813 }, { "epoch": 0.8252957233848953, "grad_norm": 0.6119145765128124, "learning_rate": 4.671702702719426e-05, "loss": 0.1832, "step": 1814 }, { "epoch": 0.8257506824385805, "grad_norm": 1.437038473037203, "learning_rate": 4.671348597551599e-05, "loss": 0.3134, "step": 1815 }, { "epoch": 0.8262056414922657, "grad_norm": 0.8888812613834538, "learning_rate": 4.670994314950971e-05, "loss": 0.253, "step": 1816 }, { "epoch": 0.8266606005459509, "grad_norm": 1.1146761197461337, "learning_rate": 4.6706398549464905e-05, "loss": 0.2952, "step": 1817 }, { "epoch": 0.827115559599636, "grad_norm": 1.0158471487069036, "learning_rate": 4.6702852175671243e-05, "loss": 0.1983, "step": 1818 }, { "epoch": 0.8275705186533212, "grad_norm": 0.808069927915032, "learning_rate": 4.669930402841851e-05, "loss": 0.2573, "step": 1819 }, { "epoch": 0.8280254777070064, "grad_norm": 1.6895699883697703, "learning_rate": 4.669575410799665e-05, "loss": 0.3078, "step": 1820 }, { "epoch": 0.8284804367606915, "grad_norm": 0.7015595497974129, "learning_rate": 4.669220241469573e-05, "loss": 0.206, "step": 1821 }, { "epoch": 0.8289353958143767, "grad_norm": 0.7673138446265733, "learning_rate": 4.6688648948806e-05, "loss": 0.2693, "step": 1822 }, { "epoch": 0.8293903548680619, "grad_norm": 1.1211117956717018, "learning_rate": 4.668509371061781e-05, "loss": 0.3294, "step": 1823 }, { "epoch": 0.8298453139217471, "grad_norm": 0.995892861540871, "learning_rate": 4.6681536700421704e-05, "loss": 0.3288, "step": 1824 }, { "epoch": 0.8303002729754322, "grad_norm": 0.9703853272000893, "learning_rate": 4.667797791850833e-05, "loss": 0.2009, "step": 1825 }, { "epoch": 0.8307552320291174, "grad_norm": 0.7696197948764151, "learning_rate": 4.6674417365168495e-05, "loss": 0.2049, "step": 1826 }, { "epoch": 0.8312101910828026, "grad_norm": 0.851781451786209, "learning_rate": 4.6670855040693154e-05, "loss": 0.257, "step": 1827 }, { "epoch": 0.8316651501364877, "grad_norm": 1.0742654099379305, "learning_rate": 4.66672909453734e-05, "loss": 0.2276, "step": 1828 }, { "epoch": 0.8321201091901729, "grad_norm": 1.0725306170906928, "learning_rate": 4.666372507950048e-05, "loss": 0.3169, "step": 1829 }, { "epoch": 0.8325750682438581, "grad_norm": 0.738645201925261, "learning_rate": 4.666015744336578e-05, "loss": 0.1667, "step": 1830 }, { "epoch": 0.8330300272975433, "grad_norm": 0.7605359954526583, "learning_rate": 4.6656588037260834e-05, "loss": 0.2039, "step": 1831 }, { "epoch": 0.8334849863512284, "grad_norm": 0.7435410751805681, "learning_rate": 4.665301686147731e-05, "loss": 0.2198, "step": 1832 }, { "epoch": 0.8339399454049136, "grad_norm": 0.5018286014633249, "learning_rate": 4.6649443916307034e-05, "loss": 0.1546, "step": 1833 }, { "epoch": 0.8343949044585988, "grad_norm": 1.2242717503353187, "learning_rate": 4.664586920204197e-05, "loss": 0.3056, "step": 1834 }, { "epoch": 0.8348498635122839, "grad_norm": 0.949646065247952, "learning_rate": 4.664229271897422e-05, "loss": 0.2169, "step": 1835 }, { "epoch": 0.835304822565969, "grad_norm": 0.7246312493494437, "learning_rate": 4.6638714467396056e-05, "loss": 0.2671, "step": 1836 }, { "epoch": 0.8357597816196543, "grad_norm": 0.8150972337035541, "learning_rate": 4.663513444759986e-05, "loss": 0.1975, "step": 1837 }, { "epoch": 0.8362147406733395, "grad_norm": 1.1125925631170612, "learning_rate": 4.6631552659878176e-05, "loss": 0.2902, "step": 1838 }, { "epoch": 0.8366696997270245, "grad_norm": 0.9019825020561058, "learning_rate": 4.66279691045237e-05, "loss": 0.2663, "step": 1839 }, { "epoch": 0.8371246587807097, "grad_norm": 1.0304339303089993, "learning_rate": 4.662438378182926e-05, "loss": 0.3157, "step": 1840 }, { "epoch": 0.8375796178343949, "grad_norm": 0.795789497477498, "learning_rate": 4.6620796692087834e-05, "loss": 0.2194, "step": 1841 }, { "epoch": 0.83803457688808, "grad_norm": 1.0680290804182897, "learning_rate": 4.661720783559254e-05, "loss": 0.2424, "step": 1842 }, { "epoch": 0.8384895359417652, "grad_norm": 0.8397554072692728, "learning_rate": 4.661361721263664e-05, "loss": 0.2272, "step": 1843 }, { "epoch": 0.8389444949954504, "grad_norm": 0.8918098831349569, "learning_rate": 4.661002482351355e-05, "loss": 0.254, "step": 1844 }, { "epoch": 0.8393994540491356, "grad_norm": 1.4592141061638724, "learning_rate": 4.660643066851682e-05, "loss": 0.2389, "step": 1845 }, { "epoch": 0.8398544131028207, "grad_norm": 0.9548396890088198, "learning_rate": 4.660283474794015e-05, "loss": 0.2626, "step": 1846 }, { "epoch": 0.8403093721565059, "grad_norm": 2.5166250137339463, "learning_rate": 4.6599237062077385e-05, "loss": 0.1841, "step": 1847 }, { "epoch": 0.8407643312101911, "grad_norm": 0.9763946737025999, "learning_rate": 4.6595637611222505e-05, "loss": 0.3035, "step": 1848 }, { "epoch": 0.8412192902638762, "grad_norm": 0.650231466084095, "learning_rate": 4.6592036395669644e-05, "loss": 0.1919, "step": 1849 }, { "epoch": 0.8416742493175614, "grad_norm": 1.1806455449934175, "learning_rate": 4.6588433415713084e-05, "loss": 0.2034, "step": 1850 }, { "epoch": 0.8421292083712466, "grad_norm": 0.7298682203994213, "learning_rate": 4.658482867164723e-05, "loss": 0.1623, "step": 1851 }, { "epoch": 0.8425841674249318, "grad_norm": 0.5569689449562514, "learning_rate": 4.658122216376666e-05, "loss": 0.1873, "step": 1852 }, { "epoch": 0.8430391264786169, "grad_norm": 1.5988538507185075, "learning_rate": 4.657761389236607e-05, "loss": 0.324, "step": 1853 }, { "epoch": 0.8434940855323021, "grad_norm": 1.0207376776498869, "learning_rate": 4.657400385774032e-05, "loss": 0.2637, "step": 1854 }, { "epoch": 0.8439490445859873, "grad_norm": 1.340641557857451, "learning_rate": 4.65703920601844e-05, "loss": 0.3697, "step": 1855 }, { "epoch": 0.8444040036396724, "grad_norm": 0.719503390289502, "learning_rate": 4.656677849999345e-05, "loss": 0.2002, "step": 1856 }, { "epoch": 0.8448589626933576, "grad_norm": 1.0321990105751249, "learning_rate": 4.6563163177462754e-05, "loss": 0.2551, "step": 1857 }, { "epoch": 0.8453139217470428, "grad_norm": 0.9429711993689786, "learning_rate": 4.655954609288775e-05, "loss": 0.2312, "step": 1858 }, { "epoch": 0.845768880800728, "grad_norm": 0.7187240521711654, "learning_rate": 4.655592724656399e-05, "loss": 0.1905, "step": 1859 }, { "epoch": 0.8462238398544131, "grad_norm": 0.6217554038990128, "learning_rate": 4.655230663878721e-05, "loss": 0.1496, "step": 1860 }, { "epoch": 0.8466787989080983, "grad_norm": 0.7836954385876896, "learning_rate": 4.654868426985326e-05, "loss": 0.2002, "step": 1861 }, { "epoch": 0.8471337579617835, "grad_norm": 0.9314379388582316, "learning_rate": 4.654506014005814e-05, "loss": 0.2532, "step": 1862 }, { "epoch": 0.8475887170154686, "grad_norm": 1.219979473523802, "learning_rate": 4.6541434249698e-05, "loss": 0.2635, "step": 1863 }, { "epoch": 0.8480436760691538, "grad_norm": 0.9305279857719031, "learning_rate": 4.653780659906914e-05, "loss": 0.2687, "step": 1864 }, { "epoch": 0.848498635122839, "grad_norm": 0.8166185715007146, "learning_rate": 4.6534177188467985e-05, "loss": 0.2173, "step": 1865 }, { "epoch": 0.8489535941765242, "grad_norm": 1.1323368019022846, "learning_rate": 4.653054601819112e-05, "loss": 0.2618, "step": 1866 }, { "epoch": 0.8494085532302093, "grad_norm": 0.8174126069133056, "learning_rate": 4.6526913088535264e-05, "loss": 0.2138, "step": 1867 }, { "epoch": 0.8498635122838945, "grad_norm": 0.7306604696684644, "learning_rate": 4.652327839979729e-05, "loss": 0.2374, "step": 1868 }, { "epoch": 0.8503184713375797, "grad_norm": 1.1089120615306276, "learning_rate": 4.6519641952274195e-05, "loss": 0.1935, "step": 1869 }, { "epoch": 0.8507734303912647, "grad_norm": 0.8286194860185626, "learning_rate": 4.6516003746263147e-05, "loss": 0.1935, "step": 1870 }, { "epoch": 0.8512283894449499, "grad_norm": 0.7394474824035133, "learning_rate": 4.651236378206144e-05, "loss": 0.2161, "step": 1871 }, { "epoch": 0.8516833484986351, "grad_norm": 0.7851586878518295, "learning_rate": 4.650872205996651e-05, "loss": 0.2418, "step": 1872 }, { "epoch": 0.8521383075523203, "grad_norm": 0.8752911551781419, "learning_rate": 4.650507858027595e-05, "loss": 0.1881, "step": 1873 }, { "epoch": 0.8525932666060054, "grad_norm": 0.9338501130023897, "learning_rate": 4.6501433343287477e-05, "loss": 0.252, "step": 1874 }, { "epoch": 0.8530482256596906, "grad_norm": 1.2974802700932495, "learning_rate": 4.6497786349298975e-05, "loss": 0.2854, "step": 1875 }, { "epoch": 0.8535031847133758, "grad_norm": 0.9526266180420256, "learning_rate": 4.649413759860846e-05, "loss": 0.1943, "step": 1876 }, { "epoch": 0.8539581437670609, "grad_norm": 1.0453136299550185, "learning_rate": 4.649048709151408e-05, "loss": 0.2814, "step": 1877 }, { "epoch": 0.8544131028207461, "grad_norm": 0.7304432832553566, "learning_rate": 4.648683482831414e-05, "loss": 0.1688, "step": 1878 }, { "epoch": 0.8548680618744313, "grad_norm": 0.9938363280224967, "learning_rate": 4.6483180809307104e-05, "loss": 0.2709, "step": 1879 }, { "epoch": 0.8553230209281165, "grad_norm": 0.743015859214017, "learning_rate": 4.647952503479154e-05, "loss": 0.2456, "step": 1880 }, { "epoch": 0.8557779799818016, "grad_norm": 0.8994947312536917, "learning_rate": 4.647586750506619e-05, "loss": 0.2793, "step": 1881 }, { "epoch": 0.8562329390354868, "grad_norm": 0.9221921226615283, "learning_rate": 4.6472208220429944e-05, "loss": 0.2314, "step": 1882 }, { "epoch": 0.856687898089172, "grad_norm": 0.9299998123370422, "learning_rate": 4.6468547181181804e-05, "loss": 0.2002, "step": 1883 }, { "epoch": 0.8571428571428571, "grad_norm": 0.9161996273201173, "learning_rate": 4.646488438762094e-05, "loss": 0.2991, "step": 1884 }, { "epoch": 0.8575978161965423, "grad_norm": 0.7272467356551077, "learning_rate": 4.6461219840046654e-05, "loss": 0.2125, "step": 1885 }, { "epoch": 0.8580527752502275, "grad_norm": 0.9759818068950538, "learning_rate": 4.64575535387584e-05, "loss": 0.343, "step": 1886 }, { "epoch": 0.8585077343039127, "grad_norm": 12.601559917013905, "learning_rate": 4.645388548405578e-05, "loss": 0.4077, "step": 1887 }, { "epoch": 0.8589626933575978, "grad_norm": 0.9859239852215845, "learning_rate": 4.645021567623852e-05, "loss": 0.3147, "step": 1888 }, { "epoch": 0.859417652411283, "grad_norm": 0.8030104446842451, "learning_rate": 4.64465441156065e-05, "loss": 0.215, "step": 1889 }, { "epoch": 0.8598726114649682, "grad_norm": 0.9198669667685293, "learning_rate": 4.644287080245975e-05, "loss": 0.2473, "step": 1890 }, { "epoch": 0.8603275705186533, "grad_norm": 0.8465514952601936, "learning_rate": 4.643919573709843e-05, "loss": 0.2749, "step": 1891 }, { "epoch": 0.8607825295723385, "grad_norm": 0.7943564571600611, "learning_rate": 4.6435518919822854e-05, "loss": 0.2767, "step": 1892 }, { "epoch": 0.8612374886260237, "grad_norm": 0.9755068876091788, "learning_rate": 4.6431840350933476e-05, "loss": 0.1788, "step": 1893 }, { "epoch": 0.8616924476797089, "grad_norm": 0.8899131888238376, "learning_rate": 4.642816003073089e-05, "loss": 0.2302, "step": 1894 }, { "epoch": 0.862147406733394, "grad_norm": 0.607970483390467, "learning_rate": 4.642447795951583e-05, "loss": 0.2186, "step": 1895 }, { "epoch": 0.8626023657870792, "grad_norm": 0.6391084233611739, "learning_rate": 4.642079413758919e-05, "loss": 0.1957, "step": 1896 }, { "epoch": 0.8630573248407644, "grad_norm": 0.9432208680999226, "learning_rate": 4.641710856525199e-05, "loss": 0.2709, "step": 1897 }, { "epoch": 0.8635122838944495, "grad_norm": 0.6894998383364671, "learning_rate": 4.6413421242805385e-05, "loss": 0.2353, "step": 1898 }, { "epoch": 0.8639672429481347, "grad_norm": 0.9775206711070474, "learning_rate": 4.6409732170550705e-05, "loss": 0.2344, "step": 1899 }, { "epoch": 0.8644222020018199, "grad_norm": 1.647367256635667, "learning_rate": 4.64060413487894e-05, "loss": 0.2213, "step": 1900 }, { "epoch": 0.864877161055505, "grad_norm": 0.9469124646362161, "learning_rate": 4.6402348777823063e-05, "loss": 0.2228, "step": 1901 }, { "epoch": 0.8653321201091901, "grad_norm": 158.76337561745143, "learning_rate": 4.639865445795344e-05, "loss": 0.7401, "step": 1902 }, { "epoch": 0.8657870791628753, "grad_norm": 0.9698656746401868, "learning_rate": 4.63949583894824e-05, "loss": 0.1624, "step": 1903 }, { "epoch": 0.8662420382165605, "grad_norm": 0.9332260828633699, "learning_rate": 4.6391260572711984e-05, "loss": 0.2787, "step": 1904 }, { "epoch": 0.8666969972702456, "grad_norm": 0.8113785739884632, "learning_rate": 4.638756100794436e-05, "loss": 0.2576, "step": 1905 }, { "epoch": 0.8671519563239308, "grad_norm": 0.8295626932854749, "learning_rate": 4.6383859695481835e-05, "loss": 0.2455, "step": 1906 }, { "epoch": 0.867606915377616, "grad_norm": 0.727674099829055, "learning_rate": 4.638015663562686e-05, "loss": 0.1838, "step": 1907 }, { "epoch": 0.8680618744313012, "grad_norm": 0.9089931221191712, "learning_rate": 4.637645182868204e-05, "loss": 0.241, "step": 1908 }, { "epoch": 0.8685168334849863, "grad_norm": 0.7217467763681586, "learning_rate": 4.637274527495011e-05, "loss": 0.2098, "step": 1909 }, { "epoch": 0.8689717925386715, "grad_norm": 0.8483360952351728, "learning_rate": 4.636903697473395e-05, "loss": 0.2172, "step": 1910 }, { "epoch": 0.8694267515923567, "grad_norm": 0.6660030626274394, "learning_rate": 4.6365326928336597e-05, "loss": 0.176, "step": 1911 }, { "epoch": 0.8698817106460418, "grad_norm": 0.922721780760967, "learning_rate": 4.6361615136061214e-05, "loss": 0.2618, "step": 1912 }, { "epoch": 0.870336669699727, "grad_norm": 0.7276891696379233, "learning_rate": 4.6357901598211105e-05, "loss": 0.1958, "step": 1913 }, { "epoch": 0.8707916287534122, "grad_norm": 0.6926178317940361, "learning_rate": 4.635418631508973e-05, "loss": 0.1986, "step": 1914 }, { "epoch": 0.8712465878070974, "grad_norm": 2.448726689134449, "learning_rate": 4.6350469287000686e-05, "loss": 0.3595, "step": 1915 }, { "epoch": 0.8717015468607825, "grad_norm": 0.9055588214396405, "learning_rate": 4.6346750514247714e-05, "loss": 0.2405, "step": 1916 }, { "epoch": 0.8721565059144677, "grad_norm": 0.6652231944304916, "learning_rate": 4.634302999713468e-05, "loss": 0.1307, "step": 1917 }, { "epoch": 0.8726114649681529, "grad_norm": 0.7545829218236425, "learning_rate": 4.6339307735965635e-05, "loss": 0.252, "step": 1918 }, { "epoch": 0.873066424021838, "grad_norm": 0.960527989204103, "learning_rate": 4.633558373104471e-05, "loss": 0.2718, "step": 1919 }, { "epoch": 0.8735213830755232, "grad_norm": 0.6226946557437518, "learning_rate": 4.633185798267624e-05, "loss": 0.2344, "step": 1920 }, { "epoch": 0.8739763421292084, "grad_norm": 1.1770572830372952, "learning_rate": 4.6328130491164675e-05, "loss": 0.3174, "step": 1921 }, { "epoch": 0.8744313011828936, "grad_norm": 0.761318803022577, "learning_rate": 4.63244012568146e-05, "loss": 0.2563, "step": 1922 }, { "epoch": 0.8748862602365787, "grad_norm": 1.112130780920492, "learning_rate": 4.632067027993076e-05, "loss": 0.261, "step": 1923 }, { "epoch": 0.8753412192902639, "grad_norm": 0.6443108923874801, "learning_rate": 4.631693756081802e-05, "loss": 0.1669, "step": 1924 }, { "epoch": 0.8757961783439491, "grad_norm": 0.9073543919597087, "learning_rate": 4.6313203099781413e-05, "loss": 0.2548, "step": 1925 }, { "epoch": 0.8762511373976342, "grad_norm": 0.7871967462798791, "learning_rate": 4.630946689712609e-05, "loss": 0.2416, "step": 1926 }, { "epoch": 0.8767060964513194, "grad_norm": 0.8560723010159856, "learning_rate": 4.630572895315737e-05, "loss": 0.2494, "step": 1927 }, { "epoch": 0.8771610555050046, "grad_norm": 1.3568766394538878, "learning_rate": 4.63019892681807e-05, "loss": 0.1573, "step": 1928 }, { "epoch": 0.8776160145586898, "grad_norm": 0.6796191925436035, "learning_rate": 4.6298247842501665e-05, "loss": 0.1793, "step": 1929 }, { "epoch": 0.8780709736123748, "grad_norm": 0.7052926779665649, "learning_rate": 4.629450467642599e-05, "loss": 0.1666, "step": 1930 }, { "epoch": 0.87852593266606, "grad_norm": 0.7817608784865954, "learning_rate": 4.629075977025957e-05, "loss": 0.2139, "step": 1931 }, { "epoch": 0.8789808917197452, "grad_norm": 0.6854198193492888, "learning_rate": 4.6287013124308395e-05, "loss": 0.2262, "step": 1932 }, { "epoch": 0.8794358507734303, "grad_norm": 0.7994789071924215, "learning_rate": 4.628326473887865e-05, "loss": 0.1589, "step": 1933 }, { "epoch": 0.8798908098271155, "grad_norm": 0.5647076465209514, "learning_rate": 4.627951461427662e-05, "loss": 0.1323, "step": 1934 }, { "epoch": 0.8803457688808007, "grad_norm": 1.160822524454404, "learning_rate": 4.627576275080876e-05, "loss": 0.2517, "step": 1935 }, { "epoch": 0.8808007279344859, "grad_norm": 0.6563696872786408, "learning_rate": 4.627200914878165e-05, "loss": 0.2144, "step": 1936 }, { "epoch": 0.881255686988171, "grad_norm": 0.7452355357390676, "learning_rate": 4.6268253808502005e-05, "loss": 0.2363, "step": 1937 }, { "epoch": 0.8817106460418562, "grad_norm": 0.5482408720946741, "learning_rate": 4.626449673027671e-05, "loss": 0.1545, "step": 1938 }, { "epoch": 0.8821656050955414, "grad_norm": 0.9608989310635632, "learning_rate": 4.6260737914412775e-05, "loss": 0.2281, "step": 1939 }, { "epoch": 0.8826205641492265, "grad_norm": 0.6553974977030336, "learning_rate": 4.625697736121735e-05, "loss": 0.2405, "step": 1940 }, { "epoch": 0.8830755232029117, "grad_norm": 0.7330961684516484, "learning_rate": 4.6253215070997726e-05, "loss": 0.2363, "step": 1941 }, { "epoch": 0.8835304822565969, "grad_norm": 0.7138994404952329, "learning_rate": 4.624945104406135e-05, "loss": 0.1856, "step": 1942 }, { "epoch": 0.8839854413102821, "grad_norm": 0.8402802237608219, "learning_rate": 4.6245685280715795e-05, "loss": 0.1587, "step": 1943 }, { "epoch": 0.8844404003639672, "grad_norm": 0.7182303556439276, "learning_rate": 4.624191778126878e-05, "loss": 0.2493, "step": 1944 }, { "epoch": 0.8848953594176524, "grad_norm": 0.8507207942033849, "learning_rate": 4.623814854602817e-05, "loss": 0.2943, "step": 1945 }, { "epoch": 0.8853503184713376, "grad_norm": 0.7008646278272528, "learning_rate": 4.623437757530198e-05, "loss": 0.1519, "step": 1946 }, { "epoch": 0.8858052775250227, "grad_norm": 0.711840864684127, "learning_rate": 4.623060486939834e-05, "loss": 0.201, "step": 1947 }, { "epoch": 0.8862602365787079, "grad_norm": 1.4388990652806701, "learning_rate": 4.622683042862556e-05, "loss": 0.2525, "step": 1948 }, { "epoch": 0.8867151956323931, "grad_norm": 4.860569255144998, "learning_rate": 4.622305425329204e-05, "loss": 0.1489, "step": 1949 }, { "epoch": 0.8871701546860783, "grad_norm": 0.6884569010224555, "learning_rate": 4.6219276343706374e-05, "loss": 0.1778, "step": 1950 }, { "epoch": 0.8876251137397634, "grad_norm": 0.8838769974814632, "learning_rate": 4.6215496700177276e-05, "loss": 0.29, "step": 1951 }, { "epoch": 0.8880800727934486, "grad_norm": 0.7136978626771867, "learning_rate": 4.621171532301359e-05, "loss": 0.2188, "step": 1952 }, { "epoch": 0.8885350318471338, "grad_norm": 0.7185575175596868, "learning_rate": 4.620793221252432e-05, "loss": 0.2436, "step": 1953 }, { "epoch": 0.8889899909008189, "grad_norm": 0.8093863456181173, "learning_rate": 4.620414736901861e-05, "loss": 0.2259, "step": 1954 }, { "epoch": 0.8894449499545041, "grad_norm": 0.7333665132966963, "learning_rate": 4.620036079280573e-05, "loss": 0.2749, "step": 1955 }, { "epoch": 0.8898999090081893, "grad_norm": 1.124173634407274, "learning_rate": 4.61965724841951e-05, "loss": 0.32, "step": 1956 }, { "epoch": 0.8903548680618745, "grad_norm": 0.7057127477907514, "learning_rate": 4.61927824434963e-05, "loss": 0.1869, "step": 1957 }, { "epoch": 0.8908098271155596, "grad_norm": 0.6891706239016225, "learning_rate": 4.618899067101902e-05, "loss": 0.1968, "step": 1958 }, { "epoch": 0.8912647861692448, "grad_norm": 0.6396661032704952, "learning_rate": 4.618519716707311e-05, "loss": 0.1792, "step": 1959 }, { "epoch": 0.89171974522293, "grad_norm": 1.2156673962625801, "learning_rate": 4.618140193196856e-05, "loss": 0.2245, "step": 1960 }, { "epoch": 0.892174704276615, "grad_norm": 1.074900035345066, "learning_rate": 4.61776049660155e-05, "loss": 0.2245, "step": 1961 }, { "epoch": 0.8926296633303002, "grad_norm": 0.8244052698456992, "learning_rate": 4.6173806269524195e-05, "loss": 0.2096, "step": 1962 }, { "epoch": 0.8930846223839854, "grad_norm": 0.8973975017925981, "learning_rate": 4.617000584280507e-05, "loss": 0.1701, "step": 1963 }, { "epoch": 0.8935395814376706, "grad_norm": 0.7640483319062898, "learning_rate": 4.6166203686168665e-05, "loss": 0.2431, "step": 1964 }, { "epoch": 0.8939945404913557, "grad_norm": 0.9931292912869408, "learning_rate": 4.616239979992568e-05, "loss": 0.3454, "step": 1965 }, { "epoch": 0.8944494995450409, "grad_norm": 0.9068879723263414, "learning_rate": 4.615859418438695e-05, "loss": 0.2836, "step": 1966 }, { "epoch": 0.8949044585987261, "grad_norm": 0.986557280364449, "learning_rate": 4.615478683986345e-05, "loss": 0.2011, "step": 1967 }, { "epoch": 0.8953594176524113, "grad_norm": 0.9095097041612654, "learning_rate": 4.6150977766666315e-05, "loss": 0.2884, "step": 1968 }, { "epoch": 0.8958143767060964, "grad_norm": 0.9809791061294768, "learning_rate": 4.614716696510679e-05, "loss": 0.3157, "step": 1969 }, { "epoch": 0.8962693357597816, "grad_norm": 0.960546994848248, "learning_rate": 4.614335443549628e-05, "loss": 0.2181, "step": 1970 }, { "epoch": 0.8967242948134668, "grad_norm": 0.7645612974513454, "learning_rate": 4.613954017814633e-05, "loss": 0.1712, "step": 1971 }, { "epoch": 0.8971792538671519, "grad_norm": 1.1551626306003082, "learning_rate": 4.6135724193368615e-05, "loss": 0.1894, "step": 1972 }, { "epoch": 0.8976342129208371, "grad_norm": 0.6648616795666332, "learning_rate": 4.6131906481474975e-05, "loss": 0.1942, "step": 1973 }, { "epoch": 0.8980891719745223, "grad_norm": 1.7011035640131738, "learning_rate": 4.612808704277736e-05, "loss": 0.2622, "step": 1974 }, { "epoch": 0.8985441310282075, "grad_norm": 0.7251316811522316, "learning_rate": 4.612426587758789e-05, "loss": 0.2484, "step": 1975 }, { "epoch": 0.8989990900818926, "grad_norm": 1.1014265188092283, "learning_rate": 4.612044298621881e-05, "loss": 0.2109, "step": 1976 }, { "epoch": 0.8994540491355778, "grad_norm": 0.8671016321138109, "learning_rate": 4.6116618368982515e-05, "loss": 0.2316, "step": 1977 }, { "epoch": 0.899909008189263, "grad_norm": 0.8792213040866663, "learning_rate": 4.6112792026191514e-05, "loss": 0.2318, "step": 1978 }, { "epoch": 0.9003639672429481, "grad_norm": 0.8551067192131241, "learning_rate": 4.61089639581585e-05, "loss": 0.2282, "step": 1979 }, { "epoch": 0.9008189262966333, "grad_norm": 0.9114204285087306, "learning_rate": 4.610513416519628e-05, "loss": 0.2135, "step": 1980 }, { "epoch": 0.9012738853503185, "grad_norm": 0.8903637477139125, "learning_rate": 4.610130264761781e-05, "loss": 0.1998, "step": 1981 }, { "epoch": 0.9017288444040037, "grad_norm": 0.869441380504924, "learning_rate": 4.6097469405736174e-05, "loss": 0.1898, "step": 1982 }, { "epoch": 0.9021838034576888, "grad_norm": 0.8647662506307642, "learning_rate": 4.609363443986461e-05, "loss": 0.2464, "step": 1983 }, { "epoch": 0.902638762511374, "grad_norm": 0.9360633313142048, "learning_rate": 4.60897977503165e-05, "loss": 0.2438, "step": 1984 }, { "epoch": 0.9030937215650592, "grad_norm": 1.0219791023743805, "learning_rate": 4.608595933740536e-05, "loss": 0.3168, "step": 1985 }, { "epoch": 0.9035486806187443, "grad_norm": 0.6184629843212726, "learning_rate": 4.6082119201444854e-05, "loss": 0.2643, "step": 1986 }, { "epoch": 0.9040036396724295, "grad_norm": 1.1384232440582027, "learning_rate": 4.607827734274876e-05, "loss": 0.2303, "step": 1987 }, { "epoch": 0.9044585987261147, "grad_norm": 0.9123860259256762, "learning_rate": 4.607443376163104e-05, "loss": 0.1974, "step": 1988 }, { "epoch": 0.9049135577797999, "grad_norm": 0.8660291523461697, "learning_rate": 4.6070588458405753e-05, "loss": 0.2096, "step": 1989 }, { "epoch": 0.905368516833485, "grad_norm": 1.5005809639108874, "learning_rate": 4.606674143338714e-05, "loss": 0.2105, "step": 1990 }, { "epoch": 0.9058234758871702, "grad_norm": 1.3113141120949041, "learning_rate": 4.606289268688955e-05, "loss": 0.2577, "step": 1991 }, { "epoch": 0.9062784349408554, "grad_norm": 1.0240486736553809, "learning_rate": 4.605904221922749e-05, "loss": 0.2718, "step": 1992 }, { "epoch": 0.9067333939945404, "grad_norm": 0.995232994874542, "learning_rate": 4.605519003071561e-05, "loss": 0.2642, "step": 1993 }, { "epoch": 0.9071883530482256, "grad_norm": 0.6689503737858176, "learning_rate": 4.6051336121668676e-05, "loss": 0.1921, "step": 1994 }, { "epoch": 0.9076433121019108, "grad_norm": 0.6823835373070575, "learning_rate": 4.604748049240162e-05, "loss": 0.1798, "step": 1995 }, { "epoch": 0.908098271155596, "grad_norm": 0.6203025572163604, "learning_rate": 4.604362314322951e-05, "loss": 0.1656, "step": 1996 }, { "epoch": 0.9085532302092811, "grad_norm": 1.0716425680642103, "learning_rate": 4.603976407446755e-05, "loss": 0.2224, "step": 1997 }, { "epoch": 0.9090081892629663, "grad_norm": 0.5715126642813589, "learning_rate": 4.603590328643108e-05, "loss": 0.1404, "step": 1998 }, { "epoch": 0.9094631483166515, "grad_norm": 0.8358698285874162, "learning_rate": 4.6032040779435597e-05, "loss": 0.2114, "step": 1999 }, { "epoch": 0.9099181073703366, "grad_norm": 0.8315720747988173, "learning_rate": 4.602817655379672e-05, "loss": 0.2623, "step": 2000 }, { "epoch": 0.9103730664240218, "grad_norm": 0.771600247771412, "learning_rate": 4.602431060983022e-05, "loss": 0.1878, "step": 2001 }, { "epoch": 0.910828025477707, "grad_norm": 0.7349234334829461, "learning_rate": 4.602044294785199e-05, "loss": 0.2095, "step": 2002 }, { "epoch": 0.9112829845313922, "grad_norm": 0.6838796483292938, "learning_rate": 4.60165735681781e-05, "loss": 0.1789, "step": 2003 }, { "epoch": 0.9117379435850773, "grad_norm": 0.7563690068639993, "learning_rate": 4.601270247112473e-05, "loss": 0.3092, "step": 2004 }, { "epoch": 0.9121929026387625, "grad_norm": 1.0992250696938368, "learning_rate": 4.60088296570082e-05, "loss": 0.3028, "step": 2005 }, { "epoch": 0.9126478616924477, "grad_norm": 0.6853803879496766, "learning_rate": 4.6004955126144986e-05, "loss": 0.1472, "step": 2006 }, { "epoch": 0.9131028207461328, "grad_norm": 0.8387362197218656, "learning_rate": 4.60010788788517e-05, "loss": 0.3089, "step": 2007 }, { "epoch": 0.913557779799818, "grad_norm": 0.7143366075642489, "learning_rate": 4.5997200915445095e-05, "loss": 0.2436, "step": 2008 }, { "epoch": 0.9140127388535032, "grad_norm": 1.574504218420329, "learning_rate": 4.599332123624204e-05, "loss": 0.2098, "step": 2009 }, { "epoch": 0.9144676979071884, "grad_norm": 0.8344728729376291, "learning_rate": 4.598943984155959e-05, "loss": 0.2308, "step": 2010 }, { "epoch": 0.9149226569608735, "grad_norm": 0.4973239269053567, "learning_rate": 4.598555673171489e-05, "loss": 0.1285, "step": 2011 }, { "epoch": 0.9153776160145587, "grad_norm": 0.5856902451139081, "learning_rate": 4.598167190702527e-05, "loss": 0.1542, "step": 2012 }, { "epoch": 0.9158325750682439, "grad_norm": 0.8222794898235876, "learning_rate": 4.597778536780818e-05, "loss": 0.2162, "step": 2013 }, { "epoch": 0.916287534121929, "grad_norm": 1.0794852506644925, "learning_rate": 4.5973897114381204e-05, "loss": 0.3172, "step": 2014 }, { "epoch": 0.9167424931756142, "grad_norm": 0.805295140662633, "learning_rate": 4.5970007147062065e-05, "loss": 0.1846, "step": 2015 }, { "epoch": 0.9171974522292994, "grad_norm": 0.8205803796172838, "learning_rate": 4.596611546616865e-05, "loss": 0.2729, "step": 2016 }, { "epoch": 0.9176524112829846, "grad_norm": 0.7767387714501359, "learning_rate": 4.5962222072018955e-05, "loss": 0.2416, "step": 2017 }, { "epoch": 0.9181073703366697, "grad_norm": 0.8117474177847019, "learning_rate": 4.595832696493114e-05, "loss": 0.2272, "step": 2018 }, { "epoch": 0.9185623293903549, "grad_norm": 0.8326795173347789, "learning_rate": 4.595443014522349e-05, "loss": 0.2075, "step": 2019 }, { "epoch": 0.9190172884440401, "grad_norm": 1.1269520872978156, "learning_rate": 4.595053161321444e-05, "loss": 0.2923, "step": 2020 }, { "epoch": 0.9194722474977252, "grad_norm": 0.7497872720061703, "learning_rate": 4.594663136922256e-05, "loss": 0.2451, "step": 2021 }, { "epoch": 0.9199272065514104, "grad_norm": 0.7573969550044222, "learning_rate": 4.5942729413566556e-05, "loss": 0.2156, "step": 2022 }, { "epoch": 0.9203821656050956, "grad_norm": 0.8706777626897606, "learning_rate": 4.593882574656528e-05, "loss": 0.2344, "step": 2023 }, { "epoch": 0.9208371246587808, "grad_norm": 0.873780047622031, "learning_rate": 4.593492036853773e-05, "loss": 0.2199, "step": 2024 }, { "epoch": 0.9212920837124658, "grad_norm": 0.8779650812014959, "learning_rate": 4.5931013279803016e-05, "loss": 0.2465, "step": 2025 }, { "epoch": 0.921747042766151, "grad_norm": 0.7711870355594431, "learning_rate": 4.592710448068043e-05, "loss": 0.2225, "step": 2026 }, { "epoch": 0.9222020018198362, "grad_norm": 0.8858046894398245, "learning_rate": 4.5923193971489364e-05, "loss": 0.193, "step": 2027 }, { "epoch": 0.9226569608735213, "grad_norm": 0.578914859110813, "learning_rate": 4.5919281752549384e-05, "loss": 0.1959, "step": 2028 }, { "epoch": 0.9231119199272065, "grad_norm": 1.6647496531948363, "learning_rate": 4.591536782418017e-05, "loss": 0.3052, "step": 2029 }, { "epoch": 0.9235668789808917, "grad_norm": 0.8301576710388708, "learning_rate": 4.591145218670154e-05, "loss": 0.1537, "step": 2030 }, { "epoch": 0.9240218380345769, "grad_norm": 0.99274942021486, "learning_rate": 4.590753484043348e-05, "loss": 0.2755, "step": 2031 }, { "epoch": 0.924476797088262, "grad_norm": 0.9134548711041515, "learning_rate": 4.5903615785696086e-05, "loss": 0.2489, "step": 2032 }, { "epoch": 0.9249317561419472, "grad_norm": 0.8073061739421968, "learning_rate": 4.589969502280962e-05, "loss": 0.1824, "step": 2033 }, { "epoch": 0.9253867151956324, "grad_norm": 0.5956694472057665, "learning_rate": 4.5895772552094454e-05, "loss": 0.2098, "step": 2034 }, { "epoch": 0.9258416742493175, "grad_norm": 0.8205172786554654, "learning_rate": 4.589184837387112e-05, "loss": 0.2109, "step": 2035 }, { "epoch": 0.9262966333030027, "grad_norm": 0.8350262493260755, "learning_rate": 4.588792248846028e-05, "loss": 0.2177, "step": 2036 }, { "epoch": 0.9267515923566879, "grad_norm": 1.0084438457991158, "learning_rate": 4.588399489618274e-05, "loss": 0.2725, "step": 2037 }, { "epoch": 0.9272065514103731, "grad_norm": 0.6459091086837169, "learning_rate": 4.588006559735945e-05, "loss": 0.1481, "step": 2038 }, { "epoch": 0.9276615104640582, "grad_norm": 0.7240659559949687, "learning_rate": 4.587613459231149e-05, "loss": 0.245, "step": 2039 }, { "epoch": 0.9281164695177434, "grad_norm": 0.703350986781065, "learning_rate": 4.5872201881360096e-05, "loss": 0.2344, "step": 2040 }, { "epoch": 0.9285714285714286, "grad_norm": 0.8529297773083054, "learning_rate": 4.586826746482662e-05, "loss": 0.2414, "step": 2041 }, { "epoch": 0.9290263876251137, "grad_norm": 0.680206508991848, "learning_rate": 4.586433134303257e-05, "loss": 0.1768, "step": 2042 }, { "epoch": 0.9294813466787989, "grad_norm": 1.1571549212202592, "learning_rate": 4.586039351629958e-05, "loss": 0.2352, "step": 2043 }, { "epoch": 0.9299363057324841, "grad_norm": 0.8920237741108188, "learning_rate": 4.5856453984949435e-05, "loss": 0.1809, "step": 2044 }, { "epoch": 0.9303912647861693, "grad_norm": 0.7621379382500061, "learning_rate": 4.585251274930407e-05, "loss": 0.2197, "step": 2045 }, { "epoch": 0.9308462238398544, "grad_norm": 2.0625951082360547, "learning_rate": 4.584856980968552e-05, "loss": 0.3262, "step": 2046 }, { "epoch": 0.9313011828935396, "grad_norm": 0.8793697876113563, "learning_rate": 4.5844625166415994e-05, "loss": 0.2702, "step": 2047 }, { "epoch": 0.9317561419472248, "grad_norm": 1.0298313332714228, "learning_rate": 4.584067881981784e-05, "loss": 0.2238, "step": 2048 }, { "epoch": 0.9322111010009099, "grad_norm": 0.79905967332074, "learning_rate": 4.583673077021352e-05, "loss": 0.1673, "step": 2049 }, { "epoch": 0.9326660600545951, "grad_norm": 0.7310179433485852, "learning_rate": 4.583278101792567e-05, "loss": 0.226, "step": 2050 }, { "epoch": 0.9331210191082803, "grad_norm": 0.7701635541870674, "learning_rate": 4.582882956327703e-05, "loss": 0.1704, "step": 2051 }, { "epoch": 0.9335759781619655, "grad_norm": 0.8389240385209522, "learning_rate": 4.58248764065905e-05, "loss": 0.194, "step": 2052 }, { "epoch": 0.9340309372156506, "grad_norm": 0.7696049369652356, "learning_rate": 4.5820921548189116e-05, "loss": 0.193, "step": 2053 }, { "epoch": 0.9344858962693358, "grad_norm": 0.7870296034176034, "learning_rate": 4.581696498839605e-05, "loss": 0.2631, "step": 2054 }, { "epoch": 0.934940855323021, "grad_norm": 0.787379376805249, "learning_rate": 4.5813006727534616e-05, "loss": 0.1921, "step": 2055 }, { "epoch": 0.935395814376706, "grad_norm": 1.2472223994300602, "learning_rate": 4.5809046765928256e-05, "loss": 0.2655, "step": 2056 }, { "epoch": 0.9358507734303912, "grad_norm": 0.7534704483988667, "learning_rate": 4.580508510390057e-05, "loss": 0.1739, "step": 2057 }, { "epoch": 0.9363057324840764, "grad_norm": 0.7176288529138601, "learning_rate": 4.580112174177529e-05, "loss": 0.2188, "step": 2058 }, { "epoch": 0.9367606915377616, "grad_norm": 0.5692172000708039, "learning_rate": 4.579715667987627e-05, "loss": 0.1476, "step": 2059 }, { "epoch": 0.9372156505914467, "grad_norm": 1.308788342300197, "learning_rate": 4.5793189918527526e-05, "loss": 0.3981, "step": 2060 }, { "epoch": 0.9376706096451319, "grad_norm": 0.8831572542548628, "learning_rate": 4.578922145805321e-05, "loss": 0.2036, "step": 2061 }, { "epoch": 0.9381255686988171, "grad_norm": 0.8420080476280356, "learning_rate": 4.5785251298777594e-05, "loss": 0.2572, "step": 2062 }, { "epoch": 0.9385805277525022, "grad_norm": 0.7955802203471891, "learning_rate": 4.57812794410251e-05, "loss": 0.2116, "step": 2063 }, { "epoch": 0.9390354868061874, "grad_norm": 1.1393473471025757, "learning_rate": 4.577730588512031e-05, "loss": 0.3116, "step": 2064 }, { "epoch": 0.9394904458598726, "grad_norm": 0.8913141725539951, "learning_rate": 4.577333063138791e-05, "loss": 0.2411, "step": 2065 }, { "epoch": 0.9399454049135578, "grad_norm": 0.6392373108531396, "learning_rate": 4.576935368015274e-05, "loss": 0.1998, "step": 2066 }, { "epoch": 0.9404003639672429, "grad_norm": 0.9083503165548502, "learning_rate": 4.576537503173978e-05, "loss": 0.2647, "step": 2067 }, { "epoch": 0.9408553230209281, "grad_norm": 1.0612290558985202, "learning_rate": 4.5761394686474155e-05, "loss": 0.2681, "step": 2068 }, { "epoch": 0.9413102820746133, "grad_norm": 0.9960955188075843, "learning_rate": 4.575741264468111e-05, "loss": 0.1903, "step": 2069 }, { "epoch": 0.9417652411282984, "grad_norm": 0.8863201521786314, "learning_rate": 4.575342890668603e-05, "loss": 0.2675, "step": 2070 }, { "epoch": 0.9422202001819836, "grad_norm": 1.9314403244998013, "learning_rate": 4.574944347281448e-05, "loss": 0.3414, "step": 2071 }, { "epoch": 0.9426751592356688, "grad_norm": 0.9605080693209704, "learning_rate": 4.5745456343392114e-05, "loss": 0.2622, "step": 2072 }, { "epoch": 0.943130118289354, "grad_norm": 0.9636296023394202, "learning_rate": 4.574146751874473e-05, "loss": 0.2915, "step": 2073 }, { "epoch": 0.9435850773430391, "grad_norm": 0.7916023430773905, "learning_rate": 4.573747699919829e-05, "loss": 0.2083, "step": 2074 }, { "epoch": 0.9440400363967243, "grad_norm": 0.8225670249707251, "learning_rate": 4.573348478507888e-05, "loss": 0.193, "step": 2075 }, { "epoch": 0.9444949954504095, "grad_norm": 0.9100358059583294, "learning_rate": 4.572949087671273e-05, "loss": 0.3529, "step": 2076 }, { "epoch": 0.9449499545040946, "grad_norm": 0.719094307090598, "learning_rate": 4.572549527442619e-05, "loss": 0.1579, "step": 2077 }, { "epoch": 0.9454049135577798, "grad_norm": 0.9337672701727904, "learning_rate": 4.572149797854578e-05, "loss": 0.2125, "step": 2078 }, { "epoch": 0.945859872611465, "grad_norm": 0.6117704526967206, "learning_rate": 4.571749898939813e-05, "loss": 0.1934, "step": 2079 }, { "epoch": 0.9463148316651502, "grad_norm": 0.5636100463274094, "learning_rate": 4.571349830731002e-05, "loss": 0.1563, "step": 2080 }, { "epoch": 0.9467697907188353, "grad_norm": 0.6200130677712031, "learning_rate": 4.570949593260837e-05, "loss": 0.1606, "step": 2081 }, { "epoch": 0.9472247497725205, "grad_norm": 0.7264912184845804, "learning_rate": 4.570549186562024e-05, "loss": 0.212, "step": 2082 }, { "epoch": 0.9476797088262057, "grad_norm": 0.9708291750103298, "learning_rate": 4.570148610667281e-05, "loss": 0.3015, "step": 2083 }, { "epoch": 0.9481346678798908, "grad_norm": 1.76376644040552, "learning_rate": 4.5697478656093426e-05, "loss": 0.3395, "step": 2084 }, { "epoch": 0.948589626933576, "grad_norm": 0.9936935353919659, "learning_rate": 4.5693469514209566e-05, "loss": 0.3156, "step": 2085 }, { "epoch": 0.9490445859872612, "grad_norm": 0.9737116514874316, "learning_rate": 4.568945868134882e-05, "loss": 0.26, "step": 2086 }, { "epoch": 0.9494995450409464, "grad_norm": 0.8060487906862576, "learning_rate": 4.568544615783894e-05, "loss": 0.2449, "step": 2087 }, { "epoch": 0.9499545040946314, "grad_norm": 1.1420158279731871, "learning_rate": 4.568143194400782e-05, "loss": 0.2094, "step": 2088 }, { "epoch": 0.9504094631483166, "grad_norm": 1.0956262740710827, "learning_rate": 4.5677416040183475e-05, "loss": 0.237, "step": 2089 }, { "epoch": 0.9508644222020018, "grad_norm": 0.8065710387647892, "learning_rate": 4.567339844669407e-05, "loss": 0.2835, "step": 2090 }, { "epoch": 0.9513193812556869, "grad_norm": 0.6285962750122424, "learning_rate": 4.566937916386791e-05, "loss": 0.1629, "step": 2091 }, { "epoch": 0.9517743403093721, "grad_norm": 0.7760557915378703, "learning_rate": 4.566535819203342e-05, "loss": 0.202, "step": 2092 }, { "epoch": 0.9522292993630573, "grad_norm": 0.6268998324082413, "learning_rate": 4.566133553151918e-05, "loss": 0.2265, "step": 2093 }, { "epoch": 0.9526842584167425, "grad_norm": 0.6953674606051694, "learning_rate": 4.565731118265392e-05, "loss": 0.2084, "step": 2094 }, { "epoch": 0.9531392174704276, "grad_norm": 1.2014026603699721, "learning_rate": 4.565328514576647e-05, "loss": 0.2989, "step": 2095 }, { "epoch": 0.9535941765241128, "grad_norm": 0.7349425117279724, "learning_rate": 4.564925742118582e-05, "loss": 0.2041, "step": 2096 }, { "epoch": 0.954049135577798, "grad_norm": 0.9844185499607369, "learning_rate": 4.564522800924111e-05, "loss": 0.2734, "step": 2097 }, { "epoch": 0.9545040946314831, "grad_norm": 0.6755713538316092, "learning_rate": 4.5641196910261594e-05, "loss": 0.1835, "step": 2098 }, { "epoch": 0.9549590536851683, "grad_norm": 0.722274317195714, "learning_rate": 4.563716412457669e-05, "loss": 0.2853, "step": 2099 }, { "epoch": 0.9554140127388535, "grad_norm": 0.9492243586696505, "learning_rate": 4.563312965251593e-05, "loss": 0.2541, "step": 2100 }, { "epoch": 0.9558689717925387, "grad_norm": 0.9173495031430036, "learning_rate": 4.562909349440899e-05, "loss": 0.2584, "step": 2101 }, { "epoch": 0.9563239308462238, "grad_norm": 0.7484293233694994, "learning_rate": 4.562505565058569e-05, "loss": 0.2061, "step": 2102 }, { "epoch": 0.956778889899909, "grad_norm": 0.8840134664625301, "learning_rate": 4.562101612137599e-05, "loss": 0.22, "step": 2103 }, { "epoch": 0.9572338489535942, "grad_norm": 0.6908195634290815, "learning_rate": 4.5616974907109975e-05, "loss": 0.1538, "step": 2104 }, { "epoch": 0.9576888080072793, "grad_norm": 0.8459962826783943, "learning_rate": 4.561293200811787e-05, "loss": 0.2177, "step": 2105 }, { "epoch": 0.9581437670609645, "grad_norm": 0.8932245675617106, "learning_rate": 4.560888742473005e-05, "loss": 0.2328, "step": 2106 }, { "epoch": 0.9585987261146497, "grad_norm": 0.8089065225545112, "learning_rate": 4.560484115727702e-05, "loss": 0.2768, "step": 2107 }, { "epoch": 0.9590536851683349, "grad_norm": 0.6338566561603042, "learning_rate": 4.560079320608942e-05, "loss": 0.2094, "step": 2108 }, { "epoch": 0.95950864422202, "grad_norm": 0.6122449420675595, "learning_rate": 4.5596743571498037e-05, "loss": 0.2063, "step": 2109 }, { "epoch": 0.9599636032757052, "grad_norm": 1.3344383766409467, "learning_rate": 4.559269225383377e-05, "loss": 0.2481, "step": 2110 }, { "epoch": 0.9604185623293904, "grad_norm": 2.0594719997991313, "learning_rate": 4.55886392534277e-05, "loss": 0.262, "step": 2111 }, { "epoch": 0.9608735213830755, "grad_norm": 0.6397684476761636, "learning_rate": 4.5584584570611e-05, "loss": 0.1785, "step": 2112 }, { "epoch": 0.9613284804367607, "grad_norm": 0.8885334870648754, "learning_rate": 4.558052820571502e-05, "loss": 0.2216, "step": 2113 }, { "epoch": 0.9617834394904459, "grad_norm": 1.000774969320222, "learning_rate": 4.5576470159071206e-05, "loss": 0.2932, "step": 2114 }, { "epoch": 0.9622383985441311, "grad_norm": 0.6307603822095373, "learning_rate": 4.5572410431011175e-05, "loss": 0.1649, "step": 2115 }, { "epoch": 0.9626933575978162, "grad_norm": 0.8467562040890299, "learning_rate": 4.5568349021866667e-05, "loss": 0.3443, "step": 2116 }, { "epoch": 0.9631483166515014, "grad_norm": 0.9045629432456505, "learning_rate": 4.556428593196956e-05, "loss": 0.2491, "step": 2117 }, { "epoch": 0.9636032757051866, "grad_norm": 0.8086766057402117, "learning_rate": 4.556022116165188e-05, "loss": 0.2678, "step": 2118 }, { "epoch": 0.9640582347588716, "grad_norm": 0.8084111749882547, "learning_rate": 4.555615471124578e-05, "loss": 0.2027, "step": 2119 }, { "epoch": 0.9645131938125568, "grad_norm": 0.6780830128265918, "learning_rate": 4.5552086581083544e-05, "loss": 0.1468, "step": 2120 }, { "epoch": 0.964968152866242, "grad_norm": 0.9703450600409347, "learning_rate": 4.55480167714976e-05, "loss": 0.2538, "step": 2121 }, { "epoch": 0.9654231119199272, "grad_norm": 0.6060613758240011, "learning_rate": 4.554394528282052e-05, "loss": 0.2019, "step": 2122 }, { "epoch": 0.9658780709736123, "grad_norm": 0.7065724899694515, "learning_rate": 4.553987211538501e-05, "loss": 0.2125, "step": 2123 }, { "epoch": 0.9663330300272975, "grad_norm": 1.10179662792588, "learning_rate": 4.55357972695239e-05, "loss": 0.2713, "step": 2124 }, { "epoch": 0.9667879890809827, "grad_norm": 0.7808110048188406, "learning_rate": 4.5531720745570195e-05, "loss": 0.2152, "step": 2125 }, { "epoch": 0.9672429481346679, "grad_norm": 0.6215213068165848, "learning_rate": 4.5527642543856975e-05, "loss": 0.1502, "step": 2126 }, { "epoch": 0.967697907188353, "grad_norm": 0.7489090444468421, "learning_rate": 4.552356266471751e-05, "loss": 0.2592, "step": 2127 }, { "epoch": 0.9681528662420382, "grad_norm": 0.9211076815363415, "learning_rate": 4.551948110848519e-05, "loss": 0.2174, "step": 2128 }, { "epoch": 0.9686078252957234, "grad_norm": 0.9133079938862912, "learning_rate": 4.5515397875493545e-05, "loss": 0.2612, "step": 2129 }, { "epoch": 0.9690627843494085, "grad_norm": 0.6564955946868491, "learning_rate": 4.5511312966076223e-05, "loss": 0.1681, "step": 2130 }, { "epoch": 0.9695177434030937, "grad_norm": 0.9014612709583826, "learning_rate": 4.550722638056704e-05, "loss": 0.1977, "step": 2131 }, { "epoch": 0.9699727024567789, "grad_norm": 1.0774528815003883, "learning_rate": 4.550313811929993e-05, "loss": 0.2108, "step": 2132 }, { "epoch": 0.9704276615104641, "grad_norm": 1.3234514807288098, "learning_rate": 4.549904818260895e-05, "loss": 0.2538, "step": 2133 }, { "epoch": 0.9708826205641492, "grad_norm": 0.938549980719347, "learning_rate": 4.5494956570828334e-05, "loss": 0.2471, "step": 2134 }, { "epoch": 0.9713375796178344, "grad_norm": 0.8773406219958427, "learning_rate": 4.549086328429242e-05, "loss": 0.1591, "step": 2135 }, { "epoch": 0.9717925386715196, "grad_norm": 0.6912984007060564, "learning_rate": 4.548676832333569e-05, "loss": 0.1809, "step": 2136 }, { "epoch": 0.9722474977252047, "grad_norm": 2.0886646739593435, "learning_rate": 4.5482671688292784e-05, "loss": 0.218, "step": 2137 }, { "epoch": 0.9727024567788899, "grad_norm": 0.8234180223039547, "learning_rate": 4.5478573379498436e-05, "loss": 0.1888, "step": 2138 }, { "epoch": 0.9731574158325751, "grad_norm": 0.6891294079693814, "learning_rate": 4.5474473397287556e-05, "loss": 0.1676, "step": 2139 }, { "epoch": 0.9736123748862603, "grad_norm": 0.6620716660957371, "learning_rate": 4.5470371741995175e-05, "loss": 0.1839, "step": 2140 }, { "epoch": 0.9740673339399454, "grad_norm": 0.7391668001823253, "learning_rate": 4.546626841395645e-05, "loss": 0.203, "step": 2141 }, { "epoch": 0.9745222929936306, "grad_norm": 0.7693074960790695, "learning_rate": 4.54621634135067e-05, "loss": 0.1888, "step": 2142 }, { "epoch": 0.9749772520473158, "grad_norm": 0.689124806462059, "learning_rate": 4.545805674098136e-05, "loss": 0.2476, "step": 2143 }, { "epoch": 0.9754322111010009, "grad_norm": 0.7098847974755175, "learning_rate": 4.5453948396716006e-05, "loss": 0.202, "step": 2144 }, { "epoch": 0.9758871701546861, "grad_norm": 0.9873769422382779, "learning_rate": 4.544983838104637e-05, "loss": 0.2356, "step": 2145 }, { "epoch": 0.9763421292083713, "grad_norm": 3.0017760997174188, "learning_rate": 4.544572669430828e-05, "loss": 0.3218, "step": 2146 }, { "epoch": 0.9767970882620565, "grad_norm": 0.7514096889317271, "learning_rate": 4.5441613336837747e-05, "loss": 0.1925, "step": 2147 }, { "epoch": 0.9772520473157416, "grad_norm": 0.8225469900279387, "learning_rate": 4.543749830897088e-05, "loss": 0.2647, "step": 2148 }, { "epoch": 0.9777070063694268, "grad_norm": 0.7925531359205661, "learning_rate": 4.543338161104394e-05, "loss": 0.2202, "step": 2149 }, { "epoch": 0.978161965423112, "grad_norm": 0.7285491024147576, "learning_rate": 4.5429263243393346e-05, "loss": 0.2122, "step": 2150 }, { "epoch": 0.978616924476797, "grad_norm": 0.7673007646597891, "learning_rate": 4.5425143206355614e-05, "loss": 0.2177, "step": 2151 }, { "epoch": 0.9790718835304822, "grad_norm": 0.8173104755847399, "learning_rate": 4.542102150026741e-05, "loss": 0.2655, "step": 2152 }, { "epoch": 0.9795268425841674, "grad_norm": 0.5741159248184082, "learning_rate": 4.541689812546556e-05, "loss": 0.2165, "step": 2153 }, { "epoch": 0.9799818016378526, "grad_norm": 1.0531934660491316, "learning_rate": 4.541277308228699e-05, "loss": 0.3193, "step": 2154 }, { "epoch": 0.9804367606915377, "grad_norm": 0.8662371667836346, "learning_rate": 4.5408646371068787e-05, "loss": 0.2269, "step": 2155 }, { "epoch": 0.9808917197452229, "grad_norm": 1.4021303422300646, "learning_rate": 4.5404517992148165e-05, "loss": 0.2241, "step": 2156 }, { "epoch": 0.9813466787989081, "grad_norm": 0.8517709708419259, "learning_rate": 4.5400387945862486e-05, "loss": 0.2019, "step": 2157 }, { "epoch": 0.9818016378525932, "grad_norm": 0.9448076733375593, "learning_rate": 4.539625623254923e-05, "loss": 0.175, "step": 2158 }, { "epoch": 0.9822565969062784, "grad_norm": 0.7351752833613573, "learning_rate": 4.5392122852546016e-05, "loss": 0.2739, "step": 2159 }, { "epoch": 0.9827115559599636, "grad_norm": 0.8211790936798152, "learning_rate": 4.5387987806190615e-05, "loss": 0.2588, "step": 2160 }, { "epoch": 0.9831665150136488, "grad_norm": 0.7462502356950003, "learning_rate": 4.538385109382093e-05, "loss": 0.2604, "step": 2161 }, { "epoch": 0.9836214740673339, "grad_norm": 0.5988861585591733, "learning_rate": 4.5379712715774975e-05, "loss": 0.2292, "step": 2162 }, { "epoch": 0.9840764331210191, "grad_norm": 0.5072834548466262, "learning_rate": 4.537557267239093e-05, "loss": 0.14, "step": 2163 }, { "epoch": 0.9845313921747043, "grad_norm": 0.5426544417122134, "learning_rate": 4.537143096400711e-05, "loss": 0.1424, "step": 2164 }, { "epoch": 0.9849863512283894, "grad_norm": 0.9199943342731794, "learning_rate": 4.536728759096195e-05, "loss": 0.2477, "step": 2165 }, { "epoch": 0.9854413102820746, "grad_norm": 0.7519689564691593, "learning_rate": 4.5363142553594017e-05, "loss": 0.1811, "step": 2166 }, { "epoch": 0.9858962693357598, "grad_norm": 0.756882822730654, "learning_rate": 4.535899585224204e-05, "loss": 0.2134, "step": 2167 }, { "epoch": 0.986351228389445, "grad_norm": 1.1040679941165308, "learning_rate": 4.5354847487244864e-05, "loss": 0.254, "step": 2168 }, { "epoch": 0.9868061874431301, "grad_norm": 0.6966161436149814, "learning_rate": 4.535069745894147e-05, "loss": 0.2138, "step": 2169 }, { "epoch": 0.9872611464968153, "grad_norm": 0.8626603482689079, "learning_rate": 4.534654576767098e-05, "loss": 0.2029, "step": 2170 }, { "epoch": 0.9877161055505005, "grad_norm": 0.707361964238885, "learning_rate": 4.534239241377266e-05, "loss": 0.2451, "step": 2171 }, { "epoch": 0.9881710646041856, "grad_norm": 0.7675747021603291, "learning_rate": 4.5338237397585895e-05, "loss": 0.3212, "step": 2172 }, { "epoch": 0.9886260236578708, "grad_norm": 1.1685350748250836, "learning_rate": 4.533408071945021e-05, "loss": 0.2783, "step": 2173 }, { "epoch": 0.989080982711556, "grad_norm": 0.7415234996246145, "learning_rate": 4.532992237970528e-05, "loss": 0.208, "step": 2174 }, { "epoch": 0.9895359417652412, "grad_norm": 0.9085277964628254, "learning_rate": 4.532576237869091e-05, "loss": 0.2236, "step": 2175 }, { "epoch": 0.9899909008189263, "grad_norm": 0.9902152925129987, "learning_rate": 4.532160071674702e-05, "loss": 0.2057, "step": 2176 }, { "epoch": 0.9904458598726115, "grad_norm": 0.9091211343914799, "learning_rate": 4.531743739421369e-05, "loss": 0.246, "step": 2177 }, { "epoch": 0.9909008189262967, "grad_norm": 0.7164418397958597, "learning_rate": 4.531327241143113e-05, "loss": 0.1819, "step": 2178 }, { "epoch": 0.9913557779799818, "grad_norm": 0.8741800898216185, "learning_rate": 4.530910576873969e-05, "loss": 0.2357, "step": 2179 }, { "epoch": 0.991810737033667, "grad_norm": 0.8623205635802316, "learning_rate": 4.5304937466479836e-05, "loss": 0.2251, "step": 2180 }, { "epoch": 0.9922656960873522, "grad_norm": 1.116239270470829, "learning_rate": 4.5300767504992195e-05, "loss": 0.2816, "step": 2181 }, { "epoch": 0.9927206551410374, "grad_norm": 0.7406563795327836, "learning_rate": 4.5296595884617496e-05, "loss": 0.245, "step": 2182 }, { "epoch": 0.9931756141947224, "grad_norm": 0.6780193736641491, "learning_rate": 4.529242260569665e-05, "loss": 0.1945, "step": 2183 }, { "epoch": 0.9936305732484076, "grad_norm": 1.1152390090675552, "learning_rate": 4.5288247668570674e-05, "loss": 0.2902, "step": 2184 }, { "epoch": 0.9940855323020928, "grad_norm": 1.2312871944988681, "learning_rate": 4.528407107358071e-05, "loss": 0.2589, "step": 2185 }, { "epoch": 0.9945404913557779, "grad_norm": 0.8981095818889836, "learning_rate": 4.5279892821068065e-05, "loss": 0.2094, "step": 2186 }, { "epoch": 0.9949954504094631, "grad_norm": 1.0507252097197897, "learning_rate": 4.5275712911374156e-05, "loss": 0.2819, "step": 2187 }, { "epoch": 0.9954504094631483, "grad_norm": 0.9008865860791743, "learning_rate": 4.527153134484056e-05, "loss": 0.2663, "step": 2188 }, { "epoch": 0.9959053685168335, "grad_norm": 0.6897796303644019, "learning_rate": 4.526734812180896e-05, "loss": 0.1479, "step": 2189 }, { "epoch": 0.9963603275705186, "grad_norm": 0.528634806131606, "learning_rate": 4.5263163242621206e-05, "loss": 0.1827, "step": 2190 }, { "epoch": 0.9968152866242038, "grad_norm": 0.9974495258826525, "learning_rate": 4.5258976707619255e-05, "loss": 0.2469, "step": 2191 }, { "epoch": 0.997270245677889, "grad_norm": 0.9260324103307455, "learning_rate": 4.525478851714522e-05, "loss": 0.254, "step": 2192 }, { "epoch": 0.9977252047315741, "grad_norm": 1.0421476606466749, "learning_rate": 4.525059867154133e-05, "loss": 0.2417, "step": 2193 }, { "epoch": 0.9981801637852593, "grad_norm": 0.6818911181515721, "learning_rate": 4.524640717114997e-05, "loss": 0.1918, "step": 2194 }, { "epoch": 0.9986351228389445, "grad_norm": 0.864603872771227, "learning_rate": 4.5242214016313655e-05, "loss": 0.2799, "step": 2195 }, { "epoch": 0.9990900818926297, "grad_norm": 0.7703420771205931, "learning_rate": 4.523801920737501e-05, "loss": 0.2415, "step": 2196 }, { "epoch": 0.9995450409463148, "grad_norm": 0.6652375272459842, "learning_rate": 4.523382274467684e-05, "loss": 0.2376, "step": 2197 }, { "epoch": 1.0, "grad_norm": 0.6896678824090074, "learning_rate": 4.522962462856205e-05, "loss": 0.1379, "step": 2198 }, { "epoch": 1.000454959053685, "grad_norm": 0.5675474280059891, "learning_rate": 4.522542485937369e-05, "loss": 0.0822, "step": 2199 }, { "epoch": 1.0009099181073704, "grad_norm": 0.6461292383166993, "learning_rate": 4.5221223437454946e-05, "loss": 0.1193, "step": 2200 }, { "epoch": 1.0013648771610555, "grad_norm": 0.5124435178990642, "learning_rate": 4.5217020363149146e-05, "loss": 0.0954, "step": 2201 }, { "epoch": 1.0018198362147406, "grad_norm": 0.900023351667322, "learning_rate": 4.521281563679973e-05, "loss": 0.0912, "step": 2202 }, { "epoch": 1.0022747952684259, "grad_norm": 0.6541930791901857, "learning_rate": 4.5208609258750314e-05, "loss": 0.0981, "step": 2203 }, { "epoch": 1.002729754322111, "grad_norm": 0.48688894618805556, "learning_rate": 4.52044012293446e-05, "loss": 0.095, "step": 2204 }, { "epoch": 1.0031847133757963, "grad_norm": 0.7831615762741222, "learning_rate": 4.520019154892646e-05, "loss": 0.1065, "step": 2205 }, { "epoch": 1.0036396724294814, "grad_norm": 0.6992389254326765, "learning_rate": 4.51959802178399e-05, "loss": 0.1323, "step": 2206 }, { "epoch": 1.0040946314831665, "grad_norm": 0.5982945172598987, "learning_rate": 4.519176723642903e-05, "loss": 0.1328, "step": 2207 }, { "epoch": 1.0045495905368518, "grad_norm": 0.6953062094273642, "learning_rate": 4.518755260503813e-05, "loss": 0.1062, "step": 2208 }, { "epoch": 1.0050045495905369, "grad_norm": 0.56035726038632, "learning_rate": 4.518333632401161e-05, "loss": 0.1006, "step": 2209 }, { "epoch": 1.005459508644222, "grad_norm": 0.5561364217440575, "learning_rate": 4.517911839369397e-05, "loss": 0.1069, "step": 2210 }, { "epoch": 1.0059144676979073, "grad_norm": 0.563710690749881, "learning_rate": 4.5174898814429925e-05, "loss": 0.0801, "step": 2211 }, { "epoch": 1.0063694267515924, "grad_norm": 0.6840473965536749, "learning_rate": 4.517067758656424e-05, "loss": 0.1076, "step": 2212 }, { "epoch": 1.0068243858052774, "grad_norm": 0.6314250873778653, "learning_rate": 4.516645471044188e-05, "loss": 0.1047, "step": 2213 }, { "epoch": 1.0072793448589628, "grad_norm": 0.4907695041998029, "learning_rate": 4.516223018640791e-05, "loss": 0.0925, "step": 2214 }, { "epoch": 1.0077343039126478, "grad_norm": 0.6404273266109531, "learning_rate": 4.515800401480754e-05, "loss": 0.1285, "step": 2215 }, { "epoch": 1.008189262966333, "grad_norm": 0.6624739021415951, "learning_rate": 4.515377619598612e-05, "loss": 0.123, "step": 2216 }, { "epoch": 1.0086442220200182, "grad_norm": 0.6671298985365878, "learning_rate": 4.5149546730289125e-05, "loss": 0.1047, "step": 2217 }, { "epoch": 1.0090991810737033, "grad_norm": 0.8279547265406216, "learning_rate": 4.514531561806216e-05, "loss": 0.1162, "step": 2218 }, { "epoch": 1.0095541401273886, "grad_norm": 1.4078122777953415, "learning_rate": 4.514108285965098e-05, "loss": 0.2397, "step": 2219 }, { "epoch": 1.0100090991810737, "grad_norm": 0.7193512151984053, "learning_rate": 4.513684845540146e-05, "loss": 0.0919, "step": 2220 }, { "epoch": 1.0104640582347588, "grad_norm": 0.8538650965955809, "learning_rate": 4.5132612405659625e-05, "loss": 0.1468, "step": 2221 }, { "epoch": 1.0109190172884441, "grad_norm": 0.6100471527966651, "learning_rate": 4.5128374710771626e-05, "loss": 0.1218, "step": 2222 }, { "epoch": 1.0113739763421292, "grad_norm": 0.6024731963187352, "learning_rate": 4.5124135371083745e-05, "loss": 0.1216, "step": 2223 }, { "epoch": 1.0118289353958143, "grad_norm": 0.775945995562542, "learning_rate": 4.5119894386942396e-05, "loss": 0.1039, "step": 2224 }, { "epoch": 1.0122838944494996, "grad_norm": 0.5677161425253688, "learning_rate": 4.5115651758694145e-05, "loss": 0.1009, "step": 2225 }, { "epoch": 1.0127388535031847, "grad_norm": 0.7217555586669266, "learning_rate": 4.5111407486685666e-05, "loss": 0.1179, "step": 2226 }, { "epoch": 1.0131938125568698, "grad_norm": 1.1000700204824057, "learning_rate": 4.510716157126379e-05, "loss": 0.0781, "step": 2227 }, { "epoch": 1.013648771610555, "grad_norm": 0.8191578929864042, "learning_rate": 4.5102914012775476e-05, "loss": 0.1139, "step": 2228 }, { "epoch": 1.0141037306642402, "grad_norm": 1.3293623272031176, "learning_rate": 4.509866481156782e-05, "loss": 0.1318, "step": 2229 }, { "epoch": 1.0145586897179253, "grad_norm": 1.1256855901026706, "learning_rate": 4.509441396798803e-05, "loss": 0.1244, "step": 2230 }, { "epoch": 1.0150136487716106, "grad_norm": 1.3284669948241072, "learning_rate": 4.509016148238347e-05, "loss": 0.294, "step": 2231 }, { "epoch": 1.0154686078252957, "grad_norm": 0.6444673087091571, "learning_rate": 4.508590735510165e-05, "loss": 0.0856, "step": 2232 }, { "epoch": 1.015923566878981, "grad_norm": 0.7555559849481752, "learning_rate": 4.508165158649019e-05, "loss": 0.1141, "step": 2233 }, { "epoch": 1.016378525932666, "grad_norm": 0.7812992963775933, "learning_rate": 4.507739417689685e-05, "loss": 0.1054, "step": 2234 }, { "epoch": 1.0168334849863512, "grad_norm": 0.6145130986419153, "learning_rate": 4.5073135126669525e-05, "loss": 0.0992, "step": 2235 }, { "epoch": 1.0172884440400365, "grad_norm": 0.5581309521537281, "learning_rate": 4.5068874436156245e-05, "loss": 0.0986, "step": 2236 }, { "epoch": 1.0177434030937216, "grad_norm": 0.7906792873168468, "learning_rate": 4.506461210570518e-05, "loss": 0.1291, "step": 2237 }, { "epoch": 1.0181983621474067, "grad_norm": 0.5151018167994562, "learning_rate": 4.506034813566462e-05, "loss": 0.1627, "step": 2238 }, { "epoch": 1.018653321201092, "grad_norm": 0.6261133544739453, "learning_rate": 4.5056082526383e-05, "loss": 0.1126, "step": 2239 }, { "epoch": 1.019108280254777, "grad_norm": 0.8363390447841931, "learning_rate": 4.5051815278208895e-05, "loss": 0.1535, "step": 2240 }, { "epoch": 1.0195632393084622, "grad_norm": 0.4919100179284053, "learning_rate": 4.5047546391491e-05, "loss": 0.089, "step": 2241 }, { "epoch": 1.0200181983621475, "grad_norm": 0.6012704733393903, "learning_rate": 4.5043275866578146e-05, "loss": 0.0843, "step": 2242 }, { "epoch": 1.0204731574158326, "grad_norm": 0.8018361219121878, "learning_rate": 4.50390037038193e-05, "loss": 0.1315, "step": 2243 }, { "epoch": 1.0209281164695176, "grad_norm": 0.6009228859602307, "learning_rate": 4.5034729903563564e-05, "loss": 0.1285, "step": 2244 }, { "epoch": 1.021383075523203, "grad_norm": 0.5194928444011025, "learning_rate": 4.503045446616018e-05, "loss": 0.087, "step": 2245 }, { "epoch": 1.021838034576888, "grad_norm": 0.6544976787561159, "learning_rate": 4.5026177391958516e-05, "loss": 0.1082, "step": 2246 }, { "epoch": 1.0222929936305734, "grad_norm": 0.5710041132177246, "learning_rate": 4.502189868130807e-05, "loss": 0.116, "step": 2247 }, { "epoch": 1.0227479526842584, "grad_norm": 0.7374599713802978, "learning_rate": 4.501761833455849e-05, "loss": 0.1565, "step": 2248 }, { "epoch": 1.0232029117379435, "grad_norm": 0.6125594155856972, "learning_rate": 4.5013336352059524e-05, "loss": 0.0822, "step": 2249 }, { "epoch": 1.0236578707916288, "grad_norm": 0.7007092027440186, "learning_rate": 4.5009052734161095e-05, "loss": 0.1045, "step": 2250 }, { "epoch": 1.024112829845314, "grad_norm": 0.5238881717323509, "learning_rate": 4.500476748121324e-05, "loss": 0.149, "step": 2251 }, { "epoch": 1.024567788898999, "grad_norm": 0.6155367020633183, "learning_rate": 4.500048059356613e-05, "loss": 0.1345, "step": 2252 }, { "epoch": 1.0250227479526843, "grad_norm": 0.47482629595256715, "learning_rate": 4.499619207157006e-05, "loss": 0.0867, "step": 2253 }, { "epoch": 1.0254777070063694, "grad_norm": 0.5951053319169279, "learning_rate": 4.4991901915575485e-05, "loss": 0.1182, "step": 2254 }, { "epoch": 1.0259326660600545, "grad_norm": 0.7860366739311223, "learning_rate": 4.498761012593296e-05, "loss": 0.1541, "step": 2255 }, { "epoch": 1.0263876251137398, "grad_norm": 0.6213929980017107, "learning_rate": 4.49833167029932e-05, "loss": 0.1644, "step": 2256 }, { "epoch": 1.026842584167425, "grad_norm": 0.7182378394439781, "learning_rate": 4.4979021647107044e-05, "loss": 0.0739, "step": 2257 }, { "epoch": 1.02729754322111, "grad_norm": 0.7908925931728727, "learning_rate": 4.4974724958625466e-05, "loss": 0.1319, "step": 2258 }, { "epoch": 1.0277525022747953, "grad_norm": 0.6794768252845865, "learning_rate": 4.4970426637899574e-05, "loss": 0.114, "step": 2259 }, { "epoch": 1.0282074613284804, "grad_norm": 0.7786743444068422, "learning_rate": 4.496612668528059e-05, "loss": 0.1501, "step": 2260 }, { "epoch": 1.0286624203821657, "grad_norm": 0.5722702951519604, "learning_rate": 4.496182510111991e-05, "loss": 0.1767, "step": 2261 }, { "epoch": 1.0291173794358508, "grad_norm": 0.5939881674509739, "learning_rate": 4.495752188576902e-05, "loss": 0.1387, "step": 2262 }, { "epoch": 1.0295723384895359, "grad_norm": 0.8231681925894041, "learning_rate": 4.4953217039579574e-05, "loss": 0.1472, "step": 2263 }, { "epoch": 1.0300272975432212, "grad_norm": 0.6847915552698882, "learning_rate": 4.494891056290335e-05, "loss": 0.1566, "step": 2264 }, { "epoch": 1.0304822565969063, "grad_norm": 0.7906436876499066, "learning_rate": 4.4944602456092224e-05, "loss": 0.1086, "step": 2265 }, { "epoch": 1.0309372156505914, "grad_norm": 0.6768407971207729, "learning_rate": 4.4940292719498265e-05, "loss": 0.1291, "step": 2266 }, { "epoch": 1.0313921747042767, "grad_norm": 0.39237604460908926, "learning_rate": 4.493598135347363e-05, "loss": 0.1107, "step": 2267 }, { "epoch": 1.0318471337579618, "grad_norm": 0.7467287877032941, "learning_rate": 4.4931668358370636e-05, "loss": 0.0957, "step": 2268 }, { "epoch": 1.0323020928116469, "grad_norm": 0.7221593731008337, "learning_rate": 4.492735373454171e-05, "loss": 0.1245, "step": 2269 }, { "epoch": 1.0327570518653322, "grad_norm": 0.7524738998932702, "learning_rate": 4.492303748233943e-05, "loss": 0.1493, "step": 2270 }, { "epoch": 1.0332120109190173, "grad_norm": 0.6494550792504774, "learning_rate": 4.4918719602116494e-05, "loss": 0.1572, "step": 2271 }, { "epoch": 1.0336669699727024, "grad_norm": 0.6240212702800515, "learning_rate": 4.491440009422575e-05, "loss": 0.0978, "step": 2272 }, { "epoch": 1.0341219290263877, "grad_norm": 0.7796832874930902, "learning_rate": 4.491007895902015e-05, "loss": 0.1125, "step": 2273 }, { "epoch": 1.0345768880800728, "grad_norm": 0.7522397722059601, "learning_rate": 4.490575619685283e-05, "loss": 0.1342, "step": 2274 }, { "epoch": 1.035031847133758, "grad_norm": 0.7764385784665752, "learning_rate": 4.4901431808077e-05, "loss": 0.0997, "step": 2275 }, { "epoch": 1.0354868061874432, "grad_norm": 1.0575570478701943, "learning_rate": 4.489710579304603e-05, "loss": 0.2011, "step": 2276 }, { "epoch": 1.0359417652411282, "grad_norm": 0.637684398137446, "learning_rate": 4.4892778152113434e-05, "loss": 0.0771, "step": 2277 }, { "epoch": 1.0363967242948136, "grad_norm": 0.6589345008945857, "learning_rate": 4.488844888563284e-05, "loss": 0.1802, "step": 2278 }, { "epoch": 1.0368516833484986, "grad_norm": 0.9214337489735039, "learning_rate": 4.488411799395802e-05, "loss": 0.0831, "step": 2279 }, { "epoch": 1.0373066424021837, "grad_norm": 0.9046080404102755, "learning_rate": 4.487978547744287e-05, "loss": 0.1141, "step": 2280 }, { "epoch": 1.037761601455869, "grad_norm": 0.9546439587695728, "learning_rate": 4.487545133644143e-05, "loss": 0.1458, "step": 2281 }, { "epoch": 1.0382165605095541, "grad_norm": 0.5997820485111602, "learning_rate": 4.487111557130787e-05, "loss": 0.1336, "step": 2282 }, { "epoch": 1.0386715195632392, "grad_norm": 0.6572172231902929, "learning_rate": 4.486677818239647e-05, "loss": 0.1239, "step": 2283 }, { "epoch": 1.0391264786169245, "grad_norm": 0.65040185053436, "learning_rate": 4.486243917006169e-05, "loss": 0.1149, "step": 2284 }, { "epoch": 1.0395814376706096, "grad_norm": 0.6218781333934238, "learning_rate": 4.485809853465806e-05, "loss": 0.0711, "step": 2285 }, { "epoch": 1.0400363967242947, "grad_norm": 0.7777545694837289, "learning_rate": 4.485375627654031e-05, "loss": 0.1264, "step": 2286 }, { "epoch": 1.04049135577798, "grad_norm": 0.6895795767646289, "learning_rate": 4.484941239606326e-05, "loss": 0.1177, "step": 2287 }, { "epoch": 1.040946314831665, "grad_norm": 0.9862123414452264, "learning_rate": 4.4845066893581855e-05, "loss": 0.1871, "step": 2288 }, { "epoch": 1.0414012738853504, "grad_norm": 1.467998232090506, "learning_rate": 4.484071976945121e-05, "loss": 0.1617, "step": 2289 }, { "epoch": 1.0418562329390355, "grad_norm": 0.5377126546581447, "learning_rate": 4.483637102402655e-05, "loss": 0.1345, "step": 2290 }, { "epoch": 1.0423111919927206, "grad_norm": 0.5249995849887196, "learning_rate": 4.483202065766322e-05, "loss": 0.1087, "step": 2291 }, { "epoch": 1.042766151046406, "grad_norm": 0.7043838999509769, "learning_rate": 4.482766867071673e-05, "loss": 0.0973, "step": 2292 }, { "epoch": 1.043221110100091, "grad_norm": 0.5127147024184353, "learning_rate": 4.482331506354269e-05, "loss": 0.1281, "step": 2293 }, { "epoch": 1.043676069153776, "grad_norm": 0.6694303549411263, "learning_rate": 4.4818959836496874e-05, "loss": 0.0809, "step": 2294 }, { "epoch": 1.0441310282074614, "grad_norm": 0.6087572954935193, "learning_rate": 4.481460298993515e-05, "loss": 0.1043, "step": 2295 }, { "epoch": 1.0445859872611465, "grad_norm": 0.7009407953733705, "learning_rate": 4.481024452421356e-05, "loss": 0.1218, "step": 2296 }, { "epoch": 1.0450409463148316, "grad_norm": 0.6625268755322336, "learning_rate": 4.4805884439688244e-05, "loss": 0.1115, "step": 2297 }, { "epoch": 1.0454959053685169, "grad_norm": 0.5303125735723936, "learning_rate": 4.48015227367155e-05, "loss": 0.1135, "step": 2298 }, { "epoch": 1.045950864422202, "grad_norm": 0.6978341616155364, "learning_rate": 4.479715941565173e-05, "loss": 0.0835, "step": 2299 }, { "epoch": 1.046405823475887, "grad_norm": 0.7242849911341792, "learning_rate": 4.4792794476853514e-05, "loss": 0.1368, "step": 2300 }, { "epoch": 1.0468607825295724, "grad_norm": 0.694896093361029, "learning_rate": 4.478842792067751e-05, "loss": 0.1314, "step": 2301 }, { "epoch": 1.0473157415832575, "grad_norm": 0.455615121073278, "learning_rate": 4.478405974748053e-05, "loss": 0.0988, "step": 2302 }, { "epoch": 1.0477707006369428, "grad_norm": 0.6506828765823676, "learning_rate": 4.477968995761954e-05, "loss": 0.1259, "step": 2303 }, { "epoch": 1.0482256596906279, "grad_norm": 0.6142973613816182, "learning_rate": 4.477531855145161e-05, "loss": 0.124, "step": 2304 }, { "epoch": 1.048680618744313, "grad_norm": 0.5246136273623709, "learning_rate": 4.4770945529333953e-05, "loss": 0.074, "step": 2305 }, { "epoch": 1.0491355777979983, "grad_norm": 0.48635609707450445, "learning_rate": 4.476657089162391e-05, "loss": 0.0988, "step": 2306 }, { "epoch": 1.0495905368516834, "grad_norm": 0.702230367405953, "learning_rate": 4.476219463867896e-05, "loss": 0.16, "step": 2307 }, { "epoch": 1.0500454959053684, "grad_norm": 0.6694574141442476, "learning_rate": 4.475781677085671e-05, "loss": 0.1045, "step": 2308 }, { "epoch": 1.0505004549590538, "grad_norm": 0.7326548957192494, "learning_rate": 4.47534372885149e-05, "loss": 0.0903, "step": 2309 }, { "epoch": 1.0509554140127388, "grad_norm": 0.7076600320322881, "learning_rate": 4.474905619201139e-05, "loss": 0.1179, "step": 2310 }, { "epoch": 1.051410373066424, "grad_norm": 0.6838482483891196, "learning_rate": 4.474467348170421e-05, "loss": 0.1305, "step": 2311 }, { "epoch": 1.0518653321201092, "grad_norm": 0.6820925380338886, "learning_rate": 4.474028915795148e-05, "loss": 0.1409, "step": 2312 }, { "epoch": 1.0523202911737943, "grad_norm": 0.40820193356025203, "learning_rate": 4.4735903221111454e-05, "loss": 0.0816, "step": 2313 }, { "epoch": 1.0527752502274794, "grad_norm": 0.5880821247328845, "learning_rate": 4.4731515671542545e-05, "loss": 0.0934, "step": 2314 }, { "epoch": 1.0532302092811647, "grad_norm": 0.6462178451199984, "learning_rate": 4.472712650960328e-05, "loss": 0.0914, "step": 2315 }, { "epoch": 1.0536851683348498, "grad_norm": 0.617326030107021, "learning_rate": 4.472273573565233e-05, "loss": 0.1833, "step": 2316 }, { "epoch": 1.0541401273885351, "grad_norm": 0.7621514532190963, "learning_rate": 4.471834335004849e-05, "loss": 0.187, "step": 2317 }, { "epoch": 1.0545950864422202, "grad_norm": 0.8878054166869559, "learning_rate": 4.471394935315066e-05, "loss": 0.0867, "step": 2318 }, { "epoch": 1.0550500454959053, "grad_norm": 0.7390783696338742, "learning_rate": 4.470955374531793e-05, "loss": 0.1488, "step": 2319 }, { "epoch": 1.0555050045495906, "grad_norm": 0.626471540317727, "learning_rate": 4.470515652690947e-05, "loss": 0.0864, "step": 2320 }, { "epoch": 1.0559599636032757, "grad_norm": 0.5149277073957546, "learning_rate": 4.470075769828461e-05, "loss": 0.1039, "step": 2321 }, { "epoch": 1.0564149226569608, "grad_norm": 0.7814091115079279, "learning_rate": 4.4696357259802796e-05, "loss": 0.1142, "step": 2322 }, { "epoch": 1.056869881710646, "grad_norm": 0.5005295665146683, "learning_rate": 4.469195521182361e-05, "loss": 0.1079, "step": 2323 }, { "epoch": 1.0573248407643312, "grad_norm": 0.6682416814574408, "learning_rate": 4.468755155470679e-05, "loss": 0.1145, "step": 2324 }, { "epoch": 1.0577797998180163, "grad_norm": 0.7322261006325289, "learning_rate": 4.4683146288812145e-05, "loss": 0.0976, "step": 2325 }, { "epoch": 1.0582347588717016, "grad_norm": 0.636108822574827, "learning_rate": 4.467873941449968e-05, "loss": 0.1248, "step": 2326 }, { "epoch": 1.0586897179253867, "grad_norm": 0.694678645619039, "learning_rate": 4.4674330932129505e-05, "loss": 0.1202, "step": 2327 }, { "epoch": 1.0591446769790718, "grad_norm": 0.687074424018652, "learning_rate": 4.4669920842061845e-05, "loss": 0.1294, "step": 2328 }, { "epoch": 1.059599636032757, "grad_norm": 0.47856481998586897, "learning_rate": 4.4665509144657085e-05, "loss": 0.0758, "step": 2329 }, { "epoch": 1.0600545950864422, "grad_norm": 0.7744181862291262, "learning_rate": 4.466109584027573e-05, "loss": 0.1636, "step": 2330 }, { "epoch": 1.0605095541401275, "grad_norm": 0.6466101409556725, "learning_rate": 4.465668092927841e-05, "loss": 0.1129, "step": 2331 }, { "epoch": 1.0609645131938126, "grad_norm": 0.791908160629809, "learning_rate": 4.465226441202589e-05, "loss": 0.1193, "step": 2332 }, { "epoch": 1.0614194722474977, "grad_norm": 0.7075341322049811, "learning_rate": 4.464784628887908e-05, "loss": 0.0977, "step": 2333 }, { "epoch": 1.061874431301183, "grad_norm": 0.6274232084075776, "learning_rate": 4.4643426560198993e-05, "loss": 0.146, "step": 2334 }, { "epoch": 1.062329390354868, "grad_norm": 0.6293011557105322, "learning_rate": 4.46390052263468e-05, "loss": 0.1406, "step": 2335 }, { "epoch": 1.0627843494085532, "grad_norm": 1.0991444115555715, "learning_rate": 4.463458228768378e-05, "loss": 0.1574, "step": 2336 }, { "epoch": 1.0632393084622385, "grad_norm": 0.6216438890805942, "learning_rate": 4.463015774457137e-05, "loss": 0.1054, "step": 2337 }, { "epoch": 1.0636942675159236, "grad_norm": 0.9859137591461596, "learning_rate": 4.4625731597371125e-05, "loss": 0.1305, "step": 2338 }, { "epoch": 1.0641492265696086, "grad_norm": 0.6051821705709919, "learning_rate": 4.462130384644472e-05, "loss": 0.1082, "step": 2339 }, { "epoch": 1.064604185623294, "grad_norm": 0.933850788717118, "learning_rate": 4.4616874492153964e-05, "loss": 0.1375, "step": 2340 }, { "epoch": 1.065059144676979, "grad_norm": 0.7233186149429025, "learning_rate": 4.461244353486082e-05, "loss": 0.1383, "step": 2341 }, { "epoch": 1.0655141037306644, "grad_norm": 0.6521283333571863, "learning_rate": 4.460801097492736e-05, "loss": 0.1011, "step": 2342 }, { "epoch": 1.0659690627843494, "grad_norm": 0.539869792490685, "learning_rate": 4.460357681271579e-05, "loss": 0.0951, "step": 2343 }, { "epoch": 1.0664240218380345, "grad_norm": 0.6626517657800329, "learning_rate": 4.4599141048588456e-05, "loss": 0.154, "step": 2344 }, { "epoch": 1.0668789808917198, "grad_norm": 0.7541841294673466, "learning_rate": 4.459470368290782e-05, "loss": 0.133, "step": 2345 }, { "epoch": 1.067333939945405, "grad_norm": 0.676924584521641, "learning_rate": 4.459026471603649e-05, "loss": 0.1509, "step": 2346 }, { "epoch": 1.06778889899909, "grad_norm": 0.9589645367887629, "learning_rate": 4.4585824148337195e-05, "loss": 0.1616, "step": 2347 }, { "epoch": 1.0682438580527753, "grad_norm": 0.5870098667611191, "learning_rate": 4.4581381980172806e-05, "loss": 0.1143, "step": 2348 }, { "epoch": 1.0686988171064604, "grad_norm": 0.5660314684691228, "learning_rate": 4.4576938211906306e-05, "loss": 0.119, "step": 2349 }, { "epoch": 1.0691537761601455, "grad_norm": 0.6714783849756203, "learning_rate": 4.4572492843900815e-05, "loss": 0.1401, "step": 2350 }, { "epoch": 1.0696087352138308, "grad_norm": 0.647366253995588, "learning_rate": 4.456804587651961e-05, "loss": 0.1262, "step": 2351 }, { "epoch": 1.070063694267516, "grad_norm": 0.5763339622088601, "learning_rate": 4.4563597310126057e-05, "loss": 0.1376, "step": 2352 }, { "epoch": 1.070518653321201, "grad_norm": 0.6991088624311325, "learning_rate": 4.4559147145083686e-05, "loss": 0.1181, "step": 2353 }, { "epoch": 1.0709736123748863, "grad_norm": 0.4070162458260712, "learning_rate": 4.455469538175613e-05, "loss": 0.0546, "step": 2354 }, { "epoch": 1.0714285714285714, "grad_norm": 0.8336490482645359, "learning_rate": 4.455024202050718e-05, "loss": 0.1485, "step": 2355 }, { "epoch": 1.0718835304822565, "grad_norm": 0.544772059750671, "learning_rate": 4.454578706170075e-05, "loss": 0.0934, "step": 2356 }, { "epoch": 1.0723384895359418, "grad_norm": 0.899811402879079, "learning_rate": 4.4541330505700864e-05, "loss": 0.119, "step": 2357 }, { "epoch": 1.0727934485896269, "grad_norm": 0.5975346243162832, "learning_rate": 4.453687235287169e-05, "loss": 0.1424, "step": 2358 }, { "epoch": 1.0732484076433122, "grad_norm": 0.51222590460739, "learning_rate": 4.453241260357753e-05, "loss": 0.1208, "step": 2359 }, { "epoch": 1.0737033666969973, "grad_norm": 0.6738375717248295, "learning_rate": 4.452795125818283e-05, "loss": 0.1255, "step": 2360 }, { "epoch": 1.0741583257506824, "grad_norm": 0.8127925380880848, "learning_rate": 4.452348831705214e-05, "loss": 0.1488, "step": 2361 }, { "epoch": 1.0746132848043677, "grad_norm": 0.44876479461409136, "learning_rate": 4.451902378055015e-05, "loss": 0.0615, "step": 2362 }, { "epoch": 1.0750682438580528, "grad_norm": 0.6640510121849843, "learning_rate": 4.451455764904169e-05, "loss": 0.1161, "step": 2363 }, { "epoch": 1.0755232029117379, "grad_norm": 0.9295527905822911, "learning_rate": 4.45100899228917e-05, "loss": 0.1567, "step": 2364 }, { "epoch": 1.0759781619654232, "grad_norm": 0.5364761493277019, "learning_rate": 4.450562060246527e-05, "loss": 0.0789, "step": 2365 }, { "epoch": 1.0764331210191083, "grad_norm": 0.7257794800275588, "learning_rate": 4.450114968812761e-05, "loss": 0.1336, "step": 2366 }, { "epoch": 1.0768880800727934, "grad_norm": 0.5544904763928689, "learning_rate": 4.4496677180244065e-05, "loss": 0.1509, "step": 2367 }, { "epoch": 1.0773430391264787, "grad_norm": 0.6181636832023903, "learning_rate": 4.449220307918011e-05, "loss": 0.1549, "step": 2368 }, { "epoch": 1.0777979981801638, "grad_norm": 1.0342288998875147, "learning_rate": 4.4487727385301334e-05, "loss": 0.1109, "step": 2369 }, { "epoch": 1.078252957233849, "grad_norm": 0.5494681897033089, "learning_rate": 4.4483250098973494e-05, "loss": 0.0596, "step": 2370 }, { "epoch": 1.0787079162875342, "grad_norm": 0.5500759191341273, "learning_rate": 4.447877122056243e-05, "loss": 0.0833, "step": 2371 }, { "epoch": 1.0791628753412192, "grad_norm": 0.5949716939787879, "learning_rate": 4.4474290750434154e-05, "loss": 0.0775, "step": 2372 }, { "epoch": 1.0796178343949046, "grad_norm": 0.8196111089467918, "learning_rate": 4.446980868895478e-05, "loss": 0.0908, "step": 2373 }, { "epoch": 1.0800727934485896, "grad_norm": 0.7080124701066379, "learning_rate": 4.446532503649058e-05, "loss": 0.1392, "step": 2374 }, { "epoch": 1.0805277525022747, "grad_norm": 0.4963623584563871, "learning_rate": 4.446083979340791e-05, "loss": 0.0749, "step": 2375 }, { "epoch": 1.08098271155596, "grad_norm": 0.7678955006581015, "learning_rate": 4.445635296007329e-05, "loss": 0.1253, "step": 2376 }, { "epoch": 1.0814376706096451, "grad_norm": 0.9621635927739111, "learning_rate": 4.445186453685338e-05, "loss": 0.1301, "step": 2377 }, { "epoch": 1.0818926296633302, "grad_norm": 1.2129394023566127, "learning_rate": 4.444737452411494e-05, "loss": 0.1926, "step": 2378 }, { "epoch": 1.0823475887170155, "grad_norm": 1.0483400987009681, "learning_rate": 4.4442882922224874e-05, "loss": 0.1603, "step": 2379 }, { "epoch": 1.0828025477707006, "grad_norm": 0.5871632465207439, "learning_rate": 4.443838973155023e-05, "loss": 0.1118, "step": 2380 }, { "epoch": 1.0832575068243857, "grad_norm": 0.6684106911050516, "learning_rate": 4.4433894952458156e-05, "loss": 0.1214, "step": 2381 }, { "epoch": 1.083712465878071, "grad_norm": 0.497494952424991, "learning_rate": 4.442939858531594e-05, "loss": 0.1235, "step": 2382 }, { "epoch": 1.084167424931756, "grad_norm": 0.8541177179002439, "learning_rate": 4.442490063049102e-05, "loss": 0.1501, "step": 2383 }, { "epoch": 1.0846223839854412, "grad_norm": 0.6043478466547025, "learning_rate": 4.442040108835095e-05, "loss": 0.0929, "step": 2384 }, { "epoch": 1.0850773430391265, "grad_norm": 0.48675776252599057, "learning_rate": 4.4415899959263397e-05, "loss": 0.0869, "step": 2385 }, { "epoch": 1.0855323020928116, "grad_norm": 0.8353152840572221, "learning_rate": 4.441139724359618e-05, "loss": 0.0952, "step": 2386 }, { "epoch": 1.085987261146497, "grad_norm": 0.5565066245660785, "learning_rate": 4.4406892941717246e-05, "loss": 0.1059, "step": 2387 }, { "epoch": 1.086442220200182, "grad_norm": 0.760885732620097, "learning_rate": 4.4402387053994655e-05, "loss": 0.1082, "step": 2388 }, { "epoch": 1.086897179253867, "grad_norm": 0.6388030030950889, "learning_rate": 4.439787958079662e-05, "loss": 0.1133, "step": 2389 }, { "epoch": 1.0873521383075524, "grad_norm": 0.6026449521189914, "learning_rate": 4.439337052249146e-05, "loss": 0.1035, "step": 2390 }, { "epoch": 1.0878070973612375, "grad_norm": 0.6123954723974186, "learning_rate": 4.4388859879447644e-05, "loss": 0.0833, "step": 2391 }, { "epoch": 1.0882620564149226, "grad_norm": 1.0658743083175966, "learning_rate": 4.438434765203376e-05, "loss": 0.1841, "step": 2392 }, { "epoch": 1.0887170154686079, "grad_norm": 0.6773279231077233, "learning_rate": 4.4379833840618524e-05, "loss": 0.1381, "step": 2393 }, { "epoch": 1.089171974522293, "grad_norm": 0.8247716072191486, "learning_rate": 4.4375318445570777e-05, "loss": 0.097, "step": 2394 }, { "epoch": 1.089626933575978, "grad_norm": 0.5952121923977395, "learning_rate": 4.437080146725951e-05, "loss": 0.0723, "step": 2395 }, { "epoch": 1.0900818926296634, "grad_norm": 0.49101372515583486, "learning_rate": 4.436628290605384e-05, "loss": 0.0913, "step": 2396 }, { "epoch": 1.0905368516833485, "grad_norm": 0.5950219694890557, "learning_rate": 4.4361762762322966e-05, "loss": 0.1063, "step": 2397 }, { "epoch": 1.0909918107370338, "grad_norm": 1.233001621993024, "learning_rate": 4.435724103643629e-05, "loss": 0.1295, "step": 2398 }, { "epoch": 1.0914467697907189, "grad_norm": 0.692607161911306, "learning_rate": 4.435271772876329e-05, "loss": 0.121, "step": 2399 }, { "epoch": 1.091901728844404, "grad_norm": 0.6475725169063435, "learning_rate": 4.434819283967359e-05, "loss": 0.1483, "step": 2400 }, { "epoch": 1.0923566878980893, "grad_norm": 1.0200029223205633, "learning_rate": 4.434366636953695e-05, "loss": 0.0854, "step": 2401 }, { "epoch": 1.0928116469517744, "grad_norm": 0.5017301932790267, "learning_rate": 4.4339138318723246e-05, "loss": 0.078, "step": 2402 }, { "epoch": 1.0932666060054594, "grad_norm": 0.6916186419085405, "learning_rate": 4.43346086876025e-05, "loss": 0.2168, "step": 2403 }, { "epoch": 1.0937215650591448, "grad_norm": 0.9793619488020967, "learning_rate": 4.433007747654484e-05, "loss": 0.1073, "step": 2404 }, { "epoch": 1.0941765241128298, "grad_norm": 0.6937903948671724, "learning_rate": 4.432554468592054e-05, "loss": 0.1452, "step": 2405 }, { "epoch": 1.094631483166515, "grad_norm": 0.8910232939689493, "learning_rate": 4.432101031610001e-05, "loss": 0.1622, "step": 2406 }, { "epoch": 1.0950864422202002, "grad_norm": 0.6285131295143538, "learning_rate": 4.431647436745376e-05, "loss": 0.1115, "step": 2407 }, { "epoch": 1.0955414012738853, "grad_norm": 0.7925629575199707, "learning_rate": 4.431193684035246e-05, "loss": 0.1304, "step": 2408 }, { "epoch": 1.0959963603275704, "grad_norm": 0.6814388186274062, "learning_rate": 4.43073977351669e-05, "loss": 0.0991, "step": 2409 }, { "epoch": 1.0964513193812557, "grad_norm": 0.7940718467260173, "learning_rate": 4.4302857052267985e-05, "loss": 0.1198, "step": 2410 }, { "epoch": 1.0969062784349408, "grad_norm": 0.6172613963641802, "learning_rate": 4.4298314792026764e-05, "loss": 0.0985, "step": 2411 }, { "epoch": 1.097361237488626, "grad_norm": 0.5797005865532603, "learning_rate": 4.4293770954814404e-05, "loss": 0.0936, "step": 2412 }, { "epoch": 1.0978161965423112, "grad_norm": 0.6990883700004399, "learning_rate": 4.428922554100221e-05, "loss": 0.1738, "step": 2413 }, { "epoch": 1.0982711555959963, "grad_norm": 0.46994140641602095, "learning_rate": 4.428467855096162e-05, "loss": 0.0977, "step": 2414 }, { "epoch": 1.0987261146496816, "grad_norm": 2.7977482185463383, "learning_rate": 4.428012998506419e-05, "loss": 0.2055, "step": 2415 }, { "epoch": 1.0991810737033667, "grad_norm": 0.49591175618578326, "learning_rate": 4.42755798436816e-05, "loss": 0.112, "step": 2416 }, { "epoch": 1.0996360327570518, "grad_norm": 0.6757848207373124, "learning_rate": 4.427102812718568e-05, "loss": 0.1023, "step": 2417 }, { "epoch": 1.100090991810737, "grad_norm": 0.5450194043308251, "learning_rate": 4.426647483594836e-05, "loss": 0.1352, "step": 2418 }, { "epoch": 1.1005459508644222, "grad_norm": 0.5910771390727293, "learning_rate": 4.426191997034172e-05, "loss": 0.1228, "step": 2419 }, { "epoch": 1.1010009099181073, "grad_norm": 0.6751041035277012, "learning_rate": 4.4257363530737976e-05, "loss": 0.0875, "step": 2420 }, { "epoch": 1.1014558689717926, "grad_norm": 0.7007722327263856, "learning_rate": 4.425280551750945e-05, "loss": 0.1169, "step": 2421 }, { "epoch": 1.1019108280254777, "grad_norm": 0.5351734806483252, "learning_rate": 4.42482459310286e-05, "loss": 0.1142, "step": 2422 }, { "epoch": 1.1023657870791628, "grad_norm": 2.1198024735890053, "learning_rate": 4.4243684771668015e-05, "loss": 0.1727, "step": 2423 }, { "epoch": 1.102820746132848, "grad_norm": 0.6794135944132158, "learning_rate": 4.423912203980041e-05, "loss": 0.1649, "step": 2424 }, { "epoch": 1.1032757051865332, "grad_norm": 0.7405355208292914, "learning_rate": 4.423455773579864e-05, "loss": 0.0868, "step": 2425 }, { "epoch": 1.1037306642402185, "grad_norm": 0.6183242290720827, "learning_rate": 4.422999186003567e-05, "loss": 0.1055, "step": 2426 }, { "epoch": 1.1041856232939036, "grad_norm": 0.5098960823987368, "learning_rate": 4.422542441288462e-05, "loss": 0.0867, "step": 2427 }, { "epoch": 1.1046405823475887, "grad_norm": 0.6693006398249935, "learning_rate": 4.4220855394718694e-05, "loss": 0.1493, "step": 2428 }, { "epoch": 1.105095541401274, "grad_norm": 0.6120968584678131, "learning_rate": 4.4216284805911275e-05, "loss": 0.1205, "step": 2429 }, { "epoch": 1.105550500454959, "grad_norm": 0.7579531408040922, "learning_rate": 4.421171264683584e-05, "loss": 0.1334, "step": 2430 }, { "epoch": 1.1060054595086442, "grad_norm": 1.1482682352115183, "learning_rate": 4.4207138917866e-05, "loss": 0.1621, "step": 2431 }, { "epoch": 1.1064604185623295, "grad_norm": 0.6622687927782536, "learning_rate": 4.420256361937551e-05, "loss": 0.1281, "step": 2432 }, { "epoch": 1.1069153776160146, "grad_norm": 0.5924156178275082, "learning_rate": 4.419798675173824e-05, "loss": 0.078, "step": 2433 }, { "epoch": 1.1073703366696996, "grad_norm": 0.5262259721956631, "learning_rate": 4.4193408315328185e-05, "loss": 0.095, "step": 2434 }, { "epoch": 1.107825295723385, "grad_norm": 0.5975726252527817, "learning_rate": 4.418882831051949e-05, "loss": 0.1055, "step": 2435 }, { "epoch": 1.10828025477707, "grad_norm": 0.653520553510086, "learning_rate": 4.418424673768639e-05, "loss": 0.0861, "step": 2436 }, { "epoch": 1.1087352138307551, "grad_norm": 0.4939435328625673, "learning_rate": 4.417966359720329e-05, "loss": 0.0526, "step": 2437 }, { "epoch": 1.1091901728844404, "grad_norm": 0.5835214042072661, "learning_rate": 4.417507888944469e-05, "loss": 0.0883, "step": 2438 }, { "epoch": 1.1096451319381255, "grad_norm": 0.8128815248945501, "learning_rate": 4.417049261478524e-05, "loss": 0.1331, "step": 2439 }, { "epoch": 1.1101000909918108, "grad_norm": 0.7042832227820245, "learning_rate": 4.416590477359971e-05, "loss": 0.0801, "step": 2440 }, { "epoch": 1.110555050045496, "grad_norm": 0.44307208327025766, "learning_rate": 4.416131536626299e-05, "loss": 0.1081, "step": 2441 }, { "epoch": 1.111010009099181, "grad_norm": 0.6504157615449226, "learning_rate": 4.415672439315011e-05, "loss": 0.1458, "step": 2442 }, { "epoch": 1.1114649681528663, "grad_norm": 0.5669283702061299, "learning_rate": 4.4152131854636225e-05, "loss": 0.1371, "step": 2443 }, { "epoch": 1.1119199272065514, "grad_norm": 0.66299329029815, "learning_rate": 4.414753775109661e-05, "loss": 0.0834, "step": 2444 }, { "epoch": 1.1123748862602365, "grad_norm": 0.7551975715158735, "learning_rate": 4.414294208290669e-05, "loss": 0.1472, "step": 2445 }, { "epoch": 1.1128298453139218, "grad_norm": 0.4913640202408364, "learning_rate": 4.413834485044198e-05, "loss": 0.0899, "step": 2446 }, { "epoch": 1.113284804367607, "grad_norm": 0.70365036071613, "learning_rate": 4.413374605407816e-05, "loss": 0.0721, "step": 2447 }, { "epoch": 1.113739763421292, "grad_norm": 0.8215843089627018, "learning_rate": 4.412914569419103e-05, "loss": 0.1194, "step": 2448 }, { "epoch": 1.1141947224749773, "grad_norm": 0.5275958257118392, "learning_rate": 4.412454377115649e-05, "loss": 0.1118, "step": 2449 }, { "epoch": 1.1146496815286624, "grad_norm": 0.7775291016323521, "learning_rate": 4.41199402853506e-05, "loss": 0.1614, "step": 2450 }, { "epoch": 1.1151046405823477, "grad_norm": 0.6165112868519741, "learning_rate": 4.411533523714954e-05, "loss": 0.0832, "step": 2451 }, { "epoch": 1.1155595996360328, "grad_norm": 0.6326996523115097, "learning_rate": 4.41107286269296e-05, "loss": 0.1158, "step": 2452 }, { "epoch": 1.1160145586897179, "grad_norm": 0.49421991040868185, "learning_rate": 4.410612045506723e-05, "loss": 0.1102, "step": 2453 }, { "epoch": 1.1164695177434032, "grad_norm": 0.5797873337155166, "learning_rate": 4.4101510721938966e-05, "loss": 0.1313, "step": 2454 }, { "epoch": 1.1169244767970883, "grad_norm": 0.5504330150701939, "learning_rate": 4.409689942792152e-05, "loss": 0.0982, "step": 2455 }, { "epoch": 1.1173794358507734, "grad_norm": 0.6136416703378204, "learning_rate": 4.409228657339168e-05, "loss": 0.1574, "step": 2456 }, { "epoch": 1.1178343949044587, "grad_norm": 2.3163632552322992, "learning_rate": 4.408767215872641e-05, "loss": 0.2001, "step": 2457 }, { "epoch": 1.1182893539581438, "grad_norm": 0.7743472759661335, "learning_rate": 4.408305618430277e-05, "loss": 0.1171, "step": 2458 }, { "epoch": 1.1187443130118289, "grad_norm": 0.8413080258611573, "learning_rate": 4.4078438650497965e-05, "loss": 0.1131, "step": 2459 }, { "epoch": 1.1191992720655142, "grad_norm": 0.7360940213332019, "learning_rate": 4.40738195576893e-05, "loss": 0.0936, "step": 2460 }, { "epoch": 1.1196542311191993, "grad_norm": 0.6266114304261672, "learning_rate": 4.406919890625424e-05, "loss": 0.103, "step": 2461 }, { "epoch": 1.1201091901728844, "grad_norm": 0.5785853195536527, "learning_rate": 4.406457669657036e-05, "loss": 0.0958, "step": 2462 }, { "epoch": 1.1205641492265697, "grad_norm": 0.8177301159919063, "learning_rate": 4.4059952929015366e-05, "loss": 0.1008, "step": 2463 }, { "epoch": 1.1210191082802548, "grad_norm": 0.945783745155429, "learning_rate": 4.40553276039671e-05, "loss": 0.2134, "step": 2464 }, { "epoch": 1.1214740673339398, "grad_norm": 0.6171405892557792, "learning_rate": 4.4050700721803505e-05, "loss": 0.1192, "step": 2465 }, { "epoch": 1.1219290263876252, "grad_norm": 0.5584068120531289, "learning_rate": 4.404607228290269e-05, "loss": 0.1155, "step": 2466 }, { "epoch": 1.1223839854413102, "grad_norm": 0.842166486773733, "learning_rate": 4.404144228764285e-05, "loss": 0.0897, "step": 2467 }, { "epoch": 1.1228389444949956, "grad_norm": 0.7935571252399594, "learning_rate": 4.4036810736402346e-05, "loss": 0.1775, "step": 2468 }, { "epoch": 1.1232939035486806, "grad_norm": 0.8127854056771939, "learning_rate": 4.4032177629559636e-05, "loss": 0.1244, "step": 2469 }, { "epoch": 1.1237488626023657, "grad_norm": 0.8454575583946952, "learning_rate": 4.4027542967493304e-05, "loss": 0.1624, "step": 2470 }, { "epoch": 1.124203821656051, "grad_norm": 0.6160286821713526, "learning_rate": 4.40229067505821e-05, "loss": 0.1021, "step": 2471 }, { "epoch": 1.1246587807097361, "grad_norm": 0.5397074250935002, "learning_rate": 4.401826897920487e-05, "loss": 0.1129, "step": 2472 }, { "epoch": 1.1251137397634212, "grad_norm": 0.8299740879149127, "learning_rate": 4.401362965374057e-05, "loss": 0.0789, "step": 2473 }, { "epoch": 1.1255686988171065, "grad_norm": 0.6727439280759618, "learning_rate": 4.4008988774568335e-05, "loss": 0.1236, "step": 2474 }, { "epoch": 1.1260236578707916, "grad_norm": 1.2572553175178833, "learning_rate": 4.400434634206737e-05, "loss": 0.1514, "step": 2475 }, { "epoch": 1.1264786169244767, "grad_norm": 0.5199955377940919, "learning_rate": 4.399970235661704e-05, "loss": 0.091, "step": 2476 }, { "epoch": 1.126933575978162, "grad_norm": 0.640840756913376, "learning_rate": 4.3995056818596846e-05, "loss": 0.157, "step": 2477 }, { "epoch": 1.127388535031847, "grad_norm": 0.9364217094495968, "learning_rate": 4.3990409728386396e-05, "loss": 0.0705, "step": 2478 }, { "epoch": 1.1278434940855324, "grad_norm": 0.6212252649859028, "learning_rate": 4.3985761086365405e-05, "loss": 0.086, "step": 2479 }, { "epoch": 1.1282984531392175, "grad_norm": 0.6407673640120304, "learning_rate": 4.398111089291377e-05, "loss": 0.104, "step": 2480 }, { "epoch": 1.1287534121929026, "grad_norm": 0.7945318464159773, "learning_rate": 4.3976459148411464e-05, "loss": 0.1293, "step": 2481 }, { "epoch": 1.129208371246588, "grad_norm": 0.6022767855250688, "learning_rate": 4.397180585323862e-05, "loss": 0.1, "step": 2482 }, { "epoch": 1.129663330300273, "grad_norm": 1.1020301286606915, "learning_rate": 4.396715100777548e-05, "loss": 0.1033, "step": 2483 }, { "epoch": 1.130118289353958, "grad_norm": 1.1222715212880856, "learning_rate": 4.3962494612402397e-05, "loss": 0.1459, "step": 2484 }, { "epoch": 1.1305732484076434, "grad_norm": 0.9237571724506419, "learning_rate": 4.39578366674999e-05, "loss": 0.1229, "step": 2485 }, { "epoch": 1.1310282074613285, "grad_norm": 0.5960780795055003, "learning_rate": 4.3953177173448605e-05, "loss": 0.1169, "step": 2486 }, { "epoch": 1.1314831665150136, "grad_norm": 0.8626147484423653, "learning_rate": 4.3948516130629263e-05, "loss": 0.1319, "step": 2487 }, { "epoch": 1.1319381255686989, "grad_norm": 0.5627703572533737, "learning_rate": 4.394385353942275e-05, "loss": 0.0945, "step": 2488 }, { "epoch": 1.132393084622384, "grad_norm": 0.7172780863576442, "learning_rate": 4.393918940021008e-05, "loss": 0.1437, "step": 2489 }, { "epoch": 1.132848043676069, "grad_norm": 0.7586988728870461, "learning_rate": 4.3934523713372375e-05, "loss": 0.0986, "step": 2490 }, { "epoch": 1.1333030027297544, "grad_norm": 0.6902275400748321, "learning_rate": 4.39298564792909e-05, "loss": 0.1008, "step": 2491 }, { "epoch": 1.1337579617834395, "grad_norm": 0.8206144221443289, "learning_rate": 4.392518769834705e-05, "loss": 0.1264, "step": 2492 }, { "epoch": 1.1342129208371245, "grad_norm": 0.8960021719585161, "learning_rate": 4.392051737092231e-05, "loss": 0.1378, "step": 2493 }, { "epoch": 1.1346678798908099, "grad_norm": 0.6834892023013055, "learning_rate": 4.391584549739835e-05, "loss": 0.145, "step": 2494 }, { "epoch": 1.135122838944495, "grad_norm": 0.7465969324479527, "learning_rate": 4.3911172078156906e-05, "loss": 0.0998, "step": 2495 }, { "epoch": 1.1355777979981803, "grad_norm": 0.6374543208162182, "learning_rate": 4.390649711357989e-05, "loss": 0.0882, "step": 2496 }, { "epoch": 1.1360327570518653, "grad_norm": 0.6069669679889129, "learning_rate": 4.390182060404932e-05, "loss": 0.0762, "step": 2497 }, { "epoch": 1.1364877161055504, "grad_norm": 0.7740223456174087, "learning_rate": 4.3897142549947313e-05, "loss": 0.104, "step": 2498 }, { "epoch": 1.1369426751592357, "grad_norm": 0.7796068757447613, "learning_rate": 4.389246295165616e-05, "loss": 0.1376, "step": 2499 }, { "epoch": 1.1373976342129208, "grad_norm": 0.4610059940261574, "learning_rate": 4.3887781809558256e-05, "loss": 0.0784, "step": 2500 }, { "epoch": 1.137852593266606, "grad_norm": 0.9612441655570161, "learning_rate": 4.388309912403612e-05, "loss": 0.1695, "step": 2501 }, { "epoch": 1.1383075523202912, "grad_norm": 0.6966859881387231, "learning_rate": 4.38784148954724e-05, "loss": 0.099, "step": 2502 }, { "epoch": 1.1387625113739763, "grad_norm": 0.7479949704206463, "learning_rate": 4.387372912424986e-05, "loss": 0.1079, "step": 2503 }, { "epoch": 1.1392174704276614, "grad_norm": 1.024582290648865, "learning_rate": 4.386904181075142e-05, "loss": 0.1553, "step": 2504 }, { "epoch": 1.1396724294813467, "grad_norm": 0.669765369999807, "learning_rate": 4.386435295536008e-05, "loss": 0.0924, "step": 2505 }, { "epoch": 1.1401273885350318, "grad_norm": 0.5958514856254521, "learning_rate": 4.385966255845902e-05, "loss": 0.1171, "step": 2506 }, { "epoch": 1.1405823475887171, "grad_norm": 0.5758610881649472, "learning_rate": 4.38549706204315e-05, "loss": 0.1229, "step": 2507 }, { "epoch": 1.1410373066424022, "grad_norm": 0.6451501402007896, "learning_rate": 4.3850277141660934e-05, "loss": 0.114, "step": 2508 }, { "epoch": 1.1414922656960873, "grad_norm": 1.3219025602293988, "learning_rate": 4.384558212253084e-05, "loss": 0.1361, "step": 2509 }, { "epoch": 1.1419472247497726, "grad_norm": 0.5385999734757722, "learning_rate": 4.3840885563424885e-05, "loss": 0.0905, "step": 2510 }, { "epoch": 1.1424021838034577, "grad_norm": 0.5854859344168203, "learning_rate": 4.383618746472685e-05, "loss": 0.104, "step": 2511 }, { "epoch": 1.1428571428571428, "grad_norm": 0.6729892235579931, "learning_rate": 4.3831487826820634e-05, "loss": 0.0896, "step": 2512 }, { "epoch": 1.143312101910828, "grad_norm": 0.6989932227628077, "learning_rate": 4.382678665009028e-05, "loss": 0.174, "step": 2513 }, { "epoch": 1.1437670609645132, "grad_norm": 0.5108672459160504, "learning_rate": 4.3822083934919936e-05, "loss": 0.1351, "step": 2514 }, { "epoch": 1.1442220200181983, "grad_norm": 0.8403503731219659, "learning_rate": 4.381737968169389e-05, "loss": 0.1607, "step": 2515 }, { "epoch": 1.1446769790718836, "grad_norm": 0.9417547676375162, "learning_rate": 4.3812673890796566e-05, "loss": 0.1101, "step": 2516 }, { "epoch": 1.1451319381255687, "grad_norm": 0.5601337101738048, "learning_rate": 4.380796656261248e-05, "loss": 0.1155, "step": 2517 }, { "epoch": 1.1455868971792538, "grad_norm": 0.5654421896530383, "learning_rate": 4.38032576975263e-05, "loss": 0.0942, "step": 2518 }, { "epoch": 1.146041856232939, "grad_norm": 0.5166879817059317, "learning_rate": 4.379854729592282e-05, "loss": 0.0861, "step": 2519 }, { "epoch": 1.1464968152866242, "grad_norm": 0.5855157312275544, "learning_rate": 4.379383535818695e-05, "loss": 0.0954, "step": 2520 }, { "epoch": 1.1469517743403093, "grad_norm": 0.7218663989339901, "learning_rate": 4.378912188470373e-05, "loss": 0.1583, "step": 2521 }, { "epoch": 1.1474067333939946, "grad_norm": 0.6891372596825441, "learning_rate": 4.378440687585832e-05, "loss": 0.1389, "step": 2522 }, { "epoch": 1.1478616924476797, "grad_norm": 0.9112755752671445, "learning_rate": 4.3779690332036004e-05, "loss": 0.1912, "step": 2523 }, { "epoch": 1.148316651501365, "grad_norm": 0.7656213076108356, "learning_rate": 4.377497225362221e-05, "loss": 0.1487, "step": 2524 }, { "epoch": 1.14877161055505, "grad_norm": 0.5735194449363498, "learning_rate": 4.377025264100246e-05, "loss": 0.1136, "step": 2525 }, { "epoch": 1.1492265696087351, "grad_norm": 0.659634785413603, "learning_rate": 4.376553149456244e-05, "loss": 0.0839, "step": 2526 }, { "epoch": 1.1496815286624205, "grad_norm": 0.7228925658465164, "learning_rate": 4.376080881468793e-05, "loss": 0.133, "step": 2527 }, { "epoch": 1.1501364877161055, "grad_norm": 0.5117896783089262, "learning_rate": 4.3756084601764836e-05, "loss": 0.0829, "step": 2528 }, { "epoch": 1.1505914467697906, "grad_norm": 0.5886801770543539, "learning_rate": 4.375135885617922e-05, "loss": 0.0982, "step": 2529 }, { "epoch": 1.151046405823476, "grad_norm": 0.5604751585352101, "learning_rate": 4.374663157831723e-05, "loss": 0.1042, "step": 2530 }, { "epoch": 1.151501364877161, "grad_norm": 0.6422290046142347, "learning_rate": 4.374190276856517e-05, "loss": 0.0945, "step": 2531 }, { "epoch": 1.1519563239308463, "grad_norm": 0.7377879686278781, "learning_rate": 4.3737172427309455e-05, "loss": 0.109, "step": 2532 }, { "epoch": 1.1524112829845314, "grad_norm": 0.6525986243775587, "learning_rate": 4.373244055493663e-05, "loss": 0.0749, "step": 2533 }, { "epoch": 1.1528662420382165, "grad_norm": 2.1155965606249056, "learning_rate": 4.372770715183335e-05, "loss": 0.1551, "step": 2534 }, { "epoch": 1.1533212010919018, "grad_norm": 0.7957845989572859, "learning_rate": 4.372297221838642e-05, "loss": 0.1798, "step": 2535 }, { "epoch": 1.153776160145587, "grad_norm": 0.5886510478263857, "learning_rate": 4.371823575498275e-05, "loss": 0.1254, "step": 2536 }, { "epoch": 1.154231119199272, "grad_norm": 0.6871623948927064, "learning_rate": 4.3713497762009385e-05, "loss": 0.1303, "step": 2537 }, { "epoch": 1.1546860782529573, "grad_norm": 0.7575026044676054, "learning_rate": 4.3708758239853495e-05, "loss": 0.1017, "step": 2538 }, { "epoch": 1.1551410373066424, "grad_norm": 0.6769381157448425, "learning_rate": 4.370401718890237e-05, "loss": 0.111, "step": 2539 }, { "epoch": 1.1555959963603275, "grad_norm": 0.6610592882904601, "learning_rate": 4.369927460954342e-05, "loss": 0.0951, "step": 2540 }, { "epoch": 1.1560509554140128, "grad_norm": 0.6609822327874378, "learning_rate": 4.3694530502164196e-05, "loss": 0.1159, "step": 2541 }, { "epoch": 1.156505914467698, "grad_norm": 0.5892330247404384, "learning_rate": 4.368978486715237e-05, "loss": 0.121, "step": 2542 }, { "epoch": 1.156960873521383, "grad_norm": 0.5600721740679389, "learning_rate": 4.3685037704895724e-05, "loss": 0.1068, "step": 2543 }, { "epoch": 1.1574158325750683, "grad_norm": 0.7272523707201152, "learning_rate": 4.3680289015782174e-05, "loss": 0.0985, "step": 2544 }, { "epoch": 1.1578707916287534, "grad_norm": 0.5238281152488657, "learning_rate": 4.367553880019977e-05, "loss": 0.0888, "step": 2545 }, { "epoch": 1.1583257506824385, "grad_norm": 1.000416670211714, "learning_rate": 4.367078705853667e-05, "loss": 0.1393, "step": 2546 }, { "epoch": 1.1587807097361238, "grad_norm": 0.7307119679075275, "learning_rate": 4.366603379118117e-05, "loss": 0.1311, "step": 2547 }, { "epoch": 1.1592356687898089, "grad_norm": 0.6483213178220296, "learning_rate": 4.366127899852168e-05, "loss": 0.1076, "step": 2548 }, { "epoch": 1.159690627843494, "grad_norm": 0.6341813101697773, "learning_rate": 4.365652268094675e-05, "loss": 0.0953, "step": 2549 }, { "epoch": 1.1601455868971793, "grad_norm": 0.7627728670670156, "learning_rate": 4.3651764838845035e-05, "loss": 0.1517, "step": 2550 }, { "epoch": 1.1606005459508644, "grad_norm": 0.6769380137050982, "learning_rate": 4.364700547260533e-05, "loss": 0.1229, "step": 2551 }, { "epoch": 1.1610555050045497, "grad_norm": 0.6878444860755325, "learning_rate": 4.364224458261654e-05, "loss": 0.1262, "step": 2552 }, { "epoch": 1.1615104640582348, "grad_norm": 0.5497097848944188, "learning_rate": 4.363748216926772e-05, "loss": 0.1162, "step": 2553 }, { "epoch": 1.1619654231119199, "grad_norm": 0.7724964367473399, "learning_rate": 4.363271823294802e-05, "loss": 0.1462, "step": 2554 }, { "epoch": 1.1624203821656052, "grad_norm": 0.7548712967093116, "learning_rate": 4.362795277404673e-05, "loss": 0.1483, "step": 2555 }, { "epoch": 1.1628753412192903, "grad_norm": 0.4754941971534692, "learning_rate": 4.362318579295326e-05, "loss": 0.0602, "step": 2556 }, { "epoch": 1.1633303002729753, "grad_norm": 0.6154119339388949, "learning_rate": 4.361841729005715e-05, "loss": 0.1274, "step": 2557 }, { "epoch": 1.1637852593266607, "grad_norm": 1.061735069156857, "learning_rate": 4.3613647265748056e-05, "loss": 0.1183, "step": 2558 }, { "epoch": 1.1642402183803457, "grad_norm": 0.5306703692893123, "learning_rate": 4.360887572041578e-05, "loss": 0.0976, "step": 2559 }, { "epoch": 1.164695177434031, "grad_norm": 0.8176586319422494, "learning_rate": 4.36041026544502e-05, "loss": 0.1675, "step": 2560 }, { "epoch": 1.1651501364877161, "grad_norm": 0.6707868923581274, "learning_rate": 4.3599328068241376e-05, "loss": 0.1163, "step": 2561 }, { "epoch": 1.1656050955414012, "grad_norm": 0.9179394482743749, "learning_rate": 4.359455196217945e-05, "loss": 0.0943, "step": 2562 }, { "epoch": 1.1660600545950865, "grad_norm": 0.6873985016385786, "learning_rate": 4.358977433665471e-05, "loss": 0.0807, "step": 2563 }, { "epoch": 1.1665150136487716, "grad_norm": 0.682622869101383, "learning_rate": 4.358499519205756e-05, "loss": 0.1099, "step": 2564 }, { "epoch": 1.1669699727024567, "grad_norm": 0.7417134322323133, "learning_rate": 4.358021452877854e-05, "loss": 0.1396, "step": 2565 }, { "epoch": 1.167424931756142, "grad_norm": 0.8954891052097662, "learning_rate": 4.3575432347208296e-05, "loss": 0.1134, "step": 2566 }, { "epoch": 1.1678798908098271, "grad_norm": 0.6569523014604418, "learning_rate": 4.3570648647737605e-05, "loss": 0.1119, "step": 2567 }, { "epoch": 1.1683348498635122, "grad_norm": 0.6442773259933505, "learning_rate": 4.3565863430757366e-05, "loss": 0.1274, "step": 2568 }, { "epoch": 1.1687898089171975, "grad_norm": 0.447658497982329, "learning_rate": 4.356107669665862e-05, "loss": 0.1059, "step": 2569 }, { "epoch": 1.1692447679708826, "grad_norm": 0.7036017053391979, "learning_rate": 4.3556288445832494e-05, "loss": 0.172, "step": 2570 }, { "epoch": 1.1696997270245677, "grad_norm": 0.6215658593219049, "learning_rate": 4.355149867867029e-05, "loss": 0.088, "step": 2571 }, { "epoch": 1.170154686078253, "grad_norm": 0.8619309052844237, "learning_rate": 4.354670739556338e-05, "loss": 0.1406, "step": 2572 }, { "epoch": 1.170609645131938, "grad_norm": 0.6513370856662442, "learning_rate": 4.35419145969033e-05, "loss": 0.1212, "step": 2573 }, { "epoch": 1.1710646041856232, "grad_norm": 0.5999864112842914, "learning_rate": 4.35371202830817e-05, "loss": 0.1145, "step": 2574 }, { "epoch": 1.1715195632393085, "grad_norm": 0.7044667367487478, "learning_rate": 4.3532324454490336e-05, "loss": 0.1212, "step": 2575 }, { "epoch": 1.1719745222929936, "grad_norm": 0.6328239245625524, "learning_rate": 4.352752711152112e-05, "loss": 0.1068, "step": 2576 }, { "epoch": 1.1724294813466787, "grad_norm": 0.742198761509031, "learning_rate": 4.352272825456605e-05, "loss": 0.1626, "step": 2577 }, { "epoch": 1.172884440400364, "grad_norm": 0.520180501630736, "learning_rate": 4.351792788401727e-05, "loss": 0.1382, "step": 2578 }, { "epoch": 1.173339399454049, "grad_norm": 0.5655346421168753, "learning_rate": 4.351312600026706e-05, "loss": 0.1129, "step": 2579 }, { "epoch": 1.1737943585077344, "grad_norm": 0.7729316352055582, "learning_rate": 4.350832260370778e-05, "loss": 0.1307, "step": 2580 }, { "epoch": 1.1742493175614195, "grad_norm": 0.5044712990971124, "learning_rate": 4.350351769473198e-05, "loss": 0.0882, "step": 2581 }, { "epoch": 1.1747042766151046, "grad_norm": 0.6545835414099042, "learning_rate": 4.3498711273732264e-05, "loss": 0.1311, "step": 2582 }, { "epoch": 1.1751592356687899, "grad_norm": 0.6651383296131227, "learning_rate": 4.34939033411014e-05, "loss": 0.1458, "step": 2583 }, { "epoch": 1.175614194722475, "grad_norm": 2.4762785931385825, "learning_rate": 4.348909389723228e-05, "loss": 0.1268, "step": 2584 }, { "epoch": 1.17606915377616, "grad_norm": 1.0338854180757677, "learning_rate": 4.3484282942517905e-05, "loss": 0.1212, "step": 2585 }, { "epoch": 1.1765241128298454, "grad_norm": 0.5934402869235662, "learning_rate": 4.34794704773514e-05, "loss": 0.0848, "step": 2586 }, { "epoch": 1.1769790718835305, "grad_norm": 0.7268491749833524, "learning_rate": 4.3474656502126015e-05, "loss": 0.1608, "step": 2587 }, { "epoch": 1.1774340309372158, "grad_norm": 0.6734300862193695, "learning_rate": 4.3469841017235136e-05, "loss": 0.1216, "step": 2588 }, { "epoch": 1.1778889899909009, "grad_norm": 1.245129887710001, "learning_rate": 4.346502402307225e-05, "loss": 0.2029, "step": 2589 }, { "epoch": 1.178343949044586, "grad_norm": 0.5964592062368135, "learning_rate": 4.346020552003101e-05, "loss": 0.0977, "step": 2590 }, { "epoch": 1.1787989080982713, "grad_norm": 1.1738681073967419, "learning_rate": 4.345538550850512e-05, "loss": 0.1637, "step": 2591 }, { "epoch": 1.1792538671519563, "grad_norm": 0.8186110309610523, "learning_rate": 4.345056398888847e-05, "loss": 0.1151, "step": 2592 }, { "epoch": 1.1797088262056414, "grad_norm": 3.299163803112887, "learning_rate": 4.344574096157506e-05, "loss": 0.0953, "step": 2593 }, { "epoch": 1.1801637852593267, "grad_norm": 0.652369842039476, "learning_rate": 4.3440916426959e-05, "loss": 0.1134, "step": 2594 }, { "epoch": 1.1806187443130118, "grad_norm": 0.6345593588600449, "learning_rate": 4.3436090385434525e-05, "loss": 0.1226, "step": 2595 }, { "epoch": 1.181073703366697, "grad_norm": 0.565069962380645, "learning_rate": 4.3431262837396e-05, "loss": 0.1124, "step": 2596 }, { "epoch": 1.1815286624203822, "grad_norm": 0.6285280815591272, "learning_rate": 4.342643378323791e-05, "loss": 0.0972, "step": 2597 }, { "epoch": 1.1819836214740673, "grad_norm": 0.5443473240603252, "learning_rate": 4.342160322335487e-05, "loss": 0.0816, "step": 2598 }, { "epoch": 1.1824385805277524, "grad_norm": 0.8644259975635772, "learning_rate": 4.34167711581416e-05, "loss": 0.1446, "step": 2599 }, { "epoch": 1.1828935395814377, "grad_norm": 0.6517739022421992, "learning_rate": 4.341193758799296e-05, "loss": 0.1138, "step": 2600 }, { "epoch": 1.1833484986351228, "grad_norm": 0.5171926450265705, "learning_rate": 4.340710251330393e-05, "loss": 0.0919, "step": 2601 }, { "epoch": 1.183803457688808, "grad_norm": 0.7287969862644994, "learning_rate": 4.3402265934469604e-05, "loss": 0.1395, "step": 2602 }, { "epoch": 1.1842584167424932, "grad_norm": 0.8455088424977499, "learning_rate": 4.339742785188521e-05, "loss": 0.1343, "step": 2603 }, { "epoch": 1.1847133757961783, "grad_norm": 0.6401792849442558, "learning_rate": 4.33925882659461e-05, "loss": 0.0802, "step": 2604 }, { "epoch": 1.1851683348498634, "grad_norm": 0.6006209370946619, "learning_rate": 4.338774717704774e-05, "loss": 0.0718, "step": 2605 }, { "epoch": 1.1856232939035487, "grad_norm": 0.7749634486020501, "learning_rate": 4.338290458558572e-05, "loss": 0.2066, "step": 2606 }, { "epoch": 1.1860782529572338, "grad_norm": 0.743885578092865, "learning_rate": 4.337806049195574e-05, "loss": 0.1443, "step": 2607 }, { "epoch": 1.186533212010919, "grad_norm": 0.8606389251246873, "learning_rate": 4.337321489655366e-05, "loss": 0.1782, "step": 2608 }, { "epoch": 1.1869881710646042, "grad_norm": 0.6580659321897124, "learning_rate": 4.336836779977543e-05, "loss": 0.1229, "step": 2609 }, { "epoch": 1.1874431301182893, "grad_norm": 0.5936142208114212, "learning_rate": 4.336351920201713e-05, "loss": 0.1368, "step": 2610 }, { "epoch": 1.1878980891719746, "grad_norm": 0.5831704618459245, "learning_rate": 4.335866910367497e-05, "loss": 0.08, "step": 2611 }, { "epoch": 1.1883530482256597, "grad_norm": 1.0622356525305887, "learning_rate": 4.335381750514529e-05, "loss": 0.1725, "step": 2612 }, { "epoch": 1.1888080072793448, "grad_norm": 0.7961615996321623, "learning_rate": 4.334896440682452e-05, "loss": 0.0943, "step": 2613 }, { "epoch": 1.18926296633303, "grad_norm": 0.9988542913997885, "learning_rate": 4.334410980910924e-05, "loss": 0.1811, "step": 2614 }, { "epoch": 1.1897179253867152, "grad_norm": 0.6402080095515631, "learning_rate": 4.333925371239615e-05, "loss": 0.128, "step": 2615 }, { "epoch": 1.1901728844404005, "grad_norm": 0.6102087010421632, "learning_rate": 4.333439611708206e-05, "loss": 0.1317, "step": 2616 }, { "epoch": 1.1906278434940856, "grad_norm": 0.7341101417256162, "learning_rate": 4.332953702356393e-05, "loss": 0.1847, "step": 2617 }, { "epoch": 1.1910828025477707, "grad_norm": 0.5494955846308348, "learning_rate": 4.3324676432238795e-05, "loss": 0.0843, "step": 2618 }, { "epoch": 1.191537761601456, "grad_norm": 1.5268869802668048, "learning_rate": 4.331981434350387e-05, "loss": 0.2707, "step": 2619 }, { "epoch": 1.191992720655141, "grad_norm": 0.5488736944390338, "learning_rate": 4.331495075775644e-05, "loss": 0.1727, "step": 2620 }, { "epoch": 1.1924476797088261, "grad_norm": 0.6219657303741966, "learning_rate": 4.331008567539395e-05, "loss": 0.0903, "step": 2621 }, { "epoch": 1.1929026387625115, "grad_norm": 0.5394159014259298, "learning_rate": 4.330521909681394e-05, "loss": 0.0964, "step": 2622 }, { "epoch": 1.1933575978161965, "grad_norm": 0.7011896436552927, "learning_rate": 4.3300351022414087e-05, "loss": 0.1431, "step": 2623 }, { "epoch": 1.1938125568698816, "grad_norm": 1.4612613287183236, "learning_rate": 4.32954814525922e-05, "loss": 0.0947, "step": 2624 }, { "epoch": 1.194267515923567, "grad_norm": 0.6451123519382151, "learning_rate": 4.329061038774619e-05, "loss": 0.1045, "step": 2625 }, { "epoch": 1.194722474977252, "grad_norm": 0.7110180979255467, "learning_rate": 4.32857378282741e-05, "loss": 0.1035, "step": 2626 }, { "epoch": 1.1951774340309371, "grad_norm": 0.6149768022144073, "learning_rate": 4.328086377457409e-05, "loss": 0.0958, "step": 2627 }, { "epoch": 1.1956323930846224, "grad_norm": 0.7417887345549831, "learning_rate": 4.327598822704444e-05, "loss": 0.1577, "step": 2628 }, { "epoch": 1.1960873521383075, "grad_norm": 0.7739008806367439, "learning_rate": 4.327111118608357e-05, "loss": 0.1989, "step": 2629 }, { "epoch": 1.1965423111919926, "grad_norm": 0.637188293166282, "learning_rate": 4.326623265209001e-05, "loss": 0.1006, "step": 2630 }, { "epoch": 1.196997270245678, "grad_norm": 0.7914790130599995, "learning_rate": 4.3261352625462404e-05, "loss": 0.0886, "step": 2631 }, { "epoch": 1.197452229299363, "grad_norm": 0.6334605467755683, "learning_rate": 4.325647110659953e-05, "loss": 0.1528, "step": 2632 }, { "epoch": 1.197907188353048, "grad_norm": 0.7862210034514856, "learning_rate": 4.325158809590029e-05, "loss": 0.1071, "step": 2633 }, { "epoch": 1.1983621474067334, "grad_norm": 0.3957830000627573, "learning_rate": 4.324670359376368e-05, "loss": 0.0568, "step": 2634 }, { "epoch": 1.1988171064604185, "grad_norm": 0.6851166750666957, "learning_rate": 4.3241817600588863e-05, "loss": 0.1046, "step": 2635 }, { "epoch": 1.1992720655141038, "grad_norm": 0.4912410142485056, "learning_rate": 4.323693011677509e-05, "loss": 0.0772, "step": 2636 }, { "epoch": 1.199727024567789, "grad_norm": 0.7448640482015604, "learning_rate": 4.3232041142721746e-05, "loss": 0.1829, "step": 2637 }, { "epoch": 1.200181983621474, "grad_norm": 0.7150890819398116, "learning_rate": 4.322715067882833e-05, "loss": 0.1316, "step": 2638 }, { "epoch": 1.2006369426751593, "grad_norm": 0.6077967510354548, "learning_rate": 4.322225872549448e-05, "loss": 0.1116, "step": 2639 }, { "epoch": 1.2010919017288444, "grad_norm": 0.7424865866424795, "learning_rate": 4.321736528311993e-05, "loss": 0.1484, "step": 2640 }, { "epoch": 1.2015468607825295, "grad_norm": 0.6474110326950687, "learning_rate": 4.321247035210456e-05, "loss": 0.0952, "step": 2641 }, { "epoch": 1.2020018198362148, "grad_norm": 0.7034529673460805, "learning_rate": 4.3207573932848365e-05, "loss": 0.1087, "step": 2642 }, { "epoch": 1.2024567788898999, "grad_norm": 0.5605776389797877, "learning_rate": 4.3202676025751455e-05, "loss": 0.0769, "step": 2643 }, { "epoch": 1.2029117379435852, "grad_norm": 0.6037685984575554, "learning_rate": 4.319777663121406e-05, "loss": 0.1171, "step": 2644 }, { "epoch": 1.2033666969972703, "grad_norm": 0.9054892928817779, "learning_rate": 4.319287574963653e-05, "loss": 0.179, "step": 2645 }, { "epoch": 1.2038216560509554, "grad_norm": 0.5991244800733359, "learning_rate": 4.318797338141936e-05, "loss": 0.1006, "step": 2646 }, { "epoch": 1.2042766151046407, "grad_norm": 0.4868186453993439, "learning_rate": 4.3183069526963135e-05, "loss": 0.1033, "step": 2647 }, { "epoch": 1.2047315741583258, "grad_norm": 0.5840414512383596, "learning_rate": 4.317816418666859e-05, "loss": 0.0816, "step": 2648 }, { "epoch": 1.2051865332120109, "grad_norm": 0.6463269340681866, "learning_rate": 4.317325736093656e-05, "loss": 0.1113, "step": 2649 }, { "epoch": 1.2056414922656962, "grad_norm": 0.4658157249890115, "learning_rate": 4.3168349050168005e-05, "loss": 0.1002, "step": 2650 }, { "epoch": 1.2060964513193813, "grad_norm": 0.6736602754769699, "learning_rate": 4.316343925476402e-05, "loss": 0.0983, "step": 2651 }, { "epoch": 1.2065514103730663, "grad_norm": 0.7469167609928444, "learning_rate": 4.3158527975125793e-05, "loss": 0.1323, "step": 2652 }, { "epoch": 1.2070063694267517, "grad_norm": 0.5254544641734525, "learning_rate": 4.315361521165467e-05, "loss": 0.0563, "step": 2653 }, { "epoch": 1.2074613284804367, "grad_norm": 0.9480543803941438, "learning_rate": 4.3148700964752084e-05, "loss": 0.1512, "step": 2654 }, { "epoch": 1.2079162875341218, "grad_norm": 0.7468491254785992, "learning_rate": 4.314378523481962e-05, "loss": 0.14, "step": 2655 }, { "epoch": 1.2083712465878071, "grad_norm": 0.6547626011778394, "learning_rate": 4.313886802225897e-05, "loss": 0.1022, "step": 2656 }, { "epoch": 1.2088262056414922, "grad_norm": 0.6844420873183268, "learning_rate": 4.3133949327471934e-05, "loss": 0.1476, "step": 2657 }, { "epoch": 1.2092811646951773, "grad_norm": 0.7597996255581633, "learning_rate": 4.312902915086046e-05, "loss": 0.1185, "step": 2658 }, { "epoch": 1.2097361237488626, "grad_norm": 0.5593557323808429, "learning_rate": 4.312410749282658e-05, "loss": 0.1108, "step": 2659 }, { "epoch": 1.2101910828025477, "grad_norm": 0.6526546836876365, "learning_rate": 4.311918435377249e-05, "loss": 0.1184, "step": 2660 }, { "epoch": 1.210646041856233, "grad_norm": 0.6218049172291933, "learning_rate": 4.311425973410047e-05, "loss": 0.1671, "step": 2661 }, { "epoch": 1.2111010009099181, "grad_norm": 0.6592206577715823, "learning_rate": 4.310933363421296e-05, "loss": 0.1171, "step": 2662 }, { "epoch": 1.2115559599636032, "grad_norm": 0.8004719721598931, "learning_rate": 4.310440605451248e-05, "loss": 0.1376, "step": 2663 }, { "epoch": 1.2120109190172885, "grad_norm": 1.1359531222728543, "learning_rate": 4.30994769954017e-05, "loss": 0.1722, "step": 2664 }, { "epoch": 1.2124658780709736, "grad_norm": 1.06718900203593, "learning_rate": 4.309454645728339e-05, "loss": 0.1864, "step": 2665 }, { "epoch": 1.2129208371246587, "grad_norm": 0.6550340239781399, "learning_rate": 4.3089614440560464e-05, "loss": 0.0726, "step": 2666 }, { "epoch": 1.213375796178344, "grad_norm": 0.6737456697254435, "learning_rate": 4.308468094563594e-05, "loss": 0.1664, "step": 2667 }, { "epoch": 1.213830755232029, "grad_norm": 0.7989503244343559, "learning_rate": 4.3079745972912956e-05, "loss": 0.1581, "step": 2668 }, { "epoch": 1.2142857142857142, "grad_norm": 0.8240736362763148, "learning_rate": 4.307480952279478e-05, "loss": 0.1018, "step": 2669 }, { "epoch": 1.2147406733393995, "grad_norm": 0.6467615406432587, "learning_rate": 4.306987159568479e-05, "loss": 0.1256, "step": 2670 }, { "epoch": 1.2151956323930846, "grad_norm": 0.5565372650772141, "learning_rate": 4.30649321919865e-05, "loss": 0.1077, "step": 2671 }, { "epoch": 1.21565059144677, "grad_norm": 0.5970165645890977, "learning_rate": 4.305999131210353e-05, "loss": 0.1145, "step": 2672 }, { "epoch": 1.216105550500455, "grad_norm": 0.7721781033143867, "learning_rate": 4.305504895643963e-05, "loss": 0.1218, "step": 2673 }, { "epoch": 1.21656050955414, "grad_norm": 0.5287870260828984, "learning_rate": 4.3050105125398664e-05, "loss": 0.1248, "step": 2674 }, { "epoch": 1.2170154686078254, "grad_norm": 0.677508566928539, "learning_rate": 4.304515981938462e-05, "loss": 0.083, "step": 2675 }, { "epoch": 1.2174704276615105, "grad_norm": 0.5933506850166768, "learning_rate": 4.304021303880161e-05, "loss": 0.115, "step": 2676 }, { "epoch": 1.2179253867151956, "grad_norm": 0.5517066597696805, "learning_rate": 4.303526478405385e-05, "loss": 0.1208, "step": 2677 }, { "epoch": 1.2183803457688809, "grad_norm": 0.5821896293024265, "learning_rate": 4.303031505554571e-05, "loss": 0.1347, "step": 2678 }, { "epoch": 1.218835304822566, "grad_norm": 0.6399453939175587, "learning_rate": 4.302536385368164e-05, "loss": 0.1102, "step": 2679 }, { "epoch": 1.219290263876251, "grad_norm": 0.5526591188636323, "learning_rate": 4.302041117886624e-05, "loss": 0.0703, "step": 2680 }, { "epoch": 1.2197452229299364, "grad_norm": 0.5361210344789754, "learning_rate": 4.301545703150422e-05, "loss": 0.0808, "step": 2681 }, { "epoch": 1.2202001819836215, "grad_norm": 0.7283489625281305, "learning_rate": 4.301050141200041e-05, "loss": 0.1252, "step": 2682 }, { "epoch": 1.2206551410373065, "grad_norm": 0.7581284214675921, "learning_rate": 4.3005544320759756e-05, "loss": 0.0938, "step": 2683 }, { "epoch": 1.2211101000909919, "grad_norm": 0.7393446056618634, "learning_rate": 4.300058575818733e-05, "loss": 0.1347, "step": 2684 }, { "epoch": 1.221565059144677, "grad_norm": 0.7157442165039828, "learning_rate": 4.299562572468833e-05, "loss": 0.1445, "step": 2685 }, { "epoch": 1.222020018198362, "grad_norm": 0.714926011080975, "learning_rate": 4.2990664220668064e-05, "loss": 0.0738, "step": 2686 }, { "epoch": 1.2224749772520473, "grad_norm": 0.6340042851197536, "learning_rate": 4.298570124653196e-05, "loss": 0.1362, "step": 2687 }, { "epoch": 1.2229299363057324, "grad_norm": 0.7581019072368538, "learning_rate": 4.2980736802685574e-05, "loss": 0.0851, "step": 2688 }, { "epoch": 1.2233848953594177, "grad_norm": 0.875924683946305, "learning_rate": 4.297577088953458e-05, "loss": 0.1231, "step": 2689 }, { "epoch": 1.2238398544131028, "grad_norm": 0.46525529536236676, "learning_rate": 4.297080350748476e-05, "loss": 0.0798, "step": 2690 }, { "epoch": 1.224294813466788, "grad_norm": 0.6383052265431863, "learning_rate": 4.296583465694204e-05, "loss": 0.1091, "step": 2691 }, { "epoch": 1.2247497725204732, "grad_norm": 0.804231076042413, "learning_rate": 4.296086433831243e-05, "loss": 0.1346, "step": 2692 }, { "epoch": 1.2252047315741583, "grad_norm": 0.5840599662534331, "learning_rate": 4.295589255200212e-05, "loss": 0.0998, "step": 2693 }, { "epoch": 1.2256596906278434, "grad_norm": 1.0282539933623867, "learning_rate": 4.295091929841734e-05, "loss": 0.1405, "step": 2694 }, { "epoch": 1.2261146496815287, "grad_norm": 0.6143899620777007, "learning_rate": 4.2945944577964514e-05, "loss": 0.2031, "step": 2695 }, { "epoch": 1.2265696087352138, "grad_norm": 0.5621524293115948, "learning_rate": 4.294096839105013e-05, "loss": 0.0858, "step": 2696 }, { "epoch": 1.2270245677888991, "grad_norm": 0.5917521350589942, "learning_rate": 4.293599073808083e-05, "loss": 0.1041, "step": 2697 }, { "epoch": 1.2274795268425842, "grad_norm": 0.8503436920214672, "learning_rate": 4.2931011619463366e-05, "loss": 0.1029, "step": 2698 }, { "epoch": 1.2279344858962693, "grad_norm": 0.650484262166286, "learning_rate": 4.292603103560462e-05, "loss": 0.0876, "step": 2699 }, { "epoch": 1.2283894449499546, "grad_norm": 0.512682415853728, "learning_rate": 4.292104898691156e-05, "loss": 0.1171, "step": 2700 }, { "epoch": 1.2288444040036397, "grad_norm": 0.7572839602157379, "learning_rate": 4.2916065473791306e-05, "loss": 0.1261, "step": 2701 }, { "epoch": 1.2292993630573248, "grad_norm": 1.0452418687607465, "learning_rate": 4.291108049665109e-05, "loss": 0.2178, "step": 2702 }, { "epoch": 1.22975432211101, "grad_norm": 0.5592452715170244, "learning_rate": 4.290609405589827e-05, "loss": 0.0808, "step": 2703 }, { "epoch": 1.2302092811646952, "grad_norm": 0.5955285647390792, "learning_rate": 4.2901106151940294e-05, "loss": 0.1505, "step": 2704 }, { "epoch": 1.2306642402183803, "grad_norm": 0.7042913102565671, "learning_rate": 4.289611678518478e-05, "loss": 0.1193, "step": 2705 }, { "epoch": 1.2311191992720656, "grad_norm": 0.7349675101641922, "learning_rate": 4.289112595603941e-05, "loss": 0.1686, "step": 2706 }, { "epoch": 1.2315741583257507, "grad_norm": 0.7420884980252541, "learning_rate": 4.288613366491202e-05, "loss": 0.1187, "step": 2707 }, { "epoch": 1.2320291173794358, "grad_norm": 0.7302741772593933, "learning_rate": 4.2881139912210566e-05, "loss": 0.1305, "step": 2708 }, { "epoch": 1.232484076433121, "grad_norm": 0.5996725214014331, "learning_rate": 4.287614469834311e-05, "loss": 0.1085, "step": 2709 }, { "epoch": 1.2329390354868062, "grad_norm": 0.6621437361138447, "learning_rate": 4.287114802371783e-05, "loss": 0.1237, "step": 2710 }, { "epoch": 1.2333939945404913, "grad_norm": 0.6867421412077639, "learning_rate": 4.2866149888743045e-05, "loss": 0.132, "step": 2711 }, { "epoch": 1.2338489535941766, "grad_norm": 0.6916490380297293, "learning_rate": 4.286115029382717e-05, "loss": 0.166, "step": 2712 }, { "epoch": 1.2343039126478617, "grad_norm": 1.2131054139529431, "learning_rate": 4.285614923937876e-05, "loss": 0.1616, "step": 2713 }, { "epoch": 1.2347588717015467, "grad_norm": 0.8723905482785432, "learning_rate": 4.2851146725806466e-05, "loss": 0.0971, "step": 2714 }, { "epoch": 1.235213830755232, "grad_norm": 0.7305209396804292, "learning_rate": 4.284614275351908e-05, "loss": 0.1535, "step": 2715 }, { "epoch": 1.2356687898089171, "grad_norm": 0.6556629234790382, "learning_rate": 4.28411373229255e-05, "loss": 0.1393, "step": 2716 }, { "epoch": 1.2361237488626025, "grad_norm": 0.7351099447902546, "learning_rate": 4.283613043443474e-05, "loss": 0.0981, "step": 2717 }, { "epoch": 1.2365787079162875, "grad_norm": 0.5040201221671352, "learning_rate": 4.283112208845595e-05, "loss": 0.0733, "step": 2718 }, { "epoch": 1.2370336669699726, "grad_norm": 0.6828214263080464, "learning_rate": 4.282611228539839e-05, "loss": 0.1683, "step": 2719 }, { "epoch": 1.237488626023658, "grad_norm": 0.4539789169035856, "learning_rate": 4.2821101025671446e-05, "loss": 0.0667, "step": 2720 }, { "epoch": 1.237943585077343, "grad_norm": 0.616072573402778, "learning_rate": 4.2816088309684595e-05, "loss": 0.086, "step": 2721 }, { "epoch": 1.2383985441310281, "grad_norm": 0.6521712966291419, "learning_rate": 4.2811074137847466e-05, "loss": 0.1087, "step": 2722 }, { "epoch": 1.2388535031847134, "grad_norm": 0.5622169306494461, "learning_rate": 4.28060585105698e-05, "loss": 0.1352, "step": 2723 }, { "epoch": 1.2393084622383985, "grad_norm": 0.998333521845532, "learning_rate": 4.2801041428261435e-05, "loss": 0.1302, "step": 2724 }, { "epoch": 1.2397634212920838, "grad_norm": 1.0946576039657885, "learning_rate": 4.279602289133235e-05, "loss": 0.2481, "step": 2725 }, { "epoch": 1.240218380345769, "grad_norm": 0.5691381770740301, "learning_rate": 4.279100290019265e-05, "loss": 0.0847, "step": 2726 }, { "epoch": 1.240673339399454, "grad_norm": 0.5859206979099922, "learning_rate": 4.2785981455252536e-05, "loss": 0.1474, "step": 2727 }, { "epoch": 1.2411282984531393, "grad_norm": 0.7129086547838894, "learning_rate": 4.278095855692233e-05, "loss": 0.1333, "step": 2728 }, { "epoch": 1.2415832575068244, "grad_norm": 0.7664154530936227, "learning_rate": 4.277593420561249e-05, "loss": 0.0737, "step": 2729 }, { "epoch": 1.2420382165605095, "grad_norm": 0.6284950698055043, "learning_rate": 4.277090840173359e-05, "loss": 0.0868, "step": 2730 }, { "epoch": 1.2424931756141948, "grad_norm": 0.7390620737566158, "learning_rate": 4.2765881145696306e-05, "loss": 0.1497, "step": 2731 }, { "epoch": 1.24294813466788, "grad_norm": 0.5831801486628019, "learning_rate": 4.2760852437911434e-05, "loss": 0.0965, "step": 2732 }, { "epoch": 1.243403093721565, "grad_norm": 0.6017514579858171, "learning_rate": 4.2755822278789926e-05, "loss": 0.0872, "step": 2733 }, { "epoch": 1.2438580527752503, "grad_norm": 1.0807083225835552, "learning_rate": 4.275079066874279e-05, "loss": 0.1249, "step": 2734 }, { "epoch": 1.2443130118289354, "grad_norm": 0.5609361126631663, "learning_rate": 4.274575760818121e-05, "loss": 0.1353, "step": 2735 }, { "epoch": 1.2447679708826205, "grad_norm": 0.789595931106956, "learning_rate": 4.2740723097516455e-05, "loss": 0.1021, "step": 2736 }, { "epoch": 1.2452229299363058, "grad_norm": 0.5946455861065726, "learning_rate": 4.273568713715993e-05, "loss": 0.1456, "step": 2737 }, { "epoch": 1.2456778889899909, "grad_norm": 1.1527375940071025, "learning_rate": 4.2730649727523145e-05, "loss": 0.1253, "step": 2738 }, { "epoch": 1.246132848043676, "grad_norm": 0.7398191443369819, "learning_rate": 4.272561086901773e-05, "loss": 0.1148, "step": 2739 }, { "epoch": 1.2465878070973613, "grad_norm": 0.807396731384125, "learning_rate": 4.272057056205544e-05, "loss": 0.1248, "step": 2740 }, { "epoch": 1.2470427661510464, "grad_norm": 0.7891137992692318, "learning_rate": 4.271552880704816e-05, "loss": 0.1232, "step": 2741 }, { "epoch": 1.2474977252047315, "grad_norm": 0.6721094519463059, "learning_rate": 4.271048560440786e-05, "loss": 0.1208, "step": 2742 }, { "epoch": 1.2479526842584168, "grad_norm": 0.821649490639464, "learning_rate": 4.270544095454666e-05, "loss": 0.1662, "step": 2743 }, { "epoch": 1.2484076433121019, "grad_norm": 0.8992318454548489, "learning_rate": 4.270039485787678e-05, "loss": 0.1381, "step": 2744 }, { "epoch": 1.2488626023657872, "grad_norm": 0.6663082629706552, "learning_rate": 4.269534731481057e-05, "loss": 0.1546, "step": 2745 }, { "epoch": 1.2493175614194723, "grad_norm": 1.18467973358046, "learning_rate": 4.269029832576048e-05, "loss": 0.1018, "step": 2746 }, { "epoch": 1.2497725204731573, "grad_norm": 0.6735949221408182, "learning_rate": 4.2685247891139116e-05, "loss": 0.109, "step": 2747 }, { "epoch": 1.2502274795268427, "grad_norm": 0.5468702214599221, "learning_rate": 4.268019601135914e-05, "loss": 0.0866, "step": 2748 }, { "epoch": 1.2506824385805277, "grad_norm": 0.7323528051324587, "learning_rate": 4.2675142686833405e-05, "loss": 0.0715, "step": 2749 }, { "epoch": 1.251137397634213, "grad_norm": 0.581977762856333, "learning_rate": 4.2670087917974824e-05, "loss": 0.1083, "step": 2750 }, { "epoch": 1.2515923566878981, "grad_norm": 0.8115897991000579, "learning_rate": 4.266503170519646e-05, "loss": 0.1129, "step": 2751 }, { "epoch": 1.2520473157415832, "grad_norm": 0.705298119668317, "learning_rate": 4.2659974048911474e-05, "loss": 0.12, "step": 2752 }, { "epoch": 1.2525022747952685, "grad_norm": 0.8479065124545331, "learning_rate": 4.265491494953316e-05, "loss": 0.1216, "step": 2753 }, { "epoch": 1.2529572338489536, "grad_norm": 0.6871316741748481, "learning_rate": 4.2649854407474924e-05, "loss": 0.1674, "step": 2754 }, { "epoch": 1.2534121929026387, "grad_norm": 0.6145407311772955, "learning_rate": 4.26447924231503e-05, "loss": 0.0705, "step": 2755 }, { "epoch": 1.253867151956324, "grad_norm": 0.5621149953105964, "learning_rate": 4.263972899697292e-05, "loss": 0.114, "step": 2756 }, { "epoch": 1.2543221110100091, "grad_norm": 0.4830585967072514, "learning_rate": 4.2634664129356536e-05, "loss": 0.1241, "step": 2757 }, { "epoch": 1.2547770700636942, "grad_norm": 0.6806400938044078, "learning_rate": 4.262959782071505e-05, "loss": 0.085, "step": 2758 }, { "epoch": 1.2552320291173795, "grad_norm": 0.697965224456859, "learning_rate": 4.2624530071462435e-05, "loss": 0.1466, "step": 2759 }, { "epoch": 1.2556869881710646, "grad_norm": 0.7493410416084858, "learning_rate": 4.261946088201282e-05, "loss": 0.0782, "step": 2760 }, { "epoch": 1.2561419472247497, "grad_norm": 0.7579762096846328, "learning_rate": 4.261439025278043e-05, "loss": 0.1177, "step": 2761 }, { "epoch": 1.256596906278435, "grad_norm": 1.056571424566734, "learning_rate": 4.260931818417961e-05, "loss": 0.1031, "step": 2762 }, { "epoch": 1.25705186533212, "grad_norm": 0.5470408182926436, "learning_rate": 4.260424467662484e-05, "loss": 0.0949, "step": 2763 }, { "epoch": 1.2575068243858052, "grad_norm": 0.7037747989133635, "learning_rate": 4.259916973053069e-05, "loss": 0.1478, "step": 2764 }, { "epoch": 1.2579617834394905, "grad_norm": 0.5632963889298429, "learning_rate": 4.259409334631187e-05, "loss": 0.1266, "step": 2765 }, { "epoch": 1.2584167424931756, "grad_norm": 0.7317764956897098, "learning_rate": 4.258901552438319e-05, "loss": 0.1435, "step": 2766 }, { "epoch": 1.2588717015468607, "grad_norm": 0.5391632219035042, "learning_rate": 4.2583936265159594e-05, "loss": 0.1166, "step": 2767 }, { "epoch": 1.259326660600546, "grad_norm": 0.8889792008611366, "learning_rate": 4.257885556905613e-05, "loss": 0.1298, "step": 2768 }, { "epoch": 1.259781619654231, "grad_norm": 0.7559694059597198, "learning_rate": 4.2573773436487985e-05, "loss": 0.111, "step": 2769 }, { "epoch": 1.2602365787079162, "grad_norm": 0.5842738747086464, "learning_rate": 4.256868986787044e-05, "loss": 0.1076, "step": 2770 }, { "epoch": 1.2606915377616015, "grad_norm": 0.9332925711964637, "learning_rate": 4.256360486361889e-05, "loss": 0.1212, "step": 2771 }, { "epoch": 1.2611464968152866, "grad_norm": 0.7115802265463957, "learning_rate": 4.255851842414886e-05, "loss": 0.0906, "step": 2772 }, { "epoch": 1.2616014558689717, "grad_norm": 0.7537128597950845, "learning_rate": 4.255343054987601e-05, "loss": 0.1576, "step": 2773 }, { "epoch": 1.262056414922657, "grad_norm": 0.5743398045268376, "learning_rate": 4.2548341241216084e-05, "loss": 0.0827, "step": 2774 }, { "epoch": 1.262511373976342, "grad_norm": 0.7025763847120244, "learning_rate": 4.254325049858495e-05, "loss": 0.1269, "step": 2775 }, { "epoch": 1.2629663330300274, "grad_norm": 0.3970584469131486, "learning_rate": 4.2538158322398626e-05, "loss": 0.0912, "step": 2776 }, { "epoch": 1.2634212920837125, "grad_norm": 0.4801182346649101, "learning_rate": 4.253306471307319e-05, "loss": 0.0946, "step": 2777 }, { "epoch": 1.2638762511373978, "grad_norm": 0.5581218545967146, "learning_rate": 4.252796967102489e-05, "loss": 0.0959, "step": 2778 }, { "epoch": 1.2643312101910829, "grad_norm": 0.48983382644387785, "learning_rate": 4.2522873196670065e-05, "loss": 0.084, "step": 2779 }, { "epoch": 1.264786169244768, "grad_norm": 0.6557626199285048, "learning_rate": 4.2517775290425175e-05, "loss": 0.0994, "step": 2780 }, { "epoch": 1.2652411282984533, "grad_norm": 0.7270618926357125, "learning_rate": 4.25126759527068e-05, "loss": 0.1246, "step": 2781 }, { "epoch": 1.2656960873521383, "grad_norm": 0.8018983602476695, "learning_rate": 4.250757518393163e-05, "loss": 0.1439, "step": 2782 }, { "epoch": 1.2661510464058234, "grad_norm": 1.0122917751097398, "learning_rate": 4.250247298451648e-05, "loss": 0.1216, "step": 2783 }, { "epoch": 1.2666060054595087, "grad_norm": 0.5836847840223519, "learning_rate": 4.249736935487828e-05, "loss": 0.0821, "step": 2784 }, { "epoch": 1.2670609645131938, "grad_norm": 0.5864421855771743, "learning_rate": 4.2492264295434075e-05, "loss": 0.1427, "step": 2785 }, { "epoch": 1.267515923566879, "grad_norm": 0.6729475619590193, "learning_rate": 4.248715780660102e-05, "loss": 0.137, "step": 2786 }, { "epoch": 1.2679708826205642, "grad_norm": 0.8997461516423765, "learning_rate": 4.2482049888796406e-05, "loss": 0.162, "step": 2787 }, { "epoch": 1.2684258416742493, "grad_norm": 0.6523852256362883, "learning_rate": 4.247694054243762e-05, "loss": 0.0914, "step": 2788 }, { "epoch": 1.2688808007279344, "grad_norm": 0.5884437166900801, "learning_rate": 4.2471829767942176e-05, "loss": 0.0889, "step": 2789 }, { "epoch": 1.2693357597816197, "grad_norm": 0.6745770835822151, "learning_rate": 4.246671756572771e-05, "loss": 0.0744, "step": 2790 }, { "epoch": 1.2697907188353048, "grad_norm": 0.721988948687989, "learning_rate": 4.2461603936211966e-05, "loss": 0.1817, "step": 2791 }, { "epoch": 1.27024567788899, "grad_norm": 0.7103561849385529, "learning_rate": 4.24564888798128e-05, "loss": 0.1224, "step": 2792 }, { "epoch": 1.2707006369426752, "grad_norm": 0.7274455692182416, "learning_rate": 4.2451372396948196e-05, "loss": 0.1024, "step": 2793 }, { "epoch": 1.2711555959963603, "grad_norm": 0.6045875335827648, "learning_rate": 4.244625448803625e-05, "loss": 0.1047, "step": 2794 }, { "epoch": 1.2716105550500454, "grad_norm": 0.7358928526768396, "learning_rate": 4.244113515349517e-05, "loss": 0.1449, "step": 2795 }, { "epoch": 1.2720655141037307, "grad_norm": 0.667986758732061, "learning_rate": 4.2436014393743296e-05, "loss": 0.1046, "step": 2796 }, { "epoch": 1.2725204731574158, "grad_norm": 0.6755997905850931, "learning_rate": 4.243089220919906e-05, "loss": 0.1389, "step": 2797 }, { "epoch": 1.2729754322111009, "grad_norm": 1.8209296456454536, "learning_rate": 4.242576860028103e-05, "loss": 0.1914, "step": 2798 }, { "epoch": 1.2734303912647862, "grad_norm": 0.8559448167213666, "learning_rate": 4.2420643567407886e-05, "loss": 0.1519, "step": 2799 }, { "epoch": 1.2738853503184713, "grad_norm": 0.6988222035132733, "learning_rate": 4.241551711099842e-05, "loss": 0.1277, "step": 2800 }, { "epoch": 1.2743403093721566, "grad_norm": 0.8899555023995173, "learning_rate": 4.241038923147154e-05, "loss": 0.0965, "step": 2801 }, { "epoch": 1.2747952684258417, "grad_norm": 0.636046626976844, "learning_rate": 4.240525992924629e-05, "loss": 0.0874, "step": 2802 }, { "epoch": 1.2752502274795268, "grad_norm": 0.5619651980703508, "learning_rate": 4.240012920474178e-05, "loss": 0.1124, "step": 2803 }, { "epoch": 1.275705186533212, "grad_norm": 0.7349133349801116, "learning_rate": 4.239499705837731e-05, "loss": 0.1362, "step": 2804 }, { "epoch": 1.2761601455868972, "grad_norm": 0.8357592585859318, "learning_rate": 4.238986349057222e-05, "loss": 0.094, "step": 2805 }, { "epoch": 1.2766151046405825, "grad_norm": 0.7643382869396901, "learning_rate": 4.238472850174603e-05, "loss": 0.1271, "step": 2806 }, { "epoch": 1.2770700636942676, "grad_norm": 0.7301225168131261, "learning_rate": 4.237959209231832e-05, "loss": 0.1305, "step": 2807 }, { "epoch": 1.2775250227479527, "grad_norm": 0.7771799105147376, "learning_rate": 4.237445426270884e-05, "loss": 0.188, "step": 2808 }, { "epoch": 1.277979981801638, "grad_norm": 0.5943410928463245, "learning_rate": 4.236931501333742e-05, "loss": 0.1528, "step": 2809 }, { "epoch": 1.278434940855323, "grad_norm": 0.6615581607143649, "learning_rate": 4.2364174344624014e-05, "loss": 0.1211, "step": 2810 }, { "epoch": 1.2788898999090081, "grad_norm": 1.0153625297788724, "learning_rate": 4.23590322569887e-05, "loss": 0.1451, "step": 2811 }, { "epoch": 1.2793448589626935, "grad_norm": 0.5756395814224274, "learning_rate": 4.235388875085165e-05, "loss": 0.1301, "step": 2812 }, { "epoch": 1.2797998180163785, "grad_norm": 0.9063715448308006, "learning_rate": 4.2348743826633196e-05, "loss": 0.1393, "step": 2813 }, { "epoch": 1.2802547770700636, "grad_norm": 1.0622773191634087, "learning_rate": 4.234359748475374e-05, "loss": 0.2176, "step": 2814 }, { "epoch": 1.280709736123749, "grad_norm": 0.9007827454215946, "learning_rate": 4.233844972563382e-05, "loss": 0.1315, "step": 2815 }, { "epoch": 1.281164695177434, "grad_norm": 0.6699931004574149, "learning_rate": 4.233330054969409e-05, "loss": 0.1686, "step": 2816 }, { "epoch": 1.2816196542311191, "grad_norm": 0.7164471763591254, "learning_rate": 4.23281499573553e-05, "loss": 0.1051, "step": 2817 }, { "epoch": 1.2820746132848044, "grad_norm": 0.7645879782954921, "learning_rate": 4.232299794903837e-05, "loss": 0.1471, "step": 2818 }, { "epoch": 1.2825295723384895, "grad_norm": 0.7188416490444823, "learning_rate": 4.2317844525164266e-05, "loss": 0.1363, "step": 2819 }, { "epoch": 1.2829845313921746, "grad_norm": 0.7902417673939018, "learning_rate": 4.231268968615412e-05, "loss": 0.1283, "step": 2820 }, { "epoch": 1.28343949044586, "grad_norm": 0.6831557650099505, "learning_rate": 4.230753343242915e-05, "loss": 0.1373, "step": 2821 }, { "epoch": 1.283894449499545, "grad_norm": 0.4632221117262363, "learning_rate": 4.2302375764410706e-05, "loss": 0.0749, "step": 2822 }, { "epoch": 1.28434940855323, "grad_norm": 0.5594143970282535, "learning_rate": 4.229721668252026e-05, "loss": 0.1408, "step": 2823 }, { "epoch": 1.2848043676069154, "grad_norm": 0.6582848302752689, "learning_rate": 4.2292056187179374e-05, "loss": 0.1072, "step": 2824 }, { "epoch": 1.2852593266606005, "grad_norm": 0.564964900545591, "learning_rate": 4.2286894278809745e-05, "loss": 0.0813, "step": 2825 }, { "epoch": 1.2857142857142856, "grad_norm": 0.507619402363043, "learning_rate": 4.2281730957833186e-05, "loss": 0.0765, "step": 2826 }, { "epoch": 1.286169244767971, "grad_norm": 0.5886165787940576, "learning_rate": 4.227656622467162e-05, "loss": 0.1183, "step": 2827 }, { "epoch": 1.286624203821656, "grad_norm": 0.8865620634163152, "learning_rate": 4.227140007974708e-05, "loss": 0.1454, "step": 2828 }, { "epoch": 1.2870791628753413, "grad_norm": 0.6754816166132465, "learning_rate": 4.2266232523481716e-05, "loss": 0.1328, "step": 2829 }, { "epoch": 1.2875341219290264, "grad_norm": 0.7467933267821145, "learning_rate": 4.226106355629781e-05, "loss": 0.1014, "step": 2830 }, { "epoch": 1.2879890809827115, "grad_norm": 0.7931981141608265, "learning_rate": 4.2255893178617745e-05, "loss": 0.1272, "step": 2831 }, { "epoch": 1.2884440400363968, "grad_norm": 0.74354957618501, "learning_rate": 4.2250721390864013e-05, "loss": 0.1066, "step": 2832 }, { "epoch": 1.2888989990900819, "grad_norm": 0.521740771268616, "learning_rate": 4.224554819345923e-05, "loss": 0.105, "step": 2833 }, { "epoch": 1.2893539581437672, "grad_norm": 0.9361069397230757, "learning_rate": 4.224037358682613e-05, "loss": 0.1171, "step": 2834 }, { "epoch": 1.2898089171974523, "grad_norm": 0.5758669371540409, "learning_rate": 4.223519757138755e-05, "loss": 0.0958, "step": 2835 }, { "epoch": 1.2902638762511374, "grad_norm": 0.6515231558756156, "learning_rate": 4.223002014756647e-05, "loss": 0.1042, "step": 2836 }, { "epoch": 1.2907188353048227, "grad_norm": 0.7559909714798301, "learning_rate": 4.2224841315785946e-05, "loss": 0.1257, "step": 2837 }, { "epoch": 1.2911737943585078, "grad_norm": 0.786602574912791, "learning_rate": 4.221966107646918e-05, "loss": 0.0897, "step": 2838 }, { "epoch": 1.2916287534121929, "grad_norm": 2.2907736435447474, "learning_rate": 4.221447943003947e-05, "loss": 0.1012, "step": 2839 }, { "epoch": 1.2920837124658782, "grad_norm": 0.5696717088932503, "learning_rate": 4.220929637692025e-05, "loss": 0.1292, "step": 2840 }, { "epoch": 1.2925386715195633, "grad_norm": 0.5894769811310329, "learning_rate": 4.220411191753504e-05, "loss": 0.1068, "step": 2841 }, { "epoch": 1.2929936305732483, "grad_norm": 0.7351395248316225, "learning_rate": 4.2198926052307494e-05, "loss": 0.1048, "step": 2842 }, { "epoch": 1.2934485896269337, "grad_norm": 0.6317976930801152, "learning_rate": 4.219373878166138e-05, "loss": 0.1703, "step": 2843 }, { "epoch": 1.2939035486806187, "grad_norm": 0.7554516081947528, "learning_rate": 4.2188550106020594e-05, "loss": 0.1309, "step": 2844 }, { "epoch": 1.2943585077343038, "grad_norm": 0.6502916139894431, "learning_rate": 4.21833600258091e-05, "loss": 0.1169, "step": 2845 }, { "epoch": 1.2948134667879891, "grad_norm": 0.8550521130922989, "learning_rate": 4.217816854145103e-05, "loss": 0.1352, "step": 2846 }, { "epoch": 1.2952684258416742, "grad_norm": 0.5345535797055011, "learning_rate": 4.21729756533706e-05, "loss": 0.0665, "step": 2847 }, { "epoch": 1.2957233848953593, "grad_norm": 0.6894413184951362, "learning_rate": 4.2167781361992155e-05, "loss": 0.162, "step": 2848 }, { "epoch": 1.2961783439490446, "grad_norm": 1.0374796223308687, "learning_rate": 4.216258566774014e-05, "loss": 0.2155, "step": 2849 }, { "epoch": 1.2966333030027297, "grad_norm": 0.5218605337985679, "learning_rate": 4.2157388571039145e-05, "loss": 0.1143, "step": 2850 }, { "epoch": 1.2970882620564148, "grad_norm": 0.6669454728904876, "learning_rate": 4.2152190072313824e-05, "loss": 0.1123, "step": 2851 }, { "epoch": 1.2975432211101001, "grad_norm": 1.0172113497631046, "learning_rate": 4.214699017198899e-05, "loss": 0.1616, "step": 2852 }, { "epoch": 1.2979981801637852, "grad_norm": 0.5827513100640702, "learning_rate": 4.2141788870489564e-05, "loss": 0.1458, "step": 2853 }, { "epoch": 1.2984531392174703, "grad_norm": 0.904819089294418, "learning_rate": 4.213658616824056e-05, "loss": 0.1166, "step": 2854 }, { "epoch": 1.2989080982711556, "grad_norm": 0.9815149270475607, "learning_rate": 4.2131382065667116e-05, "loss": 0.2072, "step": 2855 }, { "epoch": 1.2993630573248407, "grad_norm": 0.5699606578796995, "learning_rate": 4.2126176563194495e-05, "loss": 0.1042, "step": 2856 }, { "epoch": 1.299818016378526, "grad_norm": 0.5910658915853825, "learning_rate": 4.212096966124807e-05, "loss": 0.1074, "step": 2857 }, { "epoch": 1.300272975432211, "grad_norm": 0.9053018459508856, "learning_rate": 4.211576136025332e-05, "loss": 0.1867, "step": 2858 }, { "epoch": 1.3007279344858962, "grad_norm": 0.727267837157171, "learning_rate": 4.211055166063584e-05, "loss": 0.1833, "step": 2859 }, { "epoch": 1.3011828935395815, "grad_norm": 0.6196562238354426, "learning_rate": 4.210534056282136e-05, "loss": 0.096, "step": 2860 }, { "epoch": 1.3016378525932666, "grad_norm": 0.5052723334388877, "learning_rate": 4.21001280672357e-05, "loss": 0.1264, "step": 2861 }, { "epoch": 1.302092811646952, "grad_norm": 0.6348367193143003, "learning_rate": 4.2094914174304785e-05, "loss": 0.1062, "step": 2862 }, { "epoch": 1.302547770700637, "grad_norm": 0.7493872663028557, "learning_rate": 4.2089698884454686e-05, "loss": 0.1339, "step": 2863 }, { "epoch": 1.303002729754322, "grad_norm": 1.424538036412926, "learning_rate": 4.208448219811157e-05, "loss": 0.218, "step": 2864 }, { "epoch": 1.3034576888080074, "grad_norm": 0.5537170894213298, "learning_rate": 4.207926411570172e-05, "loss": 0.0855, "step": 2865 }, { "epoch": 1.3039126478616925, "grad_norm": 0.4902223174230951, "learning_rate": 4.2074044637651544e-05, "loss": 0.1003, "step": 2866 }, { "epoch": 1.3043676069153776, "grad_norm": 1.876472235943037, "learning_rate": 4.2068823764387545e-05, "loss": 0.2416, "step": 2867 }, { "epoch": 1.3048225659690629, "grad_norm": 0.7995045293400125, "learning_rate": 4.206360149633635e-05, "loss": 0.1517, "step": 2868 }, { "epoch": 1.305277525022748, "grad_norm": 0.7513773525254528, "learning_rate": 4.2058377833924686e-05, "loss": 0.1121, "step": 2869 }, { "epoch": 1.305732484076433, "grad_norm": 0.5866663419820946, "learning_rate": 4.205315277757943e-05, "loss": 0.1118, "step": 2870 }, { "epoch": 1.3061874431301184, "grad_norm": 0.8108964016921745, "learning_rate": 4.204792632772754e-05, "loss": 0.1706, "step": 2871 }, { "epoch": 1.3066424021838035, "grad_norm": 0.5550302144635557, "learning_rate": 4.204269848479611e-05, "loss": 0.1025, "step": 2872 }, { "epoch": 1.3070973612374885, "grad_norm": 1.1107701541482444, "learning_rate": 4.2037469249212306e-05, "loss": 0.1074, "step": 2873 }, { "epoch": 1.3075523202911739, "grad_norm": 0.6154864783342084, "learning_rate": 4.2032238621403466e-05, "loss": 0.1003, "step": 2874 }, { "epoch": 1.308007279344859, "grad_norm": 0.5344757286760073, "learning_rate": 4.2027006601797e-05, "loss": 0.1009, "step": 2875 }, { "epoch": 1.308462238398544, "grad_norm": 0.5959912196776225, "learning_rate": 4.202177319082045e-05, "loss": 0.1088, "step": 2876 }, { "epoch": 1.3089171974522293, "grad_norm": 0.6674634108113608, "learning_rate": 4.201653838890145e-05, "loss": 0.1497, "step": 2877 }, { "epoch": 1.3093721565059144, "grad_norm": 0.7401235764655458, "learning_rate": 4.2011302196467795e-05, "loss": 0.1108, "step": 2878 }, { "epoch": 1.3098271155595995, "grad_norm": 2.2830919563822802, "learning_rate": 4.200606461394735e-05, "loss": 0.2341, "step": 2879 }, { "epoch": 1.3102820746132848, "grad_norm": 0.6870710527880628, "learning_rate": 4.2000825641768087e-05, "loss": 0.1336, "step": 2880 }, { "epoch": 1.31073703366697, "grad_norm": 0.6208028509168081, "learning_rate": 4.199558528035814e-05, "loss": 0.1265, "step": 2881 }, { "epoch": 1.311191992720655, "grad_norm": 1.0426301685356183, "learning_rate": 4.199034353014571e-05, "loss": 0.1378, "step": 2882 }, { "epoch": 1.3116469517743403, "grad_norm": 0.7612555792698975, "learning_rate": 4.198510039155914e-05, "loss": 0.1336, "step": 2883 }, { "epoch": 1.3121019108280254, "grad_norm": 0.682745960508485, "learning_rate": 4.1979855865026866e-05, "loss": 0.1413, "step": 2884 }, { "epoch": 1.3125568698817107, "grad_norm": 0.7104509284737333, "learning_rate": 4.197460995097745e-05, "loss": 0.1398, "step": 2885 }, { "epoch": 1.3130118289353958, "grad_norm": 0.6826031773532562, "learning_rate": 4.1969362649839565e-05, "loss": 0.1698, "step": 2886 }, { "epoch": 1.3134667879890811, "grad_norm": 0.5536750470554721, "learning_rate": 4.196411396204199e-05, "loss": 0.0949, "step": 2887 }, { "epoch": 1.3139217470427662, "grad_norm": 0.8167836395807306, "learning_rate": 4.195886388801364e-05, "loss": 0.0772, "step": 2888 }, { "epoch": 1.3143767060964513, "grad_norm": 0.9447630668737573, "learning_rate": 4.1953612428183534e-05, "loss": 0.1083, "step": 2889 }, { "epoch": 1.3148316651501366, "grad_norm": 0.5831819136609305, "learning_rate": 4.194835958298076e-05, "loss": 0.1398, "step": 2890 }, { "epoch": 1.3152866242038217, "grad_norm": 1.0853008107240636, "learning_rate": 4.194310535283459e-05, "loss": 0.1273, "step": 2891 }, { "epoch": 1.3157415832575068, "grad_norm": 0.7498395308605064, "learning_rate": 4.1937849738174364e-05, "loss": 0.0976, "step": 2892 }, { "epoch": 1.316196542311192, "grad_norm": 0.6548256874463239, "learning_rate": 4.193259273942954e-05, "loss": 0.1288, "step": 2893 }, { "epoch": 1.3166515013648772, "grad_norm": 0.6548970326579058, "learning_rate": 4.192733435702971e-05, "loss": 0.1261, "step": 2894 }, { "epoch": 1.3171064604185623, "grad_norm": 1.1261190353198447, "learning_rate": 4.192207459140456e-05, "loss": 0.1386, "step": 2895 }, { "epoch": 1.3175614194722476, "grad_norm": 0.6457437373340709, "learning_rate": 4.191681344298389e-05, "loss": 0.0942, "step": 2896 }, { "epoch": 1.3180163785259327, "grad_norm": 0.6620344043565901, "learning_rate": 4.191155091219763e-05, "loss": 0.1357, "step": 2897 }, { "epoch": 1.3184713375796178, "grad_norm": 0.5967464312728041, "learning_rate": 4.190628699947579e-05, "loss": 0.1246, "step": 2898 }, { "epoch": 1.318926296633303, "grad_norm": 2.929074689664917, "learning_rate": 4.190102170524853e-05, "loss": 0.1921, "step": 2899 }, { "epoch": 1.3193812556869882, "grad_norm": 0.6791511991822728, "learning_rate": 4.18957550299461e-05, "loss": 0.128, "step": 2900 }, { "epoch": 1.3198362147406733, "grad_norm": 0.6719460856995868, "learning_rate": 4.1890486973998866e-05, "loss": 0.1385, "step": 2901 }, { "epoch": 1.3202911737943586, "grad_norm": 0.6062781882301137, "learning_rate": 4.1885217537837315e-05, "loss": 0.0836, "step": 2902 }, { "epoch": 1.3207461328480437, "grad_norm": 0.5797886572409322, "learning_rate": 4.187994672189205e-05, "loss": 0.1409, "step": 2903 }, { "epoch": 1.3212010919017287, "grad_norm": 0.991552141255984, "learning_rate": 4.187467452659376e-05, "loss": 0.1382, "step": 2904 }, { "epoch": 1.321656050955414, "grad_norm": 0.8299590240195782, "learning_rate": 4.186940095237327e-05, "loss": 0.1392, "step": 2905 }, { "epoch": 1.3221110100090991, "grad_norm": 0.5767125187738175, "learning_rate": 4.186412599966152e-05, "loss": 0.1183, "step": 2906 }, { "epoch": 1.3225659690627842, "grad_norm": 0.545945856518355, "learning_rate": 4.1858849668889545e-05, "loss": 0.0802, "step": 2907 }, { "epoch": 1.3230209281164695, "grad_norm": 0.8846674941675078, "learning_rate": 4.1853571960488514e-05, "loss": 0.1422, "step": 2908 }, { "epoch": 1.3234758871701546, "grad_norm": 0.5046987673489105, "learning_rate": 4.1848292874889694e-05, "loss": 0.0838, "step": 2909 }, { "epoch": 1.3239308462238397, "grad_norm": 0.6139947306568435, "learning_rate": 4.1843012412524465e-05, "loss": 0.0843, "step": 2910 }, { "epoch": 1.324385805277525, "grad_norm": 0.5474530873774468, "learning_rate": 4.1837730573824316e-05, "loss": 0.1032, "step": 2911 }, { "epoch": 1.3248407643312101, "grad_norm": 0.503352581236489, "learning_rate": 4.183244735922087e-05, "loss": 0.0992, "step": 2912 }, { "epoch": 1.3252957233848954, "grad_norm": 0.7041940179109856, "learning_rate": 4.182716276914584e-05, "loss": 0.1068, "step": 2913 }, { "epoch": 1.3257506824385805, "grad_norm": 0.8158453199293952, "learning_rate": 4.182187680403107e-05, "loss": 0.109, "step": 2914 }, { "epoch": 1.3262056414922658, "grad_norm": 0.6167903718485351, "learning_rate": 4.1816589464308474e-05, "loss": 0.1016, "step": 2915 }, { "epoch": 1.326660600545951, "grad_norm": 0.6514923163175018, "learning_rate": 4.181130075041014e-05, "loss": 0.1112, "step": 2916 }, { "epoch": 1.327115559599636, "grad_norm": 0.8489716455581969, "learning_rate": 4.1806010662768234e-05, "loss": 0.1136, "step": 2917 }, { "epoch": 1.3275705186533213, "grad_norm": 1.0542659063815762, "learning_rate": 4.180071920181503e-05, "loss": 0.1379, "step": 2918 }, { "epoch": 1.3280254777070064, "grad_norm": 0.6300826731499437, "learning_rate": 4.1795426367982914e-05, "loss": 0.0961, "step": 2919 }, { "epoch": 1.3284804367606915, "grad_norm": 0.7280570353792163, "learning_rate": 4.1790132161704416e-05, "loss": 0.1078, "step": 2920 }, { "epoch": 1.3289353958143768, "grad_norm": 0.611780726197178, "learning_rate": 4.1784836583412135e-05, "loss": 0.129, "step": 2921 }, { "epoch": 1.329390354868062, "grad_norm": 0.6299614621287858, "learning_rate": 4.17795396335388e-05, "loss": 0.1151, "step": 2922 }, { "epoch": 1.329845313921747, "grad_norm": 0.8966911090038843, "learning_rate": 4.177424131251727e-05, "loss": 0.1724, "step": 2923 }, { "epoch": 1.3303002729754323, "grad_norm": 0.736823882061507, "learning_rate": 4.17689416207805e-05, "loss": 0.0734, "step": 2924 }, { "epoch": 1.3307552320291174, "grad_norm": 0.7951524219687088, "learning_rate": 4.1763640558761544e-05, "loss": 0.126, "step": 2925 }, { "epoch": 1.3312101910828025, "grad_norm": 0.7005318472475044, "learning_rate": 4.175833812689357e-05, "loss": 0.135, "step": 2926 }, { "epoch": 1.3316651501364878, "grad_norm": 0.5797943781661393, "learning_rate": 4.17530343256099e-05, "loss": 0.1085, "step": 2927 }, { "epoch": 1.3321201091901729, "grad_norm": 1.4361257397665288, "learning_rate": 4.174772915534392e-05, "loss": 0.2256, "step": 2928 }, { "epoch": 1.332575068243858, "grad_norm": 0.7414128326574033, "learning_rate": 4.1742422616529135e-05, "loss": 0.1326, "step": 2929 }, { "epoch": 1.3330300272975433, "grad_norm": 0.6248252510529683, "learning_rate": 4.173711470959919e-05, "loss": 0.1005, "step": 2930 }, { "epoch": 1.3334849863512284, "grad_norm": 1.3057860391873353, "learning_rate": 4.173180543498781e-05, "loss": 0.2325, "step": 2931 }, { "epoch": 1.3339399454049135, "grad_norm": 0.6982587371041312, "learning_rate": 4.172649479312886e-05, "loss": 0.0816, "step": 2932 }, { "epoch": 1.3343949044585988, "grad_norm": 0.8382181615133929, "learning_rate": 4.172118278445629e-05, "loss": 0.0904, "step": 2933 }, { "epoch": 1.3348498635122839, "grad_norm": 0.6143659510098456, "learning_rate": 4.1715869409404166e-05, "loss": 0.1573, "step": 2934 }, { "epoch": 1.335304822565969, "grad_norm": 0.5948310393673898, "learning_rate": 4.171055466840669e-05, "loss": 0.1059, "step": 2935 }, { "epoch": 1.3357597816196543, "grad_norm": 0.6677964145228075, "learning_rate": 4.170523856189814e-05, "loss": 0.1512, "step": 2936 }, { "epoch": 1.3362147406733393, "grad_norm": 0.5296240356342853, "learning_rate": 4.169992109031295e-05, "loss": 0.0823, "step": 2937 }, { "epoch": 1.3366696997270244, "grad_norm": 0.5884013938332541, "learning_rate": 4.1694602254085616e-05, "loss": 0.0919, "step": 2938 }, { "epoch": 1.3371246587807097, "grad_norm": 0.9214633039317147, "learning_rate": 4.168928205365078e-05, "loss": 0.1582, "step": 2939 }, { "epoch": 1.3375796178343948, "grad_norm": 0.6507646638284043, "learning_rate": 4.168396048944318e-05, "loss": 0.1102, "step": 2940 }, { "epoch": 1.3380345768880801, "grad_norm": 0.6577372667004274, "learning_rate": 4.167863756189767e-05, "loss": 0.1441, "step": 2941 }, { "epoch": 1.3384895359417652, "grad_norm": 0.5054916615027351, "learning_rate": 4.167331327144923e-05, "loss": 0.0804, "step": 2942 }, { "epoch": 1.3389444949954505, "grad_norm": 0.5755024188951866, "learning_rate": 4.1667987618532914e-05, "loss": 0.123, "step": 2943 }, { "epoch": 1.3393994540491356, "grad_norm": 0.4961525020770999, "learning_rate": 4.166266060358393e-05, "loss": 0.1338, "step": 2944 }, { "epoch": 1.3398544131028207, "grad_norm": 0.6594610960711816, "learning_rate": 4.1657332227037565e-05, "loss": 0.1524, "step": 2945 }, { "epoch": 1.340309372156506, "grad_norm": 0.7553151646716542, "learning_rate": 4.165200248932923e-05, "loss": 0.1637, "step": 2946 }, { "epoch": 1.3407643312101911, "grad_norm": 0.7134825112588269, "learning_rate": 4.1646671390894466e-05, "loss": 0.122, "step": 2947 }, { "epoch": 1.3412192902638762, "grad_norm": 0.5615477563232011, "learning_rate": 4.164133893216888e-05, "loss": 0.0713, "step": 2948 }, { "epoch": 1.3416742493175615, "grad_norm": 0.5481067429134988, "learning_rate": 4.163600511358823e-05, "loss": 0.1086, "step": 2949 }, { "epoch": 1.3421292083712466, "grad_norm": 0.5556922212449436, "learning_rate": 4.163066993558837e-05, "loss": 0.0973, "step": 2950 }, { "epoch": 1.3425841674249317, "grad_norm": 0.4783594638445936, "learning_rate": 4.162533339860527e-05, "loss": 0.0869, "step": 2951 }, { "epoch": 1.343039126478617, "grad_norm": 0.7023592624988556, "learning_rate": 4.1619995503074996e-05, "loss": 0.1098, "step": 2952 }, { "epoch": 1.343494085532302, "grad_norm": 0.6742627161712805, "learning_rate": 4.161465624943375e-05, "loss": 0.0974, "step": 2953 }, { "epoch": 1.3439490445859872, "grad_norm": 0.49532030083245465, "learning_rate": 4.160931563811782e-05, "loss": 0.0963, "step": 2954 }, { "epoch": 1.3444040036396725, "grad_norm": 0.6805932454251685, "learning_rate": 4.160397366956364e-05, "loss": 0.1254, "step": 2955 }, { "epoch": 1.3448589626933576, "grad_norm": 1.4847584361221913, "learning_rate": 4.15986303442077e-05, "loss": 0.1825, "step": 2956 }, { "epoch": 1.3453139217470427, "grad_norm": 0.6316954794732007, "learning_rate": 4.1593285662486646e-05, "loss": 0.114, "step": 2957 }, { "epoch": 1.345768880800728, "grad_norm": 0.8717549415958851, "learning_rate": 4.158793962483723e-05, "loss": 0.1398, "step": 2958 }, { "epoch": 1.346223839854413, "grad_norm": 0.7665882848110539, "learning_rate": 4.15825922316963e-05, "loss": 0.1224, "step": 2959 }, { "epoch": 1.3466787989080982, "grad_norm": 0.9212661700384108, "learning_rate": 4.15772434835008e-05, "loss": 0.141, "step": 2960 }, { "epoch": 1.3471337579617835, "grad_norm": 0.7621615733566135, "learning_rate": 4.157189338068784e-05, "loss": 0.1224, "step": 2961 }, { "epoch": 1.3475887170154686, "grad_norm": 0.5342391704811869, "learning_rate": 4.1566541923694594e-05, "loss": 0.1036, "step": 2962 }, { "epoch": 1.3480436760691537, "grad_norm": 0.6966674885202696, "learning_rate": 4.1561189112958357e-05, "loss": 0.0907, "step": 2963 }, { "epoch": 1.348498635122839, "grad_norm": 0.6362960344768629, "learning_rate": 4.1555834948916516e-05, "loss": 0.1562, "step": 2964 }, { "epoch": 1.348953594176524, "grad_norm": 0.6387787290035614, "learning_rate": 4.155047943200663e-05, "loss": 0.0838, "step": 2965 }, { "epoch": 1.3494085532302094, "grad_norm": 0.49979988104448186, "learning_rate": 4.15451225626663e-05, "loss": 0.0681, "step": 2966 }, { "epoch": 1.3498635122838945, "grad_norm": 1.0796907854603606, "learning_rate": 4.1539764341333264e-05, "loss": 0.1372, "step": 2967 }, { "epoch": 1.3503184713375795, "grad_norm": 0.5709065586143283, "learning_rate": 4.153440476844539e-05, "loss": 0.0906, "step": 2968 }, { "epoch": 1.3507734303912649, "grad_norm": 0.8012053302023932, "learning_rate": 4.152904384444062e-05, "loss": 0.1226, "step": 2969 }, { "epoch": 1.35122838944495, "grad_norm": 0.6455714768052097, "learning_rate": 4.1523681569757035e-05, "loss": 0.1489, "step": 2970 }, { "epoch": 1.3516833484986353, "grad_norm": 0.5998166291634722, "learning_rate": 4.1518317944832806e-05, "loss": 0.1163, "step": 2971 }, { "epoch": 1.3521383075523203, "grad_norm": 0.6061756215692133, "learning_rate": 4.1512952970106234e-05, "loss": 0.1179, "step": 2972 }, { "epoch": 1.3525932666060054, "grad_norm": 0.5548563527915419, "learning_rate": 4.150758664601572e-05, "loss": 0.1805, "step": 2973 }, { "epoch": 1.3530482256596907, "grad_norm": 0.537801729364992, "learning_rate": 4.150221897299977e-05, "loss": 0.1842, "step": 2974 }, { "epoch": 1.3535031847133758, "grad_norm": 0.8261099924271932, "learning_rate": 4.1496849951497005e-05, "loss": 0.1129, "step": 2975 }, { "epoch": 1.353958143767061, "grad_norm": 0.712092501901451, "learning_rate": 4.149147958194617e-05, "loss": 0.1016, "step": 2976 }, { "epoch": 1.3544131028207462, "grad_norm": 0.7496595981265164, "learning_rate": 4.1486107864786095e-05, "loss": 0.1067, "step": 2977 }, { "epoch": 1.3548680618744313, "grad_norm": 0.8395914037056401, "learning_rate": 4.1480734800455734e-05, "loss": 0.1146, "step": 2978 }, { "epoch": 1.3553230209281164, "grad_norm": 0.7793408672013444, "learning_rate": 4.1475360389394154e-05, "loss": 0.1102, "step": 2979 }, { "epoch": 1.3557779799818017, "grad_norm": 0.547465098049914, "learning_rate": 4.1469984632040526e-05, "loss": 0.1008, "step": 2980 }, { "epoch": 1.3562329390354868, "grad_norm": 0.5878297068548402, "learning_rate": 4.1464607528834126e-05, "loss": 0.1186, "step": 2981 }, { "epoch": 1.356687898089172, "grad_norm": 0.6466079599011247, "learning_rate": 4.1459229080214356e-05, "loss": 0.1159, "step": 2982 }, { "epoch": 1.3571428571428572, "grad_norm": 0.44956538596793444, "learning_rate": 4.1453849286620716e-05, "loss": 0.0945, "step": 2983 }, { "epoch": 1.3575978161965423, "grad_norm": 0.574902805219109, "learning_rate": 4.144846814849282e-05, "loss": 0.1105, "step": 2984 }, { "epoch": 1.3580527752502274, "grad_norm": 0.5954776992233206, "learning_rate": 4.144308566627038e-05, "loss": 0.0884, "step": 2985 }, { "epoch": 1.3585077343039127, "grad_norm": 0.698368124501691, "learning_rate": 4.143770184039324e-05, "loss": 0.107, "step": 2986 }, { "epoch": 1.3589626933575978, "grad_norm": 0.5574347341598207, "learning_rate": 4.143231667130134e-05, "loss": 0.0887, "step": 2987 }, { "epoch": 1.3594176524112829, "grad_norm": 0.5216404100419536, "learning_rate": 4.142693015943472e-05, "loss": 0.1068, "step": 2988 }, { "epoch": 1.3598726114649682, "grad_norm": 0.7568059562270393, "learning_rate": 4.1421542305233565e-05, "loss": 0.1452, "step": 2989 }, { "epoch": 1.3603275705186533, "grad_norm": 0.8706458805056028, "learning_rate": 4.1416153109138125e-05, "loss": 0.1647, "step": 2990 }, { "epoch": 1.3607825295723384, "grad_norm": 0.5305515760800668, "learning_rate": 4.141076257158878e-05, "loss": 0.0755, "step": 2991 }, { "epoch": 1.3612374886260237, "grad_norm": 0.6027017839017346, "learning_rate": 4.140537069302603e-05, "loss": 0.1136, "step": 2992 }, { "epoch": 1.3616924476797088, "grad_norm": 0.6359226439480484, "learning_rate": 4.1399977473890486e-05, "loss": 0.094, "step": 2993 }, { "epoch": 1.362147406733394, "grad_norm": 0.6171882887537021, "learning_rate": 4.139458291462283e-05, "loss": 0.1038, "step": 2994 }, { "epoch": 1.3626023657870792, "grad_norm": 0.7361296374492355, "learning_rate": 4.13891870156639e-05, "loss": 0.1248, "step": 2995 }, { "epoch": 1.3630573248407643, "grad_norm": 0.4461875102128264, "learning_rate": 4.138378977745462e-05, "loss": 0.0811, "step": 2996 }, { "epoch": 1.3635122838944496, "grad_norm": 0.6141010644557094, "learning_rate": 4.137839120043603e-05, "loss": 0.1237, "step": 2997 }, { "epoch": 1.3639672429481347, "grad_norm": 0.8108301207599925, "learning_rate": 4.137299128504928e-05, "loss": 0.1073, "step": 2998 }, { "epoch": 1.36442220200182, "grad_norm": 0.7003659090643755, "learning_rate": 4.136759003173561e-05, "loss": 0.087, "step": 2999 }, { "epoch": 1.364877161055505, "grad_norm": 0.6548262334887451, "learning_rate": 4.1362187440936404e-05, "loss": 0.1071, "step": 3000 }, { "epoch": 1.3653321201091901, "grad_norm": 0.5675269482617771, "learning_rate": 4.135678351309313e-05, "loss": 0.1377, "step": 3001 }, { "epoch": 1.3657870791628755, "grad_norm": 0.719105846017161, "learning_rate": 4.1351378248647374e-05, "loss": 0.1363, "step": 3002 }, { "epoch": 1.3662420382165605, "grad_norm": 0.5774368298361787, "learning_rate": 4.134597164804084e-05, "loss": 0.117, "step": 3003 }, { "epoch": 1.3666969972702456, "grad_norm": 0.5556362815065823, "learning_rate": 4.134056371171531e-05, "loss": 0.1159, "step": 3004 }, { "epoch": 1.367151956323931, "grad_norm": 0.6793528065231036, "learning_rate": 4.133515444011271e-05, "loss": 0.1154, "step": 3005 }, { "epoch": 1.367606915377616, "grad_norm": 0.6081119486380929, "learning_rate": 4.132974383367505e-05, "loss": 0.0881, "step": 3006 }, { "epoch": 1.3680618744313011, "grad_norm": 0.7156851481225148, "learning_rate": 4.132433189284448e-05, "loss": 0.1332, "step": 3007 }, { "epoch": 1.3685168334849864, "grad_norm": 0.9040097755837111, "learning_rate": 4.131891861806322e-05, "loss": 0.1339, "step": 3008 }, { "epoch": 1.3689717925386715, "grad_norm": 0.5684579373174153, "learning_rate": 4.131350400977363e-05, "loss": 0.0936, "step": 3009 }, { "epoch": 1.3694267515923566, "grad_norm": 0.6748162088310881, "learning_rate": 4.1308088068418164e-05, "loss": 0.1265, "step": 3010 }, { "epoch": 1.369881710646042, "grad_norm": 0.7753151693260233, "learning_rate": 4.130267079443938e-05, "loss": 0.2108, "step": 3011 }, { "epoch": 1.370336669699727, "grad_norm": 0.6402839829621945, "learning_rate": 4.129725218827997e-05, "loss": 0.1146, "step": 3012 }, { "epoch": 1.370791628753412, "grad_norm": 0.5796015189097802, "learning_rate": 4.1291832250382706e-05, "loss": 0.1118, "step": 3013 }, { "epoch": 1.3712465878070974, "grad_norm": 0.8433910894061148, "learning_rate": 4.128641098119048e-05, "loss": 0.1397, "step": 3014 }, { "epoch": 1.3717015468607825, "grad_norm": 0.5394833188263843, "learning_rate": 4.128098838114631e-05, "loss": 0.1428, "step": 3015 }, { "epoch": 1.3721565059144676, "grad_norm": 0.6082802534488235, "learning_rate": 4.1275564450693285e-05, "loss": 0.1284, "step": 3016 }, { "epoch": 1.372611464968153, "grad_norm": 0.8007036533827185, "learning_rate": 4.1270139190274625e-05, "loss": 0.1093, "step": 3017 }, { "epoch": 1.373066424021838, "grad_norm": 0.5198270666163424, "learning_rate": 4.126471260033368e-05, "loss": 0.0846, "step": 3018 }, { "epoch": 1.373521383075523, "grad_norm": 0.6118879928187145, "learning_rate": 4.125928468131387e-05, "loss": 0.1404, "step": 3019 }, { "epoch": 1.3739763421292084, "grad_norm": 0.6604422662501428, "learning_rate": 4.125385543365873e-05, "loss": 0.1781, "step": 3020 }, { "epoch": 1.3744313011828935, "grad_norm": 1.1409201521570944, "learning_rate": 4.124842485781193e-05, "loss": 0.1235, "step": 3021 }, { "epoch": 1.3748862602365788, "grad_norm": 0.6172918659452711, "learning_rate": 4.124299295421724e-05, "loss": 0.1212, "step": 3022 }, { "epoch": 1.3753412192902639, "grad_norm": 0.6508895542930899, "learning_rate": 4.123755972331851e-05, "loss": 0.114, "step": 3023 }, { "epoch": 1.3757961783439492, "grad_norm": 0.768562933902743, "learning_rate": 4.123212516555972e-05, "loss": 0.1276, "step": 3024 }, { "epoch": 1.3762511373976343, "grad_norm": 0.946738956684474, "learning_rate": 4.122668928138498e-05, "loss": 0.084, "step": 3025 }, { "epoch": 1.3767060964513194, "grad_norm": 0.5811002400760628, "learning_rate": 4.122125207123846e-05, "loss": 0.1469, "step": 3026 }, { "epoch": 1.3771610555050047, "grad_norm": 0.7140129648672559, "learning_rate": 4.1215813535564474e-05, "loss": 0.1504, "step": 3027 }, { "epoch": 1.3776160145586898, "grad_norm": 0.4744260305859922, "learning_rate": 4.1210373674807446e-05, "loss": 0.0993, "step": 3028 }, { "epoch": 1.3780709736123748, "grad_norm": 0.6893281320279329, "learning_rate": 4.120493248941188e-05, "loss": 0.1112, "step": 3029 }, { "epoch": 1.3785259326660602, "grad_norm": 0.5101077829133549, "learning_rate": 4.119948997982241e-05, "loss": 0.089, "step": 3030 }, { "epoch": 1.3789808917197452, "grad_norm": 0.8624659105536161, "learning_rate": 4.1194046146483775e-05, "loss": 0.1544, "step": 3031 }, { "epoch": 1.3794358507734303, "grad_norm": 0.9313865095031963, "learning_rate": 4.118860098984083e-05, "loss": 0.1899, "step": 3032 }, { "epoch": 1.3798908098271156, "grad_norm": 0.7255309916326554, "learning_rate": 4.118315451033851e-05, "loss": 0.1337, "step": 3033 }, { "epoch": 1.3803457688808007, "grad_norm": 0.8519148871744132, "learning_rate": 4.117770670842189e-05, "loss": 0.1194, "step": 3034 }, { "epoch": 1.3808007279344858, "grad_norm": 0.7184549522460003, "learning_rate": 4.117225758453613e-05, "loss": 0.0966, "step": 3035 }, { "epoch": 1.3812556869881711, "grad_norm": 0.6178040141672766, "learning_rate": 4.116680713912652e-05, "loss": 0.0978, "step": 3036 }, { "epoch": 1.3817106460418562, "grad_norm": 0.48048046735547145, "learning_rate": 4.1161355372638436e-05, "loss": 0.0952, "step": 3037 }, { "epoch": 1.3821656050955413, "grad_norm": 0.4772180495231209, "learning_rate": 4.115590228551738e-05, "loss": 0.0723, "step": 3038 }, { "epoch": 1.3826205641492266, "grad_norm": 0.6953981260988068, "learning_rate": 4.1150447878208944e-05, "loss": 0.1102, "step": 3039 }, { "epoch": 1.3830755232029117, "grad_norm": 0.6968173523356338, "learning_rate": 4.114499215115885e-05, "loss": 0.1282, "step": 3040 }, { "epoch": 1.3835304822565968, "grad_norm": 0.6263772698957264, "learning_rate": 4.113953510481289e-05, "loss": 0.1025, "step": 3041 }, { "epoch": 1.3839854413102821, "grad_norm": 0.5395147890776835, "learning_rate": 4.1134076739617024e-05, "loss": 0.089, "step": 3042 }, { "epoch": 1.3844404003639672, "grad_norm": 0.6139695391424208, "learning_rate": 4.112861705601726e-05, "loss": 0.1482, "step": 3043 }, { "epoch": 1.3848953594176523, "grad_norm": 0.5281899704413763, "learning_rate": 4.1123156054459745e-05, "loss": 0.0756, "step": 3044 }, { "epoch": 1.3853503184713376, "grad_norm": 0.5646068308364369, "learning_rate": 4.111769373539073e-05, "loss": 0.1441, "step": 3045 }, { "epoch": 1.3858052775250227, "grad_norm": 0.7469697178771106, "learning_rate": 4.111223009925657e-05, "loss": 0.1541, "step": 3046 }, { "epoch": 1.3862602365787078, "grad_norm": 0.6288226014720864, "learning_rate": 4.1106765146503735e-05, "loss": 0.134, "step": 3047 }, { "epoch": 1.386715195632393, "grad_norm": 0.6494184902976146, "learning_rate": 4.110129887757878e-05, "loss": 0.0885, "step": 3048 }, { "epoch": 1.3871701546860782, "grad_norm": 0.8194312013746352, "learning_rate": 4.10958312929284e-05, "loss": 0.1487, "step": 3049 }, { "epoch": 1.3876251137397635, "grad_norm": 0.7115281047369317, "learning_rate": 4.109036239299937e-05, "loss": 0.1568, "step": 3050 }, { "epoch": 1.3880800727934486, "grad_norm": 0.4755066885822269, "learning_rate": 4.108489217823859e-05, "loss": 0.0905, "step": 3051 }, { "epoch": 1.388535031847134, "grad_norm": 0.8463137125296744, "learning_rate": 4.107942064909306e-05, "loss": 0.1105, "step": 3052 }, { "epoch": 1.388989990900819, "grad_norm": 0.670745623021873, "learning_rate": 4.1073947806009897e-05, "loss": 0.0889, "step": 3053 }, { "epoch": 1.389444949954504, "grad_norm": 1.1040169083993254, "learning_rate": 4.10684736494363e-05, "loss": 0.2291, "step": 3054 }, { "epoch": 1.3898999090081894, "grad_norm": 0.8506169401737156, "learning_rate": 4.1062998179819603e-05, "loss": 0.1192, "step": 3055 }, { "epoch": 1.3903548680618745, "grad_norm": 0.6308026485245732, "learning_rate": 4.105752139760723e-05, "loss": 0.1174, "step": 3056 }, { "epoch": 1.3908098271155596, "grad_norm": 0.5199417780282442, "learning_rate": 4.105204330324673e-05, "loss": 0.116, "step": 3057 }, { "epoch": 1.3912647861692449, "grad_norm": 0.7348821108297482, "learning_rate": 4.1046563897185734e-05, "loss": 0.0817, "step": 3058 }, { "epoch": 1.39171974522293, "grad_norm": 0.6242151440672933, "learning_rate": 4.104108317987201e-05, "loss": 0.0873, "step": 3059 }, { "epoch": 1.392174704276615, "grad_norm": 0.6936840545160566, "learning_rate": 4.103560115175341e-05, "loss": 0.1719, "step": 3060 }, { "epoch": 1.3926296633303004, "grad_norm": 0.5739867806042289, "learning_rate": 4.103011781327789e-05, "loss": 0.0971, "step": 3061 }, { "epoch": 1.3930846223839854, "grad_norm": 0.5961639325361975, "learning_rate": 4.102463316489355e-05, "loss": 0.0979, "step": 3062 }, { "epoch": 1.3935395814376705, "grad_norm": 0.7576887369756035, "learning_rate": 4.1019147207048534e-05, "loss": 0.1665, "step": 3063 }, { "epoch": 1.3939945404913558, "grad_norm": 0.7527329681734916, "learning_rate": 4.101365994019116e-05, "loss": 0.1304, "step": 3064 }, { "epoch": 1.394449499545041, "grad_norm": 0.6432967319848361, "learning_rate": 4.100817136476982e-05, "loss": 0.1869, "step": 3065 }, { "epoch": 1.394904458598726, "grad_norm": 0.6294298093486607, "learning_rate": 4.1002681481232995e-05, "loss": 0.0757, "step": 3066 }, { "epoch": 1.3953594176524113, "grad_norm": 0.6396260531482296, "learning_rate": 4.099719029002932e-05, "loss": 0.0919, "step": 3067 }, { "epoch": 1.3958143767060964, "grad_norm": 0.8883436380468461, "learning_rate": 4.099169779160748e-05, "loss": 0.1836, "step": 3068 }, { "epoch": 1.3962693357597815, "grad_norm": 0.6100276353321656, "learning_rate": 4.0986203986416324e-05, "loss": 0.0783, "step": 3069 }, { "epoch": 1.3967242948134668, "grad_norm": 0.6354492254498514, "learning_rate": 4.0980708874904784e-05, "loss": 0.1257, "step": 3070 }, { "epoch": 1.397179253867152, "grad_norm": 0.6547971691110054, "learning_rate": 4.0975212457521865e-05, "loss": 0.0684, "step": 3071 }, { "epoch": 1.397634212920837, "grad_norm": 0.7115050156190534, "learning_rate": 4.096971473471674e-05, "loss": 0.076, "step": 3072 }, { "epoch": 1.3980891719745223, "grad_norm": 0.685533960113905, "learning_rate": 4.0964215706938635e-05, "loss": 0.1505, "step": 3073 }, { "epoch": 1.3985441310282074, "grad_norm": 0.7937477623323673, "learning_rate": 4.0958715374636925e-05, "loss": 0.1273, "step": 3074 }, { "epoch": 1.3989990900818925, "grad_norm": 0.7860300774339637, "learning_rate": 4.095321373826105e-05, "loss": 0.0821, "step": 3075 }, { "epoch": 1.3994540491355778, "grad_norm": 0.5574189265623309, "learning_rate": 4.094771079826061e-05, "loss": 0.116, "step": 3076 }, { "epoch": 1.399909008189263, "grad_norm": 0.6083866437821321, "learning_rate": 4.0942206555085246e-05, "loss": 0.115, "step": 3077 }, { "epoch": 1.4003639672429482, "grad_norm": 0.9609980391951334, "learning_rate": 4.093670100918477e-05, "loss": 0.186, "step": 3078 }, { "epoch": 1.4008189262966333, "grad_norm": 0.5812240690639653, "learning_rate": 4.0931194161009046e-05, "loss": 0.1165, "step": 3079 }, { "epoch": 1.4012738853503186, "grad_norm": 0.5518884028579658, "learning_rate": 4.092568601100809e-05, "loss": 0.0932, "step": 3080 }, { "epoch": 1.4017288444040037, "grad_norm": 0.7426605618187326, "learning_rate": 4.092017655963198e-05, "loss": 0.1177, "step": 3081 }, { "epoch": 1.4021838034576888, "grad_norm": 0.6995862653398903, "learning_rate": 4.0914665807330955e-05, "loss": 0.1229, "step": 3082 }, { "epoch": 1.402638762511374, "grad_norm": 0.5604687424126411, "learning_rate": 4.0909153754555295e-05, "loss": 0.1175, "step": 3083 }, { "epoch": 1.4030937215650592, "grad_norm": 0.6646420223174071, "learning_rate": 4.090364040175545e-05, "loss": 0.1214, "step": 3084 }, { "epoch": 1.4035486806187443, "grad_norm": 0.6051271783933619, "learning_rate": 4.0898125749381916e-05, "loss": 0.155, "step": 3085 }, { "epoch": 1.4040036396724296, "grad_norm": 0.4936184650873689, "learning_rate": 4.0892609797885345e-05, "loss": 0.0926, "step": 3086 }, { "epoch": 1.4044585987261147, "grad_norm": 0.6457658385080421, "learning_rate": 4.0887092547716476e-05, "loss": 0.1087, "step": 3087 }, { "epoch": 1.4049135577797998, "grad_norm": 0.6335286964863972, "learning_rate": 4.088157399932614e-05, "loss": 0.1065, "step": 3088 }, { "epoch": 1.405368516833485, "grad_norm": 0.46957599132845684, "learning_rate": 4.087605415316531e-05, "loss": 0.109, "step": 3089 }, { "epoch": 1.4058234758871702, "grad_norm": 0.6380153376940364, "learning_rate": 4.087053300968503e-05, "loss": 0.1207, "step": 3090 }, { "epoch": 1.4062784349408552, "grad_norm": 0.6475814134675122, "learning_rate": 4.086501056933645e-05, "loss": 0.1032, "step": 3091 }, { "epoch": 1.4067333939945406, "grad_norm": 0.8429391621407989, "learning_rate": 4.085948683257086e-05, "loss": 0.1277, "step": 3092 }, { "epoch": 1.4071883530482256, "grad_norm": 0.8659175538481514, "learning_rate": 4.085396179983963e-05, "loss": 0.1998, "step": 3093 }, { "epoch": 1.4076433121019107, "grad_norm": 0.765508539860875, "learning_rate": 4.0848435471594236e-05, "loss": 0.1414, "step": 3094 }, { "epoch": 1.408098271155596, "grad_norm": 0.7257848454881196, "learning_rate": 4.084290784828626e-05, "loss": 0.1282, "step": 3095 }, { "epoch": 1.4085532302092811, "grad_norm": 1.194533999146872, "learning_rate": 4.083737893036741e-05, "loss": 0.1019, "step": 3096 }, { "epoch": 1.4090081892629662, "grad_norm": 0.5435235689268562, "learning_rate": 4.0831848718289464e-05, "loss": 0.1509, "step": 3097 }, { "epoch": 1.4094631483166515, "grad_norm": 1.0838312924508156, "learning_rate": 4.082631721250434e-05, "loss": 0.0868, "step": 3098 }, { "epoch": 1.4099181073703366, "grad_norm": 0.6408339069453253, "learning_rate": 4.082078441346405e-05, "loss": 0.1133, "step": 3099 }, { "epoch": 1.4103730664240217, "grad_norm": 0.6479426111302613, "learning_rate": 4.08152503216207e-05, "loss": 0.1241, "step": 3100 }, { "epoch": 1.410828025477707, "grad_norm": 0.7918474055188695, "learning_rate": 4.0809714937426516e-05, "loss": 0.1054, "step": 3101 }, { "epoch": 1.4112829845313921, "grad_norm": 0.6771714545674453, "learning_rate": 4.0804178261333826e-05, "loss": 0.1395, "step": 3102 }, { "epoch": 1.4117379435850774, "grad_norm": 0.4960327377605718, "learning_rate": 4.079864029379506e-05, "loss": 0.1251, "step": 3103 }, { "epoch": 1.4121929026387625, "grad_norm": 0.6554709177996831, "learning_rate": 4.079310103526275e-05, "loss": 0.0821, "step": 3104 }, { "epoch": 1.4126478616924476, "grad_norm": 0.6895844968267966, "learning_rate": 4.0787560486189545e-05, "loss": 0.0919, "step": 3105 }, { "epoch": 1.413102820746133, "grad_norm": 0.6052756373331966, "learning_rate": 4.07820186470282e-05, "loss": 0.1154, "step": 3106 }, { "epoch": 1.413557779799818, "grad_norm": 0.6351751953665614, "learning_rate": 4.077647551823155e-05, "loss": 0.0801, "step": 3107 }, { "epoch": 1.4140127388535033, "grad_norm": 0.5912912102853896, "learning_rate": 4.0770931100252574e-05, "loss": 0.1066, "step": 3108 }, { "epoch": 1.4144676979071884, "grad_norm": 0.7024767760835764, "learning_rate": 4.076538539354433e-05, "loss": 0.1012, "step": 3109 }, { "epoch": 1.4149226569608735, "grad_norm": 0.7936110230287754, "learning_rate": 4.075983839855998e-05, "loss": 0.1286, "step": 3110 }, { "epoch": 1.4153776160145588, "grad_norm": 0.7029733990886241, "learning_rate": 4.0754290115752815e-05, "loss": 0.1845, "step": 3111 }, { "epoch": 1.415832575068244, "grad_norm": 0.5391237483909781, "learning_rate": 4.0748740545576206e-05, "loss": 0.0891, "step": 3112 }, { "epoch": 1.416287534121929, "grad_norm": 0.8440300370336851, "learning_rate": 4.074318968848364e-05, "loss": 0.1209, "step": 3113 }, { "epoch": 1.4167424931756143, "grad_norm": 0.816853941970173, "learning_rate": 4.073763754492871e-05, "loss": 0.1402, "step": 3114 }, { "epoch": 1.4171974522292994, "grad_norm": 0.5875136212506723, "learning_rate": 4.073208411536511e-05, "loss": 0.1066, "step": 3115 }, { "epoch": 1.4176524112829845, "grad_norm": 0.8961235609048689, "learning_rate": 4.072652940024664e-05, "loss": 0.1254, "step": 3116 }, { "epoch": 1.4181073703366698, "grad_norm": 0.6599971009063496, "learning_rate": 4.0720973400027204e-05, "loss": 0.1099, "step": 3117 }, { "epoch": 1.4185623293903549, "grad_norm": 0.923190868152788, "learning_rate": 4.071541611516082e-05, "loss": 0.1414, "step": 3118 }, { "epoch": 1.41901728844404, "grad_norm": 1.026918731481436, "learning_rate": 4.0709857546101606e-05, "loss": 0.2031, "step": 3119 }, { "epoch": 1.4194722474977253, "grad_norm": 0.5328432265391283, "learning_rate": 4.070429769330377e-05, "loss": 0.1722, "step": 3120 }, { "epoch": 1.4199272065514104, "grad_norm": 0.5548953791791491, "learning_rate": 4.069873655722165e-05, "loss": 0.1921, "step": 3121 }, { "epoch": 1.4203821656050954, "grad_norm": 0.641128007430031, "learning_rate": 4.069317413830967e-05, "loss": 0.1213, "step": 3122 }, { "epoch": 1.4208371246587808, "grad_norm": 0.6029831334912243, "learning_rate": 4.0687610437022374e-05, "loss": 0.0792, "step": 3123 }, { "epoch": 1.4212920837124658, "grad_norm": 0.6900260498811527, "learning_rate": 4.06820454538144e-05, "loss": 0.1504, "step": 3124 }, { "epoch": 1.421747042766151, "grad_norm": 3.456753334515492, "learning_rate": 4.0676479189140484e-05, "loss": 0.0992, "step": 3125 }, { "epoch": 1.4222020018198362, "grad_norm": 0.6029679836456441, "learning_rate": 4.067091164345549e-05, "loss": 0.1014, "step": 3126 }, { "epoch": 1.4226569608735213, "grad_norm": 0.6203481096043598, "learning_rate": 4.0665342817214365e-05, "loss": 0.1033, "step": 3127 }, { "epoch": 1.4231119199272064, "grad_norm": 0.6885749374229723, "learning_rate": 4.065977271087216e-05, "loss": 0.1363, "step": 3128 }, { "epoch": 1.4235668789808917, "grad_norm": 0.675125739419606, "learning_rate": 4.065420132488406e-05, "loss": 0.1042, "step": 3129 }, { "epoch": 1.4240218380345768, "grad_norm": 0.7643268612229153, "learning_rate": 4.0648628659705315e-05, "loss": 0.1673, "step": 3130 }, { "epoch": 1.4244767970882621, "grad_norm": 0.6112752164489814, "learning_rate": 4.064305471579131e-05, "loss": 0.1464, "step": 3131 }, { "epoch": 1.4249317561419472, "grad_norm": 0.5996021404824161, "learning_rate": 4.063747949359751e-05, "loss": 0.1196, "step": 3132 }, { "epoch": 1.4253867151956323, "grad_norm": 0.585099637915133, "learning_rate": 4.0631902993579505e-05, "loss": 0.1173, "step": 3133 }, { "epoch": 1.4258416742493176, "grad_norm": 0.6306422121980905, "learning_rate": 4.0626325216192986e-05, "loss": 0.1298, "step": 3134 }, { "epoch": 1.4262966333030027, "grad_norm": 0.7126927856110096, "learning_rate": 4.062074616189373e-05, "loss": 0.1266, "step": 3135 }, { "epoch": 1.426751592356688, "grad_norm": 0.9316343304970098, "learning_rate": 4.061516583113765e-05, "loss": 0.1745, "step": 3136 }, { "epoch": 1.4272065514103731, "grad_norm": 0.5232338995494858, "learning_rate": 4.060958422438072e-05, "loss": 0.1449, "step": 3137 }, { "epoch": 1.4276615104640582, "grad_norm": 0.600838815409091, "learning_rate": 4.0604001342079076e-05, "loss": 0.1022, "step": 3138 }, { "epoch": 1.4281164695177435, "grad_norm": 0.5205157058379787, "learning_rate": 4.0598417184688894e-05, "loss": 0.109, "step": 3139 }, { "epoch": 1.4285714285714286, "grad_norm": 0.7542443416785327, "learning_rate": 4.059283175266652e-05, "loss": 0.1491, "step": 3140 }, { "epoch": 1.4290263876251137, "grad_norm": 1.1083443407990075, "learning_rate": 4.058724504646834e-05, "loss": 0.1165, "step": 3141 }, { "epoch": 1.429481346678799, "grad_norm": 0.5902395608175633, "learning_rate": 4.0581657066550884e-05, "loss": 0.092, "step": 3142 }, { "epoch": 1.429936305732484, "grad_norm": 0.4618596071616064, "learning_rate": 4.057606781337079e-05, "loss": 0.0785, "step": 3143 }, { "epoch": 1.4303912647861692, "grad_norm": 0.6759532366847553, "learning_rate": 4.0570477287384765e-05, "loss": 0.1129, "step": 3144 }, { "epoch": 1.4308462238398545, "grad_norm": 0.7466370270363466, "learning_rate": 4.056488548904966e-05, "loss": 0.1522, "step": 3145 }, { "epoch": 1.4313011828935396, "grad_norm": 0.47531381462703554, "learning_rate": 4.0559292418822394e-05, "loss": 0.1118, "step": 3146 }, { "epoch": 1.4317561419472247, "grad_norm": 0.7380341868918394, "learning_rate": 4.0553698077160025e-05, "loss": 0.1299, "step": 3147 }, { "epoch": 1.43221110100091, "grad_norm": 0.6375272626642342, "learning_rate": 4.054810246451969e-05, "loss": 0.1907, "step": 3148 }, { "epoch": 1.432666060054595, "grad_norm": 0.48954920295839915, "learning_rate": 4.0542505581358624e-05, "loss": 0.1031, "step": 3149 }, { "epoch": 1.4331210191082802, "grad_norm": 0.6042033630873699, "learning_rate": 4.05369074281342e-05, "loss": 0.0848, "step": 3150 }, { "epoch": 1.4335759781619655, "grad_norm": 0.6653666595548453, "learning_rate": 4.053130800530386e-05, "loss": 0.1287, "step": 3151 }, { "epoch": 1.4340309372156506, "grad_norm": 0.6794626576620493, "learning_rate": 4.052570731332518e-05, "loss": 0.1227, "step": 3152 }, { "epoch": 1.4344858962693356, "grad_norm": 0.6965389629669829, "learning_rate": 4.0520105352655804e-05, "loss": 0.088, "step": 3153 }, { "epoch": 1.434940855323021, "grad_norm": 0.9038550510148031, "learning_rate": 4.0514502123753504e-05, "loss": 0.1488, "step": 3154 }, { "epoch": 1.435395814376706, "grad_norm": 0.5941742010302732, "learning_rate": 4.0508897627076156e-05, "loss": 0.1265, "step": 3155 }, { "epoch": 1.4358507734303911, "grad_norm": 0.5861235868420277, "learning_rate": 4.0503291863081726e-05, "loss": 0.0891, "step": 3156 }, { "epoch": 1.4363057324840764, "grad_norm": 0.7321074708442263, "learning_rate": 4.049768483222831e-05, "loss": 0.11, "step": 3157 }, { "epoch": 1.4367606915377615, "grad_norm": 0.6619081309038998, "learning_rate": 4.049207653497407e-05, "loss": 0.1438, "step": 3158 }, { "epoch": 1.4372156505914468, "grad_norm": 0.9337721754067222, "learning_rate": 4.048646697177729e-05, "loss": 0.1703, "step": 3159 }, { "epoch": 1.437670609645132, "grad_norm": 0.5803831873506402, "learning_rate": 4.048085614309637e-05, "loss": 0.1317, "step": 3160 }, { "epoch": 1.438125568698817, "grad_norm": 1.1930828917723455, "learning_rate": 4.04752440493898e-05, "loss": 0.1164, "step": 3161 }, { "epoch": 1.4385805277525023, "grad_norm": 0.6507070647200851, "learning_rate": 4.0469630691116176e-05, "loss": 0.108, "step": 3162 }, { "epoch": 1.4390354868061874, "grad_norm": 0.7634908963508983, "learning_rate": 4.0464016068734184e-05, "loss": 0.1378, "step": 3163 }, { "epoch": 1.4394904458598727, "grad_norm": 0.7431355834895528, "learning_rate": 4.045840018270264e-05, "loss": 0.158, "step": 3164 }, { "epoch": 1.4399454049135578, "grad_norm": 0.8046749606808715, "learning_rate": 4.0452783033480434e-05, "loss": 0.2181, "step": 3165 }, { "epoch": 1.440400363967243, "grad_norm": 0.6121894596219519, "learning_rate": 4.044716462152659e-05, "loss": 0.0786, "step": 3166 }, { "epoch": 1.4408553230209282, "grad_norm": 0.5565226937008696, "learning_rate": 4.0441544947300204e-05, "loss": 0.0927, "step": 3167 }, { "epoch": 1.4413102820746133, "grad_norm": 0.6622549211380421, "learning_rate": 4.0435924011260506e-05, "loss": 0.1256, "step": 3168 }, { "epoch": 1.4417652411282984, "grad_norm": 0.780535160568948, "learning_rate": 4.0430301813866804e-05, "loss": 0.1884, "step": 3169 }, { "epoch": 1.4422202001819837, "grad_norm": 0.6029219121562562, "learning_rate": 4.042467835557852e-05, "loss": 0.1287, "step": 3170 }, { "epoch": 1.4426751592356688, "grad_norm": 0.6360676728507516, "learning_rate": 4.041905363685518e-05, "loss": 0.1419, "step": 3171 }, { "epoch": 1.443130118289354, "grad_norm": 0.5992847054269326, "learning_rate": 4.041342765815641e-05, "loss": 0.1403, "step": 3172 }, { "epoch": 1.4435850773430392, "grad_norm": 1.6975664249864715, "learning_rate": 4.040780041994193e-05, "loss": 0.1876, "step": 3173 }, { "epoch": 1.4440400363967243, "grad_norm": 0.8110969959087653, "learning_rate": 4.0402171922671585e-05, "loss": 0.147, "step": 3174 }, { "epoch": 1.4444949954504094, "grad_norm": 0.6715237286579965, "learning_rate": 4.0396542166805304e-05, "loss": 0.1423, "step": 3175 }, { "epoch": 1.4449499545040947, "grad_norm": 0.7530257372171939, "learning_rate": 4.0390911152803134e-05, "loss": 0.1467, "step": 3176 }, { "epoch": 1.4454049135577798, "grad_norm": 0.6662076486553342, "learning_rate": 4.0385278881125204e-05, "loss": 0.1483, "step": 3177 }, { "epoch": 1.4458598726114649, "grad_norm": 0.5817801508077106, "learning_rate": 4.0379645352231765e-05, "loss": 0.107, "step": 3178 }, { "epoch": 1.4463148316651502, "grad_norm": 0.6041395661217476, "learning_rate": 4.037401056658316e-05, "loss": 0.1344, "step": 3179 }, { "epoch": 1.4467697907188353, "grad_norm": 0.6288553424401877, "learning_rate": 4.036837452463985e-05, "loss": 0.0821, "step": 3180 }, { "epoch": 1.4472247497725204, "grad_norm": 0.6610554891715242, "learning_rate": 4.036273722686236e-05, "loss": 0.1363, "step": 3181 }, { "epoch": 1.4476797088262057, "grad_norm": 0.6074507182544459, "learning_rate": 4.0357098673711366e-05, "loss": 0.0754, "step": 3182 }, { "epoch": 1.4481346678798908, "grad_norm": 0.7482115047411302, "learning_rate": 4.035145886564763e-05, "loss": 0.121, "step": 3183 }, { "epoch": 1.4485896269335758, "grad_norm": 0.6434692709064725, "learning_rate": 4.034581780313199e-05, "loss": 0.1163, "step": 3184 }, { "epoch": 1.4490445859872612, "grad_norm": 0.725788427700022, "learning_rate": 4.034017548662543e-05, "loss": 0.095, "step": 3185 }, { "epoch": 1.4494995450409462, "grad_norm": 0.5136231265877684, "learning_rate": 4.0334531916589006e-05, "loss": 0.1314, "step": 3186 }, { "epoch": 1.4499545040946316, "grad_norm": 0.6854267147945023, "learning_rate": 4.032888709348387e-05, "loss": 0.1403, "step": 3187 }, { "epoch": 1.4504094631483166, "grad_norm": 0.9553015968984349, "learning_rate": 4.032324101777132e-05, "loss": 0.1437, "step": 3188 }, { "epoch": 1.450864422202002, "grad_norm": 0.6727321283355262, "learning_rate": 4.03175936899127e-05, "loss": 0.1198, "step": 3189 }, { "epoch": 1.451319381255687, "grad_norm": 0.6327840786095357, "learning_rate": 4.031194511036951e-05, "loss": 0.0829, "step": 3190 }, { "epoch": 1.4517743403093721, "grad_norm": 0.5934454506864989, "learning_rate": 4.03062952796033e-05, "loss": 0.1102, "step": 3191 }, { "epoch": 1.4522292993630574, "grad_norm": 0.6468492736644094, "learning_rate": 4.0300644198075773e-05, "loss": 0.086, "step": 3192 }, { "epoch": 1.4526842584167425, "grad_norm": 0.66179534818032, "learning_rate": 4.02949918662487e-05, "loss": 0.1033, "step": 3193 }, { "epoch": 1.4531392174704276, "grad_norm": 0.9059428723365093, "learning_rate": 4.0289338284583966e-05, "loss": 0.1525, "step": 3194 }, { "epoch": 1.453594176524113, "grad_norm": 0.4324366931167935, "learning_rate": 4.028368345354354e-05, "loss": 0.1109, "step": 3195 }, { "epoch": 1.454049135577798, "grad_norm": 0.7394211213056495, "learning_rate": 4.0278027373589536e-05, "loss": 0.1101, "step": 3196 }, { "epoch": 1.4545040946314831, "grad_norm": 0.4868037275787818, "learning_rate": 4.027237004518413e-05, "loss": 0.0696, "step": 3197 }, { "epoch": 1.4549590536851684, "grad_norm": 0.43541697138620206, "learning_rate": 4.02667114687896e-05, "loss": 0.0678, "step": 3198 }, { "epoch": 1.4554140127388535, "grad_norm": 0.7018799894797957, "learning_rate": 4.026105164486836e-05, "loss": 0.115, "step": 3199 }, { "epoch": 1.4558689717925386, "grad_norm": 0.6614507791954163, "learning_rate": 4.02553905738829e-05, "loss": 0.1168, "step": 3200 }, { "epoch": 1.456323930846224, "grad_norm": 0.6975040982900316, "learning_rate": 4.024972825629582e-05, "loss": 0.1113, "step": 3201 }, { "epoch": 1.456778889899909, "grad_norm": 0.716690960087669, "learning_rate": 4.024406469256979e-05, "loss": 0.124, "step": 3202 }, { "epoch": 1.457233848953594, "grad_norm": 0.6014661828397126, "learning_rate": 4.023839988316766e-05, "loss": 0.111, "step": 3203 }, { "epoch": 1.4576888080072794, "grad_norm": 0.5144624287574798, "learning_rate": 4.0232733828552297e-05, "loss": 0.1002, "step": 3204 }, { "epoch": 1.4581437670609645, "grad_norm": 0.8728737925579584, "learning_rate": 4.022706652918672e-05, "loss": 0.1421, "step": 3205 }, { "epoch": 1.4585987261146496, "grad_norm": 0.6937181849873922, "learning_rate": 4.0221397985534035e-05, "loss": 0.154, "step": 3206 }, { "epoch": 1.459053685168335, "grad_norm": 0.7125410220689627, "learning_rate": 4.021572819805744e-05, "loss": 0.1148, "step": 3207 }, { "epoch": 1.45950864422202, "grad_norm": 0.734795870039996, "learning_rate": 4.021005716722025e-05, "loss": 0.1142, "step": 3208 }, { "epoch": 1.459963603275705, "grad_norm": 0.5316236181133821, "learning_rate": 4.020438489348587e-05, "loss": 0.0842, "step": 3209 }, { "epoch": 1.4604185623293904, "grad_norm": 0.500640206163899, "learning_rate": 4.0198711377317835e-05, "loss": 0.0765, "step": 3210 }, { "epoch": 1.4608735213830755, "grad_norm": 0.5460126490902358, "learning_rate": 4.019303661917973e-05, "loss": 0.0971, "step": 3211 }, { "epoch": 1.4613284804367606, "grad_norm": 0.7579401149968024, "learning_rate": 4.0187360619535285e-05, "loss": 0.0952, "step": 3212 }, { "epoch": 1.4617834394904459, "grad_norm": 0.872782327387759, "learning_rate": 4.018168337884832e-05, "loss": 0.1406, "step": 3213 }, { "epoch": 1.462238398544131, "grad_norm": 0.5700267561587976, "learning_rate": 4.017600489758275e-05, "loss": 0.1002, "step": 3214 }, { "epoch": 1.4626933575978163, "grad_norm": 0.6610389768982015, "learning_rate": 4.017032517620259e-05, "loss": 0.0985, "step": 3215 }, { "epoch": 1.4631483166515014, "grad_norm": 0.5572852652681938, "learning_rate": 4.016464421517197e-05, "loss": 0.0905, "step": 3216 }, { "epoch": 1.4636032757051867, "grad_norm": 0.7451681498223804, "learning_rate": 4.01589620149551e-05, "loss": 0.1388, "step": 3217 }, { "epoch": 1.4640582347588718, "grad_norm": 0.7061685880017289, "learning_rate": 4.015327857601632e-05, "loss": 0.0862, "step": 3218 }, { "epoch": 1.4645131938125568, "grad_norm": 2.1566765179370804, "learning_rate": 4.0147593898820036e-05, "loss": 0.2772, "step": 3219 }, { "epoch": 1.4649681528662422, "grad_norm": 0.5088425506482742, "learning_rate": 4.0141907983830796e-05, "loss": 0.158, "step": 3220 }, { "epoch": 1.4654231119199272, "grad_norm": 0.7372009572840288, "learning_rate": 4.013622083151321e-05, "loss": 0.1259, "step": 3221 }, { "epoch": 1.4658780709736123, "grad_norm": 0.6769204280855545, "learning_rate": 4.0130532442332015e-05, "loss": 0.0835, "step": 3222 }, { "epoch": 1.4663330300272976, "grad_norm": 0.6109031807835317, "learning_rate": 4.012484281675203e-05, "loss": 0.0688, "step": 3223 }, { "epoch": 1.4667879890809827, "grad_norm": 0.8515805237581134, "learning_rate": 4.01191519552382e-05, "loss": 0.1228, "step": 3224 }, { "epoch": 1.4672429481346678, "grad_norm": 1.16006857442471, "learning_rate": 4.011345985825555e-05, "loss": 0.2429, "step": 3225 }, { "epoch": 1.4676979071883531, "grad_norm": 0.8816221922039368, "learning_rate": 4.010776652626921e-05, "loss": 0.1233, "step": 3226 }, { "epoch": 1.4681528662420382, "grad_norm": 0.7209271227461486, "learning_rate": 4.0102071959744414e-05, "loss": 0.137, "step": 3227 }, { "epoch": 1.4686078252957233, "grad_norm": 0.8698801780929453, "learning_rate": 4.00963761591465e-05, "loss": 0.1697, "step": 3228 }, { "epoch": 1.4690627843494086, "grad_norm": 0.7114194027315188, "learning_rate": 4.00906791249409e-05, "loss": 0.1552, "step": 3229 }, { "epoch": 1.4695177434030937, "grad_norm": 0.7774220215957052, "learning_rate": 4.008498085759315e-05, "loss": 0.1381, "step": 3230 }, { "epoch": 1.4699727024567788, "grad_norm": 0.6352706951880233, "learning_rate": 4.007928135756889e-05, "loss": 0.1125, "step": 3231 }, { "epoch": 1.4704276615104641, "grad_norm": 0.774950280967451, "learning_rate": 4.007358062533385e-05, "loss": 0.1613, "step": 3232 }, { "epoch": 1.4708826205641492, "grad_norm": 0.6836955636787949, "learning_rate": 4.0067878661353875e-05, "loss": 0.1495, "step": 3233 }, { "epoch": 1.4713375796178343, "grad_norm": 0.5306239267775383, "learning_rate": 4.006217546609491e-05, "loss": 0.107, "step": 3234 }, { "epoch": 1.4717925386715196, "grad_norm": 0.5759157950551701, "learning_rate": 4.0056471040022983e-05, "loss": 0.0844, "step": 3235 }, { "epoch": 1.4722474977252047, "grad_norm": 0.4988996782899359, "learning_rate": 4.005076538360423e-05, "loss": 0.1306, "step": 3236 }, { "epoch": 1.4727024567788898, "grad_norm": 0.8249010535094888, "learning_rate": 4.00450584973049e-05, "loss": 0.1216, "step": 3237 }, { "epoch": 1.473157415832575, "grad_norm": 0.7015056951950606, "learning_rate": 4.0039350381591336e-05, "loss": 0.0997, "step": 3238 }, { "epoch": 1.4736123748862602, "grad_norm": 0.8251832971590896, "learning_rate": 4.0033641036929974e-05, "loss": 0.1137, "step": 3239 }, { "epoch": 1.4740673339399453, "grad_norm": 0.5912346488516331, "learning_rate": 4.002793046378735e-05, "loss": 0.107, "step": 3240 }, { "epoch": 1.4745222929936306, "grad_norm": 0.6817905573533362, "learning_rate": 4.002221866263013e-05, "loss": 0.1621, "step": 3241 }, { "epoch": 1.4749772520473157, "grad_norm": 0.6127971790125251, "learning_rate": 4.001650563392504e-05, "loss": 0.1342, "step": 3242 }, { "epoch": 1.475432211101001, "grad_norm": 1.0094249056971418, "learning_rate": 4.0010791378138915e-05, "loss": 0.1818, "step": 3243 }, { "epoch": 1.475887170154686, "grad_norm": 0.7308218206210023, "learning_rate": 4.00050758957387e-05, "loss": 0.1371, "step": 3244 }, { "epoch": 1.4763421292083714, "grad_norm": 0.6303139051694561, "learning_rate": 3.999935918719145e-05, "loss": 0.1672, "step": 3245 }, { "epoch": 1.4767970882620565, "grad_norm": 0.6755464222352257, "learning_rate": 3.999364125296432e-05, "loss": 0.1315, "step": 3246 }, { "epoch": 1.4772520473157416, "grad_norm": 0.6355432101629651, "learning_rate": 3.9987922093524523e-05, "loss": 0.0854, "step": 3247 }, { "epoch": 1.4777070063694269, "grad_norm": 0.652014639922562, "learning_rate": 3.9982201709339415e-05, "loss": 0.1365, "step": 3248 }, { "epoch": 1.478161965423112, "grad_norm": 0.5661882983880364, "learning_rate": 3.9976480100876445e-05, "loss": 0.0897, "step": 3249 }, { "epoch": 1.478616924476797, "grad_norm": 0.6413707195121499, "learning_rate": 3.997075726860316e-05, "loss": 0.1135, "step": 3250 }, { "epoch": 1.4790718835304824, "grad_norm": 0.5296505266071954, "learning_rate": 3.996503321298719e-05, "loss": 0.1171, "step": 3251 }, { "epoch": 1.4795268425841674, "grad_norm": 0.5410033563494365, "learning_rate": 3.995930793449629e-05, "loss": 0.1669, "step": 3252 }, { "epoch": 1.4799818016378525, "grad_norm": 0.5259153442813943, "learning_rate": 3.995358143359831e-05, "loss": 0.1128, "step": 3253 }, { "epoch": 1.4804367606915378, "grad_norm": 0.7152995490664823, "learning_rate": 3.994785371076117e-05, "loss": 0.1096, "step": 3254 }, { "epoch": 1.480891719745223, "grad_norm": 0.5019174570269138, "learning_rate": 3.994212476645294e-05, "loss": 0.0995, "step": 3255 }, { "epoch": 1.481346678798908, "grad_norm": 0.6636716623775123, "learning_rate": 3.9936394601141745e-05, "loss": 0.1745, "step": 3256 }, { "epoch": 1.4818016378525933, "grad_norm": 0.6853198257963037, "learning_rate": 3.9930663215295845e-05, "loss": 0.1574, "step": 3257 }, { "epoch": 1.4822565969062784, "grad_norm": 0.8882247432910473, "learning_rate": 3.9924930609383566e-05, "loss": 0.1836, "step": 3258 }, { "epoch": 1.4827115559599635, "grad_norm": 0.6716220521615681, "learning_rate": 3.991919678387336e-05, "loss": 0.159, "step": 3259 }, { "epoch": 1.4831665150136488, "grad_norm": 0.9446014096341956, "learning_rate": 3.991346173923377e-05, "loss": 0.1221, "step": 3260 }, { "epoch": 1.483621474067334, "grad_norm": 0.47796726102167747, "learning_rate": 3.990772547593342e-05, "loss": 0.0603, "step": 3261 }, { "epoch": 1.484076433121019, "grad_norm": 0.7712596130114697, "learning_rate": 3.9901987994441084e-05, "loss": 0.1685, "step": 3262 }, { "epoch": 1.4845313921747043, "grad_norm": 1.7915324025154713, "learning_rate": 3.989624929522558e-05, "loss": 0.1541, "step": 3263 }, { "epoch": 1.4849863512283894, "grad_norm": 0.5221068718991133, "learning_rate": 3.9890509378755856e-05, "loss": 0.0842, "step": 3264 }, { "epoch": 1.4854413102820745, "grad_norm": 0.5162403489269772, "learning_rate": 3.988476824550095e-05, "loss": 0.117, "step": 3265 }, { "epoch": 1.4858962693357598, "grad_norm": 1.4268556498631961, "learning_rate": 3.987902589593e-05, "loss": 0.1556, "step": 3266 }, { "epoch": 1.486351228389445, "grad_norm": 0.5009241577380319, "learning_rate": 3.987328233051225e-05, "loss": 0.1024, "step": 3267 }, { "epoch": 1.4868061874431302, "grad_norm": 0.7233489821395322, "learning_rate": 3.986753754971703e-05, "loss": 0.1169, "step": 3268 }, { "epoch": 1.4872611464968153, "grad_norm": 0.7126728947028413, "learning_rate": 3.986179155401379e-05, "loss": 0.1224, "step": 3269 }, { "epoch": 1.4877161055505004, "grad_norm": 0.5556855479219344, "learning_rate": 3.985604434387206e-05, "loss": 0.1665, "step": 3270 }, { "epoch": 1.4881710646041857, "grad_norm": 1.1659524009758488, "learning_rate": 3.985029591976147e-05, "loss": 0.1326, "step": 3271 }, { "epoch": 1.4886260236578708, "grad_norm": 0.9433504542058174, "learning_rate": 3.984454628215176e-05, "loss": 0.1836, "step": 3272 }, { "epoch": 1.489080982711556, "grad_norm": 0.7415332218675222, "learning_rate": 3.9838795431512765e-05, "loss": 0.14, "step": 3273 }, { "epoch": 1.4895359417652412, "grad_norm": 0.7052476820050338, "learning_rate": 3.983304336831442e-05, "loss": 0.13, "step": 3274 }, { "epoch": 1.4899909008189263, "grad_norm": 0.6060059735510632, "learning_rate": 3.982729009302676e-05, "loss": 0.1709, "step": 3275 }, { "epoch": 1.4904458598726116, "grad_norm": 0.4872092668709008, "learning_rate": 3.982153560611991e-05, "loss": 0.1362, "step": 3276 }, { "epoch": 1.4909008189262967, "grad_norm": 0.9054055153960676, "learning_rate": 3.98157799080641e-05, "loss": 0.1392, "step": 3277 }, { "epoch": 1.4913557779799818, "grad_norm": 0.7890249737565119, "learning_rate": 3.9810022999329675e-05, "loss": 0.2271, "step": 3278 }, { "epoch": 1.491810737033667, "grad_norm": 0.5844467442772879, "learning_rate": 3.9804264880387034e-05, "loss": 0.0977, "step": 3279 }, { "epoch": 1.4922656960873522, "grad_norm": 0.5859109108520417, "learning_rate": 3.9798505551706733e-05, "loss": 0.1311, "step": 3280 }, { "epoch": 1.4927206551410372, "grad_norm": 0.550562445210675, "learning_rate": 3.979274501375939e-05, "loss": 0.1262, "step": 3281 }, { "epoch": 1.4931756141947226, "grad_norm": 0.7251711614666583, "learning_rate": 3.978698326701573e-05, "loss": 0.1654, "step": 3282 }, { "epoch": 1.4936305732484076, "grad_norm": 0.9411094335248743, "learning_rate": 3.978122031194657e-05, "loss": 0.1422, "step": 3283 }, { "epoch": 1.4940855323020927, "grad_norm": 0.662737887214335, "learning_rate": 3.9775456149022835e-05, "loss": 0.0941, "step": 3284 }, { "epoch": 1.494540491355778, "grad_norm": 0.664712610308803, "learning_rate": 3.976969077871555e-05, "loss": 0.1533, "step": 3285 }, { "epoch": 1.4949954504094631, "grad_norm": 0.8441834109979497, "learning_rate": 3.9763924201495836e-05, "loss": 0.1652, "step": 3286 }, { "epoch": 1.4954504094631482, "grad_norm": 0.7483321678946654, "learning_rate": 3.975815641783491e-05, "loss": 0.0663, "step": 3287 }, { "epoch": 1.4959053685168335, "grad_norm": 0.48053948065265406, "learning_rate": 3.975238742820408e-05, "loss": 0.1144, "step": 3288 }, { "epoch": 1.4963603275705186, "grad_norm": 0.6700050561479185, "learning_rate": 3.9746617233074786e-05, "loss": 0.1233, "step": 3289 }, { "epoch": 1.4968152866242037, "grad_norm": 0.8122823177431124, "learning_rate": 3.974084583291852e-05, "loss": 0.1678, "step": 3290 }, { "epoch": 1.497270245677889, "grad_norm": 1.0151721943295215, "learning_rate": 3.9735073228206896e-05, "loss": 0.1205, "step": 3291 }, { "epoch": 1.4977252047315741, "grad_norm": 0.44631242946792277, "learning_rate": 3.972929941941163e-05, "loss": 0.0842, "step": 3292 }, { "epoch": 1.4981801637852592, "grad_norm": 0.6424698260571825, "learning_rate": 3.972352440700455e-05, "loss": 0.1033, "step": 3293 }, { "epoch": 1.4986351228389445, "grad_norm": 0.6626485540208424, "learning_rate": 3.971774819145753e-05, "loss": 0.1341, "step": 3294 }, { "epoch": 1.4990900818926296, "grad_norm": 0.6834595915158236, "learning_rate": 3.97119707732426e-05, "loss": 0.1142, "step": 3295 }, { "epoch": 1.499545040946315, "grad_norm": 0.4666575275177755, "learning_rate": 3.970619215283185e-05, "loss": 0.1014, "step": 3296 }, { "epoch": 1.5, "grad_norm": 0.7096262636613291, "learning_rate": 3.9700412330697495e-05, "loss": 0.1394, "step": 3297 }, { "epoch": 1.5004549590536853, "grad_norm": 0.6752591411552851, "learning_rate": 3.969463130731183e-05, "loss": 0.1297, "step": 3298 }, { "epoch": 1.5009099181073702, "grad_norm": 0.6904133953873685, "learning_rate": 3.9688849083147254e-05, "loss": 0.1158, "step": 3299 }, { "epoch": 1.5013648771610555, "grad_norm": 0.6070578382602926, "learning_rate": 3.968306565867626e-05, "loss": 0.0922, "step": 3300 }, { "epoch": 1.5018198362147408, "grad_norm": 0.6621616922827441, "learning_rate": 3.967728103437146e-05, "loss": 0.0825, "step": 3301 }, { "epoch": 1.5022747952684259, "grad_norm": 0.6418811726280461, "learning_rate": 3.967149521070553e-05, "loss": 0.1332, "step": 3302 }, { "epoch": 1.502729754322111, "grad_norm": 0.7383266258593468, "learning_rate": 3.966570818815126e-05, "loss": 0.1022, "step": 3303 }, { "epoch": 1.5031847133757963, "grad_norm": 0.9618092806456039, "learning_rate": 3.965991996718156e-05, "loss": 0.1324, "step": 3304 }, { "epoch": 1.5036396724294814, "grad_norm": 0.8134272073543242, "learning_rate": 3.96541305482694e-05, "loss": 0.1452, "step": 3305 }, { "epoch": 1.5040946314831665, "grad_norm": 0.5880757448433142, "learning_rate": 3.9648339931887865e-05, "loss": 0.0955, "step": 3306 }, { "epoch": 1.5045495905368518, "grad_norm": 0.6958372304118954, "learning_rate": 3.9642548118510145e-05, "loss": 0.1337, "step": 3307 }, { "epoch": 1.5050045495905369, "grad_norm": 0.6427008844688749, "learning_rate": 3.963675510860952e-05, "loss": 0.2134, "step": 3308 }, { "epoch": 1.505459508644222, "grad_norm": 0.6677423991624508, "learning_rate": 3.963096090265937e-05, "loss": 0.1169, "step": 3309 }, { "epoch": 1.5059144676979073, "grad_norm": 0.5890605823369651, "learning_rate": 3.962516550113316e-05, "loss": 0.118, "step": 3310 }, { "epoch": 1.5063694267515924, "grad_norm": 0.6040696339697127, "learning_rate": 3.9619368904504466e-05, "loss": 0.1268, "step": 3311 }, { "epoch": 1.5068243858052774, "grad_norm": 0.5087707544552535, "learning_rate": 3.9613571113246974e-05, "loss": 0.148, "step": 3312 }, { "epoch": 1.5072793448589628, "grad_norm": 0.5909448847615165, "learning_rate": 3.960777212783445e-05, "loss": 0.0848, "step": 3313 }, { "epoch": 1.5077343039126478, "grad_norm": 0.6021390084824063, "learning_rate": 3.9601971948740745e-05, "loss": 0.1247, "step": 3314 }, { "epoch": 1.508189262966333, "grad_norm": 0.6429426417277578, "learning_rate": 3.959617057643984e-05, "loss": 0.0829, "step": 3315 }, { "epoch": 1.5086442220200182, "grad_norm": 0.7243206508901471, "learning_rate": 3.9590368011405785e-05, "loss": 0.0831, "step": 3316 }, { "epoch": 1.5090991810737033, "grad_norm": 0.7849096025372496, "learning_rate": 3.958456425411275e-05, "loss": 0.1141, "step": 3317 }, { "epoch": 1.5095541401273884, "grad_norm": 0.6599319883575847, "learning_rate": 3.9578759305035e-05, "loss": 0.1067, "step": 3318 }, { "epoch": 1.5100090991810737, "grad_norm": 0.6407290481023529, "learning_rate": 3.957295316464686e-05, "loss": 0.0958, "step": 3319 }, { "epoch": 1.5104640582347588, "grad_norm": 0.5496674942996197, "learning_rate": 3.95671458334228e-05, "loss": 0.1097, "step": 3320 }, { "epoch": 1.510919017288444, "grad_norm": 0.6690572477059645, "learning_rate": 3.9561337311837366e-05, "loss": 0.098, "step": 3321 }, { "epoch": 1.5113739763421292, "grad_norm": 0.5761963096047895, "learning_rate": 3.9555527600365215e-05, "loss": 0.148, "step": 3322 }, { "epoch": 1.5118289353958145, "grad_norm": 0.5954521022436691, "learning_rate": 3.954971669948108e-05, "loss": 0.1646, "step": 3323 }, { "epoch": 1.5122838944494994, "grad_norm": 0.7220270993921515, "learning_rate": 3.954390460965979e-05, "loss": 0.1675, "step": 3324 }, { "epoch": 1.5127388535031847, "grad_norm": 0.8658281892505827, "learning_rate": 3.9538091331376306e-05, "loss": 0.2092, "step": 3325 }, { "epoch": 1.51319381255687, "grad_norm": 0.5421698084265003, "learning_rate": 3.9532276865105644e-05, "loss": 0.1392, "step": 3326 }, { "epoch": 1.5136487716105549, "grad_norm": 0.5327437821983662, "learning_rate": 3.952646121132295e-05, "loss": 0.0834, "step": 3327 }, { "epoch": 1.5141037306642402, "grad_norm": 0.5479702891621437, "learning_rate": 3.952064437050345e-05, "loss": 0.1427, "step": 3328 }, { "epoch": 1.5145586897179255, "grad_norm": 0.7470379193814124, "learning_rate": 3.951482634312246e-05, "loss": 0.1066, "step": 3329 }, { "epoch": 1.5150136487716106, "grad_norm": 0.6117759629348974, "learning_rate": 3.950900712965541e-05, "loss": 0.1056, "step": 3330 }, { "epoch": 1.5154686078252957, "grad_norm": 0.6794928871305772, "learning_rate": 3.9503186730577816e-05, "loss": 0.1602, "step": 3331 }, { "epoch": 1.515923566878981, "grad_norm": 0.5236531313282496, "learning_rate": 3.9497365146365304e-05, "loss": 0.085, "step": 3332 }, { "epoch": 1.516378525932666, "grad_norm": 0.6625530571716749, "learning_rate": 3.949154237749358e-05, "loss": 0.1113, "step": 3333 }, { "epoch": 1.5168334849863512, "grad_norm": 0.6452009621984383, "learning_rate": 3.948571842443846e-05, "loss": 0.1059, "step": 3334 }, { "epoch": 1.5172884440400365, "grad_norm": 0.8474305062826546, "learning_rate": 3.9479893287675844e-05, "loss": 0.1119, "step": 3335 }, { "epoch": 1.5177434030937216, "grad_norm": 0.7521291784940319, "learning_rate": 3.947406696768174e-05, "loss": 0.2162, "step": 3336 }, { "epoch": 1.5181983621474067, "grad_norm": 1.0265166086818844, "learning_rate": 3.946823946493225e-05, "loss": 0.1538, "step": 3337 }, { "epoch": 1.518653321201092, "grad_norm": 0.7898558537577676, "learning_rate": 3.946241077990356e-05, "loss": 0.1417, "step": 3338 }, { "epoch": 1.519108280254777, "grad_norm": 0.7465282278626468, "learning_rate": 3.945658091307198e-05, "loss": 0.1174, "step": 3339 }, { "epoch": 1.5195632393084622, "grad_norm": 0.6657578243235431, "learning_rate": 3.945074986491389e-05, "loss": 0.2082, "step": 3340 }, { "epoch": 1.5200181983621475, "grad_norm": 0.7516758123395796, "learning_rate": 3.944491763590579e-05, "loss": 0.1343, "step": 3341 }, { "epoch": 1.5204731574158326, "grad_norm": 0.6351088844153624, "learning_rate": 3.943908422652424e-05, "loss": 0.129, "step": 3342 }, { "epoch": 1.5209281164695176, "grad_norm": 0.504389009927123, "learning_rate": 3.943324963724594e-05, "loss": 0.0913, "step": 3343 }, { "epoch": 1.521383075523203, "grad_norm": 0.8988677745006507, "learning_rate": 3.9427413868547655e-05, "loss": 0.1417, "step": 3344 }, { "epoch": 1.521838034576888, "grad_norm": 0.7155476823487918, "learning_rate": 3.9421576920906265e-05, "loss": 0.0951, "step": 3345 }, { "epoch": 1.5222929936305731, "grad_norm": 0.5484545898288997, "learning_rate": 3.941573879479874e-05, "loss": 0.0767, "step": 3346 }, { "epoch": 1.5227479526842584, "grad_norm": 0.730618729070292, "learning_rate": 3.940989949070214e-05, "loss": 0.1537, "step": 3347 }, { "epoch": 1.5232029117379435, "grad_norm": 1.3584739376889865, "learning_rate": 3.9404059009093616e-05, "loss": 0.0907, "step": 3348 }, { "epoch": 1.5236578707916286, "grad_norm": 0.7495292102036327, "learning_rate": 3.9398217350450454e-05, "loss": 0.1442, "step": 3349 }, { "epoch": 1.524112829845314, "grad_norm": 0.5161548175426471, "learning_rate": 3.939237451524999e-05, "loss": 0.1145, "step": 3350 }, { "epoch": 1.5245677888989992, "grad_norm": 0.559853425528789, "learning_rate": 3.938653050396967e-05, "loss": 0.1223, "step": 3351 }, { "epoch": 1.525022747952684, "grad_norm": 0.7450129830221259, "learning_rate": 3.9380685317087054e-05, "loss": 0.1221, "step": 3352 }, { "epoch": 1.5254777070063694, "grad_norm": 0.9678894580418079, "learning_rate": 3.937483895507977e-05, "loss": 0.1231, "step": 3353 }, { "epoch": 1.5259326660600547, "grad_norm": 0.9130033108108825, "learning_rate": 3.936899141842556e-05, "loss": 0.1324, "step": 3354 }, { "epoch": 1.5263876251137396, "grad_norm": 0.7862710441429508, "learning_rate": 3.936314270760226e-05, "loss": 0.14, "step": 3355 }, { "epoch": 1.526842584167425, "grad_norm": 0.50496956718997, "learning_rate": 3.935729282308781e-05, "loss": 0.0894, "step": 3356 }, { "epoch": 1.5272975432211102, "grad_norm": 0.6831351010320812, "learning_rate": 3.9351441765360224e-05, "loss": 0.1327, "step": 3357 }, { "epoch": 1.5277525022747953, "grad_norm": 0.6124502861466562, "learning_rate": 3.934558953489763e-05, "loss": 0.1153, "step": 3358 }, { "epoch": 1.5282074613284804, "grad_norm": 0.590125164364146, "learning_rate": 3.933973613217824e-05, "loss": 0.1007, "step": 3359 }, { "epoch": 1.5286624203821657, "grad_norm": 0.6741687253704073, "learning_rate": 3.9333881557680374e-05, "loss": 0.1104, "step": 3360 }, { "epoch": 1.5291173794358508, "grad_norm": 0.6817564112536888, "learning_rate": 3.932802581188243e-05, "loss": 0.0788, "step": 3361 }, { "epoch": 1.5295723384895359, "grad_norm": 0.6192717695812732, "learning_rate": 3.932216889526293e-05, "loss": 0.1105, "step": 3362 }, { "epoch": 1.5300272975432212, "grad_norm": 0.7985631575936666, "learning_rate": 3.931631080830046e-05, "loss": 0.1148, "step": 3363 }, { "epoch": 1.5304822565969063, "grad_norm": 1.1257099186406285, "learning_rate": 3.931045155147373e-05, "loss": 0.1666, "step": 3364 }, { "epoch": 1.5309372156505914, "grad_norm": 0.6068353328658489, "learning_rate": 3.930459112526153e-05, "loss": 0.1344, "step": 3365 }, { "epoch": 1.5313921747042767, "grad_norm": 0.6352778378093915, "learning_rate": 3.929872953014272e-05, "loss": 0.1294, "step": 3366 }, { "epoch": 1.5318471337579618, "grad_norm": 0.7029359786124155, "learning_rate": 3.929286676659632e-05, "loss": 0.1371, "step": 3367 }, { "epoch": 1.5323020928116469, "grad_norm": 0.5145652405696719, "learning_rate": 3.9287002835101394e-05, "loss": 0.0663, "step": 3368 }, { "epoch": 1.5327570518653322, "grad_norm": 0.4458253148761717, "learning_rate": 3.928113773613711e-05, "loss": 0.1225, "step": 3369 }, { "epoch": 1.5332120109190173, "grad_norm": 0.5928664954875789, "learning_rate": 3.927527147018275e-05, "loss": 0.1327, "step": 3370 }, { "epoch": 1.5336669699727024, "grad_norm": 0.5092897554224904, "learning_rate": 3.926940403771767e-05, "loss": 0.1118, "step": 3371 }, { "epoch": 1.5341219290263877, "grad_norm": 0.8691667947555208, "learning_rate": 3.9263535439221335e-05, "loss": 0.1363, "step": 3372 }, { "epoch": 1.5345768880800728, "grad_norm": 0.5963798196269696, "learning_rate": 3.925766567517329e-05, "loss": 0.0789, "step": 3373 }, { "epoch": 1.5350318471337578, "grad_norm": 0.536007633331686, "learning_rate": 3.9251794746053196e-05, "loss": 0.135, "step": 3374 }, { "epoch": 1.5354868061874432, "grad_norm": 0.5207966479496777, "learning_rate": 3.9245922652340795e-05, "loss": 0.0938, "step": 3375 }, { "epoch": 1.5359417652411285, "grad_norm": 0.5848916817918001, "learning_rate": 3.9240049394515933e-05, "loss": 0.1133, "step": 3376 }, { "epoch": 1.5363967242948133, "grad_norm": 0.6903399029255209, "learning_rate": 3.9234174973058534e-05, "loss": 0.159, "step": 3377 }, { "epoch": 1.5368516833484986, "grad_norm": 0.791717689639616, "learning_rate": 3.922829938844865e-05, "loss": 0.0938, "step": 3378 }, { "epoch": 1.537306642402184, "grad_norm": 0.5816568642347487, "learning_rate": 3.922242264116639e-05, "loss": 0.1071, "step": 3379 }, { "epoch": 1.5377616014558688, "grad_norm": 0.5956455135117953, "learning_rate": 3.9216544731691976e-05, "loss": 0.147, "step": 3380 }, { "epoch": 1.5382165605095541, "grad_norm": 0.6862242935021037, "learning_rate": 3.9210665660505726e-05, "loss": 0.119, "step": 3381 }, { "epoch": 1.5386715195632394, "grad_norm": 0.5420829984781645, "learning_rate": 3.920478542808805e-05, "loss": 0.1526, "step": 3382 }, { "epoch": 1.5391264786169245, "grad_norm": 0.7514455308589059, "learning_rate": 3.919890403491947e-05, "loss": 0.0979, "step": 3383 }, { "epoch": 1.5395814376706096, "grad_norm": 0.7624459012249215, "learning_rate": 3.9193021481480564e-05, "loss": 0.1531, "step": 3384 }, { "epoch": 1.540036396724295, "grad_norm": 0.5605064793039148, "learning_rate": 3.918713776825204e-05, "loss": 0.0946, "step": 3385 }, { "epoch": 1.54049135577798, "grad_norm": 0.6840865024547125, "learning_rate": 3.918125289571469e-05, "loss": 0.0928, "step": 3386 }, { "epoch": 1.540946314831665, "grad_norm": 0.6645338019987975, "learning_rate": 3.917536686434939e-05, "loss": 0.0983, "step": 3387 }, { "epoch": 1.5414012738853504, "grad_norm": 0.7576941717290235, "learning_rate": 3.9169479674637125e-05, "loss": 0.1368, "step": 3388 }, { "epoch": 1.5418562329390355, "grad_norm": 0.5919061517509573, "learning_rate": 3.916359132705897e-05, "loss": 0.0877, "step": 3389 }, { "epoch": 1.5423111919927206, "grad_norm": 0.5944759936048514, "learning_rate": 3.9157701822096096e-05, "loss": 0.1322, "step": 3390 }, { "epoch": 1.542766151046406, "grad_norm": 0.8784276320440566, "learning_rate": 3.915181116022977e-05, "loss": 0.1222, "step": 3391 }, { "epoch": 1.543221110100091, "grad_norm": 0.6139858103946678, "learning_rate": 3.914591934194134e-05, "loss": 0.1233, "step": 3392 }, { "epoch": 1.543676069153776, "grad_norm": 0.7187044038878407, "learning_rate": 3.9140026367712255e-05, "loss": 0.1259, "step": 3393 }, { "epoch": 1.5441310282074614, "grad_norm": 0.6180907274466235, "learning_rate": 3.913413223802408e-05, "loss": 0.1159, "step": 3394 }, { "epoch": 1.5445859872611465, "grad_norm": 0.4832682406922359, "learning_rate": 3.912823695335845e-05, "loss": 0.0777, "step": 3395 }, { "epoch": 1.5450409463148316, "grad_norm": 0.8351290787377931, "learning_rate": 3.9122340514197096e-05, "loss": 0.204, "step": 3396 }, { "epoch": 1.5454959053685169, "grad_norm": 0.7444586284170286, "learning_rate": 3.911644292102185e-05, "loss": 0.0863, "step": 3397 }, { "epoch": 1.545950864422202, "grad_norm": 0.6854187014324895, "learning_rate": 3.911054417431464e-05, "loss": 0.1081, "step": 3398 }, { "epoch": 1.546405823475887, "grad_norm": 0.6850211875154532, "learning_rate": 3.910464427455748e-05, "loss": 0.0935, "step": 3399 }, { "epoch": 1.5468607825295724, "grad_norm": 1.0666717829837205, "learning_rate": 3.9098743222232494e-05, "loss": 0.2339, "step": 3400 }, { "epoch": 1.5473157415832575, "grad_norm": 0.5561756276664624, "learning_rate": 3.909284101782187e-05, "loss": 0.1261, "step": 3401 }, { "epoch": 1.5477707006369426, "grad_norm": 0.46748121263737824, "learning_rate": 3.9086937661807924e-05, "loss": 0.1258, "step": 3402 }, { "epoch": 1.5482256596906279, "grad_norm": 0.7441769043215533, "learning_rate": 3.908103315467305e-05, "loss": 0.0907, "step": 3403 }, { "epoch": 1.5486806187443132, "grad_norm": 0.7205798714569024, "learning_rate": 3.907512749689973e-05, "loss": 0.1471, "step": 3404 }, { "epoch": 1.549135577797998, "grad_norm": 0.5229203052566355, "learning_rate": 3.9069220688970565e-05, "loss": 0.1567, "step": 3405 }, { "epoch": 1.5495905368516834, "grad_norm": 0.5368214779351004, "learning_rate": 3.9063312731368215e-05, "loss": 0.0977, "step": 3406 }, { "epoch": 1.5500454959053687, "grad_norm": 0.5829708764326387, "learning_rate": 3.905740362457546e-05, "loss": 0.1107, "step": 3407 }, { "epoch": 1.5505004549590535, "grad_norm": 0.47672160818646464, "learning_rate": 3.905149336907516e-05, "loss": 0.0944, "step": 3408 }, { "epoch": 1.5509554140127388, "grad_norm": 0.8534761965458342, "learning_rate": 3.904558196535028e-05, "loss": 0.1863, "step": 3409 }, { "epoch": 1.5514103730664242, "grad_norm": 0.78891554673658, "learning_rate": 3.903966941388387e-05, "loss": 0.1424, "step": 3410 }, { "epoch": 1.5518653321201092, "grad_norm": 0.8735388554264109, "learning_rate": 3.903375571515908e-05, "loss": 0.1301, "step": 3411 }, { "epoch": 1.5523202911737943, "grad_norm": 0.8141306864216058, "learning_rate": 3.902784086965915e-05, "loss": 0.1401, "step": 3412 }, { "epoch": 1.5527752502274796, "grad_norm": 0.6616227310917197, "learning_rate": 3.902192487786741e-05, "loss": 0.1202, "step": 3413 }, { "epoch": 1.5532302092811647, "grad_norm": 0.5885296796314342, "learning_rate": 3.90160077402673e-05, "loss": 0.0991, "step": 3414 }, { "epoch": 1.5536851683348498, "grad_norm": 0.6941478059131553, "learning_rate": 3.901008945734232e-05, "loss": 0.1364, "step": 3415 }, { "epoch": 1.5541401273885351, "grad_norm": 0.7774006390285141, "learning_rate": 3.9004170029576107e-05, "loss": 0.1593, "step": 3416 }, { "epoch": 1.5545950864422202, "grad_norm": 0.7609014013483011, "learning_rate": 3.899824945745236e-05, "loss": 0.1316, "step": 3417 }, { "epoch": 1.5550500454959053, "grad_norm": 0.5222944227811996, "learning_rate": 3.8992327741454874e-05, "loss": 0.0715, "step": 3418 }, { "epoch": 1.5555050045495906, "grad_norm": 0.5900500621250153, "learning_rate": 3.898640488206756e-05, "loss": 0.1169, "step": 3419 }, { "epoch": 1.5559599636032757, "grad_norm": 0.6237286503594046, "learning_rate": 3.8980480879774405e-05, "loss": 0.1314, "step": 3420 }, { "epoch": 1.5564149226569608, "grad_norm": 0.6335098600842141, "learning_rate": 3.897455573505949e-05, "loss": 0.0909, "step": 3421 }, { "epoch": 1.556869881710646, "grad_norm": 0.7575419403885314, "learning_rate": 3.896862944840698e-05, "loss": 0.1483, "step": 3422 }, { "epoch": 1.5573248407643312, "grad_norm": 0.6947973207839588, "learning_rate": 3.8962702020301155e-05, "loss": 0.148, "step": 3423 }, { "epoch": 1.5577797998180163, "grad_norm": 0.7425121214499031, "learning_rate": 3.895677345122638e-05, "loss": 0.1277, "step": 3424 }, { "epoch": 1.5582347588717016, "grad_norm": 0.6690944941527904, "learning_rate": 3.895084374166711e-05, "loss": 0.1233, "step": 3425 }, { "epoch": 1.5586897179253867, "grad_norm": 0.6557034460795368, "learning_rate": 3.894491289210788e-05, "loss": 0.1127, "step": 3426 }, { "epoch": 1.5591446769790718, "grad_norm": 0.766497850936518, "learning_rate": 3.893898090303335e-05, "loss": 0.1472, "step": 3427 }, { "epoch": 1.559599636032757, "grad_norm": 0.6649074855771692, "learning_rate": 3.8933047774928255e-05, "loss": 0.1444, "step": 3428 }, { "epoch": 1.5600545950864422, "grad_norm": 1.1005533041830495, "learning_rate": 3.8927113508277405e-05, "loss": 0.126, "step": 3429 }, { "epoch": 1.5605095541401273, "grad_norm": 0.6411751540733637, "learning_rate": 3.892117810356574e-05, "loss": 0.092, "step": 3430 }, { "epoch": 1.5609645131938126, "grad_norm": 0.6111090149740005, "learning_rate": 3.8915241561278266e-05, "loss": 0.1021, "step": 3431 }, { "epoch": 1.5614194722474979, "grad_norm": 0.5810119342461505, "learning_rate": 3.890930388190009e-05, "loss": 0.0958, "step": 3432 }, { "epoch": 1.5618744313011828, "grad_norm": 0.8201938602063868, "learning_rate": 3.890336506591642e-05, "loss": 0.1989, "step": 3433 }, { "epoch": 1.562329390354868, "grad_norm": 0.5932602389634708, "learning_rate": 3.889742511381254e-05, "loss": 0.112, "step": 3434 }, { "epoch": 1.5627843494085534, "grad_norm": 0.7714934335425487, "learning_rate": 3.8891484026073844e-05, "loss": 0.1377, "step": 3435 }, { "epoch": 1.5632393084622382, "grad_norm": 0.6285375146744548, "learning_rate": 3.8885541803185797e-05, "loss": 0.097, "step": 3436 }, { "epoch": 1.5636942675159236, "grad_norm": 0.5792866028098201, "learning_rate": 3.887959844563399e-05, "loss": 0.1005, "step": 3437 }, { "epoch": 1.5641492265696089, "grad_norm": 0.9309683942267654, "learning_rate": 3.887365395390407e-05, "loss": 0.1256, "step": 3438 }, { "epoch": 1.564604185623294, "grad_norm": 0.8265995848216252, "learning_rate": 3.886770832848181e-05, "loss": 0.1649, "step": 3439 }, { "epoch": 1.565059144676979, "grad_norm": 0.6581713053516026, "learning_rate": 3.8861761569853045e-05, "loss": 0.1381, "step": 3440 }, { "epoch": 1.5655141037306644, "grad_norm": 0.725287345122776, "learning_rate": 3.8855813678503725e-05, "loss": 0.1028, "step": 3441 }, { "epoch": 1.5659690627843494, "grad_norm": 0.8365341105809444, "learning_rate": 3.884986465491989e-05, "loss": 0.1365, "step": 3442 }, { "epoch": 1.5664240218380345, "grad_norm": 0.8309028009923348, "learning_rate": 3.884391449958765e-05, "loss": 0.1139, "step": 3443 }, { "epoch": 1.5668789808917198, "grad_norm": 0.730212340176555, "learning_rate": 3.883796321299325e-05, "loss": 0.1502, "step": 3444 }, { "epoch": 1.567333939945405, "grad_norm": 0.6470727678379335, "learning_rate": 3.883201079562298e-05, "loss": 0.1234, "step": 3445 }, { "epoch": 1.56778889899909, "grad_norm": 0.6050693138793282, "learning_rate": 3.8826057247963244e-05, "loss": 0.1164, "step": 3446 }, { "epoch": 1.5682438580527753, "grad_norm": 0.6074173579266383, "learning_rate": 3.882010257050055e-05, "loss": 0.1552, "step": 3447 }, { "epoch": 1.5686988171064604, "grad_norm": 1.2584621225430173, "learning_rate": 3.88141467637215e-05, "loss": 0.1108, "step": 3448 }, { "epoch": 1.5691537761601455, "grad_norm": 0.6838694474896838, "learning_rate": 3.8808189828112754e-05, "loss": 0.1465, "step": 3449 }, { "epoch": 1.5696087352138308, "grad_norm": 0.6226515985679952, "learning_rate": 3.8802231764161084e-05, "loss": 0.1079, "step": 3450 }, { "epoch": 1.570063694267516, "grad_norm": 0.5789923690294155, "learning_rate": 3.879627257235337e-05, "loss": 0.1331, "step": 3451 }, { "epoch": 1.570518653321201, "grad_norm": 0.5606350553220012, "learning_rate": 3.879031225317656e-05, "loss": 0.1528, "step": 3452 }, { "epoch": 1.5709736123748863, "grad_norm": 0.5057791693319934, "learning_rate": 3.8784350807117716e-05, "loss": 0.0897, "step": 3453 }, { "epoch": 1.5714285714285714, "grad_norm": 0.5035196200499429, "learning_rate": 3.8778388234663975e-05, "loss": 0.1152, "step": 3454 }, { "epoch": 1.5718835304822565, "grad_norm": 0.5648018368016795, "learning_rate": 3.8772424536302564e-05, "loss": 0.056, "step": 3455 }, { "epoch": 1.5723384895359418, "grad_norm": 0.5884423755669475, "learning_rate": 3.876645971252082e-05, "loss": 0.11, "step": 3456 }, { "epoch": 1.5727934485896269, "grad_norm": 0.49438026311070726, "learning_rate": 3.8760493763806156e-05, "loss": 0.135, "step": 3457 }, { "epoch": 1.573248407643312, "grad_norm": 0.804546842971448, "learning_rate": 3.875452669064609e-05, "loss": 0.1843, "step": 3458 }, { "epoch": 1.5737033666969973, "grad_norm": 0.592980861642544, "learning_rate": 3.874855849352821e-05, "loss": 0.1039, "step": 3459 }, { "epoch": 1.5741583257506826, "grad_norm": 0.802960802227618, "learning_rate": 3.874258917294021e-05, "loss": 0.1592, "step": 3460 }, { "epoch": 1.5746132848043675, "grad_norm": 0.5969251596942822, "learning_rate": 3.8736618729369886e-05, "loss": 0.1253, "step": 3461 }, { "epoch": 1.5750682438580528, "grad_norm": 0.6719427532116307, "learning_rate": 3.8730647163305127e-05, "loss": 0.1014, "step": 3462 }, { "epoch": 1.575523202911738, "grad_norm": 0.610278118713002, "learning_rate": 3.872467447523388e-05, "loss": 0.1472, "step": 3463 }, { "epoch": 1.575978161965423, "grad_norm": 1.6825982709893523, "learning_rate": 3.8718700665644215e-05, "loss": 0.1941, "step": 3464 }, { "epoch": 1.5764331210191083, "grad_norm": 0.6765162462082096, "learning_rate": 3.871272573502429e-05, "loss": 0.1365, "step": 3465 }, { "epoch": 1.5768880800727936, "grad_norm": 0.6472604247896423, "learning_rate": 3.8706749683862344e-05, "loss": 0.1277, "step": 3466 }, { "epoch": 1.5773430391264787, "grad_norm": 0.676445279598829, "learning_rate": 3.87007725126467e-05, "loss": 0.0957, "step": 3467 }, { "epoch": 1.5777979981801638, "grad_norm": 1.276150700276615, "learning_rate": 3.869479422186582e-05, "loss": 0.1177, "step": 3468 }, { "epoch": 1.578252957233849, "grad_norm": 0.6473771367834977, "learning_rate": 3.868881481200818e-05, "loss": 0.0989, "step": 3469 }, { "epoch": 1.5787079162875342, "grad_norm": 0.5259284677685861, "learning_rate": 3.868283428356243e-05, "loss": 0.1206, "step": 3470 }, { "epoch": 1.5791628753412192, "grad_norm": 0.7647083339089079, "learning_rate": 3.867685263701724e-05, "loss": 0.0821, "step": 3471 }, { "epoch": 1.5796178343949046, "grad_norm": 0.5567008893293357, "learning_rate": 3.867086987286141e-05, "loss": 0.1545, "step": 3472 }, { "epoch": 1.5800727934485896, "grad_norm": 0.5448081628675032, "learning_rate": 3.866488599158385e-05, "loss": 0.1154, "step": 3473 }, { "epoch": 1.5805277525022747, "grad_norm": 0.6789292377446415, "learning_rate": 3.865890099367351e-05, "loss": 0.1046, "step": 3474 }, { "epoch": 1.58098271155596, "grad_norm": 0.6335314679380721, "learning_rate": 3.865291487961946e-05, "loss": 0.1168, "step": 3475 }, { "epoch": 1.5814376706096451, "grad_norm": 0.6499444318731863, "learning_rate": 3.864692764991087e-05, "loss": 0.1813, "step": 3476 }, { "epoch": 1.5818926296633302, "grad_norm": 0.4271423973158619, "learning_rate": 3.864093930503697e-05, "loss": 0.1532, "step": 3477 }, { "epoch": 1.5823475887170155, "grad_norm": 0.4949674574569111, "learning_rate": 3.8634949845487125e-05, "loss": 0.1236, "step": 3478 }, { "epoch": 1.5828025477707006, "grad_norm": 0.5798629386741678, "learning_rate": 3.862895927175074e-05, "loss": 0.148, "step": 3479 }, { "epoch": 1.5832575068243857, "grad_norm": 0.5722451298240867, "learning_rate": 3.862296758431736e-05, "loss": 0.1389, "step": 3480 }, { "epoch": 1.583712465878071, "grad_norm": 0.5510638752870954, "learning_rate": 3.8616974783676586e-05, "loss": 0.115, "step": 3481 }, { "epoch": 1.584167424931756, "grad_norm": 0.662329241145299, "learning_rate": 3.8610980870318126e-05, "loss": 0.1046, "step": 3482 }, { "epoch": 1.5846223839854412, "grad_norm": 0.7919065309543015, "learning_rate": 3.8604985844731775e-05, "loss": 0.1035, "step": 3483 }, { "epoch": 1.5850773430391265, "grad_norm": 0.48858165686200816, "learning_rate": 3.859898970740743e-05, "loss": 0.0735, "step": 3484 }, { "epoch": 1.5855323020928116, "grad_norm": 0.8451392064995371, "learning_rate": 3.859299245883505e-05, "loss": 0.1543, "step": 3485 }, { "epoch": 1.5859872611464967, "grad_norm": 0.7328573629490068, "learning_rate": 3.858699409950471e-05, "loss": 0.1627, "step": 3486 }, { "epoch": 1.586442220200182, "grad_norm": 0.8090659806363778, "learning_rate": 3.858099462990658e-05, "loss": 0.1517, "step": 3487 }, { "epoch": 1.5868971792538673, "grad_norm": 0.6069023596049293, "learning_rate": 3.857499405053089e-05, "loss": 0.1546, "step": 3488 }, { "epoch": 1.5873521383075522, "grad_norm": 0.5532878568348374, "learning_rate": 3.8568992361867994e-05, "loss": 0.1077, "step": 3489 }, { "epoch": 1.5878070973612375, "grad_norm": 0.5981799538828259, "learning_rate": 3.856298956440832e-05, "loss": 0.0915, "step": 3490 }, { "epoch": 1.5882620564149228, "grad_norm": 0.6258932634790428, "learning_rate": 3.8556985658642394e-05, "loss": 0.1158, "step": 3491 }, { "epoch": 1.5887170154686077, "grad_norm": 0.5466865546761314, "learning_rate": 3.855098064506082e-05, "loss": 0.0749, "step": 3492 }, { "epoch": 1.589171974522293, "grad_norm": 0.6098113696123401, "learning_rate": 3.85449745241543e-05, "loss": 0.1083, "step": 3493 }, { "epoch": 1.5896269335759783, "grad_norm": 0.6304845541006605, "learning_rate": 3.853896729641363e-05, "loss": 0.1416, "step": 3494 }, { "epoch": 1.5900818926296634, "grad_norm": 0.6441291950360863, "learning_rate": 3.853295896232969e-05, "loss": 0.1814, "step": 3495 }, { "epoch": 1.5905368516833485, "grad_norm": 0.6427705996778728, "learning_rate": 3.852694952239346e-05, "loss": 0.1521, "step": 3496 }, { "epoch": 1.5909918107370338, "grad_norm": 0.8147267305097686, "learning_rate": 3.852093897709601e-05, "loss": 0.0977, "step": 3497 }, { "epoch": 1.5914467697907189, "grad_norm": 0.6023995547887404, "learning_rate": 3.851492732692849e-05, "loss": 0.1327, "step": 3498 }, { "epoch": 1.591901728844404, "grad_norm": 0.9088139641633866, "learning_rate": 3.8508914572382126e-05, "loss": 0.1829, "step": 3499 }, { "epoch": 1.5923566878980893, "grad_norm": 0.5477998802890491, "learning_rate": 3.8502900713948276e-05, "loss": 0.0969, "step": 3500 }, { "epoch": 1.5928116469517744, "grad_norm": 0.7798119574297929, "learning_rate": 3.849688575211836e-05, "loss": 0.1771, "step": 3501 }, { "epoch": 1.5932666060054594, "grad_norm": 0.6780467442182827, "learning_rate": 3.849086968738389e-05, "loss": 0.1546, "step": 3502 }, { "epoch": 1.5937215650591448, "grad_norm": 0.7914705453664702, "learning_rate": 3.848485252023647e-05, "loss": 0.1615, "step": 3503 }, { "epoch": 1.5941765241128298, "grad_norm": 0.7915111048932848, "learning_rate": 3.847883425116781e-05, "loss": 0.1449, "step": 3504 }, { "epoch": 1.594631483166515, "grad_norm": 0.5333242322858075, "learning_rate": 3.847281488066967e-05, "loss": 0.1413, "step": 3505 }, { "epoch": 1.5950864422202002, "grad_norm": 0.518542356625765, "learning_rate": 3.8466794409233946e-05, "loss": 0.073, "step": 3506 }, { "epoch": 1.5955414012738853, "grad_norm": 0.6509730897284789, "learning_rate": 3.846077283735261e-05, "loss": 0.0811, "step": 3507 }, { "epoch": 1.5959963603275704, "grad_norm": 0.5387990540627277, "learning_rate": 3.845475016551769e-05, "loss": 0.089, "step": 3508 }, { "epoch": 1.5964513193812557, "grad_norm": 0.6145524663082544, "learning_rate": 3.844872639422136e-05, "loss": 0.1202, "step": 3509 }, { "epoch": 1.5969062784349408, "grad_norm": 0.6691820028913418, "learning_rate": 3.844270152395583e-05, "loss": 0.1023, "step": 3510 }, { "epoch": 1.597361237488626, "grad_norm": 0.5858960080996368, "learning_rate": 3.843667555521345e-05, "loss": 0.1135, "step": 3511 }, { "epoch": 1.5978161965423112, "grad_norm": 0.6560089438493053, "learning_rate": 3.843064848848662e-05, "loss": 0.1413, "step": 3512 }, { "epoch": 1.5982711555959963, "grad_norm": 0.9348397228944021, "learning_rate": 3.842462032426784e-05, "loss": 0.1692, "step": 3513 }, { "epoch": 1.5987261146496814, "grad_norm": 0.5979462760502174, "learning_rate": 3.8418591063049725e-05, "loss": 0.0959, "step": 3514 }, { "epoch": 1.5991810737033667, "grad_norm": 0.5642804535612452, "learning_rate": 3.841256070532494e-05, "loss": 0.1495, "step": 3515 }, { "epoch": 1.599636032757052, "grad_norm": 0.7365965255965145, "learning_rate": 3.840652925158626e-05, "loss": 0.1311, "step": 3516 }, { "epoch": 1.6000909918107369, "grad_norm": 0.844344568348862, "learning_rate": 3.840049670232655e-05, "loss": 0.1369, "step": 3517 }, { "epoch": 1.6005459508644222, "grad_norm": 1.0979643553082414, "learning_rate": 3.8394463058038784e-05, "loss": 0.2011, "step": 3518 }, { "epoch": 1.6010009099181075, "grad_norm": 0.6226287961222842, "learning_rate": 3.838842831921598e-05, "loss": 0.0897, "step": 3519 }, { "epoch": 1.6014558689717924, "grad_norm": 0.5204539538657657, "learning_rate": 3.838239248635126e-05, "loss": 0.1326, "step": 3520 }, { "epoch": 1.6019108280254777, "grad_norm": 0.6189772852024599, "learning_rate": 3.837635555993787e-05, "loss": 0.1234, "step": 3521 }, { "epoch": 1.602365787079163, "grad_norm": 0.6493396084987957, "learning_rate": 3.837031754046911e-05, "loss": 0.1025, "step": 3522 }, { "epoch": 1.602820746132848, "grad_norm": 0.5800306443188828, "learning_rate": 3.836427842843838e-05, "loss": 0.1314, "step": 3523 }, { "epoch": 1.6032757051865332, "grad_norm": 0.5877697481379214, "learning_rate": 3.8358238224339175e-05, "loss": 0.1318, "step": 3524 }, { "epoch": 1.6037306642402185, "grad_norm": 1.1537596870953175, "learning_rate": 3.835219692866506e-05, "loss": 0.1009, "step": 3525 }, { "epoch": 1.6041856232939036, "grad_norm": 0.6600453247122101, "learning_rate": 3.834615454190972e-05, "loss": 0.1088, "step": 3526 }, { "epoch": 1.6046405823475887, "grad_norm": 0.6638938321691121, "learning_rate": 3.8340111064566895e-05, "loss": 0.137, "step": 3527 }, { "epoch": 1.605095541401274, "grad_norm": 0.7162848841777919, "learning_rate": 3.8334066497130437e-05, "loss": 0.1219, "step": 3528 }, { "epoch": 1.605550500454959, "grad_norm": 0.6531525511243691, "learning_rate": 3.832802084009428e-05, "loss": 0.1491, "step": 3529 }, { "epoch": 1.6060054595086442, "grad_norm": 0.48116446717262973, "learning_rate": 3.832197409395245e-05, "loss": 0.0695, "step": 3530 }, { "epoch": 1.6064604185623295, "grad_norm": 0.6693798327481042, "learning_rate": 3.831592625919906e-05, "loss": 0.1501, "step": 3531 }, { "epoch": 1.6069153776160146, "grad_norm": 0.8622235342392878, "learning_rate": 3.8309877336328304e-05, "loss": 0.1237, "step": 3532 }, { "epoch": 1.6073703366696996, "grad_norm": 0.6196649732752695, "learning_rate": 3.830382732583449e-05, "loss": 0.1038, "step": 3533 }, { "epoch": 1.607825295723385, "grad_norm": 0.6067883639863182, "learning_rate": 3.829777622821198e-05, "loss": 0.0991, "step": 3534 }, { "epoch": 1.60828025477707, "grad_norm": 0.4234136743397041, "learning_rate": 3.829172404395525e-05, "loss": 0.0721, "step": 3535 }, { "epoch": 1.6087352138307551, "grad_norm": 1.5698490295266043, "learning_rate": 3.828567077355885e-05, "loss": 0.1358, "step": 3536 }, { "epoch": 1.6091901728844404, "grad_norm": 0.8244292610429251, "learning_rate": 3.8279616417517436e-05, "loss": 0.1224, "step": 3537 }, { "epoch": 1.6096451319381255, "grad_norm": 0.6174139119220778, "learning_rate": 3.827356097632574e-05, "loss": 0.1155, "step": 3538 }, { "epoch": 1.6101000909918106, "grad_norm": 0.8321190267987871, "learning_rate": 3.8267504450478586e-05, "loss": 0.0765, "step": 3539 }, { "epoch": 1.610555050045496, "grad_norm": 0.7926546958366801, "learning_rate": 3.826144684047088e-05, "loss": 0.1064, "step": 3540 }, { "epoch": 1.6110100090991812, "grad_norm": 0.728771130577452, "learning_rate": 3.8255388146797634e-05, "loss": 0.1179, "step": 3541 }, { "epoch": 1.611464968152866, "grad_norm": 0.5490944477642556, "learning_rate": 3.824932836995392e-05, "loss": 0.116, "step": 3542 }, { "epoch": 1.6119199272065514, "grad_norm": 0.6299156084258023, "learning_rate": 3.824326751043493e-05, "loss": 0.1334, "step": 3543 }, { "epoch": 1.6123748862602367, "grad_norm": 0.6430925854323227, "learning_rate": 3.8237205568735916e-05, "loss": 0.0964, "step": 3544 }, { "epoch": 1.6128298453139216, "grad_norm": 0.5628997899667181, "learning_rate": 3.823114254535226e-05, "loss": 0.1322, "step": 3545 }, { "epoch": 1.613284804367607, "grad_norm": 0.4763279924295988, "learning_rate": 3.822507844077937e-05, "loss": 0.0944, "step": 3546 }, { "epoch": 1.6137397634212922, "grad_norm": 0.5730301081761596, "learning_rate": 3.8219013255512805e-05, "loss": 0.094, "step": 3547 }, { "epoch": 1.6141947224749773, "grad_norm": 0.5116223986480919, "learning_rate": 3.821294699004816e-05, "loss": 0.0949, "step": 3548 }, { "epoch": 1.6146496815286624, "grad_norm": 0.519661798696211, "learning_rate": 3.8206879644881166e-05, "loss": 0.0684, "step": 3549 }, { "epoch": 1.6151046405823477, "grad_norm": 0.5998802298732485, "learning_rate": 3.8200811220507605e-05, "loss": 0.134, "step": 3550 }, { "epoch": 1.6155595996360328, "grad_norm": 0.7358034402501613, "learning_rate": 3.819474171742337e-05, "loss": 0.1083, "step": 3551 }, { "epoch": 1.6160145586897179, "grad_norm": 1.0505085097960734, "learning_rate": 3.818867113612442e-05, "loss": 0.1228, "step": 3552 }, { "epoch": 1.6164695177434032, "grad_norm": 0.4568428409943666, "learning_rate": 3.818259947710683e-05, "loss": 0.0671, "step": 3553 }, { "epoch": 1.6169244767970883, "grad_norm": 0.5494668003827564, "learning_rate": 3.8176526740866746e-05, "loss": 0.0983, "step": 3554 }, { "epoch": 1.6173794358507734, "grad_norm": 0.589718871288418, "learning_rate": 3.81704529279004e-05, "loss": 0.1611, "step": 3555 }, { "epoch": 1.6178343949044587, "grad_norm": 0.5680592413828929, "learning_rate": 3.816437803870412e-05, "loss": 0.1155, "step": 3556 }, { "epoch": 1.6182893539581438, "grad_norm": 0.7211485365510559, "learning_rate": 3.815830207377431e-05, "loss": 0.128, "step": 3557 }, { "epoch": 1.6187443130118289, "grad_norm": 0.8296493477706437, "learning_rate": 3.815222503360748e-05, "loss": 0.1419, "step": 3558 }, { "epoch": 1.6191992720655142, "grad_norm": 0.630930614755368, "learning_rate": 3.814614691870021e-05, "loss": 0.1182, "step": 3559 }, { "epoch": 1.6196542311191993, "grad_norm": 0.6437156821135166, "learning_rate": 3.814006772954919e-05, "loss": 0.1488, "step": 3560 }, { "epoch": 1.6201091901728844, "grad_norm": 0.5945548667118402, "learning_rate": 3.8133987466651175e-05, "loss": 0.088, "step": 3561 }, { "epoch": 1.6205641492265697, "grad_norm": 0.7557883964523753, "learning_rate": 3.812790613050301e-05, "loss": 0.129, "step": 3562 }, { "epoch": 1.6210191082802548, "grad_norm": 0.8239786314961461, "learning_rate": 3.8121823721601646e-05, "loss": 0.1243, "step": 3563 }, { "epoch": 1.6214740673339398, "grad_norm": 0.6783203623799415, "learning_rate": 3.8115740240444106e-05, "loss": 0.1086, "step": 3564 }, { "epoch": 1.6219290263876252, "grad_norm": 0.751379398545525, "learning_rate": 3.8109655687527504e-05, "loss": 0.1196, "step": 3565 }, { "epoch": 1.6223839854413102, "grad_norm": 0.5770835099325122, "learning_rate": 3.810357006334903e-05, "loss": 0.0979, "step": 3566 }, { "epoch": 1.6228389444949953, "grad_norm": 0.897558611717571, "learning_rate": 3.8097483368406e-05, "loss": 0.2242, "step": 3567 }, { "epoch": 1.6232939035486806, "grad_norm": 0.5358030656078929, "learning_rate": 3.809139560319577e-05, "loss": 0.1164, "step": 3568 }, { "epoch": 1.623748862602366, "grad_norm": 0.5105783311083074, "learning_rate": 3.808530676821581e-05, "loss": 0.0659, "step": 3569 }, { "epoch": 1.6242038216560508, "grad_norm": 0.763602064659497, "learning_rate": 3.8079216863963676e-05, "loss": 0.1462, "step": 3570 }, { "epoch": 1.6246587807097361, "grad_norm": 0.49861182578616947, "learning_rate": 3.807312589093701e-05, "loss": 0.1409, "step": 3571 }, { "epoch": 1.6251137397634214, "grad_norm": 0.5604146983106649, "learning_rate": 3.8067033849633525e-05, "loss": 0.1168, "step": 3572 }, { "epoch": 1.6255686988171063, "grad_norm": 0.9538161569807225, "learning_rate": 3.8060940740551056e-05, "loss": 0.1767, "step": 3573 }, { "epoch": 1.6260236578707916, "grad_norm": 0.4776671766756012, "learning_rate": 3.805484656418748e-05, "loss": 0.0921, "step": 3574 }, { "epoch": 1.626478616924477, "grad_norm": 0.6501840153100107, "learning_rate": 3.80487513210408e-05, "loss": 0.1216, "step": 3575 }, { "epoch": 1.626933575978162, "grad_norm": 0.7457905709007109, "learning_rate": 3.80426550116091e-05, "loss": 0.1205, "step": 3576 }, { "epoch": 1.627388535031847, "grad_norm": 0.5674863948491703, "learning_rate": 3.803655763639053e-05, "loss": 0.0763, "step": 3577 }, { "epoch": 1.6278434940855324, "grad_norm": 0.5064637637192754, "learning_rate": 3.803045919588333e-05, "loss": 0.086, "step": 3578 }, { "epoch": 1.6282984531392175, "grad_norm": 0.8125850651889719, "learning_rate": 3.8024359690585856e-05, "loss": 0.1239, "step": 3579 }, { "epoch": 1.6287534121929026, "grad_norm": 0.5612829511871423, "learning_rate": 3.8018259120996525e-05, "loss": 0.1119, "step": 3580 }, { "epoch": 1.629208371246588, "grad_norm": 0.5789698059625424, "learning_rate": 3.801215748761385e-05, "loss": 0.1016, "step": 3581 }, { "epoch": 1.629663330300273, "grad_norm": 1.0983774173549339, "learning_rate": 3.800605479093643e-05, "loss": 0.1841, "step": 3582 }, { "epoch": 1.630118289353958, "grad_norm": 0.6538207522370052, "learning_rate": 3.7999951031462945e-05, "loss": 0.117, "step": 3583 }, { "epoch": 1.6305732484076434, "grad_norm": 0.5321176282870249, "learning_rate": 3.7993846209692176e-05, "loss": 0.1198, "step": 3584 }, { "epoch": 1.6310282074613285, "grad_norm": 0.6582808979871314, "learning_rate": 3.798774032612297e-05, "loss": 0.0925, "step": 3585 }, { "epoch": 1.6314831665150136, "grad_norm": 0.4368239188669831, "learning_rate": 3.7981633381254265e-05, "loss": 0.0716, "step": 3586 }, { "epoch": 1.6319381255686989, "grad_norm": 0.5003591460777104, "learning_rate": 3.797552537558511e-05, "loss": 0.0897, "step": 3587 }, { "epoch": 1.632393084622384, "grad_norm": 0.7447783762469441, "learning_rate": 3.796941630961463e-05, "loss": 0.1328, "step": 3588 }, { "epoch": 1.632848043676069, "grad_norm": 0.738254381685339, "learning_rate": 3.796330618384201e-05, "loss": 0.1318, "step": 3589 }, { "epoch": 1.6333030027297544, "grad_norm": 0.5959333619608442, "learning_rate": 3.7957194998766547e-05, "loss": 0.1124, "step": 3590 }, { "epoch": 1.6337579617834395, "grad_norm": 0.9120665752592264, "learning_rate": 3.7951082754887635e-05, "loss": 0.112, "step": 3591 }, { "epoch": 1.6342129208371245, "grad_norm": 0.6378319671699785, "learning_rate": 3.794496945270471e-05, "loss": 0.101, "step": 3592 }, { "epoch": 1.6346678798908099, "grad_norm": 0.7312620124891068, "learning_rate": 3.793885509271735e-05, "loss": 0.1343, "step": 3593 }, { "epoch": 1.635122838944495, "grad_norm": 0.40929988030137177, "learning_rate": 3.793273967542519e-05, "loss": 0.0742, "step": 3594 }, { "epoch": 1.63557779799818, "grad_norm": 1.0244080568085272, "learning_rate": 3.792662320132794e-05, "loss": 0.178, "step": 3595 }, { "epoch": 1.6360327570518653, "grad_norm": 0.5283301972715275, "learning_rate": 3.7920505670925423e-05, "loss": 0.1092, "step": 3596 }, { "epoch": 1.6364877161055507, "grad_norm": 0.9969636199118833, "learning_rate": 3.791438708471752e-05, "loss": 0.1289, "step": 3597 }, { "epoch": 1.6369426751592355, "grad_norm": 1.040297943429863, "learning_rate": 3.7908267443204224e-05, "loss": 0.2287, "step": 3598 }, { "epoch": 1.6373976342129208, "grad_norm": 0.7671313998773381, "learning_rate": 3.7902146746885616e-05, "loss": 0.1364, "step": 3599 }, { "epoch": 1.6378525932666061, "grad_norm": 0.6106687172121286, "learning_rate": 3.789602499626183e-05, "loss": 0.1028, "step": 3600 }, { "epoch": 1.638307552320291, "grad_norm": 0.6226287348989819, "learning_rate": 3.788990219183314e-05, "loss": 0.1182, "step": 3601 }, { "epoch": 1.6387625113739763, "grad_norm": 0.6117911747380007, "learning_rate": 3.788377833409984e-05, "loss": 0.0999, "step": 3602 }, { "epoch": 1.6392174704276616, "grad_norm": 0.47633359569658307, "learning_rate": 3.7877653423562366e-05, "loss": 0.0921, "step": 3603 }, { "epoch": 1.6396724294813467, "grad_norm": 0.5840234218673793, "learning_rate": 3.787152746072119e-05, "loss": 0.1221, "step": 3604 }, { "epoch": 1.6401273885350318, "grad_norm": 1.675960014649054, "learning_rate": 3.786540044607693e-05, "loss": 0.1352, "step": 3605 }, { "epoch": 1.6405823475887171, "grad_norm": 0.7223598470705921, "learning_rate": 3.785927238013024e-05, "loss": 0.1095, "step": 3606 }, { "epoch": 1.6410373066424022, "grad_norm": 0.4373478703232099, "learning_rate": 3.785314326338189e-05, "loss": 0.1035, "step": 3607 }, { "epoch": 1.6414922656960873, "grad_norm": 0.5819185856964102, "learning_rate": 3.784701309633272e-05, "loss": 0.1339, "step": 3608 }, { "epoch": 1.6419472247497726, "grad_norm": 1.2824543340627066, "learning_rate": 3.784088187948365e-05, "loss": 0.1272, "step": 3609 }, { "epoch": 1.6424021838034577, "grad_norm": 0.6998245273900676, "learning_rate": 3.78347496133357e-05, "loss": 0.0871, "step": 3610 }, { "epoch": 1.6428571428571428, "grad_norm": 0.5258479822661923, "learning_rate": 3.782861629838997e-05, "loss": 0.1716, "step": 3611 }, { "epoch": 1.643312101910828, "grad_norm": 0.5940882892940856, "learning_rate": 3.782248193514766e-05, "loss": 0.1187, "step": 3612 }, { "epoch": 1.6437670609645132, "grad_norm": 0.7462113078380159, "learning_rate": 3.781634652411002e-05, "loss": 0.1317, "step": 3613 }, { "epoch": 1.6442220200181983, "grad_norm": 0.4861374083063201, "learning_rate": 3.781021006577843e-05, "loss": 0.0857, "step": 3614 }, { "epoch": 1.6446769790718836, "grad_norm": 0.5686445038277734, "learning_rate": 3.7804072560654316e-05, "loss": 0.1053, "step": 3615 }, { "epoch": 1.6451319381255687, "grad_norm": 0.841540473197962, "learning_rate": 3.7797934009239224e-05, "loss": 0.1059, "step": 3616 }, { "epoch": 1.6455868971792538, "grad_norm": 0.574171865256877, "learning_rate": 3.7791794412034756e-05, "loss": 0.1299, "step": 3617 }, { "epoch": 1.646041856232939, "grad_norm": 0.7110102561230327, "learning_rate": 3.778565376954261e-05, "loss": 0.1111, "step": 3618 }, { "epoch": 1.6464968152866242, "grad_norm": 0.7600809961097583, "learning_rate": 3.7779512082264584e-05, "loss": 0.1092, "step": 3619 }, { "epoch": 1.6469517743403093, "grad_norm": 0.5535806462390692, "learning_rate": 3.777336935070255e-05, "loss": 0.135, "step": 3620 }, { "epoch": 1.6474067333939946, "grad_norm": 0.6595947503173879, "learning_rate": 3.7767225575358434e-05, "loss": 0.1317, "step": 3621 }, { "epoch": 1.6478616924476797, "grad_norm": 0.6883705656186464, "learning_rate": 3.776108075673432e-05, "loss": 0.1169, "step": 3622 }, { "epoch": 1.6483166515013647, "grad_norm": 0.6646833793143885, "learning_rate": 3.7754934895332304e-05, "loss": 0.1402, "step": 3623 }, { "epoch": 1.64877161055505, "grad_norm": 0.5872626126862396, "learning_rate": 3.774878799165462e-05, "loss": 0.1604, "step": 3624 }, { "epoch": 1.6492265696087354, "grad_norm": 1.0427417799787666, "learning_rate": 3.774264004620355e-05, "loss": 0.1415, "step": 3625 }, { "epoch": 1.6496815286624202, "grad_norm": 0.6198982735938007, "learning_rate": 3.773649105948147e-05, "loss": 0.08, "step": 3626 }, { "epoch": 1.6501364877161055, "grad_norm": 0.5538250057266526, "learning_rate": 3.7730341031990875e-05, "loss": 0.1401, "step": 3627 }, { "epoch": 1.6505914467697909, "grad_norm": 0.6952183962689231, "learning_rate": 3.772418996423428e-05, "loss": 0.1031, "step": 3628 }, { "epoch": 1.6510464058234757, "grad_norm": 0.8577331175023823, "learning_rate": 3.771803785671436e-05, "loss": 0.176, "step": 3629 }, { "epoch": 1.651501364877161, "grad_norm": 0.5225077169383394, "learning_rate": 3.771188470993382e-05, "loss": 0.087, "step": 3630 }, { "epoch": 1.6519563239308463, "grad_norm": 0.65703912112192, "learning_rate": 3.7705730524395464e-05, "loss": 0.1358, "step": 3631 }, { "epoch": 1.6524112829845314, "grad_norm": 0.7578332195389625, "learning_rate": 3.769957530060219e-05, "loss": 0.1218, "step": 3632 }, { "epoch": 1.6528662420382165, "grad_norm": 0.6162684265402045, "learning_rate": 3.7693419039056966e-05, "loss": 0.1261, "step": 3633 }, { "epoch": 1.6533212010919018, "grad_norm": 0.6483352288324249, "learning_rate": 3.7687261740262866e-05, "loss": 0.0961, "step": 3634 }, { "epoch": 1.653776160145587, "grad_norm": 0.6050800083807413, "learning_rate": 3.768110340472304e-05, "loss": 0.0921, "step": 3635 }, { "epoch": 1.654231119199272, "grad_norm": 0.9607959414658741, "learning_rate": 3.767494403294069e-05, "loss": 0.1956, "step": 3636 }, { "epoch": 1.6546860782529573, "grad_norm": 0.7986114505776511, "learning_rate": 3.766878362541918e-05, "loss": 0.1132, "step": 3637 }, { "epoch": 1.6551410373066424, "grad_norm": 0.8089451541586596, "learning_rate": 3.766262218266186e-05, "loss": 0.0754, "step": 3638 }, { "epoch": 1.6555959963603275, "grad_norm": 0.7706173647962418, "learning_rate": 3.765645970517225e-05, "loss": 0.1956, "step": 3639 }, { "epoch": 1.6560509554140128, "grad_norm": 0.5648956520882266, "learning_rate": 3.765029619345391e-05, "loss": 0.1331, "step": 3640 }, { "epoch": 1.656505914467698, "grad_norm": 0.5697895054879683, "learning_rate": 3.7644131648010494e-05, "loss": 0.1223, "step": 3641 }, { "epoch": 1.656960873521383, "grad_norm": 0.6362738166871537, "learning_rate": 3.763796606934574e-05, "loss": 0.12, "step": 3642 }, { "epoch": 1.6574158325750683, "grad_norm": 1.4948365969358601, "learning_rate": 3.7631799457963466e-05, "loss": 0.1575, "step": 3643 }, { "epoch": 1.6578707916287534, "grad_norm": 0.6346844259212422, "learning_rate": 3.762563181436759e-05, "loss": 0.1138, "step": 3644 }, { "epoch": 1.6583257506824385, "grad_norm": 0.633997490355106, "learning_rate": 3.7619463139062095e-05, "loss": 0.1213, "step": 3645 }, { "epoch": 1.6587807097361238, "grad_norm": 0.5001200122897566, "learning_rate": 3.7613293432551065e-05, "loss": 0.1186, "step": 3646 }, { "epoch": 1.6592356687898089, "grad_norm": 0.7059760598576926, "learning_rate": 3.760712269533866e-05, "loss": 0.1361, "step": 3647 }, { "epoch": 1.659690627843494, "grad_norm": 0.7840202796214972, "learning_rate": 3.760095092792911e-05, "loss": 0.1303, "step": 3648 }, { "epoch": 1.6601455868971793, "grad_norm": 0.5676339421793534, "learning_rate": 3.759477813082677e-05, "loss": 0.1169, "step": 3649 }, { "epoch": 1.6606005459508644, "grad_norm": 0.5933125326706672, "learning_rate": 3.7588604304536024e-05, "loss": 0.0966, "step": 3650 }, { "epoch": 1.6610555050045495, "grad_norm": 0.7732554112616696, "learning_rate": 3.7582429449561396e-05, "loss": 0.1188, "step": 3651 }, { "epoch": 1.6615104640582348, "grad_norm": 0.6600797955638447, "learning_rate": 3.757625356640745e-05, "loss": 0.1269, "step": 3652 }, { "epoch": 1.66196542311192, "grad_norm": 0.7646744253358221, "learning_rate": 3.757007665557886e-05, "loss": 0.1505, "step": 3653 }, { "epoch": 1.662420382165605, "grad_norm": 0.5940827501445668, "learning_rate": 3.756389871758036e-05, "loss": 0.1258, "step": 3654 }, { "epoch": 1.6628753412192903, "grad_norm": 0.4933523874033883, "learning_rate": 3.7557719752916806e-05, "loss": 0.0878, "step": 3655 }, { "epoch": 1.6633303002729756, "grad_norm": 0.72129516538024, "learning_rate": 3.75515397620931e-05, "loss": 0.0974, "step": 3656 }, { "epoch": 1.6637852593266604, "grad_norm": 0.6032401000935992, "learning_rate": 3.754535874561424e-05, "loss": 0.1048, "step": 3657 }, { "epoch": 1.6642402183803457, "grad_norm": 0.8566928529770859, "learning_rate": 3.7539176703985334e-05, "loss": 0.1496, "step": 3658 }, { "epoch": 1.664695177434031, "grad_norm": 0.6757964434142171, "learning_rate": 3.7532993637711525e-05, "loss": 0.1118, "step": 3659 }, { "epoch": 1.6651501364877161, "grad_norm": 0.6758554863377769, "learning_rate": 3.752680954729807e-05, "loss": 0.0759, "step": 3660 }, { "epoch": 1.6656050955414012, "grad_norm": 0.6362713798757196, "learning_rate": 3.752062443325032e-05, "loss": 0.1681, "step": 3661 }, { "epoch": 1.6660600545950865, "grad_norm": 0.5149362850571993, "learning_rate": 3.751443829607368e-05, "loss": 0.089, "step": 3662 }, { "epoch": 1.6665150136487716, "grad_norm": 0.4028225976473529, "learning_rate": 3.7508251136273656e-05, "loss": 0.0786, "step": 3663 }, { "epoch": 1.6669699727024567, "grad_norm": 0.7336714194894383, "learning_rate": 3.7502062954355834e-05, "loss": 0.1542, "step": 3664 }, { "epoch": 1.667424931756142, "grad_norm": 0.7241551299524682, "learning_rate": 3.749587375082589e-05, "loss": 0.1244, "step": 3665 }, { "epoch": 1.6678798908098271, "grad_norm": 0.5619342710480558, "learning_rate": 3.7489683526189574e-05, "loss": 0.1389, "step": 3666 }, { "epoch": 1.6683348498635122, "grad_norm": 0.6061897310908223, "learning_rate": 3.748349228095272e-05, "loss": 0.1336, "step": 3667 }, { "epoch": 1.6687898089171975, "grad_norm": 0.46639931243704646, "learning_rate": 3.747730001562125e-05, "loss": 0.0798, "step": 3668 }, { "epoch": 1.6692447679708826, "grad_norm": 0.5652656742263508, "learning_rate": 3.747110673070117e-05, "loss": 0.1517, "step": 3669 }, { "epoch": 1.6696997270245677, "grad_norm": 0.6659581641167922, "learning_rate": 3.746491242669857e-05, "loss": 0.0967, "step": 3670 }, { "epoch": 1.670154686078253, "grad_norm": 0.5829610144276582, "learning_rate": 3.7458717104119615e-05, "loss": 0.1016, "step": 3671 }, { "epoch": 1.670609645131938, "grad_norm": 1.7286248846674817, "learning_rate": 3.745252076347057e-05, "loss": 0.1012, "step": 3672 }, { "epoch": 1.6710646041856232, "grad_norm": 0.6346542494060575, "learning_rate": 3.7446323405257754e-05, "loss": 0.154, "step": 3673 }, { "epoch": 1.6715195632393085, "grad_norm": 0.5391885426307604, "learning_rate": 3.744012502998759e-05, "loss": 0.0942, "step": 3674 }, { "epoch": 1.6719745222929936, "grad_norm": 1.0501038230319506, "learning_rate": 3.74339256381666e-05, "loss": 0.1426, "step": 3675 }, { "epoch": 1.6724294813466787, "grad_norm": 0.5819622679114493, "learning_rate": 3.7427725230301356e-05, "loss": 0.1428, "step": 3676 }, { "epoch": 1.672884440400364, "grad_norm": 0.9435060011018824, "learning_rate": 3.742152380689853e-05, "loss": 0.1244, "step": 3677 }, { "epoch": 1.673339399454049, "grad_norm": 0.5286895843300911, "learning_rate": 3.741532136846487e-05, "loss": 0.0732, "step": 3678 }, { "epoch": 1.6737943585077342, "grad_norm": 0.631285589443201, "learning_rate": 3.740911791550722e-05, "loss": 0.0909, "step": 3679 }, { "epoch": 1.6742493175614195, "grad_norm": 0.5746915583860749, "learning_rate": 3.740291344853249e-05, "loss": 0.1231, "step": 3680 }, { "epoch": 1.6747042766151048, "grad_norm": 1.0287555583965, "learning_rate": 3.739670796804767e-05, "loss": 0.1331, "step": 3681 }, { "epoch": 1.6751592356687897, "grad_norm": 0.9487308447654705, "learning_rate": 3.739050147455988e-05, "loss": 0.1569, "step": 3682 }, { "epoch": 1.675614194722475, "grad_norm": 0.6088012934878856, "learning_rate": 3.7384293968576254e-05, "loss": 0.1677, "step": 3683 }, { "epoch": 1.6760691537761603, "grad_norm": 1.770105589210368, "learning_rate": 3.737808545060405e-05, "loss": 0.157, "step": 3684 }, { "epoch": 1.6765241128298451, "grad_norm": 0.5683662063778097, "learning_rate": 3.737187592115061e-05, "loss": 0.097, "step": 3685 }, { "epoch": 1.6769790718835305, "grad_norm": 0.7052530415346508, "learning_rate": 3.736566538072334e-05, "loss": 0.0819, "step": 3686 }, { "epoch": 1.6774340309372158, "grad_norm": 0.6035897226614633, "learning_rate": 3.7359453829829734e-05, "loss": 0.1252, "step": 3687 }, { "epoch": 1.6778889899909009, "grad_norm": 0.6754003898388826, "learning_rate": 3.735324126897737e-05, "loss": 0.1587, "step": 3688 }, { "epoch": 1.678343949044586, "grad_norm": 1.1978042482777937, "learning_rate": 3.734702769867393e-05, "loss": 0.1266, "step": 3689 }, { "epoch": 1.6787989080982713, "grad_norm": 0.7119560868637919, "learning_rate": 3.7340813119427135e-05, "loss": 0.1374, "step": 3690 }, { "epoch": 1.6792538671519563, "grad_norm": 0.5873662740062752, "learning_rate": 3.733459753174482e-05, "loss": 0.0894, "step": 3691 }, { "epoch": 1.6797088262056414, "grad_norm": 0.5861871414663905, "learning_rate": 3.73283809361349e-05, "loss": 0.0961, "step": 3692 }, { "epoch": 1.6801637852593267, "grad_norm": 0.5500039349705202, "learning_rate": 3.732216333310537e-05, "loss": 0.101, "step": 3693 }, { "epoch": 1.6806187443130118, "grad_norm": 0.7115188733635399, "learning_rate": 3.73159447231643e-05, "loss": 0.0869, "step": 3694 }, { "epoch": 1.681073703366697, "grad_norm": 0.6033911144201342, "learning_rate": 3.7309725106819835e-05, "loss": 0.1363, "step": 3695 }, { "epoch": 1.6815286624203822, "grad_norm": 0.5851745994926699, "learning_rate": 3.7303504484580235e-05, "loss": 0.1019, "step": 3696 }, { "epoch": 1.6819836214740673, "grad_norm": 0.9811025649169238, "learning_rate": 3.729728285695381e-05, "loss": 0.1503, "step": 3697 }, { "epoch": 1.6824385805277524, "grad_norm": 0.5202464230033237, "learning_rate": 3.729106022444895e-05, "loss": 0.1005, "step": 3698 }, { "epoch": 1.6828935395814377, "grad_norm": 0.5571022329919145, "learning_rate": 3.728483658757417e-05, "loss": 0.078, "step": 3699 }, { "epoch": 1.6833484986351228, "grad_norm": 0.6363286546672933, "learning_rate": 3.7278611946838015e-05, "loss": 0.1107, "step": 3700 }, { "epoch": 1.683803457688808, "grad_norm": 0.7176436271239127, "learning_rate": 3.727238630274914e-05, "loss": 0.1767, "step": 3701 }, { "epoch": 1.6842584167424932, "grad_norm": 0.8839025984381536, "learning_rate": 3.726615965581628e-05, "loss": 0.2348, "step": 3702 }, { "epoch": 1.6847133757961783, "grad_norm": 1.019252853529848, "learning_rate": 3.725993200654825e-05, "loss": 0.1704, "step": 3703 }, { "epoch": 1.6851683348498634, "grad_norm": 0.7081008796721364, "learning_rate": 3.725370335545394e-05, "loss": 0.1629, "step": 3704 }, { "epoch": 1.6856232939035487, "grad_norm": 0.5990053871288287, "learning_rate": 3.7247473703042325e-05, "loss": 0.105, "step": 3705 }, { "epoch": 1.686078252957234, "grad_norm": 0.6456774821408616, "learning_rate": 3.7241243049822474e-05, "loss": 0.1158, "step": 3706 }, { "epoch": 1.6865332120109189, "grad_norm": 0.664187104849711, "learning_rate": 3.723501139630352e-05, "loss": 0.1004, "step": 3707 }, { "epoch": 1.6869881710646042, "grad_norm": 0.5434730667961121, "learning_rate": 3.722877874299469e-05, "loss": 0.1241, "step": 3708 }, { "epoch": 1.6874431301182895, "grad_norm": 0.7904505649011287, "learning_rate": 3.722254509040527e-05, "loss": 0.1336, "step": 3709 }, { "epoch": 1.6878980891719744, "grad_norm": 0.5358495237116961, "learning_rate": 3.721631043904468e-05, "loss": 0.1122, "step": 3710 }, { "epoch": 1.6883530482256597, "grad_norm": 0.6392103463249674, "learning_rate": 3.721007478942236e-05, "loss": 0.095, "step": 3711 }, { "epoch": 1.688808007279345, "grad_norm": 0.8835231913764461, "learning_rate": 3.7203838142047874e-05, "loss": 0.1519, "step": 3712 }, { "epoch": 1.68926296633303, "grad_norm": 0.45012745327806786, "learning_rate": 3.719760049743084e-05, "loss": 0.1014, "step": 3713 }, { "epoch": 1.6897179253867152, "grad_norm": 0.8172884187868517, "learning_rate": 3.7191361856080985e-05, "loss": 0.1688, "step": 3714 }, { "epoch": 1.6901728844404005, "grad_norm": 0.4959382626790703, "learning_rate": 3.71851222185081e-05, "loss": 0.0987, "step": 3715 }, { "epoch": 1.6906278434940856, "grad_norm": 0.783904001938886, "learning_rate": 3.7178881585222035e-05, "loss": 0.0919, "step": 3716 }, { "epoch": 1.6910828025477707, "grad_norm": 0.6048807257250242, "learning_rate": 3.717263995673278e-05, "loss": 0.1142, "step": 3717 }, { "epoch": 1.691537761601456, "grad_norm": 1.0610880457594716, "learning_rate": 3.7166397333550355e-05, "loss": 0.1504, "step": 3718 }, { "epoch": 1.691992720655141, "grad_norm": 0.8585653974999917, "learning_rate": 3.7160153716184885e-05, "loss": 0.1214, "step": 3719 }, { "epoch": 1.6924476797088261, "grad_norm": 0.5590386785809927, "learning_rate": 3.7153909105146565e-05, "loss": 0.0944, "step": 3720 }, { "epoch": 1.6929026387625115, "grad_norm": 0.5982624659848622, "learning_rate": 3.714766350094569e-05, "loss": 0.0961, "step": 3721 }, { "epoch": 1.6933575978161965, "grad_norm": 0.6042960844446889, "learning_rate": 3.7141416904092604e-05, "loss": 0.1823, "step": 3722 }, { "epoch": 1.6938125568698816, "grad_norm": 0.5130829697773571, "learning_rate": 3.713516931509775e-05, "loss": 0.1273, "step": 3723 }, { "epoch": 1.694267515923567, "grad_norm": 0.5131402385277728, "learning_rate": 3.712892073447168e-05, "loss": 0.1454, "step": 3724 }, { "epoch": 1.694722474977252, "grad_norm": 3.9476686097435514, "learning_rate": 3.712267116272497e-05, "loss": 0.1666, "step": 3725 }, { "epoch": 1.6951774340309371, "grad_norm": 0.717753277367355, "learning_rate": 3.711642060036832e-05, "loss": 0.1162, "step": 3726 }, { "epoch": 1.6956323930846224, "grad_norm": 0.6571089054026673, "learning_rate": 3.711016904791249e-05, "loss": 0.117, "step": 3727 }, { "epoch": 1.6960873521383075, "grad_norm": 0.5867472663801584, "learning_rate": 3.710391650586834e-05, "loss": 0.1168, "step": 3728 }, { "epoch": 1.6965423111919926, "grad_norm": 0.5127374305289065, "learning_rate": 3.709766297474679e-05, "loss": 0.1027, "step": 3729 }, { "epoch": 1.696997270245678, "grad_norm": 0.7128282538797666, "learning_rate": 3.709140845505886e-05, "loss": 0.1303, "step": 3730 }, { "epoch": 1.697452229299363, "grad_norm": 0.7409091602099744, "learning_rate": 3.7085152947315635e-05, "loss": 0.1302, "step": 3731 }, { "epoch": 1.697907188353048, "grad_norm": 0.6060023805864273, "learning_rate": 3.707889645202829e-05, "loss": 0.1071, "step": 3732 }, { "epoch": 1.6983621474067334, "grad_norm": 0.6044073536062164, "learning_rate": 3.707263896970807e-05, "loss": 0.0926, "step": 3733 }, { "epoch": 1.6988171064604187, "grad_norm": 0.6683654906206411, "learning_rate": 3.706638050086631e-05, "loss": 0.102, "step": 3734 }, { "epoch": 1.6992720655141036, "grad_norm": 0.5834634047971878, "learning_rate": 3.706012104601443e-05, "loss": 0.1432, "step": 3735 }, { "epoch": 1.699727024567789, "grad_norm": 1.1157274766555358, "learning_rate": 3.705386060566393e-05, "loss": 0.0912, "step": 3736 }, { "epoch": 1.7001819836214742, "grad_norm": 0.8554418287991957, "learning_rate": 3.704759918032636e-05, "loss": 0.2199, "step": 3737 }, { "epoch": 1.700636942675159, "grad_norm": 0.603884901129916, "learning_rate": 3.70413367705134e-05, "loss": 0.1829, "step": 3738 }, { "epoch": 1.7010919017288444, "grad_norm": 0.5183058801254687, "learning_rate": 3.7035073376736775e-05, "loss": 0.0842, "step": 3739 }, { "epoch": 1.7015468607825297, "grad_norm": 0.7624269839179334, "learning_rate": 3.7028808999508306e-05, "loss": 0.1721, "step": 3740 }, { "epoch": 1.7020018198362148, "grad_norm": 0.6817768711477372, "learning_rate": 3.702254363933989e-05, "loss": 0.121, "step": 3741 }, { "epoch": 1.7024567788898999, "grad_norm": 0.7811842853490489, "learning_rate": 3.7016277296743496e-05, "loss": 0.1265, "step": 3742 }, { "epoch": 1.7029117379435852, "grad_norm": 0.5588265436100905, "learning_rate": 3.7010009972231184e-05, "loss": 0.136, "step": 3743 }, { "epoch": 1.7033666969972703, "grad_norm": 0.531080166834599, "learning_rate": 3.700374166631509e-05, "loss": 0.1152, "step": 3744 }, { "epoch": 1.7038216560509554, "grad_norm": 0.5603295985155842, "learning_rate": 3.699747237950745e-05, "loss": 0.0976, "step": 3745 }, { "epoch": 1.7042766151046407, "grad_norm": 0.5833157467613759, "learning_rate": 3.6991202112320544e-05, "loss": 0.1211, "step": 3746 }, { "epoch": 1.7047315741583258, "grad_norm": 0.5510539806179937, "learning_rate": 3.698493086526674e-05, "loss": 0.0911, "step": 3747 }, { "epoch": 1.7051865332120109, "grad_norm": 0.6574361503043761, "learning_rate": 3.6978658638858524e-05, "loss": 0.1087, "step": 3748 }, { "epoch": 1.7056414922656962, "grad_norm": 0.6775889238324545, "learning_rate": 3.6972385433608415e-05, "loss": 0.1349, "step": 3749 }, { "epoch": 1.7060964513193813, "grad_norm": 0.46499973744061796, "learning_rate": 3.696611125002904e-05, "loss": 0.0968, "step": 3750 }, { "epoch": 1.7065514103730663, "grad_norm": 0.9392517464832859, "learning_rate": 3.695983608863308e-05, "loss": 0.1213, "step": 3751 }, { "epoch": 1.7070063694267517, "grad_norm": 0.9388085823018968, "learning_rate": 3.695355994993333e-05, "loss": 0.1425, "step": 3752 }, { "epoch": 1.7074613284804367, "grad_norm": 0.6375281521549688, "learning_rate": 3.6947282834442645e-05, "loss": 0.1046, "step": 3753 }, { "epoch": 1.7079162875341218, "grad_norm": 0.6353030167864213, "learning_rate": 3.694100474267396e-05, "loss": 0.1299, "step": 3754 }, { "epoch": 1.7083712465878071, "grad_norm": 0.7334044640088319, "learning_rate": 3.6934725675140295e-05, "loss": 0.1087, "step": 3755 }, { "epoch": 1.7088262056414922, "grad_norm": 0.6756689192926467, "learning_rate": 3.692844563235474e-05, "loss": 0.092, "step": 3756 }, { "epoch": 1.7092811646951773, "grad_norm": 0.8568043306982238, "learning_rate": 3.692216461483047e-05, "loss": 0.1275, "step": 3757 }, { "epoch": 1.7097361237488626, "grad_norm": 0.5881882481579416, "learning_rate": 3.691588262308076e-05, "loss": 0.1119, "step": 3758 }, { "epoch": 1.7101910828025477, "grad_norm": 1.1972381658544693, "learning_rate": 3.6909599657618925e-05, "loss": 0.1406, "step": 3759 }, { "epoch": 1.7106460418562328, "grad_norm": 0.5169663463862318, "learning_rate": 3.69033157189584e-05, "loss": 0.0919, "step": 3760 }, { "epoch": 1.7111010009099181, "grad_norm": 0.5474096589042428, "learning_rate": 3.6897030807612655e-05, "loss": 0.0823, "step": 3761 }, { "epoch": 1.7115559599636034, "grad_norm": 0.6496918042240046, "learning_rate": 3.689074492409529e-05, "loss": 0.1555, "step": 3762 }, { "epoch": 1.7120109190172883, "grad_norm": 0.567877195853652, "learning_rate": 3.6884458068919934e-05, "loss": 0.1309, "step": 3763 }, { "epoch": 1.7124658780709736, "grad_norm": 0.5903074415828129, "learning_rate": 3.6878170242600346e-05, "loss": 0.1103, "step": 3764 }, { "epoch": 1.712920837124659, "grad_norm": 0.6719502773591889, "learning_rate": 3.687188144565033e-05, "loss": 0.1291, "step": 3765 }, { "epoch": 1.7133757961783438, "grad_norm": 0.7176771929691711, "learning_rate": 3.686559167858377e-05, "loss": 0.0912, "step": 3766 }, { "epoch": 1.713830755232029, "grad_norm": 0.4792112186897858, "learning_rate": 3.6859300941914645e-05, "loss": 0.0953, "step": 3767 }, { "epoch": 1.7142857142857144, "grad_norm": 0.7084493318537436, "learning_rate": 3.6853009236157e-05, "loss": 0.1675, "step": 3768 }, { "epoch": 1.7147406733393995, "grad_norm": 0.6897698624573239, "learning_rate": 3.6846716561824965e-05, "loss": 0.1103, "step": 3769 }, { "epoch": 1.7151956323930846, "grad_norm": 0.5633664219946678, "learning_rate": 3.684042291943276e-05, "loss": 0.0934, "step": 3770 }, { "epoch": 1.71565059144677, "grad_norm": 0.5465538261159945, "learning_rate": 3.683412830949466e-05, "loss": 0.1886, "step": 3771 }, { "epoch": 1.716105550500455, "grad_norm": 0.6430362193478963, "learning_rate": 3.6827832732525044e-05, "loss": 0.106, "step": 3772 }, { "epoch": 1.71656050955414, "grad_norm": 0.9060089068788008, "learning_rate": 3.6821536189038345e-05, "loss": 0.1061, "step": 3773 }, { "epoch": 1.7170154686078254, "grad_norm": 0.6657387185883136, "learning_rate": 3.681523867954909e-05, "loss": 0.0968, "step": 3774 }, { "epoch": 1.7174704276615105, "grad_norm": 0.5805761167111708, "learning_rate": 3.6808940204571896e-05, "loss": 0.1147, "step": 3775 }, { "epoch": 1.7179253867151956, "grad_norm": 0.7224393975543414, "learning_rate": 3.680264076462143e-05, "loss": 0.1263, "step": 3776 }, { "epoch": 1.7183803457688809, "grad_norm": 0.6661064343472831, "learning_rate": 3.6796340360212465e-05, "loss": 0.1003, "step": 3777 }, { "epoch": 1.718835304822566, "grad_norm": 0.6107134154952316, "learning_rate": 3.679003899185983e-05, "loss": 0.1164, "step": 3778 }, { "epoch": 1.719290263876251, "grad_norm": 0.6612744240880679, "learning_rate": 3.678373666007846e-05, "loss": 0.1221, "step": 3779 }, { "epoch": 1.7197452229299364, "grad_norm": 0.4647180933209534, "learning_rate": 3.6777433365383344e-05, "loss": 0.0977, "step": 3780 }, { "epoch": 1.7202001819836215, "grad_norm": 0.5325152126460988, "learning_rate": 3.677112910828957e-05, "loss": 0.1154, "step": 3781 }, { "epoch": 1.7206551410373065, "grad_norm": 0.49709529819086096, "learning_rate": 3.676482388931226e-05, "loss": 0.0972, "step": 3782 }, { "epoch": 1.7211101000909919, "grad_norm": 0.6792539241420419, "learning_rate": 3.675851770896669e-05, "loss": 0.1429, "step": 3783 }, { "epoch": 1.721565059144677, "grad_norm": 0.5644910390654007, "learning_rate": 3.675221056776815e-05, "loss": 0.1109, "step": 3784 }, { "epoch": 1.722020018198362, "grad_norm": 0.5589928533410956, "learning_rate": 3.674590246623202e-05, "loss": 0.1007, "step": 3785 }, { "epoch": 1.7224749772520473, "grad_norm": 0.49265494501575213, "learning_rate": 3.6739593404873804e-05, "loss": 0.0753, "step": 3786 }, { "epoch": 1.7229299363057324, "grad_norm": 0.5607818617367935, "learning_rate": 3.6733283384209025e-05, "loss": 0.1381, "step": 3787 }, { "epoch": 1.7233848953594175, "grad_norm": 0.6612408572627732, "learning_rate": 3.672697240475331e-05, "loss": 0.1093, "step": 3788 }, { "epoch": 1.7238398544131028, "grad_norm": 0.6576337842157468, "learning_rate": 3.672066046702237e-05, "loss": 0.1256, "step": 3789 }, { "epoch": 1.7242948134667881, "grad_norm": 0.5533686158025476, "learning_rate": 3.6714347571531995e-05, "loss": 0.103, "step": 3790 }, { "epoch": 1.724749772520473, "grad_norm": 0.5657344945240539, "learning_rate": 3.670803371879803e-05, "loss": 0.1056, "step": 3791 }, { "epoch": 1.7252047315741583, "grad_norm": 0.7389884807667029, "learning_rate": 3.6701718909336426e-05, "loss": 0.0857, "step": 3792 }, { "epoch": 1.7256596906278436, "grad_norm": 0.5358664707345666, "learning_rate": 3.669540314366319e-05, "loss": 0.0788, "step": 3793 }, { "epoch": 1.7261146496815285, "grad_norm": 0.5779834192880007, "learning_rate": 3.668908642229443e-05, "loss": 0.1243, "step": 3794 }, { "epoch": 1.7265696087352138, "grad_norm": 0.5781485186270442, "learning_rate": 3.668276874574631e-05, "loss": 0.1059, "step": 3795 }, { "epoch": 1.7270245677888991, "grad_norm": 0.578926797300102, "learning_rate": 3.66764501145351e-05, "loss": 0.0934, "step": 3796 }, { "epoch": 1.7274795268425842, "grad_norm": 0.895447391611582, "learning_rate": 3.667013052917711e-05, "loss": 0.1498, "step": 3797 }, { "epoch": 1.7279344858962693, "grad_norm": 0.7229739345433435, "learning_rate": 3.666380999018875e-05, "loss": 0.1119, "step": 3798 }, { "epoch": 1.7283894449499546, "grad_norm": 0.8533839896605034, "learning_rate": 3.6657488498086516e-05, "loss": 0.1317, "step": 3799 }, { "epoch": 1.7288444040036397, "grad_norm": 0.5455610878629117, "learning_rate": 3.665116605338697e-05, "loss": 0.1098, "step": 3800 }, { "epoch": 1.7292993630573248, "grad_norm": 0.8710996180739101, "learning_rate": 3.664484265660675e-05, "loss": 0.1351, "step": 3801 }, { "epoch": 1.72975432211101, "grad_norm": 0.8199455719959434, "learning_rate": 3.6638518308262565e-05, "loss": 0.1422, "step": 3802 }, { "epoch": 1.7302092811646952, "grad_norm": 0.6458561550104761, "learning_rate": 3.663219300887123e-05, "loss": 0.1198, "step": 3803 }, { "epoch": 1.7306642402183803, "grad_norm": 0.5866757574124075, "learning_rate": 3.662586675894961e-05, "loss": 0.128, "step": 3804 }, { "epoch": 1.7311191992720656, "grad_norm": 0.6355337639150164, "learning_rate": 3.661953955901467e-05, "loss": 0.1206, "step": 3805 }, { "epoch": 1.7315741583257507, "grad_norm": 1.6664492201408232, "learning_rate": 3.661321140958342e-05, "loss": 0.1223, "step": 3806 }, { "epoch": 1.7320291173794358, "grad_norm": 0.6407322764583652, "learning_rate": 3.660688231117298e-05, "loss": 0.1633, "step": 3807 }, { "epoch": 1.732484076433121, "grad_norm": 1.1449296877495174, "learning_rate": 3.660055226430054e-05, "loss": 0.139, "step": 3808 }, { "epoch": 1.7329390354868062, "grad_norm": 0.8086279697816722, "learning_rate": 3.6594221269483356e-05, "loss": 0.1354, "step": 3809 }, { "epoch": 1.7333939945404913, "grad_norm": 0.6033468035505695, "learning_rate": 3.6587889327238764e-05, "loss": 0.1381, "step": 3810 }, { "epoch": 1.7338489535941766, "grad_norm": 0.7842090160100366, "learning_rate": 3.658155643808419e-05, "loss": 0.1149, "step": 3811 }, { "epoch": 1.7343039126478617, "grad_norm": 0.5465839369961538, "learning_rate": 3.657522260253712e-05, "loss": 0.0929, "step": 3812 }, { "epoch": 1.7347588717015467, "grad_norm": 0.5345678769934538, "learning_rate": 3.6568887821115134e-05, "loss": 0.1314, "step": 3813 }, { "epoch": 1.735213830755232, "grad_norm": 0.5953835745329289, "learning_rate": 3.656255209433588e-05, "loss": 0.0892, "step": 3814 }, { "epoch": 1.7356687898089171, "grad_norm": 0.8129606028687366, "learning_rate": 3.6556215422717086e-05, "loss": 0.137, "step": 3815 }, { "epoch": 1.7361237488626022, "grad_norm": 0.7353018062292984, "learning_rate": 3.654987780677656e-05, "loss": 0.1604, "step": 3816 }, { "epoch": 1.7365787079162875, "grad_norm": 0.6952496088519464, "learning_rate": 3.654353924703217e-05, "loss": 0.115, "step": 3817 }, { "epoch": 1.7370336669699729, "grad_norm": 0.6122389838355891, "learning_rate": 3.6537199744001896e-05, "loss": 0.0861, "step": 3818 }, { "epoch": 1.7374886260236577, "grad_norm": 0.6325389047278275, "learning_rate": 3.6530859298203743e-05, "loss": 0.0905, "step": 3819 }, { "epoch": 1.737943585077343, "grad_norm": 0.44724447844985343, "learning_rate": 3.6524517910155854e-05, "loss": 0.0637, "step": 3820 }, { "epoch": 1.7383985441310283, "grad_norm": 0.6693611385866471, "learning_rate": 3.6518175580376395e-05, "loss": 0.1464, "step": 3821 }, { "epoch": 1.7388535031847132, "grad_norm": 0.7680213836173, "learning_rate": 3.6511832309383655e-05, "loss": 0.1054, "step": 3822 }, { "epoch": 1.7393084622383985, "grad_norm": 0.5654940509957286, "learning_rate": 3.650548809769596e-05, "loss": 0.1081, "step": 3823 }, { "epoch": 1.7397634212920838, "grad_norm": 0.6954842446296196, "learning_rate": 3.649914294583173e-05, "loss": 0.1652, "step": 3824 }, { "epoch": 1.740218380345769, "grad_norm": 0.6511338433616514, "learning_rate": 3.649279685430948e-05, "loss": 0.1303, "step": 3825 }, { "epoch": 1.740673339399454, "grad_norm": 0.7978222887619625, "learning_rate": 3.648644982364777e-05, "loss": 0.1254, "step": 3826 }, { "epoch": 1.7411282984531393, "grad_norm": 1.1663859692645049, "learning_rate": 3.648010185436524e-05, "loss": 0.1517, "step": 3827 }, { "epoch": 1.7415832575068244, "grad_norm": 0.43221249944207446, "learning_rate": 3.6473752946980645e-05, "loss": 0.1245, "step": 3828 }, { "epoch": 1.7420382165605095, "grad_norm": 0.6704953124875148, "learning_rate": 3.646740310201276e-05, "loss": 0.1227, "step": 3829 }, { "epoch": 1.7424931756141948, "grad_norm": 0.5805242098022484, "learning_rate": 3.64610523199805e-05, "loss": 0.1286, "step": 3830 }, { "epoch": 1.74294813466788, "grad_norm": 0.49438508793298575, "learning_rate": 3.645470060140278e-05, "loss": 0.0839, "step": 3831 }, { "epoch": 1.743403093721565, "grad_norm": 0.7195995135488155, "learning_rate": 3.644834794679867e-05, "loss": 0.1086, "step": 3832 }, { "epoch": 1.7438580527752503, "grad_norm": 1.2936359926224004, "learning_rate": 3.6441994356687266e-05, "loss": 0.2283, "step": 3833 }, { "epoch": 1.7443130118289354, "grad_norm": 0.5821132683631819, "learning_rate": 3.643563983158775e-05, "loss": 0.099, "step": 3834 }, { "epoch": 1.7447679708826205, "grad_norm": 0.7244455046115726, "learning_rate": 3.642928437201939e-05, "loss": 0.1003, "step": 3835 }, { "epoch": 1.7452229299363058, "grad_norm": 0.5792862030729549, "learning_rate": 3.642292797850153e-05, "loss": 0.1455, "step": 3836 }, { "epoch": 1.7456778889899909, "grad_norm": 0.6186601005714522, "learning_rate": 3.641657065155358e-05, "loss": 0.0835, "step": 3837 }, { "epoch": 1.746132848043676, "grad_norm": 0.6649130552920157, "learning_rate": 3.641021239169502e-05, "loss": 0.1138, "step": 3838 }, { "epoch": 1.7465878070973613, "grad_norm": 0.6691289937167585, "learning_rate": 3.640385319944545e-05, "loss": 0.0996, "step": 3839 }, { "epoch": 1.7470427661510464, "grad_norm": 0.673205806194577, "learning_rate": 3.6397493075324484e-05, "loss": 0.0833, "step": 3840 }, { "epoch": 1.7474977252047315, "grad_norm": 0.7187934554153533, "learning_rate": 3.6391132019851856e-05, "loss": 0.1184, "step": 3841 }, { "epoch": 1.7479526842584168, "grad_norm": 0.609032683523488, "learning_rate": 3.6384770033547366e-05, "loss": 0.0876, "step": 3842 }, { "epoch": 1.7484076433121019, "grad_norm": 0.5516483180624232, "learning_rate": 3.637840711693088e-05, "loss": 0.1124, "step": 3843 }, { "epoch": 1.748862602365787, "grad_norm": 0.7009739390004153, "learning_rate": 3.637204327052235e-05, "loss": 0.1686, "step": 3844 }, { "epoch": 1.7493175614194723, "grad_norm": 0.6120741150130702, "learning_rate": 3.6365678494841794e-05, "loss": 0.1747, "step": 3845 }, { "epoch": 1.7497725204731576, "grad_norm": 0.5519869153330011, "learning_rate": 3.6359312790409325e-05, "loss": 0.1208, "step": 3846 }, { "epoch": 1.7502274795268424, "grad_norm": 0.6354335420283488, "learning_rate": 3.635294615774511e-05, "loss": 0.1585, "step": 3847 }, { "epoch": 1.7506824385805277, "grad_norm": 0.5828908266806171, "learning_rate": 3.63465785973694e-05, "loss": 0.1563, "step": 3848 }, { "epoch": 1.751137397634213, "grad_norm": 0.8041477568035, "learning_rate": 3.6340210109802537e-05, "loss": 0.1419, "step": 3849 }, { "epoch": 1.7515923566878981, "grad_norm": 0.6090300998983702, "learning_rate": 3.633384069556491e-05, "loss": 0.1641, "step": 3850 }, { "epoch": 1.7520473157415832, "grad_norm": 0.5891562491351097, "learning_rate": 3.632747035517701e-05, "loss": 0.0966, "step": 3851 }, { "epoch": 1.7525022747952685, "grad_norm": 0.44928068834885726, "learning_rate": 3.632109908915938e-05, "loss": 0.1092, "step": 3852 }, { "epoch": 1.7529572338489536, "grad_norm": 0.5720891232664317, "learning_rate": 3.6314726898032656e-05, "loss": 0.1372, "step": 3853 }, { "epoch": 1.7534121929026387, "grad_norm": 0.5931635913202793, "learning_rate": 3.6308353782317554e-05, "loss": 0.131, "step": 3854 }, { "epoch": 1.753867151956324, "grad_norm": 0.6438652111829787, "learning_rate": 3.630197974253484e-05, "loss": 0.1361, "step": 3855 }, { "epoch": 1.7543221110100091, "grad_norm": 0.7115872546114456, "learning_rate": 3.629560477920539e-05, "loss": 0.1688, "step": 3856 }, { "epoch": 1.7547770700636942, "grad_norm": 0.6464317446737594, "learning_rate": 3.6289228892850126e-05, "loss": 0.1023, "step": 3857 }, { "epoch": 1.7552320291173795, "grad_norm": 0.6049806796253012, "learning_rate": 3.628285208399006e-05, "loss": 0.2055, "step": 3858 }, { "epoch": 1.7556869881710646, "grad_norm": 0.6081666710751312, "learning_rate": 3.627647435314627e-05, "loss": 0.1393, "step": 3859 }, { "epoch": 1.7561419472247497, "grad_norm": 0.6029793871960815, "learning_rate": 3.627009570083992e-05, "loss": 0.1688, "step": 3860 }, { "epoch": 1.756596906278435, "grad_norm": 1.0108663591114502, "learning_rate": 3.626371612759225e-05, "loss": 0.126, "step": 3861 }, { "epoch": 1.75705186533212, "grad_norm": 0.6912773227206589, "learning_rate": 3.625733563392456e-05, "loss": 0.0973, "step": 3862 }, { "epoch": 1.7575068243858052, "grad_norm": 0.564315467805032, "learning_rate": 3.625095422035825e-05, "loss": 0.1465, "step": 3863 }, { "epoch": 1.7579617834394905, "grad_norm": 0.5815550080975153, "learning_rate": 3.624457188741476e-05, "loss": 0.1199, "step": 3864 }, { "epoch": 1.7584167424931756, "grad_norm": 0.4573319826373327, "learning_rate": 3.623818863561563e-05, "loss": 0.0939, "step": 3865 }, { "epoch": 1.7588717015468607, "grad_norm": 0.6382448045480245, "learning_rate": 3.623180446548248e-05, "loss": 0.085, "step": 3866 }, { "epoch": 1.759326660600546, "grad_norm": 0.738816478881397, "learning_rate": 3.6225419377537e-05, "loss": 0.135, "step": 3867 }, { "epoch": 1.759781619654231, "grad_norm": 0.7617296419675228, "learning_rate": 3.6219033372300935e-05, "loss": 0.1352, "step": 3868 }, { "epoch": 1.7602365787079162, "grad_norm": 0.7678150469947984, "learning_rate": 3.621264645029613e-05, "loss": 0.1366, "step": 3869 }, { "epoch": 1.7606915377616015, "grad_norm": 0.6564959879512352, "learning_rate": 3.6206258612044484e-05, "loss": 0.098, "step": 3870 }, { "epoch": 1.7611464968152868, "grad_norm": 0.8248291604329667, "learning_rate": 3.6199869858068005e-05, "loss": 0.1424, "step": 3871 }, { "epoch": 1.7616014558689717, "grad_norm": 0.7413931742622847, "learning_rate": 3.619348018888873e-05, "loss": 0.1219, "step": 3872 }, { "epoch": 1.762056414922657, "grad_norm": 0.6749480579808849, "learning_rate": 3.6187089605028805e-05, "loss": 0.1042, "step": 3873 }, { "epoch": 1.7625113739763423, "grad_norm": 0.6646935067776121, "learning_rate": 3.6180698107010434e-05, "loss": 0.1988, "step": 3874 }, { "epoch": 1.7629663330300271, "grad_norm": 0.6047312949673874, "learning_rate": 3.617430569535592e-05, "loss": 0.1332, "step": 3875 }, { "epoch": 1.7634212920837125, "grad_norm": 0.521425445964697, "learning_rate": 3.6167912370587596e-05, "loss": 0.1437, "step": 3876 }, { "epoch": 1.7638762511373978, "grad_norm": 0.8068636362980761, "learning_rate": 3.616151813322791e-05, "loss": 0.1748, "step": 3877 }, { "epoch": 1.7643312101910829, "grad_norm": 0.6514119772123198, "learning_rate": 3.615512298379937e-05, "loss": 0.1441, "step": 3878 }, { "epoch": 1.764786169244768, "grad_norm": 0.5750463250040633, "learning_rate": 3.614872692282455e-05, "loss": 0.1112, "step": 3879 }, { "epoch": 1.7652411282984533, "grad_norm": 0.6652249270801787, "learning_rate": 3.614232995082611e-05, "loss": 0.1388, "step": 3880 }, { "epoch": 1.7656960873521383, "grad_norm": 0.6960427292400629, "learning_rate": 3.6135932068326795e-05, "loss": 0.1269, "step": 3881 }, { "epoch": 1.7661510464058234, "grad_norm": 0.6310124925589452, "learning_rate": 3.6129533275849395e-05, "loss": 0.1769, "step": 3882 }, { "epoch": 1.7666060054595087, "grad_norm": 0.9072451732017554, "learning_rate": 3.612313357391679e-05, "loss": 0.0971, "step": 3883 }, { "epoch": 1.7670609645131938, "grad_norm": 0.658158875407498, "learning_rate": 3.6116732963051945e-05, "loss": 0.1259, "step": 3884 }, { "epoch": 1.767515923566879, "grad_norm": 0.8767027316021019, "learning_rate": 3.611033144377789e-05, "loss": 0.1956, "step": 3885 }, { "epoch": 1.7679708826205642, "grad_norm": 0.8158355612125543, "learning_rate": 3.610392901661772e-05, "loss": 0.1042, "step": 3886 }, { "epoch": 1.7684258416742493, "grad_norm": 0.6918725570328738, "learning_rate": 3.609752568209462e-05, "loss": 0.0782, "step": 3887 }, { "epoch": 1.7688808007279344, "grad_norm": 0.6122027138397437, "learning_rate": 3.609112144073184e-05, "loss": 0.1384, "step": 3888 }, { "epoch": 1.7693357597816197, "grad_norm": 0.6067353845340138, "learning_rate": 3.608471629305269e-05, "loss": 0.1119, "step": 3889 }, { "epoch": 1.7697907188353048, "grad_norm": 0.7778120198925366, "learning_rate": 3.607831023958059e-05, "loss": 0.0988, "step": 3890 }, { "epoch": 1.77024567788899, "grad_norm": 0.6445340205531874, "learning_rate": 3.6071903280839e-05, "loss": 0.135, "step": 3891 }, { "epoch": 1.7707006369426752, "grad_norm": 0.6827267529049131, "learning_rate": 3.606549541735148e-05, "loss": 0.1331, "step": 3892 }, { "epoch": 1.7711555959963603, "grad_norm": 0.49715558623068395, "learning_rate": 3.6059086649641646e-05, "loss": 0.0936, "step": 3893 }, { "epoch": 1.7716105550500454, "grad_norm": 0.660574353802614, "learning_rate": 3.605267697823319e-05, "loss": 0.1525, "step": 3894 }, { "epoch": 1.7720655141037307, "grad_norm": 0.49274892002932374, "learning_rate": 3.60462664036499e-05, "loss": 0.0821, "step": 3895 }, { "epoch": 1.7725204731574158, "grad_norm": 0.7063294891101337, "learning_rate": 3.603985492641558e-05, "loss": 0.1164, "step": 3896 }, { "epoch": 1.7729754322111009, "grad_norm": 0.6559186233377788, "learning_rate": 3.6033442547054184e-05, "loss": 0.1505, "step": 3897 }, { "epoch": 1.7734303912647862, "grad_norm": 0.7637134357492092, "learning_rate": 3.602702926608969e-05, "loss": 0.1269, "step": 3898 }, { "epoch": 1.7738853503184715, "grad_norm": 0.5744327379566886, "learning_rate": 3.602061508404616e-05, "loss": 0.0944, "step": 3899 }, { "epoch": 1.7743403093721564, "grad_norm": 0.5646888352408478, "learning_rate": 3.601420000144774e-05, "loss": 0.1811, "step": 3900 }, { "epoch": 1.7747952684258417, "grad_norm": 0.6415757356817277, "learning_rate": 3.6007784018818626e-05, "loss": 0.1509, "step": 3901 }, { "epoch": 1.775250227479527, "grad_norm": 0.724021936903756, "learning_rate": 3.600136713668312e-05, "loss": 0.0964, "step": 3902 }, { "epoch": 1.7757051865332119, "grad_norm": 0.6611665404461639, "learning_rate": 3.599494935556556e-05, "loss": 0.1109, "step": 3903 }, { "epoch": 1.7761601455868972, "grad_norm": 0.7156352465322575, "learning_rate": 3.59885306759904e-05, "loss": 0.0961, "step": 3904 }, { "epoch": 1.7766151046405825, "grad_norm": 0.5232367292092489, "learning_rate": 3.598211109848215e-05, "loss": 0.0961, "step": 3905 }, { "epoch": 1.7770700636942676, "grad_norm": 13.08522002093078, "learning_rate": 3.597569062356536e-05, "loss": 0.2784, "step": 3906 }, { "epoch": 1.7775250227479527, "grad_norm": 0.5004313894662415, "learning_rate": 3.5969269251764704e-05, "loss": 0.0849, "step": 3907 }, { "epoch": 1.777979981801638, "grad_norm": 0.5924469424775982, "learning_rate": 3.5962846983604894e-05, "loss": 0.162, "step": 3908 }, { "epoch": 1.778434940855323, "grad_norm": 0.560703737518208, "learning_rate": 3.595642381961075e-05, "loss": 0.1067, "step": 3909 }, { "epoch": 1.7788898999090081, "grad_norm": 1.073305832317366, "learning_rate": 3.594999976030712e-05, "loss": 0.1176, "step": 3910 }, { "epoch": 1.7793448589626935, "grad_norm": 0.5467822520549764, "learning_rate": 3.594357480621896e-05, "loss": 0.1271, "step": 3911 }, { "epoch": 1.7797998180163785, "grad_norm": 0.7516944665806756, "learning_rate": 3.593714895787129e-05, "loss": 0.1371, "step": 3912 }, { "epoch": 1.7802547770700636, "grad_norm": 0.6941646649736307, "learning_rate": 3.59307222157892e-05, "loss": 0.1211, "step": 3913 }, { "epoch": 1.780709736123749, "grad_norm": 0.8719643667863111, "learning_rate": 3.592429458049785e-05, "loss": 0.1325, "step": 3914 }, { "epoch": 1.781164695177434, "grad_norm": 0.7393502571677978, "learning_rate": 3.591786605252248e-05, "loss": 0.1313, "step": 3915 }, { "epoch": 1.7816196542311191, "grad_norm": 0.6716513059977591, "learning_rate": 3.59114366323884e-05, "loss": 0.1258, "step": 3916 }, { "epoch": 1.7820746132848044, "grad_norm": 0.43405638654495854, "learning_rate": 3.5905006320621004e-05, "loss": 0.0827, "step": 3917 }, { "epoch": 1.7825295723384895, "grad_norm": 0.6897600080773184, "learning_rate": 3.5898575117745725e-05, "loss": 0.1134, "step": 3918 }, { "epoch": 1.7829845313921746, "grad_norm": 0.5703348562760493, "learning_rate": 3.589214302428811e-05, "loss": 0.0959, "step": 3919 }, { "epoch": 1.78343949044586, "grad_norm": 0.515637673324476, "learning_rate": 3.5885710040773755e-05, "loss": 0.0951, "step": 3920 }, { "epoch": 1.783894449499545, "grad_norm": 0.60317364201248, "learning_rate": 3.587927616772834e-05, "loss": 0.1407, "step": 3921 }, { "epoch": 1.78434940855323, "grad_norm": 0.6307075699092144, "learning_rate": 3.5872841405677606e-05, "loss": 0.1246, "step": 3922 }, { "epoch": 1.7848043676069154, "grad_norm": 0.5282782250025089, "learning_rate": 3.586640575514736e-05, "loss": 0.0989, "step": 3923 }, { "epoch": 1.7852593266606005, "grad_norm": 0.7389669228923799, "learning_rate": 3.585996921666353e-05, "loss": 0.1312, "step": 3924 }, { "epoch": 1.7857142857142856, "grad_norm": 0.5783292588345779, "learning_rate": 3.5853531790752036e-05, "loss": 0.1026, "step": 3925 }, { "epoch": 1.786169244767971, "grad_norm": 0.5776837637985519, "learning_rate": 3.5847093477938956e-05, "loss": 0.1353, "step": 3926 }, { "epoch": 1.7866242038216562, "grad_norm": 0.7182761629235439, "learning_rate": 3.5840654278750376e-05, "loss": 0.121, "step": 3927 }, { "epoch": 1.787079162875341, "grad_norm": 0.7005408370706538, "learning_rate": 3.5834214193712486e-05, "loss": 0.1238, "step": 3928 }, { "epoch": 1.7875341219290264, "grad_norm": 0.6420156502645886, "learning_rate": 3.582777322335154e-05, "loss": 0.1003, "step": 3929 }, { "epoch": 1.7879890809827117, "grad_norm": 0.662117898518335, "learning_rate": 3.582133136819386e-05, "loss": 0.1298, "step": 3930 }, { "epoch": 1.7884440400363966, "grad_norm": 0.6352755737083895, "learning_rate": 3.581488862876585e-05, "loss": 0.1157, "step": 3931 }, { "epoch": 1.7888989990900819, "grad_norm": 0.6417278982033011, "learning_rate": 3.580844500559397e-05, "loss": 0.1444, "step": 3932 }, { "epoch": 1.7893539581437672, "grad_norm": 0.7077032829282183, "learning_rate": 3.580200049920479e-05, "loss": 0.1637, "step": 3933 }, { "epoch": 1.7898089171974523, "grad_norm": 0.5719501265175421, "learning_rate": 3.579555511012491e-05, "loss": 0.0869, "step": 3934 }, { "epoch": 1.7902638762511374, "grad_norm": 0.5738800554147245, "learning_rate": 3.5789108838881015e-05, "loss": 0.1221, "step": 3935 }, { "epoch": 1.7907188353048227, "grad_norm": 1.0625076266425246, "learning_rate": 3.578266168599986e-05, "loss": 0.141, "step": 3936 }, { "epoch": 1.7911737943585078, "grad_norm": 0.8560756080508304, "learning_rate": 3.57762136520083e-05, "loss": 0.1434, "step": 3937 }, { "epoch": 1.7916287534121929, "grad_norm": 0.5628627999200788, "learning_rate": 3.5769764737433226e-05, "loss": 0.1333, "step": 3938 }, { "epoch": 1.7920837124658782, "grad_norm": 0.8682622584668048, "learning_rate": 3.5763314942801605e-05, "loss": 0.1708, "step": 3939 }, { "epoch": 1.7925386715195633, "grad_norm": 0.5702607192792253, "learning_rate": 3.5756864268640495e-05, "loss": 0.1084, "step": 3940 }, { "epoch": 1.7929936305732483, "grad_norm": 0.907084693091786, "learning_rate": 3.5750412715477016e-05, "loss": 0.1581, "step": 3941 }, { "epoch": 1.7934485896269337, "grad_norm": 0.8531183035424155, "learning_rate": 3.5743960283838355e-05, "loss": 0.1326, "step": 3942 }, { "epoch": 1.7939035486806187, "grad_norm": 0.4706049209884894, "learning_rate": 3.5737506974251784e-05, "loss": 0.0951, "step": 3943 }, { "epoch": 1.7943585077343038, "grad_norm": 0.5364900610018152, "learning_rate": 3.573105278724463e-05, "loss": 0.1167, "step": 3944 }, { "epoch": 1.7948134667879891, "grad_norm": 0.5022189318170432, "learning_rate": 3.572459772334431e-05, "loss": 0.0803, "step": 3945 }, { "epoch": 1.7952684258416742, "grad_norm": 0.5993677251923266, "learning_rate": 3.5718141783078285e-05, "loss": 0.1091, "step": 3946 }, { "epoch": 1.7957233848953593, "grad_norm": 0.5447500080578483, "learning_rate": 3.571168496697412e-05, "loss": 0.1057, "step": 3947 }, { "epoch": 1.7961783439490446, "grad_norm": 0.7812738878819573, "learning_rate": 3.570522727555944e-05, "loss": 0.1659, "step": 3948 }, { "epoch": 1.7966333030027297, "grad_norm": 0.614736535991774, "learning_rate": 3.569876870936192e-05, "loss": 0.1174, "step": 3949 }, { "epoch": 1.7970882620564148, "grad_norm": 0.6761880647812997, "learning_rate": 3.569230926890935e-05, "loss": 0.1414, "step": 3950 }, { "epoch": 1.7975432211101001, "grad_norm": 0.6092598174788595, "learning_rate": 3.568584895472954e-05, "loss": 0.1275, "step": 3951 }, { "epoch": 1.7979981801637852, "grad_norm": 2.669835130132292, "learning_rate": 3.567938776735042e-05, "loss": 0.1121, "step": 3952 }, { "epoch": 1.7984531392174703, "grad_norm": 0.7707147533664553, "learning_rate": 3.567292570729995e-05, "loss": 0.126, "step": 3953 }, { "epoch": 1.7989080982711556, "grad_norm": 0.49831048213500717, "learning_rate": 3.566646277510619e-05, "loss": 0.1024, "step": 3954 }, { "epoch": 1.799363057324841, "grad_norm": 0.5543305083110771, "learning_rate": 3.565999897129727e-05, "loss": 0.0843, "step": 3955 }, { "epoch": 1.7998180163785258, "grad_norm": 0.6441003896193739, "learning_rate": 3.565353429640137e-05, "loss": 0.1311, "step": 3956 }, { "epoch": 1.800272975432211, "grad_norm": 0.5128010804731, "learning_rate": 3.5647068750946755e-05, "loss": 0.0876, "step": 3957 }, { "epoch": 1.8007279344858964, "grad_norm": 0.6315564028048812, "learning_rate": 3.564060233546177e-05, "loss": 0.0971, "step": 3958 }, { "epoch": 1.8011828935395813, "grad_norm": 0.5535776139782941, "learning_rate": 3.563413505047481e-05, "loss": 0.1006, "step": 3959 }, { "epoch": 1.8016378525932666, "grad_norm": 0.5783597243985915, "learning_rate": 3.5627666896514354e-05, "loss": 0.1613, "step": 3960 }, { "epoch": 1.802092811646952, "grad_norm": 0.5826867637211096, "learning_rate": 3.562119787410896e-05, "loss": 0.1343, "step": 3961 }, { "epoch": 1.802547770700637, "grad_norm": 0.5744100015319414, "learning_rate": 3.561472798378725e-05, "loss": 0.1718, "step": 3962 }, { "epoch": 1.803002729754322, "grad_norm": 0.6367948789107337, "learning_rate": 3.560825722607788e-05, "loss": 0.1566, "step": 3963 }, { "epoch": 1.8034576888080074, "grad_norm": 0.8532887688861095, "learning_rate": 3.5601785601509654e-05, "loss": 0.1594, "step": 3964 }, { "epoch": 1.8039126478616925, "grad_norm": 0.7100298138880962, "learning_rate": 3.5595313110611387e-05, "loss": 0.1254, "step": 3965 }, { "epoch": 1.8043676069153776, "grad_norm": 0.5850367940323689, "learning_rate": 3.558883975391197e-05, "loss": 0.1092, "step": 3966 }, { "epoch": 1.8048225659690629, "grad_norm": 0.6288013694571852, "learning_rate": 3.558236553194039e-05, "loss": 0.1159, "step": 3967 }, { "epoch": 1.805277525022748, "grad_norm": 0.779215436661396, "learning_rate": 3.557589044522568e-05, "loss": 0.1277, "step": 3968 }, { "epoch": 1.805732484076433, "grad_norm": 0.5781543589693107, "learning_rate": 3.556941449429698e-05, "loss": 0.1339, "step": 3969 }, { "epoch": 1.8061874431301184, "grad_norm": 0.9772756500956358, "learning_rate": 3.5562937679683454e-05, "loss": 0.1376, "step": 3970 }, { "epoch": 1.8066424021838035, "grad_norm": 0.5161601038383716, "learning_rate": 3.555646000191436e-05, "loss": 0.1196, "step": 3971 }, { "epoch": 1.8070973612374885, "grad_norm": 0.9318327911976142, "learning_rate": 3.5549981461519025e-05, "loss": 0.1085, "step": 3972 }, { "epoch": 1.8075523202911739, "grad_norm": 0.48430116596738604, "learning_rate": 3.554350205902685e-05, "loss": 0.089, "step": 3973 }, { "epoch": 1.808007279344859, "grad_norm": 0.7354095918694938, "learning_rate": 3.55370217949673e-05, "loss": 0.1305, "step": 3974 }, { "epoch": 1.808462238398544, "grad_norm": 0.7191458478171209, "learning_rate": 3.553054066986992e-05, "loss": 0.1304, "step": 3975 }, { "epoch": 1.8089171974522293, "grad_norm": 0.6061796489053909, "learning_rate": 3.5524058684264304e-05, "loss": 0.0796, "step": 3976 }, { "epoch": 1.8093721565059144, "grad_norm": 0.7229040690924667, "learning_rate": 3.5517575838680144e-05, "loss": 0.1137, "step": 3977 }, { "epoch": 1.8098271155595995, "grad_norm": 0.5416229074914626, "learning_rate": 3.5511092133647174e-05, "loss": 0.0948, "step": 3978 }, { "epoch": 1.8102820746132848, "grad_norm": 0.73790110669117, "learning_rate": 3.5504607569695236e-05, "loss": 0.1264, "step": 3979 }, { "epoch": 1.81073703366697, "grad_norm": 0.5906476012140037, "learning_rate": 3.5498122147354193e-05, "loss": 0.0948, "step": 3980 }, { "epoch": 1.811191992720655, "grad_norm": 0.5379718356549165, "learning_rate": 3.5491635867154025e-05, "loss": 0.0923, "step": 3981 }, { "epoch": 1.8116469517743403, "grad_norm": 0.5538791989074533, "learning_rate": 3.5485148729624756e-05, "loss": 0.1336, "step": 3982 }, { "epoch": 1.8121019108280256, "grad_norm": 0.6531266061649107, "learning_rate": 3.5478660735296474e-05, "loss": 0.1026, "step": 3983 }, { "epoch": 1.8125568698817105, "grad_norm": 0.5274246585122185, "learning_rate": 3.547217188469937e-05, "loss": 0.0848, "step": 3984 }, { "epoch": 1.8130118289353958, "grad_norm": 0.7299403181358444, "learning_rate": 3.546568217836366e-05, "loss": 0.124, "step": 3985 }, { "epoch": 1.8134667879890811, "grad_norm": 0.8541336947146128, "learning_rate": 3.5459191616819675e-05, "loss": 0.1082, "step": 3986 }, { "epoch": 1.813921747042766, "grad_norm": 0.7259791047699355, "learning_rate": 3.545270020059778e-05, "loss": 0.1255, "step": 3987 }, { "epoch": 1.8143767060964513, "grad_norm": 0.8098940199703982, "learning_rate": 3.544620793022842e-05, "loss": 0.1307, "step": 3988 }, { "epoch": 1.8148316651501366, "grad_norm": 0.6161734259557393, "learning_rate": 3.5439714806242135e-05, "loss": 0.1092, "step": 3989 }, { "epoch": 1.8152866242038217, "grad_norm": 0.5748774667311499, "learning_rate": 3.5433220829169496e-05, "loss": 0.1258, "step": 3990 }, { "epoch": 1.8157415832575068, "grad_norm": 0.8146977146294752, "learning_rate": 3.5426725999541174e-05, "loss": 0.1653, "step": 3991 }, { "epoch": 1.816196542311192, "grad_norm": 0.6433628527723645, "learning_rate": 3.542023031788788e-05, "loss": 0.1269, "step": 3992 }, { "epoch": 1.8166515013648772, "grad_norm": 0.6647490844817127, "learning_rate": 3.541373378474042e-05, "loss": 0.1348, "step": 3993 }, { "epoch": 1.8171064604185623, "grad_norm": 0.5218671313060287, "learning_rate": 3.540723640062967e-05, "loss": 0.1083, "step": 3994 }, { "epoch": 1.8175614194722476, "grad_norm": 0.6969255779785646, "learning_rate": 3.540073816608656e-05, "loss": 0.0979, "step": 3995 }, { "epoch": 1.8180163785259327, "grad_norm": 0.6388597294796554, "learning_rate": 3.539423908164209e-05, "loss": 0.0776, "step": 3996 }, { "epoch": 1.8184713375796178, "grad_norm": 0.6182050686179215, "learning_rate": 3.5387739147827355e-05, "loss": 0.1427, "step": 3997 }, { "epoch": 1.818926296633303, "grad_norm": 0.5716867668286534, "learning_rate": 3.538123836517348e-05, "loss": 0.1187, "step": 3998 }, { "epoch": 1.8193812556869882, "grad_norm": 0.6367044943276341, "learning_rate": 3.5374736734211686e-05, "loss": 0.1341, "step": 3999 }, { "epoch": 1.8198362147406733, "grad_norm": 0.7937710951669981, "learning_rate": 3.536823425547325e-05, "loss": 0.1155, "step": 4000 }, { "epoch": 1.8202911737943586, "grad_norm": 0.5861391031770203, "learning_rate": 3.536173092948955e-05, "loss": 0.1322, "step": 4001 }, { "epoch": 1.8207461328480437, "grad_norm": 0.6349861675731755, "learning_rate": 3.535522675679198e-05, "loss": 0.1054, "step": 4002 }, { "epoch": 1.8212010919017287, "grad_norm": 0.7564961245676808, "learning_rate": 3.534872173791205e-05, "loss": 0.1212, "step": 4003 }, { "epoch": 1.821656050955414, "grad_norm": 0.813011564546999, "learning_rate": 3.5342215873381306e-05, "loss": 0.1563, "step": 4004 }, { "epoch": 1.8221110100090991, "grad_norm": 0.5530443385449343, "learning_rate": 3.533570916373139e-05, "loss": 0.1094, "step": 4005 }, { "epoch": 1.8225659690627842, "grad_norm": 0.6403973089757538, "learning_rate": 3.5329201609494e-05, "loss": 0.1486, "step": 4006 }, { "epoch": 1.8230209281164695, "grad_norm": 0.9370553192949451, "learning_rate": 3.5322693211200897e-05, "loss": 0.1067, "step": 4007 }, { "epoch": 1.8234758871701549, "grad_norm": 0.63436009975056, "learning_rate": 3.531618396938393e-05, "loss": 0.1195, "step": 4008 }, { "epoch": 1.8239308462238397, "grad_norm": 0.5085169563433526, "learning_rate": 3.5309673884574984e-05, "loss": 0.1015, "step": 4009 }, { "epoch": 1.824385805277525, "grad_norm": 0.6166000095639866, "learning_rate": 3.530316295730605e-05, "loss": 0.1397, "step": 4010 }, { "epoch": 1.8248407643312103, "grad_norm": 0.7107817917750509, "learning_rate": 3.5296651188109175e-05, "loss": 0.1358, "step": 4011 }, { "epoch": 1.8252957233848952, "grad_norm": 0.8261959858422149, "learning_rate": 3.5290138577516455e-05, "loss": 0.1532, "step": 4012 }, { "epoch": 1.8257506824385805, "grad_norm": 0.5982370874564086, "learning_rate": 3.528362512606008e-05, "loss": 0.1316, "step": 4013 }, { "epoch": 1.8262056414922658, "grad_norm": 0.5498920568039712, "learning_rate": 3.52771108342723e-05, "loss": 0.0752, "step": 4014 }, { "epoch": 1.826660600545951, "grad_norm": 0.6029051628617375, "learning_rate": 3.527059570268545e-05, "loss": 0.162, "step": 4015 }, { "epoch": 1.827115559599636, "grad_norm": 0.9166263550043331, "learning_rate": 3.526407973183188e-05, "loss": 0.1125, "step": 4016 }, { "epoch": 1.8275705186533213, "grad_norm": 0.6576860712604323, "learning_rate": 3.5257562922244074e-05, "loss": 0.1551, "step": 4017 }, { "epoch": 1.8280254777070064, "grad_norm": 0.6157109485294806, "learning_rate": 3.525104527445455e-05, "loss": 0.0978, "step": 4018 }, { "epoch": 1.8284804367606915, "grad_norm": 0.5814438302966742, "learning_rate": 3.52445267889959e-05, "loss": 0.0997, "step": 4019 }, { "epoch": 1.8289353958143768, "grad_norm": 0.7183028055153947, "learning_rate": 3.5238007466400786e-05, "loss": 0.0808, "step": 4020 }, { "epoch": 1.829390354868062, "grad_norm": 0.5390205531828818, "learning_rate": 3.523148730720193e-05, "loss": 0.1089, "step": 4021 }, { "epoch": 1.829845313921747, "grad_norm": 0.6632850917633378, "learning_rate": 3.5224966311932144e-05, "loss": 0.1809, "step": 4022 }, { "epoch": 1.8303002729754323, "grad_norm": 0.8926766065643909, "learning_rate": 3.521844448112428e-05, "loss": 0.1253, "step": 4023 }, { "epoch": 1.8307552320291174, "grad_norm": 0.5785429352448841, "learning_rate": 3.5211921815311284e-05, "loss": 0.1303, "step": 4024 }, { "epoch": 1.8312101910828025, "grad_norm": 0.6278911642508688, "learning_rate": 3.520539831502616e-05, "loss": 0.1107, "step": 4025 }, { "epoch": 1.8316651501364878, "grad_norm": 0.7562307584327395, "learning_rate": 3.519887398080195e-05, "loss": 0.0969, "step": 4026 }, { "epoch": 1.8321201091901729, "grad_norm": 0.6538308150189605, "learning_rate": 3.519234881317184e-05, "loss": 0.0979, "step": 4027 }, { "epoch": 1.832575068243858, "grad_norm": 0.600469506653278, "learning_rate": 3.5185822812669e-05, "loss": 0.1211, "step": 4028 }, { "epoch": 1.8330300272975433, "grad_norm": 0.7397154628439235, "learning_rate": 3.5179295979826724e-05, "loss": 0.1836, "step": 4029 }, { "epoch": 1.8334849863512284, "grad_norm": 0.4554561888147606, "learning_rate": 3.5172768315178346e-05, "loss": 0.0955, "step": 4030 }, { "epoch": 1.8339399454049135, "grad_norm": 0.6104932400635875, "learning_rate": 3.516623981925728e-05, "loss": 0.1411, "step": 4031 }, { "epoch": 1.8343949044585988, "grad_norm": 0.5997732465032011, "learning_rate": 3.5159710492597014e-05, "loss": 0.1504, "step": 4032 }, { "epoch": 1.8348498635122839, "grad_norm": 0.5556025374649385, "learning_rate": 3.515318033573108e-05, "loss": 0.0885, "step": 4033 }, { "epoch": 1.835304822565969, "grad_norm": 0.597744302439566, "learning_rate": 3.514664934919311e-05, "loss": 0.0843, "step": 4034 }, { "epoch": 1.8357597816196543, "grad_norm": 0.720482489718998, "learning_rate": 3.514011753351677e-05, "loss": 0.1389, "step": 4035 }, { "epoch": 1.8362147406733396, "grad_norm": 0.6045796812303565, "learning_rate": 3.5133584889235816e-05, "loss": 0.0829, "step": 4036 }, { "epoch": 1.8366696997270244, "grad_norm": 0.3808740574472068, "learning_rate": 3.512705141688407e-05, "loss": 0.1269, "step": 4037 }, { "epoch": 1.8371246587807097, "grad_norm": 0.6782298733566828, "learning_rate": 3.512051711699541e-05, "loss": 0.1187, "step": 4038 }, { "epoch": 1.837579617834395, "grad_norm": 0.5786389001523898, "learning_rate": 3.51139819901038e-05, "loss": 0.1181, "step": 4039 }, { "epoch": 1.83803457688808, "grad_norm": 0.8264340557014068, "learning_rate": 3.510744603674326e-05, "loss": 0.092, "step": 4040 }, { "epoch": 1.8384895359417652, "grad_norm": 0.6506597330567476, "learning_rate": 3.5100909257447864e-05, "loss": 0.2021, "step": 4041 }, { "epoch": 1.8389444949954505, "grad_norm": 0.509991901255519, "learning_rate": 3.509437165275179e-05, "loss": 0.1347, "step": 4042 }, { "epoch": 1.8393994540491356, "grad_norm": 0.8179144721768247, "learning_rate": 3.508783322318925e-05, "loss": 0.1125, "step": 4043 }, { "epoch": 1.8398544131028207, "grad_norm": 0.8256087538685475, "learning_rate": 3.508129396929452e-05, "loss": 0.1176, "step": 4044 }, { "epoch": 1.840309372156506, "grad_norm": 0.6612717066635047, "learning_rate": 3.5074753891601995e-05, "loss": 0.1262, "step": 4045 }, { "epoch": 1.8407643312101911, "grad_norm": 0.6477435465333239, "learning_rate": 3.5068212990646066e-05, "loss": 0.0832, "step": 4046 }, { "epoch": 1.8412192902638762, "grad_norm": 0.546912850590502, "learning_rate": 3.506167126696125e-05, "loss": 0.0943, "step": 4047 }, { "epoch": 1.8416742493175615, "grad_norm": 0.7044899558400196, "learning_rate": 3.505512872108208e-05, "loss": 0.0845, "step": 4048 }, { "epoch": 1.8421292083712466, "grad_norm": 0.559297836678499, "learning_rate": 3.504858535354321e-05, "loss": 0.0745, "step": 4049 }, { "epoch": 1.8425841674249317, "grad_norm": 0.6073230677567643, "learning_rate": 3.504204116487933e-05, "loss": 0.1606, "step": 4050 }, { "epoch": 1.843039126478617, "grad_norm": 0.628153327354501, "learning_rate": 3.503549615562518e-05, "loss": 0.1216, "step": 4051 }, { "epoch": 1.843494085532302, "grad_norm": 1.0273015773271983, "learning_rate": 3.502895032631561e-05, "loss": 0.1014, "step": 4052 }, { "epoch": 1.8439490445859872, "grad_norm": 0.7376122403479151, "learning_rate": 3.502240367748551e-05, "loss": 0.1515, "step": 4053 }, { "epoch": 1.8444040036396725, "grad_norm": 0.6386368860632036, "learning_rate": 3.501585620966985e-05, "loss": 0.1469, "step": 4054 }, { "epoch": 1.8448589626933576, "grad_norm": 0.7104807792222122, "learning_rate": 3.5009307923403636e-05, "loss": 0.1554, "step": 4055 }, { "epoch": 1.8453139217470427, "grad_norm": 0.6620926549776215, "learning_rate": 3.500275881922199e-05, "loss": 0.1011, "step": 4056 }, { "epoch": 1.845768880800728, "grad_norm": 0.7202836523242748, "learning_rate": 3.499620889766006e-05, "loss": 0.1395, "step": 4057 }, { "epoch": 1.846223839854413, "grad_norm": 0.6008783624185846, "learning_rate": 3.498965815925309e-05, "loss": 0.1028, "step": 4058 }, { "epoch": 1.8466787989080982, "grad_norm": 0.6046284431751676, "learning_rate": 3.498310660453636e-05, "loss": 0.0931, "step": 4059 }, { "epoch": 1.8471337579617835, "grad_norm": 0.5721241372690327, "learning_rate": 3.497655423404525e-05, "loss": 0.1453, "step": 4060 }, { "epoch": 1.8475887170154686, "grad_norm": 0.5733964270994036, "learning_rate": 3.497000104831518e-05, "loss": 0.0772, "step": 4061 }, { "epoch": 1.8480436760691537, "grad_norm": 0.4897988699033504, "learning_rate": 3.496344704788165e-05, "loss": 0.0778, "step": 4062 }, { "epoch": 1.848498635122839, "grad_norm": 0.618785349075561, "learning_rate": 3.4956892233280215e-05, "loss": 0.1289, "step": 4063 }, { "epoch": 1.8489535941765243, "grad_norm": 0.5800134424591697, "learning_rate": 3.495033660504651e-05, "loss": 0.1013, "step": 4064 }, { "epoch": 1.8494085532302091, "grad_norm": 0.891500579317675, "learning_rate": 3.494378016371623e-05, "loss": 0.1677, "step": 4065 }, { "epoch": 1.8498635122838945, "grad_norm": 0.8477639296963919, "learning_rate": 3.4937222909825155e-05, "loss": 0.1605, "step": 4066 }, { "epoch": 1.8503184713375798, "grad_norm": 0.6623588180446133, "learning_rate": 3.493066484390909e-05, "loss": 0.1052, "step": 4067 }, { "epoch": 1.8507734303912646, "grad_norm": 0.5541449298843713, "learning_rate": 3.492410596650395e-05, "loss": 0.1013, "step": 4068 }, { "epoch": 1.85122838944495, "grad_norm": 0.6640562748428137, "learning_rate": 3.491754627814568e-05, "loss": 0.1559, "step": 4069 }, { "epoch": 1.8516833484986353, "grad_norm": 0.8748901403171461, "learning_rate": 3.4910985779370306e-05, "loss": 0.1254, "step": 4070 }, { "epoch": 1.8521383075523203, "grad_norm": 0.6983960636547808, "learning_rate": 3.4904424470713945e-05, "loss": 0.1034, "step": 4071 }, { "epoch": 1.8525932666060054, "grad_norm": 0.4721233639137621, "learning_rate": 3.489786235271274e-05, "loss": 0.1129, "step": 4072 }, { "epoch": 1.8530482256596907, "grad_norm": 0.5346653771642866, "learning_rate": 3.4891299425902924e-05, "loss": 0.0952, "step": 4073 }, { "epoch": 1.8535031847133758, "grad_norm": 0.5198958514014564, "learning_rate": 3.4884735690820784e-05, "loss": 0.1111, "step": 4074 }, { "epoch": 1.853958143767061, "grad_norm": 0.7000183622252671, "learning_rate": 3.487817114800269e-05, "loss": 0.1515, "step": 4075 }, { "epoch": 1.8544131028207462, "grad_norm": 0.4650015425604801, "learning_rate": 3.487160579798505e-05, "loss": 0.0996, "step": 4076 }, { "epoch": 1.8548680618744313, "grad_norm": 0.6619753033754873, "learning_rate": 3.486503964130437e-05, "loss": 0.1138, "step": 4077 }, { "epoch": 1.8553230209281164, "grad_norm": 2.772586377105591, "learning_rate": 3.48584726784972e-05, "loss": 0.2726, "step": 4078 }, { "epoch": 1.8557779799818017, "grad_norm": 0.689308894151, "learning_rate": 3.485190491010016e-05, "loss": 0.1716, "step": 4079 }, { "epoch": 1.8562329390354868, "grad_norm": 0.8193754885230936, "learning_rate": 3.484533633664994e-05, "loss": 0.1831, "step": 4080 }, { "epoch": 1.856687898089172, "grad_norm": 0.8158761014597268, "learning_rate": 3.4838766958683304e-05, "loss": 0.1164, "step": 4081 }, { "epoch": 1.8571428571428572, "grad_norm": 1.1798601810846283, "learning_rate": 3.483219677673706e-05, "loss": 0.1871, "step": 4082 }, { "epoch": 1.8575978161965423, "grad_norm": 0.6180622730057671, "learning_rate": 3.4825625791348096e-05, "loss": 0.1124, "step": 4083 }, { "epoch": 1.8580527752502274, "grad_norm": 4.392864409315595, "learning_rate": 3.481905400305336e-05, "loss": 0.14, "step": 4084 }, { "epoch": 1.8585077343039127, "grad_norm": 0.6196062686271282, "learning_rate": 3.481248141238988e-05, "loss": 0.1137, "step": 4085 }, { "epoch": 1.8589626933575978, "grad_norm": 0.6412206431418077, "learning_rate": 3.480590801989473e-05, "loss": 0.1352, "step": 4086 }, { "epoch": 1.8594176524112829, "grad_norm": 0.7680457153070006, "learning_rate": 3.4799333826105054e-05, "loss": 0.1239, "step": 4087 }, { "epoch": 1.8598726114649682, "grad_norm": 0.6768095809268688, "learning_rate": 3.4792758831558084e-05, "loss": 0.1608, "step": 4088 }, { "epoch": 1.8603275705186533, "grad_norm": 0.6373793139696404, "learning_rate": 3.4786183036791075e-05, "loss": 0.1319, "step": 4089 }, { "epoch": 1.8607825295723384, "grad_norm": 0.5639762479926199, "learning_rate": 3.4779606442341386e-05, "loss": 0.1009, "step": 4090 }, { "epoch": 1.8612374886260237, "grad_norm": 0.8227853824402515, "learning_rate": 3.477302904874642e-05, "loss": 0.1315, "step": 4091 }, { "epoch": 1.861692447679709, "grad_norm": 0.5070407685444608, "learning_rate": 3.476645085654366e-05, "loss": 0.0803, "step": 4092 }, { "epoch": 1.8621474067333939, "grad_norm": 0.422707751050137, "learning_rate": 3.475987186627063e-05, "loss": 0.0717, "step": 4093 }, { "epoch": 1.8626023657870792, "grad_norm": 0.6072345022012531, "learning_rate": 3.475329207846496e-05, "loss": 0.0937, "step": 4094 }, { "epoch": 1.8630573248407645, "grad_norm": 0.5391265397353996, "learning_rate": 3.47467114936643e-05, "loss": 0.1306, "step": 4095 }, { "epoch": 1.8635122838944493, "grad_norm": 0.6092201440906208, "learning_rate": 3.474013011240639e-05, "loss": 0.1393, "step": 4096 }, { "epoch": 1.8639672429481347, "grad_norm": 0.537281537531884, "learning_rate": 3.473354793522904e-05, "loss": 0.0732, "step": 4097 }, { "epoch": 1.86442220200182, "grad_norm": 0.621360916930337, "learning_rate": 3.4726964962670105e-05, "loss": 0.1111, "step": 4098 }, { "epoch": 1.864877161055505, "grad_norm": 0.6045461988564991, "learning_rate": 3.4720381195267524e-05, "loss": 0.0942, "step": 4099 }, { "epoch": 1.8653321201091901, "grad_norm": 0.7045653680194242, "learning_rate": 3.471379663355928e-05, "loss": 0.1457, "step": 4100 }, { "epoch": 1.8657870791628755, "grad_norm": 0.7555102866741992, "learning_rate": 3.470721127808345e-05, "loss": 0.116, "step": 4101 }, { "epoch": 1.8662420382165605, "grad_norm": 0.7778122816226306, "learning_rate": 3.470062512937815e-05, "loss": 0.1457, "step": 4102 }, { "epoch": 1.8666969972702456, "grad_norm": 0.5112847305529499, "learning_rate": 3.469403818798157e-05, "loss": 0.0967, "step": 4103 }, { "epoch": 1.867151956323931, "grad_norm": 0.7000543448903995, "learning_rate": 3.468745045443197e-05, "loss": 0.1365, "step": 4104 }, { "epoch": 1.867606915377616, "grad_norm": 0.49838237729979606, "learning_rate": 3.4680861929267665e-05, "loss": 0.0617, "step": 4105 }, { "epoch": 1.8680618744313011, "grad_norm": 0.5349178151623888, "learning_rate": 3.467427261302704e-05, "loss": 0.0876, "step": 4106 }, { "epoch": 1.8685168334849864, "grad_norm": 0.5432617701857464, "learning_rate": 3.4667682506248544e-05, "loss": 0.1044, "step": 4107 }, { "epoch": 1.8689717925386715, "grad_norm": 0.6854514424644449, "learning_rate": 3.46610916094707e-05, "loss": 0.0935, "step": 4108 }, { "epoch": 1.8694267515923566, "grad_norm": 0.6043997888837951, "learning_rate": 3.4654499923232074e-05, "loss": 0.0962, "step": 4109 }, { "epoch": 1.869881710646042, "grad_norm": 0.7623646899568769, "learning_rate": 3.464790744807132e-05, "loss": 0.1195, "step": 4110 }, { "epoch": 1.870336669699727, "grad_norm": 0.5830542124604974, "learning_rate": 3.464131418452713e-05, "loss": 0.1102, "step": 4111 }, { "epoch": 1.870791628753412, "grad_norm": 0.7013694094355, "learning_rate": 3.463472013313829e-05, "loss": 0.0988, "step": 4112 }, { "epoch": 1.8712465878070974, "grad_norm": 0.5694465067899013, "learning_rate": 3.4628125294443624e-05, "loss": 0.101, "step": 4113 }, { "epoch": 1.8717015468607825, "grad_norm": 0.9001617318352928, "learning_rate": 3.462152966898205e-05, "loss": 0.1087, "step": 4114 }, { "epoch": 1.8721565059144676, "grad_norm": 0.5040881971660128, "learning_rate": 3.4614933257292515e-05, "loss": 0.1145, "step": 4115 }, { "epoch": 1.872611464968153, "grad_norm": 0.7137429414696579, "learning_rate": 3.460833605991405e-05, "loss": 0.1213, "step": 4116 }, { "epoch": 1.873066424021838, "grad_norm": 0.5681099520662589, "learning_rate": 3.4601738077385765e-05, "loss": 0.0845, "step": 4117 }, { "epoch": 1.873521383075523, "grad_norm": 0.6607971642430284, "learning_rate": 3.459513931024679e-05, "loss": 0.1353, "step": 4118 }, { "epoch": 1.8739763421292084, "grad_norm": 0.692890191871579, "learning_rate": 3.458853975903638e-05, "loss": 0.1073, "step": 4119 }, { "epoch": 1.8744313011828937, "grad_norm": 0.9784668873423582, "learning_rate": 3.4581939424293794e-05, "loss": 0.136, "step": 4120 }, { "epoch": 1.8748862602365786, "grad_norm": 0.6658641575399574, "learning_rate": 3.457533830655838e-05, "loss": 0.1345, "step": 4121 }, { "epoch": 1.8753412192902639, "grad_norm": 0.8507991035838187, "learning_rate": 3.456873640636958e-05, "loss": 0.1671, "step": 4122 }, { "epoch": 1.8757961783439492, "grad_norm": 0.8020285735167498, "learning_rate": 3.456213372426684e-05, "loss": 0.1096, "step": 4123 }, { "epoch": 1.876251137397634, "grad_norm": 0.6010299874158781, "learning_rate": 3.4555530260789715e-05, "loss": 0.111, "step": 4124 }, { "epoch": 1.8767060964513194, "grad_norm": 0.6553001999397429, "learning_rate": 3.4548926016477815e-05, "loss": 0.1547, "step": 4125 }, { "epoch": 1.8771610555050047, "grad_norm": 0.8318197016678447, "learning_rate": 3.45423209918708e-05, "loss": 0.1396, "step": 4126 }, { "epoch": 1.8776160145586898, "grad_norm": 0.6731128603143806, "learning_rate": 3.4535715187508405e-05, "loss": 0.1046, "step": 4127 }, { "epoch": 1.8780709736123748, "grad_norm": 1.951095864507846, "learning_rate": 3.452910860393043e-05, "loss": 0.2052, "step": 4128 }, { "epoch": 1.8785259326660602, "grad_norm": 0.6127054165205925, "learning_rate": 3.452250124167674e-05, "loss": 0.1419, "step": 4129 }, { "epoch": 1.8789808917197452, "grad_norm": 0.5657294162739868, "learning_rate": 3.451589310128724e-05, "loss": 0.1246, "step": 4130 }, { "epoch": 1.8794358507734303, "grad_norm": 0.5704258287053939, "learning_rate": 3.450928418330193e-05, "loss": 0.0954, "step": 4131 }, { "epoch": 1.8798908098271156, "grad_norm": 1.1769884006839508, "learning_rate": 3.4502674488260866e-05, "loss": 0.1802, "step": 4132 }, { "epoch": 1.8803457688808007, "grad_norm": 0.6289387707395766, "learning_rate": 3.449606401670415e-05, "loss": 0.1317, "step": 4133 }, { "epoch": 1.8808007279344858, "grad_norm": 0.6715029069628532, "learning_rate": 3.448945276917198e-05, "loss": 0.0925, "step": 4134 }, { "epoch": 1.8812556869881711, "grad_norm": 0.6673233476240419, "learning_rate": 3.448284074620457e-05, "loss": 0.1267, "step": 4135 }, { "epoch": 1.8817106460418562, "grad_norm": 1.0083047601387674, "learning_rate": 3.447622794834224e-05, "loss": 0.1404, "step": 4136 }, { "epoch": 1.8821656050955413, "grad_norm": 0.569565458620979, "learning_rate": 3.446961437612536e-05, "loss": 0.1346, "step": 4137 }, { "epoch": 1.8826205641492266, "grad_norm": 0.5501718539440719, "learning_rate": 3.4463000030094354e-05, "loss": 0.1093, "step": 4138 }, { "epoch": 1.8830755232029117, "grad_norm": 0.48844458377023403, "learning_rate": 3.445638491078973e-05, "loss": 0.1027, "step": 4139 }, { "epoch": 1.8835304822565968, "grad_norm": 0.6917885275535727, "learning_rate": 3.444976901875203e-05, "loss": 0.15, "step": 4140 }, { "epoch": 1.8839854413102821, "grad_norm": 0.5522747709165132, "learning_rate": 3.444315235452188e-05, "loss": 0.1224, "step": 4141 }, { "epoch": 1.8844404003639672, "grad_norm": 0.6655350543705966, "learning_rate": 3.4436534918639955e-05, "loss": 0.116, "step": 4142 }, { "epoch": 1.8848953594176523, "grad_norm": 0.7675995680850697, "learning_rate": 3.442991671164703e-05, "loss": 0.1276, "step": 4143 }, { "epoch": 1.8853503184713376, "grad_norm": 0.7344368935697, "learning_rate": 3.442329773408388e-05, "loss": 0.1268, "step": 4144 }, { "epoch": 1.8858052775250227, "grad_norm": 0.6370993048521529, "learning_rate": 3.4416677986491395e-05, "loss": 0.1088, "step": 4145 }, { "epoch": 1.8862602365787078, "grad_norm": 0.5471353492807612, "learning_rate": 3.441005746941052e-05, "loss": 0.1159, "step": 4146 }, { "epoch": 1.886715195632393, "grad_norm": 0.5819064742612399, "learning_rate": 3.440343618338224e-05, "loss": 0.1392, "step": 4147 }, { "epoch": 1.8871701546860784, "grad_norm": 0.7396177531466654, "learning_rate": 3.439681412894762e-05, "loss": 0.1369, "step": 4148 }, { "epoch": 1.8876251137397633, "grad_norm": 0.5377157170833544, "learning_rate": 3.4390191306647784e-05, "loss": 0.0838, "step": 4149 }, { "epoch": 1.8880800727934486, "grad_norm": 0.8115617843873039, "learning_rate": 3.4383567717023924e-05, "loss": 0.1435, "step": 4150 }, { "epoch": 1.888535031847134, "grad_norm": 0.5263419480390826, "learning_rate": 3.4376943360617295e-05, "loss": 0.1007, "step": 4151 }, { "epoch": 1.8889899909008188, "grad_norm": 0.6752228061124457, "learning_rate": 3.437031823796918e-05, "loss": 0.0937, "step": 4152 }, { "epoch": 1.889444949954504, "grad_norm": 0.6539255568743292, "learning_rate": 3.436369234962098e-05, "loss": 0.1521, "step": 4153 }, { "epoch": 1.8898999090081894, "grad_norm": 0.5200628320426651, "learning_rate": 3.4357065696114134e-05, "loss": 0.0874, "step": 4154 }, { "epoch": 1.8903548680618745, "grad_norm": 0.6522037286890563, "learning_rate": 3.4350438277990135e-05, "loss": 0.1898, "step": 4155 }, { "epoch": 1.8908098271155596, "grad_norm": 0.4482862436560359, "learning_rate": 3.4343810095790545e-05, "loss": 0.1063, "step": 4156 }, { "epoch": 1.8912647861692449, "grad_norm": 0.5585747929400661, "learning_rate": 3.4337181150056984e-05, "loss": 0.0726, "step": 4157 }, { "epoch": 1.89171974522293, "grad_norm": 0.7228645271197705, "learning_rate": 3.433055144133116e-05, "loss": 0.0979, "step": 4158 }, { "epoch": 1.892174704276615, "grad_norm": 0.8662353873938982, "learning_rate": 3.432392097015479e-05, "loss": 0.091, "step": 4159 }, { "epoch": 1.8926296633303004, "grad_norm": 0.5478805457001271, "learning_rate": 3.431728973706972e-05, "loss": 0.1421, "step": 4160 }, { "epoch": 1.8930846223839854, "grad_norm": 0.6144642255887814, "learning_rate": 3.43106577426178e-05, "loss": 0.1589, "step": 4161 }, { "epoch": 1.8935395814376705, "grad_norm": 0.7013314851181148, "learning_rate": 3.430402498734098e-05, "loss": 0.1186, "step": 4162 }, { "epoch": 1.8939945404913558, "grad_norm": 0.7180311100709929, "learning_rate": 3.429739147178126e-05, "loss": 0.1276, "step": 4163 }, { "epoch": 1.894449499545041, "grad_norm": 0.7039988100318366, "learning_rate": 3.4290757196480686e-05, "loss": 0.1576, "step": 4164 }, { "epoch": 1.894904458598726, "grad_norm": 0.6392438953497951, "learning_rate": 3.4284122161981396e-05, "loss": 0.1634, "step": 4165 }, { "epoch": 1.8953594176524113, "grad_norm": 0.8687343957981292, "learning_rate": 3.427748636882556e-05, "loss": 0.1333, "step": 4166 }, { "epoch": 1.8958143767060964, "grad_norm": 0.5238100053538426, "learning_rate": 3.427084981755545e-05, "loss": 0.1117, "step": 4167 }, { "epoch": 1.8962693357597815, "grad_norm": 0.6108434815517393, "learning_rate": 3.4264212508713354e-05, "loss": 0.1216, "step": 4168 }, { "epoch": 1.8967242948134668, "grad_norm": 0.5141198075406735, "learning_rate": 3.425757444284164e-05, "loss": 0.0842, "step": 4169 }, { "epoch": 1.897179253867152, "grad_norm": 0.46376452043095145, "learning_rate": 3.425093562048276e-05, "loss": 0.1303, "step": 4170 }, { "epoch": 1.897634212920837, "grad_norm": 0.659603030862727, "learning_rate": 3.4244296042179194e-05, "loss": 0.1083, "step": 4171 }, { "epoch": 1.8980891719745223, "grad_norm": 0.5986572079326599, "learning_rate": 3.4237655708473505e-05, "loss": 0.1359, "step": 4172 }, { "epoch": 1.8985441310282076, "grad_norm": 0.6482903980335606, "learning_rate": 3.42310146199083e-05, "loss": 0.131, "step": 4173 }, { "epoch": 1.8989990900818925, "grad_norm": 0.5621324104407012, "learning_rate": 3.422437277702628e-05, "loss": 0.1023, "step": 4174 }, { "epoch": 1.8994540491355778, "grad_norm": 0.47951821015351165, "learning_rate": 3.421773018037017e-05, "loss": 0.1212, "step": 4175 }, { "epoch": 1.8999090081892631, "grad_norm": 0.6803253218242251, "learning_rate": 3.421108683048276e-05, "loss": 0.0809, "step": 4176 }, { "epoch": 1.900363967242948, "grad_norm": 0.883465744540734, "learning_rate": 3.420444272790695e-05, "loss": 0.1294, "step": 4177 }, { "epoch": 1.9008189262966333, "grad_norm": 0.6483951852496427, "learning_rate": 3.4197797873185637e-05, "loss": 0.15, "step": 4178 }, { "epoch": 1.9012738853503186, "grad_norm": 0.721345715811447, "learning_rate": 3.4191152266861824e-05, "loss": 0.08, "step": 4179 }, { "epoch": 1.9017288444040037, "grad_norm": 0.8036596457224673, "learning_rate": 3.418450590947855e-05, "loss": 0.1188, "step": 4180 }, { "epoch": 1.9021838034576888, "grad_norm": 0.7729860602685976, "learning_rate": 3.417785880157894e-05, "loss": 0.192, "step": 4181 }, { "epoch": 1.902638762511374, "grad_norm": 0.6331810240764438, "learning_rate": 3.417121094370615e-05, "loss": 0.1055, "step": 4182 }, { "epoch": 1.9030937215650592, "grad_norm": 0.6832333743825814, "learning_rate": 3.4164562336403415e-05, "loss": 0.1037, "step": 4183 }, { "epoch": 1.9035486806187443, "grad_norm": 0.6555303579720353, "learning_rate": 3.415791298021404e-05, "loss": 0.1402, "step": 4184 }, { "epoch": 1.9040036396724296, "grad_norm": 0.5780783086873399, "learning_rate": 3.415126287568136e-05, "loss": 0.1642, "step": 4185 }, { "epoch": 1.9044585987261147, "grad_norm": 0.5301475393412647, "learning_rate": 3.414461202334882e-05, "loss": 0.1024, "step": 4186 }, { "epoch": 1.9049135577797998, "grad_norm": 0.4379539949555472, "learning_rate": 3.413796042375987e-05, "loss": 0.1039, "step": 4187 }, { "epoch": 1.905368516833485, "grad_norm": 0.6606068174700802, "learning_rate": 3.413130807745807e-05, "loss": 0.1387, "step": 4188 }, { "epoch": 1.9058234758871702, "grad_norm": 0.6383355010977033, "learning_rate": 3.4124654984987004e-05, "loss": 0.0922, "step": 4189 }, { "epoch": 1.9062784349408552, "grad_norm": 0.6729406326387064, "learning_rate": 3.411800114689034e-05, "loss": 0.1214, "step": 4190 }, { "epoch": 1.9067333939945406, "grad_norm": 0.6163607235378216, "learning_rate": 3.4111346563711796e-05, "loss": 0.0805, "step": 4191 }, { "epoch": 1.9071883530482256, "grad_norm": 0.6460348102984439, "learning_rate": 3.410469123599517e-05, "loss": 0.1592, "step": 4192 }, { "epoch": 1.9076433121019107, "grad_norm": 0.6890562748638434, "learning_rate": 3.409803516428428e-05, "loss": 0.0971, "step": 4193 }, { "epoch": 1.908098271155596, "grad_norm": 0.6633497152728575, "learning_rate": 3.409137834912305e-05, "loss": 0.1178, "step": 4194 }, { "epoch": 1.9085532302092811, "grad_norm": 0.7394215818906649, "learning_rate": 3.408472079105544e-05, "loss": 0.1417, "step": 4195 }, { "epoch": 1.9090081892629662, "grad_norm": 0.8125208405986047, "learning_rate": 3.407806249062546e-05, "loss": 0.1422, "step": 4196 }, { "epoch": 1.9094631483166515, "grad_norm": 0.661849771369428, "learning_rate": 3.407140344837722e-05, "loss": 0.1673, "step": 4197 }, { "epoch": 1.9099181073703366, "grad_norm": 0.5608694686439253, "learning_rate": 3.406474366485485e-05, "loss": 0.0671, "step": 4198 }, { "epoch": 1.9103730664240217, "grad_norm": 0.5835259385275783, "learning_rate": 3.405808314060257e-05, "loss": 0.141, "step": 4199 }, { "epoch": 1.910828025477707, "grad_norm": 0.5360251366282514, "learning_rate": 3.405142187616464e-05, "loss": 0.0986, "step": 4200 }, { "epoch": 1.9112829845313923, "grad_norm": 0.690474082150042, "learning_rate": 3.404475987208539e-05, "loss": 0.1376, "step": 4201 }, { "epoch": 1.9117379435850772, "grad_norm": 0.696598101052573, "learning_rate": 3.4038097128909206e-05, "loss": 0.1373, "step": 4202 }, { "epoch": 1.9121929026387625, "grad_norm": 0.6649385831796273, "learning_rate": 3.403143364718054e-05, "loss": 0.1196, "step": 4203 }, { "epoch": 1.9126478616924478, "grad_norm": 0.6484798266387195, "learning_rate": 3.4024769427443916e-05, "loss": 0.0698, "step": 4204 }, { "epoch": 1.9131028207461327, "grad_norm": 0.6030983970741872, "learning_rate": 3.401810447024387e-05, "loss": 0.1268, "step": 4205 }, { "epoch": 1.913557779799818, "grad_norm": 0.6049041781357947, "learning_rate": 3.401143877612506e-05, "loss": 0.1388, "step": 4206 }, { "epoch": 1.9140127388535033, "grad_norm": 0.5743981546865794, "learning_rate": 3.400477234563217e-05, "loss": 0.0865, "step": 4207 }, { "epoch": 1.9144676979071884, "grad_norm": 0.6755775558029066, "learning_rate": 3.3998105179309946e-05, "loss": 0.1065, "step": 4208 }, { "epoch": 1.9149226569608735, "grad_norm": 0.49456736519318817, "learning_rate": 3.399143727770321e-05, "loss": 0.0893, "step": 4209 }, { "epoch": 1.9153776160145588, "grad_norm": 0.6270847590454492, "learning_rate": 3.398476864135681e-05, "loss": 0.1159, "step": 4210 }, { "epoch": 1.915832575068244, "grad_norm": 0.5212897648437572, "learning_rate": 3.397809927081571e-05, "loss": 0.1362, "step": 4211 }, { "epoch": 1.916287534121929, "grad_norm": 1.0689905016463677, "learning_rate": 3.397142916662486e-05, "loss": 0.1175, "step": 4212 }, { "epoch": 1.9167424931756143, "grad_norm": 0.45675313303466947, "learning_rate": 3.396475832932935e-05, "loss": 0.0899, "step": 4213 }, { "epoch": 1.9171974522292994, "grad_norm": 0.7222530047101566, "learning_rate": 3.3958086759474275e-05, "loss": 0.1237, "step": 4214 }, { "epoch": 1.9176524112829845, "grad_norm": 0.5781365056713005, "learning_rate": 3.395141445760479e-05, "loss": 0.1162, "step": 4215 }, { "epoch": 1.9181073703366698, "grad_norm": 0.674377818489383, "learning_rate": 3.394474142426616e-05, "loss": 0.1078, "step": 4216 }, { "epoch": 1.9185623293903549, "grad_norm": 0.7905392741637409, "learning_rate": 3.393806766000364e-05, "loss": 0.1214, "step": 4217 }, { "epoch": 1.91901728844404, "grad_norm": 0.7242952094070126, "learning_rate": 3.39313931653626e-05, "loss": 0.1313, "step": 4218 }, { "epoch": 1.9194722474977253, "grad_norm": 0.6546858106357962, "learning_rate": 3.392471794088844e-05, "loss": 0.1282, "step": 4219 }, { "epoch": 1.9199272065514104, "grad_norm": 0.6468493109425776, "learning_rate": 3.3918041987126635e-05, "loss": 0.1389, "step": 4220 }, { "epoch": 1.9203821656050954, "grad_norm": 0.5656083318083209, "learning_rate": 3.3911365304622714e-05, "loss": 0.0864, "step": 4221 }, { "epoch": 1.9208371246587808, "grad_norm": 0.7613317624705463, "learning_rate": 3.390468789392226e-05, "loss": 0.154, "step": 4222 }, { "epoch": 1.9212920837124658, "grad_norm": 0.6097143383437371, "learning_rate": 3.389800975557093e-05, "loss": 0.0855, "step": 4223 }, { "epoch": 1.921747042766151, "grad_norm": 0.6189910413942646, "learning_rate": 3.389133089011442e-05, "loss": 0.1285, "step": 4224 }, { "epoch": 1.9222020018198362, "grad_norm": 0.568531872618919, "learning_rate": 3.388465129809851e-05, "loss": 0.1727, "step": 4225 }, { "epoch": 1.9226569608735213, "grad_norm": 1.1150448716740244, "learning_rate": 3.3877970980069014e-05, "loss": 0.1337, "step": 4226 }, { "epoch": 1.9231119199272064, "grad_norm": 0.6264489419264911, "learning_rate": 3.387128993657182e-05, "loss": 0.1019, "step": 4227 }, { "epoch": 1.9235668789808917, "grad_norm": 0.6124636938688336, "learning_rate": 3.386460816815288e-05, "loss": 0.0884, "step": 4228 }, { "epoch": 1.924021838034577, "grad_norm": 0.5494817744922231, "learning_rate": 3.385792567535819e-05, "loss": 0.1046, "step": 4229 }, { "epoch": 1.924476797088262, "grad_norm": 0.6229075279847761, "learning_rate": 3.385124245873382e-05, "loss": 0.1254, "step": 4230 }, { "epoch": 1.9249317561419472, "grad_norm": 0.7675992302309707, "learning_rate": 3.384455851882588e-05, "loss": 0.1691, "step": 4231 }, { "epoch": 1.9253867151956325, "grad_norm": 0.5699860424756125, "learning_rate": 3.383787385618057e-05, "loss": 0.1231, "step": 4232 }, { "epoch": 1.9258416742493174, "grad_norm": 0.8445840774132664, "learning_rate": 3.383118847134411e-05, "loss": 0.0936, "step": 4233 }, { "epoch": 1.9262966333030027, "grad_norm": 1.042279172510895, "learning_rate": 3.382450236486281e-05, "loss": 0.096, "step": 4234 }, { "epoch": 1.926751592356688, "grad_norm": 0.6339957036028224, "learning_rate": 3.3817815537283035e-05, "loss": 0.0864, "step": 4235 }, { "epoch": 1.9272065514103731, "grad_norm": 0.74815156027168, "learning_rate": 3.3811127989151184e-05, "loss": 0.1139, "step": 4236 }, { "epoch": 1.9276615104640582, "grad_norm": 0.7368820211625379, "learning_rate": 3.3804439721013755e-05, "loss": 0.1015, "step": 4237 }, { "epoch": 1.9281164695177435, "grad_norm": 0.7091631106691398, "learning_rate": 3.379775073341727e-05, "loss": 0.1381, "step": 4238 }, { "epoch": 1.9285714285714286, "grad_norm": 0.4992626244935722, "learning_rate": 3.379106102690832e-05, "loss": 0.0986, "step": 4239 }, { "epoch": 1.9290263876251137, "grad_norm": 0.7001609152461387, "learning_rate": 3.378437060203357e-05, "loss": 0.1324, "step": 4240 }, { "epoch": 1.929481346678799, "grad_norm": 0.7996440833657636, "learning_rate": 3.3777679459339716e-05, "loss": 0.1072, "step": 4241 }, { "epoch": 1.929936305732484, "grad_norm": 1.1989963782975182, "learning_rate": 3.377098759937355e-05, "loss": 0.1458, "step": 4242 }, { "epoch": 1.9303912647861692, "grad_norm": 0.6412752728593426, "learning_rate": 3.376429502268188e-05, "loss": 0.0997, "step": 4243 }, { "epoch": 1.9308462238398545, "grad_norm": 0.7117204370102257, "learning_rate": 3.3757601729811596e-05, "loss": 0.153, "step": 4244 }, { "epoch": 1.9313011828935396, "grad_norm": 0.6684528449816252, "learning_rate": 3.375090772130966e-05, "loss": 0.094, "step": 4245 }, { "epoch": 1.9317561419472247, "grad_norm": 0.655826890329857, "learning_rate": 3.3744212997723043e-05, "loss": 0.0871, "step": 4246 }, { "epoch": 1.93221110100091, "grad_norm": 0.5732481545873328, "learning_rate": 3.373751755959884e-05, "loss": 0.1034, "step": 4247 }, { "epoch": 1.932666060054595, "grad_norm": 0.5795391534382663, "learning_rate": 3.3730821407484156e-05, "loss": 0.1153, "step": 4248 }, { "epoch": 1.9331210191082802, "grad_norm": 0.5038068528490652, "learning_rate": 3.372412454192618e-05, "loss": 0.1583, "step": 4249 }, { "epoch": 1.9335759781619655, "grad_norm": 0.6351656779177546, "learning_rate": 3.371742696347214e-05, "loss": 0.0923, "step": 4250 }, { "epoch": 1.9340309372156506, "grad_norm": 0.5294655036838565, "learning_rate": 3.371072867266934e-05, "loss": 0.0633, "step": 4251 }, { "epoch": 1.9344858962693356, "grad_norm": 0.5266198261598204, "learning_rate": 3.370402967006513e-05, "loss": 0.1074, "step": 4252 }, { "epoch": 1.934940855323021, "grad_norm": 0.563948481928545, "learning_rate": 3.369732995620692e-05, "loss": 0.1274, "step": 4253 }, { "epoch": 1.935395814376706, "grad_norm": 0.5955923052694833, "learning_rate": 3.3690629531642186e-05, "loss": 0.1116, "step": 4254 }, { "epoch": 1.9358507734303911, "grad_norm": 0.5479786949419669, "learning_rate": 3.3683928396918454e-05, "loss": 0.1064, "step": 4255 }, { "epoch": 1.9363057324840764, "grad_norm": 1.1029430342543942, "learning_rate": 3.367722655258331e-05, "loss": 0.1815, "step": 4256 }, { "epoch": 1.9367606915377618, "grad_norm": 0.5310937343884611, "learning_rate": 3.367052399918439e-05, "loss": 0.0602, "step": 4257 }, { "epoch": 1.9372156505914466, "grad_norm": 0.6219667463029348, "learning_rate": 3.3663820737269406e-05, "loss": 0.0932, "step": 4258 }, { "epoch": 1.937670609645132, "grad_norm": 0.5090477291860741, "learning_rate": 3.365711676738612e-05, "loss": 0.0892, "step": 4259 }, { "epoch": 1.9381255686988172, "grad_norm": 0.9421939943006665, "learning_rate": 3.365041209008235e-05, "loss": 0.1848, "step": 4260 }, { "epoch": 1.9385805277525021, "grad_norm": 0.6505699543666258, "learning_rate": 3.364370670590596e-05, "loss": 0.0824, "step": 4261 }, { "epoch": 1.9390354868061874, "grad_norm": 2.102083639019624, "learning_rate": 3.3637000615404907e-05, "loss": 0.1714, "step": 4262 }, { "epoch": 1.9394904458598727, "grad_norm": 0.6682621075554491, "learning_rate": 3.363029381912716e-05, "loss": 0.1015, "step": 4263 }, { "epoch": 1.9399454049135578, "grad_norm": 0.581958449043036, "learning_rate": 3.362358631762077e-05, "loss": 0.089, "step": 4264 }, { "epoch": 1.940400363967243, "grad_norm": 0.6947061380685713, "learning_rate": 3.361687811143386e-05, "loss": 0.16, "step": 4265 }, { "epoch": 1.9408553230209282, "grad_norm": 0.5492679115392354, "learning_rate": 3.3610169201114584e-05, "loss": 0.0951, "step": 4266 }, { "epoch": 1.9413102820746133, "grad_norm": 0.8613749716949574, "learning_rate": 3.360345958721115e-05, "loss": 0.1318, "step": 4267 }, { "epoch": 1.9417652411282984, "grad_norm": 0.8045138817092433, "learning_rate": 3.3596749270271866e-05, "loss": 0.1077, "step": 4268 }, { "epoch": 1.9422202001819837, "grad_norm": 0.7084093757161231, "learning_rate": 3.359003825084505e-05, "loss": 0.102, "step": 4269 }, { "epoch": 1.9426751592356688, "grad_norm": 1.4820228502363177, "learning_rate": 3.35833265294791e-05, "loss": 0.1679, "step": 4270 }, { "epoch": 1.943130118289354, "grad_norm": 0.6057922910492682, "learning_rate": 3.357661410672247e-05, "loss": 0.1373, "step": 4271 }, { "epoch": 1.9435850773430392, "grad_norm": 0.8979547028782279, "learning_rate": 3.3569900983123656e-05, "loss": 0.0899, "step": 4272 }, { "epoch": 1.9440400363967243, "grad_norm": 0.7774992748234054, "learning_rate": 3.356318715923125e-05, "loss": 0.1517, "step": 4273 }, { "epoch": 1.9444949954504094, "grad_norm": 1.11554096411055, "learning_rate": 3.355647263559386e-05, "loss": 0.1341, "step": 4274 }, { "epoch": 1.9449499545040947, "grad_norm": 0.4962505038804321, "learning_rate": 3.3549757412760156e-05, "loss": 0.0862, "step": 4275 }, { "epoch": 1.9454049135577798, "grad_norm": 0.8082507175954948, "learning_rate": 3.354304149127889e-05, "loss": 0.1342, "step": 4276 }, { "epoch": 1.9458598726114649, "grad_norm": 0.7499564735151145, "learning_rate": 3.353632487169886e-05, "loss": 0.137, "step": 4277 }, { "epoch": 1.9463148316651502, "grad_norm": 0.6141170575259088, "learning_rate": 3.35296075545689e-05, "loss": 0.1199, "step": 4278 }, { "epoch": 1.9467697907188353, "grad_norm": 0.5538787123922235, "learning_rate": 3.352288954043795e-05, "loss": 0.1538, "step": 4279 }, { "epoch": 1.9472247497725204, "grad_norm": 1.0425936200669832, "learning_rate": 3.351617082985494e-05, "loss": 0.1693, "step": 4280 }, { "epoch": 1.9476797088262057, "grad_norm": 0.5888368563174744, "learning_rate": 3.350945142336891e-05, "loss": 0.0916, "step": 4281 }, { "epoch": 1.9481346678798908, "grad_norm": 0.6049310784586162, "learning_rate": 3.3502731321528935e-05, "loss": 0.0977, "step": 4282 }, { "epoch": 1.9485896269335758, "grad_norm": 1.2995195763534368, "learning_rate": 3.349601052488416e-05, "loss": 0.2194, "step": 4283 }, { "epoch": 1.9490445859872612, "grad_norm": 0.640430952172325, "learning_rate": 3.348928903398377e-05, "loss": 0.1349, "step": 4284 }, { "epoch": 1.9494995450409465, "grad_norm": 0.5428906671006751, "learning_rate": 3.348256684937702e-05, "loss": 0.1116, "step": 4285 }, { "epoch": 1.9499545040946313, "grad_norm": 0.6284322718506554, "learning_rate": 3.3475843971613205e-05, "loss": 0.1056, "step": 4286 }, { "epoch": 1.9504094631483166, "grad_norm": 0.6912943947924209, "learning_rate": 3.3469120401241705e-05, "loss": 0.1033, "step": 4287 }, { "epoch": 1.950864422202002, "grad_norm": 0.5802509789381782, "learning_rate": 3.346239613881194e-05, "loss": 0.1054, "step": 4288 }, { "epoch": 1.9513193812556868, "grad_norm": 0.653720941947449, "learning_rate": 3.345567118487336e-05, "loss": 0.1052, "step": 4289 }, { "epoch": 1.9517743403093721, "grad_norm": 0.6828263647230736, "learning_rate": 3.344894553997553e-05, "loss": 0.1328, "step": 4290 }, { "epoch": 1.9522292993630574, "grad_norm": 0.8107580399544722, "learning_rate": 3.3442219204668024e-05, "loss": 0.1659, "step": 4291 }, { "epoch": 1.9526842584167425, "grad_norm": 0.6409569421392773, "learning_rate": 3.3435492179500485e-05, "loss": 0.1123, "step": 4292 }, { "epoch": 1.9531392174704276, "grad_norm": 0.6482829858559593, "learning_rate": 3.342876446502262e-05, "loss": 0.1488, "step": 4293 }, { "epoch": 1.953594176524113, "grad_norm": 0.8673291984136156, "learning_rate": 3.342203606178419e-05, "loss": 0.1159, "step": 4294 }, { "epoch": 1.954049135577798, "grad_norm": 0.7739979834494801, "learning_rate": 3.3415306970335006e-05, "loss": 0.1352, "step": 4295 }, { "epoch": 1.9545040946314831, "grad_norm": 0.8944517521595131, "learning_rate": 3.340857719122494e-05, "loss": 0.2028, "step": 4296 }, { "epoch": 1.9549590536851684, "grad_norm": 0.4878549170954671, "learning_rate": 3.3401846725003914e-05, "loss": 0.0789, "step": 4297 }, { "epoch": 1.9554140127388535, "grad_norm": 0.6774744689352477, "learning_rate": 3.3395115572221926e-05, "loss": 0.1066, "step": 4298 }, { "epoch": 1.9558689717925386, "grad_norm": 0.5242146713715867, "learning_rate": 3.3388383733428984e-05, "loss": 0.1151, "step": 4299 }, { "epoch": 1.956323930846224, "grad_norm": 0.7125654201658779, "learning_rate": 3.3381651209175225e-05, "loss": 0.1126, "step": 4300 }, { "epoch": 1.956778889899909, "grad_norm": 0.7648701280804289, "learning_rate": 3.3374918000010775e-05, "loss": 0.092, "step": 4301 }, { "epoch": 1.957233848953594, "grad_norm": 0.5643645437988267, "learning_rate": 3.336818410648585e-05, "loss": 0.1031, "step": 4302 }, { "epoch": 1.9576888080072794, "grad_norm": 0.58751218733435, "learning_rate": 3.336144952915071e-05, "loss": 0.0829, "step": 4303 }, { "epoch": 1.9581437670609645, "grad_norm": 0.8381911456836129, "learning_rate": 3.3354714268555664e-05, "loss": 0.1123, "step": 4304 }, { "epoch": 1.9585987261146496, "grad_norm": 0.5807644810281264, "learning_rate": 3.334797832525111e-05, "loss": 0.0778, "step": 4305 }, { "epoch": 1.959053685168335, "grad_norm": 0.5720524865852676, "learning_rate": 3.3341241699787454e-05, "loss": 0.093, "step": 4306 }, { "epoch": 1.95950864422202, "grad_norm": 0.477022340591648, "learning_rate": 3.3334504392715206e-05, "loss": 0.0679, "step": 4307 }, { "epoch": 1.959963603275705, "grad_norm": 1.0434657487838364, "learning_rate": 3.332776640458489e-05, "loss": 0.1311, "step": 4308 }, { "epoch": 1.9604185623293904, "grad_norm": 0.5899143751777528, "learning_rate": 3.332102773594712e-05, "loss": 0.1461, "step": 4309 }, { "epoch": 1.9608735213830755, "grad_norm": 0.799992528436309, "learning_rate": 3.331428838735254e-05, "loss": 0.0729, "step": 4310 }, { "epoch": 1.9613284804367606, "grad_norm": 0.5553295881687743, "learning_rate": 3.330754835935185e-05, "loss": 0.1151, "step": 4311 }, { "epoch": 1.9617834394904459, "grad_norm": 0.5060267753482525, "learning_rate": 3.3300807652495836e-05, "loss": 0.0741, "step": 4312 }, { "epoch": 1.9622383985441312, "grad_norm": 0.5669511150575774, "learning_rate": 3.32940662673353e-05, "loss": 0.115, "step": 4313 }, { "epoch": 1.962693357597816, "grad_norm": 0.5532392542217481, "learning_rate": 3.3287324204421123e-05, "loss": 0.1032, "step": 4314 }, { "epoch": 1.9631483166515014, "grad_norm": 0.5132222831158011, "learning_rate": 3.328058146430424e-05, "loss": 0.1341, "step": 4315 }, { "epoch": 1.9636032757051867, "grad_norm": 0.6193852125371514, "learning_rate": 3.3273838047535635e-05, "loss": 0.1634, "step": 4316 }, { "epoch": 1.9640582347588715, "grad_norm": 0.6558344512583405, "learning_rate": 3.3267093954666345e-05, "loss": 0.1384, "step": 4317 }, { "epoch": 1.9645131938125568, "grad_norm": 0.6677183576837196, "learning_rate": 3.326034918624747e-05, "loss": 0.0831, "step": 4318 }, { "epoch": 1.9649681528662422, "grad_norm": 0.9030805572229573, "learning_rate": 3.325360374283017e-05, "loss": 0.1264, "step": 4319 }, { "epoch": 1.9654231119199272, "grad_norm": 0.7273296765109084, "learning_rate": 3.3246857624965646e-05, "loss": 0.1267, "step": 4320 }, { "epoch": 1.9658780709736123, "grad_norm": 0.7721052411749748, "learning_rate": 3.324011083320515e-05, "loss": 0.1314, "step": 4321 }, { "epoch": 1.9663330300272976, "grad_norm": 0.592333766813276, "learning_rate": 3.3233363368100026e-05, "loss": 0.1913, "step": 4322 }, { "epoch": 1.9667879890809827, "grad_norm": 0.644859064902758, "learning_rate": 3.322661523020161e-05, "loss": 0.1437, "step": 4323 }, { "epoch": 1.9672429481346678, "grad_norm": 0.7949025014604201, "learning_rate": 3.3219866420061356e-05, "loss": 0.119, "step": 4324 }, { "epoch": 1.9676979071883531, "grad_norm": 0.5768945860209929, "learning_rate": 3.3213116938230734e-05, "loss": 0.1039, "step": 4325 }, { "epoch": 1.9681528662420382, "grad_norm": 0.7480841976429833, "learning_rate": 3.32063667852613e-05, "loss": 0.1682, "step": 4326 }, { "epoch": 1.9686078252957233, "grad_norm": 0.5185732927466159, "learning_rate": 3.3199615961704614e-05, "loss": 0.1003, "step": 4327 }, { "epoch": 1.9690627843494086, "grad_norm": 0.5393715955981582, "learning_rate": 3.319286446811235e-05, "loss": 0.1139, "step": 4328 }, { "epoch": 1.9695177434030937, "grad_norm": 0.5648622442385766, "learning_rate": 3.3186112305036205e-05, "loss": 0.1075, "step": 4329 }, { "epoch": 1.9699727024567788, "grad_norm": 0.6118770418783448, "learning_rate": 3.317935947302792e-05, "loss": 0.1281, "step": 4330 }, { "epoch": 1.9704276615104641, "grad_norm": 0.4793501170869104, "learning_rate": 3.317260597263932e-05, "loss": 0.0714, "step": 4331 }, { "epoch": 1.9708826205641492, "grad_norm": 1.0913592337277365, "learning_rate": 3.316585180442228e-05, "loss": 0.199, "step": 4332 }, { "epoch": 1.9713375796178343, "grad_norm": 0.3896795638023996, "learning_rate": 3.315909696892869e-05, "loss": 0.1004, "step": 4333 }, { "epoch": 1.9717925386715196, "grad_norm": 0.5609088487714605, "learning_rate": 3.315234146671054e-05, "loss": 0.1, "step": 4334 }, { "epoch": 1.9722474977252047, "grad_norm": 0.7051026263072405, "learning_rate": 3.314558529831987e-05, "loss": 0.1381, "step": 4335 }, { "epoch": 1.9727024567788898, "grad_norm": 0.5095070510834645, "learning_rate": 3.313882846430876e-05, "loss": 0.0953, "step": 4336 }, { "epoch": 1.973157415832575, "grad_norm": 0.6801494607097475, "learning_rate": 3.313207096522933e-05, "loss": 0.1476, "step": 4337 }, { "epoch": 1.9736123748862604, "grad_norm": 0.6367265097809129, "learning_rate": 3.312531280163379e-05, "loss": 0.1722, "step": 4338 }, { "epoch": 1.9740673339399453, "grad_norm": 1.0183489214704617, "learning_rate": 3.311855397407439e-05, "loss": 0.1759, "step": 4339 }, { "epoch": 1.9745222929936306, "grad_norm": 0.5026258889811644, "learning_rate": 3.3111794483103406e-05, "loss": 0.0623, "step": 4340 }, { "epoch": 1.974977252047316, "grad_norm": 0.5349375593460299, "learning_rate": 3.310503432927322e-05, "loss": 0.0761, "step": 4341 }, { "epoch": 1.9754322111010008, "grad_norm": 0.6119719955016084, "learning_rate": 3.309827351313623e-05, "loss": 0.0955, "step": 4342 }, { "epoch": 1.975887170154686, "grad_norm": 0.6323367118140354, "learning_rate": 3.30915120352449e-05, "loss": 0.1234, "step": 4343 }, { "epoch": 1.9763421292083714, "grad_norm": 0.5416364311627817, "learning_rate": 3.308474989615174e-05, "loss": 0.155, "step": 4344 }, { "epoch": 1.9767970882620565, "grad_norm": 0.5961743012015047, "learning_rate": 3.307798709640933e-05, "loss": 0.1098, "step": 4345 }, { "epoch": 1.9772520473157416, "grad_norm": 0.5916393609007093, "learning_rate": 3.307122363657032e-05, "loss": 0.1313, "step": 4346 }, { "epoch": 1.9777070063694269, "grad_norm": 0.6493218247798929, "learning_rate": 3.306445951718733e-05, "loss": 0.1191, "step": 4347 }, { "epoch": 1.978161965423112, "grad_norm": 0.5211694008657178, "learning_rate": 3.305769473881314e-05, "loss": 0.1248, "step": 4348 }, { "epoch": 1.978616924476797, "grad_norm": 0.5550725707536979, "learning_rate": 3.305092930200053e-05, "loss": 0.1431, "step": 4349 }, { "epoch": 1.9790718835304824, "grad_norm": 0.5440260952360739, "learning_rate": 3.3044163207302325e-05, "loss": 0.129, "step": 4350 }, { "epoch": 1.9795268425841674, "grad_norm": 0.5006408986920231, "learning_rate": 3.303739645527144e-05, "loss": 0.0916, "step": 4351 }, { "epoch": 1.9799818016378525, "grad_norm": 0.41143873454739227, "learning_rate": 3.3030629046460795e-05, "loss": 0.0946, "step": 4352 }, { "epoch": 1.9804367606915378, "grad_norm": 0.4727417749565405, "learning_rate": 3.302386098142343e-05, "loss": 0.0838, "step": 4353 }, { "epoch": 1.980891719745223, "grad_norm": 0.622540198872782, "learning_rate": 3.3017092260712375e-05, "loss": 0.1001, "step": 4354 }, { "epoch": 1.981346678798908, "grad_norm": 0.507089303073889, "learning_rate": 3.301032288488074e-05, "loss": 0.1223, "step": 4355 }, { "epoch": 1.9818016378525933, "grad_norm": 0.6512098908161831, "learning_rate": 3.30035528544817e-05, "loss": 0.1094, "step": 4356 }, { "epoch": 1.9822565969062784, "grad_norm": 1.1484403391260964, "learning_rate": 3.299678217006846e-05, "loss": 0.1506, "step": 4357 }, { "epoch": 1.9827115559599635, "grad_norm": 0.43455477317378843, "learning_rate": 3.2990010832194294e-05, "loss": 0.0728, "step": 4358 }, { "epoch": 1.9831665150136488, "grad_norm": 0.5106336081791154, "learning_rate": 3.298323884141252e-05, "loss": 0.1229, "step": 4359 }, { "epoch": 1.983621474067334, "grad_norm": 0.6994494430974169, "learning_rate": 3.2976466198276534e-05, "loss": 0.1137, "step": 4360 }, { "epoch": 1.984076433121019, "grad_norm": 0.6243736372290695, "learning_rate": 3.296969290333974e-05, "loss": 0.0937, "step": 4361 }, { "epoch": 1.9845313921747043, "grad_norm": 0.7486541118290793, "learning_rate": 3.2962918957155645e-05, "loss": 0.1844, "step": 4362 }, { "epoch": 1.9849863512283894, "grad_norm": 0.7165840311278447, "learning_rate": 3.295614436027776e-05, "loss": 0.1044, "step": 4363 }, { "epoch": 1.9854413102820745, "grad_norm": 0.7392257603367701, "learning_rate": 3.29493691132597e-05, "loss": 0.1457, "step": 4364 }, { "epoch": 1.9858962693357598, "grad_norm": 0.6067676672705052, "learning_rate": 3.294259321665509e-05, "loss": 0.0821, "step": 4365 }, { "epoch": 1.9863512283894451, "grad_norm": 0.6133194816515701, "learning_rate": 3.2935816671017625e-05, "loss": 0.0972, "step": 4366 }, { "epoch": 1.98680618744313, "grad_norm": 0.5470658071916548, "learning_rate": 3.292903947690106e-05, "loss": 0.0986, "step": 4367 }, { "epoch": 1.9872611464968153, "grad_norm": 0.5608892704847239, "learning_rate": 3.2922261634859205e-05, "loss": 0.1088, "step": 4368 }, { "epoch": 1.9877161055505006, "grad_norm": 0.5401802873781202, "learning_rate": 3.291548314544589e-05, "loss": 0.1488, "step": 4369 }, { "epoch": 1.9881710646041855, "grad_norm": 0.4549931876881923, "learning_rate": 3.290870400921505e-05, "loss": 0.0831, "step": 4370 }, { "epoch": 1.9886260236578708, "grad_norm": 1.3910387357308147, "learning_rate": 3.2901924226720637e-05, "loss": 0.1395, "step": 4371 }, { "epoch": 1.989080982711556, "grad_norm": 0.4975391067968489, "learning_rate": 3.2895143798516656e-05, "loss": 0.0759, "step": 4372 }, { "epoch": 1.9895359417652412, "grad_norm": 0.5374320651248919, "learning_rate": 3.2888362725157176e-05, "loss": 0.0912, "step": 4373 }, { "epoch": 1.9899909008189263, "grad_norm": 0.4898718329527609, "learning_rate": 3.288158100719632e-05, "loss": 0.1075, "step": 4374 }, { "epoch": 1.9904458598726116, "grad_norm": 0.6204071145777278, "learning_rate": 3.287479864518826e-05, "loss": 0.1232, "step": 4375 }, { "epoch": 1.9909008189262967, "grad_norm": 0.537605845460887, "learning_rate": 3.286801563968721e-05, "loss": 0.1544, "step": 4376 }, { "epoch": 1.9913557779799818, "grad_norm": 0.6319865988827659, "learning_rate": 3.286123199124746e-05, "loss": 0.1224, "step": 4377 }, { "epoch": 1.991810737033667, "grad_norm": 0.7231755081117697, "learning_rate": 3.285444770042333e-05, "loss": 0.1285, "step": 4378 }, { "epoch": 1.9922656960873522, "grad_norm": 0.5828635064792237, "learning_rate": 3.284766276776921e-05, "loss": 0.1221, "step": 4379 }, { "epoch": 1.9927206551410372, "grad_norm": 0.5928923452433071, "learning_rate": 3.284087719383952e-05, "loss": 0.1179, "step": 4380 }, { "epoch": 1.9931756141947226, "grad_norm": 0.43275072966183803, "learning_rate": 3.283409097918875e-05, "loss": 0.082, "step": 4381 }, { "epoch": 1.9936305732484076, "grad_norm": 0.9070129793503217, "learning_rate": 3.282730412437146e-05, "loss": 0.1233, "step": 4382 }, { "epoch": 1.9940855323020927, "grad_norm": 0.7533225891024534, "learning_rate": 3.282051662994221e-05, "loss": 0.1507, "step": 4383 }, { "epoch": 1.994540491355778, "grad_norm": 0.7539293914610105, "learning_rate": 3.2813728496455664e-05, "loss": 0.0947, "step": 4384 }, { "epoch": 1.9949954504094631, "grad_norm": 0.4961373418449938, "learning_rate": 3.280693972446652e-05, "loss": 0.1056, "step": 4385 }, { "epoch": 1.9954504094631482, "grad_norm": 0.5248784113076403, "learning_rate": 3.2800150314529505e-05, "loss": 0.1201, "step": 4386 }, { "epoch": 1.9959053685168335, "grad_norm": 0.5744473044116138, "learning_rate": 3.279336026719944e-05, "loss": 0.1199, "step": 4387 }, { "epoch": 1.9963603275705186, "grad_norm": 1.0622713765696317, "learning_rate": 3.278656958303116e-05, "loss": 0.0955, "step": 4388 }, { "epoch": 1.9968152866242037, "grad_norm": 0.5360536125145177, "learning_rate": 3.2779778262579584e-05, "loss": 0.0785, "step": 4389 }, { "epoch": 1.997270245677889, "grad_norm": 0.5365757191830322, "learning_rate": 3.2772986306399655e-05, "loss": 0.1125, "step": 4390 }, { "epoch": 1.9977252047315741, "grad_norm": 0.7096289376908269, "learning_rate": 3.276619371504639e-05, "loss": 0.1531, "step": 4391 }, { "epoch": 1.9981801637852592, "grad_norm": 0.6051779581888237, "learning_rate": 3.2759400489074855e-05, "loss": 0.1107, "step": 4392 }, { "epoch": 1.9986351228389445, "grad_norm": 0.7622396293008836, "learning_rate": 3.275260662904015e-05, "loss": 0.1273, "step": 4393 }, { "epoch": 1.9990900818926298, "grad_norm": 0.7452334329637487, "learning_rate": 3.2745812135497435e-05, "loss": 0.1523, "step": 4394 }, { "epoch": 1.9995450409463147, "grad_norm": 1.1316079793638416, "learning_rate": 3.273901700900193e-05, "loss": 0.1532, "step": 4395 }, { "epoch": 2.0, "grad_norm": 0.5816181720431618, "learning_rate": 3.2732221250108916e-05, "loss": 0.081, "step": 4396 }, { "epoch": 2.0004549590536853, "grad_norm": 0.5173360878904363, "learning_rate": 3.272542485937369e-05, "loss": 0.0535, "step": 4397 }, { "epoch": 2.00090991810737, "grad_norm": 0.44147099045650007, "learning_rate": 3.2718627837351634e-05, "loss": 0.0536, "step": 4398 }, { "epoch": 2.0013648771610555, "grad_norm": 0.5463986313591161, "learning_rate": 3.271183018459817e-05, "loss": 0.0695, "step": 4399 }, { "epoch": 2.001819836214741, "grad_norm": 0.44943806762904703, "learning_rate": 3.270503190166877e-05, "loss": 0.063, "step": 4400 }, { "epoch": 2.0022747952684257, "grad_norm": 0.3918699399951742, "learning_rate": 3.2698232989118956e-05, "loss": 0.0618, "step": 4401 }, { "epoch": 2.002729754322111, "grad_norm": 0.36776063953890875, "learning_rate": 3.269143344750431e-05, "loss": 0.0659, "step": 4402 }, { "epoch": 2.0031847133757963, "grad_norm": 0.44229923823291, "learning_rate": 3.268463327738045e-05, "loss": 0.086, "step": 4403 }, { "epoch": 2.003639672429481, "grad_norm": 0.39188429675290803, "learning_rate": 3.2677832479303075e-05, "loss": 0.0462, "step": 4404 }, { "epoch": 2.0040946314831665, "grad_norm": 0.49153326502772127, "learning_rate": 3.267103105382789e-05, "loss": 0.0668, "step": 4405 }, { "epoch": 2.0045495905368518, "grad_norm": 0.35860977892897283, "learning_rate": 3.26642290015107e-05, "loss": 0.0616, "step": 4406 }, { "epoch": 2.0050045495905366, "grad_norm": 0.4701464576025628, "learning_rate": 3.265742632290732e-05, "loss": 0.0696, "step": 4407 }, { "epoch": 2.005459508644222, "grad_norm": 0.4810586221220186, "learning_rate": 3.265062301857364e-05, "loss": 0.044, "step": 4408 }, { "epoch": 2.0059144676979073, "grad_norm": 0.35166442209165344, "learning_rate": 3.264381908906561e-05, "loss": 0.0519, "step": 4409 }, { "epoch": 2.0063694267515926, "grad_norm": 0.4122964651693508, "learning_rate": 3.263701453493919e-05, "loss": 0.0349, "step": 4410 }, { "epoch": 2.0068243858052774, "grad_norm": 0.641946873174019, "learning_rate": 3.263020935675043e-05, "loss": 0.0725, "step": 4411 }, { "epoch": 2.0072793448589628, "grad_norm": 0.6306622669200281, "learning_rate": 3.2623403555055423e-05, "loss": 0.0717, "step": 4412 }, { "epoch": 2.007734303912648, "grad_norm": 0.5502151713720627, "learning_rate": 3.261659713041031e-05, "loss": 0.0423, "step": 4413 }, { "epoch": 2.008189262966333, "grad_norm": 0.6387198725148823, "learning_rate": 3.2609790083371264e-05, "loss": 0.0733, "step": 4414 }, { "epoch": 2.0086442220200182, "grad_norm": 0.5663926614210214, "learning_rate": 3.2602982414494546e-05, "loss": 0.084, "step": 4415 }, { "epoch": 2.0090991810737036, "grad_norm": 0.48008141058173587, "learning_rate": 3.259617412433644e-05, "loss": 0.0508, "step": 4416 }, { "epoch": 2.0095541401273884, "grad_norm": 0.9199921162568026, "learning_rate": 3.2589365213453286e-05, "loss": 0.067, "step": 4417 }, { "epoch": 2.0100090991810737, "grad_norm": 0.4436519454422566, "learning_rate": 3.258255568240148e-05, "loss": 0.0377, "step": 4418 }, { "epoch": 2.010464058234759, "grad_norm": 0.34490141127831647, "learning_rate": 3.257574553173747e-05, "loss": 0.0426, "step": 4419 }, { "epoch": 2.010919017288444, "grad_norm": 0.5153170056579165, "learning_rate": 3.256893476201775e-05, "loss": 0.0395, "step": 4420 }, { "epoch": 2.011373976342129, "grad_norm": 0.42293534595555626, "learning_rate": 3.256212337379886e-05, "loss": 0.0428, "step": 4421 }, { "epoch": 2.0118289353958145, "grad_norm": 0.4985378659914828, "learning_rate": 3.2555311367637385e-05, "loss": 0.0794, "step": 4422 }, { "epoch": 2.0122838944494994, "grad_norm": 0.5940474636861163, "learning_rate": 3.2548498744089994e-05, "loss": 0.055, "step": 4423 }, { "epoch": 2.0127388535031847, "grad_norm": 0.639114907507852, "learning_rate": 3.2541685503713376e-05, "loss": 0.0496, "step": 4424 }, { "epoch": 2.01319381255687, "grad_norm": 0.630275764122585, "learning_rate": 3.253487164706428e-05, "loss": 0.0343, "step": 4425 }, { "epoch": 2.013648771610555, "grad_norm": 0.4606648313218669, "learning_rate": 3.252805717469949e-05, "loss": 0.0498, "step": 4426 }, { "epoch": 2.01410373066424, "grad_norm": 0.46008721876776176, "learning_rate": 3.252124208717587e-05, "loss": 0.0531, "step": 4427 }, { "epoch": 2.0145586897179255, "grad_norm": 0.4226999163260016, "learning_rate": 3.2514426385050315e-05, "loss": 0.0358, "step": 4428 }, { "epoch": 2.0150136487716104, "grad_norm": 0.5512893666890264, "learning_rate": 3.250761006887976e-05, "loss": 0.0919, "step": 4429 }, { "epoch": 2.0154686078252957, "grad_norm": 0.565298758582782, "learning_rate": 3.250079313922122e-05, "loss": 0.0572, "step": 4430 }, { "epoch": 2.015923566878981, "grad_norm": 0.6135199324758212, "learning_rate": 3.249397559663174e-05, "loss": 0.0706, "step": 4431 }, { "epoch": 2.016378525932666, "grad_norm": 0.6421039541500345, "learning_rate": 3.2487157441668415e-05, "loss": 0.0554, "step": 4432 }, { "epoch": 2.016833484986351, "grad_norm": 0.5565452382060598, "learning_rate": 3.24803386748884e-05, "loss": 0.0547, "step": 4433 }, { "epoch": 2.0172884440400365, "grad_norm": 0.8302100507679544, "learning_rate": 3.247351929684889e-05, "loss": 0.0694, "step": 4434 }, { "epoch": 2.0177434030937214, "grad_norm": 0.63789808923938, "learning_rate": 3.2466699308107126e-05, "loss": 0.06, "step": 4435 }, { "epoch": 2.0181983621474067, "grad_norm": 0.4159147642053169, "learning_rate": 3.245987870922041e-05, "loss": 0.0355, "step": 4436 }, { "epoch": 2.018653321201092, "grad_norm": 0.776811580378117, "learning_rate": 3.245305750074611e-05, "loss": 0.0942, "step": 4437 }, { "epoch": 2.0191082802547773, "grad_norm": 0.420663106137314, "learning_rate": 3.244623568324161e-05, "loss": 0.0477, "step": 4438 }, { "epoch": 2.019563239308462, "grad_norm": 0.34168168093855983, "learning_rate": 3.243941325726433e-05, "loss": 0.0365, "step": 4439 }, { "epoch": 2.0200181983621475, "grad_norm": 0.4370173573735262, "learning_rate": 3.2432590223371815e-05, "loss": 0.0526, "step": 4440 }, { "epoch": 2.0204731574158328, "grad_norm": 0.5309589951927823, "learning_rate": 3.2425766582121595e-05, "loss": 0.086, "step": 4441 }, { "epoch": 2.0209281164695176, "grad_norm": 0.534170651189108, "learning_rate": 3.2418942334071255e-05, "loss": 0.051, "step": 4442 }, { "epoch": 2.021383075523203, "grad_norm": 0.574574516023661, "learning_rate": 3.2412117479778456e-05, "loss": 0.0488, "step": 4443 }, { "epoch": 2.0218380345768883, "grad_norm": 0.933269851743438, "learning_rate": 3.2405292019800884e-05, "loss": 0.0845, "step": 4444 }, { "epoch": 2.022292993630573, "grad_norm": 0.7432100071579194, "learning_rate": 3.23984659546963e-05, "loss": 0.0264, "step": 4445 }, { "epoch": 2.0227479526842584, "grad_norm": 0.4838146364490901, "learning_rate": 3.239163928502248e-05, "loss": 0.058, "step": 4446 }, { "epoch": 2.0232029117379438, "grad_norm": 0.4922393209462653, "learning_rate": 3.238481201133729e-05, "loss": 0.0823, "step": 4447 }, { "epoch": 2.0236578707916286, "grad_norm": 0.48297347566881876, "learning_rate": 3.237798413419859e-05, "loss": 0.0786, "step": 4448 }, { "epoch": 2.024112829845314, "grad_norm": 0.6962088618121739, "learning_rate": 3.237115565416436e-05, "loss": 0.0662, "step": 4449 }, { "epoch": 2.0245677888989992, "grad_norm": 0.5036681150746113, "learning_rate": 3.236432657179258e-05, "loss": 0.0729, "step": 4450 }, { "epoch": 2.025022747952684, "grad_norm": 0.494528094813184, "learning_rate": 3.2357496887641284e-05, "loss": 0.0477, "step": 4451 }, { "epoch": 2.0254777070063694, "grad_norm": 0.5666820424089722, "learning_rate": 3.2350666602268576e-05, "loss": 0.0468, "step": 4452 }, { "epoch": 2.0259326660600547, "grad_norm": 0.41567441380788706, "learning_rate": 3.234383571623257e-05, "loss": 0.0543, "step": 4453 }, { "epoch": 2.0263876251137396, "grad_norm": 0.607143999543226, "learning_rate": 3.2337004230091496e-05, "loss": 0.0431, "step": 4454 }, { "epoch": 2.026842584167425, "grad_norm": 0.4259620512848622, "learning_rate": 3.233017214440357e-05, "loss": 0.1013, "step": 4455 }, { "epoch": 2.02729754322111, "grad_norm": 0.41150190318512697, "learning_rate": 3.232333945972707e-05, "loss": 0.0436, "step": 4456 }, { "epoch": 2.027752502274795, "grad_norm": 0.37053785853436877, "learning_rate": 3.231650617662033e-05, "loss": 0.0325, "step": 4457 }, { "epoch": 2.0282074613284804, "grad_norm": 0.6800596321675817, "learning_rate": 3.2309672295641756e-05, "loss": 0.0699, "step": 4458 }, { "epoch": 2.0286624203821657, "grad_norm": 0.46228243757974835, "learning_rate": 3.230283781734978e-05, "loss": 0.037, "step": 4459 }, { "epoch": 2.0291173794358506, "grad_norm": 0.5440738688173047, "learning_rate": 3.229600274230287e-05, "loss": 0.0518, "step": 4460 }, { "epoch": 2.029572338489536, "grad_norm": 0.5665732100557446, "learning_rate": 3.2289167071059565e-05, "loss": 0.053, "step": 4461 }, { "epoch": 2.030027297543221, "grad_norm": 0.5070494233499602, "learning_rate": 3.228233080417844e-05, "loss": 0.0571, "step": 4462 }, { "epoch": 2.030482256596906, "grad_norm": 0.3615423571030499, "learning_rate": 3.227549394221814e-05, "loss": 0.0329, "step": 4463 }, { "epoch": 2.0309372156505914, "grad_norm": 0.47218297518233554, "learning_rate": 3.226865648573732e-05, "loss": 0.0282, "step": 4464 }, { "epoch": 2.0313921747042767, "grad_norm": 0.5657077831948504, "learning_rate": 3.226181843529472e-05, "loss": 0.0472, "step": 4465 }, { "epoch": 2.031847133757962, "grad_norm": 0.5070450613020155, "learning_rate": 3.225497979144911e-05, "loss": 0.047, "step": 4466 }, { "epoch": 2.032302092811647, "grad_norm": 0.34618523225139747, "learning_rate": 3.224814055475932e-05, "loss": 0.0385, "step": 4467 }, { "epoch": 2.032757051865332, "grad_norm": 0.4718571932767974, "learning_rate": 3.2241300725784205e-05, "loss": 0.0512, "step": 4468 }, { "epoch": 2.0332120109190175, "grad_norm": 0.7759182404862881, "learning_rate": 3.2234460305082716e-05, "loss": 0.054, "step": 4469 }, { "epoch": 2.0336669699727024, "grad_norm": 0.4127412636881424, "learning_rate": 3.2227619293213785e-05, "loss": 0.0232, "step": 4470 }, { "epoch": 2.0341219290263877, "grad_norm": 0.570616973079528, "learning_rate": 3.222077769073645e-05, "loss": 0.0516, "step": 4471 }, { "epoch": 2.034576888080073, "grad_norm": 0.46763843628847834, "learning_rate": 3.2213935498209766e-05, "loss": 0.0656, "step": 4472 }, { "epoch": 2.035031847133758, "grad_norm": 0.6196945665121599, "learning_rate": 3.220709271619286e-05, "loss": 0.0349, "step": 4473 }, { "epoch": 2.035486806187443, "grad_norm": 0.6243997861783781, "learning_rate": 3.2200249345244874e-05, "loss": 0.0541, "step": 4474 }, { "epoch": 2.0359417652411285, "grad_norm": 0.8124029083116407, "learning_rate": 3.219340538592503e-05, "loss": 0.0944, "step": 4475 }, { "epoch": 2.0363967242948133, "grad_norm": 0.5312671249916076, "learning_rate": 3.2186560838792596e-05, "loss": 0.0185, "step": 4476 }, { "epoch": 2.0368516833484986, "grad_norm": 0.39696403413616765, "learning_rate": 3.217971570440686e-05, "loss": 0.0411, "step": 4477 }, { "epoch": 2.037306642402184, "grad_norm": 0.9544032305882774, "learning_rate": 3.217286998332716e-05, "loss": 0.0618, "step": 4478 }, { "epoch": 2.037761601455869, "grad_norm": 0.5686112973158385, "learning_rate": 3.216602367611294e-05, "loss": 0.0511, "step": 4479 }, { "epoch": 2.038216560509554, "grad_norm": 0.5277645366404805, "learning_rate": 3.215917678332362e-05, "loss": 0.0449, "step": 4480 }, { "epoch": 2.0386715195632394, "grad_norm": 0.5250340866200898, "learning_rate": 3.21523293055187e-05, "loss": 0.0443, "step": 4481 }, { "epoch": 2.0391264786169243, "grad_norm": 0.5172388040700197, "learning_rate": 3.2145481243257726e-05, "loss": 0.0414, "step": 4482 }, { "epoch": 2.0395814376706096, "grad_norm": 0.3530960323137407, "learning_rate": 3.21386325971003e-05, "loss": 0.0308, "step": 4483 }, { "epoch": 2.040036396724295, "grad_norm": 0.5498245920707481, "learning_rate": 3.2131783367606055e-05, "loss": 0.0689, "step": 4484 }, { "epoch": 2.04049135577798, "grad_norm": 0.5129303408025444, "learning_rate": 3.212493355533468e-05, "loss": 0.0651, "step": 4485 }, { "epoch": 2.040946314831665, "grad_norm": 0.6442608593167594, "learning_rate": 3.211808316084591e-05, "loss": 0.057, "step": 4486 }, { "epoch": 2.0414012738853504, "grad_norm": 0.4441171006762999, "learning_rate": 3.211123218469953e-05, "loss": 0.0428, "step": 4487 }, { "epoch": 2.0418562329390353, "grad_norm": 0.6563088793979018, "learning_rate": 3.2104380627455366e-05, "loss": 0.0708, "step": 4488 }, { "epoch": 2.0423111919927206, "grad_norm": 0.6995826758191043, "learning_rate": 3.20975284896733e-05, "loss": 0.0513, "step": 4489 }, { "epoch": 2.042766151046406, "grad_norm": 0.5111612992225971, "learning_rate": 3.209067577191327e-05, "loss": 0.0773, "step": 4490 }, { "epoch": 2.0432211101000908, "grad_norm": 0.5049144259998231, "learning_rate": 3.2083822474735234e-05, "loss": 0.0535, "step": 4491 }, { "epoch": 2.043676069153776, "grad_norm": 0.5310779418720882, "learning_rate": 3.2076968598699195e-05, "loss": 0.074, "step": 4492 }, { "epoch": 2.0441310282074614, "grad_norm": 0.4993770911374598, "learning_rate": 3.207011414436527e-05, "loss": 0.0383, "step": 4493 }, { "epoch": 2.0445859872611467, "grad_norm": 0.5981741157046078, "learning_rate": 3.2063259112293526e-05, "loss": 0.044, "step": 4494 }, { "epoch": 2.0450409463148316, "grad_norm": 0.41807418818668124, "learning_rate": 3.2056403503044155e-05, "loss": 0.0366, "step": 4495 }, { "epoch": 2.045495905368517, "grad_norm": 0.3397303961880902, "learning_rate": 3.2049547317177356e-05, "loss": 0.0403, "step": 4496 }, { "epoch": 2.045950864422202, "grad_norm": 0.43062938170267007, "learning_rate": 3.204269055525338e-05, "loss": 0.079, "step": 4497 }, { "epoch": 2.046405823475887, "grad_norm": 1.0336259991412753, "learning_rate": 3.203583321783253e-05, "loss": 0.0816, "step": 4498 }, { "epoch": 2.0468607825295724, "grad_norm": 0.6970536455580889, "learning_rate": 3.2028975305475174e-05, "loss": 0.0291, "step": 4499 }, { "epoch": 2.0473157415832577, "grad_norm": 0.4138574741655441, "learning_rate": 3.20221168187417e-05, "loss": 0.0561, "step": 4500 }, { "epoch": 2.0477707006369426, "grad_norm": 0.506175958899418, "learning_rate": 3.201525775819254e-05, "loss": 0.0566, "step": 4501 }, { "epoch": 2.048225659690628, "grad_norm": 0.5126767050945494, "learning_rate": 3.200839812438821e-05, "loss": 0.0494, "step": 4502 }, { "epoch": 2.048680618744313, "grad_norm": 0.5589076571010784, "learning_rate": 3.200153791788922e-05, "loss": 0.1073, "step": 4503 }, { "epoch": 2.049135577797998, "grad_norm": 0.6015001708436157, "learning_rate": 3.199467713925618e-05, "loss": 0.0688, "step": 4504 }, { "epoch": 2.0495905368516834, "grad_norm": 0.6061644982385069, "learning_rate": 3.198781578904972e-05, "loss": 0.0506, "step": 4505 }, { "epoch": 2.0500454959053687, "grad_norm": 0.5289232723175666, "learning_rate": 3.198095386783049e-05, "loss": 0.0321, "step": 4506 }, { "epoch": 2.0505004549590535, "grad_norm": 0.3227426060643412, "learning_rate": 3.1974091376159246e-05, "loss": 0.0423, "step": 4507 }, { "epoch": 2.050955414012739, "grad_norm": 0.6157254409779627, "learning_rate": 3.196722831459676e-05, "loss": 0.0423, "step": 4508 }, { "epoch": 2.051410373066424, "grad_norm": 0.572372672569538, "learning_rate": 3.196036468370382e-05, "loss": 0.0905, "step": 4509 }, { "epoch": 2.051865332120109, "grad_norm": 0.5457551221376152, "learning_rate": 3.1953500484041326e-05, "loss": 0.0514, "step": 4510 }, { "epoch": 2.0523202911737943, "grad_norm": 0.39299164514198015, "learning_rate": 3.1946635716170164e-05, "loss": 0.0417, "step": 4511 }, { "epoch": 2.0527752502274796, "grad_norm": 0.6671217417181438, "learning_rate": 3.1939770380651314e-05, "loss": 0.1063, "step": 4512 }, { "epoch": 2.0532302092811645, "grad_norm": 0.488853050299273, "learning_rate": 3.1932904478045756e-05, "loss": 0.0565, "step": 4513 }, { "epoch": 2.05368516833485, "grad_norm": 0.5176571470170644, "learning_rate": 3.1926038008914564e-05, "loss": 0.0452, "step": 4514 }, { "epoch": 2.054140127388535, "grad_norm": 0.45813605931723284, "learning_rate": 3.191917097381881e-05, "loss": 0.0328, "step": 4515 }, { "epoch": 2.05459508644222, "grad_norm": 1.1205393261001215, "learning_rate": 3.191230337331966e-05, "loss": 0.1674, "step": 4516 }, { "epoch": 2.0550500454959053, "grad_norm": 0.4139525291977058, "learning_rate": 3.190543520797829e-05, "loss": 0.0321, "step": 4517 }, { "epoch": 2.0555050045495906, "grad_norm": 0.5334943317720415, "learning_rate": 3.189856647835594e-05, "loss": 0.0423, "step": 4518 }, { "epoch": 2.055959963603276, "grad_norm": 0.5177632150245287, "learning_rate": 3.189169718501389e-05, "loss": 0.0781, "step": 4519 }, { "epoch": 2.056414922656961, "grad_norm": 0.6993629776510301, "learning_rate": 3.188482732851348e-05, "loss": 0.0689, "step": 4520 }, { "epoch": 2.056869881710646, "grad_norm": 0.9559820029931113, "learning_rate": 3.1877956909416065e-05, "loss": 0.0518, "step": 4521 }, { "epoch": 2.0573248407643314, "grad_norm": 0.43461881327054547, "learning_rate": 3.1871085928283064e-05, "loss": 0.0478, "step": 4522 }, { "epoch": 2.0577797998180163, "grad_norm": 0.5189702108256509, "learning_rate": 3.186421438567596e-05, "loss": 0.069, "step": 4523 }, { "epoch": 2.0582347588717016, "grad_norm": 0.5560706038754165, "learning_rate": 3.185734228215625e-05, "loss": 0.0615, "step": 4524 }, { "epoch": 2.058689717925387, "grad_norm": 0.5677323055297938, "learning_rate": 3.18504696182855e-05, "loss": 0.0483, "step": 4525 }, { "epoch": 2.0591446769790718, "grad_norm": 0.4423928209349849, "learning_rate": 3.18435963946253e-05, "loss": 0.0646, "step": 4526 }, { "epoch": 2.059599636032757, "grad_norm": 0.4531008200980407, "learning_rate": 3.1836722611737325e-05, "loss": 0.0336, "step": 4527 }, { "epoch": 2.0600545950864424, "grad_norm": 0.525733069655487, "learning_rate": 3.182984827018324e-05, "loss": 0.079, "step": 4528 }, { "epoch": 2.0605095541401273, "grad_norm": 0.3751669905894671, "learning_rate": 3.18229733705248e-05, "loss": 0.0456, "step": 4529 }, { "epoch": 2.0609645131938126, "grad_norm": 0.4213592336662888, "learning_rate": 3.181609791332379e-05, "loss": 0.0371, "step": 4530 }, { "epoch": 2.061419472247498, "grad_norm": 0.46583677655891953, "learning_rate": 3.180922189914204e-05, "loss": 0.0458, "step": 4531 }, { "epoch": 2.0618744313011828, "grad_norm": 0.4560038034852146, "learning_rate": 3.180234532854143e-05, "loss": 0.0217, "step": 4532 }, { "epoch": 2.062329390354868, "grad_norm": 0.5074598180049745, "learning_rate": 3.1795468202083863e-05, "loss": 0.0468, "step": 4533 }, { "epoch": 2.0627843494085534, "grad_norm": 2.0285167646913984, "learning_rate": 3.178859052033133e-05, "loss": 0.0793, "step": 4534 }, { "epoch": 2.0632393084622382, "grad_norm": 0.7518502449836174, "learning_rate": 3.1781712283845844e-05, "loss": 0.0697, "step": 4535 }, { "epoch": 2.0636942675159236, "grad_norm": 0.5751675635102664, "learning_rate": 3.177483349318946e-05, "loss": 0.0613, "step": 4536 }, { "epoch": 2.064149226569609, "grad_norm": 0.44349402785807934, "learning_rate": 3.176795414892427e-05, "loss": 0.032, "step": 4537 }, { "epoch": 2.0646041856232937, "grad_norm": 0.7173013077625106, "learning_rate": 3.176107425161243e-05, "loss": 0.0734, "step": 4538 }, { "epoch": 2.065059144676979, "grad_norm": 0.5349615599224455, "learning_rate": 3.1754193801816136e-05, "loss": 0.05, "step": 4539 }, { "epoch": 2.0655141037306644, "grad_norm": 0.3102618005347954, "learning_rate": 3.174731280009762e-05, "loss": 0.0138, "step": 4540 }, { "epoch": 2.065969062784349, "grad_norm": 0.6442746719926109, "learning_rate": 3.174043124701918e-05, "loss": 0.1164, "step": 4541 }, { "epoch": 2.0664240218380345, "grad_norm": 0.6069624825868528, "learning_rate": 3.173354914314314e-05, "loss": 0.0378, "step": 4542 }, { "epoch": 2.06687898089172, "grad_norm": 0.41949426777553045, "learning_rate": 3.172666648903187e-05, "loss": 0.0512, "step": 4543 }, { "epoch": 2.0673339399454047, "grad_norm": 0.4870716931666346, "learning_rate": 3.171978328524779e-05, "loss": 0.0381, "step": 4544 }, { "epoch": 2.06778889899909, "grad_norm": 0.4937282059714588, "learning_rate": 3.171289953235336e-05, "loss": 0.052, "step": 4545 }, { "epoch": 2.0682438580527753, "grad_norm": 0.5325023669552021, "learning_rate": 3.1706015230911114e-05, "loss": 0.064, "step": 4546 }, { "epoch": 2.06869881710646, "grad_norm": 0.9813273617298607, "learning_rate": 3.169913038148357e-05, "loss": 0.1225, "step": 4547 }, { "epoch": 2.0691537761601455, "grad_norm": 0.48779282191695567, "learning_rate": 3.169224498463335e-05, "loss": 0.0548, "step": 4548 }, { "epoch": 2.069608735213831, "grad_norm": 0.5569588881653458, "learning_rate": 3.16853590409231e-05, "loss": 0.1266, "step": 4549 }, { "epoch": 2.070063694267516, "grad_norm": 0.35659917809280817, "learning_rate": 3.167847255091549e-05, "loss": 0.0265, "step": 4550 }, { "epoch": 2.070518653321201, "grad_norm": 0.4663280709239099, "learning_rate": 3.167158551517326e-05, "loss": 0.0409, "step": 4551 }, { "epoch": 2.0709736123748863, "grad_norm": 0.47041807218368037, "learning_rate": 3.166469793425919e-05, "loss": 0.0615, "step": 4552 }, { "epoch": 2.0714285714285716, "grad_norm": 6.218478863432306, "learning_rate": 3.165780980873612e-05, "loss": 0.0916, "step": 4553 }, { "epoch": 2.0718835304822565, "grad_norm": 0.505444143238489, "learning_rate": 3.165092113916688e-05, "loss": 0.0741, "step": 4554 }, { "epoch": 2.072338489535942, "grad_norm": 0.49612582849702014, "learning_rate": 3.1644031926114405e-05, "loss": 0.0506, "step": 4555 }, { "epoch": 2.072793448589627, "grad_norm": 0.3996989264533471, "learning_rate": 3.1637142170141655e-05, "loss": 0.0453, "step": 4556 }, { "epoch": 2.073248407643312, "grad_norm": 0.6002014735976767, "learning_rate": 3.163025187181161e-05, "loss": 0.0637, "step": 4557 }, { "epoch": 2.0737033666969973, "grad_norm": 0.6348770211960799, "learning_rate": 3.162336103168732e-05, "loss": 0.0458, "step": 4558 }, { "epoch": 2.0741583257506826, "grad_norm": 0.536069637839747, "learning_rate": 3.161646965033188e-05, "loss": 0.0363, "step": 4559 }, { "epoch": 2.0746132848043675, "grad_norm": 0.5968850838892891, "learning_rate": 3.1609577728308425e-05, "loss": 0.0739, "step": 4560 }, { "epoch": 2.0750682438580528, "grad_norm": 0.4531946509201975, "learning_rate": 3.160268526618012e-05, "loss": 0.0353, "step": 4561 }, { "epoch": 2.075523202911738, "grad_norm": 0.9144874181524465, "learning_rate": 3.15957922645102e-05, "loss": 0.0699, "step": 4562 }, { "epoch": 2.075978161965423, "grad_norm": 0.44620815522108037, "learning_rate": 3.158889872386192e-05, "loss": 0.0561, "step": 4563 }, { "epoch": 2.0764331210191083, "grad_norm": 0.45020198933540173, "learning_rate": 3.158200464479859e-05, "loss": 0.0563, "step": 4564 }, { "epoch": 2.0768880800727936, "grad_norm": 0.4212377009131794, "learning_rate": 3.157511002788356e-05, "loss": 0.0305, "step": 4565 }, { "epoch": 2.0773430391264784, "grad_norm": 0.5313427652360558, "learning_rate": 3.156821487368024e-05, "loss": 0.0519, "step": 4566 }, { "epoch": 2.0777979981801638, "grad_norm": 0.37499573005318665, "learning_rate": 3.1561319182752065e-05, "loss": 0.0448, "step": 4567 }, { "epoch": 2.078252957233849, "grad_norm": 0.6239271839915356, "learning_rate": 3.155442295566251e-05, "loss": 0.0876, "step": 4568 }, { "epoch": 2.078707916287534, "grad_norm": 0.5971710917649616, "learning_rate": 3.1547526192975105e-05, "loss": 0.0737, "step": 4569 }, { "epoch": 2.0791628753412192, "grad_norm": 0.6186102296192217, "learning_rate": 3.154062889525344e-05, "loss": 0.0659, "step": 4570 }, { "epoch": 2.0796178343949046, "grad_norm": 0.4581026223030611, "learning_rate": 3.153373106306111e-05, "loss": 0.0293, "step": 4571 }, { "epoch": 2.0800727934485894, "grad_norm": 0.38051889281597706, "learning_rate": 3.152683269696179e-05, "loss": 0.049, "step": 4572 }, { "epoch": 2.0805277525022747, "grad_norm": 0.5270498845255345, "learning_rate": 3.1519933797519175e-05, "loss": 0.0444, "step": 4573 }, { "epoch": 2.08098271155596, "grad_norm": 0.3530660723528281, "learning_rate": 3.1513034365297013e-05, "loss": 0.0325, "step": 4574 }, { "epoch": 2.0814376706096454, "grad_norm": 0.7737405242743535, "learning_rate": 3.150613440085909e-05, "loss": 0.0572, "step": 4575 }, { "epoch": 2.08189262966333, "grad_norm": 0.6474300744493358, "learning_rate": 3.149923390476925e-05, "loss": 0.0359, "step": 4576 }, { "epoch": 2.0823475887170155, "grad_norm": 0.5235540960137254, "learning_rate": 3.1492332877591366e-05, "loss": 0.0609, "step": 4577 }, { "epoch": 2.082802547770701, "grad_norm": 0.5039532989907544, "learning_rate": 3.148543131988936e-05, "loss": 0.0535, "step": 4578 }, { "epoch": 2.0832575068243857, "grad_norm": 0.47210447758894886, "learning_rate": 3.1478529232227194e-05, "loss": 0.05, "step": 4579 }, { "epoch": 2.083712465878071, "grad_norm": 0.48141435051551373, "learning_rate": 3.1471626615168876e-05, "loss": 0.0282, "step": 4580 }, { "epoch": 2.0841674249317563, "grad_norm": 0.5228056694021335, "learning_rate": 3.146472346927845e-05, "loss": 0.0543, "step": 4581 }, { "epoch": 2.084622383985441, "grad_norm": 0.4050967072645985, "learning_rate": 3.145781979512003e-05, "loss": 0.0644, "step": 4582 }, { "epoch": 2.0850773430391265, "grad_norm": 0.3945110489628106, "learning_rate": 3.145091559325773e-05, "loss": 0.0346, "step": 4583 }, { "epoch": 2.085532302092812, "grad_norm": 0.4395275547458577, "learning_rate": 3.1444010864255736e-05, "loss": 0.0348, "step": 4584 }, { "epoch": 2.0859872611464967, "grad_norm": 0.5821659390303388, "learning_rate": 3.143710560867829e-05, "loss": 0.0535, "step": 4585 }, { "epoch": 2.086442220200182, "grad_norm": 0.541556078722379, "learning_rate": 3.1430199827089624e-05, "loss": 0.0373, "step": 4586 }, { "epoch": 2.0868971792538673, "grad_norm": 0.4776968880139396, "learning_rate": 3.1423293520054074e-05, "loss": 0.0319, "step": 4587 }, { "epoch": 2.087352138307552, "grad_norm": 0.729000697875313, "learning_rate": 3.141638668813599e-05, "loss": 0.0497, "step": 4588 }, { "epoch": 2.0878070973612375, "grad_norm": 0.4290761884694932, "learning_rate": 3.140947933189976e-05, "loss": 0.0489, "step": 4589 }, { "epoch": 2.088262056414923, "grad_norm": 0.4914737265484664, "learning_rate": 3.140257145190982e-05, "loss": 0.0391, "step": 4590 }, { "epoch": 2.0887170154686077, "grad_norm": 0.4837026913656671, "learning_rate": 3.139566304873066e-05, "loss": 0.0378, "step": 4591 }, { "epoch": 2.089171974522293, "grad_norm": 0.6413612931553208, "learning_rate": 3.13887541229268e-05, "loss": 0.0692, "step": 4592 }, { "epoch": 2.0896269335759783, "grad_norm": 0.5659698248917533, "learning_rate": 3.1381844675062794e-05, "loss": 0.0592, "step": 4593 }, { "epoch": 2.090081892629663, "grad_norm": 0.42512804065452353, "learning_rate": 3.137493470570327e-05, "loss": 0.0438, "step": 4594 }, { "epoch": 2.0905368516833485, "grad_norm": 0.6338268847009871, "learning_rate": 3.1368024215412866e-05, "loss": 0.0838, "step": 4595 }, { "epoch": 2.0909918107370338, "grad_norm": 0.4396140490843282, "learning_rate": 3.136111320475628e-05, "loss": 0.0488, "step": 4596 }, { "epoch": 2.0914467697907186, "grad_norm": 0.39555579871318997, "learning_rate": 3.135420167429826e-05, "loss": 0.0337, "step": 4597 }, { "epoch": 2.091901728844404, "grad_norm": 0.3742261404104616, "learning_rate": 3.1347289624603566e-05, "loss": 0.0286, "step": 4598 }, { "epoch": 2.0923566878980893, "grad_norm": 0.43315282247069725, "learning_rate": 3.134037705623703e-05, "loss": 0.0404, "step": 4599 }, { "epoch": 2.092811646951774, "grad_norm": 0.521072667704643, "learning_rate": 3.133346396976351e-05, "loss": 0.0436, "step": 4600 }, { "epoch": 2.0932666060054594, "grad_norm": 0.5587141245775109, "learning_rate": 3.132655036574792e-05, "loss": 0.1085, "step": 4601 }, { "epoch": 2.0937215650591448, "grad_norm": 0.6583146906679008, "learning_rate": 3.131963624475521e-05, "loss": 0.0518, "step": 4602 }, { "epoch": 2.0941765241128296, "grad_norm": 0.3722646506158047, "learning_rate": 3.1312721607350345e-05, "loss": 0.0417, "step": 4603 }, { "epoch": 2.094631483166515, "grad_norm": 0.7244991323192166, "learning_rate": 3.13058064540984e-05, "loss": 0.0635, "step": 4604 }, { "epoch": 2.0950864422202002, "grad_norm": 0.7437254620455781, "learning_rate": 3.129889078556442e-05, "loss": 0.0766, "step": 4605 }, { "epoch": 2.0955414012738856, "grad_norm": 0.46590805651073264, "learning_rate": 3.1291974602313535e-05, "loss": 0.0403, "step": 4606 }, { "epoch": 2.0959963603275704, "grad_norm": 0.5055923904688392, "learning_rate": 3.12850579049109e-05, "loss": 0.0658, "step": 4607 }, { "epoch": 2.0964513193812557, "grad_norm": 0.5290828514082235, "learning_rate": 3.1278140693921704e-05, "loss": 0.0509, "step": 4608 }, { "epoch": 2.096906278434941, "grad_norm": 0.559999844983136, "learning_rate": 3.127122296991122e-05, "loss": 0.0608, "step": 4609 }, { "epoch": 2.097361237488626, "grad_norm": 0.6806102354168493, "learning_rate": 3.126430473344469e-05, "loss": 0.0544, "step": 4610 }, { "epoch": 2.097816196542311, "grad_norm": 0.5515916186629524, "learning_rate": 3.125738598508749e-05, "loss": 0.0603, "step": 4611 }, { "epoch": 2.0982711555959965, "grad_norm": 0.6017470736069401, "learning_rate": 3.125046672540496e-05, "loss": 0.1009, "step": 4612 }, { "epoch": 2.0987261146496814, "grad_norm": 0.5600717424164426, "learning_rate": 3.1243546954962515e-05, "loss": 0.0453, "step": 4613 }, { "epoch": 2.0991810737033667, "grad_norm": 0.3844780567808523, "learning_rate": 3.12366266743256e-05, "loss": 0.0344, "step": 4614 }, { "epoch": 2.099636032757052, "grad_norm": 0.5778836017078547, "learning_rate": 3.122970588405973e-05, "loss": 0.0697, "step": 4615 }, { "epoch": 2.100090991810737, "grad_norm": 0.4724869649384583, "learning_rate": 3.122278458473042e-05, "loss": 0.0524, "step": 4616 }, { "epoch": 2.100545950864422, "grad_norm": 0.4380275394428644, "learning_rate": 3.121586277690326e-05, "loss": 0.047, "step": 4617 }, { "epoch": 2.1010009099181075, "grad_norm": 0.48553274286999415, "learning_rate": 3.120894046114387e-05, "loss": 0.0525, "step": 4618 }, { "epoch": 2.1014558689717924, "grad_norm": 1.3295587507392075, "learning_rate": 3.1202017638017895e-05, "loss": 0.0843, "step": 4619 }, { "epoch": 2.1019108280254777, "grad_norm": 0.45055594331255916, "learning_rate": 3.119509430809105e-05, "loss": 0.0547, "step": 4620 }, { "epoch": 2.102365787079163, "grad_norm": 0.40740964174952765, "learning_rate": 3.118817047192907e-05, "loss": 0.0556, "step": 4621 }, { "epoch": 2.102820746132848, "grad_norm": 0.4849994467884097, "learning_rate": 3.118124613009775e-05, "loss": 0.0711, "step": 4622 }, { "epoch": 2.103275705186533, "grad_norm": 0.4597919812634306, "learning_rate": 3.117432128316291e-05, "loss": 0.0698, "step": 4623 }, { "epoch": 2.1037306642402185, "grad_norm": 0.5104444954512691, "learning_rate": 3.1167395931690416e-05, "loss": 0.09, "step": 4624 }, { "epoch": 2.1041856232939034, "grad_norm": 0.5641854953085871, "learning_rate": 3.116047007624618e-05, "loss": 0.0361, "step": 4625 }, { "epoch": 2.1046405823475887, "grad_norm": 0.49711817328811253, "learning_rate": 3.115354371739616e-05, "loss": 0.0618, "step": 4626 }, { "epoch": 2.105095541401274, "grad_norm": 0.63401303227943, "learning_rate": 3.1146616855706315e-05, "loss": 0.0539, "step": 4627 }, { "epoch": 2.105550500454959, "grad_norm": 0.6523100430212241, "learning_rate": 3.113968949174271e-05, "loss": 0.0414, "step": 4628 }, { "epoch": 2.106005459508644, "grad_norm": 0.5615839725660621, "learning_rate": 3.113276162607141e-05, "loss": 0.0959, "step": 4629 }, { "epoch": 2.1064604185623295, "grad_norm": 0.3114063692835452, "learning_rate": 3.112583325925852e-05, "loss": 0.028, "step": 4630 }, { "epoch": 2.1069153776160148, "grad_norm": 0.5456793750574318, "learning_rate": 3.111890439187019e-05, "loss": 0.0561, "step": 4631 }, { "epoch": 2.1073703366696996, "grad_norm": 0.9730892022527583, "learning_rate": 3.1111975024472647e-05, "loss": 0.0784, "step": 4632 }, { "epoch": 2.107825295723385, "grad_norm": 0.4744020304109642, "learning_rate": 3.11050451576321e-05, "loss": 0.0553, "step": 4633 }, { "epoch": 2.1082802547770703, "grad_norm": 0.5512803431866962, "learning_rate": 3.1098114791914824e-05, "loss": 0.0645, "step": 4634 }, { "epoch": 2.108735213830755, "grad_norm": 0.4889824171000353, "learning_rate": 3.109118392788715e-05, "loss": 0.0371, "step": 4635 }, { "epoch": 2.1091901728844404, "grad_norm": 0.5087731295118931, "learning_rate": 3.108425256611544e-05, "loss": 0.0522, "step": 4636 }, { "epoch": 2.1096451319381258, "grad_norm": 0.4860140832702659, "learning_rate": 3.107732070716608e-05, "loss": 0.0271, "step": 4637 }, { "epoch": 2.1101000909918106, "grad_norm": 0.43148088636069426, "learning_rate": 3.107038835160551e-05, "loss": 0.0221, "step": 4638 }, { "epoch": 2.110555050045496, "grad_norm": 0.5274148925928356, "learning_rate": 3.106345550000023e-05, "loss": 0.0536, "step": 4639 }, { "epoch": 2.1110100090991812, "grad_norm": 0.41901398796204065, "learning_rate": 3.1056522152916745e-05, "loss": 0.0384, "step": 4640 }, { "epoch": 2.111464968152866, "grad_norm": 0.5572496968434298, "learning_rate": 3.104958831092162e-05, "loss": 0.0606, "step": 4641 }, { "epoch": 2.1119199272065514, "grad_norm": 0.3190295384286397, "learning_rate": 3.104265397458146e-05, "loss": 0.0348, "step": 4642 }, { "epoch": 2.1123748862602367, "grad_norm": 0.32137384841172884, "learning_rate": 3.1035719144462896e-05, "loss": 0.0215, "step": 4643 }, { "epoch": 2.1128298453139216, "grad_norm": 0.7407012927521023, "learning_rate": 3.102878382113263e-05, "loss": 0.0487, "step": 4644 }, { "epoch": 2.113284804367607, "grad_norm": 0.469725072276876, "learning_rate": 3.102184800515737e-05, "loss": 0.0474, "step": 4645 }, { "epoch": 2.113739763421292, "grad_norm": 0.6240803446059225, "learning_rate": 3.101491169710389e-05, "loss": 0.0612, "step": 4646 }, { "epoch": 2.114194722474977, "grad_norm": 0.5108556714729628, "learning_rate": 3.100797489753899e-05, "loss": 0.0618, "step": 4647 }, { "epoch": 2.1146496815286624, "grad_norm": 0.7195716414560026, "learning_rate": 3.100103760702951e-05, "loss": 0.0403, "step": 4648 }, { "epoch": 2.1151046405823477, "grad_norm": 0.527490376980494, "learning_rate": 3.0994099826142336e-05, "loss": 0.0419, "step": 4649 }, { "epoch": 2.1155595996360326, "grad_norm": 0.5689275321872995, "learning_rate": 3.09871615554444e-05, "loss": 0.0489, "step": 4650 }, { "epoch": 2.116014558689718, "grad_norm": 0.49179223463746874, "learning_rate": 3.098022279550265e-05, "loss": 0.0665, "step": 4651 }, { "epoch": 2.116469517743403, "grad_norm": 0.8803373883263261, "learning_rate": 3.097328354688411e-05, "loss": 0.0459, "step": 4652 }, { "epoch": 2.116924476797088, "grad_norm": 0.6206002428962073, "learning_rate": 3.096634381015581e-05, "loss": 0.0547, "step": 4653 }, { "epoch": 2.1173794358507734, "grad_norm": 0.5863859360210761, "learning_rate": 3.095940358588483e-05, "loss": 0.0773, "step": 4654 }, { "epoch": 2.1178343949044587, "grad_norm": 0.4178818811873014, "learning_rate": 3.0952462874638315e-05, "loss": 0.0449, "step": 4655 }, { "epoch": 2.1182893539581436, "grad_norm": 0.5396331665759083, "learning_rate": 3.09455216769834e-05, "loss": 0.0494, "step": 4656 }, { "epoch": 2.118744313011829, "grad_norm": 0.5101990837152651, "learning_rate": 3.0938579993487314e-05, "loss": 0.0489, "step": 4657 }, { "epoch": 2.119199272065514, "grad_norm": 0.5613997158412849, "learning_rate": 3.093163782471728e-05, "loss": 0.081, "step": 4658 }, { "epoch": 2.1196542311191995, "grad_norm": 0.5833163260560749, "learning_rate": 3.0924695171240604e-05, "loss": 0.0505, "step": 4659 }, { "epoch": 2.1201091901728844, "grad_norm": 0.405036006185702, "learning_rate": 3.0917752033624585e-05, "loss": 0.0371, "step": 4660 }, { "epoch": 2.1205641492265697, "grad_norm": 0.4844740721764284, "learning_rate": 3.0910808412436596e-05, "loss": 0.0375, "step": 4661 }, { "epoch": 2.121019108280255, "grad_norm": 0.557896989180666, "learning_rate": 3.090386430824404e-05, "loss": 0.0646, "step": 4662 }, { "epoch": 2.12147406733394, "grad_norm": 0.4601186811481348, "learning_rate": 3.089691972161434e-05, "loss": 0.0485, "step": 4663 }, { "epoch": 2.121929026387625, "grad_norm": 0.430811493183555, "learning_rate": 3.0889974653115006e-05, "loss": 0.0851, "step": 4664 }, { "epoch": 2.1223839854413105, "grad_norm": 0.38945638252901155, "learning_rate": 3.088302910331354e-05, "loss": 0.0471, "step": 4665 }, { "epoch": 2.1228389444949953, "grad_norm": 0.45519761472648873, "learning_rate": 3.087608307277749e-05, "loss": 0.0284, "step": 4666 }, { "epoch": 2.1232939035486806, "grad_norm": 0.39193249468435803, "learning_rate": 3.0869136562074486e-05, "loss": 0.0495, "step": 4667 }, { "epoch": 2.123748862602366, "grad_norm": 0.47740134020726677, "learning_rate": 3.0862189571772136e-05, "loss": 0.0605, "step": 4668 }, { "epoch": 2.124203821656051, "grad_norm": 0.48679428530127883, "learning_rate": 3.0855242102438136e-05, "loss": 0.047, "step": 4669 }, { "epoch": 2.124658780709736, "grad_norm": 0.4277625621966664, "learning_rate": 3.084829415464019e-05, "loss": 0.0316, "step": 4670 }, { "epoch": 2.1251137397634214, "grad_norm": 0.7595323677673687, "learning_rate": 3.084134572894605e-05, "loss": 0.0679, "step": 4671 }, { "epoch": 2.1255686988171063, "grad_norm": 0.576486682880547, "learning_rate": 3.0834396825923524e-05, "loss": 0.066, "step": 4672 }, { "epoch": 2.1260236578707916, "grad_norm": 0.4697096119939214, "learning_rate": 3.082744744614043e-05, "loss": 0.0754, "step": 4673 }, { "epoch": 2.126478616924477, "grad_norm": 0.5506285267990382, "learning_rate": 3.0820497590164655e-05, "loss": 0.0357, "step": 4674 }, { "epoch": 2.126933575978162, "grad_norm": 0.43136009349391335, "learning_rate": 3.08135472585641e-05, "loss": 0.0435, "step": 4675 }, { "epoch": 2.127388535031847, "grad_norm": 0.5788572037962345, "learning_rate": 3.0806596451906714e-05, "loss": 0.062, "step": 4676 }, { "epoch": 2.1278434940855324, "grad_norm": 0.6015349331216712, "learning_rate": 3.0799645170760486e-05, "loss": 0.0482, "step": 4677 }, { "epoch": 2.1282984531392173, "grad_norm": 0.3948446147804793, "learning_rate": 3.079269341569345e-05, "loss": 0.0347, "step": 4678 }, { "epoch": 2.1287534121929026, "grad_norm": 0.46177572468853445, "learning_rate": 3.078574118727367e-05, "loss": 0.0436, "step": 4679 }, { "epoch": 2.129208371246588, "grad_norm": 0.41103648084576416, "learning_rate": 3.077878848606924e-05, "loss": 0.0591, "step": 4680 }, { "epoch": 2.1296633303002728, "grad_norm": 2.0194690361147294, "learning_rate": 3.0771835312648316e-05, "loss": 0.1529, "step": 4681 }, { "epoch": 2.130118289353958, "grad_norm": 0.4404280887424082, "learning_rate": 3.076488166757908e-05, "loss": 0.0479, "step": 4682 }, { "epoch": 2.1305732484076434, "grad_norm": 0.5081072795642788, "learning_rate": 3.0757927551429744e-05, "loss": 0.0507, "step": 4683 }, { "epoch": 2.1310282074613287, "grad_norm": 0.41891679788755554, "learning_rate": 3.075097296476857e-05, "loss": 0.0423, "step": 4684 }, { "epoch": 2.1314831665150136, "grad_norm": 0.4801064321408018, "learning_rate": 3.074401790816385e-05, "loss": 0.0661, "step": 4685 }, { "epoch": 2.131938125568699, "grad_norm": 0.5567002573638222, "learning_rate": 3.0737062382183946e-05, "loss": 0.0553, "step": 4686 }, { "epoch": 2.132393084622384, "grad_norm": 0.7580831818107994, "learning_rate": 3.073010638739719e-05, "loss": 0.0861, "step": 4687 }, { "epoch": 2.132848043676069, "grad_norm": 0.5193328435777546, "learning_rate": 3.072314992437203e-05, "loss": 0.0689, "step": 4688 }, { "epoch": 2.1333030027297544, "grad_norm": 0.4473292104445085, "learning_rate": 3.071619299367691e-05, "loss": 0.0484, "step": 4689 }, { "epoch": 2.1337579617834397, "grad_norm": 0.4498486429541877, "learning_rate": 3.0709235595880295e-05, "loss": 0.0367, "step": 4690 }, { "epoch": 2.1342129208371245, "grad_norm": 0.5404005940705554, "learning_rate": 3.070227773155074e-05, "loss": 0.057, "step": 4691 }, { "epoch": 2.13466787989081, "grad_norm": 0.36238877290036775, "learning_rate": 3.06953194012568e-05, "loss": 0.0536, "step": 4692 }, { "epoch": 2.135122838944495, "grad_norm": 0.48776718987439194, "learning_rate": 3.0688360605567076e-05, "loss": 0.0734, "step": 4693 }, { "epoch": 2.13557779799818, "grad_norm": 0.7775528486066172, "learning_rate": 3.068140134505022e-05, "loss": 0.083, "step": 4694 }, { "epoch": 2.1360327570518653, "grad_norm": 0.40855348874568326, "learning_rate": 3.067444162027489e-05, "loss": 0.0474, "step": 4695 }, { "epoch": 2.1364877161055507, "grad_norm": 0.431682742688954, "learning_rate": 3.0667481431809826e-05, "loss": 0.0753, "step": 4696 }, { "epoch": 2.1369426751592355, "grad_norm": 0.47865343273618355, "learning_rate": 3.066052078022377e-05, "loss": 0.05, "step": 4697 }, { "epoch": 2.137397634212921, "grad_norm": 1.4117382333070385, "learning_rate": 3.0653559666085516e-05, "loss": 0.0908, "step": 4698 }, { "epoch": 2.137852593266606, "grad_norm": 0.5281618247990397, "learning_rate": 3.06465980899639e-05, "loss": 0.0583, "step": 4699 }, { "epoch": 2.138307552320291, "grad_norm": 0.39940564098265446, "learning_rate": 3.063963605242779e-05, "loss": 0.0277, "step": 4700 }, { "epoch": 2.1387625113739763, "grad_norm": 0.4928406113734878, "learning_rate": 3.063267355404608e-05, "loss": 0.0668, "step": 4701 }, { "epoch": 2.1392174704276616, "grad_norm": 0.41676124364681744, "learning_rate": 3.0625710595387734e-05, "loss": 0.0342, "step": 4702 }, { "epoch": 2.1396724294813465, "grad_norm": 0.3768104449449032, "learning_rate": 3.061874717702172e-05, "loss": 0.0422, "step": 4703 }, { "epoch": 2.140127388535032, "grad_norm": 0.4386851250222116, "learning_rate": 3.061178329951707e-05, "loss": 0.0318, "step": 4704 }, { "epoch": 2.140582347588717, "grad_norm": 0.3342460199580044, "learning_rate": 3.060481896344282e-05, "loss": 0.0333, "step": 4705 }, { "epoch": 2.141037306642402, "grad_norm": 0.4518126646414811, "learning_rate": 3.059785416936808e-05, "loss": 0.0454, "step": 4706 }, { "epoch": 2.1414922656960873, "grad_norm": 0.7669769400300909, "learning_rate": 3.059088891786197e-05, "loss": 0.0763, "step": 4707 }, { "epoch": 2.1419472247497726, "grad_norm": 0.6243604086483462, "learning_rate": 3.058392320949367e-05, "loss": 0.0478, "step": 4708 }, { "epoch": 2.1424021838034575, "grad_norm": 0.5739005627768441, "learning_rate": 3.057695704483239e-05, "loss": 0.0558, "step": 4709 }, { "epoch": 2.142857142857143, "grad_norm": 1.0838800345580646, "learning_rate": 3.0569990424447356e-05, "loss": 0.0653, "step": 4710 }, { "epoch": 2.143312101910828, "grad_norm": 0.5518925710791583, "learning_rate": 3.056302334890786e-05, "loss": 0.0491, "step": 4711 }, { "epoch": 2.143767060964513, "grad_norm": 0.4520992709275539, "learning_rate": 3.055605581878322e-05, "loss": 0.0392, "step": 4712 }, { "epoch": 2.1442220200181983, "grad_norm": 0.5511270238757014, "learning_rate": 3.05490878346428e-05, "loss": 0.0363, "step": 4713 }, { "epoch": 2.1446769790718836, "grad_norm": 0.5330081543860439, "learning_rate": 3.054211939705596e-05, "loss": 0.0347, "step": 4714 }, { "epoch": 2.145131938125569, "grad_norm": 0.6390108168757249, "learning_rate": 3.0535150506592166e-05, "loss": 0.0473, "step": 4715 }, { "epoch": 2.1455868971792538, "grad_norm": 0.5081370498313962, "learning_rate": 3.052818116382086e-05, "loss": 0.0436, "step": 4716 }, { "epoch": 2.146041856232939, "grad_norm": 0.5540427331208139, "learning_rate": 3.052121136931156e-05, "loss": 0.0426, "step": 4717 }, { "epoch": 2.1464968152866244, "grad_norm": 0.4296824583809129, "learning_rate": 3.0514241123633802e-05, "loss": 0.0697, "step": 4718 }, { "epoch": 2.1469517743403093, "grad_norm": 0.4341437767055969, "learning_rate": 3.050727042735716e-05, "loss": 0.0409, "step": 4719 }, { "epoch": 2.1474067333939946, "grad_norm": 1.241047966456606, "learning_rate": 3.050029928105125e-05, "loss": 0.0617, "step": 4720 }, { "epoch": 2.14786169244768, "grad_norm": 0.5483864489311577, "learning_rate": 3.0493327685285723e-05, "loss": 0.0649, "step": 4721 }, { "epoch": 2.1483166515013647, "grad_norm": 0.6657434501198982, "learning_rate": 3.048635564063026e-05, "loss": 0.0888, "step": 4722 }, { "epoch": 2.14877161055505, "grad_norm": 0.3434400016064579, "learning_rate": 3.047938314765459e-05, "loss": 0.0369, "step": 4723 }, { "epoch": 2.1492265696087354, "grad_norm": 0.5566435688461728, "learning_rate": 3.0472410206928483e-05, "loss": 0.0193, "step": 4724 }, { "epoch": 2.1496815286624202, "grad_norm": 0.563307078227041, "learning_rate": 3.0465436819021727e-05, "loss": 0.0516, "step": 4725 }, { "epoch": 2.1501364877161055, "grad_norm": 0.48192241462870056, "learning_rate": 3.0458462984504137e-05, "loss": 0.0385, "step": 4726 }, { "epoch": 2.150591446769791, "grad_norm": 0.8841204183945818, "learning_rate": 3.0451488703945618e-05, "loss": 0.0476, "step": 4727 }, { "epoch": 2.1510464058234757, "grad_norm": 0.5256929906937368, "learning_rate": 3.0444513977916057e-05, "loss": 0.0442, "step": 4728 }, { "epoch": 2.151501364877161, "grad_norm": 0.47900140160795346, "learning_rate": 3.04375388069854e-05, "loss": 0.0316, "step": 4729 }, { "epoch": 2.1519563239308463, "grad_norm": 0.4559195215717941, "learning_rate": 3.0430563191723633e-05, "loss": 0.0378, "step": 4730 }, { "epoch": 2.152411282984531, "grad_norm": 0.5483312993917605, "learning_rate": 3.0423587132700758e-05, "loss": 0.0547, "step": 4731 }, { "epoch": 2.1528662420382165, "grad_norm": 0.5066799470359781, "learning_rate": 3.0416610630486847e-05, "loss": 0.0341, "step": 4732 }, { "epoch": 2.153321201091902, "grad_norm": 0.5110893979521599, "learning_rate": 3.0409633685651962e-05, "loss": 0.0506, "step": 4733 }, { "epoch": 2.1537761601455867, "grad_norm": 0.5039255567348374, "learning_rate": 3.0402656298766253e-05, "loss": 0.0395, "step": 4734 }, { "epoch": 2.154231119199272, "grad_norm": 0.8066091286489462, "learning_rate": 3.0395678470399863e-05, "loss": 0.1034, "step": 4735 }, { "epoch": 2.1546860782529573, "grad_norm": 0.4596728938198554, "learning_rate": 3.0388700201123e-05, "loss": 0.0518, "step": 4736 }, { "epoch": 2.1551410373066426, "grad_norm": 0.3657970194548396, "learning_rate": 3.038172149150589e-05, "loss": 0.0276, "step": 4737 }, { "epoch": 2.1555959963603275, "grad_norm": 0.5065204376853066, "learning_rate": 3.03747423421188e-05, "loss": 0.0714, "step": 4738 }, { "epoch": 2.156050955414013, "grad_norm": 0.6859821859340364, "learning_rate": 3.0367762753532043e-05, "loss": 0.052, "step": 4739 }, { "epoch": 2.156505914467698, "grad_norm": 0.5207099220439142, "learning_rate": 3.0360782726315946e-05, "loss": 0.0527, "step": 4740 }, { "epoch": 2.156960873521383, "grad_norm": 0.6659145129537747, "learning_rate": 3.0353802261040904e-05, "loss": 0.0712, "step": 4741 }, { "epoch": 2.1574158325750683, "grad_norm": 0.511460537036186, "learning_rate": 3.0346821358277318e-05, "loss": 0.0304, "step": 4742 }, { "epoch": 2.1578707916287536, "grad_norm": 0.5612144029879704, "learning_rate": 3.0339840018595622e-05, "loss": 0.0632, "step": 4743 }, { "epoch": 2.1583257506824385, "grad_norm": 0.41931114360196386, "learning_rate": 3.033285824256633e-05, "loss": 0.0495, "step": 4744 }, { "epoch": 2.158780709736124, "grad_norm": 0.39958720035097234, "learning_rate": 3.0325876030759942e-05, "loss": 0.0312, "step": 4745 }, { "epoch": 2.159235668789809, "grad_norm": 0.6428433331407641, "learning_rate": 3.0318893383747016e-05, "loss": 0.0782, "step": 4746 }, { "epoch": 2.159690627843494, "grad_norm": 0.48749093117009185, "learning_rate": 3.031191030209814e-05, "loss": 0.0487, "step": 4747 }, { "epoch": 2.1601455868971793, "grad_norm": 0.44319076872710456, "learning_rate": 3.030492678638394e-05, "loss": 0.0414, "step": 4748 }, { "epoch": 2.1606005459508646, "grad_norm": 0.538163088358284, "learning_rate": 3.029794283717509e-05, "loss": 0.0577, "step": 4749 }, { "epoch": 2.1610555050045495, "grad_norm": 0.6645526789864907, "learning_rate": 3.029095845504226e-05, "loss": 0.0611, "step": 4750 }, { "epoch": 2.1615104640582348, "grad_norm": 0.5367135984170838, "learning_rate": 3.0283973640556218e-05, "loss": 0.1137, "step": 4751 }, { "epoch": 2.16196542311192, "grad_norm": 0.4911216146238002, "learning_rate": 3.0276988394287696e-05, "loss": 0.0306, "step": 4752 }, { "epoch": 2.162420382165605, "grad_norm": 0.46306106431739646, "learning_rate": 3.0270002716807526e-05, "loss": 0.0602, "step": 4753 }, { "epoch": 2.1628753412192903, "grad_norm": 0.44934363328305227, "learning_rate": 3.0263016608686528e-05, "loss": 0.0416, "step": 4754 }, { "epoch": 2.1633303002729756, "grad_norm": 0.49919327662096197, "learning_rate": 3.025603007049558e-05, "loss": 0.0387, "step": 4755 }, { "epoch": 2.1637852593266604, "grad_norm": 0.9418829427827239, "learning_rate": 3.024904310280559e-05, "loss": 0.0897, "step": 4756 }, { "epoch": 2.1642402183803457, "grad_norm": 0.61797409665625, "learning_rate": 3.02420557061875e-05, "loss": 0.0501, "step": 4757 }, { "epoch": 2.164695177434031, "grad_norm": 0.4579426906089131, "learning_rate": 3.0235067881212293e-05, "loss": 0.0236, "step": 4758 }, { "epoch": 2.165150136487716, "grad_norm": 0.49487778714684555, "learning_rate": 3.0228079628450978e-05, "loss": 0.0446, "step": 4759 }, { "epoch": 2.1656050955414012, "grad_norm": 0.30361689157196, "learning_rate": 3.0221090948474613e-05, "loss": 0.0366, "step": 4760 }, { "epoch": 2.1660600545950865, "grad_norm": 0.4197244687257767, "learning_rate": 3.021410184185427e-05, "loss": 0.034, "step": 4761 }, { "epoch": 2.1665150136487714, "grad_norm": 0.44445317730575407, "learning_rate": 3.0207112309161066e-05, "loss": 0.0379, "step": 4762 }, { "epoch": 2.1669699727024567, "grad_norm": 0.48161788981995174, "learning_rate": 3.0200122350966163e-05, "loss": 0.0766, "step": 4763 }, { "epoch": 2.167424931756142, "grad_norm": 0.6104329043421239, "learning_rate": 3.019313196784075e-05, "loss": 0.0442, "step": 4764 }, { "epoch": 2.167879890809827, "grad_norm": 0.4878403349488129, "learning_rate": 3.0186141160356037e-05, "loss": 0.0478, "step": 4765 }, { "epoch": 2.168334849863512, "grad_norm": 0.5183508444834761, "learning_rate": 3.0179149929083295e-05, "loss": 0.0427, "step": 4766 }, { "epoch": 2.1687898089171975, "grad_norm": 0.28238000997624585, "learning_rate": 3.0172158274593805e-05, "loss": 0.0155, "step": 4767 }, { "epoch": 2.1692447679708824, "grad_norm": 0.4864672806722927, "learning_rate": 3.0165166197458895e-05, "loss": 0.0308, "step": 4768 }, { "epoch": 2.1696997270245677, "grad_norm": 0.5275770178973284, "learning_rate": 3.015817369824993e-05, "loss": 0.0529, "step": 4769 }, { "epoch": 2.170154686078253, "grad_norm": 0.3648833269273759, "learning_rate": 3.015118077753831e-05, "loss": 0.0311, "step": 4770 }, { "epoch": 2.1706096451319383, "grad_norm": 0.48787976148156875, "learning_rate": 3.014418743589546e-05, "loss": 0.0423, "step": 4771 }, { "epoch": 2.171064604185623, "grad_norm": 0.4764294598555145, "learning_rate": 3.0137193673892837e-05, "loss": 0.0471, "step": 4772 }, { "epoch": 2.1715195632393085, "grad_norm": 0.6175598561562093, "learning_rate": 3.0130199492101957e-05, "loss": 0.0507, "step": 4773 }, { "epoch": 2.171974522292994, "grad_norm": 0.5845009167822551, "learning_rate": 3.012320489109433e-05, "loss": 0.0502, "step": 4774 }, { "epoch": 2.1724294813466787, "grad_norm": 0.7295456502677632, "learning_rate": 3.0116209871441543e-05, "loss": 0.0815, "step": 4775 }, { "epoch": 2.172884440400364, "grad_norm": 0.5745871524153818, "learning_rate": 3.0109214433715182e-05, "loss": 0.0619, "step": 4776 }, { "epoch": 2.1733393994540493, "grad_norm": 0.8117029148317078, "learning_rate": 3.01022185784869e-05, "loss": 0.072, "step": 4777 }, { "epoch": 2.173794358507734, "grad_norm": 0.3407528344920062, "learning_rate": 3.0095222306328357e-05, "loss": 0.0425, "step": 4778 }, { "epoch": 2.1742493175614195, "grad_norm": 0.5518554007539561, "learning_rate": 3.008822561781125e-05, "loss": 0.0529, "step": 4779 }, { "epoch": 2.174704276615105, "grad_norm": 0.3954362714251021, "learning_rate": 3.008122851350733e-05, "loss": 0.044, "step": 4780 }, { "epoch": 2.1751592356687897, "grad_norm": 0.621400145023446, "learning_rate": 3.0074230993988362e-05, "loss": 0.0739, "step": 4781 }, { "epoch": 2.175614194722475, "grad_norm": 0.7831151933561222, "learning_rate": 3.0067233059826143e-05, "loss": 0.076, "step": 4782 }, { "epoch": 2.1760691537761603, "grad_norm": 0.4884378986983659, "learning_rate": 3.0060234711592532e-05, "loss": 0.0444, "step": 4783 }, { "epoch": 2.176524112829845, "grad_norm": 0.5957521230482236, "learning_rate": 3.0053235949859394e-05, "loss": 0.0737, "step": 4784 }, { "epoch": 2.1769790718835305, "grad_norm": 0.4550670017712964, "learning_rate": 3.0046236775198623e-05, "loss": 0.0898, "step": 4785 }, { "epoch": 2.1774340309372158, "grad_norm": 0.7149909477581905, "learning_rate": 3.003923718818218e-05, "loss": 0.0766, "step": 4786 }, { "epoch": 2.1778889899909006, "grad_norm": 0.5117369764085709, "learning_rate": 3.003223718938203e-05, "loss": 0.053, "step": 4787 }, { "epoch": 2.178343949044586, "grad_norm": 0.5187687398493134, "learning_rate": 3.002523677937018e-05, "loss": 0.0517, "step": 4788 }, { "epoch": 2.1787989080982713, "grad_norm": 0.5698815512205937, "learning_rate": 3.0018235958718672e-05, "loss": 0.0504, "step": 4789 }, { "epoch": 2.179253867151956, "grad_norm": 0.5175147946727842, "learning_rate": 3.0011234727999588e-05, "loss": 0.0823, "step": 4790 }, { "epoch": 2.1797088262056414, "grad_norm": 0.41141937049560967, "learning_rate": 3.0004233087785034e-05, "loss": 0.027, "step": 4791 }, { "epoch": 2.1801637852593267, "grad_norm": 0.5787919505139126, "learning_rate": 2.9997231038647145e-05, "loss": 0.0672, "step": 4792 }, { "epoch": 2.180618744313012, "grad_norm": 0.47705530642769484, "learning_rate": 2.9990228581158104e-05, "loss": 0.0484, "step": 4793 }, { "epoch": 2.181073703366697, "grad_norm": 0.6109744473354789, "learning_rate": 2.9983225715890122e-05, "loss": 0.0505, "step": 4794 }, { "epoch": 2.1815286624203822, "grad_norm": 0.40451727274556853, "learning_rate": 2.9976222443415442e-05, "loss": 0.0377, "step": 4795 }, { "epoch": 2.1819836214740675, "grad_norm": 0.5205934359826156, "learning_rate": 2.996921876430633e-05, "loss": 0.0584, "step": 4796 }, { "epoch": 2.1824385805277524, "grad_norm": 0.36095959175099207, "learning_rate": 2.9962214679135104e-05, "loss": 0.0435, "step": 4797 }, { "epoch": 2.1828935395814377, "grad_norm": 0.48734421993605365, "learning_rate": 2.99552101884741e-05, "loss": 0.0462, "step": 4798 }, { "epoch": 2.183348498635123, "grad_norm": 0.4812163862133915, "learning_rate": 2.99482052928957e-05, "loss": 0.0719, "step": 4799 }, { "epoch": 2.183803457688808, "grad_norm": 0.3707641495834768, "learning_rate": 2.9941199992972314e-05, "loss": 0.0301, "step": 4800 }, { "epoch": 2.184258416742493, "grad_norm": 0.9496732914021755, "learning_rate": 2.9934194289276367e-05, "loss": 0.0869, "step": 4801 }, { "epoch": 2.1847133757961785, "grad_norm": 0.44434563569487007, "learning_rate": 2.992718818238036e-05, "loss": 0.0416, "step": 4802 }, { "epoch": 2.1851683348498634, "grad_norm": 0.5761676872349002, "learning_rate": 2.9920181672856766e-05, "loss": 0.0365, "step": 4803 }, { "epoch": 2.1856232939035487, "grad_norm": 0.4951392262452752, "learning_rate": 2.9913174761278162e-05, "loss": 0.0759, "step": 4804 }, { "epoch": 2.186078252957234, "grad_norm": 0.37331274947558757, "learning_rate": 2.9906167448217098e-05, "loss": 0.0479, "step": 4805 }, { "epoch": 2.186533212010919, "grad_norm": 0.5200178662809939, "learning_rate": 2.9899159734246185e-05, "loss": 0.0546, "step": 4806 }, { "epoch": 2.186988171064604, "grad_norm": 0.5124378408493246, "learning_rate": 2.989215161993807e-05, "loss": 0.0509, "step": 4807 }, { "epoch": 2.1874431301182895, "grad_norm": 0.48570007722087144, "learning_rate": 2.988514310586541e-05, "loss": 0.0377, "step": 4808 }, { "epoch": 2.1878980891719744, "grad_norm": 0.5885327527930303, "learning_rate": 2.9878134192600927e-05, "loss": 0.0553, "step": 4809 }, { "epoch": 2.1883530482256597, "grad_norm": 0.44216798162134485, "learning_rate": 2.987112488071733e-05, "loss": 0.0392, "step": 4810 }, { "epoch": 2.188808007279345, "grad_norm": 0.3902219336571337, "learning_rate": 2.986411517078742e-05, "loss": 0.0524, "step": 4811 }, { "epoch": 2.18926296633303, "grad_norm": 0.5835482441347631, "learning_rate": 2.9857105063383976e-05, "loss": 0.0513, "step": 4812 }, { "epoch": 2.189717925386715, "grad_norm": 0.3792397898752033, "learning_rate": 2.9850094559079846e-05, "loss": 0.0395, "step": 4813 }, { "epoch": 2.1901728844404005, "grad_norm": 0.4993760224129263, "learning_rate": 2.984308365844789e-05, "loss": 0.0423, "step": 4814 }, { "epoch": 2.1906278434940853, "grad_norm": 0.6901561409173353, "learning_rate": 2.983607236206101e-05, "loss": 0.0669, "step": 4815 }, { "epoch": 2.1910828025477707, "grad_norm": 0.5395701305513978, "learning_rate": 2.9829060670492137e-05, "loss": 0.0531, "step": 4816 }, { "epoch": 2.191537761601456, "grad_norm": 0.39365527638498515, "learning_rate": 2.9822048584314228e-05, "loss": 0.0481, "step": 4817 }, { "epoch": 2.191992720655141, "grad_norm": 0.4522316512135156, "learning_rate": 2.9815036104100285e-05, "loss": 0.0592, "step": 4818 }, { "epoch": 2.192447679708826, "grad_norm": 0.4183632169668145, "learning_rate": 2.9808023230423342e-05, "loss": 0.0505, "step": 4819 }, { "epoch": 2.1929026387625115, "grad_norm": 0.5021481052870164, "learning_rate": 2.9801009963856446e-05, "loss": 0.036, "step": 4820 }, { "epoch": 2.1933575978161963, "grad_norm": 0.2667372306287812, "learning_rate": 2.9793996304972704e-05, "loss": 0.0327, "step": 4821 }, { "epoch": 2.1938125568698816, "grad_norm": 0.43565350594942354, "learning_rate": 2.978698225434523e-05, "loss": 0.0449, "step": 4822 }, { "epoch": 2.194267515923567, "grad_norm": 0.421122960446856, "learning_rate": 2.977996781254719e-05, "loss": 0.0617, "step": 4823 }, { "epoch": 2.194722474977252, "grad_norm": 0.45251111687793927, "learning_rate": 2.9772952980151757e-05, "loss": 0.038, "step": 4824 }, { "epoch": 2.195177434030937, "grad_norm": 0.5698744019491265, "learning_rate": 2.9765937757732165e-05, "loss": 0.0534, "step": 4825 }, { "epoch": 2.1956323930846224, "grad_norm": 0.5170305867487769, "learning_rate": 2.9758922145861666e-05, "loss": 0.061, "step": 4826 }, { "epoch": 2.1960873521383077, "grad_norm": 0.6418762439970866, "learning_rate": 2.9751906145113527e-05, "loss": 0.0709, "step": 4827 }, { "epoch": 2.1965423111919926, "grad_norm": 0.5993063194644977, "learning_rate": 2.9744889756061094e-05, "loss": 0.082, "step": 4828 }, { "epoch": 2.196997270245678, "grad_norm": 0.645363830149857, "learning_rate": 2.9737872979277693e-05, "loss": 0.0754, "step": 4829 }, { "epoch": 2.1974522292993632, "grad_norm": 0.4330182214936328, "learning_rate": 2.9730855815336707e-05, "loss": 0.042, "step": 4830 }, { "epoch": 2.197907188353048, "grad_norm": 0.4870059773568326, "learning_rate": 2.9723838264811545e-05, "loss": 0.0813, "step": 4831 }, { "epoch": 2.1983621474067334, "grad_norm": 0.4449244823090228, "learning_rate": 2.9716820328275653e-05, "loss": 0.0487, "step": 4832 }, { "epoch": 2.1988171064604187, "grad_norm": 0.6350776971257093, "learning_rate": 2.970980200630251e-05, "loss": 0.0766, "step": 4833 }, { "epoch": 2.1992720655141036, "grad_norm": 0.6607850533907079, "learning_rate": 2.9702783299465615e-05, "loss": 0.1205, "step": 4834 }, { "epoch": 2.199727024567789, "grad_norm": 0.8188249184913744, "learning_rate": 2.96957642083385e-05, "loss": 0.0499, "step": 4835 }, { "epoch": 2.200181983621474, "grad_norm": 0.5431541017935493, "learning_rate": 2.968874473349475e-05, "loss": 0.0295, "step": 4836 }, { "epoch": 2.200636942675159, "grad_norm": 0.5203130160770402, "learning_rate": 2.9681724875507945e-05, "loss": 0.0499, "step": 4837 }, { "epoch": 2.2010919017288444, "grad_norm": 0.3728035032514477, "learning_rate": 2.9674704634951727e-05, "loss": 0.033, "step": 4838 }, { "epoch": 2.2015468607825297, "grad_norm": 0.5602011936045199, "learning_rate": 2.966768401239976e-05, "loss": 0.0626, "step": 4839 }, { "epoch": 2.2020018198362146, "grad_norm": 0.3903053832803591, "learning_rate": 2.966066300842574e-05, "loss": 0.0438, "step": 4840 }, { "epoch": 2.2024567788899, "grad_norm": 0.6419558480975055, "learning_rate": 2.9653641623603378e-05, "loss": 0.0595, "step": 4841 }, { "epoch": 2.202911737943585, "grad_norm": 0.44434182239793457, "learning_rate": 2.9646619858506436e-05, "loss": 0.029, "step": 4842 }, { "epoch": 2.20336669699727, "grad_norm": 0.5963214198460424, "learning_rate": 2.963959771370871e-05, "loss": 0.0548, "step": 4843 }, { "epoch": 2.2038216560509554, "grad_norm": 0.6498400539487387, "learning_rate": 2.9632575189784006e-05, "loss": 0.0683, "step": 4844 }, { "epoch": 2.2042766151046407, "grad_norm": 0.5038307854628329, "learning_rate": 2.962555228730618e-05, "loss": 0.0635, "step": 4845 }, { "epoch": 2.2047315741583255, "grad_norm": 0.3818347614810291, "learning_rate": 2.9618529006849112e-05, "loss": 0.0364, "step": 4846 }, { "epoch": 2.205186533212011, "grad_norm": 0.38117392987331095, "learning_rate": 2.9611505348986712e-05, "loss": 0.0352, "step": 4847 }, { "epoch": 2.205641492265696, "grad_norm": 0.9419709207255681, "learning_rate": 2.9604481314292916e-05, "loss": 0.1018, "step": 4848 }, { "epoch": 2.2060964513193815, "grad_norm": 0.7276824795960022, "learning_rate": 2.9597456903341704e-05, "loss": 0.0978, "step": 4849 }, { "epoch": 2.2065514103730663, "grad_norm": 0.4997556074650717, "learning_rate": 2.9590432116707074e-05, "loss": 0.0452, "step": 4850 }, { "epoch": 2.2070063694267517, "grad_norm": 0.7287515620280278, "learning_rate": 2.958340695496306e-05, "loss": 0.042, "step": 4851 }, { "epoch": 2.207461328480437, "grad_norm": 0.5741288840364697, "learning_rate": 2.957638141868373e-05, "loss": 0.0611, "step": 4852 }, { "epoch": 2.207916287534122, "grad_norm": 0.5014901770030712, "learning_rate": 2.956935550844318e-05, "loss": 0.0352, "step": 4853 }, { "epoch": 2.208371246587807, "grad_norm": 0.479777452566317, "learning_rate": 2.9562329224815527e-05, "loss": 0.0425, "step": 4854 }, { "epoch": 2.2088262056414925, "grad_norm": 0.5868905147271465, "learning_rate": 2.955530256837493e-05, "loss": 0.0627, "step": 4855 }, { "epoch": 2.2092811646951773, "grad_norm": 0.37842476444137185, "learning_rate": 2.9548275539695585e-05, "loss": 0.0325, "step": 4856 }, { "epoch": 2.2097361237488626, "grad_norm": 0.7233742673583267, "learning_rate": 2.9541248139351708e-05, "loss": 0.1332, "step": 4857 }, { "epoch": 2.210191082802548, "grad_norm": 0.6434225306955976, "learning_rate": 2.953422036791753e-05, "loss": 0.051, "step": 4858 }, { "epoch": 2.210646041856233, "grad_norm": 1.4481084733668284, "learning_rate": 2.952719222596735e-05, "loss": 0.0908, "step": 4859 }, { "epoch": 2.211101000909918, "grad_norm": 0.4104466253511282, "learning_rate": 2.952016371407546e-05, "loss": 0.0376, "step": 4860 }, { "epoch": 2.2115559599636034, "grad_norm": 0.42739036949996384, "learning_rate": 2.9513134832816204e-05, "loss": 0.0472, "step": 4861 }, { "epoch": 2.2120109190172883, "grad_norm": 0.4453492236795173, "learning_rate": 2.9506105582763956e-05, "loss": 0.0661, "step": 4862 }, { "epoch": 2.2124658780709736, "grad_norm": 0.550439909544215, "learning_rate": 2.9499075964493102e-05, "loss": 0.07, "step": 4863 }, { "epoch": 2.212920837124659, "grad_norm": 0.5809158318905299, "learning_rate": 2.949204597857808e-05, "loss": 0.0993, "step": 4864 }, { "epoch": 2.213375796178344, "grad_norm": 0.42498311908213193, "learning_rate": 2.9485015625593348e-05, "loss": 0.0458, "step": 4865 }, { "epoch": 2.213830755232029, "grad_norm": 0.4144409598604083, "learning_rate": 2.947798490611339e-05, "loss": 0.0377, "step": 4866 }, { "epoch": 2.2142857142857144, "grad_norm": 0.37519413653951245, "learning_rate": 2.9470953820712738e-05, "loss": 0.0492, "step": 4867 }, { "epoch": 2.2147406733393993, "grad_norm": 0.38724885484495847, "learning_rate": 2.9463922369965917e-05, "loss": 0.0353, "step": 4868 }, { "epoch": 2.2151956323930846, "grad_norm": 0.41732037701131786, "learning_rate": 2.9456890554447525e-05, "loss": 0.0816, "step": 4869 }, { "epoch": 2.21565059144677, "grad_norm": 0.6830617304712229, "learning_rate": 2.944985837473217e-05, "loss": 0.0432, "step": 4870 }, { "epoch": 2.2161055505004548, "grad_norm": 0.3815489735540472, "learning_rate": 2.9442825831394473e-05, "loss": 0.0438, "step": 4871 }, { "epoch": 2.21656050955414, "grad_norm": 0.5079953219643636, "learning_rate": 2.943579292500912e-05, "loss": 0.0735, "step": 4872 }, { "epoch": 2.2170154686078254, "grad_norm": 0.47955719995939505, "learning_rate": 2.9428759656150796e-05, "loss": 0.0531, "step": 4873 }, { "epoch": 2.2174704276615103, "grad_norm": 0.5110112974967479, "learning_rate": 2.9421726025394235e-05, "loss": 0.0781, "step": 4874 }, { "epoch": 2.2179253867151956, "grad_norm": 0.3716818861743613, "learning_rate": 2.9414692033314195e-05, "loss": 0.0329, "step": 4875 }, { "epoch": 2.218380345768881, "grad_norm": 0.9043838656970387, "learning_rate": 2.9407657680485452e-05, "loss": 0.0892, "step": 4876 }, { "epoch": 2.2188353048225657, "grad_norm": 0.3971141567129126, "learning_rate": 2.940062296748284e-05, "loss": 0.0496, "step": 4877 }, { "epoch": 2.219290263876251, "grad_norm": 0.5678124106489887, "learning_rate": 2.939358789488118e-05, "loss": 0.0591, "step": 4878 }, { "epoch": 2.2197452229299364, "grad_norm": 0.2811353276078472, "learning_rate": 2.9386552463255363e-05, "loss": 0.0366, "step": 4879 }, { "epoch": 2.2202001819836217, "grad_norm": 0.3528131895807377, "learning_rate": 2.937951667318028e-05, "loss": 0.0476, "step": 4880 }, { "epoch": 2.2206551410373065, "grad_norm": 0.4734510546959942, "learning_rate": 2.9372480525230883e-05, "loss": 0.052, "step": 4881 }, { "epoch": 2.221110100090992, "grad_norm": 0.5461134327616006, "learning_rate": 2.936544401998212e-05, "loss": 0.0537, "step": 4882 }, { "epoch": 2.221565059144677, "grad_norm": 0.685929615858676, "learning_rate": 2.935840715800898e-05, "loss": 0.0533, "step": 4883 }, { "epoch": 2.222020018198362, "grad_norm": 0.3728473660323041, "learning_rate": 2.93513699398865e-05, "loss": 0.0337, "step": 4884 }, { "epoch": 2.2224749772520473, "grad_norm": 0.4161655494923142, "learning_rate": 2.934433236618972e-05, "loss": 0.0571, "step": 4885 }, { "epoch": 2.2229299363057327, "grad_norm": 0.9000731863962375, "learning_rate": 2.933729443749371e-05, "loss": 0.0778, "step": 4886 }, { "epoch": 2.2233848953594175, "grad_norm": 0.5345516935697647, "learning_rate": 2.9330256154373593e-05, "loss": 0.0434, "step": 4887 }, { "epoch": 2.223839854413103, "grad_norm": 0.678733498839431, "learning_rate": 2.932321751740449e-05, "loss": 0.0602, "step": 4888 }, { "epoch": 2.224294813466788, "grad_norm": 0.41238258007593076, "learning_rate": 2.9316178527161582e-05, "loss": 0.0757, "step": 4889 }, { "epoch": 2.224749772520473, "grad_norm": 1.126766672956674, "learning_rate": 2.9309139184220047e-05, "loss": 0.1064, "step": 4890 }, { "epoch": 2.2252047315741583, "grad_norm": 0.5293920941292853, "learning_rate": 2.9302099489155126e-05, "loss": 0.073, "step": 4891 }, { "epoch": 2.2256596906278436, "grad_norm": 0.4785592655829485, "learning_rate": 2.929505944254206e-05, "loss": 0.0598, "step": 4892 }, { "epoch": 2.2261146496815285, "grad_norm": 0.6181695900554144, "learning_rate": 2.9288019044956138e-05, "loss": 0.0574, "step": 4893 }, { "epoch": 2.226569608735214, "grad_norm": 0.4463451400473221, "learning_rate": 2.9280978296972654e-05, "loss": 0.0427, "step": 4894 }, { "epoch": 2.227024567788899, "grad_norm": 0.7359517069708225, "learning_rate": 2.9273937199166962e-05, "loss": 0.0781, "step": 4895 }, { "epoch": 2.227479526842584, "grad_norm": 0.5294260940770704, "learning_rate": 2.9266895752114427e-05, "loss": 0.0652, "step": 4896 }, { "epoch": 2.2279344858962693, "grad_norm": 0.547340572578013, "learning_rate": 2.9259853956390426e-05, "loss": 0.052, "step": 4897 }, { "epoch": 2.2283894449499546, "grad_norm": 0.39183588398844826, "learning_rate": 2.9252811812570415e-05, "loss": 0.045, "step": 4898 }, { "epoch": 2.2288444040036395, "grad_norm": 0.3191414525826814, "learning_rate": 2.9245769321229814e-05, "loss": 0.0358, "step": 4899 }, { "epoch": 2.229299363057325, "grad_norm": 0.4809508226093138, "learning_rate": 2.9238726482944135e-05, "loss": 0.0462, "step": 4900 }, { "epoch": 2.22975432211101, "grad_norm": 0.36446193478834316, "learning_rate": 2.9231683298288853e-05, "loss": 0.0476, "step": 4901 }, { "epoch": 2.2302092811646954, "grad_norm": 0.6314698202143114, "learning_rate": 2.922463976783953e-05, "loss": 0.0655, "step": 4902 }, { "epoch": 2.2306642402183803, "grad_norm": 0.37400950594662524, "learning_rate": 2.9217595892171722e-05, "loss": 0.0324, "step": 4903 }, { "epoch": 2.2311191992720656, "grad_norm": 0.4854432761869674, "learning_rate": 2.9210551671861015e-05, "loss": 0.0785, "step": 4904 }, { "epoch": 2.231574158325751, "grad_norm": 0.6848131997141341, "learning_rate": 2.920350710748305e-05, "loss": 0.0564, "step": 4905 }, { "epoch": 2.2320291173794358, "grad_norm": 0.6621372900853196, "learning_rate": 2.9196462199613472e-05, "loss": 0.0643, "step": 4906 }, { "epoch": 2.232484076433121, "grad_norm": 0.3603338144711263, "learning_rate": 2.9189416948827946e-05, "loss": 0.0385, "step": 4907 }, { "epoch": 2.2329390354868064, "grad_norm": 0.5336627857063783, "learning_rate": 2.9182371355702187e-05, "loss": 0.0517, "step": 4908 }, { "epoch": 2.2333939945404913, "grad_norm": 0.6420078586532316, "learning_rate": 2.9175325420811928e-05, "loss": 0.0446, "step": 4909 }, { "epoch": 2.2338489535941766, "grad_norm": 0.6376109435219153, "learning_rate": 2.9168279144732936e-05, "loss": 0.0656, "step": 4910 }, { "epoch": 2.234303912647862, "grad_norm": 0.43839502329355534, "learning_rate": 2.916123252804099e-05, "loss": 0.0587, "step": 4911 }, { "epoch": 2.2347588717015467, "grad_norm": 0.3998764533041563, "learning_rate": 2.9154185571311914e-05, "loss": 0.0784, "step": 4912 }, { "epoch": 2.235213830755232, "grad_norm": 0.5567604623814032, "learning_rate": 2.914713827512156e-05, "loss": 0.0647, "step": 4913 }, { "epoch": 2.2356687898089174, "grad_norm": 0.5360046574643695, "learning_rate": 2.914009064004578e-05, "loss": 0.0856, "step": 4914 }, { "epoch": 2.2361237488626022, "grad_norm": 0.36000720507771145, "learning_rate": 2.9133042666660505e-05, "loss": 0.048, "step": 4915 }, { "epoch": 2.2365787079162875, "grad_norm": 0.38492499169755384, "learning_rate": 2.912599435554164e-05, "loss": 0.0354, "step": 4916 }, { "epoch": 2.237033666969973, "grad_norm": 0.28972330920842765, "learning_rate": 2.9118945707265155e-05, "loss": 0.0622, "step": 4917 }, { "epoch": 2.2374886260236577, "grad_norm": 0.5276870742880532, "learning_rate": 2.9111896722407022e-05, "loss": 0.0805, "step": 4918 }, { "epoch": 2.237943585077343, "grad_norm": 0.35989349445175317, "learning_rate": 2.910484740154326e-05, "loss": 0.0342, "step": 4919 }, { "epoch": 2.2383985441310283, "grad_norm": 0.6365804924963496, "learning_rate": 2.909779774524991e-05, "loss": 0.0305, "step": 4920 }, { "epoch": 2.238853503184713, "grad_norm": 0.5147380958677458, "learning_rate": 2.909074775410302e-05, "loss": 0.0502, "step": 4921 }, { "epoch": 2.2393084622383985, "grad_norm": 0.4126522052246801, "learning_rate": 2.9083697428678712e-05, "loss": 0.0472, "step": 4922 }, { "epoch": 2.239763421292084, "grad_norm": 0.5953087657539935, "learning_rate": 2.907664676955309e-05, "loss": 0.0752, "step": 4923 }, { "epoch": 2.2402183803457687, "grad_norm": 0.4614387658480772, "learning_rate": 2.9069595777302298e-05, "loss": 0.0365, "step": 4924 }, { "epoch": 2.240673339399454, "grad_norm": 0.40033169054066514, "learning_rate": 2.9062544452502516e-05, "loss": 0.0421, "step": 4925 }, { "epoch": 2.2411282984531393, "grad_norm": 0.3657954774260902, "learning_rate": 2.9055492795729955e-05, "loss": 0.0541, "step": 4926 }, { "epoch": 2.241583257506824, "grad_norm": 0.4850992233273116, "learning_rate": 2.9048440807560833e-05, "loss": 0.0877, "step": 4927 }, { "epoch": 2.2420382165605095, "grad_norm": 1.8743435735839562, "learning_rate": 2.9041388488571413e-05, "loss": 0.0675, "step": 4928 }, { "epoch": 2.242493175614195, "grad_norm": 0.5238054480269254, "learning_rate": 2.903433583933797e-05, "loss": 0.0738, "step": 4929 }, { "epoch": 2.2429481346678797, "grad_norm": 0.6310659108395212, "learning_rate": 2.9027282860436833e-05, "loss": 0.0635, "step": 4930 }, { "epoch": 2.243403093721565, "grad_norm": 0.5648113366065084, "learning_rate": 2.9020229552444317e-05, "loss": 0.0628, "step": 4931 }, { "epoch": 2.2438580527752503, "grad_norm": 0.35151171697159583, "learning_rate": 2.9013175915936798e-05, "loss": 0.0239, "step": 4932 }, { "epoch": 2.244313011828935, "grad_norm": 0.48883933825449205, "learning_rate": 2.9006121951490674e-05, "loss": 0.0736, "step": 4933 }, { "epoch": 2.2447679708826205, "grad_norm": 0.4672050716784874, "learning_rate": 2.899906765968235e-05, "loss": 0.0613, "step": 4934 }, { "epoch": 2.245222929936306, "grad_norm": 0.40837347088798326, "learning_rate": 2.8992013041088272e-05, "loss": 0.0333, "step": 4935 }, { "epoch": 2.245677888989991, "grad_norm": 0.27517866260420354, "learning_rate": 2.8984958096284925e-05, "loss": 0.0289, "step": 4936 }, { "epoch": 2.246132848043676, "grad_norm": 0.4472928352358472, "learning_rate": 2.8977902825848797e-05, "loss": 0.046, "step": 4937 }, { "epoch": 2.2465878070973613, "grad_norm": 0.588475007177216, "learning_rate": 2.8970847230356413e-05, "loss": 0.0841, "step": 4938 }, { "epoch": 2.2470427661510466, "grad_norm": 0.5825243776373681, "learning_rate": 2.8963791310384318e-05, "loss": 0.0998, "step": 4939 }, { "epoch": 2.2474977252047315, "grad_norm": 0.8191143754604018, "learning_rate": 2.895673506650911e-05, "loss": 0.0699, "step": 4940 }, { "epoch": 2.2479526842584168, "grad_norm": 0.5028075148279473, "learning_rate": 2.8949678499307374e-05, "loss": 0.1119, "step": 4941 }, { "epoch": 2.248407643312102, "grad_norm": 0.3811400009630958, "learning_rate": 2.8942621609355747e-05, "loss": 0.0359, "step": 4942 }, { "epoch": 2.248862602365787, "grad_norm": 0.6265126077004788, "learning_rate": 2.8935564397230886e-05, "loss": 0.0518, "step": 4943 }, { "epoch": 2.2493175614194723, "grad_norm": 0.41539565224272773, "learning_rate": 2.892850686350948e-05, "loss": 0.0452, "step": 4944 }, { "epoch": 2.2497725204731576, "grad_norm": 0.6144112142069689, "learning_rate": 2.8921449008768232e-05, "loss": 0.0684, "step": 4945 }, { "epoch": 2.2502274795268424, "grad_norm": 0.5980251877903849, "learning_rate": 2.8914390833583875e-05, "loss": 0.079, "step": 4946 }, { "epoch": 2.2506824385805277, "grad_norm": 0.6230144210887858, "learning_rate": 2.890733233853318e-05, "loss": 0.0482, "step": 4947 }, { "epoch": 2.251137397634213, "grad_norm": 0.7975163091076617, "learning_rate": 2.8900273524192934e-05, "loss": 0.0932, "step": 4948 }, { "epoch": 2.251592356687898, "grad_norm": 0.5397898380942239, "learning_rate": 2.889321439113995e-05, "loss": 0.0656, "step": 4949 }, { "epoch": 2.2520473157415832, "grad_norm": 0.4562063282697629, "learning_rate": 2.888615493995106e-05, "loss": 0.0544, "step": 4950 }, { "epoch": 2.2525022747952685, "grad_norm": 0.5424613570921624, "learning_rate": 2.8879095171203145e-05, "loss": 0.0363, "step": 4951 }, { "epoch": 2.2529572338489534, "grad_norm": 0.4292814955107491, "learning_rate": 2.887203508547309e-05, "loss": 0.0339, "step": 4952 }, { "epoch": 2.2534121929026387, "grad_norm": 0.5405064930371605, "learning_rate": 2.8864974683337807e-05, "loss": 0.0592, "step": 4953 }, { "epoch": 2.253867151956324, "grad_norm": 0.36842742124305455, "learning_rate": 2.885791396537426e-05, "loss": 0.0326, "step": 4954 }, { "epoch": 2.2543221110100093, "grad_norm": 0.5379456031105317, "learning_rate": 2.88508529321594e-05, "loss": 0.0415, "step": 4955 }, { "epoch": 2.254777070063694, "grad_norm": 0.507962720882615, "learning_rate": 2.8843791584270224e-05, "loss": 0.0402, "step": 4956 }, { "epoch": 2.2552320291173795, "grad_norm": 0.3856656555918774, "learning_rate": 2.8836729922283755e-05, "loss": 0.0325, "step": 4957 }, { "epoch": 2.255686988171065, "grad_norm": 0.4533092761535203, "learning_rate": 2.8829667946777056e-05, "loss": 0.0494, "step": 4958 }, { "epoch": 2.2561419472247497, "grad_norm": 0.49915819448718024, "learning_rate": 2.8822605658327184e-05, "loss": 0.059, "step": 4959 }, { "epoch": 2.256596906278435, "grad_norm": 0.6781756141518444, "learning_rate": 2.881554305751123e-05, "loss": 0.0522, "step": 4960 }, { "epoch": 2.2570518653321203, "grad_norm": 0.5428014699328015, "learning_rate": 2.880848014490634e-05, "loss": 0.0857, "step": 4961 }, { "epoch": 2.257506824385805, "grad_norm": 0.39051098717444405, "learning_rate": 2.880141692108964e-05, "loss": 0.0416, "step": 4962 }, { "epoch": 2.2579617834394905, "grad_norm": 0.3850718914569119, "learning_rate": 2.8794353386638324e-05, "loss": 0.0208, "step": 4963 }, { "epoch": 2.258416742493176, "grad_norm": 0.40952431369316283, "learning_rate": 2.8787289542129585e-05, "loss": 0.0399, "step": 4964 }, { "epoch": 2.2588717015468607, "grad_norm": 0.417644051250782, "learning_rate": 2.878022538814065e-05, "loss": 0.0275, "step": 4965 }, { "epoch": 2.259326660600546, "grad_norm": 0.4502970262378086, "learning_rate": 2.8773160925248764e-05, "loss": 0.0374, "step": 4966 }, { "epoch": 2.2597816196542313, "grad_norm": 0.4256177075268273, "learning_rate": 2.8766096154031198e-05, "loss": 0.0454, "step": 4967 }, { "epoch": 2.260236578707916, "grad_norm": 0.5072715495570999, "learning_rate": 2.8759031075065275e-05, "loss": 0.0401, "step": 4968 }, { "epoch": 2.2606915377616015, "grad_norm": 0.3453124706705502, "learning_rate": 2.87519656889283e-05, "loss": 0.0251, "step": 4969 }, { "epoch": 2.261146496815287, "grad_norm": 1.3061901966524714, "learning_rate": 2.874489999619764e-05, "loss": 0.097, "step": 4970 }, { "epoch": 2.2616014558689717, "grad_norm": 0.6284000953503887, "learning_rate": 2.873783399745066e-05, "loss": 0.1113, "step": 4971 }, { "epoch": 2.262056414922657, "grad_norm": 0.5239934587693348, "learning_rate": 2.8730767693264764e-05, "loss": 0.031, "step": 4972 }, { "epoch": 2.2625113739763423, "grad_norm": 0.4225586070989838, "learning_rate": 2.8723701084217386e-05, "loss": 0.0689, "step": 4973 }, { "epoch": 2.262966333030027, "grad_norm": 0.8480021863056716, "learning_rate": 2.8716634170885958e-05, "loss": 0.0895, "step": 4974 }, { "epoch": 2.2634212920837125, "grad_norm": 0.9970014618075281, "learning_rate": 2.8709566953847984e-05, "loss": 0.1153, "step": 4975 }, { "epoch": 2.2638762511373978, "grad_norm": 0.6025301239725125, "learning_rate": 2.8702499433680953e-05, "loss": 0.0822, "step": 4976 }, { "epoch": 2.2643312101910826, "grad_norm": 0.5726651165342603, "learning_rate": 2.869543161096237e-05, "loss": 0.055, "step": 4977 }, { "epoch": 2.264786169244768, "grad_norm": 0.4329244176314951, "learning_rate": 2.868836348626982e-05, "loss": 0.0359, "step": 4978 }, { "epoch": 2.2652411282984533, "grad_norm": 0.52846084375492, "learning_rate": 2.8681295060180856e-05, "loss": 0.0573, "step": 4979 }, { "epoch": 2.265696087352138, "grad_norm": 0.3784196375752576, "learning_rate": 2.867422633327309e-05, "loss": 0.0283, "step": 4980 }, { "epoch": 2.2661510464058234, "grad_norm": 0.5480041508789959, "learning_rate": 2.8667157306124136e-05, "loss": 0.0343, "step": 4981 }, { "epoch": 2.2666060054595087, "grad_norm": 0.7258360308358968, "learning_rate": 2.8660087979311645e-05, "loss": 0.0636, "step": 4982 }, { "epoch": 2.2670609645131936, "grad_norm": 0.8152296391673908, "learning_rate": 2.8653018353413304e-05, "loss": 0.0964, "step": 4983 }, { "epoch": 2.267515923566879, "grad_norm": 0.4482935017254553, "learning_rate": 2.8645948429006786e-05, "loss": 0.0539, "step": 4984 }, { "epoch": 2.2679708826205642, "grad_norm": 0.6494071274782519, "learning_rate": 2.863887820666984e-05, "loss": 0.0568, "step": 4985 }, { "epoch": 2.268425841674249, "grad_norm": 0.558843153351712, "learning_rate": 2.863180768698019e-05, "loss": 0.0637, "step": 4986 }, { "epoch": 2.2688808007279344, "grad_norm": 0.46647881012734593, "learning_rate": 2.8624736870515624e-05, "loss": 0.0553, "step": 4987 }, { "epoch": 2.2693357597816197, "grad_norm": 0.3491181565775975, "learning_rate": 2.8617665757853923e-05, "loss": 0.0335, "step": 4988 }, { "epoch": 2.2697907188353046, "grad_norm": 0.6044852289497091, "learning_rate": 2.8610594349572917e-05, "loss": 0.0478, "step": 4989 }, { "epoch": 2.27024567788899, "grad_norm": 0.5498420495017519, "learning_rate": 2.8603522646250453e-05, "loss": 0.0502, "step": 4990 }, { "epoch": 2.270700636942675, "grad_norm": 0.5743472950478402, "learning_rate": 2.859645064846438e-05, "loss": 0.0338, "step": 4991 }, { "epoch": 2.2711555959963605, "grad_norm": 0.6336327686039844, "learning_rate": 2.8589378356792606e-05, "loss": 0.0475, "step": 4992 }, { "epoch": 2.2716105550500454, "grad_norm": 0.9023679045829798, "learning_rate": 2.8582305771813046e-05, "loss": 0.1045, "step": 4993 }, { "epoch": 2.2720655141037307, "grad_norm": 0.675650445659818, "learning_rate": 2.8575232894103633e-05, "loss": 0.0548, "step": 4994 }, { "epoch": 2.272520473157416, "grad_norm": 0.48947355859542496, "learning_rate": 2.856815972424233e-05, "loss": 0.0761, "step": 4995 }, { "epoch": 2.272975432211101, "grad_norm": 0.46798878745195893, "learning_rate": 2.856108626280713e-05, "loss": 0.052, "step": 4996 }, { "epoch": 2.273430391264786, "grad_norm": 0.38121244041136865, "learning_rate": 2.855401251037605e-05, "loss": 0.0239, "step": 4997 }, { "epoch": 2.2738853503184715, "grad_norm": 0.6143592106348503, "learning_rate": 2.8546938467527106e-05, "loss": 0.0737, "step": 4998 }, { "epoch": 2.2743403093721564, "grad_norm": 0.5332310345718906, "learning_rate": 2.8539864134838374e-05, "loss": 0.0741, "step": 4999 }, { "epoch": 2.2747952684258417, "grad_norm": 0.4650498363894723, "learning_rate": 2.8532789512887935e-05, "loss": 0.0555, "step": 5000 }, { "epoch": 2.275250227479527, "grad_norm": 1.0687188625967494, "learning_rate": 2.8525714602253885e-05, "loss": 0.0676, "step": 5001 }, { "epoch": 2.275705186533212, "grad_norm": 0.42887900460256057, "learning_rate": 2.8518639403514358e-05, "loss": 0.0348, "step": 5002 }, { "epoch": 2.276160145586897, "grad_norm": 0.5927920407309083, "learning_rate": 2.8511563917247508e-05, "loss": 0.0624, "step": 5003 }, { "epoch": 2.2766151046405825, "grad_norm": 0.4878590636006841, "learning_rate": 2.850448814403152e-05, "loss": 0.0451, "step": 5004 }, { "epoch": 2.2770700636942673, "grad_norm": 0.4857913691809199, "learning_rate": 2.8497412084444586e-05, "loss": 0.0458, "step": 5005 }, { "epoch": 2.2775250227479527, "grad_norm": 0.47801383728406743, "learning_rate": 2.8490335739064927e-05, "loss": 0.0351, "step": 5006 }, { "epoch": 2.277979981801638, "grad_norm": 0.4099750961773968, "learning_rate": 2.8483259108470795e-05, "loss": 0.0715, "step": 5007 }, { "epoch": 2.278434940855323, "grad_norm": 0.570191981495799, "learning_rate": 2.8476182193240458e-05, "loss": 0.0554, "step": 5008 }, { "epoch": 2.278889899909008, "grad_norm": 0.6424336501168779, "learning_rate": 2.846910499395221e-05, "loss": 0.074, "step": 5009 }, { "epoch": 2.2793448589626935, "grad_norm": 0.5196579029864625, "learning_rate": 2.846202751118437e-05, "loss": 0.0804, "step": 5010 }, { "epoch": 2.2797998180163788, "grad_norm": 0.6562478089501705, "learning_rate": 2.845494974551528e-05, "loss": 0.0519, "step": 5011 }, { "epoch": 2.2802547770700636, "grad_norm": 0.5643625459691686, "learning_rate": 2.8447871697523294e-05, "loss": 0.0651, "step": 5012 }, { "epoch": 2.280709736123749, "grad_norm": 0.5214736930460366, "learning_rate": 2.84407933677868e-05, "loss": 0.0533, "step": 5013 }, { "epoch": 2.2811646951774343, "grad_norm": 0.5966838623430893, "learning_rate": 2.8433714756884217e-05, "loss": 0.0644, "step": 5014 }, { "epoch": 2.281619654231119, "grad_norm": 0.4822296982810904, "learning_rate": 2.8426635865393968e-05, "loss": 0.0587, "step": 5015 }, { "epoch": 2.2820746132848044, "grad_norm": 0.28369688959855854, "learning_rate": 2.841955669389451e-05, "loss": 0.0205, "step": 5016 }, { "epoch": 2.2825295723384897, "grad_norm": 0.4995027231518513, "learning_rate": 2.8412477242964326e-05, "loss": 0.0591, "step": 5017 }, { "epoch": 2.2829845313921746, "grad_norm": 0.8256476875876193, "learning_rate": 2.8405397513181907e-05, "loss": 0.0615, "step": 5018 }, { "epoch": 2.28343949044586, "grad_norm": 0.4935672700860798, "learning_rate": 2.8398317505125783e-05, "loss": 0.0496, "step": 5019 }, { "epoch": 2.2838944494995452, "grad_norm": 0.5828887904509668, "learning_rate": 2.8391237219374494e-05, "loss": 0.0538, "step": 5020 }, { "epoch": 2.28434940855323, "grad_norm": 0.46761187911477936, "learning_rate": 2.8384156656506626e-05, "loss": 0.0527, "step": 5021 }, { "epoch": 2.2848043676069154, "grad_norm": 0.6689352332436546, "learning_rate": 2.8377075817100752e-05, "loss": 0.0492, "step": 5022 }, { "epoch": 2.2852593266606007, "grad_norm": 0.48085075866294175, "learning_rate": 2.8369994701735492e-05, "loss": 0.0553, "step": 5023 }, { "epoch": 2.2857142857142856, "grad_norm": 0.4583906316758031, "learning_rate": 2.8362913310989486e-05, "loss": 0.0365, "step": 5024 }, { "epoch": 2.286169244767971, "grad_norm": 0.9283111378120206, "learning_rate": 2.8355831645441388e-05, "loss": 0.0379, "step": 5025 }, { "epoch": 2.286624203821656, "grad_norm": 0.49912113568337335, "learning_rate": 2.8348749705669887e-05, "loss": 0.0598, "step": 5026 }, { "epoch": 2.287079162875341, "grad_norm": 0.42890059956623744, "learning_rate": 2.8341667492253675e-05, "loss": 0.0568, "step": 5027 }, { "epoch": 2.2875341219290264, "grad_norm": 0.6419745655315906, "learning_rate": 2.8334585005771497e-05, "loss": 0.0726, "step": 5028 }, { "epoch": 2.2879890809827117, "grad_norm": 0.39236527156493806, "learning_rate": 2.832750224680209e-05, "loss": 0.0762, "step": 5029 }, { "epoch": 2.2884440400363966, "grad_norm": 0.4799828924357416, "learning_rate": 2.8320419215924215e-05, "loss": 0.0576, "step": 5030 }, { "epoch": 2.288898999090082, "grad_norm": 0.3538751303136044, "learning_rate": 2.8313335913716688e-05, "loss": 0.0472, "step": 5031 }, { "epoch": 2.289353958143767, "grad_norm": 0.49882003674282505, "learning_rate": 2.830625234075831e-05, "loss": 0.0512, "step": 5032 }, { "epoch": 2.289808917197452, "grad_norm": 0.6253315172568089, "learning_rate": 2.829916849762792e-05, "loss": 0.0804, "step": 5033 }, { "epoch": 2.2902638762511374, "grad_norm": 0.3568015996780912, "learning_rate": 2.829208438490438e-05, "loss": 0.0403, "step": 5034 }, { "epoch": 2.2907188353048227, "grad_norm": 0.558000520407452, "learning_rate": 2.8285000003166573e-05, "loss": 0.0652, "step": 5035 }, { "epoch": 2.2911737943585075, "grad_norm": 0.49362955356585075, "learning_rate": 2.82779153529934e-05, "loss": 0.024, "step": 5036 }, { "epoch": 2.291628753412193, "grad_norm": 0.36074158474160506, "learning_rate": 2.827083043496378e-05, "loss": 0.0311, "step": 5037 }, { "epoch": 2.292083712465878, "grad_norm": 0.5027851778769455, "learning_rate": 2.826374524965668e-05, "loss": 0.0711, "step": 5038 }, { "epoch": 2.292538671519563, "grad_norm": 0.5803919964755262, "learning_rate": 2.8256659797651046e-05, "loss": 0.0828, "step": 5039 }, { "epoch": 2.2929936305732483, "grad_norm": 0.43822176691041587, "learning_rate": 2.8249574079525887e-05, "loss": 0.0606, "step": 5040 }, { "epoch": 2.2934485896269337, "grad_norm": 0.41890204913287504, "learning_rate": 2.824248809586021e-05, "loss": 0.0561, "step": 5041 }, { "epoch": 2.2939035486806185, "grad_norm": 0.37986595686201724, "learning_rate": 2.8235401847233044e-05, "loss": 0.0465, "step": 5042 }, { "epoch": 2.294358507734304, "grad_norm": 0.5477812083168319, "learning_rate": 2.822831533422346e-05, "loss": 0.0306, "step": 5043 }, { "epoch": 2.294813466787989, "grad_norm": 1.1210245230655547, "learning_rate": 2.8221228557410505e-05, "loss": 0.072, "step": 5044 }, { "epoch": 2.295268425841674, "grad_norm": 0.4496984419217967, "learning_rate": 2.821414151737332e-05, "loss": 0.087, "step": 5045 }, { "epoch": 2.2957233848953593, "grad_norm": 0.7130740841754619, "learning_rate": 2.8207054214690997e-05, "loss": 0.0638, "step": 5046 }, { "epoch": 2.2961783439490446, "grad_norm": 0.8076572986885169, "learning_rate": 2.8199966649942683e-05, "loss": 0.06, "step": 5047 }, { "epoch": 2.29663330300273, "grad_norm": 0.5724220943518874, "learning_rate": 2.8192878823707553e-05, "loss": 0.0394, "step": 5048 }, { "epoch": 2.297088262056415, "grad_norm": 0.41029284076727446, "learning_rate": 2.8185790736564777e-05, "loss": 0.0307, "step": 5049 }, { "epoch": 2.2975432211101, "grad_norm": 0.6003125425557315, "learning_rate": 2.817870238909358e-05, "loss": 0.0713, "step": 5050 }, { "epoch": 2.2979981801637854, "grad_norm": 0.27162504090525674, "learning_rate": 2.8171613781873168e-05, "loss": 0.0195, "step": 5051 }, { "epoch": 2.2984531392174703, "grad_norm": 0.8059885236367299, "learning_rate": 2.8164524915482805e-05, "loss": 0.0693, "step": 5052 }, { "epoch": 2.2989080982711556, "grad_norm": 0.5822545235751444, "learning_rate": 2.8157435790501756e-05, "loss": 0.0365, "step": 5053 }, { "epoch": 2.299363057324841, "grad_norm": 0.6897864247693075, "learning_rate": 2.815034640750931e-05, "loss": 0.0465, "step": 5054 }, { "epoch": 2.299818016378526, "grad_norm": 0.7750800452731091, "learning_rate": 2.8143256767084785e-05, "loss": 0.0575, "step": 5055 }, { "epoch": 2.300272975432211, "grad_norm": 0.5278717234235438, "learning_rate": 2.8136166869807512e-05, "loss": 0.045, "step": 5056 }, { "epoch": 2.3007279344858964, "grad_norm": 0.6328955742526793, "learning_rate": 2.8129076716256846e-05, "loss": 0.0618, "step": 5057 }, { "epoch": 2.3011828935395813, "grad_norm": 0.4875310717277937, "learning_rate": 2.812198630701216e-05, "loss": 0.0515, "step": 5058 }, { "epoch": 2.3016378525932666, "grad_norm": 1.0455754519316136, "learning_rate": 2.811489564265285e-05, "loss": 0.0841, "step": 5059 }, { "epoch": 2.302092811646952, "grad_norm": 0.3851578802901117, "learning_rate": 2.810780472375834e-05, "loss": 0.0679, "step": 5060 }, { "epoch": 2.3025477707006368, "grad_norm": 0.3357780339462273, "learning_rate": 2.8100713550908053e-05, "loss": 0.0209, "step": 5061 }, { "epoch": 2.303002729754322, "grad_norm": 0.3765435122345253, "learning_rate": 2.8093622124681473e-05, "loss": 0.0294, "step": 5062 }, { "epoch": 2.3034576888080074, "grad_norm": 0.5560881235067677, "learning_rate": 2.808653044565805e-05, "loss": 0.0328, "step": 5063 }, { "epoch": 2.3039126478616927, "grad_norm": 0.5966963062191957, "learning_rate": 2.8079438514417307e-05, "loss": 0.0489, "step": 5064 }, { "epoch": 2.3043676069153776, "grad_norm": 0.45782715508651706, "learning_rate": 2.8072346331538753e-05, "loss": 0.052, "step": 5065 }, { "epoch": 2.304822565969063, "grad_norm": 0.5603665240910509, "learning_rate": 2.8065253897601924e-05, "loss": 0.0455, "step": 5066 }, { "epoch": 2.305277525022748, "grad_norm": 0.5593464215817986, "learning_rate": 2.8058161213186397e-05, "loss": 0.0506, "step": 5067 }, { "epoch": 2.305732484076433, "grad_norm": 0.32913289214640595, "learning_rate": 2.8051068278871746e-05, "loss": 0.0222, "step": 5068 }, { "epoch": 2.3061874431301184, "grad_norm": 0.6110469755294292, "learning_rate": 2.804397509523757e-05, "loss": 0.0373, "step": 5069 }, { "epoch": 2.3066424021838037, "grad_norm": 0.4623269390873473, "learning_rate": 2.8036881662863496e-05, "loss": 0.0415, "step": 5070 }, { "epoch": 2.3070973612374885, "grad_norm": 0.33809362189738995, "learning_rate": 2.802978798232917e-05, "loss": 0.0572, "step": 5071 }, { "epoch": 2.307552320291174, "grad_norm": 0.5286725156666178, "learning_rate": 2.8022694054214248e-05, "loss": 0.0446, "step": 5072 }, { "epoch": 2.308007279344859, "grad_norm": 0.47406707872495996, "learning_rate": 2.801559987909842e-05, "loss": 0.0411, "step": 5073 }, { "epoch": 2.308462238398544, "grad_norm": 0.49261248051075396, "learning_rate": 2.800850545756139e-05, "loss": 0.0696, "step": 5074 }, { "epoch": 2.3089171974522293, "grad_norm": 0.542557527174663, "learning_rate": 2.8001410790182874e-05, "loss": 0.0528, "step": 5075 }, { "epoch": 2.3093721565059147, "grad_norm": 0.7441985465486826, "learning_rate": 2.7994315877542632e-05, "loss": 0.042, "step": 5076 }, { "epoch": 2.3098271155595995, "grad_norm": 0.6291532760042785, "learning_rate": 2.7987220720220412e-05, "loss": 0.0597, "step": 5077 }, { "epoch": 2.310282074613285, "grad_norm": 0.5286554734360585, "learning_rate": 2.7980125318796007e-05, "loss": 0.0822, "step": 5078 }, { "epoch": 2.31073703366697, "grad_norm": 0.3042031222221176, "learning_rate": 2.7973029673849222e-05, "loss": 0.04, "step": 5079 }, { "epoch": 2.311191992720655, "grad_norm": 0.4604920066167865, "learning_rate": 2.796593378595987e-05, "loss": 0.049, "step": 5080 }, { "epoch": 2.3116469517743403, "grad_norm": 0.538690357714281, "learning_rate": 2.7958837655707814e-05, "loss": 0.1078, "step": 5081 }, { "epoch": 2.3121019108280256, "grad_norm": 0.4046019450799734, "learning_rate": 2.79517412836729e-05, "loss": 0.0626, "step": 5082 }, { "epoch": 2.3125568698817105, "grad_norm": 0.4867214588982794, "learning_rate": 2.7944644670435015e-05, "loss": 0.0467, "step": 5083 }, { "epoch": 2.313011828935396, "grad_norm": 0.7675692735009414, "learning_rate": 2.7937547816574073e-05, "loss": 0.059, "step": 5084 }, { "epoch": 2.313466787989081, "grad_norm": 0.4373168792114881, "learning_rate": 2.7930450722669987e-05, "loss": 0.0718, "step": 5085 }, { "epoch": 2.313921747042766, "grad_norm": 0.8599475690822104, "learning_rate": 2.7923353389302698e-05, "loss": 0.1482, "step": 5086 }, { "epoch": 2.3143767060964513, "grad_norm": 0.5052737131535356, "learning_rate": 2.7916255817052178e-05, "loss": 0.0716, "step": 5087 }, { "epoch": 2.3148316651501366, "grad_norm": 0.4622393660999926, "learning_rate": 2.7909158006498398e-05, "loss": 0.0494, "step": 5088 }, { "epoch": 2.3152866242038215, "grad_norm": 0.6666314029475584, "learning_rate": 2.790205995822136e-05, "loss": 0.0821, "step": 5089 }, { "epoch": 2.315741583257507, "grad_norm": 0.4919858040145226, "learning_rate": 2.7894961672801095e-05, "loss": 0.0594, "step": 5090 }, { "epoch": 2.316196542311192, "grad_norm": 0.48653402092383324, "learning_rate": 2.7887863150817635e-05, "loss": 0.0506, "step": 5091 }, { "epoch": 2.316651501364877, "grad_norm": 0.5540839055068407, "learning_rate": 2.788076439285104e-05, "loss": 0.0644, "step": 5092 }, { "epoch": 2.3171064604185623, "grad_norm": 0.42456558818878537, "learning_rate": 2.787366539948138e-05, "loss": 0.0504, "step": 5093 }, { "epoch": 2.3175614194722476, "grad_norm": 0.6883815287581728, "learning_rate": 2.786656617128877e-05, "loss": 0.0651, "step": 5094 }, { "epoch": 2.3180163785259325, "grad_norm": 0.4224092753043764, "learning_rate": 2.7859466708853315e-05, "loss": 0.047, "step": 5095 }, { "epoch": 2.3184713375796178, "grad_norm": 0.6260312434891583, "learning_rate": 2.7852367012755148e-05, "loss": 0.0683, "step": 5096 }, { "epoch": 2.318926296633303, "grad_norm": 0.45302679768017406, "learning_rate": 2.7845267083574432e-05, "loss": 0.0752, "step": 5097 }, { "epoch": 2.319381255686988, "grad_norm": 0.4919199962044806, "learning_rate": 2.7838166921891352e-05, "loss": 0.0488, "step": 5098 }, { "epoch": 2.3198362147406733, "grad_norm": 0.4735874589578503, "learning_rate": 2.7831066528286076e-05, "loss": 0.0814, "step": 5099 }, { "epoch": 2.3202911737943586, "grad_norm": 0.4843320153901606, "learning_rate": 2.7823965903338826e-05, "loss": 0.0329, "step": 5100 }, { "epoch": 2.3207461328480434, "grad_norm": 0.6240006589628496, "learning_rate": 2.7816865047629847e-05, "loss": 0.0489, "step": 5101 }, { "epoch": 2.3212010919017287, "grad_norm": 0.5548097301489215, "learning_rate": 2.7809763961739367e-05, "loss": 0.0344, "step": 5102 }, { "epoch": 2.321656050955414, "grad_norm": 0.5462020728004968, "learning_rate": 2.7802662646247667e-05, "loss": 0.0504, "step": 5103 }, { "epoch": 2.3221110100090994, "grad_norm": 0.530741221216726, "learning_rate": 2.7795561101735035e-05, "loss": 0.0678, "step": 5104 }, { "epoch": 2.3225659690627842, "grad_norm": 0.47170594539401844, "learning_rate": 2.7788459328781776e-05, "loss": 0.0406, "step": 5105 }, { "epoch": 2.3230209281164695, "grad_norm": 0.472234709764689, "learning_rate": 2.778135732796821e-05, "loss": 0.07, "step": 5106 }, { "epoch": 2.323475887170155, "grad_norm": 0.457493191315752, "learning_rate": 2.7774255099874674e-05, "loss": 0.0294, "step": 5107 }, { "epoch": 2.3239308462238397, "grad_norm": 0.771398315707496, "learning_rate": 2.7767152645081556e-05, "loss": 0.0628, "step": 5108 }, { "epoch": 2.324385805277525, "grad_norm": 0.36537508441669775, "learning_rate": 2.776004996416921e-05, "loss": 0.0437, "step": 5109 }, { "epoch": 2.3248407643312103, "grad_norm": 0.6358618623394726, "learning_rate": 2.775294705771805e-05, "loss": 0.0507, "step": 5110 }, { "epoch": 2.325295723384895, "grad_norm": 0.4703702325616176, "learning_rate": 2.774584392630849e-05, "loss": 0.066, "step": 5111 }, { "epoch": 2.3257506824385805, "grad_norm": 0.5473898466534027, "learning_rate": 2.773874057052096e-05, "loss": 0.0728, "step": 5112 }, { "epoch": 2.326205641492266, "grad_norm": 0.5791819492027629, "learning_rate": 2.7731636990935923e-05, "loss": 0.0689, "step": 5113 }, { "epoch": 2.3266606005459507, "grad_norm": 0.6750103521260322, "learning_rate": 2.772453318813384e-05, "loss": 0.0605, "step": 5114 }, { "epoch": 2.327115559599636, "grad_norm": 0.4081110160649325, "learning_rate": 2.7717429162695215e-05, "loss": 0.053, "step": 5115 }, { "epoch": 2.3275705186533213, "grad_norm": 0.500486973938597, "learning_rate": 2.7710324915200546e-05, "loss": 0.0388, "step": 5116 }, { "epoch": 2.328025477707006, "grad_norm": 0.49494157221246265, "learning_rate": 2.7703220446230364e-05, "loss": 0.0653, "step": 5117 }, { "epoch": 2.3284804367606915, "grad_norm": 0.6634844360248779, "learning_rate": 2.7696115756365228e-05, "loss": 0.0793, "step": 5118 }, { "epoch": 2.328935395814377, "grad_norm": 0.5783910378008713, "learning_rate": 2.768901084618567e-05, "loss": 0.0506, "step": 5119 }, { "epoch": 2.329390354868062, "grad_norm": 0.5036604222633562, "learning_rate": 2.7681905716272304e-05, "loss": 0.0684, "step": 5120 }, { "epoch": 2.329845313921747, "grad_norm": 0.5207568200770253, "learning_rate": 2.7674800367205704e-05, "loss": 0.0527, "step": 5121 }, { "epoch": 2.3303002729754323, "grad_norm": 0.4072368494649761, "learning_rate": 2.76676947995665e-05, "loss": 0.0319, "step": 5122 }, { "epoch": 2.3307552320291176, "grad_norm": 0.34800261360287493, "learning_rate": 2.766058901393533e-05, "loss": 0.053, "step": 5123 }, { "epoch": 2.3312101910828025, "grad_norm": 0.39451420556502637, "learning_rate": 2.765348301089283e-05, "loss": 0.045, "step": 5124 }, { "epoch": 2.331665150136488, "grad_norm": 0.43451876474358386, "learning_rate": 2.764637679101969e-05, "loss": 0.067, "step": 5125 }, { "epoch": 2.332120109190173, "grad_norm": 0.6633127807568905, "learning_rate": 2.7639270354896586e-05, "loss": 0.04, "step": 5126 }, { "epoch": 2.332575068243858, "grad_norm": 0.5662499658807225, "learning_rate": 2.763216370310423e-05, "loss": 0.0465, "step": 5127 }, { "epoch": 2.3330300272975433, "grad_norm": 0.4973237206119738, "learning_rate": 2.762505683622334e-05, "loss": 0.0307, "step": 5128 }, { "epoch": 2.3334849863512286, "grad_norm": 0.5386690414631146, "learning_rate": 2.761794975483466e-05, "loss": 0.1059, "step": 5129 }, { "epoch": 2.3339399454049135, "grad_norm": 0.46301877590218155, "learning_rate": 2.7610842459518953e-05, "loss": 0.0496, "step": 5130 }, { "epoch": 2.3343949044585988, "grad_norm": 0.4048889435464769, "learning_rate": 2.760373495085698e-05, "loss": 0.0314, "step": 5131 }, { "epoch": 2.334849863512284, "grad_norm": 0.5658324861541996, "learning_rate": 2.7596627229429556e-05, "loss": 0.0549, "step": 5132 }, { "epoch": 2.335304822565969, "grad_norm": 0.46148860179893764, "learning_rate": 2.7589519295817478e-05, "loss": 0.0629, "step": 5133 }, { "epoch": 2.3357597816196543, "grad_norm": 0.4101088642123228, "learning_rate": 2.758241115060158e-05, "loss": 0.0569, "step": 5134 }, { "epoch": 2.3362147406733396, "grad_norm": 0.5744897711131312, "learning_rate": 2.75753027943627e-05, "loss": 0.0893, "step": 5135 }, { "epoch": 2.3366696997270244, "grad_norm": 0.593633500533872, "learning_rate": 2.7568194227681702e-05, "loss": 0.0606, "step": 5136 }, { "epoch": 2.3371246587807097, "grad_norm": 0.4397200333463446, "learning_rate": 2.756108545113948e-05, "loss": 0.0507, "step": 5137 }, { "epoch": 2.337579617834395, "grad_norm": 0.36232529808070635, "learning_rate": 2.7553976465316918e-05, "loss": 0.0415, "step": 5138 }, { "epoch": 2.33803457688808, "grad_norm": 0.4422648147706471, "learning_rate": 2.754686727079493e-05, "loss": 0.026, "step": 5139 }, { "epoch": 2.3384895359417652, "grad_norm": 0.39056625378638704, "learning_rate": 2.753975786815445e-05, "loss": 0.0414, "step": 5140 }, { "epoch": 2.3389444949954505, "grad_norm": 0.5018563478878245, "learning_rate": 2.753264825797643e-05, "loss": 0.0915, "step": 5141 }, { "epoch": 2.3393994540491354, "grad_norm": 0.5088081013293235, "learning_rate": 2.7525538440841824e-05, "loss": 0.0757, "step": 5142 }, { "epoch": 2.3398544131028207, "grad_norm": 0.7620587180382967, "learning_rate": 2.7518428417331626e-05, "loss": 0.0715, "step": 5143 }, { "epoch": 2.340309372156506, "grad_norm": 0.6297700356548471, "learning_rate": 2.7511318188026835e-05, "loss": 0.0617, "step": 5144 }, { "epoch": 2.340764331210191, "grad_norm": 0.5506771684183895, "learning_rate": 2.750420775350846e-05, "loss": 0.0634, "step": 5145 }, { "epoch": 2.341219290263876, "grad_norm": 0.4829072891809961, "learning_rate": 2.749709711435753e-05, "loss": 0.0517, "step": 5146 }, { "epoch": 2.3416742493175615, "grad_norm": 0.47614714959607085, "learning_rate": 2.748998627115511e-05, "loss": 0.0483, "step": 5147 }, { "epoch": 2.3421292083712464, "grad_norm": 0.37976109997969476, "learning_rate": 2.7482875224482253e-05, "loss": 0.0403, "step": 5148 }, { "epoch": 2.3425841674249317, "grad_norm": 0.35730163235557677, "learning_rate": 2.7475763974920043e-05, "loss": 0.027, "step": 5149 }, { "epoch": 2.343039126478617, "grad_norm": 0.44026406139665913, "learning_rate": 2.7468652523049583e-05, "loss": 0.0435, "step": 5150 }, { "epoch": 2.343494085532302, "grad_norm": 0.5281574931021025, "learning_rate": 2.746154086945199e-05, "loss": 0.0727, "step": 5151 }, { "epoch": 2.343949044585987, "grad_norm": 0.5166419028428526, "learning_rate": 2.7454429014708387e-05, "loss": 0.0428, "step": 5152 }, { "epoch": 2.3444040036396725, "grad_norm": 0.4082204640592235, "learning_rate": 2.7447316959399933e-05, "loss": 0.0361, "step": 5153 }, { "epoch": 2.3448589626933574, "grad_norm": 0.6037342239870883, "learning_rate": 2.7440204704107792e-05, "loss": 0.0742, "step": 5154 }, { "epoch": 2.3453139217470427, "grad_norm": 0.4564205521317059, "learning_rate": 2.743309224941314e-05, "loss": 0.0676, "step": 5155 }, { "epoch": 2.345768880800728, "grad_norm": 0.6736274915332963, "learning_rate": 2.7425979595897172e-05, "loss": 0.0446, "step": 5156 }, { "epoch": 2.3462238398544133, "grad_norm": 0.4387178266814777, "learning_rate": 2.741886674414112e-05, "loss": 0.0512, "step": 5157 }, { "epoch": 2.346678798908098, "grad_norm": 0.42412506772735736, "learning_rate": 2.741175369472619e-05, "loss": 0.0402, "step": 5158 }, { "epoch": 2.3471337579617835, "grad_norm": 0.48254086712072747, "learning_rate": 2.7404640448233638e-05, "loss": 0.0462, "step": 5159 }, { "epoch": 2.347588717015469, "grad_norm": 0.9894516538837362, "learning_rate": 2.7397527005244734e-05, "loss": 0.0989, "step": 5160 }, { "epoch": 2.3480436760691537, "grad_norm": 0.8268373414855967, "learning_rate": 2.739041336634075e-05, "loss": 0.1191, "step": 5161 }, { "epoch": 2.348498635122839, "grad_norm": 0.6551177907103193, "learning_rate": 2.738329953210298e-05, "loss": 0.0649, "step": 5162 }, { "epoch": 2.3489535941765243, "grad_norm": 0.3830030959403386, "learning_rate": 2.737618550311273e-05, "loss": 0.0307, "step": 5163 }, { "epoch": 2.349408553230209, "grad_norm": 0.7656513913936133, "learning_rate": 2.736907127995134e-05, "loss": 0.0939, "step": 5164 }, { "epoch": 2.3498635122838945, "grad_norm": 0.5164289617361199, "learning_rate": 2.736195686320014e-05, "loss": 0.0556, "step": 5165 }, { "epoch": 2.3503184713375798, "grad_norm": 0.5297700427045324, "learning_rate": 2.7354842253440487e-05, "loss": 0.0703, "step": 5166 }, { "epoch": 2.3507734303912646, "grad_norm": 0.44110957592694594, "learning_rate": 2.7347727451253763e-05, "loss": 0.0453, "step": 5167 }, { "epoch": 2.35122838944495, "grad_norm": 0.38956372452990234, "learning_rate": 2.7340612457221354e-05, "loss": 0.0338, "step": 5168 }, { "epoch": 2.3516833484986353, "grad_norm": 0.4197033125720248, "learning_rate": 2.7333497271924668e-05, "loss": 0.0554, "step": 5169 }, { "epoch": 2.35213830755232, "grad_norm": 0.40424697379572827, "learning_rate": 2.732638189594512e-05, "loss": 0.0771, "step": 5170 }, { "epoch": 2.3525932666060054, "grad_norm": 0.5541694062206072, "learning_rate": 2.7319266329864152e-05, "loss": 0.056, "step": 5171 }, { "epoch": 2.3530482256596907, "grad_norm": 0.6356358752477014, "learning_rate": 2.7312150574263208e-05, "loss": 0.0464, "step": 5172 }, { "epoch": 2.3535031847133756, "grad_norm": 0.41247526184321015, "learning_rate": 2.7305034629723765e-05, "loss": 0.0325, "step": 5173 }, { "epoch": 2.353958143767061, "grad_norm": 0.526926726849789, "learning_rate": 2.72979184968273e-05, "loss": 0.0681, "step": 5174 }, { "epoch": 2.3544131028207462, "grad_norm": 0.4952643366406119, "learning_rate": 2.7290802176155313e-05, "loss": 0.0393, "step": 5175 }, { "epoch": 2.3548680618744315, "grad_norm": 0.43312148695945296, "learning_rate": 2.728368566828932e-05, "loss": 0.0349, "step": 5176 }, { "epoch": 2.3553230209281164, "grad_norm": 0.3721532576651426, "learning_rate": 2.7276568973810833e-05, "loss": 0.0314, "step": 5177 }, { "epoch": 2.3557779799818017, "grad_norm": 0.44642974566441096, "learning_rate": 2.7269452093301427e-05, "loss": 0.0346, "step": 5178 }, { "epoch": 2.356232939035487, "grad_norm": 0.5322041327778935, "learning_rate": 2.726233502734264e-05, "loss": 0.036, "step": 5179 }, { "epoch": 2.356687898089172, "grad_norm": 0.5665279975852141, "learning_rate": 2.725521777651605e-05, "loss": 0.0609, "step": 5180 }, { "epoch": 2.357142857142857, "grad_norm": 0.39524299808152386, "learning_rate": 2.7248100341403247e-05, "loss": 0.0417, "step": 5181 }, { "epoch": 2.3575978161965425, "grad_norm": 0.37957218658131164, "learning_rate": 2.724098272258584e-05, "loss": 0.0441, "step": 5182 }, { "epoch": 2.3580527752502274, "grad_norm": 0.37127626648329537, "learning_rate": 2.7233864920645445e-05, "loss": 0.0508, "step": 5183 }, { "epoch": 2.3585077343039127, "grad_norm": 0.5690733543668482, "learning_rate": 2.7226746936163687e-05, "loss": 0.0646, "step": 5184 }, { "epoch": 2.358962693357598, "grad_norm": 0.44843633073835765, "learning_rate": 2.7219628769722237e-05, "loss": 0.0435, "step": 5185 }, { "epoch": 2.359417652411283, "grad_norm": 0.32183702513260537, "learning_rate": 2.7212510421902742e-05, "loss": 0.0385, "step": 5186 }, { "epoch": 2.359872611464968, "grad_norm": 0.43792250131296595, "learning_rate": 2.720539189328689e-05, "loss": 0.0366, "step": 5187 }, { "epoch": 2.3603275705186535, "grad_norm": 0.46965752502381175, "learning_rate": 2.7198273184456376e-05, "loss": 0.0482, "step": 5188 }, { "epoch": 2.3607825295723384, "grad_norm": 0.47411735082046924, "learning_rate": 2.7191154295992894e-05, "loss": 0.0581, "step": 5189 }, { "epoch": 2.3612374886260237, "grad_norm": 0.3108599458461967, "learning_rate": 2.7184035228478188e-05, "loss": 0.0276, "step": 5190 }, { "epoch": 2.361692447679709, "grad_norm": 0.5130278262349623, "learning_rate": 2.717691598249397e-05, "loss": 0.0476, "step": 5191 }, { "epoch": 2.362147406733394, "grad_norm": 0.41845479622614845, "learning_rate": 2.7169796558622028e-05, "loss": 0.0399, "step": 5192 }, { "epoch": 2.362602365787079, "grad_norm": 0.6086218364924232, "learning_rate": 2.7162676957444106e-05, "loss": 0.0726, "step": 5193 }, { "epoch": 2.3630573248407645, "grad_norm": 0.5316195216238105, "learning_rate": 2.715555717954198e-05, "loss": 0.0398, "step": 5194 }, { "epoch": 2.3635122838944493, "grad_norm": 0.517613445608115, "learning_rate": 2.7148437225497464e-05, "loss": 0.0338, "step": 5195 }, { "epoch": 2.3639672429481347, "grad_norm": 0.6102833161915936, "learning_rate": 2.7141317095892353e-05, "loss": 0.0603, "step": 5196 }, { "epoch": 2.36442220200182, "grad_norm": 0.37681464670998666, "learning_rate": 2.713419679130849e-05, "loss": 0.038, "step": 5197 }, { "epoch": 2.364877161055505, "grad_norm": 0.5913281764076884, "learning_rate": 2.7127076312327693e-05, "loss": 0.0621, "step": 5198 }, { "epoch": 2.36533212010919, "grad_norm": 0.43385564960243533, "learning_rate": 2.711995565953183e-05, "loss": 0.0515, "step": 5199 }, { "epoch": 2.3657870791628755, "grad_norm": 0.4443299763049423, "learning_rate": 2.7112834833502763e-05, "loss": 0.0456, "step": 5200 }, { "epoch": 2.3662420382165603, "grad_norm": 0.5679964851157858, "learning_rate": 2.710571383482237e-05, "loss": 0.0595, "step": 5201 }, { "epoch": 2.3666969972702456, "grad_norm": 0.4942354477745815, "learning_rate": 2.7098592664072563e-05, "loss": 0.0454, "step": 5202 }, { "epoch": 2.367151956323931, "grad_norm": 0.4893949819579538, "learning_rate": 2.7091471321835234e-05, "loss": 0.0462, "step": 5203 }, { "epoch": 2.367606915377616, "grad_norm": 0.7470757388044067, "learning_rate": 2.7084349808692318e-05, "loss": 0.0423, "step": 5204 }, { "epoch": 2.368061874431301, "grad_norm": 0.5271813999083532, "learning_rate": 2.707722812522574e-05, "loss": 0.0716, "step": 5205 }, { "epoch": 2.3685168334849864, "grad_norm": 0.5189918427409667, "learning_rate": 2.7070106272017465e-05, "loss": 0.043, "step": 5206 }, { "epoch": 2.3689717925386713, "grad_norm": 0.5117845880107829, "learning_rate": 2.706298424964946e-05, "loss": 0.0433, "step": 5207 }, { "epoch": 2.3694267515923566, "grad_norm": 0.3851588762961197, "learning_rate": 2.7055862058703685e-05, "loss": 0.0458, "step": 5208 }, { "epoch": 2.369881710646042, "grad_norm": 0.6023545181866938, "learning_rate": 2.704873969976216e-05, "loss": 0.0571, "step": 5209 }, { "epoch": 2.370336669699727, "grad_norm": 0.5529474783508186, "learning_rate": 2.7041617173406874e-05, "loss": 0.0527, "step": 5210 }, { "epoch": 2.370791628753412, "grad_norm": 0.6182590064291531, "learning_rate": 2.703449448021985e-05, "loss": 0.0651, "step": 5211 }, { "epoch": 2.3712465878070974, "grad_norm": 0.34012561923005014, "learning_rate": 2.7027371620783127e-05, "loss": 0.0277, "step": 5212 }, { "epoch": 2.3717015468607827, "grad_norm": 0.43072616369306566, "learning_rate": 2.7020248595678743e-05, "loss": 0.0313, "step": 5213 }, { "epoch": 2.3721565059144676, "grad_norm": 0.3663266716741045, "learning_rate": 2.7013125405488782e-05, "loss": 0.0293, "step": 5214 }, { "epoch": 2.372611464968153, "grad_norm": 0.4872086376510781, "learning_rate": 2.7006002050795293e-05, "loss": 0.1053, "step": 5215 }, { "epoch": 2.373066424021838, "grad_norm": 0.5193791156961444, "learning_rate": 2.699887853218038e-05, "loss": 0.057, "step": 5216 }, { "epoch": 2.373521383075523, "grad_norm": 0.44064315186794417, "learning_rate": 2.699175485022614e-05, "loss": 0.0352, "step": 5217 }, { "epoch": 2.3739763421292084, "grad_norm": 0.33877904274421183, "learning_rate": 2.6984631005514683e-05, "loss": 0.0375, "step": 5218 }, { "epoch": 2.3744313011828937, "grad_norm": 0.5187100740396482, "learning_rate": 2.697750699862815e-05, "loss": 0.0469, "step": 5219 }, { "epoch": 2.3748862602365786, "grad_norm": 0.40245623489336074, "learning_rate": 2.6970382830148666e-05, "loss": 0.0441, "step": 5220 }, { "epoch": 2.375341219290264, "grad_norm": 0.44292395716408733, "learning_rate": 2.6963258500658405e-05, "loss": 0.0421, "step": 5221 }, { "epoch": 2.375796178343949, "grad_norm": 0.5871165551699794, "learning_rate": 2.695613401073952e-05, "loss": 0.0462, "step": 5222 }, { "epoch": 2.376251137397634, "grad_norm": 0.44492299930673307, "learning_rate": 2.6949009360974197e-05, "loss": 0.0397, "step": 5223 }, { "epoch": 2.3767060964513194, "grad_norm": 0.5438875620567183, "learning_rate": 2.694188455194464e-05, "loss": 0.0467, "step": 5224 }, { "epoch": 2.3771610555050047, "grad_norm": 0.5853815035877634, "learning_rate": 2.6934759584233037e-05, "loss": 0.0792, "step": 5225 }, { "epoch": 2.3776160145586895, "grad_norm": 0.7533386511311315, "learning_rate": 2.692763445842162e-05, "loss": 0.0796, "step": 5226 }, { "epoch": 2.378070973612375, "grad_norm": 0.44260077954293436, "learning_rate": 2.6920509175092622e-05, "loss": 0.0349, "step": 5227 }, { "epoch": 2.37852593266606, "grad_norm": 1.216069154850525, "learning_rate": 2.691338373482829e-05, "loss": 0.0683, "step": 5228 }, { "epoch": 2.3789808917197455, "grad_norm": 0.4286194684814938, "learning_rate": 2.6906258138210873e-05, "loss": 0.0471, "step": 5229 }, { "epoch": 2.3794358507734303, "grad_norm": 0.37488260232457216, "learning_rate": 2.689913238582265e-05, "loss": 0.0496, "step": 5230 }, { "epoch": 2.3798908098271156, "grad_norm": 0.5087441185799831, "learning_rate": 2.689200647824591e-05, "loss": 0.0823, "step": 5231 }, { "epoch": 2.380345768880801, "grad_norm": 0.5226295595015015, "learning_rate": 2.6884880416062942e-05, "loss": 0.0684, "step": 5232 }, { "epoch": 2.380800727934486, "grad_norm": 0.48696615357125417, "learning_rate": 2.6877754199856058e-05, "loss": 0.0465, "step": 5233 }, { "epoch": 2.381255686988171, "grad_norm": 0.5958490033996262, "learning_rate": 2.6870627830207583e-05, "loss": 0.054, "step": 5234 }, { "epoch": 2.3817106460418564, "grad_norm": 0.6926703222592203, "learning_rate": 2.686350130769985e-05, "loss": 0.0849, "step": 5235 }, { "epoch": 2.3821656050955413, "grad_norm": 0.31603624782726014, "learning_rate": 2.68563746329152e-05, "loss": 0.0219, "step": 5236 }, { "epoch": 2.3826205641492266, "grad_norm": 0.3965506835409313, "learning_rate": 2.6849247806436002e-05, "loss": 0.0393, "step": 5237 }, { "epoch": 2.383075523202912, "grad_norm": 0.45949881929583397, "learning_rate": 2.6842120828844625e-05, "loss": 0.0851, "step": 5238 }, { "epoch": 2.383530482256597, "grad_norm": 0.513006622939777, "learning_rate": 2.6834993700723453e-05, "loss": 0.0485, "step": 5239 }, { "epoch": 2.383985441310282, "grad_norm": 0.48353422680044034, "learning_rate": 2.682786642265488e-05, "loss": 0.035, "step": 5240 }, { "epoch": 2.3844404003639674, "grad_norm": 0.8943753535210223, "learning_rate": 2.6820738995221324e-05, "loss": 0.0424, "step": 5241 }, { "epoch": 2.3848953594176523, "grad_norm": 0.4812094105795155, "learning_rate": 2.681361141900519e-05, "loss": 0.0537, "step": 5242 }, { "epoch": 2.3853503184713376, "grad_norm": 0.5901547264001639, "learning_rate": 2.6806483694588923e-05, "loss": 0.0708, "step": 5243 }, { "epoch": 2.385805277525023, "grad_norm": 0.604936694424851, "learning_rate": 2.6799355822554972e-05, "loss": 0.0625, "step": 5244 }, { "epoch": 2.386260236578708, "grad_norm": 0.6500289074912193, "learning_rate": 2.6792227803485785e-05, "loss": 0.0586, "step": 5245 }, { "epoch": 2.386715195632393, "grad_norm": 0.5066564011098335, "learning_rate": 2.6785099637963846e-05, "loss": 0.0363, "step": 5246 }, { "epoch": 2.3871701546860784, "grad_norm": 0.4147341404163081, "learning_rate": 2.6777971326571605e-05, "loss": 0.0532, "step": 5247 }, { "epoch": 2.3876251137397633, "grad_norm": 0.522518530672865, "learning_rate": 2.6770842869891594e-05, "loss": 0.0436, "step": 5248 }, { "epoch": 2.3880800727934486, "grad_norm": 0.6821822671554195, "learning_rate": 2.6763714268506294e-05, "loss": 0.0585, "step": 5249 }, { "epoch": 2.388535031847134, "grad_norm": 0.4532009401434036, "learning_rate": 2.675658552299823e-05, "loss": 0.0352, "step": 5250 }, { "epoch": 2.3889899909008188, "grad_norm": 0.514603448045486, "learning_rate": 2.6749456633949932e-05, "loss": 0.0617, "step": 5251 }, { "epoch": 2.389444949954504, "grad_norm": 0.554536574740188, "learning_rate": 2.6742327601943934e-05, "loss": 0.0554, "step": 5252 }, { "epoch": 2.3898999090081894, "grad_norm": 0.4818374612546212, "learning_rate": 2.6735198427562796e-05, "loss": 0.0615, "step": 5253 }, { "epoch": 2.3903548680618742, "grad_norm": 0.6409613452376246, "learning_rate": 2.672806911138907e-05, "loss": 0.0505, "step": 5254 }, { "epoch": 2.3908098271155596, "grad_norm": 0.4726020763155465, "learning_rate": 2.6720939654005357e-05, "loss": 0.0459, "step": 5255 }, { "epoch": 2.391264786169245, "grad_norm": 0.33143472004021896, "learning_rate": 2.6713810055994216e-05, "loss": 0.0259, "step": 5256 }, { "epoch": 2.3917197452229297, "grad_norm": 0.51336264994708, "learning_rate": 2.6706680317938253e-05, "loss": 0.0538, "step": 5257 }, { "epoch": 2.392174704276615, "grad_norm": 0.4826621200818224, "learning_rate": 2.669955044042009e-05, "loss": 0.0502, "step": 5258 }, { "epoch": 2.3926296633303004, "grad_norm": 1.0119895142834288, "learning_rate": 2.6692420424022335e-05, "loss": 0.0609, "step": 5259 }, { "epoch": 2.3930846223839852, "grad_norm": 1.274040681722149, "learning_rate": 2.6685290269327633e-05, "loss": 0.1512, "step": 5260 }, { "epoch": 2.3935395814376705, "grad_norm": 0.578227644468989, "learning_rate": 2.667815997691861e-05, "loss": 0.0571, "step": 5261 }, { "epoch": 2.393994540491356, "grad_norm": 0.6785358509857673, "learning_rate": 2.667102954737794e-05, "loss": 0.0561, "step": 5262 }, { "epoch": 2.3944494995450407, "grad_norm": 0.5907936651443437, "learning_rate": 2.666389898128828e-05, "loss": 0.0298, "step": 5263 }, { "epoch": 2.394904458598726, "grad_norm": 0.5342281780439325, "learning_rate": 2.6656768279232307e-05, "loss": 0.0499, "step": 5264 }, { "epoch": 2.3953594176524113, "grad_norm": 0.854922386649433, "learning_rate": 2.6649637441792718e-05, "loss": 0.0923, "step": 5265 }, { "epoch": 2.395814376706096, "grad_norm": 0.4838725780184853, "learning_rate": 2.6642506469552197e-05, "loss": 0.0408, "step": 5266 }, { "epoch": 2.3962693357597815, "grad_norm": 0.5291502186077292, "learning_rate": 2.6635375363093477e-05, "loss": 0.0654, "step": 5267 }, { "epoch": 2.396724294813467, "grad_norm": 0.550105953124475, "learning_rate": 2.6628244122999264e-05, "loss": 0.0581, "step": 5268 }, { "epoch": 2.397179253867152, "grad_norm": 0.5606512476774962, "learning_rate": 2.662111274985229e-05, "loss": 0.1023, "step": 5269 }, { "epoch": 2.397634212920837, "grad_norm": 0.4839086913419936, "learning_rate": 2.6613981244235307e-05, "loss": 0.0415, "step": 5270 }, { "epoch": 2.3980891719745223, "grad_norm": 0.43096568137654967, "learning_rate": 2.6606849606731053e-05, "loss": 0.0311, "step": 5271 }, { "epoch": 2.3985441310282076, "grad_norm": 0.6333298827570828, "learning_rate": 2.6599717837922324e-05, "loss": 0.0495, "step": 5272 }, { "epoch": 2.3989990900818925, "grad_norm": 0.4240757726983691, "learning_rate": 2.6592585938391867e-05, "loss": 0.0551, "step": 5273 }, { "epoch": 2.399454049135578, "grad_norm": 0.5228453100573016, "learning_rate": 2.6585453908722485e-05, "loss": 0.0589, "step": 5274 }, { "epoch": 2.399909008189263, "grad_norm": 0.4819383048499104, "learning_rate": 2.6578321749496965e-05, "loss": 0.0725, "step": 5275 }, { "epoch": 2.400363967242948, "grad_norm": 0.3990268928728333, "learning_rate": 2.6571189461298118e-05, "loss": 0.0372, "step": 5276 }, { "epoch": 2.4008189262966333, "grad_norm": 0.47853354679906235, "learning_rate": 2.656405704470877e-05, "loss": 0.0325, "step": 5277 }, { "epoch": 2.4012738853503186, "grad_norm": 0.36710073674975263, "learning_rate": 2.6556924500311732e-05, "loss": 0.0352, "step": 5278 }, { "epoch": 2.4017288444040035, "grad_norm": 0.9265931506483782, "learning_rate": 2.6549791828689862e-05, "loss": 0.0834, "step": 5279 }, { "epoch": 2.402183803457689, "grad_norm": 0.6975870275909408, "learning_rate": 2.654265903042601e-05, "loss": 0.1023, "step": 5280 }, { "epoch": 2.402638762511374, "grad_norm": 0.5578430611741171, "learning_rate": 2.653552610610302e-05, "loss": 0.0665, "step": 5281 }, { "epoch": 2.403093721565059, "grad_norm": 0.5415100541962509, "learning_rate": 2.652839305630377e-05, "loss": 0.0478, "step": 5282 }, { "epoch": 2.4035486806187443, "grad_norm": 0.5061489423626414, "learning_rate": 2.6521259881611144e-05, "loss": 0.0429, "step": 5283 }, { "epoch": 2.4040036396724296, "grad_norm": 0.540384644831248, "learning_rate": 2.6514126582608033e-05, "loss": 0.0555, "step": 5284 }, { "epoch": 2.404458598726115, "grad_norm": 0.6588558688578203, "learning_rate": 2.650699315987733e-05, "loss": 0.0669, "step": 5285 }, { "epoch": 2.4049135577797998, "grad_norm": 0.6761456266538293, "learning_rate": 2.6499859614001954e-05, "loss": 0.0664, "step": 5286 }, { "epoch": 2.405368516833485, "grad_norm": 0.9754481263014255, "learning_rate": 2.6492725945564827e-05, "loss": 0.065, "step": 5287 }, { "epoch": 2.4058234758871704, "grad_norm": 0.48828786954198405, "learning_rate": 2.6485592155148874e-05, "loss": 0.0559, "step": 5288 }, { "epoch": 2.4062784349408552, "grad_norm": 0.4496111916462132, "learning_rate": 2.6478458243337034e-05, "loss": 0.0338, "step": 5289 }, { "epoch": 2.4067333939945406, "grad_norm": 0.45101724505290247, "learning_rate": 2.6471324210712267e-05, "loss": 0.0367, "step": 5290 }, { "epoch": 2.407188353048226, "grad_norm": 0.5066206529464826, "learning_rate": 2.6464190057857537e-05, "loss": 0.0355, "step": 5291 }, { "epoch": 2.4076433121019107, "grad_norm": 0.6502817858093155, "learning_rate": 2.64570557853558e-05, "loss": 0.064, "step": 5292 }, { "epoch": 2.408098271155596, "grad_norm": 1.1927247409562427, "learning_rate": 2.6449921393790045e-05, "loss": 0.0806, "step": 5293 }, { "epoch": 2.4085532302092814, "grad_norm": 0.3786047791221423, "learning_rate": 2.6442786883743266e-05, "loss": 0.0248, "step": 5294 }, { "epoch": 2.4090081892629662, "grad_norm": 0.465630498187419, "learning_rate": 2.6435652255798448e-05, "loss": 0.0474, "step": 5295 }, { "epoch": 2.4094631483166515, "grad_norm": 0.4447966407662905, "learning_rate": 2.642851751053862e-05, "loss": 0.0327, "step": 5296 }, { "epoch": 2.409918107370337, "grad_norm": 0.42273415520633284, "learning_rate": 2.642138264854679e-05, "loss": 0.0439, "step": 5297 }, { "epoch": 2.4103730664240217, "grad_norm": 0.5040759213143168, "learning_rate": 2.641424767040599e-05, "loss": 0.0554, "step": 5298 }, { "epoch": 2.410828025477707, "grad_norm": 0.5577312097375564, "learning_rate": 2.640711257669925e-05, "loss": 0.0447, "step": 5299 }, { "epoch": 2.4112829845313923, "grad_norm": 0.42779350271226735, "learning_rate": 2.639997736800963e-05, "loss": 0.0569, "step": 5300 }, { "epoch": 2.411737943585077, "grad_norm": 0.6410957457763777, "learning_rate": 2.6392842044920184e-05, "loss": 0.0558, "step": 5301 }, { "epoch": 2.4121929026387625, "grad_norm": 0.5891203748192563, "learning_rate": 2.638570660801398e-05, "loss": 0.0807, "step": 5302 }, { "epoch": 2.412647861692448, "grad_norm": 0.35479767767074016, "learning_rate": 2.637857105787408e-05, "loss": 0.0288, "step": 5303 }, { "epoch": 2.4131028207461327, "grad_norm": 0.2982718296560461, "learning_rate": 2.6371435395083582e-05, "loss": 0.0312, "step": 5304 }, { "epoch": 2.413557779799818, "grad_norm": 0.5096776205398458, "learning_rate": 2.636429962022558e-05, "loss": 0.0391, "step": 5305 }, { "epoch": 2.4140127388535033, "grad_norm": 0.38908750777325385, "learning_rate": 2.6357163733883167e-05, "loss": 0.0516, "step": 5306 }, { "epoch": 2.414467697907188, "grad_norm": 0.5835450137700169, "learning_rate": 2.6350027736639466e-05, "loss": 0.0702, "step": 5307 }, { "epoch": 2.4149226569608735, "grad_norm": 0.4379657854452444, "learning_rate": 2.6342891629077604e-05, "loss": 0.041, "step": 5308 }, { "epoch": 2.415377616014559, "grad_norm": 0.7480888535717817, "learning_rate": 2.633575541178069e-05, "loss": 0.0748, "step": 5309 }, { "epoch": 2.4158325750682437, "grad_norm": 0.4785033411555042, "learning_rate": 2.632861908533188e-05, "loss": 0.0497, "step": 5310 }, { "epoch": 2.416287534121929, "grad_norm": 0.7469064597642722, "learning_rate": 2.6321482650314322e-05, "loss": 0.0512, "step": 5311 }, { "epoch": 2.4167424931756143, "grad_norm": 0.9111312784109332, "learning_rate": 2.6314346107311165e-05, "loss": 0.066, "step": 5312 }, { "epoch": 2.417197452229299, "grad_norm": 0.44580877817778886, "learning_rate": 2.630720945690558e-05, "loss": 0.0273, "step": 5313 }, { "epoch": 2.4176524112829845, "grad_norm": 0.6555230110262363, "learning_rate": 2.630007269968074e-05, "loss": 0.0643, "step": 5314 }, { "epoch": 2.41810737033667, "grad_norm": 0.6735730603751088, "learning_rate": 2.629293583621984e-05, "loss": 0.0682, "step": 5315 }, { "epoch": 2.4185623293903546, "grad_norm": 0.42915510128664713, "learning_rate": 2.6285798867106055e-05, "loss": 0.052, "step": 5316 }, { "epoch": 2.41901728844404, "grad_norm": 0.5264437881712438, "learning_rate": 2.6278661792922588e-05, "loss": 0.072, "step": 5317 }, { "epoch": 2.4194722474977253, "grad_norm": 0.3278324877805491, "learning_rate": 2.6271524614252664e-05, "loss": 0.0292, "step": 5318 }, { "epoch": 2.41992720655141, "grad_norm": 0.8252904598210649, "learning_rate": 2.6264387331679484e-05, "loss": 0.0747, "step": 5319 }, { "epoch": 2.4203821656050954, "grad_norm": 0.5001254621567343, "learning_rate": 2.6257249945786283e-05, "loss": 0.0555, "step": 5320 }, { "epoch": 2.4208371246587808, "grad_norm": 0.47221820308290413, "learning_rate": 2.6250112457156296e-05, "loss": 0.0766, "step": 5321 }, { "epoch": 2.421292083712466, "grad_norm": 0.6853944912762625, "learning_rate": 2.624297486637276e-05, "loss": 0.0328, "step": 5322 }, { "epoch": 2.421747042766151, "grad_norm": 0.39373562347002444, "learning_rate": 2.6235837174018936e-05, "loss": 0.0406, "step": 5323 }, { "epoch": 2.4222020018198362, "grad_norm": 0.5969868971150638, "learning_rate": 2.6228699380678073e-05, "loss": 0.0708, "step": 5324 }, { "epoch": 2.4226569608735216, "grad_norm": 0.36036026789667946, "learning_rate": 2.6221561486933454e-05, "loss": 0.0566, "step": 5325 }, { "epoch": 2.4231119199272064, "grad_norm": 0.6319532132967224, "learning_rate": 2.621442349336834e-05, "loss": 0.077, "step": 5326 }, { "epoch": 2.4235668789808917, "grad_norm": 0.5036285942799396, "learning_rate": 2.6207285400566022e-05, "loss": 0.0357, "step": 5327 }, { "epoch": 2.424021838034577, "grad_norm": 0.5002752503157054, "learning_rate": 2.6200147209109797e-05, "loss": 0.0628, "step": 5328 }, { "epoch": 2.424476797088262, "grad_norm": 0.4090663479621226, "learning_rate": 2.6193008919582962e-05, "loss": 0.0333, "step": 5329 }, { "epoch": 2.4249317561419472, "grad_norm": 0.5079335165052559, "learning_rate": 2.618587053256882e-05, "loss": 0.0424, "step": 5330 }, { "epoch": 2.4253867151956325, "grad_norm": 0.6410446669624362, "learning_rate": 2.6178732048650694e-05, "loss": 0.0532, "step": 5331 }, { "epoch": 2.4258416742493174, "grad_norm": 0.6474179952657356, "learning_rate": 2.617159346841192e-05, "loss": 0.0632, "step": 5332 }, { "epoch": 2.4262966333030027, "grad_norm": 0.42250036835360055, "learning_rate": 2.6164454792435806e-05, "loss": 0.0319, "step": 5333 }, { "epoch": 2.426751592356688, "grad_norm": 0.4668063730629067, "learning_rate": 2.6157316021305706e-05, "loss": 0.0581, "step": 5334 }, { "epoch": 2.427206551410373, "grad_norm": 0.41043095094556314, "learning_rate": 2.6150177155604977e-05, "loss": 0.063, "step": 5335 }, { "epoch": 2.427661510464058, "grad_norm": 0.42388620305698915, "learning_rate": 2.6143038195916956e-05, "loss": 0.0948, "step": 5336 }, { "epoch": 2.4281164695177435, "grad_norm": 0.5675950846601847, "learning_rate": 2.6135899142825014e-05, "loss": 0.0394, "step": 5337 }, { "epoch": 2.4285714285714284, "grad_norm": 0.37941425930348627, "learning_rate": 2.6128759996912535e-05, "loss": 0.0308, "step": 5338 }, { "epoch": 2.4290263876251137, "grad_norm": 0.5653310305282044, "learning_rate": 2.6121620758762877e-05, "loss": 0.0558, "step": 5339 }, { "epoch": 2.429481346678799, "grad_norm": 0.5199872364896526, "learning_rate": 2.6114481428959443e-05, "loss": 0.062, "step": 5340 }, { "epoch": 2.4299363057324843, "grad_norm": 0.36526917686834115, "learning_rate": 2.6107342008085605e-05, "loss": 0.0289, "step": 5341 }, { "epoch": 2.430391264786169, "grad_norm": 0.502373425730057, "learning_rate": 2.610020249672479e-05, "loss": 0.0663, "step": 5342 }, { "epoch": 2.4308462238398545, "grad_norm": 0.4541240882107369, "learning_rate": 2.6093062895460398e-05, "loss": 0.0427, "step": 5343 }, { "epoch": 2.43130118289354, "grad_norm": 0.4059362091549173, "learning_rate": 2.6085923204875833e-05, "loss": 0.0256, "step": 5344 }, { "epoch": 2.4317561419472247, "grad_norm": 0.5716287636274322, "learning_rate": 2.607878342555454e-05, "loss": 0.0442, "step": 5345 }, { "epoch": 2.43221110100091, "grad_norm": 0.6036665647942621, "learning_rate": 2.6071643558079923e-05, "loss": 0.049, "step": 5346 }, { "epoch": 2.4326660600545953, "grad_norm": 0.4972280955150882, "learning_rate": 2.6064503603035446e-05, "loss": 0.0361, "step": 5347 }, { "epoch": 2.43312101910828, "grad_norm": 0.44437162705663147, "learning_rate": 2.6057363561004528e-05, "loss": 0.0621, "step": 5348 }, { "epoch": 2.4335759781619655, "grad_norm": 0.4242326848073521, "learning_rate": 2.605022343257065e-05, "loss": 0.0738, "step": 5349 }, { "epoch": 2.434030937215651, "grad_norm": 0.6528174643726697, "learning_rate": 2.6043083218317248e-05, "loss": 0.0387, "step": 5350 }, { "epoch": 2.4344858962693356, "grad_norm": 0.43777784312835094, "learning_rate": 2.6035942918827793e-05, "loss": 0.0399, "step": 5351 }, { "epoch": 2.434940855323021, "grad_norm": 0.4952922067270186, "learning_rate": 2.602880253468577e-05, "loss": 0.0545, "step": 5352 }, { "epoch": 2.4353958143767063, "grad_norm": 0.4798877296149557, "learning_rate": 2.6021662066474646e-05, "loss": 0.0504, "step": 5353 }, { "epoch": 2.435850773430391, "grad_norm": 0.4461387850135867, "learning_rate": 2.601452151477791e-05, "loss": 0.0387, "step": 5354 }, { "epoch": 2.4363057324840764, "grad_norm": 0.4354653573418376, "learning_rate": 2.600738088017906e-05, "loss": 0.0329, "step": 5355 }, { "epoch": 2.4367606915377618, "grad_norm": 0.5190889019686068, "learning_rate": 2.600024016326159e-05, "loss": 0.0559, "step": 5356 }, { "epoch": 2.4372156505914466, "grad_norm": 0.7945412232027134, "learning_rate": 2.599309936460902e-05, "loss": 0.0734, "step": 5357 }, { "epoch": 2.437670609645132, "grad_norm": 0.5121031149327548, "learning_rate": 2.598595848480484e-05, "loss": 0.0566, "step": 5358 }, { "epoch": 2.4381255686988172, "grad_norm": 0.6203020208755026, "learning_rate": 2.597881752443259e-05, "loss": 0.0706, "step": 5359 }, { "epoch": 2.438580527752502, "grad_norm": 0.6821558711320381, "learning_rate": 2.5971676484075797e-05, "loss": 0.0734, "step": 5360 }, { "epoch": 2.4390354868061874, "grad_norm": 0.39668124818164385, "learning_rate": 2.5964535364317992e-05, "loss": 0.0428, "step": 5361 }, { "epoch": 2.4394904458598727, "grad_norm": 0.5701187755295511, "learning_rate": 2.595739416574271e-05, "loss": 0.0627, "step": 5362 }, { "epoch": 2.4399454049135576, "grad_norm": 0.6081108966886711, "learning_rate": 2.5950252888933495e-05, "loss": 0.0558, "step": 5363 }, { "epoch": 2.440400363967243, "grad_norm": 0.47366562170981913, "learning_rate": 2.5943111534473913e-05, "loss": 0.056, "step": 5364 }, { "epoch": 2.4408553230209282, "grad_norm": 0.6176391883864611, "learning_rate": 2.5935970102947505e-05, "loss": 0.0593, "step": 5365 }, { "epoch": 2.441310282074613, "grad_norm": 0.37833908570513386, "learning_rate": 2.5928828594937854e-05, "loss": 0.0202, "step": 5366 }, { "epoch": 2.4417652411282984, "grad_norm": 0.5165666509649061, "learning_rate": 2.5921687011028522e-05, "loss": 0.0621, "step": 5367 }, { "epoch": 2.4422202001819837, "grad_norm": 0.49416584622951565, "learning_rate": 2.59145453518031e-05, "loss": 0.0465, "step": 5368 }, { "epoch": 2.4426751592356686, "grad_norm": 0.3589878974664427, "learning_rate": 2.590740361784515e-05, "loss": 0.0245, "step": 5369 }, { "epoch": 2.443130118289354, "grad_norm": 0.39983242253234735, "learning_rate": 2.590026180973828e-05, "loss": 0.0332, "step": 5370 }, { "epoch": 2.443585077343039, "grad_norm": 0.719416351544756, "learning_rate": 2.589311992806608e-05, "loss": 0.0984, "step": 5371 }, { "epoch": 2.444040036396724, "grad_norm": 0.6645775956810426, "learning_rate": 2.588597797341215e-05, "loss": 0.0537, "step": 5372 }, { "epoch": 2.4444949954504094, "grad_norm": 0.46050494112610524, "learning_rate": 2.5878835946360103e-05, "loss": 0.0632, "step": 5373 }, { "epoch": 2.4449499545040947, "grad_norm": 0.513499534088739, "learning_rate": 2.5871693847493556e-05, "loss": 0.0484, "step": 5374 }, { "epoch": 2.4454049135577796, "grad_norm": 0.4931873117383866, "learning_rate": 2.5864551677396116e-05, "loss": 0.0808, "step": 5375 }, { "epoch": 2.445859872611465, "grad_norm": 0.4909785868477964, "learning_rate": 2.5857409436651416e-05, "loss": 0.0661, "step": 5376 }, { "epoch": 2.44631483166515, "grad_norm": 0.4648872338173167, "learning_rate": 2.585026712584309e-05, "loss": 0.0471, "step": 5377 }, { "epoch": 2.4467697907188355, "grad_norm": 0.29750403719995927, "learning_rate": 2.584312474555478e-05, "loss": 0.0346, "step": 5378 }, { "epoch": 2.4472247497725204, "grad_norm": 0.5376262617018263, "learning_rate": 2.5835982296370122e-05, "loss": 0.061, "step": 5379 }, { "epoch": 2.4476797088262057, "grad_norm": 0.4414048576963632, "learning_rate": 2.5828839778872764e-05, "loss": 0.044, "step": 5380 }, { "epoch": 2.448134667879891, "grad_norm": 0.2428885128047031, "learning_rate": 2.5821697193646367e-05, "loss": 0.0308, "step": 5381 }, { "epoch": 2.448589626933576, "grad_norm": 0.36721780187088005, "learning_rate": 2.581455454127458e-05, "loss": 0.0484, "step": 5382 }, { "epoch": 2.449044585987261, "grad_norm": 0.47487723231243356, "learning_rate": 2.5807411822341076e-05, "loss": 0.0485, "step": 5383 }, { "epoch": 2.4494995450409465, "grad_norm": 0.6104448442382557, "learning_rate": 2.5800269037429518e-05, "loss": 0.0572, "step": 5384 }, { "epoch": 2.4499545040946313, "grad_norm": 0.46185236649536676, "learning_rate": 2.57931261871236e-05, "loss": 0.0554, "step": 5385 }, { "epoch": 2.4504094631483166, "grad_norm": 0.5292548205938601, "learning_rate": 2.5785983272006986e-05, "loss": 0.0672, "step": 5386 }, { "epoch": 2.450864422202002, "grad_norm": 0.5595386986249066, "learning_rate": 2.577884029266337e-05, "loss": 0.0783, "step": 5387 }, { "epoch": 2.451319381255687, "grad_norm": 0.44029262862749224, "learning_rate": 2.5771697249676447e-05, "loss": 0.0351, "step": 5388 }, { "epoch": 2.451774340309372, "grad_norm": 1.7110198434690096, "learning_rate": 2.5764554143629898e-05, "loss": 0.0746, "step": 5389 }, { "epoch": 2.4522292993630574, "grad_norm": 0.4079165932178896, "learning_rate": 2.575741097510744e-05, "loss": 0.0411, "step": 5390 }, { "epoch": 2.4526842584167423, "grad_norm": 0.5629187644135207, "learning_rate": 2.5750267744692786e-05, "loss": 0.0564, "step": 5391 }, { "epoch": 2.4531392174704276, "grad_norm": 0.44981140214911053, "learning_rate": 2.5743124452969635e-05, "loss": 0.034, "step": 5392 }, { "epoch": 2.453594176524113, "grad_norm": 0.5296629024049033, "learning_rate": 2.5735981100521706e-05, "loss": 0.0524, "step": 5393 }, { "epoch": 2.4540491355777982, "grad_norm": 0.7547228184996357, "learning_rate": 2.5728837687932727e-05, "loss": 0.0553, "step": 5394 }, { "epoch": 2.454504094631483, "grad_norm": 0.4044443173803507, "learning_rate": 2.572169421578643e-05, "loss": 0.0334, "step": 5395 }, { "epoch": 2.4549590536851684, "grad_norm": 0.5176939546384998, "learning_rate": 2.5714550684666532e-05, "loss": 0.0723, "step": 5396 }, { "epoch": 2.4554140127388537, "grad_norm": 0.4536257499891099, "learning_rate": 2.5707407095156782e-05, "loss": 0.0391, "step": 5397 }, { "epoch": 2.4558689717925386, "grad_norm": 0.3627437299759535, "learning_rate": 2.5700263447840926e-05, "loss": 0.0446, "step": 5398 }, { "epoch": 2.456323930846224, "grad_norm": 0.4164740257575489, "learning_rate": 2.5693119743302696e-05, "loss": 0.0288, "step": 5399 }, { "epoch": 2.4567788898999092, "grad_norm": 0.4790772276675939, "learning_rate": 2.568597598212585e-05, "loss": 0.0315, "step": 5400 }, { "epoch": 2.457233848953594, "grad_norm": 0.4758538958504711, "learning_rate": 2.5678832164894147e-05, "loss": 0.0451, "step": 5401 }, { "epoch": 2.4576888080072794, "grad_norm": 0.4140031652207273, "learning_rate": 2.5671688292191348e-05, "loss": 0.0365, "step": 5402 }, { "epoch": 2.4581437670609647, "grad_norm": 0.6328992458039169, "learning_rate": 2.5664544364601207e-05, "loss": 0.0385, "step": 5403 }, { "epoch": 2.4585987261146496, "grad_norm": 0.5017551288469181, "learning_rate": 2.565740038270751e-05, "loss": 0.0443, "step": 5404 }, { "epoch": 2.459053685168335, "grad_norm": 0.47403178916758437, "learning_rate": 2.565025634709402e-05, "loss": 0.0529, "step": 5405 }, { "epoch": 2.45950864422202, "grad_norm": 0.6368001021312609, "learning_rate": 2.5643112258344515e-05, "loss": 0.0789, "step": 5406 }, { "epoch": 2.459963603275705, "grad_norm": 0.45261537894826986, "learning_rate": 2.563596811704278e-05, "loss": 0.0268, "step": 5407 }, { "epoch": 2.4604185623293904, "grad_norm": 0.44619495962811895, "learning_rate": 2.5628823923772605e-05, "loss": 0.0352, "step": 5408 }, { "epoch": 2.4608735213830757, "grad_norm": 0.42643246293519693, "learning_rate": 2.5621679679117778e-05, "loss": 0.0367, "step": 5409 }, { "epoch": 2.4613284804367606, "grad_norm": 0.4613754073410433, "learning_rate": 2.56145353836621e-05, "loss": 0.0612, "step": 5410 }, { "epoch": 2.461783439490446, "grad_norm": 0.6743606957766825, "learning_rate": 2.5607391037989353e-05, "loss": 0.0509, "step": 5411 }, { "epoch": 2.462238398544131, "grad_norm": 0.5002785636609074, "learning_rate": 2.5600246642683367e-05, "loss": 0.0417, "step": 5412 }, { "epoch": 2.462693357597816, "grad_norm": 0.6092363359369006, "learning_rate": 2.559310219832793e-05, "loss": 0.0679, "step": 5413 }, { "epoch": 2.4631483166515014, "grad_norm": 0.5694691201912191, "learning_rate": 2.5585957705506858e-05, "loss": 0.0708, "step": 5414 }, { "epoch": 2.4636032757051867, "grad_norm": 0.35276381436068466, "learning_rate": 2.557881316480397e-05, "loss": 0.0378, "step": 5415 }, { "epoch": 2.4640582347588715, "grad_norm": 0.4666922804339553, "learning_rate": 2.5571668576803088e-05, "loss": 0.0639, "step": 5416 }, { "epoch": 2.464513193812557, "grad_norm": 0.41817190796820614, "learning_rate": 2.556452394208803e-05, "loss": 0.0446, "step": 5417 }, { "epoch": 2.464968152866242, "grad_norm": 0.5114868820941185, "learning_rate": 2.5557379261242616e-05, "loss": 0.0699, "step": 5418 }, { "epoch": 2.465423111919927, "grad_norm": 0.39185436400091955, "learning_rate": 2.55502345348507e-05, "loss": 0.0344, "step": 5419 }, { "epoch": 2.4658780709736123, "grad_norm": 0.43352267927882343, "learning_rate": 2.554308976349609e-05, "loss": 0.0348, "step": 5420 }, { "epoch": 2.4663330300272976, "grad_norm": 0.40246955643282656, "learning_rate": 2.553594494776264e-05, "loss": 0.0379, "step": 5421 }, { "epoch": 2.4667879890809825, "grad_norm": 0.6698564543059201, "learning_rate": 2.5528800088234194e-05, "loss": 0.05, "step": 5422 }, { "epoch": 2.467242948134668, "grad_norm": 0.44930901612665564, "learning_rate": 2.552165518549459e-05, "loss": 0.0732, "step": 5423 }, { "epoch": 2.467697907188353, "grad_norm": 0.46848246348467404, "learning_rate": 2.5514510240127686e-05, "loss": 0.0415, "step": 5424 }, { "epoch": 2.468152866242038, "grad_norm": 0.7990787944688615, "learning_rate": 2.5507365252717318e-05, "loss": 0.0825, "step": 5425 }, { "epoch": 2.4686078252957233, "grad_norm": 0.4058686074979472, "learning_rate": 2.550022022384736e-05, "loss": 0.04, "step": 5426 }, { "epoch": 2.4690627843494086, "grad_norm": 0.6005933800446578, "learning_rate": 2.5493075154101663e-05, "loss": 0.0484, "step": 5427 }, { "epoch": 2.4695177434030935, "grad_norm": 0.6399962713835051, "learning_rate": 2.548593004406409e-05, "loss": 0.0595, "step": 5428 }, { "epoch": 2.469972702456779, "grad_norm": 0.6048660171497269, "learning_rate": 2.5478784894318508e-05, "loss": 0.0687, "step": 5429 }, { "epoch": 2.470427661510464, "grad_norm": 0.5798650365443068, "learning_rate": 2.5471639705448787e-05, "loss": 0.0247, "step": 5430 }, { "epoch": 2.470882620564149, "grad_norm": 0.6265560226587757, "learning_rate": 2.5464494478038802e-05, "loss": 0.0548, "step": 5431 }, { "epoch": 2.4713375796178343, "grad_norm": 0.694067735524749, "learning_rate": 2.545734921267242e-05, "loss": 0.0507, "step": 5432 }, { "epoch": 2.4717925386715196, "grad_norm": 0.6653038939024689, "learning_rate": 2.545020390993353e-05, "loss": 0.0707, "step": 5433 }, { "epoch": 2.472247497725205, "grad_norm": 0.5371315512494729, "learning_rate": 2.5443058570406016e-05, "loss": 0.068, "step": 5434 }, { "epoch": 2.47270245677889, "grad_norm": 0.6072020006805207, "learning_rate": 2.543591319467374e-05, "loss": 0.0889, "step": 5435 }, { "epoch": 2.473157415832575, "grad_norm": 0.570613190979134, "learning_rate": 2.5428767783320616e-05, "loss": 0.0757, "step": 5436 }, { "epoch": 2.4736123748862604, "grad_norm": 0.5373751233023091, "learning_rate": 2.5421622336930527e-05, "loss": 0.0446, "step": 5437 }, { "epoch": 2.4740673339399453, "grad_norm": 0.4821405223363442, "learning_rate": 2.5414476856087367e-05, "loss": 0.0449, "step": 5438 }, { "epoch": 2.4745222929936306, "grad_norm": 0.5013098173519832, "learning_rate": 2.5407331341375025e-05, "loss": 0.0489, "step": 5439 }, { "epoch": 2.474977252047316, "grad_norm": 0.48860747615772354, "learning_rate": 2.54001857933774e-05, "loss": 0.0562, "step": 5440 }, { "epoch": 2.4754322111010008, "grad_norm": 0.3704127131225902, "learning_rate": 2.539304021267841e-05, "loss": 0.0315, "step": 5441 }, { "epoch": 2.475887170154686, "grad_norm": 0.5689382310363464, "learning_rate": 2.5385894599861937e-05, "loss": 0.0606, "step": 5442 }, { "epoch": 2.4763421292083714, "grad_norm": 0.9787646578377719, "learning_rate": 2.537874895551191e-05, "loss": 0.1287, "step": 5443 }, { "epoch": 2.4767970882620562, "grad_norm": 0.4973833997148491, "learning_rate": 2.5371603280212232e-05, "loss": 0.0514, "step": 5444 }, { "epoch": 2.4772520473157416, "grad_norm": 0.6292443474180177, "learning_rate": 2.5364457574546802e-05, "loss": 0.0658, "step": 5445 }, { "epoch": 2.477707006369427, "grad_norm": 0.7909777145519322, "learning_rate": 2.5357311839099547e-05, "loss": 0.0623, "step": 5446 }, { "epoch": 2.4781619654231117, "grad_norm": 0.7357668236596548, "learning_rate": 2.535016607445438e-05, "loss": 0.0606, "step": 5447 }, { "epoch": 2.478616924476797, "grad_norm": 0.45414286041792257, "learning_rate": 2.534302028119523e-05, "loss": 0.0528, "step": 5448 }, { "epoch": 2.4790718835304824, "grad_norm": 0.3700588520189087, "learning_rate": 2.5335874459906007e-05, "loss": 0.0366, "step": 5449 }, { "epoch": 2.4795268425841677, "grad_norm": 0.45504567675869784, "learning_rate": 2.5328728611170636e-05, "loss": 0.0608, "step": 5450 }, { "epoch": 2.4799818016378525, "grad_norm": 0.5756651107188442, "learning_rate": 2.5321582735573056e-05, "loss": 0.0302, "step": 5451 }, { "epoch": 2.480436760691538, "grad_norm": 1.8489201839606753, "learning_rate": 2.531443683369718e-05, "loss": 0.0548, "step": 5452 }, { "epoch": 2.480891719745223, "grad_norm": 0.4613319678912711, "learning_rate": 2.5307290906126953e-05, "loss": 0.0331, "step": 5453 }, { "epoch": 2.481346678798908, "grad_norm": 0.4309747466330984, "learning_rate": 2.530014495344629e-05, "loss": 0.0504, "step": 5454 }, { "epoch": 2.4818016378525933, "grad_norm": 0.3981581475262126, "learning_rate": 2.529299897623915e-05, "loss": 0.0688, "step": 5455 }, { "epoch": 2.4822565969062786, "grad_norm": 0.47795335842442027, "learning_rate": 2.528585297508945e-05, "loss": 0.0459, "step": 5456 }, { "epoch": 2.4827115559599635, "grad_norm": 0.4339843345569723, "learning_rate": 2.527870695058113e-05, "loss": 0.0252, "step": 5457 }, { "epoch": 2.483166515013649, "grad_norm": 0.4483646830910995, "learning_rate": 2.5271560903298152e-05, "loss": 0.053, "step": 5458 }, { "epoch": 2.483621474067334, "grad_norm": 0.32962582296566223, "learning_rate": 2.5264414833824434e-05, "loss": 0.032, "step": 5459 }, { "epoch": 2.484076433121019, "grad_norm": 0.45651946401997634, "learning_rate": 2.525726874274393e-05, "loss": 0.0533, "step": 5460 }, { "epoch": 2.4845313921747043, "grad_norm": 0.3938495403895227, "learning_rate": 2.5250122630640587e-05, "loss": 0.0428, "step": 5461 }, { "epoch": 2.4849863512283896, "grad_norm": 0.4076315519464821, "learning_rate": 2.5242976498098355e-05, "loss": 0.0412, "step": 5462 }, { "epoch": 2.4854413102820745, "grad_norm": 0.5388339580390473, "learning_rate": 2.523583034570117e-05, "loss": 0.0566, "step": 5463 }, { "epoch": 2.48589626933576, "grad_norm": 0.4810804052354936, "learning_rate": 2.5228684174033003e-05, "loss": 0.0376, "step": 5464 }, { "epoch": 2.486351228389445, "grad_norm": 0.496303861871243, "learning_rate": 2.52215379836778e-05, "loss": 0.0693, "step": 5465 }, { "epoch": 2.48680618744313, "grad_norm": 0.5652266382112753, "learning_rate": 2.5214391775219508e-05, "loss": 0.0748, "step": 5466 }, { "epoch": 2.4872611464968153, "grad_norm": 0.5315145349920012, "learning_rate": 2.5207245549242087e-05, "loss": 0.0341, "step": 5467 }, { "epoch": 2.4877161055505006, "grad_norm": 0.39137521239011486, "learning_rate": 2.5200099306329506e-05, "loss": 0.0541, "step": 5468 }, { "epoch": 2.4881710646041855, "grad_norm": 0.43491216413953865, "learning_rate": 2.5192953047065703e-05, "loss": 0.0356, "step": 5469 }, { "epoch": 2.488626023657871, "grad_norm": 0.6520518110915712, "learning_rate": 2.518580677203465e-05, "loss": 0.0601, "step": 5470 }, { "epoch": 2.489080982711556, "grad_norm": 0.5148584180194857, "learning_rate": 2.5178660481820303e-05, "loss": 0.0647, "step": 5471 }, { "epoch": 2.489535941765241, "grad_norm": 0.4760325836159984, "learning_rate": 2.517151417700664e-05, "loss": 0.0312, "step": 5472 }, { "epoch": 2.4899909008189263, "grad_norm": 0.4939501778150275, "learning_rate": 2.5164367858177606e-05, "loss": 0.0664, "step": 5473 }, { "epoch": 2.4904458598726116, "grad_norm": 0.42774994437018565, "learning_rate": 2.5157221525917174e-05, "loss": 0.0559, "step": 5474 }, { "epoch": 2.4909008189262964, "grad_norm": 0.46622616105110537, "learning_rate": 2.5150075180809313e-05, "loss": 0.0562, "step": 5475 }, { "epoch": 2.4913557779799818, "grad_norm": 0.6049312981736529, "learning_rate": 2.5142928823437977e-05, "loss": 0.0457, "step": 5476 }, { "epoch": 2.491810737033667, "grad_norm": 0.4748912689455372, "learning_rate": 2.5135782454387147e-05, "loss": 0.0482, "step": 5477 }, { "epoch": 2.492265696087352, "grad_norm": 0.5204041607218404, "learning_rate": 2.5128636074240785e-05, "loss": 0.0511, "step": 5478 }, { "epoch": 2.4927206551410372, "grad_norm": 0.48387236285062407, "learning_rate": 2.5121489683582873e-05, "loss": 0.0463, "step": 5479 }, { "epoch": 2.4931756141947226, "grad_norm": 0.4539192973021173, "learning_rate": 2.5114343282997372e-05, "loss": 0.0409, "step": 5480 }, { "epoch": 2.4936305732484074, "grad_norm": 0.6907670469989551, "learning_rate": 2.5107196873068238e-05, "loss": 0.0547, "step": 5481 }, { "epoch": 2.4940855323020927, "grad_norm": 0.4897669907692905, "learning_rate": 2.5100050454379475e-05, "loss": 0.0406, "step": 5482 }, { "epoch": 2.494540491355778, "grad_norm": 0.5420778174027471, "learning_rate": 2.5092904027515034e-05, "loss": 0.0667, "step": 5483 }, { "epoch": 2.494995450409463, "grad_norm": 0.5387678619625801, "learning_rate": 2.5085757593058902e-05, "loss": 0.0958, "step": 5484 }, { "epoch": 2.4954504094631482, "grad_norm": 0.5899866149394483, "learning_rate": 2.5078611151595043e-05, "loss": 0.0684, "step": 5485 }, { "epoch": 2.4959053685168335, "grad_norm": 0.631165459698672, "learning_rate": 2.5071464703707438e-05, "loss": 0.0427, "step": 5486 }, { "epoch": 2.496360327570519, "grad_norm": 2.0359679935273034, "learning_rate": 2.5064318249980062e-05, "loss": 0.1176, "step": 5487 }, { "epoch": 2.4968152866242037, "grad_norm": 0.426761256868588, "learning_rate": 2.5057171790996875e-05, "loss": 0.0764, "step": 5488 }, { "epoch": 2.497270245677889, "grad_norm": 0.40786525543582947, "learning_rate": 2.505002532734188e-05, "loss": 0.0687, "step": 5489 }, { "epoch": 2.4977252047315743, "grad_norm": 0.4637760266015241, "learning_rate": 2.504287885959904e-05, "loss": 0.0355, "step": 5490 }, { "epoch": 2.498180163785259, "grad_norm": 0.40441365209365276, "learning_rate": 2.503573238835233e-05, "loss": 0.065, "step": 5491 }, { "epoch": 2.4986351228389445, "grad_norm": 0.4534593776530488, "learning_rate": 2.5028585914185738e-05, "loss": 0.0399, "step": 5492 }, { "epoch": 2.49909008189263, "grad_norm": 0.5938204266777994, "learning_rate": 2.5021439437683226e-05, "loss": 0.0522, "step": 5493 }, { "epoch": 2.4995450409463147, "grad_norm": 0.5247568700184884, "learning_rate": 2.501429295942878e-05, "loss": 0.0296, "step": 5494 }, { "epoch": 2.5, "grad_norm": 0.5214210521980621, "learning_rate": 2.5007146480006377e-05, "loss": 0.069, "step": 5495 }, { "epoch": 2.5004549590536853, "grad_norm": 0.6643089972620109, "learning_rate": 2.5e-05, "loss": 0.0353, "step": 5496 }, { "epoch": 2.50090991810737, "grad_norm": 0.48950625647902596, "learning_rate": 2.499285351999363e-05, "loss": 0.0518, "step": 5497 }, { "epoch": 2.5013648771610555, "grad_norm": 0.5456635185176991, "learning_rate": 2.4985707040571228e-05, "loss": 0.0822, "step": 5498 }, { "epoch": 2.501819836214741, "grad_norm": 0.33947936354806063, "learning_rate": 2.4978560562316787e-05, "loss": 0.0352, "step": 5499 }, { "epoch": 2.502274795268426, "grad_norm": 0.5093230260895456, "learning_rate": 2.497141408581427e-05, "loss": 0.0338, "step": 5500 }, { "epoch": 2.502729754322111, "grad_norm": 0.3226628620949302, "learning_rate": 2.4964267611647672e-05, "loss": 0.0312, "step": 5501 }, { "epoch": 2.5031847133757963, "grad_norm": 0.4420112984079047, "learning_rate": 2.4957121140400965e-05, "loss": 0.0445, "step": 5502 }, { "epoch": 2.5036396724294816, "grad_norm": 0.5420993928117616, "learning_rate": 2.4949974672658126e-05, "loss": 0.0393, "step": 5503 }, { "epoch": 2.5040946314831665, "grad_norm": 0.43160688917689416, "learning_rate": 2.494282820900313e-05, "loss": 0.0516, "step": 5504 }, { "epoch": 2.5045495905368518, "grad_norm": 0.33990717357602684, "learning_rate": 2.493568175001995e-05, "loss": 0.0345, "step": 5505 }, { "epoch": 2.505004549590537, "grad_norm": 0.6632961121058418, "learning_rate": 2.4928535296292575e-05, "loss": 0.0815, "step": 5506 }, { "epoch": 2.505459508644222, "grad_norm": 0.5320604057085614, "learning_rate": 2.492138884840496e-05, "loss": 0.0482, "step": 5507 }, { "epoch": 2.5059144676979073, "grad_norm": 0.4130991980080722, "learning_rate": 2.49142424069411e-05, "loss": 0.0276, "step": 5508 }, { "epoch": 2.5063694267515926, "grad_norm": 0.36769070695444983, "learning_rate": 2.4907095972484968e-05, "loss": 0.0297, "step": 5509 }, { "epoch": 2.5068243858052774, "grad_norm": 0.41134772559339355, "learning_rate": 2.4899949545620528e-05, "loss": 0.0418, "step": 5510 }, { "epoch": 2.5072793448589628, "grad_norm": 0.49592377264906307, "learning_rate": 2.4892803126931765e-05, "loss": 0.0587, "step": 5511 }, { "epoch": 2.507734303912648, "grad_norm": 0.38038571714341657, "learning_rate": 2.488565671700264e-05, "loss": 0.0283, "step": 5512 }, { "epoch": 2.508189262966333, "grad_norm": 0.4698542184508619, "learning_rate": 2.487851031641714e-05, "loss": 0.0519, "step": 5513 }, { "epoch": 2.5086442220200182, "grad_norm": 0.7426298655152241, "learning_rate": 2.4871363925759214e-05, "loss": 0.0694, "step": 5514 }, { "epoch": 2.5090991810737036, "grad_norm": 0.3160566648760521, "learning_rate": 2.4864217545612852e-05, "loss": 0.0268, "step": 5515 }, { "epoch": 2.5095541401273884, "grad_norm": 0.536298427770528, "learning_rate": 2.4857071176562026e-05, "loss": 0.0375, "step": 5516 }, { "epoch": 2.5100090991810737, "grad_norm": 0.6070847375115861, "learning_rate": 2.4849924819190696e-05, "loss": 0.0665, "step": 5517 }, { "epoch": 2.510464058234759, "grad_norm": 0.5298770960933196, "learning_rate": 2.4842778474082835e-05, "loss": 0.0477, "step": 5518 }, { "epoch": 2.510919017288444, "grad_norm": 0.6746256214457255, "learning_rate": 2.48356321418224e-05, "loss": 0.0604, "step": 5519 }, { "epoch": 2.511373976342129, "grad_norm": 0.550614678654802, "learning_rate": 2.482848582299337e-05, "loss": 0.0511, "step": 5520 }, { "epoch": 2.5118289353958145, "grad_norm": 0.682080841087004, "learning_rate": 2.4821339518179692e-05, "loss": 0.0682, "step": 5521 }, { "epoch": 2.5122838944494994, "grad_norm": 0.890442522790603, "learning_rate": 2.481419322796535e-05, "loss": 0.061, "step": 5522 }, { "epoch": 2.5127388535031847, "grad_norm": 0.5168794924665957, "learning_rate": 2.4807046952934303e-05, "loss": 0.0345, "step": 5523 }, { "epoch": 2.51319381255687, "grad_norm": 0.4787724868014362, "learning_rate": 2.4799900693670503e-05, "loss": 0.0327, "step": 5524 }, { "epoch": 2.513648771610555, "grad_norm": 0.4557644333187764, "learning_rate": 2.479275445075792e-05, "loss": 0.0367, "step": 5525 }, { "epoch": 2.51410373066424, "grad_norm": 0.4866443617028417, "learning_rate": 2.4785608224780498e-05, "loss": 0.0404, "step": 5526 }, { "epoch": 2.5145586897179255, "grad_norm": 0.6845128646216186, "learning_rate": 2.4778462016322212e-05, "loss": 0.0947, "step": 5527 }, { "epoch": 2.5150136487716104, "grad_norm": 0.41625781713816457, "learning_rate": 2.4771315825967e-05, "loss": 0.0285, "step": 5528 }, { "epoch": 2.5154686078252957, "grad_norm": 0.5376152016620043, "learning_rate": 2.4764169654298828e-05, "loss": 0.0537, "step": 5529 }, { "epoch": 2.515923566878981, "grad_norm": 0.37452849269805805, "learning_rate": 2.4757023501901655e-05, "loss": 0.0269, "step": 5530 }, { "epoch": 2.516378525932666, "grad_norm": 0.5385060870134909, "learning_rate": 2.474987736935942e-05, "loss": 0.0663, "step": 5531 }, { "epoch": 2.516833484986351, "grad_norm": 0.5975398175301113, "learning_rate": 2.4742731257256077e-05, "loss": 0.0368, "step": 5532 }, { "epoch": 2.5172884440400365, "grad_norm": 0.41655945550054024, "learning_rate": 2.473558516617558e-05, "loss": 0.0554, "step": 5533 }, { "epoch": 2.5177434030937214, "grad_norm": 0.4515468948537396, "learning_rate": 2.4728439096701857e-05, "loss": 0.0418, "step": 5534 }, { "epoch": 2.5181983621474067, "grad_norm": 0.5594619707605112, "learning_rate": 2.4721293049418864e-05, "loss": 0.0739, "step": 5535 }, { "epoch": 2.518653321201092, "grad_norm": 0.5228690064678359, "learning_rate": 2.4714147024910554e-05, "loss": 0.0829, "step": 5536 }, { "epoch": 2.519108280254777, "grad_norm": 0.4772688402606834, "learning_rate": 2.4707001023760855e-05, "loss": 0.0327, "step": 5537 }, { "epoch": 2.519563239308462, "grad_norm": 0.46157094593837716, "learning_rate": 2.4699855046553714e-05, "loss": 0.0465, "step": 5538 }, { "epoch": 2.5200181983621475, "grad_norm": 0.5183946969830342, "learning_rate": 2.4692709093873053e-05, "loss": 0.0507, "step": 5539 }, { "epoch": 2.5204731574158323, "grad_norm": 0.6389775138187559, "learning_rate": 2.468556316630283e-05, "loss": 0.0677, "step": 5540 }, { "epoch": 2.5209281164695176, "grad_norm": 0.4572580025195098, "learning_rate": 2.4678417264426953e-05, "loss": 0.0683, "step": 5541 }, { "epoch": 2.521383075523203, "grad_norm": 0.46941076564700723, "learning_rate": 2.4671271388829363e-05, "loss": 0.0946, "step": 5542 }, { "epoch": 2.521838034576888, "grad_norm": 0.4095401737734664, "learning_rate": 2.4664125540094e-05, "loss": 0.0302, "step": 5543 }, { "epoch": 2.522292993630573, "grad_norm": 0.4586128214392574, "learning_rate": 2.4656979718804777e-05, "loss": 0.0533, "step": 5544 }, { "epoch": 2.5227479526842584, "grad_norm": 0.45660887316300236, "learning_rate": 2.4649833925545626e-05, "loss": 0.0487, "step": 5545 }, { "epoch": 2.5232029117379433, "grad_norm": 0.42288209665170945, "learning_rate": 2.4642688160900462e-05, "loss": 0.0471, "step": 5546 }, { "epoch": 2.5236578707916286, "grad_norm": 0.45385105371115797, "learning_rate": 2.463554242545321e-05, "loss": 0.0328, "step": 5547 }, { "epoch": 2.524112829845314, "grad_norm": 0.5133827917409834, "learning_rate": 2.4628396719787784e-05, "loss": 0.0499, "step": 5548 }, { "epoch": 2.5245677888989992, "grad_norm": 0.37545124649466055, "learning_rate": 2.462125104448809e-05, "loss": 0.0371, "step": 5549 }, { "epoch": 2.525022747952684, "grad_norm": 0.3225001511231169, "learning_rate": 2.4614105400138065e-05, "loss": 0.0282, "step": 5550 }, { "epoch": 2.5254777070063694, "grad_norm": 0.3903658485538955, "learning_rate": 2.4606959787321595e-05, "loss": 0.0252, "step": 5551 }, { "epoch": 2.5259326660600547, "grad_norm": 0.471651212038331, "learning_rate": 2.4599814206622604e-05, "loss": 0.0602, "step": 5552 }, { "epoch": 2.5263876251137396, "grad_norm": 0.498627253224025, "learning_rate": 2.4592668658624985e-05, "loss": 0.0501, "step": 5553 }, { "epoch": 2.526842584167425, "grad_norm": 0.41344801569031847, "learning_rate": 2.4585523143912646e-05, "loss": 0.0399, "step": 5554 }, { "epoch": 2.52729754322111, "grad_norm": 0.5223822477034625, "learning_rate": 2.457837766306948e-05, "loss": 0.0404, "step": 5555 }, { "epoch": 2.5277525022747955, "grad_norm": 0.36127970348468835, "learning_rate": 2.457123221667938e-05, "loss": 0.0268, "step": 5556 }, { "epoch": 2.5282074613284804, "grad_norm": 0.47780753615633975, "learning_rate": 2.456408680532626e-05, "loss": 0.0577, "step": 5557 }, { "epoch": 2.5286624203821657, "grad_norm": 0.4117590781058292, "learning_rate": 2.455694142959399e-05, "loss": 0.0324, "step": 5558 }, { "epoch": 2.529117379435851, "grad_norm": 0.45507278097829046, "learning_rate": 2.4549796090066473e-05, "loss": 0.0604, "step": 5559 }, { "epoch": 2.529572338489536, "grad_norm": 0.5820442113536387, "learning_rate": 2.454265078732758e-05, "loss": 0.0448, "step": 5560 }, { "epoch": 2.530027297543221, "grad_norm": 0.3333474533613232, "learning_rate": 2.4535505521961207e-05, "loss": 0.0241, "step": 5561 }, { "epoch": 2.5304822565969065, "grad_norm": 0.4570901990706895, "learning_rate": 2.4528360294551216e-05, "loss": 0.0971, "step": 5562 }, { "epoch": 2.5309372156505914, "grad_norm": 0.6030904188511081, "learning_rate": 2.452121510568149e-05, "loss": 0.0354, "step": 5563 }, { "epoch": 2.5313921747042767, "grad_norm": 0.5257391335258799, "learning_rate": 2.4514069955935914e-05, "loss": 0.0487, "step": 5564 }, { "epoch": 2.531847133757962, "grad_norm": 0.37759164888094776, "learning_rate": 2.450692484589834e-05, "loss": 0.0313, "step": 5565 }, { "epoch": 2.532302092811647, "grad_norm": 0.44948021821069345, "learning_rate": 2.4499779776152645e-05, "loss": 0.0636, "step": 5566 }, { "epoch": 2.532757051865332, "grad_norm": 0.39776181386199555, "learning_rate": 2.4492634747282684e-05, "loss": 0.0522, "step": 5567 }, { "epoch": 2.5332120109190175, "grad_norm": 0.8285676309596113, "learning_rate": 2.4485489759872323e-05, "loss": 0.0542, "step": 5568 }, { "epoch": 2.5336669699727024, "grad_norm": 0.4792351520245384, "learning_rate": 2.4478344814505418e-05, "loss": 0.063, "step": 5569 }, { "epoch": 2.5341219290263877, "grad_norm": 0.7479609300661525, "learning_rate": 2.447119991176581e-05, "loss": 0.0982, "step": 5570 }, { "epoch": 2.534576888080073, "grad_norm": 0.551425989118464, "learning_rate": 2.4464055052237357e-05, "loss": 0.0537, "step": 5571 }, { "epoch": 2.535031847133758, "grad_norm": 0.7411546316669237, "learning_rate": 2.4456910236503915e-05, "loss": 0.0387, "step": 5572 }, { "epoch": 2.535486806187443, "grad_norm": 0.579004511525009, "learning_rate": 2.444976546514931e-05, "loss": 0.058, "step": 5573 }, { "epoch": 2.5359417652411285, "grad_norm": 0.5250438978715126, "learning_rate": 2.444262073875739e-05, "loss": 0.0597, "step": 5574 }, { "epoch": 2.5363967242948133, "grad_norm": 0.4654723853708579, "learning_rate": 2.4435476057911983e-05, "loss": 0.0528, "step": 5575 }, { "epoch": 2.5368516833484986, "grad_norm": 0.3924947239100442, "learning_rate": 2.4428331423196925e-05, "loss": 0.0536, "step": 5576 }, { "epoch": 2.537306642402184, "grad_norm": 0.6188513882056421, "learning_rate": 2.4421186835196032e-05, "loss": 0.0578, "step": 5577 }, { "epoch": 2.537761601455869, "grad_norm": 0.415062069334779, "learning_rate": 2.4414042294493144e-05, "loss": 0.038, "step": 5578 }, { "epoch": 2.538216560509554, "grad_norm": 0.42163484331909074, "learning_rate": 2.440689780167208e-05, "loss": 0.0279, "step": 5579 }, { "epoch": 2.5386715195632394, "grad_norm": 0.5638132441169138, "learning_rate": 2.4399753357316642e-05, "loss": 0.0452, "step": 5580 }, { "epoch": 2.5391264786169243, "grad_norm": 0.68980980232465, "learning_rate": 2.4392608962010652e-05, "loss": 0.0743, "step": 5581 }, { "epoch": 2.5395814376706096, "grad_norm": 0.3789226391392901, "learning_rate": 2.438546461633791e-05, "loss": 0.0328, "step": 5582 }, { "epoch": 2.540036396724295, "grad_norm": 0.43978232394840344, "learning_rate": 2.4378320320882235e-05, "loss": 0.0379, "step": 5583 }, { "epoch": 2.54049135577798, "grad_norm": 0.5366920221810944, "learning_rate": 2.4371176076227398e-05, "loss": 0.065, "step": 5584 }, { "epoch": 2.540946314831665, "grad_norm": 0.5944189967747088, "learning_rate": 2.436403188295722e-05, "loss": 0.0691, "step": 5585 }, { "epoch": 2.5414012738853504, "grad_norm": 0.5792047271367469, "learning_rate": 2.4356887741655494e-05, "loss": 0.0399, "step": 5586 }, { "epoch": 2.5418562329390353, "grad_norm": 0.44513388420478084, "learning_rate": 2.434974365290599e-05, "loss": 0.0375, "step": 5587 }, { "epoch": 2.5423111919927206, "grad_norm": 0.3932129351557788, "learning_rate": 2.43425996172925e-05, "loss": 0.0528, "step": 5588 }, { "epoch": 2.542766151046406, "grad_norm": 0.334670689476066, "learning_rate": 2.4335455635398795e-05, "loss": 0.0388, "step": 5589 }, { "epoch": 2.5432211101000908, "grad_norm": 0.4894097736355102, "learning_rate": 2.4328311707808665e-05, "loss": 0.0397, "step": 5590 }, { "epoch": 2.543676069153776, "grad_norm": 0.6491151783645842, "learning_rate": 2.4321167835105853e-05, "loss": 0.0664, "step": 5591 }, { "epoch": 2.5441310282074614, "grad_norm": 0.5307645627528823, "learning_rate": 2.431402401787415e-05, "loss": 0.0548, "step": 5592 }, { "epoch": 2.5445859872611463, "grad_norm": 0.48522464272639726, "learning_rate": 2.430688025669731e-05, "loss": 0.0542, "step": 5593 }, { "epoch": 2.5450409463148316, "grad_norm": 0.6087940840300865, "learning_rate": 2.429973655215908e-05, "loss": 0.0359, "step": 5594 }, { "epoch": 2.545495905368517, "grad_norm": 0.38909154243288446, "learning_rate": 2.429259290484322e-05, "loss": 0.0472, "step": 5595 }, { "epoch": 2.5459508644222018, "grad_norm": 0.4058268037762287, "learning_rate": 2.428544931533347e-05, "loss": 0.0238, "step": 5596 }, { "epoch": 2.546405823475887, "grad_norm": 0.5366855405362526, "learning_rate": 2.427830578421358e-05, "loss": 0.0639, "step": 5597 }, { "epoch": 2.5468607825295724, "grad_norm": 0.5481256998504377, "learning_rate": 2.427116231206727e-05, "loss": 0.0789, "step": 5598 }, { "epoch": 2.5473157415832572, "grad_norm": 0.5212503608925003, "learning_rate": 2.4264018899478293e-05, "loss": 0.0521, "step": 5599 }, { "epoch": 2.5477707006369426, "grad_norm": 0.3830006986350483, "learning_rate": 2.425687554703037e-05, "loss": 0.0477, "step": 5600 }, { "epoch": 2.548225659690628, "grad_norm": 0.3916659560057949, "learning_rate": 2.4249732255307216e-05, "loss": 0.0287, "step": 5601 }, { "epoch": 2.548680618744313, "grad_norm": 0.37681018113920334, "learning_rate": 2.424258902489256e-05, "loss": 0.0364, "step": 5602 }, { "epoch": 2.549135577797998, "grad_norm": 0.47588396056925847, "learning_rate": 2.423544585637011e-05, "loss": 0.0522, "step": 5603 }, { "epoch": 2.5495905368516834, "grad_norm": 0.6243895937959805, "learning_rate": 2.4228302750323566e-05, "loss": 0.0684, "step": 5604 }, { "epoch": 2.5500454959053687, "grad_norm": 0.7637625826277196, "learning_rate": 2.4221159707336634e-05, "loss": 0.0461, "step": 5605 }, { "epoch": 2.5505004549590535, "grad_norm": 0.6420122683407204, "learning_rate": 2.421401672799302e-05, "loss": 0.0555, "step": 5606 }, { "epoch": 2.550955414012739, "grad_norm": 0.37172444575068825, "learning_rate": 2.4206873812876404e-05, "loss": 0.0269, "step": 5607 }, { "epoch": 2.551410373066424, "grad_norm": 0.42433289856968087, "learning_rate": 2.4199730962570484e-05, "loss": 0.0385, "step": 5608 }, { "epoch": 2.5518653321201095, "grad_norm": 0.5281467538811254, "learning_rate": 2.419258817765893e-05, "loss": 0.0545, "step": 5609 }, { "epoch": 2.5523202911737943, "grad_norm": 0.5519594092946799, "learning_rate": 2.418544545872543e-05, "loss": 0.0725, "step": 5610 }, { "epoch": 2.5527752502274796, "grad_norm": 7.015532837964843, "learning_rate": 2.4178302806353645e-05, "loss": 0.0313, "step": 5611 }, { "epoch": 2.553230209281165, "grad_norm": 0.501632850142659, "learning_rate": 2.4171160221127238e-05, "loss": 0.0449, "step": 5612 }, { "epoch": 2.55368516833485, "grad_norm": 0.4119735856269457, "learning_rate": 2.416401770362988e-05, "loss": 0.037, "step": 5613 }, { "epoch": 2.554140127388535, "grad_norm": 0.2741273894324278, "learning_rate": 2.4156875254445222e-05, "loss": 0.0354, "step": 5614 }, { "epoch": 2.5545950864422204, "grad_norm": 0.4612225011662519, "learning_rate": 2.4149732874156912e-05, "loss": 0.0312, "step": 5615 }, { "epoch": 2.5550500454959053, "grad_norm": 0.783054199709881, "learning_rate": 2.4142590563348587e-05, "loss": 0.0707, "step": 5616 }, { "epoch": 2.5555050045495906, "grad_norm": 0.4538343107545417, "learning_rate": 2.4135448322603896e-05, "loss": 0.0652, "step": 5617 }, { "epoch": 2.555959963603276, "grad_norm": 0.46965991173324445, "learning_rate": 2.4128306152506456e-05, "loss": 0.0423, "step": 5618 }, { "epoch": 2.556414922656961, "grad_norm": 0.5299815932357995, "learning_rate": 2.41211640536399e-05, "loss": 0.036, "step": 5619 }, { "epoch": 2.556869881710646, "grad_norm": 0.48853878988203164, "learning_rate": 2.4114022026587853e-05, "loss": 0.035, "step": 5620 }, { "epoch": 2.5573248407643314, "grad_norm": 0.8277287263997678, "learning_rate": 2.4106880071933924e-05, "loss": 0.0739, "step": 5621 }, { "epoch": 2.5577797998180163, "grad_norm": 0.43659796724745353, "learning_rate": 2.409973819026173e-05, "loss": 0.0417, "step": 5622 }, { "epoch": 2.5582347588717016, "grad_norm": 0.6703547969502143, "learning_rate": 2.409259638215485e-05, "loss": 0.0614, "step": 5623 }, { "epoch": 2.558689717925387, "grad_norm": 0.7289895007576778, "learning_rate": 2.4085454648196912e-05, "loss": 0.0483, "step": 5624 }, { "epoch": 2.5591446769790718, "grad_norm": 0.5330318943213967, "learning_rate": 2.407831298897148e-05, "loss": 0.053, "step": 5625 }, { "epoch": 2.559599636032757, "grad_norm": 0.4556696058186075, "learning_rate": 2.407117140506214e-05, "loss": 0.0417, "step": 5626 }, { "epoch": 2.5600545950864424, "grad_norm": 0.5105550840075803, "learning_rate": 2.4064029897052494e-05, "loss": 0.0543, "step": 5627 }, { "epoch": 2.5605095541401273, "grad_norm": 0.44511226423123135, "learning_rate": 2.4056888465526093e-05, "loss": 0.071, "step": 5628 }, { "epoch": 2.5609645131938126, "grad_norm": 0.6191330992323385, "learning_rate": 2.404974711106651e-05, "loss": 0.0584, "step": 5629 }, { "epoch": 2.561419472247498, "grad_norm": 0.5169845945088337, "learning_rate": 2.4042605834257298e-05, "loss": 0.0463, "step": 5630 }, { "epoch": 2.5618744313011828, "grad_norm": 0.7585222773440606, "learning_rate": 2.403546463568202e-05, "loss": 0.0813, "step": 5631 }, { "epoch": 2.562329390354868, "grad_norm": 0.5796928149860934, "learning_rate": 2.402832351592421e-05, "loss": 0.0608, "step": 5632 }, { "epoch": 2.5627843494085534, "grad_norm": 0.3037958903055108, "learning_rate": 2.4021182475567404e-05, "loss": 0.0271, "step": 5633 }, { "epoch": 2.5632393084622382, "grad_norm": 0.3446356504533474, "learning_rate": 2.4014041515195162e-05, "loss": 0.0414, "step": 5634 }, { "epoch": 2.5636942675159236, "grad_norm": 0.5679217668850483, "learning_rate": 2.400690063539099e-05, "loss": 0.036, "step": 5635 }, { "epoch": 2.564149226569609, "grad_norm": 0.4861278315660022, "learning_rate": 2.3999759836738415e-05, "loss": 0.0551, "step": 5636 }, { "epoch": 2.5646041856232937, "grad_norm": 0.43109726378977054, "learning_rate": 2.3992619119820945e-05, "loss": 0.0302, "step": 5637 }, { "epoch": 2.565059144676979, "grad_norm": 0.5464619311522003, "learning_rate": 2.3985478485222098e-05, "loss": 0.0407, "step": 5638 }, { "epoch": 2.5655141037306644, "grad_norm": 0.5846962968872537, "learning_rate": 2.3978337933525367e-05, "loss": 0.0336, "step": 5639 }, { "epoch": 2.565969062784349, "grad_norm": 0.5075562467755304, "learning_rate": 2.397119746531423e-05, "loss": 0.0364, "step": 5640 }, { "epoch": 2.5664240218380345, "grad_norm": 0.46833567336024495, "learning_rate": 2.3964057081172206e-05, "loss": 0.0525, "step": 5641 }, { "epoch": 2.56687898089172, "grad_norm": 0.5910407208230449, "learning_rate": 2.3956916781682758e-05, "loss": 0.094, "step": 5642 }, { "epoch": 2.5673339399454047, "grad_norm": 0.37285787884120536, "learning_rate": 2.3949776567429353e-05, "loss": 0.0345, "step": 5643 }, { "epoch": 2.56778889899909, "grad_norm": 0.4440068730678052, "learning_rate": 2.3942636438995474e-05, "loss": 0.0364, "step": 5644 }, { "epoch": 2.5682438580527753, "grad_norm": 0.4830358783634657, "learning_rate": 2.3935496396964566e-05, "loss": 0.0597, "step": 5645 }, { "epoch": 2.56869881710646, "grad_norm": 0.5361425073450351, "learning_rate": 2.3928356441920086e-05, "loss": 0.0457, "step": 5646 }, { "epoch": 2.5691537761601455, "grad_norm": 0.5210181650058047, "learning_rate": 2.3921216574445467e-05, "loss": 0.0789, "step": 5647 }, { "epoch": 2.569608735213831, "grad_norm": 0.5464145850650705, "learning_rate": 2.391407679512417e-05, "loss": 0.0577, "step": 5648 }, { "epoch": 2.5700636942675157, "grad_norm": 0.5519401925150651, "learning_rate": 2.390693710453961e-05, "loss": 0.0487, "step": 5649 }, { "epoch": 2.570518653321201, "grad_norm": 0.43574741624422897, "learning_rate": 2.3899797503275213e-05, "loss": 0.0422, "step": 5650 }, { "epoch": 2.5709736123748863, "grad_norm": 0.4534857925652595, "learning_rate": 2.38926579919144e-05, "loss": 0.0638, "step": 5651 }, { "epoch": 2.571428571428571, "grad_norm": 0.39664723088741, "learning_rate": 2.3885518571040573e-05, "loss": 0.0435, "step": 5652 }, { "epoch": 2.5718835304822565, "grad_norm": 0.33124923582922294, "learning_rate": 2.3878379241237136e-05, "loss": 0.0231, "step": 5653 }, { "epoch": 2.572338489535942, "grad_norm": 0.39827511103509206, "learning_rate": 2.387124000308747e-05, "loss": 0.0399, "step": 5654 }, { "epoch": 2.5727934485896267, "grad_norm": 0.5139036009837125, "learning_rate": 2.3864100857174985e-05, "loss": 0.09, "step": 5655 }, { "epoch": 2.573248407643312, "grad_norm": 0.5606864311974736, "learning_rate": 2.385696180408305e-05, "loss": 0.035, "step": 5656 }, { "epoch": 2.5737033666969973, "grad_norm": 0.4483753278796, "learning_rate": 2.384982284439503e-05, "loss": 0.0615, "step": 5657 }, { "epoch": 2.5741583257506826, "grad_norm": 0.651411465230942, "learning_rate": 2.3842683978694297e-05, "loss": 0.0659, "step": 5658 }, { "epoch": 2.5746132848043675, "grad_norm": 0.47120022506766757, "learning_rate": 2.38355452075642e-05, "loss": 0.0741, "step": 5659 }, { "epoch": 2.5750682438580528, "grad_norm": 0.4408085920634104, "learning_rate": 2.3828406531588093e-05, "loss": 0.0335, "step": 5660 }, { "epoch": 2.575523202911738, "grad_norm": 0.6510718211647826, "learning_rate": 2.3821267951349305e-05, "loss": 0.0575, "step": 5661 }, { "epoch": 2.575978161965423, "grad_norm": 0.6260670076055039, "learning_rate": 2.381412946743118e-05, "loss": 0.0582, "step": 5662 }, { "epoch": 2.5764331210191083, "grad_norm": 0.4947021034482374, "learning_rate": 2.3806991080417047e-05, "loss": 0.0523, "step": 5663 }, { "epoch": 2.5768880800727936, "grad_norm": 0.48155560190395047, "learning_rate": 2.3799852790890205e-05, "loss": 0.053, "step": 5664 }, { "epoch": 2.577343039126479, "grad_norm": 0.47039524185799403, "learning_rate": 2.3792714599433983e-05, "loss": 0.0469, "step": 5665 }, { "epoch": 2.5777979981801638, "grad_norm": 0.44707366146999084, "learning_rate": 2.378557650663167e-05, "loss": 0.0464, "step": 5666 }, { "epoch": 2.578252957233849, "grad_norm": 0.7656096355415235, "learning_rate": 2.377843851306656e-05, "loss": 0.0747, "step": 5667 }, { "epoch": 2.5787079162875344, "grad_norm": 0.45383608787168633, "learning_rate": 2.377130061932193e-05, "loss": 0.0404, "step": 5668 }, { "epoch": 2.5791628753412192, "grad_norm": 0.44928885059730006, "learning_rate": 2.3764162825981066e-05, "loss": 0.0436, "step": 5669 }, { "epoch": 2.5796178343949046, "grad_norm": 0.5777542171133059, "learning_rate": 2.3757025133627245e-05, "loss": 0.056, "step": 5670 }, { "epoch": 2.58007279344859, "grad_norm": 0.5309279348650393, "learning_rate": 2.374988754284371e-05, "loss": 0.0779, "step": 5671 }, { "epoch": 2.5805277525022747, "grad_norm": 0.5000198797283987, "learning_rate": 2.3742750054213726e-05, "loss": 0.0354, "step": 5672 }, { "epoch": 2.58098271155596, "grad_norm": 0.43725761173614025, "learning_rate": 2.373561266832052e-05, "loss": 0.0526, "step": 5673 }, { "epoch": 2.5814376706096454, "grad_norm": 0.6675377722658138, "learning_rate": 2.372847538574735e-05, "loss": 0.0712, "step": 5674 }, { "epoch": 2.58189262966333, "grad_norm": 0.5682144940235722, "learning_rate": 2.372133820707741e-05, "loss": 0.06, "step": 5675 }, { "epoch": 2.5823475887170155, "grad_norm": 0.3665588732890273, "learning_rate": 2.3714201132893947e-05, "loss": 0.0238, "step": 5676 }, { "epoch": 2.582802547770701, "grad_norm": 0.6304410296745396, "learning_rate": 2.3707064163780167e-05, "loss": 0.0549, "step": 5677 }, { "epoch": 2.5832575068243857, "grad_norm": 0.39863432840998014, "learning_rate": 2.3699927300319262e-05, "loss": 0.0511, "step": 5678 }, { "epoch": 2.583712465878071, "grad_norm": 0.6716207073957158, "learning_rate": 2.3692790543094426e-05, "loss": 0.0746, "step": 5679 }, { "epoch": 2.5841674249317563, "grad_norm": 0.5169791072218424, "learning_rate": 2.3685653892688844e-05, "loss": 0.064, "step": 5680 }, { "epoch": 2.584622383985441, "grad_norm": 0.5909390793511994, "learning_rate": 2.3678517349685687e-05, "loss": 0.0629, "step": 5681 }, { "epoch": 2.5850773430391265, "grad_norm": 0.6746414584895837, "learning_rate": 2.3671380914668122e-05, "loss": 0.0488, "step": 5682 }, { "epoch": 2.585532302092812, "grad_norm": 0.6189086058774501, "learning_rate": 2.3664244588219313e-05, "loss": 0.0434, "step": 5683 }, { "epoch": 2.5859872611464967, "grad_norm": 0.6714416004006085, "learning_rate": 2.3657108370922405e-05, "loss": 0.0435, "step": 5684 }, { "epoch": 2.586442220200182, "grad_norm": 0.46467825375875166, "learning_rate": 2.364997226336054e-05, "loss": 0.0649, "step": 5685 }, { "epoch": 2.5868971792538673, "grad_norm": 0.4454235812765235, "learning_rate": 2.3642836266116836e-05, "loss": 0.0422, "step": 5686 }, { "epoch": 2.587352138307552, "grad_norm": 0.5577979293842438, "learning_rate": 2.3635700379774433e-05, "loss": 0.0544, "step": 5687 }, { "epoch": 2.5878070973612375, "grad_norm": 0.4333899968335713, "learning_rate": 2.3628564604916427e-05, "loss": 0.0483, "step": 5688 }, { "epoch": 2.588262056414923, "grad_norm": 0.3215383024496318, "learning_rate": 2.362142894212592e-05, "loss": 0.0284, "step": 5689 }, { "epoch": 2.5887170154686077, "grad_norm": 0.3887871855687961, "learning_rate": 2.361429339198603e-05, "loss": 0.031, "step": 5690 }, { "epoch": 2.589171974522293, "grad_norm": 0.7640655013347382, "learning_rate": 2.360715795507982e-05, "loss": 0.0577, "step": 5691 }, { "epoch": 2.5896269335759783, "grad_norm": 0.4418370270175836, "learning_rate": 2.3600022631990372e-05, "loss": 0.0661, "step": 5692 }, { "epoch": 2.590081892629663, "grad_norm": 0.6193429372220801, "learning_rate": 2.359288742330075e-05, "loss": 0.0573, "step": 5693 }, { "epoch": 2.5905368516833485, "grad_norm": 0.569108564752182, "learning_rate": 2.3585752329594024e-05, "loss": 0.0735, "step": 5694 }, { "epoch": 2.5909918107370338, "grad_norm": 0.6638311375042868, "learning_rate": 2.357861735145322e-05, "loss": 0.0421, "step": 5695 }, { "epoch": 2.5914467697907186, "grad_norm": 0.39508162064732955, "learning_rate": 2.3571482489461384e-05, "loss": 0.0258, "step": 5696 }, { "epoch": 2.591901728844404, "grad_norm": 0.43924662422546623, "learning_rate": 2.3564347744201558e-05, "loss": 0.0516, "step": 5697 }, { "epoch": 2.5923566878980893, "grad_norm": 0.4749010766667937, "learning_rate": 2.3557213116256743e-05, "loss": 0.0391, "step": 5698 }, { "epoch": 2.592811646951774, "grad_norm": 0.4158537655324468, "learning_rate": 2.3550078606209964e-05, "loss": 0.0197, "step": 5699 }, { "epoch": 2.5932666060054594, "grad_norm": 0.4245453748250273, "learning_rate": 2.354294421464421e-05, "loss": 0.0433, "step": 5700 }, { "epoch": 2.5937215650591448, "grad_norm": 0.6357932396359844, "learning_rate": 2.3535809942142476e-05, "loss": 0.0479, "step": 5701 }, { "epoch": 2.5941765241128296, "grad_norm": 0.36689556150753805, "learning_rate": 2.352867578928774e-05, "loss": 0.0261, "step": 5702 }, { "epoch": 2.594631483166515, "grad_norm": 0.5205614611042388, "learning_rate": 2.3521541756662965e-05, "loss": 0.0483, "step": 5703 }, { "epoch": 2.5950864422202002, "grad_norm": 0.5592921671015776, "learning_rate": 2.351440784485113e-05, "loss": 0.0532, "step": 5704 }, { "epoch": 2.595541401273885, "grad_norm": 0.5288619542878404, "learning_rate": 2.350727405443518e-05, "loss": 0.0679, "step": 5705 }, { "epoch": 2.5959963603275704, "grad_norm": 0.5005638646165019, "learning_rate": 2.3500140385998052e-05, "loss": 0.0329, "step": 5706 }, { "epoch": 2.5964513193812557, "grad_norm": 0.6264305503560467, "learning_rate": 2.3493006840122676e-05, "loss": 0.0704, "step": 5707 }, { "epoch": 2.5969062784349406, "grad_norm": 0.5220632507814772, "learning_rate": 2.348587341739198e-05, "loss": 0.0772, "step": 5708 }, { "epoch": 2.597361237488626, "grad_norm": 0.5151790621051016, "learning_rate": 2.3478740118388865e-05, "loss": 0.0301, "step": 5709 }, { "epoch": 2.597816196542311, "grad_norm": 0.5088350366660601, "learning_rate": 2.3471606943696232e-05, "loss": 0.0465, "step": 5710 }, { "epoch": 2.598271155595996, "grad_norm": 0.6285796139918639, "learning_rate": 2.3464473893896988e-05, "loss": 0.0282, "step": 5711 }, { "epoch": 2.5987261146496814, "grad_norm": 0.5662040989566292, "learning_rate": 2.3457340969573996e-05, "loss": 0.0591, "step": 5712 }, { "epoch": 2.5991810737033667, "grad_norm": 0.5206464678437185, "learning_rate": 2.345020817131014e-05, "loss": 0.0516, "step": 5713 }, { "epoch": 2.599636032757052, "grad_norm": 0.32785360252897977, "learning_rate": 2.3443075499688274e-05, "loss": 0.0358, "step": 5714 }, { "epoch": 2.600090991810737, "grad_norm": 0.542720159471813, "learning_rate": 2.3435942955291244e-05, "loss": 0.04, "step": 5715 }, { "epoch": 2.600545950864422, "grad_norm": 0.41673605808973974, "learning_rate": 2.3428810538701895e-05, "loss": 0.0531, "step": 5716 }, { "epoch": 2.6010009099181075, "grad_norm": 0.5525713072180474, "learning_rate": 2.3421678250503045e-05, "loss": 0.0336, "step": 5717 }, { "epoch": 2.6014558689717924, "grad_norm": 0.40749231966813804, "learning_rate": 2.3414546091277524e-05, "loss": 0.0307, "step": 5718 }, { "epoch": 2.6019108280254777, "grad_norm": 0.385882858726408, "learning_rate": 2.340741406160814e-05, "loss": 0.0353, "step": 5719 }, { "epoch": 2.602365787079163, "grad_norm": 0.6626052604658128, "learning_rate": 2.3400282162077682e-05, "loss": 0.0292, "step": 5720 }, { "epoch": 2.6028207461328483, "grad_norm": 0.6878612638905982, "learning_rate": 2.339315039326895e-05, "loss": 0.0638, "step": 5721 }, { "epoch": 2.603275705186533, "grad_norm": 0.7305756788128462, "learning_rate": 2.3386018755764705e-05, "loss": 0.0479, "step": 5722 }, { "epoch": 2.6037306642402185, "grad_norm": 0.5877080701177513, "learning_rate": 2.3378887250147723e-05, "loss": 0.0561, "step": 5723 }, { "epoch": 2.604185623293904, "grad_norm": 0.42517696044843345, "learning_rate": 2.3371755877000745e-05, "loss": 0.0366, "step": 5724 }, { "epoch": 2.6046405823475887, "grad_norm": 0.5880421233438384, "learning_rate": 2.3364624636906525e-05, "loss": 0.041, "step": 5725 }, { "epoch": 2.605095541401274, "grad_norm": 0.4356942109934894, "learning_rate": 2.3357493530447806e-05, "loss": 0.0618, "step": 5726 }, { "epoch": 2.6055505004549593, "grad_norm": 0.4691569856774944, "learning_rate": 2.335036255820729e-05, "loss": 0.0611, "step": 5727 }, { "epoch": 2.606005459508644, "grad_norm": 0.4744729079474149, "learning_rate": 2.33432317207677e-05, "loss": 0.0338, "step": 5728 }, { "epoch": 2.6064604185623295, "grad_norm": 0.44725287186247137, "learning_rate": 2.3336101018711725e-05, "loss": 0.035, "step": 5729 }, { "epoch": 2.6069153776160148, "grad_norm": 0.8283173730425223, "learning_rate": 2.3328970452622072e-05, "loss": 0.0718, "step": 5730 }, { "epoch": 2.6073703366696996, "grad_norm": 0.6047055470272484, "learning_rate": 2.332184002308139e-05, "loss": 0.0476, "step": 5731 }, { "epoch": 2.607825295723385, "grad_norm": 0.4522092421370913, "learning_rate": 2.3314709730672372e-05, "loss": 0.0555, "step": 5732 }, { "epoch": 2.6082802547770703, "grad_norm": 0.5151369547880271, "learning_rate": 2.330757957597767e-05, "loss": 0.0638, "step": 5733 }, { "epoch": 2.608735213830755, "grad_norm": 0.4083215740898627, "learning_rate": 2.3300449559579915e-05, "loss": 0.0374, "step": 5734 }, { "epoch": 2.6091901728844404, "grad_norm": 0.46602061992814336, "learning_rate": 2.3293319682061752e-05, "loss": 0.0592, "step": 5735 }, { "epoch": 2.6096451319381258, "grad_norm": 0.44972926711306216, "learning_rate": 2.3286189944005793e-05, "loss": 0.0271, "step": 5736 }, { "epoch": 2.6101000909918106, "grad_norm": 0.4936539029575609, "learning_rate": 2.327906034599466e-05, "loss": 0.0468, "step": 5737 }, { "epoch": 2.610555050045496, "grad_norm": 0.4341504571698978, "learning_rate": 2.3271930888610927e-05, "loss": 0.0377, "step": 5738 }, { "epoch": 2.6110100090991812, "grad_norm": 0.5343135135879318, "learning_rate": 2.3264801572437206e-05, "loss": 0.0296, "step": 5739 }, { "epoch": 2.611464968152866, "grad_norm": 0.5442451654472918, "learning_rate": 2.3257672398056072e-05, "loss": 0.061, "step": 5740 }, { "epoch": 2.6119199272065514, "grad_norm": 0.43447040973081336, "learning_rate": 2.3250543366050074e-05, "loss": 0.0394, "step": 5741 }, { "epoch": 2.6123748862602367, "grad_norm": 0.4313103763307572, "learning_rate": 2.3243414477001776e-05, "loss": 0.0398, "step": 5742 }, { "epoch": 2.6128298453139216, "grad_norm": 0.6254312387986631, "learning_rate": 2.323628573149371e-05, "loss": 0.0709, "step": 5743 }, { "epoch": 2.613284804367607, "grad_norm": 0.546263559071124, "learning_rate": 2.3229157130108418e-05, "loss": 0.0602, "step": 5744 }, { "epoch": 2.613739763421292, "grad_norm": 0.5686843408947064, "learning_rate": 2.3222028673428394e-05, "loss": 0.0515, "step": 5745 }, { "epoch": 2.614194722474977, "grad_norm": 0.4499077246940125, "learning_rate": 2.3214900362036163e-05, "loss": 0.0411, "step": 5746 }, { "epoch": 2.6146496815286624, "grad_norm": 0.5113986262840144, "learning_rate": 2.3207772196514217e-05, "loss": 0.0408, "step": 5747 }, { "epoch": 2.6151046405823477, "grad_norm": 0.6178977617579792, "learning_rate": 2.3200644177445037e-05, "loss": 0.0537, "step": 5748 }, { "epoch": 2.6155595996360326, "grad_norm": 0.5753784217224742, "learning_rate": 2.319351630541108e-05, "loss": 0.0507, "step": 5749 }, { "epoch": 2.616014558689718, "grad_norm": 0.38428534360428024, "learning_rate": 2.318638858099482e-05, "loss": 0.0475, "step": 5750 }, { "epoch": 2.616469517743403, "grad_norm": 0.469455555967972, "learning_rate": 2.317926100477869e-05, "loss": 0.0421, "step": 5751 }, { "epoch": 2.616924476797088, "grad_norm": 2.8950380883179156, "learning_rate": 2.317213357734512e-05, "loss": 0.1928, "step": 5752 }, { "epoch": 2.6173794358507734, "grad_norm": 0.8221680144379903, "learning_rate": 2.3165006299276553e-05, "loss": 0.0554, "step": 5753 }, { "epoch": 2.6178343949044587, "grad_norm": 0.4160455271161314, "learning_rate": 2.3157879171155377e-05, "loss": 0.0366, "step": 5754 }, { "epoch": 2.6182893539581436, "grad_norm": 0.46079863599051335, "learning_rate": 2.3150752193564004e-05, "loss": 0.055, "step": 5755 }, { "epoch": 2.618744313011829, "grad_norm": 0.4557263610150498, "learning_rate": 2.3143625367084802e-05, "loss": 0.0323, "step": 5756 }, { "epoch": 2.619199272065514, "grad_norm": 0.47611755832545666, "learning_rate": 2.3136498692300162e-05, "loss": 0.0609, "step": 5757 }, { "epoch": 2.619654231119199, "grad_norm": 0.5568503601695207, "learning_rate": 2.3129372169792426e-05, "loss": 0.0483, "step": 5758 }, { "epoch": 2.6201091901728844, "grad_norm": 0.5594408992754019, "learning_rate": 2.312224580014394e-05, "loss": 0.0444, "step": 5759 }, { "epoch": 2.6205641492265697, "grad_norm": 0.5857534538508824, "learning_rate": 2.311511958393706e-05, "loss": 0.0481, "step": 5760 }, { "epoch": 2.6210191082802545, "grad_norm": 0.3759436148106875, "learning_rate": 2.310799352175409e-05, "loss": 0.0326, "step": 5761 }, { "epoch": 2.62147406733394, "grad_norm": 0.5595381316317446, "learning_rate": 2.3100867614177353e-05, "loss": 0.0612, "step": 5762 }, { "epoch": 2.621929026387625, "grad_norm": 0.3437091799274789, "learning_rate": 2.3093741861789133e-05, "loss": 0.0375, "step": 5763 }, { "epoch": 2.62238398544131, "grad_norm": 0.40653966795648705, "learning_rate": 2.3086616265171722e-05, "loss": 0.0406, "step": 5764 }, { "epoch": 2.6228389444949953, "grad_norm": 0.4148939452315395, "learning_rate": 2.3079490824907384e-05, "loss": 0.0528, "step": 5765 }, { "epoch": 2.6232939035486806, "grad_norm": 0.5344853688736766, "learning_rate": 2.307236554157838e-05, "loss": 0.0734, "step": 5766 }, { "epoch": 2.623748862602366, "grad_norm": 0.32935345291875273, "learning_rate": 2.3065240415766966e-05, "loss": 0.044, "step": 5767 }, { "epoch": 2.624203821656051, "grad_norm": 0.41341023319233294, "learning_rate": 2.3058115448055363e-05, "loss": 0.0358, "step": 5768 }, { "epoch": 2.624658780709736, "grad_norm": 0.4370612640417141, "learning_rate": 2.3050990639025805e-05, "loss": 0.0404, "step": 5769 }, { "epoch": 2.6251137397634214, "grad_norm": 0.509122610243724, "learning_rate": 2.304386598926048e-05, "loss": 0.0362, "step": 5770 }, { "epoch": 2.6255686988171063, "grad_norm": 0.5334832664980266, "learning_rate": 2.3036741499341604e-05, "loss": 0.04, "step": 5771 }, { "epoch": 2.6260236578707916, "grad_norm": 0.491318962934088, "learning_rate": 2.3029617169851336e-05, "loss": 0.0478, "step": 5772 }, { "epoch": 2.626478616924477, "grad_norm": 0.480875272292522, "learning_rate": 2.3022493001371854e-05, "loss": 0.0569, "step": 5773 }, { "epoch": 2.6269335759781622, "grad_norm": 0.35676953285525786, "learning_rate": 2.301536899448532e-05, "loss": 0.0265, "step": 5774 }, { "epoch": 2.627388535031847, "grad_norm": 0.47639296169007894, "learning_rate": 2.3008245149773867e-05, "loss": 0.048, "step": 5775 }, { "epoch": 2.6278434940855324, "grad_norm": 0.45246995965584835, "learning_rate": 2.3001121467819627e-05, "loss": 0.0565, "step": 5776 }, { "epoch": 2.6282984531392177, "grad_norm": 0.4993601691285959, "learning_rate": 2.2993997949204713e-05, "loss": 0.0651, "step": 5777 }, { "epoch": 2.6287534121929026, "grad_norm": 0.45097854566707746, "learning_rate": 2.298687459451123e-05, "loss": 0.0411, "step": 5778 }, { "epoch": 2.629208371246588, "grad_norm": 0.42992758334854614, "learning_rate": 2.297975140432126e-05, "loss": 0.0477, "step": 5779 }, { "epoch": 2.629663330300273, "grad_norm": 0.9505151656934254, "learning_rate": 2.2972628379216875e-05, "loss": 0.0792, "step": 5780 }, { "epoch": 2.630118289353958, "grad_norm": 0.6684801197274788, "learning_rate": 2.2965505519780155e-05, "loss": 0.0473, "step": 5781 }, { "epoch": 2.6305732484076434, "grad_norm": 0.5329125808642439, "learning_rate": 2.2958382826593132e-05, "loss": 0.0492, "step": 5782 }, { "epoch": 2.6310282074613287, "grad_norm": 0.5641893012486157, "learning_rate": 2.2951260300237847e-05, "loss": 0.0481, "step": 5783 }, { "epoch": 2.6314831665150136, "grad_norm": 0.5341563962914352, "learning_rate": 2.294413794129632e-05, "loss": 0.0773, "step": 5784 }, { "epoch": 2.631938125568699, "grad_norm": 0.4721606979352544, "learning_rate": 2.2937015750350552e-05, "loss": 0.0569, "step": 5785 }, { "epoch": 2.632393084622384, "grad_norm": 0.5474664282870414, "learning_rate": 2.2929893727982547e-05, "loss": 0.0789, "step": 5786 }, { "epoch": 2.632848043676069, "grad_norm": 0.4009279636269705, "learning_rate": 2.2922771874774263e-05, "loss": 0.034, "step": 5787 }, { "epoch": 2.6333030027297544, "grad_norm": 0.453653554535206, "learning_rate": 2.2915650191307688e-05, "loss": 0.0715, "step": 5788 }, { "epoch": 2.6337579617834397, "grad_norm": 0.49526890106506094, "learning_rate": 2.2908528678164772e-05, "loss": 0.0802, "step": 5789 }, { "epoch": 2.6342129208371245, "grad_norm": 0.56857372624486, "learning_rate": 2.2901407335927442e-05, "loss": 0.0435, "step": 5790 }, { "epoch": 2.63466787989081, "grad_norm": 1.1172090949025792, "learning_rate": 2.2894286165177632e-05, "loss": 0.0421, "step": 5791 }, { "epoch": 2.635122838944495, "grad_norm": 0.44194167907050297, "learning_rate": 2.2887165166497242e-05, "loss": 0.0598, "step": 5792 }, { "epoch": 2.63557779799818, "grad_norm": 0.49878001718234155, "learning_rate": 2.288004434046818e-05, "loss": 0.0532, "step": 5793 }, { "epoch": 2.6360327570518653, "grad_norm": 0.4678946912236897, "learning_rate": 2.287292368767231e-05, "loss": 0.0554, "step": 5794 }, { "epoch": 2.6364877161055507, "grad_norm": 0.5438014371247417, "learning_rate": 2.2865803208691515e-05, "loss": 0.0416, "step": 5795 }, { "epoch": 2.6369426751592355, "grad_norm": 0.3813682132701843, "learning_rate": 2.285868290410765e-05, "loss": 0.0581, "step": 5796 }, { "epoch": 2.637397634212921, "grad_norm": 0.4155819745565378, "learning_rate": 2.2851562774502542e-05, "loss": 0.0539, "step": 5797 }, { "epoch": 2.637852593266606, "grad_norm": 0.5002807009024308, "learning_rate": 2.284444282045803e-05, "loss": 0.0794, "step": 5798 }, { "epoch": 2.638307552320291, "grad_norm": 0.5843282554863417, "learning_rate": 2.2837323042555906e-05, "loss": 0.0642, "step": 5799 }, { "epoch": 2.6387625113739763, "grad_norm": 0.6103006083005632, "learning_rate": 2.2830203441377985e-05, "loss": 0.0611, "step": 5800 }, { "epoch": 2.6392174704276616, "grad_norm": 0.5926525981872542, "learning_rate": 2.2823084017506024e-05, "loss": 0.0803, "step": 5801 }, { "epoch": 2.6396724294813465, "grad_norm": 0.5260905979708044, "learning_rate": 2.2815964771521818e-05, "loss": 0.0535, "step": 5802 }, { "epoch": 2.640127388535032, "grad_norm": 0.40492956651450496, "learning_rate": 2.280884570400711e-05, "loss": 0.0527, "step": 5803 }, { "epoch": 2.640582347588717, "grad_norm": 0.5324349351605755, "learning_rate": 2.2801726815543633e-05, "loss": 0.0947, "step": 5804 }, { "epoch": 2.641037306642402, "grad_norm": 0.48362316018254836, "learning_rate": 2.2794608106713116e-05, "loss": 0.0572, "step": 5805 }, { "epoch": 2.6414922656960873, "grad_norm": 0.37652892799079846, "learning_rate": 2.2787489578097264e-05, "loss": 0.0369, "step": 5806 }, { "epoch": 2.6419472247497726, "grad_norm": 0.4486258289234133, "learning_rate": 2.2780371230277772e-05, "loss": 0.041, "step": 5807 }, { "epoch": 2.6424021838034575, "grad_norm": 0.42024846747973554, "learning_rate": 2.2773253063836312e-05, "loss": 0.0379, "step": 5808 }, { "epoch": 2.642857142857143, "grad_norm": 0.3431491393146498, "learning_rate": 2.2766135079354557e-05, "loss": 0.0254, "step": 5809 }, { "epoch": 2.643312101910828, "grad_norm": 0.6208191363704632, "learning_rate": 2.2759017277414166e-05, "loss": 0.0606, "step": 5810 }, { "epoch": 2.643767060964513, "grad_norm": 0.45472645408662254, "learning_rate": 2.2751899658596755e-05, "loss": 0.0489, "step": 5811 }, { "epoch": 2.6442220200181983, "grad_norm": 0.4565747737194272, "learning_rate": 2.2744782223483958e-05, "loss": 0.0443, "step": 5812 }, { "epoch": 2.6446769790718836, "grad_norm": 0.41088263971922073, "learning_rate": 2.2737664972657365e-05, "loss": 0.0254, "step": 5813 }, { "epoch": 2.6451319381255685, "grad_norm": 0.503524862755163, "learning_rate": 2.2730547906698582e-05, "loss": 0.0625, "step": 5814 }, { "epoch": 2.6455868971792538, "grad_norm": 0.5198355607292341, "learning_rate": 2.2723431026189162e-05, "loss": 0.0417, "step": 5815 }, { "epoch": 2.646041856232939, "grad_norm": 0.3936420133080471, "learning_rate": 2.2716314331710685e-05, "loss": 0.044, "step": 5816 }, { "epoch": 2.646496815286624, "grad_norm": 0.45869472293240965, "learning_rate": 2.2709197823844693e-05, "loss": 0.0553, "step": 5817 }, { "epoch": 2.6469517743403093, "grad_norm": 0.49862407989575447, "learning_rate": 2.2702081503172708e-05, "loss": 0.0405, "step": 5818 }, { "epoch": 2.6474067333939946, "grad_norm": 0.5355118611246024, "learning_rate": 2.2694965370276244e-05, "loss": 0.0679, "step": 5819 }, { "epoch": 2.6478616924476794, "grad_norm": 0.41938168114968827, "learning_rate": 2.2687849425736805e-05, "loss": 0.0404, "step": 5820 }, { "epoch": 2.6483166515013647, "grad_norm": 0.9660343084060399, "learning_rate": 2.268073367013586e-05, "loss": 0.0713, "step": 5821 }, { "epoch": 2.64877161055505, "grad_norm": 0.5414611838721645, "learning_rate": 2.2673618104054882e-05, "loss": 0.0676, "step": 5822 }, { "epoch": 2.6492265696087354, "grad_norm": 0.5490549303329155, "learning_rate": 2.2666502728075338e-05, "loss": 0.0372, "step": 5823 }, { "epoch": 2.6496815286624202, "grad_norm": 0.344439665175893, "learning_rate": 2.2659387542778648e-05, "loss": 0.0234, "step": 5824 }, { "epoch": 2.6501364877161055, "grad_norm": 0.4003038002182466, "learning_rate": 2.2652272548746246e-05, "loss": 0.0697, "step": 5825 }, { "epoch": 2.650591446769791, "grad_norm": 0.4938399958609477, "learning_rate": 2.264515774655952e-05, "loss": 0.033, "step": 5826 }, { "epoch": 2.6510464058234757, "grad_norm": 0.5585465290426453, "learning_rate": 2.2638043136799873e-05, "loss": 0.0711, "step": 5827 }, { "epoch": 2.651501364877161, "grad_norm": 0.4923361250514678, "learning_rate": 2.2630928720048672e-05, "loss": 0.0775, "step": 5828 }, { "epoch": 2.6519563239308463, "grad_norm": 0.4175097509275799, "learning_rate": 2.262381449688727e-05, "loss": 0.034, "step": 5829 }, { "epoch": 2.6524112829845317, "grad_norm": 0.4735661493789965, "learning_rate": 2.261670046789703e-05, "loss": 0.0607, "step": 5830 }, { "epoch": 2.6528662420382165, "grad_norm": 0.3214933545240895, "learning_rate": 2.2609586633659256e-05, "loss": 0.0286, "step": 5831 }, { "epoch": 2.653321201091902, "grad_norm": 0.45114571213855403, "learning_rate": 2.2602472994755276e-05, "loss": 0.0602, "step": 5832 }, { "epoch": 2.653776160145587, "grad_norm": 0.5178105044801509, "learning_rate": 2.2595359551766364e-05, "loss": 0.048, "step": 5833 }, { "epoch": 2.654231119199272, "grad_norm": 0.42568505036557264, "learning_rate": 2.2588246305273823e-05, "loss": 0.0362, "step": 5834 }, { "epoch": 2.6546860782529573, "grad_norm": 0.3642301142319768, "learning_rate": 2.2581133255858895e-05, "loss": 0.0456, "step": 5835 }, { "epoch": 2.6551410373066426, "grad_norm": 0.34804610421909055, "learning_rate": 2.2574020404102823e-05, "loss": 0.0406, "step": 5836 }, { "epoch": 2.6555959963603275, "grad_norm": 0.6679754738744302, "learning_rate": 2.2566907750586867e-05, "loss": 0.0628, "step": 5837 }, { "epoch": 2.656050955414013, "grad_norm": 0.3274375090218894, "learning_rate": 2.255979529589221e-05, "loss": 0.0316, "step": 5838 }, { "epoch": 2.656505914467698, "grad_norm": 0.4705094764113618, "learning_rate": 2.2552683040600073e-05, "loss": 0.056, "step": 5839 }, { "epoch": 2.656960873521383, "grad_norm": 1.082260413572846, "learning_rate": 2.2545570985291616e-05, "loss": 0.0775, "step": 5840 }, { "epoch": 2.6574158325750683, "grad_norm": 0.42619124997537017, "learning_rate": 2.253845913054802e-05, "loss": 0.0384, "step": 5841 }, { "epoch": 2.6578707916287536, "grad_norm": 0.4044641541855415, "learning_rate": 2.2531347476950423e-05, "loss": 0.0346, "step": 5842 }, { "epoch": 2.6583257506824385, "grad_norm": 0.40730552252714614, "learning_rate": 2.2524236025079957e-05, "loss": 0.0472, "step": 5843 }, { "epoch": 2.658780709736124, "grad_norm": 0.620337723410297, "learning_rate": 2.2517124775517753e-05, "loss": 0.0486, "step": 5844 }, { "epoch": 2.659235668789809, "grad_norm": 0.3756926023204291, "learning_rate": 2.2510013728844894e-05, "loss": 0.0483, "step": 5845 }, { "epoch": 2.659690627843494, "grad_norm": 0.5859779640968235, "learning_rate": 2.2502902885642474e-05, "loss": 0.0459, "step": 5846 }, { "epoch": 2.6601455868971793, "grad_norm": 0.4001940965923679, "learning_rate": 2.249579224649155e-05, "loss": 0.0395, "step": 5847 }, { "epoch": 2.6606005459508646, "grad_norm": 0.4710401806197161, "learning_rate": 2.2488681811973178e-05, "loss": 0.036, "step": 5848 }, { "epoch": 2.6610555050045495, "grad_norm": 0.4016413388298426, "learning_rate": 2.248157158266838e-05, "loss": 0.0351, "step": 5849 }, { "epoch": 2.6615104640582348, "grad_norm": 0.36686867743413476, "learning_rate": 2.2474461559158178e-05, "loss": 0.0299, "step": 5850 }, { "epoch": 2.66196542311192, "grad_norm": 0.4325351914466157, "learning_rate": 2.246735174202358e-05, "loss": 0.0328, "step": 5851 }, { "epoch": 2.662420382165605, "grad_norm": 0.5022567677096583, "learning_rate": 2.2460242131845554e-05, "loss": 0.0408, "step": 5852 }, { "epoch": 2.6628753412192903, "grad_norm": 0.4991461965359129, "learning_rate": 2.245313272920508e-05, "loss": 0.057, "step": 5853 }, { "epoch": 2.6633303002729756, "grad_norm": 0.6429698529874431, "learning_rate": 2.2446023534683088e-05, "loss": 0.0796, "step": 5854 }, { "epoch": 2.6637852593266604, "grad_norm": 0.4611059815471597, "learning_rate": 2.243891454886053e-05, "loss": 0.0554, "step": 5855 }, { "epoch": 2.6642402183803457, "grad_norm": 0.38900525137221115, "learning_rate": 2.2431805772318307e-05, "loss": 0.0408, "step": 5856 }, { "epoch": 2.664695177434031, "grad_norm": 0.6835404250594459, "learning_rate": 2.2424697205637306e-05, "loss": 0.0334, "step": 5857 }, { "epoch": 2.665150136487716, "grad_norm": 0.4738888091619409, "learning_rate": 2.2417588849398426e-05, "loss": 0.0358, "step": 5858 }, { "epoch": 2.6656050955414012, "grad_norm": 0.4476241457697317, "learning_rate": 2.2410480704182528e-05, "loss": 0.0603, "step": 5859 }, { "epoch": 2.6660600545950865, "grad_norm": 0.5815234937568909, "learning_rate": 2.240337277057045e-05, "loss": 0.0439, "step": 5860 }, { "epoch": 2.6665150136487714, "grad_norm": 0.44456240998395974, "learning_rate": 2.2396265049143024e-05, "loss": 0.0565, "step": 5861 }, { "epoch": 2.6669699727024567, "grad_norm": 0.8464375262013383, "learning_rate": 2.238915754048106e-05, "loss": 0.0585, "step": 5862 }, { "epoch": 2.667424931756142, "grad_norm": 0.35860851880748024, "learning_rate": 2.2382050245165352e-05, "loss": 0.0405, "step": 5863 }, { "epoch": 2.667879890809827, "grad_norm": 0.4240532012674243, "learning_rate": 2.2374943163776666e-05, "loss": 0.0304, "step": 5864 }, { "epoch": 2.668334849863512, "grad_norm": 0.508112592077324, "learning_rate": 2.2367836296895776e-05, "loss": 0.049, "step": 5865 }, { "epoch": 2.6687898089171975, "grad_norm": 0.4399860227783622, "learning_rate": 2.236072964510342e-05, "loss": 0.0436, "step": 5866 }, { "epoch": 2.6692447679708824, "grad_norm": 0.6965294729487945, "learning_rate": 2.2353623208980316e-05, "loss": 0.0563, "step": 5867 }, { "epoch": 2.6696997270245677, "grad_norm": 0.5618620932509044, "learning_rate": 2.2346516989107177e-05, "loss": 0.0732, "step": 5868 }, { "epoch": 2.670154686078253, "grad_norm": 0.5133917531833098, "learning_rate": 2.2339410986064683e-05, "loss": 0.0562, "step": 5869 }, { "epoch": 2.670609645131938, "grad_norm": 0.5328974198597438, "learning_rate": 2.233230520043351e-05, "loss": 0.0456, "step": 5870 }, { "epoch": 2.671064604185623, "grad_norm": 0.6289399830017458, "learning_rate": 2.2325199632794298e-05, "loss": 0.068, "step": 5871 }, { "epoch": 2.6715195632393085, "grad_norm": 0.5224157592983018, "learning_rate": 2.23180942837277e-05, "loss": 0.0417, "step": 5872 }, { "epoch": 2.6719745222929934, "grad_norm": 1.1446454577032812, "learning_rate": 2.2310989153814333e-05, "loss": 0.0447, "step": 5873 }, { "epoch": 2.6724294813466787, "grad_norm": 0.5115704994428732, "learning_rate": 2.230388424363478e-05, "loss": 0.0476, "step": 5874 }, { "epoch": 2.672884440400364, "grad_norm": 0.4127706504227122, "learning_rate": 2.2296779553769638e-05, "loss": 0.049, "step": 5875 }, { "epoch": 2.673339399454049, "grad_norm": 0.3658183541711587, "learning_rate": 2.228967508479946e-05, "loss": 0.033, "step": 5876 }, { "epoch": 2.673794358507734, "grad_norm": 0.5551390170564624, "learning_rate": 2.2282570837304798e-05, "loss": 0.068, "step": 5877 }, { "epoch": 2.6742493175614195, "grad_norm": 0.49309391786759615, "learning_rate": 2.2275466811866163e-05, "loss": 0.0635, "step": 5878 }, { "epoch": 2.674704276615105, "grad_norm": 0.4428196350581227, "learning_rate": 2.2268363009064083e-05, "loss": 0.0504, "step": 5879 }, { "epoch": 2.6751592356687897, "grad_norm": 0.48659654882605935, "learning_rate": 2.2261259429479048e-05, "loss": 0.0425, "step": 5880 }, { "epoch": 2.675614194722475, "grad_norm": 0.5745082278738365, "learning_rate": 2.2254156073691518e-05, "loss": 0.0438, "step": 5881 }, { "epoch": 2.6760691537761603, "grad_norm": 0.4408456648660868, "learning_rate": 2.2247052942281956e-05, "loss": 0.0429, "step": 5882 }, { "epoch": 2.676524112829845, "grad_norm": 0.39543826274842336, "learning_rate": 2.2239950035830795e-05, "loss": 0.0452, "step": 5883 }, { "epoch": 2.6769790718835305, "grad_norm": 0.4760513358759914, "learning_rate": 2.2232847354918457e-05, "loss": 0.0327, "step": 5884 }, { "epoch": 2.6774340309372158, "grad_norm": 0.5895765510800465, "learning_rate": 2.222574490012532e-05, "loss": 0.0573, "step": 5885 }, { "epoch": 2.677888989990901, "grad_norm": 0.5036290553047188, "learning_rate": 2.2218642672031796e-05, "loss": 0.0399, "step": 5886 }, { "epoch": 2.678343949044586, "grad_norm": 0.5419053717182158, "learning_rate": 2.2211540671218233e-05, "loss": 0.0458, "step": 5887 }, { "epoch": 2.6787989080982713, "grad_norm": 0.37343180332632253, "learning_rate": 2.220443889826497e-05, "loss": 0.0396, "step": 5888 }, { "epoch": 2.6792538671519566, "grad_norm": 0.4285186263185177, "learning_rate": 2.219733735375234e-05, "loss": 0.032, "step": 5889 }, { "epoch": 2.6797088262056414, "grad_norm": 0.4338415605633316, "learning_rate": 2.2190236038260646e-05, "loss": 0.067, "step": 5890 }, { "epoch": 2.6801637852593267, "grad_norm": 0.5075616414877185, "learning_rate": 2.2183134952370155e-05, "loss": 0.0531, "step": 5891 }, { "epoch": 2.680618744313012, "grad_norm": 0.3772948124074021, "learning_rate": 2.2176034096661173e-05, "loss": 0.0554, "step": 5892 }, { "epoch": 2.681073703366697, "grad_norm": 0.3681377279794007, "learning_rate": 2.2168933471713933e-05, "loss": 0.0255, "step": 5893 }, { "epoch": 2.6815286624203822, "grad_norm": 0.623538301282629, "learning_rate": 2.2161833078108657e-05, "loss": 0.0539, "step": 5894 }, { "epoch": 2.6819836214740675, "grad_norm": 0.4089419074705362, "learning_rate": 2.215473291642557e-05, "loss": 0.0225, "step": 5895 }, { "epoch": 2.6824385805277524, "grad_norm": 0.5575270202652739, "learning_rate": 2.2147632987244854e-05, "loss": 0.0626, "step": 5896 }, { "epoch": 2.6828935395814377, "grad_norm": 0.5250146493972752, "learning_rate": 2.2140533291146697e-05, "loss": 0.0452, "step": 5897 }, { "epoch": 2.683348498635123, "grad_norm": 0.3525761732045285, "learning_rate": 2.2133433828711232e-05, "loss": 0.0508, "step": 5898 }, { "epoch": 2.683803457688808, "grad_norm": 0.5889191182304386, "learning_rate": 2.212633460051862e-05, "loss": 0.0304, "step": 5899 }, { "epoch": 2.684258416742493, "grad_norm": 0.29681584046649595, "learning_rate": 2.211923560714897e-05, "loss": 0.0144, "step": 5900 }, { "epoch": 2.6847133757961785, "grad_norm": 0.403259364801587, "learning_rate": 2.2112136849182368e-05, "loss": 0.0323, "step": 5901 }, { "epoch": 2.6851683348498634, "grad_norm": 0.416926327059606, "learning_rate": 2.2105038327198914e-05, "loss": 0.0554, "step": 5902 }, { "epoch": 2.6856232939035487, "grad_norm": 0.5726927700567032, "learning_rate": 2.2097940041778643e-05, "loss": 0.0531, "step": 5903 }, { "epoch": 2.686078252957234, "grad_norm": 0.5344078730482182, "learning_rate": 2.209084199350161e-05, "loss": 0.0503, "step": 5904 }, { "epoch": 2.686533212010919, "grad_norm": 0.43142688822417585, "learning_rate": 2.2083744182947828e-05, "loss": 0.039, "step": 5905 }, { "epoch": 2.686988171064604, "grad_norm": 0.6963211225019157, "learning_rate": 2.20766466106973e-05, "loss": 0.0296, "step": 5906 }, { "epoch": 2.6874431301182895, "grad_norm": 0.5386035956515998, "learning_rate": 2.206954927733002e-05, "loss": 0.0588, "step": 5907 }, { "epoch": 2.6878980891719744, "grad_norm": 0.539837300671694, "learning_rate": 2.206245218342593e-05, "loss": 0.0781, "step": 5908 }, { "epoch": 2.6883530482256597, "grad_norm": 0.5098742551834576, "learning_rate": 2.2055355329564988e-05, "loss": 0.0469, "step": 5909 }, { "epoch": 2.688808007279345, "grad_norm": 0.4823919820289071, "learning_rate": 2.204825871632711e-05, "loss": 0.0532, "step": 5910 }, { "epoch": 2.68926296633303, "grad_norm": 0.46112518432935645, "learning_rate": 2.20411623442922e-05, "loss": 0.065, "step": 5911 }, { "epoch": 2.689717925386715, "grad_norm": 0.6994015465698566, "learning_rate": 2.2034066214040127e-05, "loss": 0.049, "step": 5912 }, { "epoch": 2.6901728844404005, "grad_norm": 0.4097417080215588, "learning_rate": 2.202697032615078e-05, "loss": 0.0416, "step": 5913 }, { "epoch": 2.6906278434940853, "grad_norm": 0.6746226956623274, "learning_rate": 2.2019874681203996e-05, "loss": 0.0909, "step": 5914 }, { "epoch": 2.6910828025477707, "grad_norm": 0.4948089750248944, "learning_rate": 2.201277927977959e-05, "loss": 0.0449, "step": 5915 }, { "epoch": 2.691537761601456, "grad_norm": 0.48860219830575224, "learning_rate": 2.2005684122457377e-05, "loss": 0.0796, "step": 5916 }, { "epoch": 2.691992720655141, "grad_norm": 0.6385059732217075, "learning_rate": 2.199858920981713e-05, "loss": 0.0426, "step": 5917 }, { "epoch": 2.692447679708826, "grad_norm": 0.4143882219535619, "learning_rate": 2.199149454243862e-05, "loss": 0.0472, "step": 5918 }, { "epoch": 2.6929026387625115, "grad_norm": 0.48551938244188825, "learning_rate": 2.1984400120901582e-05, "loss": 0.0406, "step": 5919 }, { "epoch": 2.6933575978161963, "grad_norm": 0.6756784922307804, "learning_rate": 2.1977305945785755e-05, "loss": 0.0569, "step": 5920 }, { "epoch": 2.6938125568698816, "grad_norm": 0.6417857370027692, "learning_rate": 2.1970212017670837e-05, "loss": 0.0534, "step": 5921 }, { "epoch": 2.694267515923567, "grad_norm": 0.6097422281080599, "learning_rate": 2.1963118337136507e-05, "loss": 0.0649, "step": 5922 }, { "epoch": 2.694722474977252, "grad_norm": 0.6528608474714616, "learning_rate": 2.195602490476244e-05, "loss": 0.0551, "step": 5923 }, { "epoch": 2.695177434030937, "grad_norm": 0.3328851179099179, "learning_rate": 2.1948931721128263e-05, "loss": 0.042, "step": 5924 }, { "epoch": 2.6956323930846224, "grad_norm": 0.49911257468841475, "learning_rate": 2.1941838786813612e-05, "loss": 0.048, "step": 5925 }, { "epoch": 2.6960873521383073, "grad_norm": 0.5081550546487115, "learning_rate": 2.1934746102398075e-05, "loss": 0.0535, "step": 5926 }, { "epoch": 2.6965423111919926, "grad_norm": 0.49706796592663327, "learning_rate": 2.192765366846125e-05, "loss": 0.0437, "step": 5927 }, { "epoch": 2.696997270245678, "grad_norm": 0.4042868632576277, "learning_rate": 2.1920561485582696e-05, "loss": 0.0394, "step": 5928 }, { "epoch": 2.697452229299363, "grad_norm": 0.5607633300169651, "learning_rate": 2.1913469554341954e-05, "loss": 0.0736, "step": 5929 }, { "epoch": 2.697907188353048, "grad_norm": 0.3761334950324731, "learning_rate": 2.1906377875318533e-05, "loss": 0.0449, "step": 5930 }, { "epoch": 2.6983621474067334, "grad_norm": 0.5672630709835306, "learning_rate": 2.189928644909195e-05, "loss": 0.0636, "step": 5931 }, { "epoch": 2.6988171064604187, "grad_norm": 0.4058680329305384, "learning_rate": 2.1892195276241666e-05, "loss": 0.044, "step": 5932 }, { "epoch": 2.6992720655141036, "grad_norm": 0.3583373003473687, "learning_rate": 2.1885104357347147e-05, "loss": 0.0525, "step": 5933 }, { "epoch": 2.699727024567789, "grad_norm": 0.45066210860577793, "learning_rate": 2.1878013692987844e-05, "loss": 0.0331, "step": 5934 }, { "epoch": 2.700181983621474, "grad_norm": 0.41925601721879147, "learning_rate": 2.1870923283743156e-05, "loss": 0.0481, "step": 5935 }, { "epoch": 2.700636942675159, "grad_norm": 0.6841646729293432, "learning_rate": 2.1863833130192494e-05, "loss": 0.0678, "step": 5936 }, { "epoch": 2.7010919017288444, "grad_norm": 0.4686604797372016, "learning_rate": 2.185674323291522e-05, "loss": 0.0939, "step": 5937 }, { "epoch": 2.7015468607825297, "grad_norm": 0.6533720759666163, "learning_rate": 2.18496535924907e-05, "loss": 0.0506, "step": 5938 }, { "epoch": 2.702001819836215, "grad_norm": 0.6971015168682578, "learning_rate": 2.1842564209498253e-05, "loss": 0.0434, "step": 5939 }, { "epoch": 2.7024567788899, "grad_norm": 0.6131469479206585, "learning_rate": 2.1835475084517198e-05, "loss": 0.0323, "step": 5940 }, { "epoch": 2.702911737943585, "grad_norm": 0.43175349144394737, "learning_rate": 2.1828386218126838e-05, "loss": 0.0343, "step": 5941 }, { "epoch": 2.7033666969972705, "grad_norm": 0.45813828528189754, "learning_rate": 2.182129761090643e-05, "loss": 0.0423, "step": 5942 }, { "epoch": 2.7038216560509554, "grad_norm": 0.5214846054566591, "learning_rate": 2.1814209263435226e-05, "loss": 0.0444, "step": 5943 }, { "epoch": 2.7042766151046407, "grad_norm": 0.47015296761273645, "learning_rate": 2.1807121176292453e-05, "loss": 0.0416, "step": 5944 }, { "epoch": 2.704731574158326, "grad_norm": 0.5533848189112269, "learning_rate": 2.1800033350057323e-05, "loss": 0.0463, "step": 5945 }, { "epoch": 2.705186533212011, "grad_norm": 0.3440544451282442, "learning_rate": 2.1792945785309012e-05, "loss": 0.029, "step": 5946 }, { "epoch": 2.705641492265696, "grad_norm": 0.5089459758648578, "learning_rate": 2.178585848262668e-05, "loss": 0.0436, "step": 5947 }, { "epoch": 2.7060964513193815, "grad_norm": 1.3475056680392774, "learning_rate": 2.177877144258949e-05, "loss": 0.2623, "step": 5948 }, { "epoch": 2.7065514103730663, "grad_norm": 0.5254886654274252, "learning_rate": 2.177168466577655e-05, "loss": 0.0649, "step": 5949 }, { "epoch": 2.7070063694267517, "grad_norm": 0.4756127505248916, "learning_rate": 2.176459815276696e-05, "loss": 0.043, "step": 5950 }, { "epoch": 2.707461328480437, "grad_norm": 0.45620371815569255, "learning_rate": 2.1757511904139793e-05, "loss": 0.0406, "step": 5951 }, { "epoch": 2.707916287534122, "grad_norm": 0.4912218619116507, "learning_rate": 2.175042592047412e-05, "loss": 0.0527, "step": 5952 }, { "epoch": 2.708371246587807, "grad_norm": 0.683983634497564, "learning_rate": 2.1743340202348956e-05, "loss": 0.0614, "step": 5953 }, { "epoch": 2.7088262056414925, "grad_norm": 0.32919647526563783, "learning_rate": 2.173625475034332e-05, "loss": 0.0188, "step": 5954 }, { "epoch": 2.7092811646951773, "grad_norm": 0.40646018711823134, "learning_rate": 2.1729169565036218e-05, "loss": 0.0519, "step": 5955 }, { "epoch": 2.7097361237488626, "grad_norm": 0.4469650135334224, "learning_rate": 2.17220846470066e-05, "loss": 0.0693, "step": 5956 }, { "epoch": 2.710191082802548, "grad_norm": 0.6492738236760879, "learning_rate": 2.1714999996833433e-05, "loss": 0.0647, "step": 5957 }, { "epoch": 2.710646041856233, "grad_norm": 0.6866887185436601, "learning_rate": 2.170791561509562e-05, "loss": 0.0654, "step": 5958 }, { "epoch": 2.711101000909918, "grad_norm": 0.5468194608543133, "learning_rate": 2.170083150237209e-05, "loss": 0.0546, "step": 5959 }, { "epoch": 2.7115559599636034, "grad_norm": 0.6646366031338852, "learning_rate": 2.1693747659241695e-05, "loss": 0.0246, "step": 5960 }, { "epoch": 2.7120109190172883, "grad_norm": 0.5928284865862189, "learning_rate": 2.1686664086283308e-05, "loss": 0.06, "step": 5961 }, { "epoch": 2.7124658780709736, "grad_norm": 0.47794012738376057, "learning_rate": 2.167958078407578e-05, "loss": 0.0521, "step": 5962 }, { "epoch": 2.712920837124659, "grad_norm": 0.5907434321915297, "learning_rate": 2.1672497753197914e-05, "loss": 0.046, "step": 5963 }, { "epoch": 2.713375796178344, "grad_norm": 0.7685644162794872, "learning_rate": 2.1665414994228506e-05, "loss": 0.0877, "step": 5964 }, { "epoch": 2.713830755232029, "grad_norm": 0.39425090492318327, "learning_rate": 2.1658332507746328e-05, "loss": 0.0609, "step": 5965 }, { "epoch": 2.7142857142857144, "grad_norm": 0.6719139329824394, "learning_rate": 2.1651250294330123e-05, "loss": 0.0691, "step": 5966 }, { "epoch": 2.7147406733393993, "grad_norm": 0.3764366386774363, "learning_rate": 2.164416835455862e-05, "loss": 0.0601, "step": 5967 }, { "epoch": 2.7151956323930846, "grad_norm": 0.5167084963604417, "learning_rate": 2.163708668901052e-05, "loss": 0.0364, "step": 5968 }, { "epoch": 2.71565059144677, "grad_norm": 0.36264177585436463, "learning_rate": 2.163000529826451e-05, "loss": 0.0413, "step": 5969 }, { "epoch": 2.7161055505004548, "grad_norm": 0.30545116463681754, "learning_rate": 2.1622924182899257e-05, "loss": 0.0316, "step": 5970 }, { "epoch": 2.71656050955414, "grad_norm": 0.47277934993127213, "learning_rate": 2.161584334349338e-05, "loss": 0.0805, "step": 5971 }, { "epoch": 2.7170154686078254, "grad_norm": 0.43965618632338155, "learning_rate": 2.160876278062551e-05, "loss": 0.0644, "step": 5972 }, { "epoch": 2.7174704276615103, "grad_norm": 0.5933661963837007, "learning_rate": 2.1601682494874227e-05, "loss": 0.0569, "step": 5973 }, { "epoch": 2.7179253867151956, "grad_norm": 0.6393568103319038, "learning_rate": 2.1594602486818106e-05, "loss": 0.0654, "step": 5974 }, { "epoch": 2.718380345768881, "grad_norm": 0.4148567275445335, "learning_rate": 2.158752275703568e-05, "loss": 0.0284, "step": 5975 }, { "epoch": 2.7188353048225657, "grad_norm": 0.549288240531818, "learning_rate": 2.158044330610549e-05, "loss": 0.0427, "step": 5976 }, { "epoch": 2.719290263876251, "grad_norm": 0.4661892769145249, "learning_rate": 2.1573364134606038e-05, "loss": 0.0659, "step": 5977 }, { "epoch": 2.7197452229299364, "grad_norm": 0.3371139422871171, "learning_rate": 2.156628524311579e-05, "loss": 0.033, "step": 5978 }, { "epoch": 2.7202001819836212, "grad_norm": 0.4254970560189018, "learning_rate": 2.1559206632213206e-05, "loss": 0.0335, "step": 5979 }, { "epoch": 2.7206551410373065, "grad_norm": 0.5071181103792407, "learning_rate": 2.1552128302476715e-05, "loss": 0.0469, "step": 5980 }, { "epoch": 2.721110100090992, "grad_norm": 0.46286062760238894, "learning_rate": 2.1545050254484735e-05, "loss": 0.0588, "step": 5981 }, { "epoch": 2.7215650591446767, "grad_norm": 0.4347738157117299, "learning_rate": 2.1537972488815632e-05, "loss": 0.0468, "step": 5982 }, { "epoch": 2.722020018198362, "grad_norm": 0.3254163799165676, "learning_rate": 2.1530895006047792e-05, "loss": 0.0259, "step": 5983 }, { "epoch": 2.7224749772520473, "grad_norm": 0.4117036175844765, "learning_rate": 2.1523817806759548e-05, "loss": 0.0293, "step": 5984 }, { "epoch": 2.722929936305732, "grad_norm": 0.4276566509502617, "learning_rate": 2.1516740891529207e-05, "loss": 0.0387, "step": 5985 }, { "epoch": 2.7233848953594175, "grad_norm": 0.46066037357593004, "learning_rate": 2.150966426093508e-05, "loss": 0.0351, "step": 5986 }, { "epoch": 2.723839854413103, "grad_norm": 0.4350987996759223, "learning_rate": 2.1502587915555423e-05, "loss": 0.0688, "step": 5987 }, { "epoch": 2.724294813466788, "grad_norm": 0.418273097611472, "learning_rate": 2.149551185596849e-05, "loss": 0.0303, "step": 5988 }, { "epoch": 2.724749772520473, "grad_norm": 0.35278926044361036, "learning_rate": 2.1488436082752487e-05, "loss": 0.0345, "step": 5989 }, { "epoch": 2.7252047315741583, "grad_norm": 0.4160150696293616, "learning_rate": 2.148136059648564e-05, "loss": 0.0366, "step": 5990 }, { "epoch": 2.7256596906278436, "grad_norm": 0.44773502912647734, "learning_rate": 2.147428539774612e-05, "loss": 0.0386, "step": 5991 }, { "epoch": 2.7261146496815285, "grad_norm": 0.4780462265958811, "learning_rate": 2.146721048711207e-05, "loss": 0.0426, "step": 5992 }, { "epoch": 2.726569608735214, "grad_norm": 0.4987848223612622, "learning_rate": 2.146013586516163e-05, "loss": 0.0451, "step": 5993 }, { "epoch": 2.727024567788899, "grad_norm": 0.5565531078859572, "learning_rate": 2.1453061532472896e-05, "loss": 0.0812, "step": 5994 }, { "epoch": 2.7274795268425844, "grad_norm": 0.80627046688223, "learning_rate": 2.1445987489623962e-05, "loss": 0.0976, "step": 5995 }, { "epoch": 2.7279344858962693, "grad_norm": 0.7049566297605805, "learning_rate": 2.1438913737192867e-05, "loss": 0.0923, "step": 5996 }, { "epoch": 2.7283894449499546, "grad_norm": 0.4749098591874718, "learning_rate": 2.143184027575767e-05, "loss": 0.0531, "step": 5997 }, { "epoch": 2.72884440400364, "grad_norm": 0.3099785125576241, "learning_rate": 2.142476710589637e-05, "loss": 0.0265, "step": 5998 }, { "epoch": 2.729299363057325, "grad_norm": 0.47380674595603894, "learning_rate": 2.1417694228186956e-05, "loss": 0.0311, "step": 5999 }, { "epoch": 2.72975432211101, "grad_norm": 0.5139405326303114, "learning_rate": 2.14106216432074e-05, "loss": 0.07, "step": 6000 }, { "epoch": 2.7302092811646954, "grad_norm": 0.3822401115705026, "learning_rate": 2.1403549351535627e-05, "loss": 0.0542, "step": 6001 }, { "epoch": 2.7306642402183803, "grad_norm": 0.5671460060766492, "learning_rate": 2.1396477353749563e-05, "loss": 0.0472, "step": 6002 }, { "epoch": 2.7311191992720656, "grad_norm": 0.7156106841299211, "learning_rate": 2.1389405650427082e-05, "loss": 0.0535, "step": 6003 }, { "epoch": 2.731574158325751, "grad_norm": 0.49959669987954036, "learning_rate": 2.138233424214608e-05, "loss": 0.0484, "step": 6004 }, { "epoch": 2.7320291173794358, "grad_norm": 0.5449794946752824, "learning_rate": 2.1375263129484382e-05, "loss": 0.0655, "step": 6005 }, { "epoch": 2.732484076433121, "grad_norm": 0.43168328542678763, "learning_rate": 2.1368192313019817e-05, "loss": 0.0685, "step": 6006 }, { "epoch": 2.7329390354868064, "grad_norm": 0.3307772527177035, "learning_rate": 2.136112179333017e-05, "loss": 0.0349, "step": 6007 }, { "epoch": 2.7333939945404913, "grad_norm": 0.8472878889135727, "learning_rate": 2.1354051570993223e-05, "loss": 0.1274, "step": 6008 }, { "epoch": 2.7338489535941766, "grad_norm": 0.6425313270377055, "learning_rate": 2.134698164658671e-05, "loss": 0.0665, "step": 6009 }, { "epoch": 2.734303912647862, "grad_norm": 0.32864187622699337, "learning_rate": 2.133991202068835e-05, "loss": 0.0289, "step": 6010 }, { "epoch": 2.7347588717015467, "grad_norm": 0.3078160123186234, "learning_rate": 2.133284269387587e-05, "loss": 0.0391, "step": 6011 }, { "epoch": 2.735213830755232, "grad_norm": 0.5138538312796341, "learning_rate": 2.1325773666726915e-05, "loss": 0.0361, "step": 6012 }, { "epoch": 2.7356687898089174, "grad_norm": 0.6828498055984022, "learning_rate": 2.131870493981915e-05, "loss": 0.0616, "step": 6013 }, { "epoch": 2.7361237488626022, "grad_norm": 0.44581559639494844, "learning_rate": 2.1311636513730185e-05, "loss": 0.0436, "step": 6014 }, { "epoch": 2.7365787079162875, "grad_norm": 0.3632434021478805, "learning_rate": 2.1304568389037637e-05, "loss": 0.0298, "step": 6015 }, { "epoch": 2.737033666969973, "grad_norm": 0.46034398275833577, "learning_rate": 2.1297500566319063e-05, "loss": 0.0554, "step": 6016 }, { "epoch": 2.7374886260236577, "grad_norm": 0.40405620812057047, "learning_rate": 2.1290433046152015e-05, "loss": 0.0429, "step": 6017 }, { "epoch": 2.737943585077343, "grad_norm": 0.4623120644559097, "learning_rate": 2.128336582911404e-05, "loss": 0.0257, "step": 6018 }, { "epoch": 2.7383985441310283, "grad_norm": 0.49835978149891114, "learning_rate": 2.127629891578262e-05, "loss": 0.0622, "step": 6019 }, { "epoch": 2.738853503184713, "grad_norm": 0.6098992172264203, "learning_rate": 2.1269232306735242e-05, "loss": 0.0577, "step": 6020 }, { "epoch": 2.7393084622383985, "grad_norm": 0.3757443290398358, "learning_rate": 2.1262166002549344e-05, "loss": 0.0505, "step": 6021 }, { "epoch": 2.739763421292084, "grad_norm": 0.4968467250630504, "learning_rate": 2.125510000380237e-05, "loss": 0.041, "step": 6022 }, { "epoch": 2.7402183803457687, "grad_norm": 0.37748047433997794, "learning_rate": 2.12480343110717e-05, "loss": 0.0321, "step": 6023 }, { "epoch": 2.740673339399454, "grad_norm": 0.5812700953393495, "learning_rate": 2.1240968924934725e-05, "loss": 0.0807, "step": 6024 }, { "epoch": 2.7411282984531393, "grad_norm": 0.38184738908811966, "learning_rate": 2.12339038459688e-05, "loss": 0.0249, "step": 6025 }, { "epoch": 2.741583257506824, "grad_norm": 0.5468522688239589, "learning_rate": 2.1226839074751242e-05, "loss": 0.0714, "step": 6026 }, { "epoch": 2.7420382165605095, "grad_norm": 0.4020299967681138, "learning_rate": 2.1219774611859357e-05, "loss": 0.0338, "step": 6027 }, { "epoch": 2.742493175614195, "grad_norm": 0.5389333727150023, "learning_rate": 2.1212710457870417e-05, "loss": 0.0669, "step": 6028 }, { "epoch": 2.7429481346678797, "grad_norm": 0.4375901080939148, "learning_rate": 2.1205646613361678e-05, "loss": 0.0426, "step": 6029 }, { "epoch": 2.743403093721565, "grad_norm": 0.40338031731255086, "learning_rate": 2.1198583078910363e-05, "loss": 0.0264, "step": 6030 }, { "epoch": 2.7438580527752503, "grad_norm": 0.5399427621638399, "learning_rate": 2.1191519855093662e-05, "loss": 0.0555, "step": 6031 }, { "epoch": 2.744313011828935, "grad_norm": 0.5264826819655847, "learning_rate": 2.118445694248877e-05, "loss": 0.0632, "step": 6032 }, { "epoch": 2.7447679708826205, "grad_norm": 0.5286743227811543, "learning_rate": 2.117739434167282e-05, "loss": 0.0665, "step": 6033 }, { "epoch": 2.745222929936306, "grad_norm": 0.35852661772047173, "learning_rate": 2.117033205322295e-05, "loss": 0.0244, "step": 6034 }, { "epoch": 2.7456778889899907, "grad_norm": 0.41110743918013876, "learning_rate": 2.1163270077716248e-05, "loss": 0.0296, "step": 6035 }, { "epoch": 2.746132848043676, "grad_norm": 0.5093941282916222, "learning_rate": 2.1156208415729782e-05, "loss": 0.0658, "step": 6036 }, { "epoch": 2.7465878070973613, "grad_norm": 2.215036956914982, "learning_rate": 2.1149147067840613e-05, "loss": 0.1152, "step": 6037 }, { "epoch": 2.747042766151046, "grad_norm": 0.40401630002087136, "learning_rate": 2.1142086034625745e-05, "loss": 0.0336, "step": 6038 }, { "epoch": 2.7474977252047315, "grad_norm": 0.41023950448131524, "learning_rate": 2.113502531666219e-05, "loss": 0.0348, "step": 6039 }, { "epoch": 2.7479526842584168, "grad_norm": 0.5416211959209015, "learning_rate": 2.1127964914526914e-05, "loss": 0.0462, "step": 6040 }, { "epoch": 2.7484076433121016, "grad_norm": 0.4908979020341917, "learning_rate": 2.1120904828796857e-05, "loss": 0.0537, "step": 6041 }, { "epoch": 2.748862602365787, "grad_norm": 0.39624374345880564, "learning_rate": 2.111384506004894e-05, "loss": 0.0499, "step": 6042 }, { "epoch": 2.7493175614194723, "grad_norm": 0.40501355421766444, "learning_rate": 2.1106785608860057e-05, "loss": 0.0365, "step": 6043 }, { "epoch": 2.7497725204731576, "grad_norm": 0.4171857668877367, "learning_rate": 2.1099726475807075e-05, "loss": 0.0457, "step": 6044 }, { "epoch": 2.7502274795268424, "grad_norm": 0.5184111351574123, "learning_rate": 2.109266766146682e-05, "loss": 0.0584, "step": 6045 }, { "epoch": 2.7506824385805277, "grad_norm": 0.4589446235627232, "learning_rate": 2.1085609166416128e-05, "loss": 0.0829, "step": 6046 }, { "epoch": 2.751137397634213, "grad_norm": 0.4328885731461058, "learning_rate": 2.1078550991231777e-05, "loss": 0.0575, "step": 6047 }, { "epoch": 2.7515923566878984, "grad_norm": 0.4452079119319962, "learning_rate": 2.1071493136490524e-05, "loss": 0.0397, "step": 6048 }, { "epoch": 2.7520473157415832, "grad_norm": 0.44489395131203674, "learning_rate": 2.106443560276912e-05, "loss": 0.0463, "step": 6049 }, { "epoch": 2.7525022747952685, "grad_norm": 0.6524074099905048, "learning_rate": 2.1057378390644263e-05, "loss": 0.0787, "step": 6050 }, { "epoch": 2.752957233848954, "grad_norm": 0.3958983689985069, "learning_rate": 2.105032150069264e-05, "loss": 0.0444, "step": 6051 }, { "epoch": 2.7534121929026387, "grad_norm": 0.4208252801565677, "learning_rate": 2.1043264933490898e-05, "loss": 0.0622, "step": 6052 }, { "epoch": 2.753867151956324, "grad_norm": 0.5788246075141971, "learning_rate": 2.103620868961568e-05, "loss": 0.0608, "step": 6053 }, { "epoch": 2.7543221110100093, "grad_norm": 0.3192679254292854, "learning_rate": 2.1029152769643596e-05, "loss": 0.0172, "step": 6054 }, { "epoch": 2.754777070063694, "grad_norm": 0.44699982047012915, "learning_rate": 2.102209717415121e-05, "loss": 0.043, "step": 6055 }, { "epoch": 2.7552320291173795, "grad_norm": 0.49511552248186996, "learning_rate": 2.101504190371508e-05, "loss": 0.0711, "step": 6056 }, { "epoch": 2.755686988171065, "grad_norm": 0.5009387632256341, "learning_rate": 2.100798695891173e-05, "loss": 0.056, "step": 6057 }, { "epoch": 2.7561419472247497, "grad_norm": 0.5661982865019688, "learning_rate": 2.100093234031766e-05, "loss": 0.0469, "step": 6058 }, { "epoch": 2.756596906278435, "grad_norm": 0.4025785425259507, "learning_rate": 2.099387804850933e-05, "loss": 0.0433, "step": 6059 }, { "epoch": 2.7570518653321203, "grad_norm": 0.5688990597749115, "learning_rate": 2.09868240840632e-05, "loss": 0.0477, "step": 6060 }, { "epoch": 2.757506824385805, "grad_norm": 0.5115444495783659, "learning_rate": 2.097977044755569e-05, "loss": 0.0549, "step": 6061 }, { "epoch": 2.7579617834394905, "grad_norm": 0.5043628760983173, "learning_rate": 2.0972717139563176e-05, "loss": 0.042, "step": 6062 }, { "epoch": 2.758416742493176, "grad_norm": 0.3900313763076612, "learning_rate": 2.0965664160662034e-05, "loss": 0.0427, "step": 6063 }, { "epoch": 2.7588717015468607, "grad_norm": 0.4625572522639061, "learning_rate": 2.0958611511428593e-05, "loss": 0.0542, "step": 6064 }, { "epoch": 2.759326660600546, "grad_norm": 0.3997974866758315, "learning_rate": 2.0951559192439176e-05, "loss": 0.0358, "step": 6065 }, { "epoch": 2.7597816196542313, "grad_norm": 0.6152806590926112, "learning_rate": 2.0944507204270047e-05, "loss": 0.0489, "step": 6066 }, { "epoch": 2.760236578707916, "grad_norm": 0.4811812922932751, "learning_rate": 2.093745554749748e-05, "loss": 0.0669, "step": 6067 }, { "epoch": 2.7606915377616015, "grad_norm": 0.48046235267500614, "learning_rate": 2.0930404222697708e-05, "loss": 0.0308, "step": 6068 }, { "epoch": 2.761146496815287, "grad_norm": 0.4572721185691301, "learning_rate": 2.0923353230446916e-05, "loss": 0.0696, "step": 6069 }, { "epoch": 2.7616014558689717, "grad_norm": 0.45800869153469265, "learning_rate": 2.0916302571321294e-05, "loss": 0.0286, "step": 6070 }, { "epoch": 2.762056414922657, "grad_norm": 0.5276177490659598, "learning_rate": 2.0909252245896983e-05, "loss": 0.038, "step": 6071 }, { "epoch": 2.7625113739763423, "grad_norm": 0.4814122542311682, "learning_rate": 2.0902202254750102e-05, "loss": 0.0481, "step": 6072 }, { "epoch": 2.762966333030027, "grad_norm": 0.48364453947818115, "learning_rate": 2.089515259845674e-05, "loss": 0.0505, "step": 6073 }, { "epoch": 2.7634212920837125, "grad_norm": 0.40114375451692535, "learning_rate": 2.0888103277592984e-05, "loss": 0.0255, "step": 6074 }, { "epoch": 2.7638762511373978, "grad_norm": 0.6508596518066387, "learning_rate": 2.088105429273485e-05, "loss": 0.0327, "step": 6075 }, { "epoch": 2.7643312101910826, "grad_norm": 0.5057950385540041, "learning_rate": 2.0874005644458368e-05, "loss": 0.0395, "step": 6076 }, { "epoch": 2.764786169244768, "grad_norm": 0.45511616392252163, "learning_rate": 2.08669573333395e-05, "loss": 0.06, "step": 6077 }, { "epoch": 2.7652411282984533, "grad_norm": 0.4619280388571203, "learning_rate": 2.085990935995422e-05, "loss": 0.0674, "step": 6078 }, { "epoch": 2.765696087352138, "grad_norm": 0.6793862296413639, "learning_rate": 2.085286172487845e-05, "loss": 0.0698, "step": 6079 }, { "epoch": 2.7661510464058234, "grad_norm": 0.5685544659658354, "learning_rate": 2.084581442868809e-05, "loss": 0.067, "step": 6080 }, { "epoch": 2.7666060054595087, "grad_norm": 0.6194910380290523, "learning_rate": 2.0838767471959014e-05, "loss": 0.0562, "step": 6081 }, { "epoch": 2.7670609645131936, "grad_norm": 0.47733417410246254, "learning_rate": 2.083172085526707e-05, "loss": 0.0481, "step": 6082 }, { "epoch": 2.767515923566879, "grad_norm": 0.415676885379499, "learning_rate": 2.0824674579188078e-05, "loss": 0.0482, "step": 6083 }, { "epoch": 2.7679708826205642, "grad_norm": 0.3803060176376276, "learning_rate": 2.081762864429782e-05, "loss": 0.0233, "step": 6084 }, { "epoch": 2.768425841674249, "grad_norm": 0.3932822938031075, "learning_rate": 2.0810583051172063e-05, "loss": 0.0483, "step": 6085 }, { "epoch": 2.7688808007279344, "grad_norm": 1.0304257581027765, "learning_rate": 2.0803537800386537e-05, "loss": 0.0421, "step": 6086 }, { "epoch": 2.7693357597816197, "grad_norm": 0.5530159326227698, "learning_rate": 2.0796492892516948e-05, "loss": 0.0486, "step": 6087 }, { "epoch": 2.7697907188353046, "grad_norm": 0.3625978325244785, "learning_rate": 2.0789448328138984e-05, "loss": 0.0208, "step": 6088 }, { "epoch": 2.77024567788899, "grad_norm": 0.4414989967942009, "learning_rate": 2.0782404107828284e-05, "loss": 0.0374, "step": 6089 }, { "epoch": 2.770700636942675, "grad_norm": 0.4618251928222087, "learning_rate": 2.077536023216048e-05, "loss": 0.0839, "step": 6090 }, { "epoch": 2.77115559599636, "grad_norm": 0.5304997670014294, "learning_rate": 2.0768316701711153e-05, "loss": 0.0418, "step": 6091 }, { "epoch": 2.7716105550500454, "grad_norm": 0.6825899365383119, "learning_rate": 2.076127351705588e-05, "loss": 0.0523, "step": 6092 }, { "epoch": 2.7720655141037307, "grad_norm": 0.5791807856726406, "learning_rate": 2.075423067877019e-05, "loss": 0.0778, "step": 6093 }, { "epoch": 2.7725204731574156, "grad_norm": 0.5556595879345221, "learning_rate": 2.0747188187429588e-05, "loss": 0.0495, "step": 6094 }, { "epoch": 2.772975432211101, "grad_norm": 0.5933213656676103, "learning_rate": 2.074014604360957e-05, "loss": 0.0457, "step": 6095 }, { "epoch": 2.773430391264786, "grad_norm": 0.4703839645360761, "learning_rate": 2.073310424788558e-05, "loss": 0.047, "step": 6096 }, { "epoch": 2.7738853503184715, "grad_norm": 0.6674160488748527, "learning_rate": 2.0726062800833044e-05, "loss": 0.0481, "step": 6097 }, { "epoch": 2.7743403093721564, "grad_norm": 0.5982632079708214, "learning_rate": 2.071902170302735e-05, "loss": 0.0527, "step": 6098 }, { "epoch": 2.7747952684258417, "grad_norm": 0.5068512493590205, "learning_rate": 2.0711980955043874e-05, "loss": 0.0431, "step": 6099 }, { "epoch": 2.775250227479527, "grad_norm": 0.4151897112321803, "learning_rate": 2.0704940557457948e-05, "loss": 0.0701, "step": 6100 }, { "epoch": 2.775705186533212, "grad_norm": 0.6646159017836933, "learning_rate": 2.0697900510844873e-05, "loss": 0.05, "step": 6101 }, { "epoch": 2.776160145586897, "grad_norm": 0.705776180782828, "learning_rate": 2.069086081577995e-05, "loss": 0.1312, "step": 6102 }, { "epoch": 2.7766151046405825, "grad_norm": 0.4268859418356984, "learning_rate": 2.068382147283842e-05, "loss": 0.0439, "step": 6103 }, { "epoch": 2.777070063694268, "grad_norm": 0.32008696149237786, "learning_rate": 2.0676782482595513e-05, "loss": 0.0404, "step": 6104 }, { "epoch": 2.7775250227479527, "grad_norm": 0.522723658470643, "learning_rate": 2.0669743845626416e-05, "loss": 0.0442, "step": 6105 }, { "epoch": 2.777979981801638, "grad_norm": 0.48753518958464537, "learning_rate": 2.0662705562506294e-05, "loss": 0.0551, "step": 6106 }, { "epoch": 2.7784349408553233, "grad_norm": 0.42696917114100547, "learning_rate": 2.0655667633810295e-05, "loss": 0.0436, "step": 6107 }, { "epoch": 2.778889899909008, "grad_norm": 0.4511285645726788, "learning_rate": 2.0648630060113494e-05, "loss": 0.0282, "step": 6108 }, { "epoch": 2.7793448589626935, "grad_norm": 0.5407630049112786, "learning_rate": 2.0641592841991015e-05, "loss": 0.04, "step": 6109 }, { "epoch": 2.7797998180163788, "grad_norm": 0.5416880056784062, "learning_rate": 2.0634555980017883e-05, "loss": 0.0341, "step": 6110 }, { "epoch": 2.7802547770700636, "grad_norm": 0.8174592474747402, "learning_rate": 2.062751947476912e-05, "loss": 0.1116, "step": 6111 }, { "epoch": 2.780709736123749, "grad_norm": 0.407728698306958, "learning_rate": 2.062048332681972e-05, "loss": 0.0604, "step": 6112 }, { "epoch": 2.7811646951774343, "grad_norm": 0.43748356026769475, "learning_rate": 2.0613447536744646e-05, "loss": 0.0561, "step": 6113 }, { "epoch": 2.781619654231119, "grad_norm": 0.5473155649560474, "learning_rate": 2.0606412105118834e-05, "loss": 0.0461, "step": 6114 }, { "epoch": 2.7820746132848044, "grad_norm": 0.4322149942881394, "learning_rate": 2.059937703251717e-05, "loss": 0.0473, "step": 6115 }, { "epoch": 2.7825295723384897, "grad_norm": 0.39879107380263323, "learning_rate": 2.0592342319514547e-05, "loss": 0.0609, "step": 6116 }, { "epoch": 2.7829845313921746, "grad_norm": 0.5343208291266379, "learning_rate": 2.0585307966685814e-05, "loss": 0.0469, "step": 6117 }, { "epoch": 2.78343949044586, "grad_norm": 0.39349979474796754, "learning_rate": 2.057827397460577e-05, "loss": 0.0316, "step": 6118 }, { "epoch": 2.7838944494995452, "grad_norm": 0.40496226595022927, "learning_rate": 2.0571240343849213e-05, "loss": 0.0629, "step": 6119 }, { "epoch": 2.78434940855323, "grad_norm": 0.39779067955745767, "learning_rate": 2.0564207074990888e-05, "loss": 0.0412, "step": 6120 }, { "epoch": 2.7848043676069154, "grad_norm": 0.8017879718389951, "learning_rate": 2.0557174168605536e-05, "loss": 0.0968, "step": 6121 }, { "epoch": 2.7852593266606007, "grad_norm": 0.39997150639358636, "learning_rate": 2.0550141625267837e-05, "loss": 0.0534, "step": 6122 }, { "epoch": 2.7857142857142856, "grad_norm": 0.446848633098185, "learning_rate": 2.0543109445552474e-05, "loss": 0.0431, "step": 6123 }, { "epoch": 2.786169244767971, "grad_norm": 0.6041656340459113, "learning_rate": 2.0536077630034086e-05, "loss": 0.0635, "step": 6124 }, { "epoch": 2.786624203821656, "grad_norm": 1.0204466747264038, "learning_rate": 2.052904617928727e-05, "loss": 0.0655, "step": 6125 }, { "epoch": 2.787079162875341, "grad_norm": 0.6187179688985142, "learning_rate": 2.0522015093886615e-05, "loss": 0.0456, "step": 6126 }, { "epoch": 2.7875341219290264, "grad_norm": 0.5182459021695779, "learning_rate": 2.0514984374406658e-05, "loss": 0.0348, "step": 6127 }, { "epoch": 2.7879890809827117, "grad_norm": 0.6171450155196234, "learning_rate": 2.050795402142193e-05, "loss": 0.0837, "step": 6128 }, { "epoch": 2.7884440400363966, "grad_norm": 0.4685928108989055, "learning_rate": 2.05009240355069e-05, "loss": 0.0331, "step": 6129 }, { "epoch": 2.788898999090082, "grad_norm": 0.4316027920299954, "learning_rate": 2.0493894417236047e-05, "loss": 0.0507, "step": 6130 }, { "epoch": 2.789353958143767, "grad_norm": 0.7055194387282908, "learning_rate": 2.04868651671838e-05, "loss": 0.0504, "step": 6131 }, { "epoch": 2.789808917197452, "grad_norm": 0.367040641060973, "learning_rate": 2.0479836285924542e-05, "loss": 0.0551, "step": 6132 }, { "epoch": 2.7902638762511374, "grad_norm": 0.4950933092820721, "learning_rate": 2.047280777403266e-05, "loss": 0.0466, "step": 6133 }, { "epoch": 2.7907188353048227, "grad_norm": 0.41914950569166887, "learning_rate": 2.0465779632082473e-05, "loss": 0.0239, "step": 6134 }, { "epoch": 2.7911737943585075, "grad_norm": 0.42753137775151046, "learning_rate": 2.0458751860648305e-05, "loss": 0.0376, "step": 6135 }, { "epoch": 2.791628753412193, "grad_norm": 0.47029017741838725, "learning_rate": 2.0451724460304414e-05, "loss": 0.043, "step": 6136 }, { "epoch": 2.792083712465878, "grad_norm": 0.3705878489650533, "learning_rate": 2.0444697431625065e-05, "loss": 0.0345, "step": 6137 }, { "epoch": 2.792538671519563, "grad_norm": 0.4559969603656559, "learning_rate": 2.043767077518448e-05, "loss": 0.03, "step": 6138 }, { "epoch": 2.7929936305732483, "grad_norm": 0.42379405953942345, "learning_rate": 2.043064449155683e-05, "loss": 0.0364, "step": 6139 }, { "epoch": 2.7934485896269337, "grad_norm": 0.535218478662332, "learning_rate": 2.0423618581316277e-05, "loss": 0.0599, "step": 6140 }, { "epoch": 2.7939035486806185, "grad_norm": 0.4558607265920353, "learning_rate": 2.0416593045036946e-05, "loss": 0.0371, "step": 6141 }, { "epoch": 2.794358507734304, "grad_norm": 0.4537731218808329, "learning_rate": 2.040956788329294e-05, "loss": 0.0504, "step": 6142 }, { "epoch": 2.794813466787989, "grad_norm": 0.35312290521004025, "learning_rate": 2.0402543096658298e-05, "loss": 0.0414, "step": 6143 }, { "epoch": 2.795268425841674, "grad_norm": 0.7956464067660821, "learning_rate": 2.0395518685707087e-05, "loss": 0.0776, "step": 6144 }, { "epoch": 2.7957233848953593, "grad_norm": 0.4747206654123222, "learning_rate": 2.0388494651013294e-05, "loss": 0.0303, "step": 6145 }, { "epoch": 2.7961783439490446, "grad_norm": 0.42712265134765987, "learning_rate": 2.0381470993150894e-05, "loss": 0.0477, "step": 6146 }, { "epoch": 2.7966333030027295, "grad_norm": 0.4991610612698166, "learning_rate": 2.037444771269382e-05, "loss": 0.0663, "step": 6147 }, { "epoch": 2.797088262056415, "grad_norm": 0.4619681594122051, "learning_rate": 2.0367424810216003e-05, "loss": 0.0516, "step": 6148 }, { "epoch": 2.7975432211101, "grad_norm": 0.4676784917916548, "learning_rate": 2.03604022862913e-05, "loss": 0.0511, "step": 6149 }, { "epoch": 2.797998180163785, "grad_norm": 0.5745919724940983, "learning_rate": 2.0353380141493563e-05, "loss": 0.0535, "step": 6150 }, { "epoch": 2.7984531392174703, "grad_norm": 0.39825508945504645, "learning_rate": 2.0346358376396628e-05, "loss": 0.035, "step": 6151 }, { "epoch": 2.7989080982711556, "grad_norm": 0.4484560250702204, "learning_rate": 2.0339336991574267e-05, "loss": 0.0922, "step": 6152 }, { "epoch": 2.799363057324841, "grad_norm": 0.4322954612782686, "learning_rate": 2.0332315987600246e-05, "loss": 0.0375, "step": 6153 }, { "epoch": 2.799818016378526, "grad_norm": 0.44016382979627733, "learning_rate": 2.0325295365048276e-05, "loss": 0.0295, "step": 6154 }, { "epoch": 2.800272975432211, "grad_norm": 0.5852251388963365, "learning_rate": 2.0318275124492064e-05, "loss": 0.0474, "step": 6155 }, { "epoch": 2.8007279344858964, "grad_norm": 0.9092554580467163, "learning_rate": 2.0311255266505263e-05, "loss": 0.0555, "step": 6156 }, { "epoch": 2.8011828935395813, "grad_norm": 0.4486654087999966, "learning_rate": 2.03042357916615e-05, "loss": 0.0466, "step": 6157 }, { "epoch": 2.8016378525932666, "grad_norm": 0.5236642332203466, "learning_rate": 2.0297216700534394e-05, "loss": 0.0566, "step": 6158 }, { "epoch": 2.802092811646952, "grad_norm": 0.49738376738838713, "learning_rate": 2.0290197993697494e-05, "loss": 0.0523, "step": 6159 }, { "epoch": 2.802547770700637, "grad_norm": 0.38037674782811237, "learning_rate": 2.028317967172435e-05, "loss": 0.0533, "step": 6160 }, { "epoch": 2.803002729754322, "grad_norm": 0.2889189178155449, "learning_rate": 2.0276161735188458e-05, "loss": 0.0311, "step": 6161 }, { "epoch": 2.8034576888080074, "grad_norm": 0.7056229797640136, "learning_rate": 2.0269144184663302e-05, "loss": 0.081, "step": 6162 }, { "epoch": 2.8039126478616927, "grad_norm": 0.35452386835438143, "learning_rate": 2.0262127020722316e-05, "loss": 0.034, "step": 6163 }, { "epoch": 2.8043676069153776, "grad_norm": 0.498460376179076, "learning_rate": 2.0255110243938905e-05, "loss": 0.0542, "step": 6164 }, { "epoch": 2.804822565969063, "grad_norm": 0.5208878716165033, "learning_rate": 2.024809385488647e-05, "loss": 0.0503, "step": 6165 }, { "epoch": 2.805277525022748, "grad_norm": 0.4827800278948875, "learning_rate": 2.0241077854138336e-05, "loss": 0.0347, "step": 6166 }, { "epoch": 2.805732484076433, "grad_norm": 0.5078594094976916, "learning_rate": 2.023406224226784e-05, "loss": 0.0378, "step": 6167 }, { "epoch": 2.8061874431301184, "grad_norm": 0.5839711241314136, "learning_rate": 2.022704701984825e-05, "loss": 0.0479, "step": 6168 }, { "epoch": 2.8066424021838037, "grad_norm": 0.45195535276892107, "learning_rate": 2.0220032187452823e-05, "loss": 0.047, "step": 6169 }, { "epoch": 2.8070973612374885, "grad_norm": 0.4644804405848854, "learning_rate": 2.0213017745654773e-05, "loss": 0.0621, "step": 6170 }, { "epoch": 2.807552320291174, "grad_norm": 0.33268268299220677, "learning_rate": 2.0206003695027295e-05, "loss": 0.0262, "step": 6171 }, { "epoch": 2.808007279344859, "grad_norm": 0.7701559942153321, "learning_rate": 2.0198990036143553e-05, "loss": 0.0512, "step": 6172 }, { "epoch": 2.808462238398544, "grad_norm": 0.39396430396132975, "learning_rate": 2.019197676957666e-05, "loss": 0.0406, "step": 6173 }, { "epoch": 2.8089171974522293, "grad_norm": 0.4006631889873903, "learning_rate": 2.018496389589972e-05, "loss": 0.0388, "step": 6174 }, { "epoch": 2.8093721565059147, "grad_norm": 0.5967414236498557, "learning_rate": 2.0177951415685778e-05, "loss": 0.0508, "step": 6175 }, { "epoch": 2.8098271155595995, "grad_norm": 0.8078092501970923, "learning_rate": 2.0170939329507875e-05, "loss": 0.0356, "step": 6176 }, { "epoch": 2.810282074613285, "grad_norm": 0.40637579456982487, "learning_rate": 2.0163927637939004e-05, "loss": 0.0572, "step": 6177 }, { "epoch": 2.81073703366697, "grad_norm": 0.3973202191229182, "learning_rate": 2.0156916341552108e-05, "loss": 0.036, "step": 6178 }, { "epoch": 2.811191992720655, "grad_norm": 0.5442158916996105, "learning_rate": 2.0149905440920157e-05, "loss": 0.0439, "step": 6179 }, { "epoch": 2.8116469517743403, "grad_norm": 0.5435322980345697, "learning_rate": 2.0142894936616026e-05, "loss": 0.0486, "step": 6180 }, { "epoch": 2.8121019108280256, "grad_norm": 0.3621432748437104, "learning_rate": 2.0135884829212588e-05, "loss": 0.0337, "step": 6181 }, { "epoch": 2.8125568698817105, "grad_norm": 0.44269033323009116, "learning_rate": 2.0128875119282674e-05, "loss": 0.0411, "step": 6182 }, { "epoch": 2.813011828935396, "grad_norm": 0.38284995988203113, "learning_rate": 2.0121865807399085e-05, "loss": 0.036, "step": 6183 }, { "epoch": 2.813466787989081, "grad_norm": 0.43661050337310897, "learning_rate": 2.01148568941346e-05, "loss": 0.0593, "step": 6184 }, { "epoch": 2.813921747042766, "grad_norm": 0.4832196745867915, "learning_rate": 2.0107848380061934e-05, "loss": 0.0455, "step": 6185 }, { "epoch": 2.8143767060964513, "grad_norm": 0.4198972445360846, "learning_rate": 2.010084026575381e-05, "loss": 0.0429, "step": 6186 }, { "epoch": 2.8148316651501366, "grad_norm": 0.5469217380658631, "learning_rate": 2.0093832551782908e-05, "loss": 0.0812, "step": 6187 }, { "epoch": 2.8152866242038215, "grad_norm": 0.6016321601726805, "learning_rate": 2.008682523872184e-05, "loss": 0.0917, "step": 6188 }, { "epoch": 2.815741583257507, "grad_norm": 0.4273303419048748, "learning_rate": 2.0079818327143236e-05, "loss": 0.0444, "step": 6189 }, { "epoch": 2.816196542311192, "grad_norm": 0.28361113102699653, "learning_rate": 2.0072811817619652e-05, "loss": 0.0259, "step": 6190 }, { "epoch": 2.816651501364877, "grad_norm": 0.46235035831915494, "learning_rate": 2.0065805710723642e-05, "loss": 0.0391, "step": 6191 }, { "epoch": 2.8171064604185623, "grad_norm": 0.5701637089068297, "learning_rate": 2.0058800007027695e-05, "loss": 0.0684, "step": 6192 }, { "epoch": 2.8175614194722476, "grad_norm": 0.5258208157429665, "learning_rate": 2.00517947071043e-05, "loss": 0.0563, "step": 6193 }, { "epoch": 2.8180163785259325, "grad_norm": 0.4833646269276167, "learning_rate": 2.0044789811525904e-05, "loss": 0.0333, "step": 6194 }, { "epoch": 2.8184713375796178, "grad_norm": 0.44867874748270803, "learning_rate": 2.0037785320864902e-05, "loss": 0.0725, "step": 6195 }, { "epoch": 2.818926296633303, "grad_norm": 0.3751335828278457, "learning_rate": 2.003078123569368e-05, "loss": 0.0311, "step": 6196 }, { "epoch": 2.819381255686988, "grad_norm": 0.6462980197692135, "learning_rate": 2.0023777556584567e-05, "loss": 0.0516, "step": 6197 }, { "epoch": 2.8198362147406733, "grad_norm": 0.41767752379170847, "learning_rate": 2.0016774284109887e-05, "loss": 0.0395, "step": 6198 }, { "epoch": 2.8202911737943586, "grad_norm": 0.4686074270171081, "learning_rate": 2.0009771418841895e-05, "loss": 0.0721, "step": 6199 }, { "epoch": 2.8207461328480434, "grad_norm": 0.38448438012514885, "learning_rate": 2.0002768961352858e-05, "loss": 0.0593, "step": 6200 }, { "epoch": 2.8212010919017287, "grad_norm": 0.468088802100771, "learning_rate": 1.9995766912214975e-05, "loss": 0.0536, "step": 6201 }, { "epoch": 2.821656050955414, "grad_norm": 0.4667349335296379, "learning_rate": 1.9988765272000414e-05, "loss": 0.0526, "step": 6202 }, { "epoch": 2.822111010009099, "grad_norm": 0.39522706091550913, "learning_rate": 1.9981764041281334e-05, "loss": 0.0395, "step": 6203 }, { "epoch": 2.8225659690627842, "grad_norm": 0.5279340412693907, "learning_rate": 1.997476322062983e-05, "loss": 0.0521, "step": 6204 }, { "epoch": 2.8230209281164695, "grad_norm": 0.5849339419047165, "learning_rate": 1.9967762810617983e-05, "loss": 0.0565, "step": 6205 }, { "epoch": 2.823475887170155, "grad_norm": 0.3251012802742008, "learning_rate": 1.9960762811817823e-05, "loss": 0.0263, "step": 6206 }, { "epoch": 2.8239308462238397, "grad_norm": 0.40486253960369484, "learning_rate": 1.9953763224801376e-05, "loss": 0.0406, "step": 6207 }, { "epoch": 2.824385805277525, "grad_norm": 0.35923082691786234, "learning_rate": 1.9946764050140615e-05, "loss": 0.021, "step": 6208 }, { "epoch": 2.8248407643312103, "grad_norm": 0.5496375796832497, "learning_rate": 1.993976528840747e-05, "loss": 0.0567, "step": 6209 }, { "epoch": 2.825295723384895, "grad_norm": 0.5108433887303772, "learning_rate": 1.993276694017386e-05, "loss": 0.0362, "step": 6210 }, { "epoch": 2.8257506824385805, "grad_norm": 0.5608099673023839, "learning_rate": 1.9925769006011647e-05, "loss": 0.0528, "step": 6211 }, { "epoch": 2.826205641492266, "grad_norm": 0.4656083383531987, "learning_rate": 1.991877148649268e-05, "loss": 0.0345, "step": 6212 }, { "epoch": 2.826660600545951, "grad_norm": 0.42531789811072424, "learning_rate": 1.991177438218875e-05, "loss": 0.0425, "step": 6213 }, { "epoch": 2.827115559599636, "grad_norm": 0.5483375089760125, "learning_rate": 1.9904777693671645e-05, "loss": 0.0581, "step": 6214 }, { "epoch": 2.8275705186533213, "grad_norm": 0.5581817909981841, "learning_rate": 1.98977814215131e-05, "loss": 0.0593, "step": 6215 }, { "epoch": 2.8280254777070066, "grad_norm": 0.4939511805731096, "learning_rate": 1.989078556628482e-05, "loss": 0.0328, "step": 6216 }, { "epoch": 2.8284804367606915, "grad_norm": 0.3444091683329319, "learning_rate": 1.9883790128558463e-05, "loss": 0.032, "step": 6217 }, { "epoch": 2.828935395814377, "grad_norm": 0.5528160772206563, "learning_rate": 1.9876795108905678e-05, "loss": 0.0468, "step": 6218 }, { "epoch": 2.829390354868062, "grad_norm": 0.5862090616388171, "learning_rate": 1.9869800507898052e-05, "loss": 0.0523, "step": 6219 }, { "epoch": 2.829845313921747, "grad_norm": 0.41057861598093265, "learning_rate": 1.986280632610716e-05, "loss": 0.0446, "step": 6220 }, { "epoch": 2.8303002729754323, "grad_norm": 0.5518044621657628, "learning_rate": 1.9855812564104546e-05, "loss": 0.0492, "step": 6221 }, { "epoch": 2.8307552320291176, "grad_norm": 0.3601775018009938, "learning_rate": 1.984881922246169e-05, "loss": 0.042, "step": 6222 }, { "epoch": 2.8312101910828025, "grad_norm": 0.5041625705981974, "learning_rate": 1.984182630175007e-05, "loss": 0.0598, "step": 6223 }, { "epoch": 2.831665150136488, "grad_norm": 0.4073167410594689, "learning_rate": 1.9834833802541107e-05, "loss": 0.0311, "step": 6224 }, { "epoch": 2.832120109190173, "grad_norm": 0.42351810051705596, "learning_rate": 1.9827841725406208e-05, "loss": 0.0373, "step": 6225 }, { "epoch": 2.832575068243858, "grad_norm": 0.45416613107452547, "learning_rate": 1.9820850070916717e-05, "loss": 0.0544, "step": 6226 }, { "epoch": 2.8330300272975433, "grad_norm": 0.4354723657455048, "learning_rate": 1.9813858839643966e-05, "loss": 0.0281, "step": 6227 }, { "epoch": 2.8334849863512286, "grad_norm": 0.4078556779876248, "learning_rate": 1.980686803215926e-05, "loss": 0.0456, "step": 6228 }, { "epoch": 2.8339399454049135, "grad_norm": 0.4920584215354128, "learning_rate": 1.979987764903384e-05, "loss": 0.0381, "step": 6229 }, { "epoch": 2.8343949044585988, "grad_norm": 0.8161358619164789, "learning_rate": 1.979288769083894e-05, "loss": 0.073, "step": 6230 }, { "epoch": 2.834849863512284, "grad_norm": 0.7200648374933227, "learning_rate": 1.9785898158145738e-05, "loss": 0.0489, "step": 6231 }, { "epoch": 2.835304822565969, "grad_norm": 0.6972420060073984, "learning_rate": 1.9778909051525396e-05, "loss": 0.0588, "step": 6232 }, { "epoch": 2.8357597816196543, "grad_norm": 0.4743092465146222, "learning_rate": 1.9771920371549025e-05, "loss": 0.0258, "step": 6233 }, { "epoch": 2.8362147406733396, "grad_norm": 0.3946888889505506, "learning_rate": 1.9764932118787706e-05, "loss": 0.0372, "step": 6234 }, { "epoch": 2.8366696997270244, "grad_norm": 0.35113878439889673, "learning_rate": 1.9757944293812503e-05, "loss": 0.022, "step": 6235 }, { "epoch": 2.8371246587807097, "grad_norm": 0.5520496761648999, "learning_rate": 1.9750956897194415e-05, "loss": 0.0846, "step": 6236 }, { "epoch": 2.837579617834395, "grad_norm": 0.30259550269103414, "learning_rate": 1.9743969929504427e-05, "loss": 0.0342, "step": 6237 }, { "epoch": 2.83803457688808, "grad_norm": 0.4285286408606785, "learning_rate": 1.9736983391313478e-05, "loss": 0.06, "step": 6238 }, { "epoch": 2.8384895359417652, "grad_norm": 0.4234543432987722, "learning_rate": 1.9729997283192484e-05, "loss": 0.0373, "step": 6239 }, { "epoch": 2.8389444949954505, "grad_norm": 0.43611486237761643, "learning_rate": 1.9723011605712306e-05, "loss": 0.0749, "step": 6240 }, { "epoch": 2.8393994540491354, "grad_norm": 0.6677774083407487, "learning_rate": 1.9716026359443784e-05, "loss": 0.0892, "step": 6241 }, { "epoch": 2.8398544131028207, "grad_norm": 0.48419249468771874, "learning_rate": 1.9709041544957738e-05, "loss": 0.046, "step": 6242 }, { "epoch": 2.840309372156506, "grad_norm": 0.5621410941693857, "learning_rate": 1.9702057162824918e-05, "loss": 0.0446, "step": 6243 }, { "epoch": 2.840764331210191, "grad_norm": 0.4900364054567105, "learning_rate": 1.9695073213616065e-05, "loss": 0.0405, "step": 6244 }, { "epoch": 2.841219290263876, "grad_norm": 0.39835021568249407, "learning_rate": 1.9688089697901865e-05, "loss": 0.0252, "step": 6245 }, { "epoch": 2.8416742493175615, "grad_norm": 0.46977886711341865, "learning_rate": 1.9681106616252997e-05, "loss": 0.0463, "step": 6246 }, { "epoch": 2.8421292083712464, "grad_norm": 0.4081636214960576, "learning_rate": 1.9674123969240067e-05, "loss": 0.0488, "step": 6247 }, { "epoch": 2.8425841674249317, "grad_norm": 0.4645620096212963, "learning_rate": 1.966714175743367e-05, "loss": 0.041, "step": 6248 }, { "epoch": 2.843039126478617, "grad_norm": 0.7611423056814135, "learning_rate": 1.9660159981404373e-05, "loss": 0.0605, "step": 6249 }, { "epoch": 2.843494085532302, "grad_norm": 0.6375840177792031, "learning_rate": 1.9653178641722688e-05, "loss": 0.0607, "step": 6250 }, { "epoch": 2.843949044585987, "grad_norm": 0.4536915603900502, "learning_rate": 1.9646197738959102e-05, "loss": 0.032, "step": 6251 }, { "epoch": 2.8444040036396725, "grad_norm": 0.47083595446196214, "learning_rate": 1.963921727368406e-05, "loss": 0.0536, "step": 6252 }, { "epoch": 2.8448589626933574, "grad_norm": 0.6073874502120837, "learning_rate": 1.9632237246467966e-05, "loss": 0.0989, "step": 6253 }, { "epoch": 2.8453139217470427, "grad_norm": 0.3880353849256843, "learning_rate": 1.962525765788121e-05, "loss": 0.054, "step": 6254 }, { "epoch": 2.845768880800728, "grad_norm": 0.5734980958799302, "learning_rate": 1.9618278508494114e-05, "loss": 0.061, "step": 6255 }, { "epoch": 2.846223839854413, "grad_norm": 0.2911807448643926, "learning_rate": 1.9611299798877004e-05, "loss": 0.0184, "step": 6256 }, { "epoch": 2.846678798908098, "grad_norm": 0.7230568970689332, "learning_rate": 1.9604321529600142e-05, "loss": 0.1062, "step": 6257 }, { "epoch": 2.8471337579617835, "grad_norm": 0.44097063425813626, "learning_rate": 1.9597343701233753e-05, "loss": 0.0421, "step": 6258 }, { "epoch": 2.8475887170154683, "grad_norm": 0.37819147796036306, "learning_rate": 1.9590366314348043e-05, "loss": 0.0188, "step": 6259 }, { "epoch": 2.8480436760691537, "grad_norm": 0.5120136501492323, "learning_rate": 1.9583389369513162e-05, "loss": 0.0579, "step": 6260 }, { "epoch": 2.848498635122839, "grad_norm": 0.36093955345471324, "learning_rate": 1.9576412867299248e-05, "loss": 0.0399, "step": 6261 }, { "epoch": 2.8489535941765243, "grad_norm": 0.7045553713491062, "learning_rate": 1.956943680827637e-05, "loss": 0.0343, "step": 6262 }, { "epoch": 2.849408553230209, "grad_norm": 0.4900186684460623, "learning_rate": 1.9562461193014598e-05, "loss": 0.0277, "step": 6263 }, { "epoch": 2.8498635122838945, "grad_norm": 0.3838226741659891, "learning_rate": 1.9555486022083946e-05, "loss": 0.0534, "step": 6264 }, { "epoch": 2.8503184713375798, "grad_norm": 0.6969254722019949, "learning_rate": 1.9548511296054384e-05, "loss": 0.0406, "step": 6265 }, { "epoch": 2.8507734303912646, "grad_norm": 0.4015280029029296, "learning_rate": 1.954153701549587e-05, "loss": 0.0626, "step": 6266 }, { "epoch": 2.85122838944495, "grad_norm": 0.364731669968054, "learning_rate": 1.953456318097829e-05, "loss": 0.0226, "step": 6267 }, { "epoch": 2.8516833484986353, "grad_norm": 0.6012230688175582, "learning_rate": 1.952758979307153e-05, "loss": 0.059, "step": 6268 }, { "epoch": 2.8521383075523206, "grad_norm": 0.5253334835917285, "learning_rate": 1.9520616852345407e-05, "loss": 0.062, "step": 6269 }, { "epoch": 2.8525932666060054, "grad_norm": 0.4134306877444337, "learning_rate": 1.951364435936974e-05, "loss": 0.0298, "step": 6270 }, { "epoch": 2.8530482256596907, "grad_norm": 0.4926701917674513, "learning_rate": 1.9506672314714283e-05, "loss": 0.0449, "step": 6271 }, { "epoch": 2.853503184713376, "grad_norm": 1.583851903854687, "learning_rate": 1.9499700718948754e-05, "loss": 0.1411, "step": 6272 }, { "epoch": 2.853958143767061, "grad_norm": 0.4465070627195124, "learning_rate": 1.9492729572642845e-05, "loss": 0.0426, "step": 6273 }, { "epoch": 2.8544131028207462, "grad_norm": 0.5839478590560654, "learning_rate": 1.94857588763662e-05, "loss": 0.0867, "step": 6274 }, { "epoch": 2.8548680618744315, "grad_norm": 0.4372848283362783, "learning_rate": 1.9478788630688445e-05, "loss": 0.0464, "step": 6275 }, { "epoch": 2.8553230209281164, "grad_norm": 0.3645555155072635, "learning_rate": 1.9471818836179136e-05, "loss": 0.0509, "step": 6276 }, { "epoch": 2.8557779799818017, "grad_norm": 1.0996321596787306, "learning_rate": 1.9464849493407836e-05, "loss": 0.1014, "step": 6277 }, { "epoch": 2.856232939035487, "grad_norm": 0.3933734312335107, "learning_rate": 1.945788060294404e-05, "loss": 0.0363, "step": 6278 }, { "epoch": 2.856687898089172, "grad_norm": 0.43846363452000664, "learning_rate": 1.945091216535721e-05, "loss": 0.0446, "step": 6279 }, { "epoch": 2.857142857142857, "grad_norm": 0.4168072449306982, "learning_rate": 1.9443944181216782e-05, "loss": 0.0428, "step": 6280 }, { "epoch": 2.8575978161965425, "grad_norm": 0.4039306893087694, "learning_rate": 1.9436976651092144e-05, "loss": 0.0535, "step": 6281 }, { "epoch": 2.8580527752502274, "grad_norm": 0.6179312478000137, "learning_rate": 1.9430009575552653e-05, "loss": 0.0818, "step": 6282 }, { "epoch": 2.8585077343039127, "grad_norm": 0.5000730662157742, "learning_rate": 1.9423042955167615e-05, "loss": 0.0387, "step": 6283 }, { "epoch": 2.858962693357598, "grad_norm": 0.3252306384758088, "learning_rate": 1.9416076790506325e-05, "loss": 0.0303, "step": 6284 }, { "epoch": 2.859417652411283, "grad_norm": 0.40333822433527894, "learning_rate": 1.9409111082138032e-05, "loss": 0.027, "step": 6285 }, { "epoch": 2.859872611464968, "grad_norm": 0.46214521147438253, "learning_rate": 1.9402145830631928e-05, "loss": 0.0326, "step": 6286 }, { "epoch": 2.8603275705186535, "grad_norm": 0.5837845585497694, "learning_rate": 1.9395181036557188e-05, "loss": 0.0491, "step": 6287 }, { "epoch": 2.8607825295723384, "grad_norm": 0.41576247419018364, "learning_rate": 1.9388216700482948e-05, "loss": 0.0528, "step": 6288 }, { "epoch": 2.8612374886260237, "grad_norm": 0.4606833537037853, "learning_rate": 1.9381252822978288e-05, "loss": 0.0344, "step": 6289 }, { "epoch": 2.861692447679709, "grad_norm": 0.5181289778274767, "learning_rate": 1.9374289404612265e-05, "loss": 0.0364, "step": 6290 }, { "epoch": 2.862147406733394, "grad_norm": 0.36340831744349006, "learning_rate": 1.9367326445953924e-05, "loss": 0.066, "step": 6291 }, { "epoch": 2.862602365787079, "grad_norm": 0.4571900325849666, "learning_rate": 1.9360363947572218e-05, "loss": 0.0419, "step": 6292 }, { "epoch": 2.8630573248407645, "grad_norm": 0.49388538955197037, "learning_rate": 1.935340191003611e-05, "loss": 0.0609, "step": 6293 }, { "epoch": 2.8635122838944493, "grad_norm": 0.4725924173899935, "learning_rate": 1.9346440333914494e-05, "loss": 0.0503, "step": 6294 }, { "epoch": 2.8639672429481347, "grad_norm": 0.4648721514418881, "learning_rate": 1.9339479219776243e-05, "loss": 0.0358, "step": 6295 }, { "epoch": 2.86442220200182, "grad_norm": 1.0584688895416101, "learning_rate": 1.9332518568190186e-05, "loss": 0.0445, "step": 6296 }, { "epoch": 2.864877161055505, "grad_norm": 0.3417521925020785, "learning_rate": 1.9325558379725113e-05, "loss": 0.0395, "step": 6297 }, { "epoch": 2.86533212010919, "grad_norm": 0.5637391626037871, "learning_rate": 1.931859865494979e-05, "loss": 0.078, "step": 6298 }, { "epoch": 2.8657870791628755, "grad_norm": 0.4713414389694761, "learning_rate": 1.9311639394432926e-05, "loss": 0.039, "step": 6299 }, { "epoch": 2.8662420382165603, "grad_norm": 0.5313279903067643, "learning_rate": 1.9304680598743206e-05, "loss": 0.0458, "step": 6300 }, { "epoch": 2.8666969972702456, "grad_norm": 0.408451276612519, "learning_rate": 1.9297722268449264e-05, "loss": 0.0328, "step": 6301 }, { "epoch": 2.867151956323931, "grad_norm": 0.34799402202358154, "learning_rate": 1.9290764404119714e-05, "loss": 0.061, "step": 6302 }, { "epoch": 2.867606915377616, "grad_norm": 0.8338439353340322, "learning_rate": 1.9283807006323103e-05, "loss": 0.1437, "step": 6303 }, { "epoch": 2.868061874431301, "grad_norm": 0.3701527784648082, "learning_rate": 1.927685007562797e-05, "loss": 0.0495, "step": 6304 }, { "epoch": 2.8685168334849864, "grad_norm": 0.5322820664800777, "learning_rate": 1.926989361260281e-05, "loss": 0.1031, "step": 6305 }, { "epoch": 2.8689717925386713, "grad_norm": 0.5601684740799774, "learning_rate": 1.9262937617816063e-05, "loss": 0.0393, "step": 6306 }, { "epoch": 2.8694267515923566, "grad_norm": 0.4223416388562856, "learning_rate": 1.925598209183615e-05, "loss": 0.0562, "step": 6307 }, { "epoch": 2.869881710646042, "grad_norm": 0.42507860908411543, "learning_rate": 1.9249027035231436e-05, "loss": 0.0381, "step": 6308 }, { "epoch": 2.870336669699727, "grad_norm": 0.5822718982424732, "learning_rate": 1.9242072448570268e-05, "loss": 0.0415, "step": 6309 }, { "epoch": 2.870791628753412, "grad_norm": 0.466111177301628, "learning_rate": 1.923511833242093e-05, "loss": 0.0411, "step": 6310 }, { "epoch": 2.8712465878070974, "grad_norm": 0.6261909724996231, "learning_rate": 1.9228164687351687e-05, "loss": 0.0698, "step": 6311 }, { "epoch": 2.8717015468607823, "grad_norm": 0.5123823275119227, "learning_rate": 1.9221211513930764e-05, "loss": 0.0697, "step": 6312 }, { "epoch": 2.8721565059144676, "grad_norm": 0.5597773062394297, "learning_rate": 1.921425881272634e-05, "loss": 0.0919, "step": 6313 }, { "epoch": 2.872611464968153, "grad_norm": 0.600619769360237, "learning_rate": 1.9207306584306557e-05, "loss": 0.0567, "step": 6314 }, { "epoch": 2.8730664240218378, "grad_norm": 0.49347615824968377, "learning_rate": 1.920035482923952e-05, "loss": 0.0387, "step": 6315 }, { "epoch": 2.873521383075523, "grad_norm": 0.4443806677301261, "learning_rate": 1.91934035480933e-05, "loss": 0.073, "step": 6316 }, { "epoch": 2.8739763421292084, "grad_norm": 0.44010292584308236, "learning_rate": 1.9186452741435913e-05, "loss": 0.0676, "step": 6317 }, { "epoch": 2.8744313011828937, "grad_norm": 0.84351772429949, "learning_rate": 1.9179502409835347e-05, "loss": 0.0785, "step": 6318 }, { "epoch": 2.8748862602365786, "grad_norm": 0.975246741886884, "learning_rate": 1.9172552553859573e-05, "loss": 0.1544, "step": 6319 }, { "epoch": 2.875341219290264, "grad_norm": 0.5088440713819513, "learning_rate": 1.916560317407648e-05, "loss": 0.0531, "step": 6320 }, { "epoch": 2.875796178343949, "grad_norm": 0.4621735135774628, "learning_rate": 1.9158654271053956e-05, "loss": 0.0527, "step": 6321 }, { "epoch": 2.876251137397634, "grad_norm": 0.4604706876108093, "learning_rate": 1.9151705845359825e-05, "loss": 0.0445, "step": 6322 }, { "epoch": 2.8767060964513194, "grad_norm": 0.5160671107650726, "learning_rate": 1.914475789756187e-05, "loss": 0.0358, "step": 6323 }, { "epoch": 2.8771610555050047, "grad_norm": 0.38917112886240607, "learning_rate": 1.913781042822787e-05, "loss": 0.0297, "step": 6324 }, { "epoch": 2.87761601455869, "grad_norm": 0.3933329517940909, "learning_rate": 1.9130863437925516e-05, "loss": 0.0414, "step": 6325 }, { "epoch": 2.878070973612375, "grad_norm": 0.4300107521156796, "learning_rate": 1.9123916927222504e-05, "loss": 0.0586, "step": 6326 }, { "epoch": 2.87852593266606, "grad_norm": 0.4460212348203218, "learning_rate": 1.9116970896686467e-05, "loss": 0.0541, "step": 6327 }, { "epoch": 2.8789808917197455, "grad_norm": 0.5077275336257604, "learning_rate": 1.9110025346884996e-05, "loss": 0.0467, "step": 6328 }, { "epoch": 2.8794358507734303, "grad_norm": 0.4041465976859691, "learning_rate": 1.9103080278385664e-05, "loss": 0.0568, "step": 6329 }, { "epoch": 2.8798908098271156, "grad_norm": 0.4341609083448615, "learning_rate": 1.909613569175597e-05, "loss": 0.0644, "step": 6330 }, { "epoch": 2.880345768880801, "grad_norm": 0.4732120863347165, "learning_rate": 1.9089191587563417e-05, "loss": 0.0484, "step": 6331 }, { "epoch": 2.880800727934486, "grad_norm": 0.430637555171544, "learning_rate": 1.9082247966375417e-05, "loss": 0.0474, "step": 6332 }, { "epoch": 2.881255686988171, "grad_norm": 0.5048948286809466, "learning_rate": 1.90753048287594e-05, "loss": 0.0282, "step": 6333 }, { "epoch": 2.8817106460418564, "grad_norm": 0.7874160481799456, "learning_rate": 1.906836217528272e-05, "loss": 0.0617, "step": 6334 }, { "epoch": 2.8821656050955413, "grad_norm": 0.4975783410160012, "learning_rate": 1.906142000651269e-05, "loss": 0.0925, "step": 6335 }, { "epoch": 2.8826205641492266, "grad_norm": 0.42989625877616167, "learning_rate": 1.9054478323016606e-05, "loss": 0.0564, "step": 6336 }, { "epoch": 2.883075523202912, "grad_norm": 0.3989531401353556, "learning_rate": 1.9047537125361694e-05, "loss": 0.0403, "step": 6337 }, { "epoch": 2.883530482256597, "grad_norm": 0.6461518890253398, "learning_rate": 1.9040596414115175e-05, "loss": 0.0643, "step": 6338 }, { "epoch": 2.883985441310282, "grad_norm": 0.5057829759432887, "learning_rate": 1.9033656189844197e-05, "loss": 0.0652, "step": 6339 }, { "epoch": 2.8844404003639674, "grad_norm": 0.32740441389191277, "learning_rate": 1.9026716453115893e-05, "loss": 0.0328, "step": 6340 }, { "epoch": 2.8848953594176523, "grad_norm": 0.6977215143140824, "learning_rate": 1.9019777204497353e-05, "loss": 0.0518, "step": 6341 }, { "epoch": 2.8853503184713376, "grad_norm": 0.4015196603686799, "learning_rate": 1.9012838444555606e-05, "loss": 0.0325, "step": 6342 }, { "epoch": 2.885805277525023, "grad_norm": 0.46414583154821304, "learning_rate": 1.900590017385767e-05, "loss": 0.0464, "step": 6343 }, { "epoch": 2.886260236578708, "grad_norm": 0.509283184662487, "learning_rate": 1.8998962392970495e-05, "loss": 0.0554, "step": 6344 }, { "epoch": 2.886715195632393, "grad_norm": 0.5640560935146285, "learning_rate": 1.899202510246102e-05, "loss": 0.0635, "step": 6345 }, { "epoch": 2.8871701546860784, "grad_norm": 0.36041576783339935, "learning_rate": 1.898508830289611e-05, "loss": 0.0294, "step": 6346 }, { "epoch": 2.8876251137397633, "grad_norm": 0.39672332124640297, "learning_rate": 1.897815199484263e-05, "loss": 0.0333, "step": 6347 }, { "epoch": 2.8880800727934486, "grad_norm": 0.6829717212717494, "learning_rate": 1.8971216178867378e-05, "loss": 0.0728, "step": 6348 }, { "epoch": 2.888535031847134, "grad_norm": 0.3205228796724038, "learning_rate": 1.8964280855537107e-05, "loss": 0.0224, "step": 6349 }, { "epoch": 2.8889899909008188, "grad_norm": 0.5536718936656592, "learning_rate": 1.8957346025418556e-05, "loss": 0.0572, "step": 6350 }, { "epoch": 2.889444949954504, "grad_norm": 0.4370333998865178, "learning_rate": 1.895041168907839e-05, "loss": 0.0444, "step": 6351 }, { "epoch": 2.8898999090081894, "grad_norm": 0.4904084147322348, "learning_rate": 1.8943477847083267e-05, "loss": 0.0412, "step": 6352 }, { "epoch": 2.8903548680618742, "grad_norm": 1.0872153188104625, "learning_rate": 1.8936544499999775e-05, "loss": 0.0502, "step": 6353 }, { "epoch": 2.8908098271155596, "grad_norm": 0.49364414557417696, "learning_rate": 1.8929611648394488e-05, "loss": 0.0352, "step": 6354 }, { "epoch": 2.891264786169245, "grad_norm": 0.5106220449471344, "learning_rate": 1.8922679292833927e-05, "loss": 0.0426, "step": 6355 }, { "epoch": 2.8917197452229297, "grad_norm": 0.5454941302447947, "learning_rate": 1.8915747433884568e-05, "loss": 0.0744, "step": 6356 }, { "epoch": 2.892174704276615, "grad_norm": 0.5553517895206562, "learning_rate": 1.8908816072112856e-05, "loss": 0.0551, "step": 6357 }, { "epoch": 2.8926296633303004, "grad_norm": 0.5013639032526548, "learning_rate": 1.8901885208085185e-05, "loss": 0.0301, "step": 6358 }, { "epoch": 2.8930846223839852, "grad_norm": 0.3443810892714601, "learning_rate": 1.8894954842367912e-05, "loss": 0.0321, "step": 6359 }, { "epoch": 2.8935395814376705, "grad_norm": 0.5524805309186994, "learning_rate": 1.8888024975527356e-05, "loss": 0.0375, "step": 6360 }, { "epoch": 2.893994540491356, "grad_norm": 0.48668116136364886, "learning_rate": 1.8881095608129806e-05, "loss": 0.0551, "step": 6361 }, { "epoch": 2.8944494995450407, "grad_norm": 0.3416303468046886, "learning_rate": 1.8874166740741486e-05, "loss": 0.0268, "step": 6362 }, { "epoch": 2.894904458598726, "grad_norm": 0.5016605055004641, "learning_rate": 1.88672383739286e-05, "loss": 0.0774, "step": 6363 }, { "epoch": 2.8953594176524113, "grad_norm": 0.7030059708806011, "learning_rate": 1.8860310508257296e-05, "loss": 0.054, "step": 6364 }, { "epoch": 2.895814376706096, "grad_norm": 0.38788572889991524, "learning_rate": 1.8853383144293694e-05, "loss": 0.0636, "step": 6365 }, { "epoch": 2.8962693357597815, "grad_norm": 0.6456613836021996, "learning_rate": 1.884645628260386e-05, "loss": 0.0449, "step": 6366 }, { "epoch": 2.896724294813467, "grad_norm": 0.4372834750649628, "learning_rate": 1.883952992375382e-05, "loss": 0.0523, "step": 6367 }, { "epoch": 2.8971792538671517, "grad_norm": 0.44255568258953437, "learning_rate": 1.8832604068309586e-05, "loss": 0.0308, "step": 6368 }, { "epoch": 2.897634212920837, "grad_norm": 0.4444225286856157, "learning_rate": 1.8825678716837093e-05, "loss": 0.0425, "step": 6369 }, { "epoch": 2.8980891719745223, "grad_norm": 0.6352318612231749, "learning_rate": 1.8818753869902256e-05, "loss": 0.0388, "step": 6370 }, { "epoch": 2.8985441310282076, "grad_norm": 0.639153753405357, "learning_rate": 1.8811829528070935e-05, "loss": 0.0744, "step": 6371 }, { "epoch": 2.8989990900818925, "grad_norm": 0.7989997837531893, "learning_rate": 1.8804905691908963e-05, "loss": 0.0806, "step": 6372 }, { "epoch": 2.899454049135578, "grad_norm": 0.44222594608027777, "learning_rate": 1.8797982361982118e-05, "loss": 0.0279, "step": 6373 }, { "epoch": 2.899909008189263, "grad_norm": 0.3094255655589653, "learning_rate": 1.879105953885614e-05, "loss": 0.0248, "step": 6374 }, { "epoch": 2.900363967242948, "grad_norm": 0.5566738234844197, "learning_rate": 1.878413722309674e-05, "loss": 0.0611, "step": 6375 }, { "epoch": 2.9008189262966333, "grad_norm": 0.39228953272442796, "learning_rate": 1.877721541526958e-05, "loss": 0.0347, "step": 6376 }, { "epoch": 2.9012738853503186, "grad_norm": 0.415109886720911, "learning_rate": 1.877029411594028e-05, "loss": 0.0193, "step": 6377 }, { "epoch": 2.901728844404004, "grad_norm": 0.5050528547569508, "learning_rate": 1.87633733256744e-05, "loss": 0.0564, "step": 6378 }, { "epoch": 2.902183803457689, "grad_norm": 0.6155279184016003, "learning_rate": 1.8756453045037498e-05, "loss": 0.0536, "step": 6379 }, { "epoch": 2.902638762511374, "grad_norm": 0.5814855214954384, "learning_rate": 1.8749533274595048e-05, "loss": 0.0507, "step": 6380 }, { "epoch": 2.9030937215650594, "grad_norm": 0.6899497643493389, "learning_rate": 1.874261401491251e-05, "loss": 0.0381, "step": 6381 }, { "epoch": 2.9035486806187443, "grad_norm": 0.4158753389995806, "learning_rate": 1.8735695266555306e-05, "loss": 0.0391, "step": 6382 }, { "epoch": 2.9040036396724296, "grad_norm": 0.4627914601520902, "learning_rate": 1.872877703008879e-05, "loss": 0.0473, "step": 6383 }, { "epoch": 2.904458598726115, "grad_norm": 0.48609334204235416, "learning_rate": 1.87218593060783e-05, "loss": 0.0475, "step": 6384 }, { "epoch": 2.9049135577797998, "grad_norm": 0.46666764349195816, "learning_rate": 1.8714942095089112e-05, "loss": 0.087, "step": 6385 }, { "epoch": 2.905368516833485, "grad_norm": 0.8921368506482092, "learning_rate": 1.8708025397686474e-05, "loss": 0.0846, "step": 6386 }, { "epoch": 2.9058234758871704, "grad_norm": 0.4571189394038078, "learning_rate": 1.8701109214435587e-05, "loss": 0.0303, "step": 6387 }, { "epoch": 2.9062784349408552, "grad_norm": 0.3744383679220671, "learning_rate": 1.86941935459016e-05, "loss": 0.0586, "step": 6388 }, { "epoch": 2.9067333939945406, "grad_norm": 0.40013547741049993, "learning_rate": 1.8687278392649647e-05, "loss": 0.0366, "step": 6389 }, { "epoch": 2.907188353048226, "grad_norm": 0.4676882023307694, "learning_rate": 1.8680363755244798e-05, "loss": 0.0333, "step": 6390 }, { "epoch": 2.9076433121019107, "grad_norm": 0.459273723986158, "learning_rate": 1.8673449634252085e-05, "loss": 0.0397, "step": 6391 }, { "epoch": 2.908098271155596, "grad_norm": 0.3732017316357512, "learning_rate": 1.866653603023649e-05, "loss": 0.0394, "step": 6392 }, { "epoch": 2.9085532302092814, "grad_norm": 0.44536229211391126, "learning_rate": 1.865962294376298e-05, "loss": 0.0489, "step": 6393 }, { "epoch": 2.9090081892629662, "grad_norm": 1.0732095610355634, "learning_rate": 1.865271037539645e-05, "loss": 0.0793, "step": 6394 }, { "epoch": 2.9094631483166515, "grad_norm": 0.562474628422375, "learning_rate": 1.864579832570174e-05, "loss": 0.048, "step": 6395 }, { "epoch": 2.909918107370337, "grad_norm": 0.6004182731804657, "learning_rate": 1.8638886795243717e-05, "loss": 0.058, "step": 6396 }, { "epoch": 2.9103730664240217, "grad_norm": 0.6370406856490164, "learning_rate": 1.863197578458714e-05, "loss": 0.0379, "step": 6397 }, { "epoch": 2.910828025477707, "grad_norm": 0.8100652080902297, "learning_rate": 1.8625065294296735e-05, "loss": 0.0291, "step": 6398 }, { "epoch": 2.9112829845313923, "grad_norm": 0.5392411964212512, "learning_rate": 1.8618155324937215e-05, "loss": 0.0606, "step": 6399 }, { "epoch": 2.911737943585077, "grad_norm": 0.3975453261200665, "learning_rate": 1.8611245877073213e-05, "loss": 0.0246, "step": 6400 }, { "epoch": 2.9121929026387625, "grad_norm": 0.33520747999858574, "learning_rate": 1.8604336951269353e-05, "loss": 0.0255, "step": 6401 }, { "epoch": 2.912647861692448, "grad_norm": 0.5513081288065786, "learning_rate": 1.859742854809018e-05, "loss": 0.0586, "step": 6402 }, { "epoch": 2.9131028207461327, "grad_norm": 0.40925282161460025, "learning_rate": 1.859052066810024e-05, "loss": 0.0275, "step": 6403 }, { "epoch": 2.913557779799818, "grad_norm": 0.35332045568122467, "learning_rate": 1.8583613311864018e-05, "loss": 0.0363, "step": 6404 }, { "epoch": 2.9140127388535033, "grad_norm": 0.4838701006643091, "learning_rate": 1.8576706479945928e-05, "loss": 0.0512, "step": 6405 }, { "epoch": 2.914467697907188, "grad_norm": 0.7701684771946786, "learning_rate": 1.8569800172910385e-05, "loss": 0.073, "step": 6406 }, { "epoch": 2.9149226569608735, "grad_norm": 0.6854996044655165, "learning_rate": 1.8562894391321725e-05, "loss": 0.0549, "step": 6407 }, { "epoch": 2.915377616014559, "grad_norm": 0.37652332859750715, "learning_rate": 1.8555989135744273e-05, "loss": 0.0351, "step": 6408 }, { "epoch": 2.9158325750682437, "grad_norm": 0.42061976771594667, "learning_rate": 1.8549084406742278e-05, "loss": 0.0365, "step": 6409 }, { "epoch": 2.916287534121929, "grad_norm": 0.5782044305315749, "learning_rate": 1.854218020487998e-05, "loss": 0.0587, "step": 6410 }, { "epoch": 2.9167424931756143, "grad_norm": 0.48890641102936716, "learning_rate": 1.853527653072155e-05, "loss": 0.0423, "step": 6411 }, { "epoch": 2.917197452229299, "grad_norm": 0.5725478541805709, "learning_rate": 1.852837338483113e-05, "loss": 0.0371, "step": 6412 }, { "epoch": 2.9176524112829845, "grad_norm": 0.6020558289770852, "learning_rate": 1.8521470767772815e-05, "loss": 0.0525, "step": 6413 }, { "epoch": 2.91810737033667, "grad_norm": 0.5732151824281042, "learning_rate": 1.8514568680110644e-05, "loss": 0.0764, "step": 6414 }, { "epoch": 2.9185623293903546, "grad_norm": 0.3231846657500617, "learning_rate": 1.850766712240864e-05, "loss": 0.0179, "step": 6415 }, { "epoch": 2.91901728844404, "grad_norm": 0.5140618685428541, "learning_rate": 1.850076609523075e-05, "loss": 0.0221, "step": 6416 }, { "epoch": 2.9194722474977253, "grad_norm": 0.4331464747900244, "learning_rate": 1.849386559914091e-05, "loss": 0.0624, "step": 6417 }, { "epoch": 2.91992720655141, "grad_norm": 0.47012727224978357, "learning_rate": 1.8486965634702996e-05, "loss": 0.0401, "step": 6418 }, { "epoch": 2.9203821656050954, "grad_norm": 0.40488566638243373, "learning_rate": 1.848006620248083e-05, "loss": 0.0576, "step": 6419 }, { "epoch": 2.9208371246587808, "grad_norm": 0.42362498084851663, "learning_rate": 1.8473167303038218e-05, "loss": 0.0199, "step": 6420 }, { "epoch": 2.9212920837124656, "grad_norm": 0.3699346679673103, "learning_rate": 1.8466268936938893e-05, "loss": 0.0283, "step": 6421 }, { "epoch": 2.921747042766151, "grad_norm": 0.46377868675245537, "learning_rate": 1.845937110474657e-05, "loss": 0.0392, "step": 6422 }, { "epoch": 2.9222020018198362, "grad_norm": 0.2848053896746991, "learning_rate": 1.8452473807024894e-05, "loss": 0.0278, "step": 6423 }, { "epoch": 2.922656960873521, "grad_norm": 0.3813317845588789, "learning_rate": 1.844557704433749e-05, "loss": 0.0371, "step": 6424 }, { "epoch": 2.9231119199272064, "grad_norm": 0.5943834178995641, "learning_rate": 1.8438680817247944e-05, "loss": 0.0772, "step": 6425 }, { "epoch": 2.9235668789808917, "grad_norm": 0.43838797500346816, "learning_rate": 1.8431785126319762e-05, "loss": 0.0594, "step": 6426 }, { "epoch": 2.924021838034577, "grad_norm": 0.3668944869175957, "learning_rate": 1.842488997211644e-05, "loss": 0.0271, "step": 6427 }, { "epoch": 2.924476797088262, "grad_norm": 0.44128846273372124, "learning_rate": 1.8417995355201415e-05, "loss": 0.0442, "step": 6428 }, { "epoch": 2.9249317561419472, "grad_norm": 0.4858604813494631, "learning_rate": 1.8411101276138087e-05, "loss": 0.0571, "step": 6429 }, { "epoch": 2.9253867151956325, "grad_norm": 0.5968583623702792, "learning_rate": 1.84042077354898e-05, "loss": 0.0298, "step": 6430 }, { "epoch": 2.9258416742493174, "grad_norm": 0.43393685133344273, "learning_rate": 1.8397314733819875e-05, "loss": 0.0392, "step": 6431 }, { "epoch": 2.9262966333030027, "grad_norm": 0.6412241329968685, "learning_rate": 1.839042227169158e-05, "loss": 0.0573, "step": 6432 }, { "epoch": 2.926751592356688, "grad_norm": 0.3765917791367808, "learning_rate": 1.8383530349668126e-05, "loss": 0.04, "step": 6433 }, { "epoch": 2.9272065514103733, "grad_norm": 0.48713913655333596, "learning_rate": 1.8376638968312687e-05, "loss": 0.0359, "step": 6434 }, { "epoch": 2.927661510464058, "grad_norm": 0.5408768372585492, "learning_rate": 1.8369748128188407e-05, "loss": 0.0312, "step": 6435 }, { "epoch": 2.9281164695177435, "grad_norm": 0.5583990403163154, "learning_rate": 1.836285782985836e-05, "loss": 0.0467, "step": 6436 }, { "epoch": 2.928571428571429, "grad_norm": 0.47436309678852784, "learning_rate": 1.8355968073885594e-05, "loss": 0.0592, "step": 6437 }, { "epoch": 2.9290263876251137, "grad_norm": 0.41226246428620245, "learning_rate": 1.8349078860833123e-05, "loss": 0.0356, "step": 6438 }, { "epoch": 2.929481346678799, "grad_norm": 0.7581316770267718, "learning_rate": 1.8342190191263893e-05, "loss": 0.0644, "step": 6439 }, { "epoch": 2.9299363057324843, "grad_norm": 0.4697693886574373, "learning_rate": 1.833530206574081e-05, "loss": 0.0665, "step": 6440 }, { "epoch": 2.930391264786169, "grad_norm": 0.47144028562315216, "learning_rate": 1.8328414484826745e-05, "loss": 0.0395, "step": 6441 }, { "epoch": 2.9308462238398545, "grad_norm": 0.48368131143477283, "learning_rate": 1.8321527449084523e-05, "loss": 0.0547, "step": 6442 }, { "epoch": 2.93130118289354, "grad_norm": 0.46771105284724984, "learning_rate": 1.8314640959076916e-05, "loss": 0.0415, "step": 6443 }, { "epoch": 2.9317561419472247, "grad_norm": 0.5148354477767096, "learning_rate": 1.830775501536665e-05, "loss": 0.0301, "step": 6444 }, { "epoch": 2.93221110100091, "grad_norm": 0.48451859072515135, "learning_rate": 1.8300869618516433e-05, "loss": 0.0353, "step": 6445 }, { "epoch": 2.9326660600545953, "grad_norm": 0.39206831709361073, "learning_rate": 1.8293984769088895e-05, "loss": 0.0346, "step": 6446 }, { "epoch": 2.93312101910828, "grad_norm": 0.4848426067527017, "learning_rate": 1.828710046764664e-05, "loss": 0.0704, "step": 6447 }, { "epoch": 2.9335759781619655, "grad_norm": 0.41432809959756034, "learning_rate": 1.8280216714752217e-05, "loss": 0.0427, "step": 6448 }, { "epoch": 2.934030937215651, "grad_norm": 0.41695623914981184, "learning_rate": 1.827333351096814e-05, "loss": 0.0432, "step": 6449 }, { "epoch": 2.9344858962693356, "grad_norm": 0.434494154899831, "learning_rate": 1.826645085685687e-05, "loss": 0.0378, "step": 6450 }, { "epoch": 2.934940855323021, "grad_norm": 0.42875144237814594, "learning_rate": 1.8259568752980817e-05, "loss": 0.0421, "step": 6451 }, { "epoch": 2.9353958143767063, "grad_norm": 0.36183644138144816, "learning_rate": 1.825268719990238e-05, "loss": 0.0332, "step": 6452 }, { "epoch": 2.935850773430391, "grad_norm": 0.5262438709052168, "learning_rate": 1.8245806198183867e-05, "loss": 0.0649, "step": 6453 }, { "epoch": 2.9363057324840764, "grad_norm": 0.5975590511822287, "learning_rate": 1.823892574838758e-05, "loss": 0.055, "step": 6454 }, { "epoch": 2.9367606915377618, "grad_norm": 0.38845367067125425, "learning_rate": 1.823204585107574e-05, "loss": 0.0297, "step": 6455 }, { "epoch": 2.9372156505914466, "grad_norm": 0.6554154010549523, "learning_rate": 1.8225166506810553e-05, "loss": 0.0873, "step": 6456 }, { "epoch": 2.937670609645132, "grad_norm": 0.5887707956563889, "learning_rate": 1.821828771615416e-05, "loss": 0.0426, "step": 6457 }, { "epoch": 2.9381255686988172, "grad_norm": 0.6245329939751205, "learning_rate": 1.821140947966866e-05, "loss": 0.0587, "step": 6458 }, { "epoch": 2.938580527752502, "grad_norm": 0.3990389788153692, "learning_rate": 1.820453179791614e-05, "loss": 0.06, "step": 6459 }, { "epoch": 2.9390354868061874, "grad_norm": 0.5635143974978624, "learning_rate": 1.819765467145858e-05, "loss": 0.0231, "step": 6460 }, { "epoch": 2.9394904458598727, "grad_norm": 0.5323109009580734, "learning_rate": 1.819077810085797e-05, "loss": 0.0927, "step": 6461 }, { "epoch": 2.9399454049135576, "grad_norm": 0.4896734533472743, "learning_rate": 1.8183902086676217e-05, "loss": 0.0546, "step": 6462 }, { "epoch": 2.940400363967243, "grad_norm": 0.5666974214612259, "learning_rate": 1.8177026629475208e-05, "loss": 0.0429, "step": 6463 }, { "epoch": 2.9408553230209282, "grad_norm": 0.45446299119526196, "learning_rate": 1.8170151729816774e-05, "loss": 0.0439, "step": 6464 }, { "epoch": 2.941310282074613, "grad_norm": 0.46455943415922296, "learning_rate": 1.8163277388262678e-05, "loss": 0.0423, "step": 6465 }, { "epoch": 2.9417652411282984, "grad_norm": 0.5589469965070846, "learning_rate": 1.81564036053747e-05, "loss": 0.0738, "step": 6466 }, { "epoch": 2.9422202001819837, "grad_norm": 0.8438807980749227, "learning_rate": 1.814953038171451e-05, "loss": 0.034, "step": 6467 }, { "epoch": 2.9426751592356686, "grad_norm": 0.4963596847485868, "learning_rate": 1.8142657717843757e-05, "loss": 0.0437, "step": 6468 }, { "epoch": 2.943130118289354, "grad_norm": 0.452592654005445, "learning_rate": 1.813578561432405e-05, "loss": 0.0306, "step": 6469 }, { "epoch": 2.943585077343039, "grad_norm": 0.4055820814779068, "learning_rate": 1.812891407171694e-05, "loss": 0.042, "step": 6470 }, { "epoch": 2.944040036396724, "grad_norm": 0.48829459936580477, "learning_rate": 1.812204309058395e-05, "loss": 0.0577, "step": 6471 }, { "epoch": 2.9444949954504094, "grad_norm": 0.47791656579062725, "learning_rate": 1.811517267148653e-05, "loss": 0.0667, "step": 6472 }, { "epoch": 2.9449499545040947, "grad_norm": 0.3592772300508168, "learning_rate": 1.810830281498611e-05, "loss": 0.041, "step": 6473 }, { "epoch": 2.9454049135577796, "grad_norm": 0.5224976574497467, "learning_rate": 1.810143352164406e-05, "loss": 0.0549, "step": 6474 }, { "epoch": 2.945859872611465, "grad_norm": 0.4807018373529452, "learning_rate": 1.809456479202171e-05, "loss": 0.0473, "step": 6475 }, { "epoch": 2.94631483166515, "grad_norm": 0.3234785407404316, "learning_rate": 1.808769662668035e-05, "loss": 0.0303, "step": 6476 }, { "epoch": 2.946769790718835, "grad_norm": 0.49898044893043564, "learning_rate": 1.8080829026181196e-05, "loss": 0.0565, "step": 6477 }, { "epoch": 2.9472247497725204, "grad_norm": 0.5662462492028029, "learning_rate": 1.8073961991085452e-05, "loss": 0.0622, "step": 6478 }, { "epoch": 2.9476797088262057, "grad_norm": 0.4595957025856471, "learning_rate": 1.8067095521954247e-05, "loss": 0.0705, "step": 6479 }, { "epoch": 2.9481346678798905, "grad_norm": 0.48517388398606903, "learning_rate": 1.806022961934869e-05, "loss": 0.0316, "step": 6480 }, { "epoch": 2.948589626933576, "grad_norm": 0.5775757358787572, "learning_rate": 1.8053364283829838e-05, "loss": 0.0574, "step": 6481 }, { "epoch": 2.949044585987261, "grad_norm": 0.39634232118829965, "learning_rate": 1.8046499515958683e-05, "loss": 0.0497, "step": 6482 }, { "epoch": 2.9494995450409465, "grad_norm": 0.5129241659216747, "learning_rate": 1.8039635316296182e-05, "loss": 0.0802, "step": 6483 }, { "epoch": 2.9499545040946313, "grad_norm": 0.4199711076449035, "learning_rate": 1.803277168540325e-05, "loss": 0.0463, "step": 6484 }, { "epoch": 2.9504094631483166, "grad_norm": 0.5848538892198767, "learning_rate": 1.802590862384076e-05, "loss": 0.0448, "step": 6485 }, { "epoch": 2.950864422202002, "grad_norm": 0.5271199364441901, "learning_rate": 1.801904613216951e-05, "loss": 0.0746, "step": 6486 }, { "epoch": 2.951319381255687, "grad_norm": 0.5814752095812844, "learning_rate": 1.801218421095029e-05, "loss": 0.0558, "step": 6487 }, { "epoch": 2.951774340309372, "grad_norm": 0.37432528066419946, "learning_rate": 1.8005322860743824e-05, "loss": 0.0347, "step": 6488 }, { "epoch": 2.9522292993630574, "grad_norm": 0.7577972758805229, "learning_rate": 1.7998462082110778e-05, "loss": 0.0264, "step": 6489 }, { "epoch": 2.9526842584167428, "grad_norm": 0.42324493588954853, "learning_rate": 1.7991601875611803e-05, "loss": 0.0346, "step": 6490 }, { "epoch": 2.9531392174704276, "grad_norm": 0.37967750330681305, "learning_rate": 1.798474224180746e-05, "loss": 0.0219, "step": 6491 }, { "epoch": 2.953594176524113, "grad_norm": 0.46460249621847455, "learning_rate": 1.7977883181258315e-05, "loss": 0.0469, "step": 6492 }, { "epoch": 2.9540491355777982, "grad_norm": 0.4256403554324309, "learning_rate": 1.7971024694524828e-05, "loss": 0.0478, "step": 6493 }, { "epoch": 2.954504094631483, "grad_norm": 0.7067148038690045, "learning_rate": 1.796416678216747e-05, "loss": 0.0319, "step": 6494 }, { "epoch": 2.9549590536851684, "grad_norm": 0.7682637276475771, "learning_rate": 1.795730944474663e-05, "loss": 0.0604, "step": 6495 }, { "epoch": 2.9554140127388537, "grad_norm": 0.47511100727051503, "learning_rate": 1.7950452682822653e-05, "loss": 0.0455, "step": 6496 }, { "epoch": 2.9558689717925386, "grad_norm": 0.41751890576930634, "learning_rate": 1.7943596496955854e-05, "loss": 0.0521, "step": 6497 }, { "epoch": 2.956323930846224, "grad_norm": 0.5463758731995829, "learning_rate": 1.7936740887706477e-05, "loss": 0.0513, "step": 6498 }, { "epoch": 2.9567788898999092, "grad_norm": 0.5205363669998008, "learning_rate": 1.792988585563474e-05, "loss": 0.0826, "step": 6499 }, { "epoch": 2.957233848953594, "grad_norm": 0.8570788868373138, "learning_rate": 1.79230314013008e-05, "loss": 0.0648, "step": 6500 }, { "epoch": 2.9576888080072794, "grad_norm": 0.641893558766693, "learning_rate": 1.7916177525264775e-05, "loss": 0.0838, "step": 6501 }, { "epoch": 2.9581437670609647, "grad_norm": 0.3835849025100129, "learning_rate": 1.790932422808674e-05, "loss": 0.0472, "step": 6502 }, { "epoch": 2.9585987261146496, "grad_norm": 0.43551969493172343, "learning_rate": 1.7902471510326703e-05, "loss": 0.0256, "step": 6503 }, { "epoch": 2.959053685168335, "grad_norm": 0.45516325981278877, "learning_rate": 1.7895619372544637e-05, "loss": 0.0765, "step": 6504 }, { "epoch": 2.95950864422202, "grad_norm": 0.4197901220857688, "learning_rate": 1.788876781530048e-05, "loss": 0.0636, "step": 6505 }, { "epoch": 2.959963603275705, "grad_norm": 0.4806067183204972, "learning_rate": 1.78819168391541e-05, "loss": 0.0442, "step": 6506 }, { "epoch": 2.9604185623293904, "grad_norm": 0.3578715363154314, "learning_rate": 1.7875066444665322e-05, "loss": 0.0491, "step": 6507 }, { "epoch": 2.9608735213830757, "grad_norm": 0.4609607106012463, "learning_rate": 1.786821663239395e-05, "loss": 0.0475, "step": 6508 }, { "epoch": 2.9613284804367606, "grad_norm": 0.4243276772377247, "learning_rate": 1.7861367402899703e-05, "loss": 0.0386, "step": 6509 }, { "epoch": 2.961783439490446, "grad_norm": 0.34119921987950586, "learning_rate": 1.7854518756742277e-05, "loss": 0.0444, "step": 6510 }, { "epoch": 2.962238398544131, "grad_norm": 0.5687484829774583, "learning_rate": 1.7847670694481307e-05, "loss": 0.061, "step": 6511 }, { "epoch": 2.962693357597816, "grad_norm": 0.6044887812118247, "learning_rate": 1.7840823216676395e-05, "loss": 0.0741, "step": 6512 }, { "epoch": 2.9631483166515014, "grad_norm": 0.6655988871973072, "learning_rate": 1.783397632388707e-05, "loss": 0.0575, "step": 6513 }, { "epoch": 2.9636032757051867, "grad_norm": 0.5234930347988113, "learning_rate": 1.7827130016672837e-05, "loss": 0.0651, "step": 6514 }, { "epoch": 2.9640582347588715, "grad_norm": 0.4867160808905847, "learning_rate": 1.7820284295593152e-05, "loss": 0.0515, "step": 6515 }, { "epoch": 2.964513193812557, "grad_norm": 0.3884461132080658, "learning_rate": 1.7813439161207413e-05, "loss": 0.0408, "step": 6516 }, { "epoch": 2.964968152866242, "grad_norm": 0.38788798390266777, "learning_rate": 1.7806594614074973e-05, "loss": 0.033, "step": 6517 }, { "epoch": 2.965423111919927, "grad_norm": 0.41024040963354375, "learning_rate": 1.7799750654755125e-05, "loss": 0.0374, "step": 6518 }, { "epoch": 2.9658780709736123, "grad_norm": 0.4701304846801108, "learning_rate": 1.7792907283807154e-05, "loss": 0.0576, "step": 6519 }, { "epoch": 2.9663330300272976, "grad_norm": 0.5310020995542613, "learning_rate": 1.778606450179024e-05, "loss": 0.0494, "step": 6520 }, { "epoch": 2.9667879890809825, "grad_norm": 0.4685386565190601, "learning_rate": 1.7779222309263555e-05, "loss": 0.048, "step": 6521 }, { "epoch": 2.967242948134668, "grad_norm": 0.46991410923528565, "learning_rate": 1.777238070678622e-05, "loss": 0.0712, "step": 6522 }, { "epoch": 2.967697907188353, "grad_norm": 0.5661472389216139, "learning_rate": 1.7765539694917293e-05, "loss": 0.0411, "step": 6523 }, { "epoch": 2.968152866242038, "grad_norm": 0.3727448899880203, "learning_rate": 1.7758699274215794e-05, "loss": 0.0406, "step": 6524 }, { "epoch": 2.9686078252957233, "grad_norm": 0.41416036226848724, "learning_rate": 1.7751859445240687e-05, "loss": 0.0427, "step": 6525 }, { "epoch": 2.9690627843494086, "grad_norm": 0.3859995882260844, "learning_rate": 1.7745020208550897e-05, "loss": 0.0635, "step": 6526 }, { "epoch": 2.9695177434030935, "grad_norm": 0.3934265428623855, "learning_rate": 1.7738181564705286e-05, "loss": 0.0481, "step": 6527 }, { "epoch": 2.969972702456779, "grad_norm": 0.49184826582273167, "learning_rate": 1.7731343514262683e-05, "loss": 0.05, "step": 6528 }, { "epoch": 2.970427661510464, "grad_norm": 0.6949004111536389, "learning_rate": 1.772450605778187e-05, "loss": 0.0858, "step": 6529 }, { "epoch": 2.970882620564149, "grad_norm": 0.4082413504065272, "learning_rate": 1.771766919582156e-05, "loss": 0.0406, "step": 6530 }, { "epoch": 2.9713375796178343, "grad_norm": 0.45486573558751703, "learning_rate": 1.7710832928940445e-05, "loss": 0.032, "step": 6531 }, { "epoch": 2.9717925386715196, "grad_norm": 0.4844300680886603, "learning_rate": 1.7703997257697137e-05, "loss": 0.064, "step": 6532 }, { "epoch": 2.9722474977252045, "grad_norm": 0.5188643560315637, "learning_rate": 1.7697162182650227e-05, "loss": 0.0427, "step": 6533 }, { "epoch": 2.97270245677889, "grad_norm": 0.6171469983355653, "learning_rate": 1.7690327704358246e-05, "loss": 0.0826, "step": 6534 }, { "epoch": 2.973157415832575, "grad_norm": 0.6037284744720023, "learning_rate": 1.7683493823379666e-05, "loss": 0.072, "step": 6535 }, { "epoch": 2.9736123748862604, "grad_norm": 0.5266719625746314, "learning_rate": 1.7676660540272942e-05, "loss": 0.0963, "step": 6536 }, { "epoch": 2.9740673339399453, "grad_norm": 0.4837935691142685, "learning_rate": 1.766982785559644e-05, "loss": 0.0539, "step": 6537 }, { "epoch": 2.9745222929936306, "grad_norm": 0.5908331864246189, "learning_rate": 1.7662995769908507e-05, "loss": 0.0474, "step": 6538 }, { "epoch": 2.974977252047316, "grad_norm": 0.40464451980276656, "learning_rate": 1.765616428376743e-05, "loss": 0.0339, "step": 6539 }, { "epoch": 2.9754322111010008, "grad_norm": 0.6478953985280611, "learning_rate": 1.7649333397731433e-05, "loss": 0.0788, "step": 6540 }, { "epoch": 2.975887170154686, "grad_norm": 0.266825850813696, "learning_rate": 1.7642503112358725e-05, "loss": 0.0153, "step": 6541 }, { "epoch": 2.9763421292083714, "grad_norm": 0.40437699005818056, "learning_rate": 1.763567342820742e-05, "loss": 0.0577, "step": 6542 }, { "epoch": 2.9767970882620567, "grad_norm": 0.41218493228582825, "learning_rate": 1.7628844345835638e-05, "loss": 0.0451, "step": 6543 }, { "epoch": 2.9772520473157416, "grad_norm": 0.422682229893751, "learning_rate": 1.762201586580141e-05, "loss": 0.0353, "step": 6544 }, { "epoch": 2.977707006369427, "grad_norm": 0.45844465535579415, "learning_rate": 1.7615187988662722e-05, "loss": 0.0457, "step": 6545 }, { "epoch": 2.978161965423112, "grad_norm": 0.3972114699878978, "learning_rate": 1.760836071497753e-05, "loss": 0.0372, "step": 6546 }, { "epoch": 2.978616924476797, "grad_norm": 0.5210843400788762, "learning_rate": 1.7601534045303707e-05, "loss": 0.0286, "step": 6547 }, { "epoch": 2.9790718835304824, "grad_norm": 1.246301977705609, "learning_rate": 1.7594707980199122e-05, "loss": 0.1261, "step": 6548 }, { "epoch": 2.9795268425841677, "grad_norm": 0.5222407450558753, "learning_rate": 1.758788252022155e-05, "loss": 0.0662, "step": 6549 }, { "epoch": 2.9799818016378525, "grad_norm": 0.8957386136756781, "learning_rate": 1.7581057665928747e-05, "loss": 0.0779, "step": 6550 }, { "epoch": 2.980436760691538, "grad_norm": 0.5227490026764984, "learning_rate": 1.7574233417878414e-05, "loss": 0.0605, "step": 6551 }, { "epoch": 2.980891719745223, "grad_norm": 0.37784139397193717, "learning_rate": 1.7567409776628184e-05, "loss": 0.06, "step": 6552 }, { "epoch": 2.981346678798908, "grad_norm": 0.4093196131262748, "learning_rate": 1.756058674273567e-05, "loss": 0.0453, "step": 6553 }, { "epoch": 2.9818016378525933, "grad_norm": 0.4515348287087562, "learning_rate": 1.755376431675841e-05, "loss": 0.0267, "step": 6554 }, { "epoch": 2.9822565969062786, "grad_norm": 0.5137883706010631, "learning_rate": 1.75469424992539e-05, "loss": 0.0446, "step": 6555 }, { "epoch": 2.9827115559599635, "grad_norm": 0.44567121351124883, "learning_rate": 1.7540121290779586e-05, "loss": 0.0392, "step": 6556 }, { "epoch": 2.983166515013649, "grad_norm": 0.4619322999329031, "learning_rate": 1.7533300691892873e-05, "loss": 0.0453, "step": 6557 }, { "epoch": 2.983621474067334, "grad_norm": 0.4146959494389726, "learning_rate": 1.7526480703151117e-05, "loss": 0.0494, "step": 6558 }, { "epoch": 2.984076433121019, "grad_norm": 0.3735203494935569, "learning_rate": 1.7519661325111602e-05, "loss": 0.0408, "step": 6559 }, { "epoch": 2.9845313921747043, "grad_norm": 0.4698302646709769, "learning_rate": 1.7512842558331587e-05, "loss": 0.0582, "step": 6560 }, { "epoch": 2.9849863512283896, "grad_norm": 0.5399323859858671, "learning_rate": 1.7506024403368264e-05, "loss": 0.0569, "step": 6561 }, { "epoch": 2.9854413102820745, "grad_norm": 0.5969732600456661, "learning_rate": 1.7499206860778784e-05, "loss": 0.0305, "step": 6562 }, { "epoch": 2.98589626933576, "grad_norm": 0.4227579415344859, "learning_rate": 1.749238993112024e-05, "loss": 0.033, "step": 6563 }, { "epoch": 2.986351228389445, "grad_norm": 0.4575279292308002, "learning_rate": 1.748557361494969e-05, "loss": 0.0587, "step": 6564 }, { "epoch": 2.98680618744313, "grad_norm": 0.5946647614891626, "learning_rate": 1.7478757912824133e-05, "loss": 0.0201, "step": 6565 }, { "epoch": 2.9872611464968153, "grad_norm": 0.3735455486353604, "learning_rate": 1.7471942825300514e-05, "loss": 0.0268, "step": 6566 }, { "epoch": 2.9877161055505006, "grad_norm": 0.41896115422875674, "learning_rate": 1.7465128352935732e-05, "loss": 0.0348, "step": 6567 }, { "epoch": 2.9881710646041855, "grad_norm": 0.4842900517859916, "learning_rate": 1.7458314496286633e-05, "loss": 0.0416, "step": 6568 }, { "epoch": 2.988626023657871, "grad_norm": 1.2639293256461306, "learning_rate": 1.7451501255910012e-05, "loss": 0.0377, "step": 6569 }, { "epoch": 2.989080982711556, "grad_norm": 0.33548738167592146, "learning_rate": 1.7444688632362614e-05, "loss": 0.0245, "step": 6570 }, { "epoch": 2.989535941765241, "grad_norm": 0.43517852691836983, "learning_rate": 1.743787662620115e-05, "loss": 0.0503, "step": 6571 }, { "epoch": 2.9899909008189263, "grad_norm": 0.5446260257236959, "learning_rate": 1.743106523798226e-05, "loss": 0.0605, "step": 6572 }, { "epoch": 2.9904458598726116, "grad_norm": 0.45202714920770154, "learning_rate": 1.742425446826253e-05, "loss": 0.057, "step": 6573 }, { "epoch": 2.9909008189262964, "grad_norm": 0.42548345393920395, "learning_rate": 1.741744431759852e-05, "loss": 0.0577, "step": 6574 }, { "epoch": 2.9913557779799818, "grad_norm": 0.5218551250419212, "learning_rate": 1.741063478654672e-05, "loss": 0.0823, "step": 6575 }, { "epoch": 2.991810737033667, "grad_norm": 0.531571345933043, "learning_rate": 1.7403825875663565e-05, "loss": 0.0741, "step": 6576 }, { "epoch": 2.992265696087352, "grad_norm": 0.4841638432352804, "learning_rate": 1.7397017585505453e-05, "loss": 0.0403, "step": 6577 }, { "epoch": 2.9927206551410372, "grad_norm": 0.7296893633155478, "learning_rate": 1.7390209916628735e-05, "loss": 0.1043, "step": 6578 }, { "epoch": 2.9931756141947226, "grad_norm": 0.42734070781198313, "learning_rate": 1.7383402869589695e-05, "loss": 0.0765, "step": 6579 }, { "epoch": 2.9936305732484074, "grad_norm": 0.5599105216136888, "learning_rate": 1.7376596444944582e-05, "loss": 0.0555, "step": 6580 }, { "epoch": 2.9940855323020927, "grad_norm": 0.40611228417112843, "learning_rate": 1.7369790643249573e-05, "loss": 0.0261, "step": 6581 }, { "epoch": 2.994540491355778, "grad_norm": 0.4673999196292963, "learning_rate": 1.7362985465060823e-05, "loss": 0.0342, "step": 6582 }, { "epoch": 2.994995450409463, "grad_norm": 0.5982884650435178, "learning_rate": 1.7356180910934404e-05, "loss": 0.0311, "step": 6583 }, { "epoch": 2.9954504094631482, "grad_norm": 0.4545636469686473, "learning_rate": 1.7349376981426357e-05, "loss": 0.0457, "step": 6584 }, { "epoch": 2.9959053685168335, "grad_norm": 0.5688995855114375, "learning_rate": 1.734257367709268e-05, "loss": 0.0251, "step": 6585 }, { "epoch": 2.9963603275705184, "grad_norm": 0.6039000468429917, "learning_rate": 1.7335770998489305e-05, "loss": 0.0546, "step": 6586 }, { "epoch": 2.9968152866242037, "grad_norm": 0.3521714686769946, "learning_rate": 1.7328968946172115e-05, "loss": 0.0334, "step": 6587 }, { "epoch": 2.997270245677889, "grad_norm": 0.43650976024594124, "learning_rate": 1.7322167520696934e-05, "loss": 0.0445, "step": 6588 }, { "epoch": 2.997725204731574, "grad_norm": 0.5518357848846434, "learning_rate": 1.7315366722619553e-05, "loss": 0.0619, "step": 6589 }, { "epoch": 2.998180163785259, "grad_norm": 0.4737700109878584, "learning_rate": 1.7308566552495698e-05, "loss": 0.0372, "step": 6590 }, { "epoch": 2.9986351228389445, "grad_norm": 0.3714281850363674, "learning_rate": 1.7301767010881044e-05, "loss": 0.023, "step": 6591 }, { "epoch": 2.99909008189263, "grad_norm": 0.603539517226536, "learning_rate": 1.7294968098331237e-05, "loss": 0.0408, "step": 6592 }, { "epoch": 2.9995450409463147, "grad_norm": 0.8450786219644911, "learning_rate": 1.7288169815401833e-05, "loss": 0.051, "step": 6593 }, { "epoch": 3.0, "grad_norm": 0.4338767866000819, "learning_rate": 1.7281372162648375e-05, "loss": 0.0401, "step": 6594 }, { "epoch": 3.0004549590536853, "grad_norm": 0.18231984742176618, "learning_rate": 1.7274575140626318e-05, "loss": 0.0069, "step": 6595 }, { "epoch": 3.00090991810737, "grad_norm": 0.24131129850071523, "learning_rate": 1.7267778749891096e-05, "loss": 0.0119, "step": 6596 }, { "epoch": 3.0013648771610555, "grad_norm": 0.17451860610892722, "learning_rate": 1.7260982990998076e-05, "loss": 0.0102, "step": 6597 }, { "epoch": 3.001819836214741, "grad_norm": 0.24506465856523257, "learning_rate": 1.7254187864502567e-05, "loss": 0.0186, "step": 6598 }, { "epoch": 3.0022747952684257, "grad_norm": 0.2946557879363898, "learning_rate": 1.724739337095986e-05, "loss": 0.0197, "step": 6599 }, { "epoch": 3.002729754322111, "grad_norm": 0.447212243530715, "learning_rate": 1.724059951092515e-05, "loss": 0.0236, "step": 6600 }, { "epoch": 3.0031847133757963, "grad_norm": 0.35674633037323766, "learning_rate": 1.7233806284953614e-05, "loss": 0.0267, "step": 6601 }, { "epoch": 3.003639672429481, "grad_norm": 0.2620464716903149, "learning_rate": 1.7227013693600347e-05, "loss": 0.0186, "step": 6602 }, { "epoch": 3.0040946314831665, "grad_norm": 0.24766265291241216, "learning_rate": 1.722022173742043e-05, "loss": 0.011, "step": 6603 }, { "epoch": 3.0045495905368518, "grad_norm": 0.23255981926205008, "learning_rate": 1.7213430416968847e-05, "loss": 0.0116, "step": 6604 }, { "epoch": 3.0050045495905366, "grad_norm": 0.15992566812973472, "learning_rate": 1.720663973280057e-05, "loss": 0.0178, "step": 6605 }, { "epoch": 3.005459508644222, "grad_norm": 0.20215781926289925, "learning_rate": 1.7199849685470498e-05, "loss": 0.01, "step": 6606 }, { "epoch": 3.0059144676979073, "grad_norm": 0.27085076305729455, "learning_rate": 1.719306027553349e-05, "loss": 0.0136, "step": 6607 }, { "epoch": 3.0063694267515926, "grad_norm": 0.1841916024121814, "learning_rate": 1.718627150354434e-05, "loss": 0.0102, "step": 6608 }, { "epoch": 3.0068243858052774, "grad_norm": 0.13989903549122493, "learning_rate": 1.7179483370057796e-05, "loss": 0.0079, "step": 6609 }, { "epoch": 3.0072793448589628, "grad_norm": 0.4944929581175028, "learning_rate": 1.717269587562855e-05, "loss": 0.0294, "step": 6610 }, { "epoch": 3.007734303912648, "grad_norm": 0.3880966870694317, "learning_rate": 1.7165909020811256e-05, "loss": 0.0303, "step": 6611 }, { "epoch": 3.008189262966333, "grad_norm": 0.2591048766217711, "learning_rate": 1.7159122806160487e-05, "loss": 0.0176, "step": 6612 }, { "epoch": 3.0086442220200182, "grad_norm": 0.29170371032548936, "learning_rate": 1.7152337232230796e-05, "loss": 0.0175, "step": 6613 }, { "epoch": 3.0090991810737036, "grad_norm": 0.4045516544003558, "learning_rate": 1.7145552299576677e-05, "loss": 0.0214, "step": 6614 }, { "epoch": 3.0095541401273884, "grad_norm": 0.4049697035872757, "learning_rate": 1.7138768008752546e-05, "loss": 0.0216, "step": 6615 }, { "epoch": 3.0100090991810737, "grad_norm": 0.22076778715417567, "learning_rate": 1.7131984360312797e-05, "loss": 0.0068, "step": 6616 }, { "epoch": 3.010464058234759, "grad_norm": 0.3867467279792092, "learning_rate": 1.7125201354811747e-05, "loss": 0.0129, "step": 6617 }, { "epoch": 3.010919017288444, "grad_norm": 0.29500047259109663, "learning_rate": 1.711841899280369e-05, "loss": 0.0153, "step": 6618 }, { "epoch": 3.011373976342129, "grad_norm": 0.2750672488417733, "learning_rate": 1.7111637274842827e-05, "loss": 0.0183, "step": 6619 }, { "epoch": 3.0118289353958145, "grad_norm": 0.23409499967610034, "learning_rate": 1.7104856201483347e-05, "loss": 0.0097, "step": 6620 }, { "epoch": 3.0122838944494994, "grad_norm": 0.3927964080166131, "learning_rate": 1.709807577327937e-05, "loss": 0.0245, "step": 6621 }, { "epoch": 3.0127388535031847, "grad_norm": 0.28541725723786515, "learning_rate": 1.709129599078495e-05, "loss": 0.0155, "step": 6622 }, { "epoch": 3.01319381255687, "grad_norm": 0.41114001853874094, "learning_rate": 1.708451685455411e-05, "loss": 0.0186, "step": 6623 }, { "epoch": 3.013648771610555, "grad_norm": 0.43710871021189623, "learning_rate": 1.7077738365140804e-05, "loss": 0.0165, "step": 6624 }, { "epoch": 3.01410373066424, "grad_norm": 0.38784903715115804, "learning_rate": 1.707096052309895e-05, "loss": 0.0237, "step": 6625 }, { "epoch": 3.0145586897179255, "grad_norm": 0.4923508877395764, "learning_rate": 1.706418332898238e-05, "loss": 0.0337, "step": 6626 }, { "epoch": 3.0150136487716104, "grad_norm": 0.2078989312089293, "learning_rate": 1.7057406783344916e-05, "loss": 0.0089, "step": 6627 }, { "epoch": 3.0154686078252957, "grad_norm": 0.2612104902676017, "learning_rate": 1.705063088674031e-05, "loss": 0.0077, "step": 6628 }, { "epoch": 3.015923566878981, "grad_norm": 0.6334035741614387, "learning_rate": 1.7043855639722238e-05, "loss": 0.0166, "step": 6629 }, { "epoch": 3.016378525932666, "grad_norm": 0.38503135169054276, "learning_rate": 1.7037081042844368e-05, "loss": 0.0142, "step": 6630 }, { "epoch": 3.016833484986351, "grad_norm": 0.5415669879216404, "learning_rate": 1.703030709666026e-05, "loss": 0.0109, "step": 6631 }, { "epoch": 3.0172884440400365, "grad_norm": 0.4429099414130978, "learning_rate": 1.7023533801723475e-05, "loss": 0.0299, "step": 6632 }, { "epoch": 3.0177434030937214, "grad_norm": 0.4506223007421393, "learning_rate": 1.7016761158587474e-05, "loss": 0.0195, "step": 6633 }, { "epoch": 3.0181983621474067, "grad_norm": 0.36805473311319215, "learning_rate": 1.7009989167805705e-05, "loss": 0.0163, "step": 6634 }, { "epoch": 3.018653321201092, "grad_norm": 0.35028326884500033, "learning_rate": 1.7003217829931545e-05, "loss": 0.0093, "step": 6635 }, { "epoch": 3.0191082802547773, "grad_norm": 0.29537533747527095, "learning_rate": 1.6996447145518306e-05, "loss": 0.02, "step": 6636 }, { "epoch": 3.019563239308462, "grad_norm": 0.2904456371250762, "learning_rate": 1.6989677115119267e-05, "loss": 0.0171, "step": 6637 }, { "epoch": 3.0200181983621475, "grad_norm": 1.2157454981995983, "learning_rate": 1.6982907739287634e-05, "loss": 0.0342, "step": 6638 }, { "epoch": 3.0204731574158328, "grad_norm": 0.2673747237854927, "learning_rate": 1.6976139018576583e-05, "loss": 0.0105, "step": 6639 }, { "epoch": 3.0209281164695176, "grad_norm": 0.5095323187332772, "learning_rate": 1.69693709535392e-05, "loss": 0.029, "step": 6640 }, { "epoch": 3.021383075523203, "grad_norm": 0.27470894793929945, "learning_rate": 1.6962603544728566e-05, "loss": 0.0139, "step": 6641 }, { "epoch": 3.0218380345768883, "grad_norm": 0.3005779073807014, "learning_rate": 1.695583679269768e-05, "loss": 0.0107, "step": 6642 }, { "epoch": 3.022292993630573, "grad_norm": 0.3077290545560617, "learning_rate": 1.6949070697999477e-05, "loss": 0.0137, "step": 6643 }, { "epoch": 3.0227479526842584, "grad_norm": 0.39171716959328523, "learning_rate": 1.6942305261186864e-05, "loss": 0.0397, "step": 6644 }, { "epoch": 3.0232029117379438, "grad_norm": 0.28256120449202776, "learning_rate": 1.693554048281268e-05, "loss": 0.0087, "step": 6645 }, { "epoch": 3.0236578707916286, "grad_norm": 0.2530325113916612, "learning_rate": 1.69287763634297e-05, "loss": 0.0066, "step": 6646 }, { "epoch": 3.024112829845314, "grad_norm": 0.4212771682725313, "learning_rate": 1.6922012903590663e-05, "loss": 0.0284, "step": 6647 }, { "epoch": 3.0245677888989992, "grad_norm": 0.5029805147264269, "learning_rate": 1.691525010384826e-05, "loss": 0.0142, "step": 6648 }, { "epoch": 3.025022747952684, "grad_norm": 0.39725706882869544, "learning_rate": 1.6908487964755104e-05, "loss": 0.0099, "step": 6649 }, { "epoch": 3.0254777070063694, "grad_norm": 0.7849536369642153, "learning_rate": 1.690172648686378e-05, "loss": 0.0347, "step": 6650 }, { "epoch": 3.0259326660600547, "grad_norm": 0.30284399552718627, "learning_rate": 1.689496567072678e-05, "loss": 0.0151, "step": 6651 }, { "epoch": 3.0263876251137396, "grad_norm": 0.21988300981974973, "learning_rate": 1.68882055168966e-05, "loss": 0.0056, "step": 6652 }, { "epoch": 3.026842584167425, "grad_norm": 0.33797127071127897, "learning_rate": 1.6881446025925625e-05, "loss": 0.0154, "step": 6653 }, { "epoch": 3.02729754322111, "grad_norm": 0.2657864970805723, "learning_rate": 1.6874687198366206e-05, "loss": 0.0088, "step": 6654 }, { "epoch": 3.027752502274795, "grad_norm": 0.1775590297312961, "learning_rate": 1.6867929034770673e-05, "loss": 0.0153, "step": 6655 }, { "epoch": 3.0282074613284804, "grad_norm": 0.25334089059240683, "learning_rate": 1.6861171535691243e-05, "loss": 0.0087, "step": 6656 }, { "epoch": 3.0286624203821657, "grad_norm": 0.2949648898732741, "learning_rate": 1.6854414701680133e-05, "loss": 0.0121, "step": 6657 }, { "epoch": 3.0291173794358506, "grad_norm": 0.3969323761665509, "learning_rate": 1.684765853328946e-05, "loss": 0.0256, "step": 6658 }, { "epoch": 3.029572338489536, "grad_norm": 0.44891995650623456, "learning_rate": 1.684090303107132e-05, "loss": 0.0188, "step": 6659 }, { "epoch": 3.030027297543221, "grad_norm": 0.3503369419275542, "learning_rate": 1.6834148195577736e-05, "loss": 0.0144, "step": 6660 }, { "epoch": 3.030482256596906, "grad_norm": 0.49225954679505907, "learning_rate": 1.6827394027360676e-05, "loss": 0.0313, "step": 6661 }, { "epoch": 3.0309372156505914, "grad_norm": 0.4489031247920474, "learning_rate": 1.682064052697208e-05, "loss": 0.0177, "step": 6662 }, { "epoch": 3.0313921747042767, "grad_norm": 0.2955480625658013, "learning_rate": 1.68138876949638e-05, "loss": 0.0148, "step": 6663 }, { "epoch": 3.031847133757962, "grad_norm": 0.5250089035645503, "learning_rate": 1.6807135531887654e-05, "loss": 0.0229, "step": 6664 }, { "epoch": 3.032302092811647, "grad_norm": 0.39012009240361434, "learning_rate": 1.6800384038295385e-05, "loss": 0.0138, "step": 6665 }, { "epoch": 3.032757051865332, "grad_norm": 0.3017685481591064, "learning_rate": 1.6793633214738714e-05, "loss": 0.0142, "step": 6666 }, { "epoch": 3.0332120109190175, "grad_norm": 0.30342860733828564, "learning_rate": 1.6786883061769268e-05, "loss": 0.0067, "step": 6667 }, { "epoch": 3.0336669699727024, "grad_norm": 0.3146054858059456, "learning_rate": 1.6780133579938643e-05, "loss": 0.0255, "step": 6668 }, { "epoch": 3.0341219290263877, "grad_norm": 0.21310040850813775, "learning_rate": 1.6773384769798393e-05, "loss": 0.0086, "step": 6669 }, { "epoch": 3.034576888080073, "grad_norm": 0.40806074551717036, "learning_rate": 1.6766636631899984e-05, "loss": 0.0265, "step": 6670 }, { "epoch": 3.035031847133758, "grad_norm": 0.3549594666616254, "learning_rate": 1.675988916679485e-05, "loss": 0.0197, "step": 6671 }, { "epoch": 3.035486806187443, "grad_norm": 0.445211950719475, "learning_rate": 1.675314237503436e-05, "loss": 0.036, "step": 6672 }, { "epoch": 3.0359417652411285, "grad_norm": 0.14669551216220678, "learning_rate": 1.6746396257169837e-05, "loss": 0.0054, "step": 6673 }, { "epoch": 3.0363967242948133, "grad_norm": 0.6339305627875662, "learning_rate": 1.6739650813752526e-05, "loss": 0.0147, "step": 6674 }, { "epoch": 3.0368516833484986, "grad_norm": 0.32831208423885355, "learning_rate": 1.673290604533365e-05, "loss": 0.022, "step": 6675 }, { "epoch": 3.037306642402184, "grad_norm": 0.2881023436429194, "learning_rate": 1.672616195246437e-05, "loss": 0.0065, "step": 6676 }, { "epoch": 3.037761601455869, "grad_norm": 0.45374395992997035, "learning_rate": 1.6719418535695763e-05, "loss": 0.0167, "step": 6677 }, { "epoch": 3.038216560509554, "grad_norm": 0.29840193388103764, "learning_rate": 1.6712675795578882e-05, "loss": 0.0284, "step": 6678 }, { "epoch": 3.0386715195632394, "grad_norm": 0.5046719652326068, "learning_rate": 1.6705933732664708e-05, "loss": 0.0133, "step": 6679 }, { "epoch": 3.0391264786169243, "grad_norm": 0.28177796504885205, "learning_rate": 1.6699192347504176e-05, "loss": 0.0118, "step": 6680 }, { "epoch": 3.0395814376706096, "grad_norm": 0.3413649141239145, "learning_rate": 1.6692451640648148e-05, "loss": 0.0194, "step": 6681 }, { "epoch": 3.040036396724295, "grad_norm": 0.35658258324266373, "learning_rate": 1.6685711612647464e-05, "loss": 0.0157, "step": 6682 }, { "epoch": 3.04049135577798, "grad_norm": 0.33832367194705765, "learning_rate": 1.6678972264052883e-05, "loss": 0.0156, "step": 6683 }, { "epoch": 3.040946314831665, "grad_norm": 0.39075153805386287, "learning_rate": 1.6672233595415108e-05, "loss": 0.0199, "step": 6684 }, { "epoch": 3.0414012738853504, "grad_norm": 0.2239998478820597, "learning_rate": 1.66654956072848e-05, "loss": 0.0135, "step": 6685 }, { "epoch": 3.0418562329390353, "grad_norm": 0.2680523901498396, "learning_rate": 1.665875830021255e-05, "loss": 0.0197, "step": 6686 }, { "epoch": 3.0423111919927206, "grad_norm": 0.16029177365701583, "learning_rate": 1.66520216747489e-05, "loss": 0.006, "step": 6687 }, { "epoch": 3.042766151046406, "grad_norm": 0.21992279839133946, "learning_rate": 1.664528573144433e-05, "loss": 0.012, "step": 6688 }, { "epoch": 3.0432211101000908, "grad_norm": 0.33148731333902126, "learning_rate": 1.6638550470849298e-05, "loss": 0.0239, "step": 6689 }, { "epoch": 3.043676069153776, "grad_norm": 0.27000700666446953, "learning_rate": 1.663181589351415e-05, "loss": 0.0124, "step": 6690 }, { "epoch": 3.0441310282074614, "grad_norm": 0.381338293532307, "learning_rate": 1.6625081999989228e-05, "loss": 0.014, "step": 6691 }, { "epoch": 3.0445859872611467, "grad_norm": 0.30651235832157003, "learning_rate": 1.6618348790824777e-05, "loss": 0.0164, "step": 6692 }, { "epoch": 3.0450409463148316, "grad_norm": 0.33161824517165095, "learning_rate": 1.661161626657102e-05, "loss": 0.0342, "step": 6693 }, { "epoch": 3.045495905368517, "grad_norm": 0.34694079563657165, "learning_rate": 1.660488442777809e-05, "loss": 0.0109, "step": 6694 }, { "epoch": 3.045950864422202, "grad_norm": 0.21811910501865567, "learning_rate": 1.6598153274996088e-05, "loss": 0.0139, "step": 6695 }, { "epoch": 3.046405823475887, "grad_norm": 0.3459243759386974, "learning_rate": 1.6591422808775066e-05, "loss": 0.014, "step": 6696 }, { "epoch": 3.0468607825295724, "grad_norm": 0.3928403082288292, "learning_rate": 1.6584693029665e-05, "loss": 0.0253, "step": 6697 }, { "epoch": 3.0473157415832577, "grad_norm": 0.2435024252866398, "learning_rate": 1.657796393821582e-05, "loss": 0.0245, "step": 6698 }, { "epoch": 3.0477707006369426, "grad_norm": 0.34229189943536, "learning_rate": 1.6571235534977384e-05, "loss": 0.0127, "step": 6699 }, { "epoch": 3.048225659690628, "grad_norm": 0.31337476186781776, "learning_rate": 1.6564507820499525e-05, "loss": 0.0143, "step": 6700 }, { "epoch": 3.048680618744313, "grad_norm": 0.2296126048159018, "learning_rate": 1.6557780795331985e-05, "loss": 0.0093, "step": 6701 }, { "epoch": 3.049135577797998, "grad_norm": 0.30594838743010694, "learning_rate": 1.6551054460024467e-05, "loss": 0.021, "step": 6702 }, { "epoch": 3.0495905368516834, "grad_norm": 0.3384908025378404, "learning_rate": 1.654432881512664e-05, "loss": 0.0113, "step": 6703 }, { "epoch": 3.0500454959053687, "grad_norm": 0.25147653939372566, "learning_rate": 1.6537603861188068e-05, "loss": 0.0121, "step": 6704 }, { "epoch": 3.0505004549590535, "grad_norm": 0.2639483957456195, "learning_rate": 1.6530879598758297e-05, "loss": 0.0139, "step": 6705 }, { "epoch": 3.050955414012739, "grad_norm": 0.3971346034345404, "learning_rate": 1.6524156028386794e-05, "loss": 0.0233, "step": 6706 }, { "epoch": 3.051410373066424, "grad_norm": 0.47031853749647695, "learning_rate": 1.651743315062299e-05, "loss": 0.021, "step": 6707 }, { "epoch": 3.051865332120109, "grad_norm": 0.29134836858237706, "learning_rate": 1.6510710966016236e-05, "loss": 0.017, "step": 6708 }, { "epoch": 3.0523202911737943, "grad_norm": 0.30466887649014507, "learning_rate": 1.650398947511584e-05, "loss": 0.016, "step": 6709 }, { "epoch": 3.0527752502274796, "grad_norm": 0.3144296964260341, "learning_rate": 1.6497268678471068e-05, "loss": 0.008, "step": 6710 }, { "epoch": 3.0532302092811645, "grad_norm": 0.21136682851600844, "learning_rate": 1.6490548576631095e-05, "loss": 0.0104, "step": 6711 }, { "epoch": 3.05368516833485, "grad_norm": 0.3329362631700118, "learning_rate": 1.6483829170145072e-05, "loss": 0.0149, "step": 6712 }, { "epoch": 3.054140127388535, "grad_norm": 0.33453278522437574, "learning_rate": 1.647711045956206e-05, "loss": 0.0175, "step": 6713 }, { "epoch": 3.05459508644222, "grad_norm": 0.15642580746078813, "learning_rate": 1.64703924454311e-05, "loss": 0.0036, "step": 6714 }, { "epoch": 3.0550500454959053, "grad_norm": 0.12485733670344981, "learning_rate": 1.6463675128301146e-05, "loss": 0.0041, "step": 6715 }, { "epoch": 3.0555050045495906, "grad_norm": 0.2918659550823297, "learning_rate": 1.6456958508721104e-05, "loss": 0.0101, "step": 6716 }, { "epoch": 3.055959963603276, "grad_norm": 0.4626491599249287, "learning_rate": 1.6450242587239843e-05, "loss": 0.0397, "step": 6717 }, { "epoch": 3.056414922656961, "grad_norm": 0.38817479097768465, "learning_rate": 1.644352736440614e-05, "loss": 0.0222, "step": 6718 }, { "epoch": 3.056869881710646, "grad_norm": 0.26116082730667256, "learning_rate": 1.643681284076875e-05, "loss": 0.0108, "step": 6719 }, { "epoch": 3.0573248407643314, "grad_norm": 0.42609775487173884, "learning_rate": 1.6430099016876343e-05, "loss": 0.0188, "step": 6720 }, { "epoch": 3.0577797998180163, "grad_norm": 0.28382554330084075, "learning_rate": 1.6423385893277536e-05, "loss": 0.0138, "step": 6721 }, { "epoch": 3.0582347588717016, "grad_norm": 0.3535186793137884, "learning_rate": 1.641667347052091e-05, "loss": 0.0172, "step": 6722 }, { "epoch": 3.058689717925387, "grad_norm": 0.5026431157229347, "learning_rate": 1.640996174915495e-05, "loss": 0.017, "step": 6723 }, { "epoch": 3.0591446769790718, "grad_norm": 0.2903858031012856, "learning_rate": 1.6403250729728133e-05, "loss": 0.0145, "step": 6724 }, { "epoch": 3.059599636032757, "grad_norm": 0.2577528446326475, "learning_rate": 1.639654041278885e-05, "loss": 0.0096, "step": 6725 }, { "epoch": 3.0600545950864424, "grad_norm": 0.33582535124768853, "learning_rate": 1.6389830798885425e-05, "loss": 0.0146, "step": 6726 }, { "epoch": 3.0605095541401273, "grad_norm": 0.16876198902896183, "learning_rate": 1.638312188856615e-05, "loss": 0.0045, "step": 6727 }, { "epoch": 3.0609645131938126, "grad_norm": 0.2411950192048593, "learning_rate": 1.6376413682379233e-05, "loss": 0.007, "step": 6728 }, { "epoch": 3.061419472247498, "grad_norm": 0.5716264475613381, "learning_rate": 1.636970618087285e-05, "loss": 0.0216, "step": 6729 }, { "epoch": 3.0618744313011828, "grad_norm": 0.24537632475435575, "learning_rate": 1.63629993845951e-05, "loss": 0.0096, "step": 6730 }, { "epoch": 3.062329390354868, "grad_norm": 0.37849432486087514, "learning_rate": 1.6356293294094036e-05, "loss": 0.009, "step": 6731 }, { "epoch": 3.0627843494085534, "grad_norm": 0.3130448339847525, "learning_rate": 1.6349587909917654e-05, "loss": 0.0119, "step": 6732 }, { "epoch": 3.0632393084622382, "grad_norm": 0.30734819006033137, "learning_rate": 1.634288323261388e-05, "loss": 0.0212, "step": 6733 }, { "epoch": 3.0636942675159236, "grad_norm": 0.19294187322374282, "learning_rate": 1.63361792627306e-05, "loss": 0.0067, "step": 6734 }, { "epoch": 3.064149226569609, "grad_norm": 0.4102813293281607, "learning_rate": 1.6329476000815616e-05, "loss": 0.0265, "step": 6735 }, { "epoch": 3.0646041856232937, "grad_norm": 0.24699189336377128, "learning_rate": 1.6322773447416706e-05, "loss": 0.0103, "step": 6736 }, { "epoch": 3.065059144676979, "grad_norm": 0.3143893381268081, "learning_rate": 1.631607160308155e-05, "loss": 0.0179, "step": 6737 }, { "epoch": 3.0655141037306644, "grad_norm": 0.2732321257043185, "learning_rate": 1.6309370468357816e-05, "loss": 0.0112, "step": 6738 }, { "epoch": 3.065969062784349, "grad_norm": 0.3136186195252272, "learning_rate": 1.6302670043793084e-05, "loss": 0.0123, "step": 6739 }, { "epoch": 3.0664240218380345, "grad_norm": 0.397478547435094, "learning_rate": 1.6295970329934873e-05, "loss": 0.0162, "step": 6740 }, { "epoch": 3.06687898089172, "grad_norm": 0.4670518851671697, "learning_rate": 1.6289271327330662e-05, "loss": 0.016, "step": 6741 }, { "epoch": 3.0673339399454047, "grad_norm": 0.3336214174736772, "learning_rate": 1.628257303652786e-05, "loss": 0.0134, "step": 6742 }, { "epoch": 3.06778889899909, "grad_norm": 0.33532013465783117, "learning_rate": 1.6275875458073828e-05, "loss": 0.0257, "step": 6743 }, { "epoch": 3.0682438580527753, "grad_norm": 0.26213434804735564, "learning_rate": 1.6269178592515843e-05, "loss": 0.0142, "step": 6744 }, { "epoch": 3.06869881710646, "grad_norm": 0.3179284965216964, "learning_rate": 1.626248244040116e-05, "loss": 0.0145, "step": 6745 }, { "epoch": 3.0691537761601455, "grad_norm": 0.1729954530577749, "learning_rate": 1.6255787002276962e-05, "loss": 0.0044, "step": 6746 }, { "epoch": 3.069608735213831, "grad_norm": 0.4101803901091559, "learning_rate": 1.6249092278690355e-05, "loss": 0.0211, "step": 6747 }, { "epoch": 3.070063694267516, "grad_norm": 0.2111141770197323, "learning_rate": 1.624239827018841e-05, "loss": 0.0089, "step": 6748 }, { "epoch": 3.070518653321201, "grad_norm": 0.42573992841665564, "learning_rate": 1.623570497731813e-05, "loss": 0.0265, "step": 6749 }, { "epoch": 3.0709736123748863, "grad_norm": 0.26632688339301497, "learning_rate": 1.6229012400626458e-05, "loss": 0.0163, "step": 6750 }, { "epoch": 3.0714285714285716, "grad_norm": 0.25113622112711315, "learning_rate": 1.622232054066028e-05, "loss": 0.0124, "step": 6751 }, { "epoch": 3.0718835304822565, "grad_norm": 0.3741315450428866, "learning_rate": 1.621562939796643e-05, "loss": 0.0232, "step": 6752 }, { "epoch": 3.072338489535942, "grad_norm": 0.3082710680761942, "learning_rate": 1.620893897309168e-05, "loss": 0.0116, "step": 6753 }, { "epoch": 3.072793448589627, "grad_norm": 0.22969582670699812, "learning_rate": 1.620224926658274e-05, "loss": 0.0087, "step": 6754 }, { "epoch": 3.073248407643312, "grad_norm": 0.5285724952459375, "learning_rate": 1.619556027898625e-05, "loss": 0.0311, "step": 6755 }, { "epoch": 3.0737033666969973, "grad_norm": 0.17880513718903235, "learning_rate": 1.6188872010848822e-05, "loss": 0.0058, "step": 6756 }, { "epoch": 3.0741583257506826, "grad_norm": 0.2869872414030073, "learning_rate": 1.6182184462716977e-05, "loss": 0.0214, "step": 6757 }, { "epoch": 3.0746132848043675, "grad_norm": 0.3747153402925334, "learning_rate": 1.617549763513719e-05, "loss": 0.0282, "step": 6758 }, { "epoch": 3.0750682438580528, "grad_norm": 0.28386179983805365, "learning_rate": 1.61688115286559e-05, "loss": 0.0099, "step": 6759 }, { "epoch": 3.075523202911738, "grad_norm": 0.04192355772303141, "learning_rate": 1.616212614381944e-05, "loss": 0.0011, "step": 6760 }, { "epoch": 3.075978161965423, "grad_norm": 0.2276521103513163, "learning_rate": 1.615544148117413e-05, "loss": 0.0064, "step": 6761 }, { "epoch": 3.0764331210191083, "grad_norm": 0.3168657981614861, "learning_rate": 1.6148757541266192e-05, "loss": 0.0146, "step": 6762 }, { "epoch": 3.0768880800727936, "grad_norm": 2.6522119410246554, "learning_rate": 1.614207432464182e-05, "loss": 0.0104, "step": 6763 }, { "epoch": 3.0773430391264784, "grad_norm": 0.2481291220236229, "learning_rate": 1.6135391831847127e-05, "loss": 0.0121, "step": 6764 }, { "epoch": 3.0777979981801638, "grad_norm": 0.29303319681548934, "learning_rate": 1.612871006342818e-05, "loss": 0.0151, "step": 6765 }, { "epoch": 3.078252957233849, "grad_norm": 0.2753911632892928, "learning_rate": 1.612202901993099e-05, "loss": 0.0171, "step": 6766 }, { "epoch": 3.078707916287534, "grad_norm": 0.43429899353472495, "learning_rate": 1.6115348701901495e-05, "loss": 0.0235, "step": 6767 }, { "epoch": 3.0791628753412192, "grad_norm": 0.30861727120834326, "learning_rate": 1.6108669109885583e-05, "loss": 0.0152, "step": 6768 }, { "epoch": 3.0796178343949046, "grad_norm": 0.4116960598053695, "learning_rate": 1.6101990244429075e-05, "loss": 0.0255, "step": 6769 }, { "epoch": 3.0800727934485894, "grad_norm": 0.18999507625753775, "learning_rate": 1.609531210607775e-05, "loss": 0.0058, "step": 6770 }, { "epoch": 3.0805277525022747, "grad_norm": 0.3391255026923223, "learning_rate": 1.6088634695377295e-05, "loss": 0.0183, "step": 6771 }, { "epoch": 3.08098271155596, "grad_norm": 0.40500359227222815, "learning_rate": 1.6081958012873367e-05, "loss": 0.0128, "step": 6772 }, { "epoch": 3.0814376706096454, "grad_norm": 0.2906661596990021, "learning_rate": 1.6075282059111566e-05, "loss": 0.0124, "step": 6773 }, { "epoch": 3.08189262966333, "grad_norm": 0.40972620109677466, "learning_rate": 1.6068606834637406e-05, "loss": 0.0252, "step": 6774 }, { "epoch": 3.0823475887170155, "grad_norm": 0.33198465324264315, "learning_rate": 1.6061932339996366e-05, "loss": 0.0044, "step": 6775 }, { "epoch": 3.082802547770701, "grad_norm": 0.3903860029797379, "learning_rate": 1.605525857573385e-05, "loss": 0.012, "step": 6776 }, { "epoch": 3.0832575068243857, "grad_norm": 0.23429469131104347, "learning_rate": 1.604858554239521e-05, "loss": 0.0102, "step": 6777 }, { "epoch": 3.083712465878071, "grad_norm": 0.413175851911749, "learning_rate": 1.6041913240525734e-05, "loss": 0.0284, "step": 6778 }, { "epoch": 3.0841674249317563, "grad_norm": 0.2556612652811136, "learning_rate": 1.603524167067065e-05, "loss": 0.0168, "step": 6779 }, { "epoch": 3.084622383985441, "grad_norm": 0.30427369988176717, "learning_rate": 1.6028570833375133e-05, "loss": 0.0118, "step": 6780 }, { "epoch": 3.0850773430391265, "grad_norm": 0.24507401400970344, "learning_rate": 1.6021900729184298e-05, "loss": 0.0078, "step": 6781 }, { "epoch": 3.085532302092812, "grad_norm": 0.2276305911776354, "learning_rate": 1.601523135864319e-05, "loss": 0.013, "step": 6782 }, { "epoch": 3.0859872611464967, "grad_norm": 0.35381343228537987, "learning_rate": 1.6008562722296796e-05, "loss": 0.0171, "step": 6783 }, { "epoch": 3.086442220200182, "grad_norm": 0.3484873892213921, "learning_rate": 1.6001894820690057e-05, "loss": 0.0208, "step": 6784 }, { "epoch": 3.0868971792538673, "grad_norm": 0.7095263098377792, "learning_rate": 1.5995227654367832e-05, "loss": 0.0126, "step": 6785 }, { "epoch": 3.087352138307552, "grad_norm": 0.47787483997636004, "learning_rate": 1.5988561223874936e-05, "loss": 0.0271, "step": 6786 }, { "epoch": 3.0878070973612375, "grad_norm": 0.3334711916785193, "learning_rate": 1.598189552975613e-05, "loss": 0.0252, "step": 6787 }, { "epoch": 3.088262056414923, "grad_norm": 0.3123137856324724, "learning_rate": 1.5975230572556093e-05, "loss": 0.0232, "step": 6788 }, { "epoch": 3.0887170154686077, "grad_norm": 0.5987794829454671, "learning_rate": 1.596856635281946e-05, "loss": 0.0267, "step": 6789 }, { "epoch": 3.089171974522293, "grad_norm": 0.2871616253587075, "learning_rate": 1.59619028710908e-05, "loss": 0.0147, "step": 6790 }, { "epoch": 3.0896269335759783, "grad_norm": 0.2915149107106514, "learning_rate": 1.5955240127914618e-05, "loss": 0.0171, "step": 6791 }, { "epoch": 3.090081892629663, "grad_norm": 0.4090851890213226, "learning_rate": 1.594857812383537e-05, "loss": 0.0185, "step": 6792 }, { "epoch": 3.0905368516833485, "grad_norm": 0.38251829309232693, "learning_rate": 1.5941916859397434e-05, "loss": 0.0214, "step": 6793 }, { "epoch": 3.0909918107370338, "grad_norm": 0.36220636261564076, "learning_rate": 1.593525633514515e-05, "loss": 0.0246, "step": 6794 }, { "epoch": 3.0914467697907186, "grad_norm": 0.3085933050069159, "learning_rate": 1.5928596551622785e-05, "loss": 0.0227, "step": 6795 }, { "epoch": 3.091901728844404, "grad_norm": 0.3785479652248736, "learning_rate": 1.592193750937454e-05, "loss": 0.0148, "step": 6796 }, { "epoch": 3.0923566878980893, "grad_norm": 0.486960370076018, "learning_rate": 1.5915279208944572e-05, "loss": 0.0272, "step": 6797 }, { "epoch": 3.092811646951774, "grad_norm": 0.16979496066651348, "learning_rate": 1.5908621650876955e-05, "loss": 0.0071, "step": 6798 }, { "epoch": 3.0932666060054594, "grad_norm": 0.34763294917288, "learning_rate": 1.5901964835715726e-05, "loss": 0.0226, "step": 6799 }, { "epoch": 3.0937215650591448, "grad_norm": 0.5092846024614761, "learning_rate": 1.5895308764004835e-05, "loss": 0.0293, "step": 6800 }, { "epoch": 3.0941765241128296, "grad_norm": 0.29928606549356596, "learning_rate": 1.5888653436288196e-05, "loss": 0.0124, "step": 6801 }, { "epoch": 3.094631483166515, "grad_norm": 0.16364212313318102, "learning_rate": 1.5881998853109663e-05, "loss": 0.0054, "step": 6802 }, { "epoch": 3.0950864422202002, "grad_norm": 0.5096696544529873, "learning_rate": 1.5875345015013e-05, "loss": 0.0215, "step": 6803 }, { "epoch": 3.0955414012738856, "grad_norm": 0.22012915618761178, "learning_rate": 1.5868691922541938e-05, "loss": 0.0121, "step": 6804 }, { "epoch": 3.0959963603275704, "grad_norm": 0.23935831240124636, "learning_rate": 1.5862039576240134e-05, "loss": 0.0116, "step": 6805 }, { "epoch": 3.0964513193812557, "grad_norm": 0.26091030515242697, "learning_rate": 1.5855387976651192e-05, "loss": 0.0106, "step": 6806 }, { "epoch": 3.096906278434941, "grad_norm": 0.38742025305603883, "learning_rate": 1.5848737124318637e-05, "loss": 0.0316, "step": 6807 }, { "epoch": 3.097361237488626, "grad_norm": 0.35452198084186787, "learning_rate": 1.5842087019785964e-05, "loss": 0.0269, "step": 6808 }, { "epoch": 3.097816196542311, "grad_norm": 0.2923685680642452, "learning_rate": 1.5835437663596587e-05, "loss": 0.0187, "step": 6809 }, { "epoch": 3.0982711555959965, "grad_norm": 0.6407916612924038, "learning_rate": 1.5828789056293854e-05, "loss": 0.0297, "step": 6810 }, { "epoch": 3.0987261146496814, "grad_norm": 0.27141570225597617, "learning_rate": 1.5822141198421067e-05, "loss": 0.0115, "step": 6811 }, { "epoch": 3.0991810737033667, "grad_norm": 0.2326799037491342, "learning_rate": 1.581549409052145e-05, "loss": 0.0116, "step": 6812 }, { "epoch": 3.099636032757052, "grad_norm": 0.3033928783282417, "learning_rate": 1.5808847733138182e-05, "loss": 0.0084, "step": 6813 }, { "epoch": 3.100090991810737, "grad_norm": 0.376088701596592, "learning_rate": 1.5802202126814362e-05, "loss": 0.0406, "step": 6814 }, { "epoch": 3.100545950864422, "grad_norm": 0.3148727268502313, "learning_rate": 1.579555727209305e-05, "loss": 0.0214, "step": 6815 }, { "epoch": 3.1010009099181075, "grad_norm": 0.2607907233880985, "learning_rate": 1.578891316951724e-05, "loss": 0.0076, "step": 6816 }, { "epoch": 3.1014558689717924, "grad_norm": 0.2616225034160509, "learning_rate": 1.578226981962984e-05, "loss": 0.0097, "step": 6817 }, { "epoch": 3.1019108280254777, "grad_norm": 0.4064478431179085, "learning_rate": 1.577562722297373e-05, "loss": 0.0182, "step": 6818 }, { "epoch": 3.102365787079163, "grad_norm": 0.3293857281941794, "learning_rate": 1.5768985380091703e-05, "loss": 0.0188, "step": 6819 }, { "epoch": 3.102820746132848, "grad_norm": 0.4276159279196301, "learning_rate": 1.5762344291526508e-05, "loss": 0.0225, "step": 6820 }, { "epoch": 3.103275705186533, "grad_norm": 0.3863476428706979, "learning_rate": 1.5755703957820812e-05, "loss": 0.0126, "step": 6821 }, { "epoch": 3.1037306642402185, "grad_norm": 0.37978322125551633, "learning_rate": 1.5749064379517242e-05, "loss": 0.0332, "step": 6822 }, { "epoch": 3.1041856232939034, "grad_norm": 0.23211883932873353, "learning_rate": 1.574242555715836e-05, "loss": 0.0067, "step": 6823 }, { "epoch": 3.1046405823475887, "grad_norm": 0.34077333211638183, "learning_rate": 1.5735787491286652e-05, "loss": 0.0311, "step": 6824 }, { "epoch": 3.105095541401274, "grad_norm": 0.2456290547411394, "learning_rate": 1.5729150182444556e-05, "loss": 0.0121, "step": 6825 }, { "epoch": 3.105550500454959, "grad_norm": 0.31323499957223067, "learning_rate": 1.5722513631174447e-05, "loss": 0.0167, "step": 6826 }, { "epoch": 3.106005459508644, "grad_norm": 0.25094742480675486, "learning_rate": 1.5715877838018613e-05, "loss": 0.008, "step": 6827 }, { "epoch": 3.1064604185623295, "grad_norm": 0.3004269478024851, "learning_rate": 1.5709242803519313e-05, "loss": 0.009, "step": 6828 }, { "epoch": 3.1069153776160148, "grad_norm": 0.320969433070581, "learning_rate": 1.5702608528218748e-05, "loss": 0.012, "step": 6829 }, { "epoch": 3.1073703366696996, "grad_norm": 0.18926823346475477, "learning_rate": 1.569597501265902e-05, "loss": 0.008, "step": 6830 }, { "epoch": 3.107825295723385, "grad_norm": 0.25398982564099276, "learning_rate": 1.5689342257382207e-05, "loss": 0.0182, "step": 6831 }, { "epoch": 3.1082802547770703, "grad_norm": 0.4051492442127858, "learning_rate": 1.5682710262930288e-05, "loss": 0.0112, "step": 6832 }, { "epoch": 3.108735213830755, "grad_norm": 0.2769088281853518, "learning_rate": 1.5676079029845216e-05, "loss": 0.0138, "step": 6833 }, { "epoch": 3.1091901728844404, "grad_norm": 0.2960525661112022, "learning_rate": 1.5669448558668855e-05, "loss": 0.0103, "step": 6834 }, { "epoch": 3.1096451319381258, "grad_norm": 0.295213885637805, "learning_rate": 1.566281884994301e-05, "loss": 0.0126, "step": 6835 }, { "epoch": 3.1101000909918106, "grad_norm": 0.2454922212938578, "learning_rate": 1.565618990420946e-05, "loss": 0.0217, "step": 6836 }, { "epoch": 3.110555050045496, "grad_norm": 0.37251833089073494, "learning_rate": 1.5649561722009868e-05, "loss": 0.0128, "step": 6837 }, { "epoch": 3.1110100090991812, "grad_norm": 0.3439699293276242, "learning_rate": 1.564293430388587e-05, "loss": 0.013, "step": 6838 }, { "epoch": 3.111464968152866, "grad_norm": 0.3947859149954794, "learning_rate": 1.563630765037902e-05, "loss": 0.0294, "step": 6839 }, { "epoch": 3.1119199272065514, "grad_norm": 0.3619137865333428, "learning_rate": 1.5629681762030828e-05, "loss": 0.0184, "step": 6840 }, { "epoch": 3.1123748862602367, "grad_norm": 0.22330872371608235, "learning_rate": 1.562305663938272e-05, "loss": 0.0072, "step": 6841 }, { "epoch": 3.1128298453139216, "grad_norm": 0.3152199218548694, "learning_rate": 1.5616432282976072e-05, "loss": 0.0185, "step": 6842 }, { "epoch": 3.113284804367607, "grad_norm": 0.3231484526290729, "learning_rate": 1.5609808693352215e-05, "loss": 0.0261, "step": 6843 }, { "epoch": 3.113739763421292, "grad_norm": 0.40937886362140763, "learning_rate": 1.5603185871052378e-05, "loss": 0.017, "step": 6844 }, { "epoch": 3.114194722474977, "grad_norm": 0.27480629524846933, "learning_rate": 1.5596563816617764e-05, "loss": 0.0086, "step": 6845 }, { "epoch": 3.1146496815286624, "grad_norm": 0.306190859726507, "learning_rate": 1.558994253058948e-05, "loss": 0.0092, "step": 6846 }, { "epoch": 3.1151046405823477, "grad_norm": 0.26473682331781506, "learning_rate": 1.5583322013508604e-05, "loss": 0.008, "step": 6847 }, { "epoch": 3.1155595996360326, "grad_norm": 0.25006562726033377, "learning_rate": 1.5576702265916126e-05, "loss": 0.0143, "step": 6848 }, { "epoch": 3.116014558689718, "grad_norm": 0.2757852150316796, "learning_rate": 1.5570083288352975e-05, "loss": 0.0127, "step": 6849 }, { "epoch": 3.116469517743403, "grad_norm": 0.23632975361647057, "learning_rate": 1.5563465081360047e-05, "loss": 0.0133, "step": 6850 }, { "epoch": 3.116924476797088, "grad_norm": 0.29871818158058816, "learning_rate": 1.555684764547813e-05, "loss": 0.0057, "step": 6851 }, { "epoch": 3.1173794358507734, "grad_norm": 0.19742718859368272, "learning_rate": 1.5550230981247982e-05, "loss": 0.0117, "step": 6852 }, { "epoch": 3.1178343949044587, "grad_norm": 0.2493002730192134, "learning_rate": 1.5543615089210278e-05, "loss": 0.0073, "step": 6853 }, { "epoch": 3.1182893539581436, "grad_norm": 0.45923939927871166, "learning_rate": 1.553699996990565e-05, "loss": 0.0285, "step": 6854 }, { "epoch": 3.118744313011829, "grad_norm": 0.47953630626591465, "learning_rate": 1.5530385623874645e-05, "loss": 0.0213, "step": 6855 }, { "epoch": 3.119199272065514, "grad_norm": 0.3380318141253538, "learning_rate": 1.5523772051657757e-05, "loss": 0.0173, "step": 6856 }, { "epoch": 3.1196542311191995, "grad_norm": 0.30420595467104733, "learning_rate": 1.5517159253795434e-05, "loss": 0.0137, "step": 6857 }, { "epoch": 3.1201091901728844, "grad_norm": 0.7059940554403323, "learning_rate": 1.5510547230828027e-05, "loss": 0.0341, "step": 6858 }, { "epoch": 3.1205641492265697, "grad_norm": 0.1933517308906975, "learning_rate": 1.550393598329585e-05, "loss": 0.0027, "step": 6859 }, { "epoch": 3.121019108280255, "grad_norm": 0.24368880446156838, "learning_rate": 1.5497325511739136e-05, "loss": 0.0067, "step": 6860 }, { "epoch": 3.12147406733394, "grad_norm": 0.42536551821524954, "learning_rate": 1.5490715816698077e-05, "loss": 0.0271, "step": 6861 }, { "epoch": 3.121929026387625, "grad_norm": 0.2668646107665881, "learning_rate": 1.548410689871277e-05, "loss": 0.0143, "step": 6862 }, { "epoch": 3.1223839854413105, "grad_norm": 0.27195264766277616, "learning_rate": 1.5477498758323268e-05, "loss": 0.0188, "step": 6863 }, { "epoch": 3.1228389444949953, "grad_norm": 0.4351561532104109, "learning_rate": 1.547089139606957e-05, "loss": 0.0214, "step": 6864 }, { "epoch": 3.1232939035486806, "grad_norm": 0.2562882957643055, "learning_rate": 1.54642848124916e-05, "loss": 0.0079, "step": 6865 }, { "epoch": 3.123748862602366, "grad_norm": 0.4051596294040265, "learning_rate": 1.5457679008129204e-05, "loss": 0.0132, "step": 6866 }, { "epoch": 3.124203821656051, "grad_norm": 0.3310425982266381, "learning_rate": 1.5451073983522198e-05, "loss": 0.0145, "step": 6867 }, { "epoch": 3.124658780709736, "grad_norm": 0.30119762532685745, "learning_rate": 1.544446973921029e-05, "loss": 0.0089, "step": 6868 }, { "epoch": 3.1251137397634214, "grad_norm": 0.30146722050226304, "learning_rate": 1.5437866275733172e-05, "loss": 0.0134, "step": 6869 }, { "epoch": 3.1255686988171063, "grad_norm": 0.24985634714811633, "learning_rate": 1.543126359363043e-05, "loss": 0.008, "step": 6870 }, { "epoch": 3.1260236578707916, "grad_norm": 0.44756925345399745, "learning_rate": 1.5424661693441616e-05, "loss": 0.013, "step": 6871 }, { "epoch": 3.126478616924477, "grad_norm": 0.3371052673076746, "learning_rate": 1.541806057570622e-05, "loss": 0.0142, "step": 6872 }, { "epoch": 3.126933575978162, "grad_norm": 0.39873299527089406, "learning_rate": 1.5411460240963628e-05, "loss": 0.033, "step": 6873 }, { "epoch": 3.127388535031847, "grad_norm": 0.3861112328605196, "learning_rate": 1.5404860689753213e-05, "loss": 0.0215, "step": 6874 }, { "epoch": 3.1278434940855324, "grad_norm": 0.34381949137923434, "learning_rate": 1.5398261922614244e-05, "loss": 0.0236, "step": 6875 }, { "epoch": 3.1282984531392173, "grad_norm": 0.3602491390071061, "learning_rate": 1.5391663940085956e-05, "loss": 0.0165, "step": 6876 }, { "epoch": 3.1287534121929026, "grad_norm": 0.21592128685821604, "learning_rate": 1.538506674270749e-05, "loss": 0.0079, "step": 6877 }, { "epoch": 3.129208371246588, "grad_norm": 0.14509161269938997, "learning_rate": 1.5378470331017955e-05, "loss": 0.0052, "step": 6878 }, { "epoch": 3.1296633303002728, "grad_norm": 0.30457642506977045, "learning_rate": 1.5371874705556378e-05, "loss": 0.014, "step": 6879 }, { "epoch": 3.130118289353958, "grad_norm": 0.3731971958227254, "learning_rate": 1.5365279866861713e-05, "loss": 0.0146, "step": 6880 }, { "epoch": 3.1305732484076434, "grad_norm": 0.47289681621875634, "learning_rate": 1.5358685815472877e-05, "loss": 0.0226, "step": 6881 }, { "epoch": 3.1310282074613287, "grad_norm": 0.33365618044371864, "learning_rate": 1.535209255192869e-05, "loss": 0.0121, "step": 6882 }, { "epoch": 3.1314831665150136, "grad_norm": 0.3280867651969782, "learning_rate": 1.534550007676793e-05, "loss": 0.0132, "step": 6883 }, { "epoch": 3.131938125568699, "grad_norm": 0.514295437171822, "learning_rate": 1.5338908390529302e-05, "loss": 0.0288, "step": 6884 }, { "epoch": 3.132393084622384, "grad_norm": 0.3802457743220548, "learning_rate": 1.533231749375145e-05, "loss": 0.0182, "step": 6885 }, { "epoch": 3.132848043676069, "grad_norm": 0.6024922301602588, "learning_rate": 1.5325727386972964e-05, "loss": 0.0094, "step": 6886 }, { "epoch": 3.1333030027297544, "grad_norm": 0.3076638637021277, "learning_rate": 1.5319138070732338e-05, "loss": 0.0087, "step": 6887 }, { "epoch": 3.1337579617834397, "grad_norm": 0.5861090335283989, "learning_rate": 1.531254954556804e-05, "loss": 0.0264, "step": 6888 }, { "epoch": 3.1342129208371245, "grad_norm": 0.27135639061132666, "learning_rate": 1.5305961812018434e-05, "loss": 0.0121, "step": 6889 }, { "epoch": 3.13466787989081, "grad_norm": 0.3046672586889011, "learning_rate": 1.5299374870621856e-05, "loss": 0.0194, "step": 6890 }, { "epoch": 3.135122838944495, "grad_norm": 0.4057511607252653, "learning_rate": 1.5292788721916553e-05, "loss": 0.0223, "step": 6891 }, { "epoch": 3.13557779799818, "grad_norm": 0.3273310917571077, "learning_rate": 1.5286203366440718e-05, "loss": 0.0172, "step": 6892 }, { "epoch": 3.1360327570518653, "grad_norm": 0.4054484300781569, "learning_rate": 1.5279618804732482e-05, "loss": 0.0228, "step": 6893 }, { "epoch": 3.1364877161055507, "grad_norm": 0.3296941517472131, "learning_rate": 1.5273035037329897e-05, "loss": 0.0187, "step": 6894 }, { "epoch": 3.1369426751592355, "grad_norm": 0.350321124466926, "learning_rate": 1.5266452064770965e-05, "loss": 0.029, "step": 6895 }, { "epoch": 3.137397634212921, "grad_norm": 0.20150614890885302, "learning_rate": 1.5259869887593616e-05, "loss": 0.0129, "step": 6896 }, { "epoch": 3.137852593266606, "grad_norm": 0.30042034006193125, "learning_rate": 1.5253288506335709e-05, "loss": 0.0131, "step": 6897 }, { "epoch": 3.138307552320291, "grad_norm": 0.23186069286578492, "learning_rate": 1.5246707921535041e-05, "loss": 0.0072, "step": 6898 }, { "epoch": 3.1387625113739763, "grad_norm": 0.3388477908899279, "learning_rate": 1.524012813372937e-05, "loss": 0.0187, "step": 6899 }, { "epoch": 3.1392174704276616, "grad_norm": 0.2613764232885643, "learning_rate": 1.5233549143456346e-05, "loss": 0.0129, "step": 6900 }, { "epoch": 3.1396724294813465, "grad_norm": 0.34343777751936816, "learning_rate": 1.5226970951253588e-05, "loss": 0.0191, "step": 6901 }, { "epoch": 3.140127388535032, "grad_norm": 0.44747553134942003, "learning_rate": 1.5220393557658622e-05, "loss": 0.0287, "step": 6902 }, { "epoch": 3.140582347588717, "grad_norm": 0.2629320222489881, "learning_rate": 1.5213816963208937e-05, "loss": 0.0236, "step": 6903 }, { "epoch": 3.141037306642402, "grad_norm": 0.3278105882571849, "learning_rate": 1.5207241168441927e-05, "loss": 0.0127, "step": 6904 }, { "epoch": 3.1414922656960873, "grad_norm": 0.09814567946744175, "learning_rate": 1.5200666173894945e-05, "loss": 0.0044, "step": 6905 }, { "epoch": 3.1419472247497726, "grad_norm": 0.4548603316167416, "learning_rate": 1.5194091980105277e-05, "loss": 0.0416, "step": 6906 }, { "epoch": 3.1424021838034575, "grad_norm": 0.24832733025938053, "learning_rate": 1.5187518587610123e-05, "loss": 0.0145, "step": 6907 }, { "epoch": 3.142857142857143, "grad_norm": 0.3507664703099078, "learning_rate": 1.5180945996946644e-05, "loss": 0.0273, "step": 6908 }, { "epoch": 3.143312101910828, "grad_norm": 0.24272636141685092, "learning_rate": 1.5174374208651912e-05, "loss": 0.0113, "step": 6909 }, { "epoch": 3.143767060964513, "grad_norm": 0.3186307498395447, "learning_rate": 1.5167803223262949e-05, "loss": 0.0203, "step": 6910 }, { "epoch": 3.1442220200181983, "grad_norm": 0.20013741442331012, "learning_rate": 1.5161233041316702e-05, "loss": 0.017, "step": 6911 }, { "epoch": 3.1446769790718836, "grad_norm": 0.22279278044989714, "learning_rate": 1.5154663663350056e-05, "loss": 0.0089, "step": 6912 }, { "epoch": 3.145131938125569, "grad_norm": 0.3746200397136572, "learning_rate": 1.5148095089899845e-05, "loss": 0.0325, "step": 6913 }, { "epoch": 3.1455868971792538, "grad_norm": 0.18768192372498269, "learning_rate": 1.5141527321502801e-05, "loss": 0.0079, "step": 6914 }, { "epoch": 3.146041856232939, "grad_norm": 0.3443684630048493, "learning_rate": 1.5134960358695635e-05, "loss": 0.0174, "step": 6915 }, { "epoch": 3.1464968152866244, "grad_norm": 0.35338998721913184, "learning_rate": 1.512839420201495e-05, "loss": 0.0153, "step": 6916 }, { "epoch": 3.1469517743403093, "grad_norm": 0.30873970132116807, "learning_rate": 1.5121828851997319e-05, "loss": 0.0196, "step": 6917 }, { "epoch": 3.1474067333939946, "grad_norm": 0.49773205334473053, "learning_rate": 1.5115264309179217e-05, "loss": 0.0173, "step": 6918 }, { "epoch": 3.14786169244768, "grad_norm": 0.33332487141180367, "learning_rate": 1.5108700574097073e-05, "loss": 0.0177, "step": 6919 }, { "epoch": 3.1483166515013647, "grad_norm": 0.23153480897262008, "learning_rate": 1.5102137647287262e-05, "loss": 0.0086, "step": 6920 }, { "epoch": 3.14877161055505, "grad_norm": 0.1443784304602423, "learning_rate": 1.5095575529286055e-05, "loss": 0.0059, "step": 6921 }, { "epoch": 3.1492265696087354, "grad_norm": 0.39126565533680563, "learning_rate": 1.5089014220629693e-05, "loss": 0.0134, "step": 6922 }, { "epoch": 3.1496815286624202, "grad_norm": 0.28959189467211766, "learning_rate": 1.5082453721854329e-05, "loss": 0.0187, "step": 6923 }, { "epoch": 3.1501364877161055, "grad_norm": 0.2941244384648485, "learning_rate": 1.5075894033496063e-05, "loss": 0.016, "step": 6924 }, { "epoch": 3.150591446769791, "grad_norm": 0.4296853813581609, "learning_rate": 1.5069335156090914e-05, "loss": 0.0225, "step": 6925 }, { "epoch": 3.1510464058234757, "grad_norm": 0.3335059084118382, "learning_rate": 1.5062777090174846e-05, "loss": 0.0116, "step": 6926 }, { "epoch": 3.151501364877161, "grad_norm": 0.5372723049931036, "learning_rate": 1.5056219836283764e-05, "loss": 0.0259, "step": 6927 }, { "epoch": 3.1519563239308463, "grad_norm": 0.4360373463512614, "learning_rate": 1.5049663394953489e-05, "loss": 0.026, "step": 6928 }, { "epoch": 3.152411282984531, "grad_norm": 0.31986793668691305, "learning_rate": 1.5043107766719795e-05, "loss": 0.0164, "step": 6929 }, { "epoch": 3.1528662420382165, "grad_norm": 0.18553905833367315, "learning_rate": 1.503655295211836e-05, "loss": 0.0073, "step": 6930 }, { "epoch": 3.153321201091902, "grad_norm": 0.1689879583061243, "learning_rate": 1.5029998951684828e-05, "loss": 0.0029, "step": 6931 }, { "epoch": 3.1537761601455867, "grad_norm": 0.28131898627934876, "learning_rate": 1.5023445765954758e-05, "loss": 0.012, "step": 6932 }, { "epoch": 3.154231119199272, "grad_norm": 0.29063899563955786, "learning_rate": 1.5016893395463633e-05, "loss": 0.0103, "step": 6933 }, { "epoch": 3.1546860782529573, "grad_norm": 0.34348774276974187, "learning_rate": 1.501034184074691e-05, "loss": 0.02, "step": 6934 }, { "epoch": 3.1551410373066426, "grad_norm": 0.4886170645943859, "learning_rate": 1.500379110233994e-05, "loss": 0.0265, "step": 6935 }, { "epoch": 3.1555959963603275, "grad_norm": 0.3520619532165206, "learning_rate": 1.4997241180778013e-05, "loss": 0.0121, "step": 6936 }, { "epoch": 3.156050955414013, "grad_norm": 0.44263963533619277, "learning_rate": 1.499069207659637e-05, "loss": 0.0315, "step": 6937 }, { "epoch": 3.156505914467698, "grad_norm": 0.43211780153878976, "learning_rate": 1.4984143790330163e-05, "loss": 0.0129, "step": 6938 }, { "epoch": 3.156960873521383, "grad_norm": 0.3427249003032742, "learning_rate": 1.4977596322514498e-05, "loss": 0.0159, "step": 6939 }, { "epoch": 3.1574158325750683, "grad_norm": 0.21918712512929672, "learning_rate": 1.4971049673684395e-05, "loss": 0.0108, "step": 6940 }, { "epoch": 3.1578707916287536, "grad_norm": 0.311619574763327, "learning_rate": 1.4964503844374823e-05, "loss": 0.0185, "step": 6941 }, { "epoch": 3.1583257506824385, "grad_norm": 0.37352961548525077, "learning_rate": 1.4957958835120684e-05, "loss": 0.0103, "step": 6942 }, { "epoch": 3.158780709736124, "grad_norm": 0.31762416144096034, "learning_rate": 1.4951414646456793e-05, "loss": 0.0124, "step": 6943 }, { "epoch": 3.159235668789809, "grad_norm": 0.47464600081800784, "learning_rate": 1.4944871278917926e-05, "loss": 0.0113, "step": 6944 }, { "epoch": 3.159690627843494, "grad_norm": 0.32393738990959026, "learning_rate": 1.4938328733038762e-05, "loss": 0.0144, "step": 6945 }, { "epoch": 3.1601455868971793, "grad_norm": 0.42987811823382255, "learning_rate": 1.4931787009353942e-05, "loss": 0.0229, "step": 6946 }, { "epoch": 3.1606005459508646, "grad_norm": 0.4123946686636253, "learning_rate": 1.4925246108398008e-05, "loss": 0.0184, "step": 6947 }, { "epoch": 3.1610555050045495, "grad_norm": 0.39644699961261903, "learning_rate": 1.4918706030705471e-05, "loss": 0.0192, "step": 6948 }, { "epoch": 3.1615104640582348, "grad_norm": 0.48471879544385216, "learning_rate": 1.4912166776810758e-05, "loss": 0.0187, "step": 6949 }, { "epoch": 3.16196542311192, "grad_norm": 0.17191484465687634, "learning_rate": 1.4905628347248213e-05, "loss": 0.0032, "step": 6950 }, { "epoch": 3.162420382165605, "grad_norm": 0.3579961216157077, "learning_rate": 1.4899090742552135e-05, "loss": 0.0191, "step": 6951 }, { "epoch": 3.1628753412192903, "grad_norm": 0.4107790114048163, "learning_rate": 1.4892553963256745e-05, "loss": 0.0188, "step": 6952 }, { "epoch": 3.1633303002729756, "grad_norm": 0.23230201270454975, "learning_rate": 1.4886018009896208e-05, "loss": 0.0082, "step": 6953 }, { "epoch": 3.1637852593266604, "grad_norm": 0.37061159119460824, "learning_rate": 1.4879482883004591e-05, "loss": 0.0276, "step": 6954 }, { "epoch": 3.1642402183803457, "grad_norm": 0.4512863580158586, "learning_rate": 1.4872948583115933e-05, "loss": 0.0178, "step": 6955 }, { "epoch": 3.164695177434031, "grad_norm": 0.21094262513568174, "learning_rate": 1.4866415110764193e-05, "loss": 0.0062, "step": 6956 }, { "epoch": 3.165150136487716, "grad_norm": 0.2969337970500188, "learning_rate": 1.4859882466483239e-05, "loss": 0.012, "step": 6957 }, { "epoch": 3.1656050955414012, "grad_norm": 0.5254547432490868, "learning_rate": 1.4853350650806902e-05, "loss": 0.0244, "step": 6958 }, { "epoch": 3.1660600545950865, "grad_norm": 0.33286448790756523, "learning_rate": 1.4846819664268925e-05, "loss": 0.0152, "step": 6959 }, { "epoch": 3.1665150136487714, "grad_norm": 0.3056064371435755, "learning_rate": 1.4840289507402994e-05, "loss": 0.0178, "step": 6960 }, { "epoch": 3.1669699727024567, "grad_norm": 0.3234343533978295, "learning_rate": 1.4833760180742717e-05, "loss": 0.028, "step": 6961 }, { "epoch": 3.167424931756142, "grad_norm": 0.5098934188515319, "learning_rate": 1.4827231684821653e-05, "loss": 0.0373, "step": 6962 }, { "epoch": 3.167879890809827, "grad_norm": 0.40617162734950607, "learning_rate": 1.4820704020173282e-05, "loss": 0.0207, "step": 6963 }, { "epoch": 3.168334849863512, "grad_norm": 0.317865248863202, "learning_rate": 1.4814177187331003e-05, "loss": 0.0149, "step": 6964 }, { "epoch": 3.1687898089171975, "grad_norm": 0.3660606901488924, "learning_rate": 1.480765118682817e-05, "loss": 0.0175, "step": 6965 }, { "epoch": 3.1692447679708824, "grad_norm": 0.37091895615911946, "learning_rate": 1.4801126019198047e-05, "loss": 0.012, "step": 6966 }, { "epoch": 3.1696997270245677, "grad_norm": 0.30700585786715273, "learning_rate": 1.4794601684973857e-05, "loss": 0.0175, "step": 6967 }, { "epoch": 3.170154686078253, "grad_norm": 0.27693621704595406, "learning_rate": 1.478807818468872e-05, "loss": 0.0153, "step": 6968 }, { "epoch": 3.1706096451319383, "grad_norm": 0.3707981448875508, "learning_rate": 1.4781555518875717e-05, "loss": 0.0095, "step": 6969 }, { "epoch": 3.171064604185623, "grad_norm": 0.3576699855011565, "learning_rate": 1.477503368806786e-05, "loss": 0.0188, "step": 6970 }, { "epoch": 3.1715195632393085, "grad_norm": 0.31032273606864796, "learning_rate": 1.4768512692798075e-05, "loss": 0.0197, "step": 6971 }, { "epoch": 3.171974522292994, "grad_norm": 0.3320870569848008, "learning_rate": 1.476199253359922e-05, "loss": 0.0188, "step": 6972 }, { "epoch": 3.1724294813466787, "grad_norm": 0.19767119776197928, "learning_rate": 1.4755473211004105e-05, "loss": 0.0111, "step": 6973 }, { "epoch": 3.172884440400364, "grad_norm": 0.3712958633062185, "learning_rate": 1.4748954725545456e-05, "loss": 0.0095, "step": 6974 }, { "epoch": 3.1733393994540493, "grad_norm": 0.49850758192011885, "learning_rate": 1.4742437077755925e-05, "loss": 0.0252, "step": 6975 }, { "epoch": 3.173794358507734, "grad_norm": 0.3723118285990712, "learning_rate": 1.4735920268168124e-05, "loss": 0.0122, "step": 6976 }, { "epoch": 3.1742493175614195, "grad_norm": 0.360946800172145, "learning_rate": 1.4729404297314559e-05, "loss": 0.0238, "step": 6977 }, { "epoch": 3.174704276615105, "grad_norm": 0.22811324381156212, "learning_rate": 1.47228891657277e-05, "loss": 0.009, "step": 6978 }, { "epoch": 3.1751592356687897, "grad_norm": 0.3406476189037723, "learning_rate": 1.4716374873939922e-05, "loss": 0.0131, "step": 6979 }, { "epoch": 3.175614194722475, "grad_norm": 0.5926599828241914, "learning_rate": 1.4709861422483555e-05, "loss": 0.0266, "step": 6980 }, { "epoch": 3.1760691537761603, "grad_norm": 0.23016831998154305, "learning_rate": 1.4703348811890838e-05, "loss": 0.0174, "step": 6981 }, { "epoch": 3.176524112829845, "grad_norm": 0.5360339894131263, "learning_rate": 1.4696837042693951e-05, "loss": 0.0205, "step": 6982 }, { "epoch": 3.1769790718835305, "grad_norm": 0.2160808912010615, "learning_rate": 1.4690326115425019e-05, "loss": 0.0059, "step": 6983 }, { "epoch": 3.1774340309372158, "grad_norm": 0.2434789368074694, "learning_rate": 1.4683816030616077e-05, "loss": 0.0106, "step": 6984 }, { "epoch": 3.1778889899909006, "grad_norm": 0.3030947509129196, "learning_rate": 1.4677306788799106e-05, "loss": 0.0139, "step": 6985 }, { "epoch": 3.178343949044586, "grad_norm": 0.3670707323857927, "learning_rate": 1.4670798390506002e-05, "loss": 0.0108, "step": 6986 }, { "epoch": 3.1787989080982713, "grad_norm": 0.2285831081318595, "learning_rate": 1.4664290836268613e-05, "loss": 0.0097, "step": 6987 }, { "epoch": 3.179253867151956, "grad_norm": 0.37597093541943677, "learning_rate": 1.4657784126618695e-05, "loss": 0.0186, "step": 6988 }, { "epoch": 3.1797088262056414, "grad_norm": 0.3281159088202725, "learning_rate": 1.4651278262087953e-05, "loss": 0.0118, "step": 6989 }, { "epoch": 3.1801637852593267, "grad_norm": 0.3978233163502127, "learning_rate": 1.4644773243208022e-05, "loss": 0.0162, "step": 6990 }, { "epoch": 3.180618744313012, "grad_norm": 0.2880781058035638, "learning_rate": 1.4638269070510452e-05, "loss": 0.0176, "step": 6991 }, { "epoch": 3.181073703366697, "grad_norm": 0.34980062769437636, "learning_rate": 1.4631765744526748e-05, "loss": 0.0205, "step": 6992 }, { "epoch": 3.1815286624203822, "grad_norm": 0.4045478078952188, "learning_rate": 1.462526326578832e-05, "loss": 0.018, "step": 6993 }, { "epoch": 3.1819836214740675, "grad_norm": 0.46286776204577634, "learning_rate": 1.461876163482653e-05, "loss": 0.0391, "step": 6994 }, { "epoch": 3.1824385805277524, "grad_norm": 0.2902732495486041, "learning_rate": 1.4612260852172655e-05, "loss": 0.0133, "step": 6995 }, { "epoch": 3.1828935395814377, "grad_norm": 0.27925465102267416, "learning_rate": 1.4605760918357903e-05, "loss": 0.0149, "step": 6996 }, { "epoch": 3.183348498635123, "grad_norm": 0.4004233604752787, "learning_rate": 1.4599261833913441e-05, "loss": 0.0163, "step": 6997 }, { "epoch": 3.183803457688808, "grad_norm": 0.3267028185127559, "learning_rate": 1.4592763599370335e-05, "loss": 0.0205, "step": 6998 }, { "epoch": 3.184258416742493, "grad_norm": 0.24760957220108293, "learning_rate": 1.4586266215259574e-05, "loss": 0.0245, "step": 6999 }, { "epoch": 3.1847133757961785, "grad_norm": 0.2744465236492228, "learning_rate": 1.4579769682112126e-05, "loss": 0.0133, "step": 7000 }, { "epoch": 3.1851683348498634, "grad_norm": 0.19972715941608693, "learning_rate": 1.457327400045884e-05, "loss": 0.007, "step": 7001 }, { "epoch": 3.1856232939035487, "grad_norm": 0.34254178084692344, "learning_rate": 1.4566779170830513e-05, "loss": 0.0174, "step": 7002 }, { "epoch": 3.186078252957234, "grad_norm": 0.23673629502377894, "learning_rate": 1.456028519375787e-05, "loss": 0.0076, "step": 7003 }, { "epoch": 3.186533212010919, "grad_norm": 0.22172000570221662, "learning_rate": 1.4553792069771574e-05, "loss": 0.0079, "step": 7004 }, { "epoch": 3.186988171064604, "grad_norm": 0.41090897479861077, "learning_rate": 1.4547299799402226e-05, "loss": 0.0115, "step": 7005 }, { "epoch": 3.1874431301182895, "grad_norm": 0.2646577665221535, "learning_rate": 1.4540808383180333e-05, "loss": 0.0097, "step": 7006 }, { "epoch": 3.1878980891719744, "grad_norm": 0.3865870603020007, "learning_rate": 1.4534317821636345e-05, "loss": 0.0107, "step": 7007 }, { "epoch": 3.1883530482256597, "grad_norm": 0.19487806073068445, "learning_rate": 1.4527828115300646e-05, "loss": 0.005, "step": 7008 }, { "epoch": 3.188808007279345, "grad_norm": 0.439390043477255, "learning_rate": 1.4521339264703526e-05, "loss": 0.0327, "step": 7009 }, { "epoch": 3.18926296633303, "grad_norm": 0.32266840653398976, "learning_rate": 1.4514851270375246e-05, "loss": 0.0089, "step": 7010 }, { "epoch": 3.189717925386715, "grad_norm": 0.41904070574006697, "learning_rate": 1.4508364132845976e-05, "loss": 0.046, "step": 7011 }, { "epoch": 3.1901728844404005, "grad_norm": 0.3399953006544352, "learning_rate": 1.4501877852645809e-05, "loss": 0.0172, "step": 7012 }, { "epoch": 3.1906278434940853, "grad_norm": 0.26649453497289133, "learning_rate": 1.4495392430304777e-05, "loss": 0.0173, "step": 7013 }, { "epoch": 3.1910828025477707, "grad_norm": 0.21690258947196223, "learning_rate": 1.4488907866352824e-05, "loss": 0.0101, "step": 7014 }, { "epoch": 3.191537761601456, "grad_norm": 0.3193137664330855, "learning_rate": 1.4482424161319864e-05, "loss": 0.0095, "step": 7015 }, { "epoch": 3.191992720655141, "grad_norm": 0.17840290262961464, "learning_rate": 1.4475941315735706e-05, "loss": 0.004, "step": 7016 }, { "epoch": 3.192447679708826, "grad_norm": 0.38983143709543655, "learning_rate": 1.4469459330130087e-05, "loss": 0.0205, "step": 7017 }, { "epoch": 3.1929026387625115, "grad_norm": 0.29524973107132524, "learning_rate": 1.4462978205032707e-05, "loss": 0.0169, "step": 7018 }, { "epoch": 3.1933575978161963, "grad_norm": 0.3725620199188234, "learning_rate": 1.4456497940973151e-05, "loss": 0.0233, "step": 7019 }, { "epoch": 3.1938125568698816, "grad_norm": 0.44998450536653445, "learning_rate": 1.445001853848098e-05, "loss": 0.0136, "step": 7020 }, { "epoch": 3.194267515923567, "grad_norm": 0.20937172795363626, "learning_rate": 1.4443539998085647e-05, "loss": 0.0097, "step": 7021 }, { "epoch": 3.194722474977252, "grad_norm": 0.49447915080765364, "learning_rate": 1.4437062320316558e-05, "loss": 0.036, "step": 7022 }, { "epoch": 3.195177434030937, "grad_norm": 0.453849867098988, "learning_rate": 1.4430585505703026e-05, "loss": 0.0115, "step": 7023 }, { "epoch": 3.1956323930846224, "grad_norm": 0.4361736185293604, "learning_rate": 1.4424109554774313e-05, "loss": 0.0237, "step": 7024 }, { "epoch": 3.1960873521383077, "grad_norm": 0.37841425242964916, "learning_rate": 1.4417634468059616e-05, "loss": 0.0154, "step": 7025 }, { "epoch": 3.1965423111919926, "grad_norm": 0.33195513557167317, "learning_rate": 1.441116024608804e-05, "loss": 0.0175, "step": 7026 }, { "epoch": 3.196997270245678, "grad_norm": 0.19344537267084802, "learning_rate": 1.440468688938863e-05, "loss": 0.0084, "step": 7027 }, { "epoch": 3.1974522292993632, "grad_norm": 0.10559998069564358, "learning_rate": 1.439821439849035e-05, "loss": 0.0019, "step": 7028 }, { "epoch": 3.197907188353048, "grad_norm": 0.5924122058129596, "learning_rate": 1.4391742773922123e-05, "loss": 0.0202, "step": 7029 }, { "epoch": 3.1983621474067334, "grad_norm": 0.29428652440789177, "learning_rate": 1.4385272016212769e-05, "loss": 0.0085, "step": 7030 }, { "epoch": 3.1988171064604187, "grad_norm": 0.41245467367996425, "learning_rate": 1.4378802125891038e-05, "loss": 0.0101, "step": 7031 }, { "epoch": 3.1992720655141036, "grad_norm": 0.3927512533689107, "learning_rate": 1.4372333103485647e-05, "loss": 0.009, "step": 7032 }, { "epoch": 3.199727024567789, "grad_norm": 0.21113061424933363, "learning_rate": 1.4365864949525188e-05, "loss": 0.0123, "step": 7033 }, { "epoch": 3.200181983621474, "grad_norm": 0.3842205638010386, "learning_rate": 1.4359397664538232e-05, "loss": 0.0098, "step": 7034 }, { "epoch": 3.200636942675159, "grad_norm": 0.4282455002955429, "learning_rate": 1.4352931249053247e-05, "loss": 0.0292, "step": 7035 }, { "epoch": 3.2010919017288444, "grad_norm": 0.5534761805974358, "learning_rate": 1.4346465703598638e-05, "loss": 0.0415, "step": 7036 }, { "epoch": 3.2015468607825297, "grad_norm": 0.25789363969391854, "learning_rate": 1.4340001028702732e-05, "loss": 0.0135, "step": 7037 }, { "epoch": 3.2020018198362146, "grad_norm": 0.3883983208937735, "learning_rate": 1.43335372248938e-05, "loss": 0.0106, "step": 7038 }, { "epoch": 3.2024567788899, "grad_norm": 0.21437725660761, "learning_rate": 1.4327074292700049e-05, "loss": 0.0084, "step": 7039 }, { "epoch": 3.202911737943585, "grad_norm": 0.2547130190880509, "learning_rate": 1.4320612232649589e-05, "loss": 0.0144, "step": 7040 }, { "epoch": 3.20336669699727, "grad_norm": 0.2956747103696751, "learning_rate": 1.4314151045270469e-05, "loss": 0.0129, "step": 7041 }, { "epoch": 3.2038216560509554, "grad_norm": 0.7102592865636066, "learning_rate": 1.4307690731090664e-05, "loss": 0.0186, "step": 7042 }, { "epoch": 3.2042766151046407, "grad_norm": 0.16645316888827308, "learning_rate": 1.4301231290638081e-05, "loss": 0.0035, "step": 7043 }, { "epoch": 3.2047315741583255, "grad_norm": 0.4954376724715628, "learning_rate": 1.4294772724440569e-05, "loss": 0.0176, "step": 7044 }, { "epoch": 3.205186533212011, "grad_norm": 0.2754909546894117, "learning_rate": 1.4288315033025878e-05, "loss": 0.0122, "step": 7045 }, { "epoch": 3.205641492265696, "grad_norm": 0.6364090315132244, "learning_rate": 1.4281858216921717e-05, "loss": 0.038, "step": 7046 }, { "epoch": 3.2060964513193815, "grad_norm": 0.2936701272243834, "learning_rate": 1.4275402276655703e-05, "loss": 0.0304, "step": 7047 }, { "epoch": 3.2065514103730663, "grad_norm": 0.4189181375228544, "learning_rate": 1.4268947212755371e-05, "loss": 0.0261, "step": 7048 }, { "epoch": 3.2070063694267517, "grad_norm": 0.2882242531459211, "learning_rate": 1.426249302574822e-05, "loss": 0.0177, "step": 7049 }, { "epoch": 3.207461328480437, "grad_norm": 0.3188067822719279, "learning_rate": 1.4256039716161651e-05, "loss": 0.0157, "step": 7050 }, { "epoch": 3.207916287534122, "grad_norm": 0.3043624446278785, "learning_rate": 1.4249587284522998e-05, "loss": 0.0108, "step": 7051 }, { "epoch": 3.208371246587807, "grad_norm": 0.28024029816303314, "learning_rate": 1.424313573135951e-05, "loss": 0.0136, "step": 7052 }, { "epoch": 3.2088262056414925, "grad_norm": 0.20524792267197678, "learning_rate": 1.4236685057198396e-05, "loss": 0.0052, "step": 7053 }, { "epoch": 3.2092811646951773, "grad_norm": 0.37696563415821377, "learning_rate": 1.4230235262566782e-05, "loss": 0.0218, "step": 7054 }, { "epoch": 3.2097361237488626, "grad_norm": 0.3941363755089757, "learning_rate": 1.4223786347991705e-05, "loss": 0.0213, "step": 7055 }, { "epoch": 3.210191082802548, "grad_norm": 0.3661073255348635, "learning_rate": 1.4217338314000144e-05, "loss": 0.0213, "step": 7056 }, { "epoch": 3.210646041856233, "grad_norm": 0.29250242299142554, "learning_rate": 1.4210891161118992e-05, "loss": 0.0123, "step": 7057 }, { "epoch": 3.211101000909918, "grad_norm": 0.37790534059242215, "learning_rate": 1.42044448898751e-05, "loss": 0.0153, "step": 7058 }, { "epoch": 3.2115559599636034, "grad_norm": 0.17004933218796228, "learning_rate": 1.4197999500795209e-05, "loss": 0.0077, "step": 7059 }, { "epoch": 3.2120109190172883, "grad_norm": 0.33313647959797693, "learning_rate": 1.4191554994406028e-05, "loss": 0.0204, "step": 7060 }, { "epoch": 3.2124658780709736, "grad_norm": 0.3354805378992467, "learning_rate": 1.418511137123416e-05, "loss": 0.0098, "step": 7061 }, { "epoch": 3.212920837124659, "grad_norm": 0.2097322384977347, "learning_rate": 1.4178668631806147e-05, "loss": 0.0152, "step": 7062 }, { "epoch": 3.213375796178344, "grad_norm": 0.5315921828057618, "learning_rate": 1.417222677664847e-05, "loss": 0.0319, "step": 7063 }, { "epoch": 3.213830755232029, "grad_norm": 0.3102488593363635, "learning_rate": 1.4165785806287523e-05, "loss": 0.0175, "step": 7064 }, { "epoch": 3.2142857142857144, "grad_norm": 0.423087049589951, "learning_rate": 1.4159345721249637e-05, "loss": 0.0144, "step": 7065 }, { "epoch": 3.2147406733393993, "grad_norm": 0.1616288186975373, "learning_rate": 1.4152906522061048e-05, "loss": 0.0039, "step": 7066 }, { "epoch": 3.2151956323930846, "grad_norm": 0.403817834161398, "learning_rate": 1.4146468209247956e-05, "loss": 0.0405, "step": 7067 }, { "epoch": 3.21565059144677, "grad_norm": 0.27785592112619223, "learning_rate": 1.4140030783336477e-05, "loss": 0.0128, "step": 7068 }, { "epoch": 3.2161055505004548, "grad_norm": 0.3271672450296681, "learning_rate": 1.4133594244852639e-05, "loss": 0.0149, "step": 7069 }, { "epoch": 3.21656050955414, "grad_norm": 0.3644988694606582, "learning_rate": 1.412715859432241e-05, "loss": 0.0125, "step": 7070 }, { "epoch": 3.2170154686078254, "grad_norm": 0.4466061478733523, "learning_rate": 1.4120723832271665e-05, "loss": 0.0317, "step": 7071 }, { "epoch": 3.2174704276615103, "grad_norm": 0.2814224098910347, "learning_rate": 1.4114289959226249e-05, "loss": 0.0124, "step": 7072 }, { "epoch": 3.2179253867151956, "grad_norm": 0.3813084133732126, "learning_rate": 1.4107856975711886e-05, "loss": 0.0091, "step": 7073 }, { "epoch": 3.218380345768881, "grad_norm": 0.3004489272744679, "learning_rate": 1.4101424882254277e-05, "loss": 0.0157, "step": 7074 }, { "epoch": 3.2188353048225657, "grad_norm": 0.2797443599713269, "learning_rate": 1.409499367937901e-05, "loss": 0.0195, "step": 7075 }, { "epoch": 3.219290263876251, "grad_norm": 0.22833452694735437, "learning_rate": 1.4088563367611596e-05, "loss": 0.0161, "step": 7076 }, { "epoch": 3.2197452229299364, "grad_norm": 0.2505473323344767, "learning_rate": 1.4082133947477521e-05, "loss": 0.0264, "step": 7077 }, { "epoch": 3.2202001819836217, "grad_norm": 0.3666024134155451, "learning_rate": 1.4075705419502161e-05, "loss": 0.0245, "step": 7078 }, { "epoch": 3.2206551410373065, "grad_norm": 0.3402023583201992, "learning_rate": 1.4069277784210814e-05, "loss": 0.0167, "step": 7079 }, { "epoch": 3.221110100090992, "grad_norm": 0.31605494615050894, "learning_rate": 1.4062851042128717e-05, "loss": 0.0125, "step": 7080 }, { "epoch": 3.221565059144677, "grad_norm": 0.23373568548806237, "learning_rate": 1.4056425193781048e-05, "loss": 0.0121, "step": 7081 }, { "epoch": 3.222020018198362, "grad_norm": 0.27155357313517176, "learning_rate": 1.4050000239692885e-05, "loss": 0.0091, "step": 7082 }, { "epoch": 3.2224749772520473, "grad_norm": 0.2973421593120053, "learning_rate": 1.4043576180389256e-05, "loss": 0.0072, "step": 7083 }, { "epoch": 3.2229299363057327, "grad_norm": 0.2725025420210873, "learning_rate": 1.403715301639511e-05, "loss": 0.016, "step": 7084 }, { "epoch": 3.2233848953594175, "grad_norm": 0.282361590285663, "learning_rate": 1.403073074823531e-05, "loss": 0.0121, "step": 7085 }, { "epoch": 3.223839854413103, "grad_norm": 0.45200201762882003, "learning_rate": 1.4024309376434644e-05, "loss": 0.01, "step": 7086 }, { "epoch": 3.224294813466788, "grad_norm": 0.30028938263910643, "learning_rate": 1.4017888901517851e-05, "loss": 0.0108, "step": 7087 }, { "epoch": 3.224749772520473, "grad_norm": 0.20897320897295782, "learning_rate": 1.4011469324009594e-05, "loss": 0.0063, "step": 7088 }, { "epoch": 3.2252047315741583, "grad_norm": 0.210853153906506, "learning_rate": 1.4005050644434439e-05, "loss": 0.0048, "step": 7089 }, { "epoch": 3.2256596906278436, "grad_norm": 0.5525387408272381, "learning_rate": 1.399863286331689e-05, "loss": 0.058, "step": 7090 }, { "epoch": 3.2261146496815285, "grad_norm": 0.1845961971181326, "learning_rate": 1.3992215981181378e-05, "loss": 0.0043, "step": 7091 }, { "epoch": 3.226569608735214, "grad_norm": 0.2922333730420303, "learning_rate": 1.3985799998552267e-05, "loss": 0.0127, "step": 7092 }, { "epoch": 3.227024567788899, "grad_norm": 0.27035340561794585, "learning_rate": 1.3979384915953846e-05, "loss": 0.0072, "step": 7093 }, { "epoch": 3.227479526842584, "grad_norm": 0.2360963357795898, "learning_rate": 1.3972970733910313e-05, "loss": 0.009, "step": 7094 }, { "epoch": 3.2279344858962693, "grad_norm": 0.24739535225179834, "learning_rate": 1.3966557452945819e-05, "loss": 0.0137, "step": 7095 }, { "epoch": 3.2283894449499546, "grad_norm": 0.2866946158698997, "learning_rate": 1.3960145073584415e-05, "loss": 0.0113, "step": 7096 }, { "epoch": 3.2288444040036395, "grad_norm": 0.4840841635946223, "learning_rate": 1.3953733596350111e-05, "loss": 0.0216, "step": 7097 }, { "epoch": 3.229299363057325, "grad_norm": 0.2933463960878231, "learning_rate": 1.3947323021766811e-05, "loss": 0.0106, "step": 7098 }, { "epoch": 3.22975432211101, "grad_norm": 0.1510538230620957, "learning_rate": 1.3940913350358361e-05, "loss": 0.0031, "step": 7099 }, { "epoch": 3.2302092811646954, "grad_norm": 0.3904239146123444, "learning_rate": 1.3934504582648522e-05, "loss": 0.016, "step": 7100 }, { "epoch": 3.2306642402183803, "grad_norm": 0.13298988808509118, "learning_rate": 1.3928096719160993e-05, "loss": 0.0027, "step": 7101 }, { "epoch": 3.2311191992720656, "grad_norm": 0.3585877165267703, "learning_rate": 1.3921689760419414e-05, "loss": 0.021, "step": 7102 }, { "epoch": 3.231574158325751, "grad_norm": 0.5139506995429687, "learning_rate": 1.3915283706947319e-05, "loss": 0.0263, "step": 7103 }, { "epoch": 3.2320291173794358, "grad_norm": 0.32235901960916474, "learning_rate": 1.3908878559268177e-05, "loss": 0.0155, "step": 7104 }, { "epoch": 3.232484076433121, "grad_norm": 0.4126397818972143, "learning_rate": 1.3902474317905384e-05, "loss": 0.0243, "step": 7105 }, { "epoch": 3.2329390354868064, "grad_norm": 0.5828721442831699, "learning_rate": 1.3896070983382284e-05, "loss": 0.0164, "step": 7106 }, { "epoch": 3.2333939945404913, "grad_norm": 0.3232740492331838, "learning_rate": 1.388966855622212e-05, "loss": 0.0134, "step": 7107 }, { "epoch": 3.2338489535941766, "grad_norm": 0.6062002166616676, "learning_rate": 1.3883267036948056e-05, "loss": 0.0142, "step": 7108 }, { "epoch": 3.234303912647862, "grad_norm": 0.20675286360624276, "learning_rate": 1.3876866426083213e-05, "loss": 0.0069, "step": 7109 }, { "epoch": 3.2347588717015467, "grad_norm": 0.49992448873430534, "learning_rate": 1.387046672415061e-05, "loss": 0.0095, "step": 7110 }, { "epoch": 3.235213830755232, "grad_norm": 0.46475769214226414, "learning_rate": 1.3864067931673214e-05, "loss": 0.0303, "step": 7111 }, { "epoch": 3.2356687898089174, "grad_norm": 0.3470198245108376, "learning_rate": 1.3857670049173895e-05, "loss": 0.02, "step": 7112 }, { "epoch": 3.2361237488626022, "grad_norm": 0.3913172838253249, "learning_rate": 1.3851273077175464e-05, "loss": 0.0163, "step": 7113 }, { "epoch": 3.2365787079162875, "grad_norm": 0.15468743878049393, "learning_rate": 1.384487701620065e-05, "loss": 0.0054, "step": 7114 }, { "epoch": 3.237033666969973, "grad_norm": 0.1960221612562595, "learning_rate": 1.3838481866772099e-05, "loss": 0.0066, "step": 7115 }, { "epoch": 3.2374886260236577, "grad_norm": 0.23022775100444143, "learning_rate": 1.3832087629412404e-05, "loss": 0.0082, "step": 7116 }, { "epoch": 3.237943585077343, "grad_norm": 0.28435136969050295, "learning_rate": 1.3825694304644087e-05, "loss": 0.0107, "step": 7117 }, { "epoch": 3.2383985441310283, "grad_norm": 0.2292120381684319, "learning_rate": 1.3819301892989566e-05, "loss": 0.0114, "step": 7118 }, { "epoch": 3.238853503184713, "grad_norm": 0.3457702467716614, "learning_rate": 1.3812910394971204e-05, "loss": 0.0121, "step": 7119 }, { "epoch": 3.2393084622383985, "grad_norm": 0.5898017498053202, "learning_rate": 1.3806519811111274e-05, "loss": 0.0135, "step": 7120 }, { "epoch": 3.239763421292084, "grad_norm": 0.40497767178407534, "learning_rate": 1.3800130141932006e-05, "loss": 0.0191, "step": 7121 }, { "epoch": 3.2402183803457687, "grad_norm": 0.3685865144697614, "learning_rate": 1.3793741387955512e-05, "loss": 0.0286, "step": 7122 }, { "epoch": 3.240673339399454, "grad_norm": 0.29920442526810664, "learning_rate": 1.378735354970388e-05, "loss": 0.0114, "step": 7123 }, { "epoch": 3.2411282984531393, "grad_norm": 0.32867628285368544, "learning_rate": 1.3780966627699077e-05, "loss": 0.0141, "step": 7124 }, { "epoch": 3.241583257506824, "grad_norm": 0.29746237408484577, "learning_rate": 1.3774580622463004e-05, "loss": 0.0141, "step": 7125 }, { "epoch": 3.2420382165605095, "grad_norm": 0.3064171254639482, "learning_rate": 1.3768195534517522e-05, "loss": 0.0095, "step": 7126 }, { "epoch": 3.242493175614195, "grad_norm": 0.2411156120397197, "learning_rate": 1.3761811364384378e-05, "loss": 0.0087, "step": 7127 }, { "epoch": 3.2429481346678797, "grad_norm": 0.2546059483148722, "learning_rate": 1.3755428112585257e-05, "loss": 0.0076, "step": 7128 }, { "epoch": 3.243403093721565, "grad_norm": 0.21934074381940816, "learning_rate": 1.3749045779641762e-05, "loss": 0.0123, "step": 7129 }, { "epoch": 3.2438580527752503, "grad_norm": 0.28114956305918526, "learning_rate": 1.3742664366075435e-05, "loss": 0.0164, "step": 7130 }, { "epoch": 3.244313011828935, "grad_norm": 0.565097305450993, "learning_rate": 1.3736283872407752e-05, "loss": 0.0342, "step": 7131 }, { "epoch": 3.2447679708826205, "grad_norm": 0.4183284592314326, "learning_rate": 1.3729904299160082e-05, "loss": 0.0231, "step": 7132 }, { "epoch": 3.245222929936306, "grad_norm": 0.5079626910524364, "learning_rate": 1.3723525646853739e-05, "loss": 0.0184, "step": 7133 }, { "epoch": 3.245677888989991, "grad_norm": 0.37785976953034667, "learning_rate": 1.3717147916009942e-05, "loss": 0.0127, "step": 7134 }, { "epoch": 3.246132848043676, "grad_norm": 0.44248928565102397, "learning_rate": 1.3710771107149878e-05, "loss": 0.0278, "step": 7135 }, { "epoch": 3.2465878070973613, "grad_norm": 0.396135959268526, "learning_rate": 1.3704395220794608e-05, "loss": 0.0154, "step": 7136 }, { "epoch": 3.2470427661510466, "grad_norm": 0.3491399124807685, "learning_rate": 1.3698020257465158e-05, "loss": 0.014, "step": 7137 }, { "epoch": 3.2474977252047315, "grad_norm": 0.3593391813653268, "learning_rate": 1.3691646217682452e-05, "loss": 0.0193, "step": 7138 }, { "epoch": 3.2479526842584168, "grad_norm": 0.2984591808834275, "learning_rate": 1.3685273101967344e-05, "loss": 0.009, "step": 7139 }, { "epoch": 3.248407643312102, "grad_norm": 0.2836380247406946, "learning_rate": 1.3678900910840627e-05, "loss": 0.0071, "step": 7140 }, { "epoch": 3.248862602365787, "grad_norm": 0.20508990391594364, "learning_rate": 1.3672529644823004e-05, "loss": 0.0095, "step": 7141 }, { "epoch": 3.2493175614194723, "grad_norm": 0.5885077821540676, "learning_rate": 1.3666159304435102e-05, "loss": 0.0177, "step": 7142 }, { "epoch": 3.2497725204731576, "grad_norm": 0.3573230159946327, "learning_rate": 1.3659789890197473e-05, "loss": 0.0097, "step": 7143 }, { "epoch": 3.2502274795268424, "grad_norm": 0.3575759590544314, "learning_rate": 1.3653421402630596e-05, "loss": 0.0144, "step": 7144 }, { "epoch": 3.2506824385805277, "grad_norm": 0.3510283256140073, "learning_rate": 1.3647053842254895e-05, "loss": 0.0076, "step": 7145 }, { "epoch": 3.251137397634213, "grad_norm": 0.33602369255946196, "learning_rate": 1.3640687209590684e-05, "loss": 0.0231, "step": 7146 }, { "epoch": 3.251592356687898, "grad_norm": 0.38342443384287006, "learning_rate": 1.3634321505158215e-05, "loss": 0.0254, "step": 7147 }, { "epoch": 3.2520473157415832, "grad_norm": 0.27600342514788845, "learning_rate": 1.3627956729477665e-05, "loss": 0.0141, "step": 7148 }, { "epoch": 3.2525022747952685, "grad_norm": 0.4228606448617727, "learning_rate": 1.3621592883069129e-05, "loss": 0.0277, "step": 7149 }, { "epoch": 3.2529572338489534, "grad_norm": 0.314388414580674, "learning_rate": 1.3615229966452637e-05, "loss": 0.0163, "step": 7150 }, { "epoch": 3.2534121929026387, "grad_norm": 0.20641037437509271, "learning_rate": 1.3608867980148147e-05, "loss": 0.0077, "step": 7151 }, { "epoch": 3.253867151956324, "grad_norm": 0.4912804676637108, "learning_rate": 1.3602506924675523e-05, "loss": 0.0216, "step": 7152 }, { "epoch": 3.2543221110100093, "grad_norm": 0.41188319981308336, "learning_rate": 1.3596146800554565e-05, "loss": 0.0127, "step": 7153 }, { "epoch": 3.254777070063694, "grad_norm": 0.33925051806923456, "learning_rate": 1.3589787608304979e-05, "loss": 0.0308, "step": 7154 }, { "epoch": 3.2552320291173795, "grad_norm": 0.21901350379700543, "learning_rate": 1.3583429348446433e-05, "loss": 0.0115, "step": 7155 }, { "epoch": 3.255686988171065, "grad_norm": 0.3032918505344438, "learning_rate": 1.3577072021498483e-05, "loss": 0.0104, "step": 7156 }, { "epoch": 3.2561419472247497, "grad_norm": 0.22249430234500017, "learning_rate": 1.3570715627980613e-05, "loss": 0.0055, "step": 7157 }, { "epoch": 3.256596906278435, "grad_norm": 0.257085689089563, "learning_rate": 1.356436016841226e-05, "loss": 0.014, "step": 7158 }, { "epoch": 3.2570518653321203, "grad_norm": 0.3634555075917952, "learning_rate": 1.3558005643312738e-05, "loss": 0.0164, "step": 7159 }, { "epoch": 3.257506824385805, "grad_norm": 0.20643953561703374, "learning_rate": 1.3551652053201333e-05, "loss": 0.006, "step": 7160 }, { "epoch": 3.2579617834394905, "grad_norm": 0.28562822407987176, "learning_rate": 1.3545299398597223e-05, "loss": 0.0186, "step": 7161 }, { "epoch": 3.258416742493176, "grad_norm": 0.27564232751023116, "learning_rate": 1.3538947680019514e-05, "loss": 0.0066, "step": 7162 }, { "epoch": 3.2588717015468607, "grad_norm": 0.43312913688723315, "learning_rate": 1.3532596897987238e-05, "loss": 0.0133, "step": 7163 }, { "epoch": 3.259326660600546, "grad_norm": 0.3317464671637731, "learning_rate": 1.3526247053019353e-05, "loss": 0.0204, "step": 7164 }, { "epoch": 3.2597816196542313, "grad_norm": 0.17149490750152238, "learning_rate": 1.3519898145634757e-05, "loss": 0.0057, "step": 7165 }, { "epoch": 3.260236578707916, "grad_norm": 0.27716901126871135, "learning_rate": 1.3513550176352241e-05, "loss": 0.0088, "step": 7166 }, { "epoch": 3.2606915377616015, "grad_norm": 0.2523508373336315, "learning_rate": 1.3507203145690527e-05, "loss": 0.0043, "step": 7167 }, { "epoch": 3.261146496815287, "grad_norm": 0.34871732913520825, "learning_rate": 1.3500857054168267e-05, "loss": 0.0204, "step": 7168 }, { "epoch": 3.2616014558689717, "grad_norm": 0.3013537208251628, "learning_rate": 1.3494511902304047e-05, "loss": 0.0088, "step": 7169 }, { "epoch": 3.262056414922657, "grad_norm": 0.20632658014660168, "learning_rate": 1.3488167690616354e-05, "loss": 0.0056, "step": 7170 }, { "epoch": 3.2625113739763423, "grad_norm": 0.38152004093155345, "learning_rate": 1.3481824419623604e-05, "loss": 0.0174, "step": 7171 }, { "epoch": 3.262966333030027, "grad_norm": 0.4164584181653351, "learning_rate": 1.3475482089844155e-05, "loss": 0.0185, "step": 7172 }, { "epoch": 3.2634212920837125, "grad_norm": 0.2597663895102382, "learning_rate": 1.3469140701796254e-05, "loss": 0.0073, "step": 7173 }, { "epoch": 3.2638762511373978, "grad_norm": 0.3393602986299218, "learning_rate": 1.3462800255998115e-05, "loss": 0.0096, "step": 7174 }, { "epoch": 3.2643312101910826, "grad_norm": 0.4048839362144346, "learning_rate": 1.3456460752967834e-05, "loss": 0.0254, "step": 7175 }, { "epoch": 3.264786169244768, "grad_norm": 0.30219873881995035, "learning_rate": 1.3450122193223452e-05, "loss": 0.0069, "step": 7176 }, { "epoch": 3.2652411282984533, "grad_norm": 0.20402083726728773, "learning_rate": 1.3443784577282914e-05, "loss": 0.0068, "step": 7177 }, { "epoch": 3.265696087352138, "grad_norm": 0.3525160782963367, "learning_rate": 1.3437447905664114e-05, "loss": 0.0327, "step": 7178 }, { "epoch": 3.2661510464058234, "grad_norm": 0.3260620416807946, "learning_rate": 1.3431112178884866e-05, "loss": 0.016, "step": 7179 }, { "epoch": 3.2666060054595087, "grad_norm": 0.34046622585351144, "learning_rate": 1.3424777397462885e-05, "loss": 0.0096, "step": 7180 }, { "epoch": 3.2670609645131936, "grad_norm": 0.2743632459605044, "learning_rate": 1.3418443561915822e-05, "loss": 0.0065, "step": 7181 }, { "epoch": 3.267515923566879, "grad_norm": 0.40854824460821565, "learning_rate": 1.3412110672761242e-05, "loss": 0.0138, "step": 7182 }, { "epoch": 3.2679708826205642, "grad_norm": 0.2844099117162366, "learning_rate": 1.3405778730516655e-05, "loss": 0.0115, "step": 7183 }, { "epoch": 3.268425841674249, "grad_norm": 0.40753709783170494, "learning_rate": 1.3399447735699471e-05, "loss": 0.0272, "step": 7184 }, { "epoch": 3.2688808007279344, "grad_norm": 0.6336275014418743, "learning_rate": 1.339311768882702e-05, "loss": 0.0406, "step": 7185 }, { "epoch": 3.2693357597816197, "grad_norm": 0.23430081700218938, "learning_rate": 1.3386788590416585e-05, "loss": 0.0103, "step": 7186 }, { "epoch": 3.2697907188353046, "grad_norm": 0.23606211173601935, "learning_rate": 1.3380460440985343e-05, "loss": 0.0067, "step": 7187 }, { "epoch": 3.27024567788899, "grad_norm": 0.2638543162565105, "learning_rate": 1.3374133241050391e-05, "loss": 0.0108, "step": 7188 }, { "epoch": 3.270700636942675, "grad_norm": 0.3799081999584277, "learning_rate": 1.3367806991128775e-05, "loss": 0.0448, "step": 7189 }, { "epoch": 3.2711555959963605, "grad_norm": 0.29300178410870786, "learning_rate": 1.3361481691737443e-05, "loss": 0.0098, "step": 7190 }, { "epoch": 3.2716105550500454, "grad_norm": 0.34199927077345227, "learning_rate": 1.335515734339327e-05, "loss": 0.0167, "step": 7191 }, { "epoch": 3.2720655141037307, "grad_norm": 0.30205015964845744, "learning_rate": 1.3348833946613038e-05, "loss": 0.0123, "step": 7192 }, { "epoch": 3.272520473157416, "grad_norm": 0.3911799960300734, "learning_rate": 1.3342511501913483e-05, "loss": 0.0211, "step": 7193 }, { "epoch": 3.272975432211101, "grad_norm": 0.27025118900301087, "learning_rate": 1.3336190009811251e-05, "loss": 0.0087, "step": 7194 }, { "epoch": 3.273430391264786, "grad_norm": 0.23758946886396737, "learning_rate": 1.3329869470822897e-05, "loss": 0.0069, "step": 7195 }, { "epoch": 3.2738853503184715, "grad_norm": 0.4358311354484548, "learning_rate": 1.3323549885464912e-05, "loss": 0.0185, "step": 7196 }, { "epoch": 3.2743403093721564, "grad_norm": 0.30194794341467246, "learning_rate": 1.3317231254253687e-05, "loss": 0.0093, "step": 7197 }, { "epoch": 3.2747952684258417, "grad_norm": 0.3766365074174692, "learning_rate": 1.3310913577705575e-05, "loss": 0.0238, "step": 7198 }, { "epoch": 3.275250227479527, "grad_norm": 0.25570786763149217, "learning_rate": 1.330459685633681e-05, "loss": 0.0086, "step": 7199 }, { "epoch": 3.275705186533212, "grad_norm": 0.24324326058949278, "learning_rate": 1.3298281090663583e-05, "loss": 0.0109, "step": 7200 }, { "epoch": 3.276160145586897, "grad_norm": 0.24867763179011132, "learning_rate": 1.3291966281201978e-05, "loss": 0.0116, "step": 7201 }, { "epoch": 3.2766151046405825, "grad_norm": 0.32246680302829117, "learning_rate": 1.3285652428468009e-05, "loss": 0.0125, "step": 7202 }, { "epoch": 3.2770700636942673, "grad_norm": 0.20978322456366671, "learning_rate": 1.327933953297763e-05, "loss": 0.0084, "step": 7203 }, { "epoch": 3.2775250227479527, "grad_norm": 0.2752337441399456, "learning_rate": 1.3273027595246698e-05, "loss": 0.008, "step": 7204 }, { "epoch": 3.277979981801638, "grad_norm": 0.39243164937241565, "learning_rate": 1.326671661579099e-05, "loss": 0.0168, "step": 7205 }, { "epoch": 3.278434940855323, "grad_norm": 0.22404869121450857, "learning_rate": 1.3260406595126202e-05, "loss": 0.01, "step": 7206 }, { "epoch": 3.278889899909008, "grad_norm": 0.30481444136852426, "learning_rate": 1.3254097533767974e-05, "loss": 0.006, "step": 7207 }, { "epoch": 3.2793448589626935, "grad_norm": 0.20672404634298996, "learning_rate": 1.3247789432231859e-05, "loss": 0.0072, "step": 7208 }, { "epoch": 3.2797998180163788, "grad_norm": 0.37730982441851196, "learning_rate": 1.3241482291033318e-05, "loss": 0.0082, "step": 7209 }, { "epoch": 3.2802547770700636, "grad_norm": 0.39703891388959384, "learning_rate": 1.3235176110687747e-05, "loss": 0.0203, "step": 7210 }, { "epoch": 3.280709736123749, "grad_norm": 0.2742227117560313, "learning_rate": 1.3228870891710443e-05, "loss": 0.008, "step": 7211 }, { "epoch": 3.2811646951774343, "grad_norm": 0.2352676644042824, "learning_rate": 1.3222566634616662e-05, "loss": 0.0072, "step": 7212 }, { "epoch": 3.281619654231119, "grad_norm": 0.18345854380351925, "learning_rate": 1.3216263339921536e-05, "loss": 0.005, "step": 7213 }, { "epoch": 3.2820746132848044, "grad_norm": 0.35751316061023225, "learning_rate": 1.320996100814017e-05, "loss": 0.0224, "step": 7214 }, { "epoch": 3.2825295723384897, "grad_norm": 0.369716462790721, "learning_rate": 1.3203659639787544e-05, "loss": 0.0109, "step": 7215 }, { "epoch": 3.2829845313921746, "grad_norm": 0.552951586021487, "learning_rate": 1.319735923537857e-05, "loss": 0.032, "step": 7216 }, { "epoch": 3.28343949044586, "grad_norm": 0.3321896724909968, "learning_rate": 1.3191059795428113e-05, "loss": 0.0184, "step": 7217 }, { "epoch": 3.2838944494995452, "grad_norm": 0.2941692771707654, "learning_rate": 1.3184761320450917e-05, "loss": 0.0215, "step": 7218 }, { "epoch": 3.28434940855323, "grad_norm": 0.32922332347218725, "learning_rate": 1.3178463810961673e-05, "loss": 0.0217, "step": 7219 }, { "epoch": 3.2848043676069154, "grad_norm": 0.347418445361241, "learning_rate": 1.3172167267474967e-05, "loss": 0.019, "step": 7220 }, { "epoch": 3.2852593266606007, "grad_norm": 0.31079364153095645, "learning_rate": 1.3165871690505337e-05, "loss": 0.0127, "step": 7221 }, { "epoch": 3.2857142857142856, "grad_norm": 0.3718807833547884, "learning_rate": 1.3159577080567242e-05, "loss": 0.0124, "step": 7222 }, { "epoch": 3.286169244767971, "grad_norm": 0.20880985255776235, "learning_rate": 1.3153283438175034e-05, "loss": 0.0037, "step": 7223 }, { "epoch": 3.286624203821656, "grad_norm": 0.4432340483848916, "learning_rate": 1.314699076384301e-05, "loss": 0.0222, "step": 7224 }, { "epoch": 3.287079162875341, "grad_norm": 0.2946579784767494, "learning_rate": 1.3140699058085367e-05, "loss": 0.0261, "step": 7225 }, { "epoch": 3.2875341219290264, "grad_norm": 0.28670045326932964, "learning_rate": 1.3134408321416236e-05, "loss": 0.0091, "step": 7226 }, { "epoch": 3.2879890809827117, "grad_norm": 0.362703136731676, "learning_rate": 1.3128118554349669e-05, "loss": 0.0112, "step": 7227 }, { "epoch": 3.2884440400363966, "grad_norm": 0.3201846050036119, "learning_rate": 1.3121829757399651e-05, "loss": 0.0083, "step": 7228 }, { "epoch": 3.288898999090082, "grad_norm": 0.2687577094694049, "learning_rate": 1.3115541931080066e-05, "loss": 0.0045, "step": 7229 }, { "epoch": 3.289353958143767, "grad_norm": 0.4955435305853816, "learning_rate": 1.3109255075904725e-05, "loss": 0.0222, "step": 7230 }, { "epoch": 3.289808917197452, "grad_norm": 0.43117165062647694, "learning_rate": 1.3102969192387349e-05, "loss": 0.0289, "step": 7231 }, { "epoch": 3.2902638762511374, "grad_norm": 0.20410485725843128, "learning_rate": 1.3096684281041613e-05, "loss": 0.0089, "step": 7232 }, { "epoch": 3.2907188353048227, "grad_norm": 0.26811962910791826, "learning_rate": 1.3090400342381084e-05, "loss": 0.0216, "step": 7233 }, { "epoch": 3.2911737943585075, "grad_norm": 0.27931192253195036, "learning_rate": 1.3084117376919247e-05, "loss": 0.019, "step": 7234 }, { "epoch": 3.291628753412193, "grad_norm": 0.5788207240929748, "learning_rate": 1.3077835385169535e-05, "loss": 0.0134, "step": 7235 }, { "epoch": 3.292083712465878, "grad_norm": 0.27160690125753123, "learning_rate": 1.3071554367645267e-05, "loss": 0.0154, "step": 7236 }, { "epoch": 3.292538671519563, "grad_norm": 0.4034225091515354, "learning_rate": 1.3065274324859716e-05, "loss": 0.0176, "step": 7237 }, { "epoch": 3.2929936305732483, "grad_norm": 0.2922889538598251, "learning_rate": 1.305899525732605e-05, "loss": 0.0149, "step": 7238 }, { "epoch": 3.2934485896269337, "grad_norm": 0.34202159476496774, "learning_rate": 1.3052717165557366e-05, "loss": 0.0153, "step": 7239 }, { "epoch": 3.2939035486806185, "grad_norm": 0.21732337713325112, "learning_rate": 1.3046440050066675e-05, "loss": 0.016, "step": 7240 }, { "epoch": 3.294358507734304, "grad_norm": 0.2332802661553695, "learning_rate": 1.3040163911366917e-05, "loss": 0.0094, "step": 7241 }, { "epoch": 3.294813466787989, "grad_norm": 0.28564111891037985, "learning_rate": 1.3033888749970969e-05, "loss": 0.0076, "step": 7242 }, { "epoch": 3.295268425841674, "grad_norm": 0.7540675050540175, "learning_rate": 1.3027614566391588e-05, "loss": 0.0736, "step": 7243 }, { "epoch": 3.2957233848953593, "grad_norm": 0.3143592258090815, "learning_rate": 1.3021341361141481e-05, "loss": 0.0146, "step": 7244 }, { "epoch": 3.2961783439490446, "grad_norm": 0.2564267900187642, "learning_rate": 1.3015069134733254e-05, "loss": 0.0121, "step": 7245 }, { "epoch": 3.29663330300273, "grad_norm": 0.4110793406621682, "learning_rate": 1.3008797887679463e-05, "loss": 0.031, "step": 7246 }, { "epoch": 3.297088262056415, "grad_norm": 0.26917560328410073, "learning_rate": 1.3002527620492556e-05, "loss": 0.0072, "step": 7247 }, { "epoch": 3.2975432211101, "grad_norm": 0.555701334954897, "learning_rate": 1.2996258333684902e-05, "loss": 0.0408, "step": 7248 }, { "epoch": 3.2979981801637854, "grad_norm": 0.35271543633073965, "learning_rate": 1.298999002776882e-05, "loss": 0.0124, "step": 7249 }, { "epoch": 3.2984531392174703, "grad_norm": 0.32186255967145827, "learning_rate": 1.2983722703256507e-05, "loss": 0.0248, "step": 7250 }, { "epoch": 3.2989080982711556, "grad_norm": 0.40887121550705924, "learning_rate": 1.2977456360660117e-05, "loss": 0.0134, "step": 7251 }, { "epoch": 3.299363057324841, "grad_norm": 0.3850205627227336, "learning_rate": 1.29711910004917e-05, "loss": 0.0153, "step": 7252 }, { "epoch": 3.299818016378526, "grad_norm": 0.342537911701963, "learning_rate": 1.2964926623263232e-05, "loss": 0.0133, "step": 7253 }, { "epoch": 3.300272975432211, "grad_norm": 0.3764451685757201, "learning_rate": 1.2958663229486612e-05, "loss": 0.0279, "step": 7254 }, { "epoch": 3.3007279344858964, "grad_norm": 0.29736597590148534, "learning_rate": 1.2952400819673636e-05, "loss": 0.017, "step": 7255 }, { "epoch": 3.3011828935395813, "grad_norm": 0.32010399739004136, "learning_rate": 1.2946139394336076e-05, "loss": 0.0252, "step": 7256 }, { "epoch": 3.3016378525932666, "grad_norm": 0.34015908132187767, "learning_rate": 1.2939878953985571e-05, "loss": 0.0153, "step": 7257 }, { "epoch": 3.302092811646952, "grad_norm": 0.2820200834464035, "learning_rate": 1.2933619499133693e-05, "loss": 0.0116, "step": 7258 }, { "epoch": 3.3025477707006368, "grad_norm": 0.29169194732859005, "learning_rate": 1.292736103029194e-05, "loss": 0.0038, "step": 7259 }, { "epoch": 3.303002729754322, "grad_norm": 0.25549881555157034, "learning_rate": 1.2921103547971713e-05, "loss": 0.0183, "step": 7260 }, { "epoch": 3.3034576888080074, "grad_norm": 0.24299385228539627, "learning_rate": 1.2914847052684371e-05, "loss": 0.0099, "step": 7261 }, { "epoch": 3.3039126478616927, "grad_norm": 0.47273098876474834, "learning_rate": 1.2908591544941138e-05, "loss": 0.0235, "step": 7262 }, { "epoch": 3.3043676069153776, "grad_norm": 0.15884533704897713, "learning_rate": 1.2902337025253208e-05, "loss": 0.0069, "step": 7263 }, { "epoch": 3.304822565969063, "grad_norm": 0.2528931977784097, "learning_rate": 1.2896083494131666e-05, "loss": 0.008, "step": 7264 }, { "epoch": 3.305277525022748, "grad_norm": 0.2838122718747626, "learning_rate": 1.2889830952087511e-05, "loss": 0.0135, "step": 7265 }, { "epoch": 3.305732484076433, "grad_norm": 0.27965217048230373, "learning_rate": 1.288357939963169e-05, "loss": 0.013, "step": 7266 }, { "epoch": 3.3061874431301184, "grad_norm": 0.377478070482321, "learning_rate": 1.2877328837275044e-05, "loss": 0.0247, "step": 7267 }, { "epoch": 3.3066424021838037, "grad_norm": 0.418423181212512, "learning_rate": 1.2871079265528336e-05, "loss": 0.0174, "step": 7268 }, { "epoch": 3.3070973612374885, "grad_norm": 0.3471832781926896, "learning_rate": 1.2864830684902252e-05, "loss": 0.0168, "step": 7269 }, { "epoch": 3.307552320291174, "grad_norm": 0.32149392083944855, "learning_rate": 1.28585830959074e-05, "loss": 0.013, "step": 7270 }, { "epoch": 3.308007279344859, "grad_norm": 0.18120951411887642, "learning_rate": 1.2852336499054319e-05, "loss": 0.0074, "step": 7271 }, { "epoch": 3.308462238398544, "grad_norm": 0.14727068760380801, "learning_rate": 1.2846090894853437e-05, "loss": 0.0053, "step": 7272 }, { "epoch": 3.3089171974522293, "grad_norm": 0.2678777367655011, "learning_rate": 1.2839846283815124e-05, "loss": 0.017, "step": 7273 }, { "epoch": 3.3093721565059147, "grad_norm": 0.2915369307355007, "learning_rate": 1.2833602666449645e-05, "loss": 0.0143, "step": 7274 }, { "epoch": 3.3098271155595995, "grad_norm": 0.15512545257711935, "learning_rate": 1.2827360043267228e-05, "loss": 0.0058, "step": 7275 }, { "epoch": 3.310282074613285, "grad_norm": 0.5439921219013062, "learning_rate": 1.2821118414777964e-05, "loss": 0.0341, "step": 7276 }, { "epoch": 3.31073703366697, "grad_norm": 0.5090103651210361, "learning_rate": 1.2814877781491913e-05, "loss": 0.0394, "step": 7277 }, { "epoch": 3.311191992720655, "grad_norm": 0.44850844924063854, "learning_rate": 1.2808638143919021e-05, "loss": 0.0187, "step": 7278 }, { "epoch": 3.3116469517743403, "grad_norm": 0.3461802418995437, "learning_rate": 1.280239950256916e-05, "loss": 0.0123, "step": 7279 }, { "epoch": 3.3121019108280256, "grad_norm": 0.2704123802016209, "learning_rate": 1.2796161857952133e-05, "loss": 0.0124, "step": 7280 }, { "epoch": 3.3125568698817105, "grad_norm": 0.2778139433312784, "learning_rate": 1.2789925210577647e-05, "loss": 0.0156, "step": 7281 }, { "epoch": 3.313011828935396, "grad_norm": 0.3753474475913387, "learning_rate": 1.2783689560955336e-05, "loss": 0.0165, "step": 7282 }, { "epoch": 3.313466787989081, "grad_norm": 0.28637660414178645, "learning_rate": 1.2777454909594733e-05, "loss": 0.0167, "step": 7283 }, { "epoch": 3.313921747042766, "grad_norm": 0.2561805000400296, "learning_rate": 1.2771221257005317e-05, "loss": 0.0109, "step": 7284 }, { "epoch": 3.3143767060964513, "grad_norm": 0.4795157370597914, "learning_rate": 1.2764988603696486e-05, "loss": 0.0097, "step": 7285 }, { "epoch": 3.3148316651501366, "grad_norm": 0.3108141030092025, "learning_rate": 1.2758756950177537e-05, "loss": 0.024, "step": 7286 }, { "epoch": 3.3152866242038215, "grad_norm": 0.2948334348050522, "learning_rate": 1.2752526296957684e-05, "loss": 0.0153, "step": 7287 }, { "epoch": 3.315741583257507, "grad_norm": 0.38069758673094073, "learning_rate": 1.2746296644546068e-05, "loss": 0.0222, "step": 7288 }, { "epoch": 3.316196542311192, "grad_norm": 0.21278633404552788, "learning_rate": 1.274006799345176e-05, "loss": 0.0063, "step": 7289 }, { "epoch": 3.316651501364877, "grad_norm": 0.40958943338578896, "learning_rate": 1.2733840344183718e-05, "loss": 0.0158, "step": 7290 }, { "epoch": 3.3171064604185623, "grad_norm": 0.31561386002237374, "learning_rate": 1.2727613697250861e-05, "loss": 0.0224, "step": 7291 }, { "epoch": 3.3175614194722476, "grad_norm": 0.22027935896850243, "learning_rate": 1.2721388053161992e-05, "loss": 0.0092, "step": 7292 }, { "epoch": 3.3180163785259325, "grad_norm": 0.25915386001529306, "learning_rate": 1.2715163412425846e-05, "loss": 0.0101, "step": 7293 }, { "epoch": 3.3184713375796178, "grad_norm": 0.1806286629200526, "learning_rate": 1.2708939775551052e-05, "loss": 0.0099, "step": 7294 }, { "epoch": 3.318926296633303, "grad_norm": 0.4546206396355657, "learning_rate": 1.2702717143046205e-05, "loss": 0.014, "step": 7295 }, { "epoch": 3.319381255686988, "grad_norm": 0.3931969735515987, "learning_rate": 1.269649551541978e-05, "loss": 0.0137, "step": 7296 }, { "epoch": 3.3198362147406733, "grad_norm": 0.4089379101138908, "learning_rate": 1.2690274893180168e-05, "loss": 0.0239, "step": 7297 }, { "epoch": 3.3202911737943586, "grad_norm": 0.3067686616936551, "learning_rate": 1.2684055276835713e-05, "loss": 0.0151, "step": 7298 }, { "epoch": 3.3207461328480434, "grad_norm": 0.287873980243183, "learning_rate": 1.2677836666894633e-05, "loss": 0.0085, "step": 7299 }, { "epoch": 3.3212010919017287, "grad_norm": 0.25894532531528475, "learning_rate": 1.2671619063865101e-05, "loss": 0.0112, "step": 7300 }, { "epoch": 3.321656050955414, "grad_norm": 0.2980747514234426, "learning_rate": 1.2665402468255186e-05, "loss": 0.0175, "step": 7301 }, { "epoch": 3.3221110100090994, "grad_norm": 0.3131834848190865, "learning_rate": 1.265918688057288e-05, "loss": 0.014, "step": 7302 }, { "epoch": 3.3225659690627842, "grad_norm": 0.20850654014729614, "learning_rate": 1.2652972301326083e-05, "loss": 0.006, "step": 7303 }, { "epoch": 3.3230209281164695, "grad_norm": 0.353818239076349, "learning_rate": 1.2646758731022626e-05, "loss": 0.0224, "step": 7304 }, { "epoch": 3.323475887170155, "grad_norm": 0.31352894299081674, "learning_rate": 1.264054617017027e-05, "loss": 0.0066, "step": 7305 }, { "epoch": 3.3239308462238397, "grad_norm": 0.31654598118286525, "learning_rate": 1.2634334619276669e-05, "loss": 0.0141, "step": 7306 }, { "epoch": 3.324385805277525, "grad_norm": 0.46655918765348803, "learning_rate": 1.2628124078849398e-05, "loss": 0.0192, "step": 7307 }, { "epoch": 3.3248407643312103, "grad_norm": 0.21487880641351723, "learning_rate": 1.2621914549395946e-05, "loss": 0.0051, "step": 7308 }, { "epoch": 3.325295723384895, "grad_norm": 0.36926651162737933, "learning_rate": 1.2615706031423752e-05, "loss": 0.0101, "step": 7309 }, { "epoch": 3.3257506824385805, "grad_norm": 0.3271789147365313, "learning_rate": 1.260949852544013e-05, "loss": 0.0144, "step": 7310 }, { "epoch": 3.326205641492266, "grad_norm": 0.3172275821675618, "learning_rate": 1.2603292031952324e-05, "loss": 0.0153, "step": 7311 }, { "epoch": 3.3266606005459507, "grad_norm": 0.41614461895873983, "learning_rate": 1.259708655146752e-05, "loss": 0.0258, "step": 7312 }, { "epoch": 3.327115559599636, "grad_norm": 0.4502674573304287, "learning_rate": 1.2590882084492784e-05, "loss": 0.0269, "step": 7313 }, { "epoch": 3.3275705186533213, "grad_norm": 0.6617509519886727, "learning_rate": 1.2584678631535134e-05, "loss": 0.0139, "step": 7314 }, { "epoch": 3.328025477707006, "grad_norm": 0.5023760189568226, "learning_rate": 1.257847619310148e-05, "loss": 0.0127, "step": 7315 }, { "epoch": 3.3284804367606915, "grad_norm": 0.2621113993843152, "learning_rate": 1.2572274769698655e-05, "loss": 0.0109, "step": 7316 }, { "epoch": 3.328935395814377, "grad_norm": 0.4415191508911057, "learning_rate": 1.2566074361833402e-05, "loss": 0.0156, "step": 7317 }, { "epoch": 3.329390354868062, "grad_norm": 0.3882435359826308, "learning_rate": 1.2559874970012403e-05, "loss": 0.0384, "step": 7318 }, { "epoch": 3.329845313921747, "grad_norm": 0.379810602357138, "learning_rate": 1.255367659474225e-05, "loss": 0.0249, "step": 7319 }, { "epoch": 3.3303002729754323, "grad_norm": 0.2793373003276657, "learning_rate": 1.2547479236529442e-05, "loss": 0.0145, "step": 7320 }, { "epoch": 3.3307552320291176, "grad_norm": 0.27907175168094095, "learning_rate": 1.254128289588039e-05, "loss": 0.0238, "step": 7321 }, { "epoch": 3.3312101910828025, "grad_norm": 0.30684377529736995, "learning_rate": 1.2535087573301432e-05, "loss": 0.0066, "step": 7322 }, { "epoch": 3.331665150136488, "grad_norm": 0.45619613101887496, "learning_rate": 1.2528893269298835e-05, "loss": 0.0085, "step": 7323 }, { "epoch": 3.332120109190173, "grad_norm": 0.19827663751395938, "learning_rate": 1.252269998437876e-05, "loss": 0.0092, "step": 7324 }, { "epoch": 3.332575068243858, "grad_norm": 0.29385629913687267, "learning_rate": 1.2516507719047287e-05, "loss": 0.0152, "step": 7325 }, { "epoch": 3.3330300272975433, "grad_norm": 0.28368021882961675, "learning_rate": 1.2510316473810435e-05, "loss": 0.0129, "step": 7326 }, { "epoch": 3.3334849863512286, "grad_norm": 0.3843250834769873, "learning_rate": 1.2504126249174114e-05, "loss": 0.0163, "step": 7327 }, { "epoch": 3.3339399454049135, "grad_norm": 0.3117640306321617, "learning_rate": 1.249793704564417e-05, "loss": 0.023, "step": 7328 }, { "epoch": 3.3343949044585988, "grad_norm": 0.3809385996816368, "learning_rate": 1.2491748863726351e-05, "loss": 0.0216, "step": 7329 }, { "epoch": 3.334849863512284, "grad_norm": 0.22565445056437236, "learning_rate": 1.2485561703926332e-05, "loss": 0.011, "step": 7330 }, { "epoch": 3.335304822565969, "grad_norm": 0.32635384700431513, "learning_rate": 1.2479375566749694e-05, "loss": 0.0123, "step": 7331 }, { "epoch": 3.3357597816196543, "grad_norm": 0.32860519720905046, "learning_rate": 1.2473190452701933e-05, "loss": 0.0207, "step": 7332 }, { "epoch": 3.3362147406733396, "grad_norm": 0.354713169049671, "learning_rate": 1.2467006362288474e-05, "loss": 0.0065, "step": 7333 }, { "epoch": 3.3366696997270244, "grad_norm": 0.3008873503175419, "learning_rate": 1.246082329601467e-05, "loss": 0.0198, "step": 7334 }, { "epoch": 3.3371246587807097, "grad_norm": 0.27823558868791815, "learning_rate": 1.245464125438576e-05, "loss": 0.0122, "step": 7335 }, { "epoch": 3.337579617834395, "grad_norm": 0.2930858060588547, "learning_rate": 1.2448460237906911e-05, "loss": 0.0136, "step": 7336 }, { "epoch": 3.33803457688808, "grad_norm": 0.42215852153042005, "learning_rate": 1.2442280247083198e-05, "loss": 0.0282, "step": 7337 }, { "epoch": 3.3384895359417652, "grad_norm": 0.2864273129174413, "learning_rate": 1.2436101282419647e-05, "loss": 0.0204, "step": 7338 }, { "epoch": 3.3389444949954505, "grad_norm": 0.5604981292083647, "learning_rate": 1.242992334442115e-05, "loss": 0.0211, "step": 7339 }, { "epoch": 3.3393994540491354, "grad_norm": 0.3682506741828671, "learning_rate": 1.2423746433592556e-05, "loss": 0.0157, "step": 7340 }, { "epoch": 3.3398544131028207, "grad_norm": 0.3187039399672488, "learning_rate": 1.2417570550438615e-05, "loss": 0.012, "step": 7341 }, { "epoch": 3.340309372156506, "grad_norm": 0.8455567428799347, "learning_rate": 1.2411395695463976e-05, "loss": 0.0528, "step": 7342 }, { "epoch": 3.340764331210191, "grad_norm": 0.3042910817113273, "learning_rate": 1.2405221869173238e-05, "loss": 0.0201, "step": 7343 }, { "epoch": 3.341219290263876, "grad_norm": 0.36515699239477445, "learning_rate": 1.2399049072070894e-05, "loss": 0.0184, "step": 7344 }, { "epoch": 3.3416742493175615, "grad_norm": 0.4154088585964866, "learning_rate": 1.2392877304661358e-05, "loss": 0.0105, "step": 7345 }, { "epoch": 3.3421292083712464, "grad_norm": 0.3018955275610592, "learning_rate": 1.2386706567448941e-05, "loss": 0.021, "step": 7346 }, { "epoch": 3.3425841674249317, "grad_norm": 0.26113311575791265, "learning_rate": 1.2380536860937902e-05, "loss": 0.0068, "step": 7347 }, { "epoch": 3.343039126478617, "grad_norm": 0.1601509758643776, "learning_rate": 1.2374368185632412e-05, "loss": 0.0051, "step": 7348 }, { "epoch": 3.343494085532302, "grad_norm": 0.18237354919579252, "learning_rate": 1.2368200542036537e-05, "loss": 0.0179, "step": 7349 }, { "epoch": 3.343949044585987, "grad_norm": 0.4455253093798334, "learning_rate": 1.236203393065427e-05, "loss": 0.0205, "step": 7350 }, { "epoch": 3.3444040036396725, "grad_norm": 0.266443956777735, "learning_rate": 1.2355868351989509e-05, "loss": 0.0101, "step": 7351 }, { "epoch": 3.3448589626933574, "grad_norm": 0.6045992301905436, "learning_rate": 1.2349703806546092e-05, "loss": 0.0315, "step": 7352 }, { "epoch": 3.3453139217470427, "grad_norm": 0.8145791634746437, "learning_rate": 1.2343540294827746e-05, "loss": 0.0563, "step": 7353 }, { "epoch": 3.345768880800728, "grad_norm": 0.48017613318563507, "learning_rate": 1.2337377817338139e-05, "loss": 0.0345, "step": 7354 }, { "epoch": 3.3462238398544133, "grad_norm": 0.2035501789218177, "learning_rate": 1.2331216374580831e-05, "loss": 0.0066, "step": 7355 }, { "epoch": 3.346678798908098, "grad_norm": 0.21882290275564012, "learning_rate": 1.2325055967059302e-05, "loss": 0.0084, "step": 7356 }, { "epoch": 3.3471337579617835, "grad_norm": 0.3792384552672922, "learning_rate": 1.2318896595276968e-05, "loss": 0.0147, "step": 7357 }, { "epoch": 3.347588717015469, "grad_norm": 0.24024094161465717, "learning_rate": 1.231273825973714e-05, "loss": 0.0075, "step": 7358 }, { "epoch": 3.3480436760691537, "grad_norm": 0.16514570970568643, "learning_rate": 1.2306580960943043e-05, "loss": 0.0039, "step": 7359 }, { "epoch": 3.348498635122839, "grad_norm": 0.25437589198905963, "learning_rate": 1.2300424699397817e-05, "loss": 0.0109, "step": 7360 }, { "epoch": 3.3489535941765243, "grad_norm": 0.42358463333153845, "learning_rate": 1.2294269475604536e-05, "loss": 0.0359, "step": 7361 }, { "epoch": 3.349408553230209, "grad_norm": 0.2973043756058589, "learning_rate": 1.2288115290066182e-05, "loss": 0.0122, "step": 7362 }, { "epoch": 3.3498635122838945, "grad_norm": 0.27518630840220126, "learning_rate": 1.2281962143285641e-05, "loss": 0.012, "step": 7363 }, { "epoch": 3.3503184713375798, "grad_norm": 0.5038697825075006, "learning_rate": 1.227581003576572e-05, "loss": 0.0205, "step": 7364 }, { "epoch": 3.3507734303912646, "grad_norm": 0.1441356033901506, "learning_rate": 1.2269658968009143e-05, "loss": 0.0049, "step": 7365 }, { "epoch": 3.35122838944495, "grad_norm": 0.41363029043217314, "learning_rate": 1.2263508940518534e-05, "loss": 0.0121, "step": 7366 }, { "epoch": 3.3516833484986353, "grad_norm": 0.5594702225933883, "learning_rate": 1.2257359953796455e-05, "loss": 0.0298, "step": 7367 }, { "epoch": 3.35213830755232, "grad_norm": 0.2565406276190331, "learning_rate": 1.2251212008345387e-05, "loss": 0.0168, "step": 7368 }, { "epoch": 3.3525932666060054, "grad_norm": 0.5873982020671791, "learning_rate": 1.22450651046677e-05, "loss": 0.0177, "step": 7369 }, { "epoch": 3.3530482256596907, "grad_norm": 1.3347841253157926, "learning_rate": 1.2238919243265692e-05, "loss": 0.0312, "step": 7370 }, { "epoch": 3.3535031847133756, "grad_norm": 0.45911924420701655, "learning_rate": 1.2232774424641565e-05, "loss": 0.0241, "step": 7371 }, { "epoch": 3.353958143767061, "grad_norm": 0.3319174319455619, "learning_rate": 1.2226630649297466e-05, "loss": 0.0144, "step": 7372 }, { "epoch": 3.3544131028207462, "grad_norm": 0.3022991010797471, "learning_rate": 1.2220487917735426e-05, "loss": 0.0189, "step": 7373 }, { "epoch": 3.3548680618744315, "grad_norm": 0.3398266996362851, "learning_rate": 1.221434623045739e-05, "loss": 0.0168, "step": 7374 }, { "epoch": 3.3553230209281164, "grad_norm": 0.3116671406488034, "learning_rate": 1.2208205587965255e-05, "loss": 0.0159, "step": 7375 }, { "epoch": 3.3557779799818017, "grad_norm": 0.30602233985226773, "learning_rate": 1.220206599076078e-05, "loss": 0.012, "step": 7376 }, { "epoch": 3.356232939035487, "grad_norm": 0.4541176094250105, "learning_rate": 1.2195927439345687e-05, "loss": 0.0192, "step": 7377 }, { "epoch": 3.356687898089172, "grad_norm": 0.15986779176376875, "learning_rate": 1.2189789934221579e-05, "loss": 0.0052, "step": 7378 }, { "epoch": 3.357142857142857, "grad_norm": 0.28195780506011603, "learning_rate": 1.218365347588999e-05, "loss": 0.0095, "step": 7379 }, { "epoch": 3.3575978161965425, "grad_norm": 0.25148414257098467, "learning_rate": 1.217751806485235e-05, "loss": 0.0162, "step": 7380 }, { "epoch": 3.3580527752502274, "grad_norm": 0.26703405110355866, "learning_rate": 1.2171383701610026e-05, "loss": 0.0198, "step": 7381 }, { "epoch": 3.3585077343039127, "grad_norm": 0.2541655245880142, "learning_rate": 1.2165250386664303e-05, "loss": 0.0124, "step": 7382 }, { "epoch": 3.358962693357598, "grad_norm": 0.2588156216411319, "learning_rate": 1.215911812051636e-05, "loss": 0.0115, "step": 7383 }, { "epoch": 3.359417652411283, "grad_norm": 0.7829262315464981, "learning_rate": 1.2152986903667293e-05, "loss": 0.0128, "step": 7384 }, { "epoch": 3.359872611464968, "grad_norm": 0.1509982919412134, "learning_rate": 1.214685673661811e-05, "loss": 0.0026, "step": 7385 }, { "epoch": 3.3603275705186535, "grad_norm": 0.2453296237616627, "learning_rate": 1.214072761986976e-05, "loss": 0.013, "step": 7386 }, { "epoch": 3.3607825295723384, "grad_norm": 0.3542360165603526, "learning_rate": 1.2134599553923076e-05, "loss": 0.0212, "step": 7387 }, { "epoch": 3.3612374886260237, "grad_norm": 0.35214115503733223, "learning_rate": 1.2128472539278809e-05, "loss": 0.0168, "step": 7388 }, { "epoch": 3.361692447679709, "grad_norm": 0.4830154916529633, "learning_rate": 1.2122346576437649e-05, "loss": 0.0153, "step": 7389 }, { "epoch": 3.362147406733394, "grad_norm": 0.27669991221540696, "learning_rate": 1.2116221665900159e-05, "loss": 0.0102, "step": 7390 }, { "epoch": 3.362602365787079, "grad_norm": 0.3775642348272993, "learning_rate": 1.2110097808166865e-05, "loss": 0.0169, "step": 7391 }, { "epoch": 3.3630573248407645, "grad_norm": 0.5746304438692669, "learning_rate": 1.2103975003738166e-05, "loss": 0.01, "step": 7392 }, { "epoch": 3.3635122838944493, "grad_norm": 0.2239913027790978, "learning_rate": 1.2097853253114391e-05, "loss": 0.0082, "step": 7393 }, { "epoch": 3.3639672429481347, "grad_norm": 0.3249738036330835, "learning_rate": 1.2091732556795774e-05, "loss": 0.0132, "step": 7394 }, { "epoch": 3.36442220200182, "grad_norm": 0.34378909228855253, "learning_rate": 1.2085612915282479e-05, "loss": 0.0123, "step": 7395 }, { "epoch": 3.364877161055505, "grad_norm": 0.7981714434670416, "learning_rate": 1.2079494329074587e-05, "loss": 0.0609, "step": 7396 }, { "epoch": 3.36533212010919, "grad_norm": 0.35141566978411204, "learning_rate": 1.2073376798672068e-05, "loss": 0.0168, "step": 7397 }, { "epoch": 3.3657870791628755, "grad_norm": 0.3176832427933116, "learning_rate": 1.2067260324574822e-05, "loss": 0.0104, "step": 7398 }, { "epoch": 3.3662420382165603, "grad_norm": 0.15494004985082863, "learning_rate": 1.2061144907282656e-05, "loss": 0.0073, "step": 7399 }, { "epoch": 3.3666969972702456, "grad_norm": 0.3295092249475723, "learning_rate": 1.205503054729529e-05, "loss": 0.009, "step": 7400 }, { "epoch": 3.367151956323931, "grad_norm": 0.33129654897033445, "learning_rate": 1.2048917245112376e-05, "loss": 0.0118, "step": 7401 }, { "epoch": 3.367606915377616, "grad_norm": 0.3257718630802374, "learning_rate": 1.204280500123345e-05, "loss": 0.0177, "step": 7402 }, { "epoch": 3.368061874431301, "grad_norm": 0.26813647719291506, "learning_rate": 1.2036693816157995e-05, "loss": 0.0201, "step": 7403 }, { "epoch": 3.3685168334849864, "grad_norm": 0.3111276388611202, "learning_rate": 1.2030583690385381e-05, "loss": 0.0128, "step": 7404 }, { "epoch": 3.3689717925386713, "grad_norm": 0.32391039520901044, "learning_rate": 1.2024474624414886e-05, "loss": 0.0087, "step": 7405 }, { "epoch": 3.3694267515923566, "grad_norm": 0.2649647493344084, "learning_rate": 1.2018366618745739e-05, "loss": 0.0169, "step": 7406 }, { "epoch": 3.369881710646042, "grad_norm": 0.3649142289771265, "learning_rate": 1.2012259673877046e-05, "loss": 0.0199, "step": 7407 }, { "epoch": 3.370336669699727, "grad_norm": 0.2851014691546411, "learning_rate": 1.2006153790307842e-05, "loss": 0.0107, "step": 7408 }, { "epoch": 3.370791628753412, "grad_norm": 0.4545646158405543, "learning_rate": 1.2000048968537059e-05, "loss": 0.0209, "step": 7409 }, { "epoch": 3.3712465878070974, "grad_norm": 0.24224274984196328, "learning_rate": 1.1993945209063568e-05, "loss": 0.0191, "step": 7410 }, { "epoch": 3.3717015468607827, "grad_norm": 0.2903839582700539, "learning_rate": 1.198784251238615e-05, "loss": 0.0089, "step": 7411 }, { "epoch": 3.3721565059144676, "grad_norm": 0.30735890660927406, "learning_rate": 1.1981740879003478e-05, "loss": 0.011, "step": 7412 }, { "epoch": 3.372611464968153, "grad_norm": 0.26208112610005885, "learning_rate": 1.1975640309414151e-05, "loss": 0.0117, "step": 7413 }, { "epoch": 3.373066424021838, "grad_norm": 0.48075304978847905, "learning_rate": 1.1969540804116675e-05, "loss": 0.0381, "step": 7414 }, { "epoch": 3.373521383075523, "grad_norm": 0.44683474362762465, "learning_rate": 1.1963442363609487e-05, "loss": 0.0354, "step": 7415 }, { "epoch": 3.3739763421292084, "grad_norm": 0.332926305160124, "learning_rate": 1.1957344988390903e-05, "loss": 0.0192, "step": 7416 }, { "epoch": 3.3744313011828937, "grad_norm": 0.3118555296367247, "learning_rate": 1.1951248678959201e-05, "loss": 0.0124, "step": 7417 }, { "epoch": 3.3748862602365786, "grad_norm": 0.26971878958760653, "learning_rate": 1.1945153435812528e-05, "loss": 0.012, "step": 7418 }, { "epoch": 3.375341219290264, "grad_norm": 0.400972670204958, "learning_rate": 1.1939059259448951e-05, "loss": 0.0221, "step": 7419 }, { "epoch": 3.375796178343949, "grad_norm": 0.22966593025783413, "learning_rate": 1.1932966150366476e-05, "loss": 0.0057, "step": 7420 }, { "epoch": 3.376251137397634, "grad_norm": 0.40789066729630863, "learning_rate": 1.1926874109063e-05, "loss": 0.0281, "step": 7421 }, { "epoch": 3.3767060964513194, "grad_norm": 0.4101298865962957, "learning_rate": 1.1920783136036334e-05, "loss": 0.0124, "step": 7422 }, { "epoch": 3.3771610555050047, "grad_norm": 0.2680069334102, "learning_rate": 1.1914693231784193e-05, "loss": 0.0244, "step": 7423 }, { "epoch": 3.3776160145586895, "grad_norm": 0.48000250985076703, "learning_rate": 1.1908604396804232e-05, "loss": 0.0219, "step": 7424 }, { "epoch": 3.378070973612375, "grad_norm": 0.447671191180806, "learning_rate": 1.1902516631594004e-05, "loss": 0.0202, "step": 7425 }, { "epoch": 3.37852593266606, "grad_norm": 0.3756681573726519, "learning_rate": 1.1896429936650974e-05, "loss": 0.0119, "step": 7426 }, { "epoch": 3.3789808917197455, "grad_norm": 0.2900926049114098, "learning_rate": 1.1890344312472512e-05, "loss": 0.0112, "step": 7427 }, { "epoch": 3.3794358507734303, "grad_norm": 0.381921617508109, "learning_rate": 1.1884259759555901e-05, "loss": 0.0229, "step": 7428 }, { "epoch": 3.3798908098271156, "grad_norm": 0.4913044655177357, "learning_rate": 1.1878176278398361e-05, "loss": 0.01, "step": 7429 }, { "epoch": 3.380345768880801, "grad_norm": 0.4117476354093593, "learning_rate": 1.187209386949699e-05, "loss": 0.0308, "step": 7430 }, { "epoch": 3.380800727934486, "grad_norm": 0.392056696119935, "learning_rate": 1.1866012533348833e-05, "loss": 0.0065, "step": 7431 }, { "epoch": 3.381255686988171, "grad_norm": 0.42993201021184696, "learning_rate": 1.1859932270450816e-05, "loss": 0.0264, "step": 7432 }, { "epoch": 3.3817106460418564, "grad_norm": 0.5997399109653666, "learning_rate": 1.1853853081299787e-05, "loss": 0.0312, "step": 7433 }, { "epoch": 3.3821656050955413, "grad_norm": 0.2789531747619838, "learning_rate": 1.1847774966392525e-05, "loss": 0.0092, "step": 7434 }, { "epoch": 3.3826205641492266, "grad_norm": 0.21018650514873044, "learning_rate": 1.1841697926225697e-05, "loss": 0.0036, "step": 7435 }, { "epoch": 3.383075523202912, "grad_norm": 0.2887259055608827, "learning_rate": 1.1835621961295896e-05, "loss": 0.0102, "step": 7436 }, { "epoch": 3.383530482256597, "grad_norm": 0.3280824107885246, "learning_rate": 1.1829547072099606e-05, "loss": 0.0151, "step": 7437 }, { "epoch": 3.383985441310282, "grad_norm": 0.40926470019529265, "learning_rate": 1.1823473259133261e-05, "loss": 0.0161, "step": 7438 }, { "epoch": 3.3844404003639674, "grad_norm": 0.37229905987400475, "learning_rate": 1.1817400522893169e-05, "loss": 0.0152, "step": 7439 }, { "epoch": 3.3848953594176523, "grad_norm": 0.3178401768906203, "learning_rate": 1.1811328863875582e-05, "loss": 0.0119, "step": 7440 }, { "epoch": 3.3853503184713376, "grad_norm": 0.272482340680271, "learning_rate": 1.180525828257664e-05, "loss": 0.0128, "step": 7441 }, { "epoch": 3.385805277525023, "grad_norm": 0.23930251806352804, "learning_rate": 1.1799188779492406e-05, "loss": 0.0104, "step": 7442 }, { "epoch": 3.386260236578708, "grad_norm": 0.4220368804891491, "learning_rate": 1.1793120355118841e-05, "loss": 0.0249, "step": 7443 }, { "epoch": 3.386715195632393, "grad_norm": 0.20124408469139585, "learning_rate": 1.1787053009951837e-05, "loss": 0.0044, "step": 7444 }, { "epoch": 3.3871701546860784, "grad_norm": 0.3119129451832033, "learning_rate": 1.1780986744487205e-05, "loss": 0.0201, "step": 7445 }, { "epoch": 3.3876251137397633, "grad_norm": 0.2665901845825872, "learning_rate": 1.1774921559220636e-05, "loss": 0.0169, "step": 7446 }, { "epoch": 3.3880800727934486, "grad_norm": 0.4063787876383273, "learning_rate": 1.1768857454647756e-05, "loss": 0.0203, "step": 7447 }, { "epoch": 3.388535031847134, "grad_norm": 0.2884284716395349, "learning_rate": 1.1762794431264081e-05, "loss": 0.0128, "step": 7448 }, { "epoch": 3.3889899909008188, "grad_norm": 0.6491667237221235, "learning_rate": 1.1756732489565079e-05, "loss": 0.0349, "step": 7449 }, { "epoch": 3.389444949954504, "grad_norm": 0.41346780359206825, "learning_rate": 1.175067163004609e-05, "loss": 0.0239, "step": 7450 }, { "epoch": 3.3898999090081894, "grad_norm": 0.19963998152043838, "learning_rate": 1.1744611853202376e-05, "loss": 0.0109, "step": 7451 }, { "epoch": 3.3903548680618742, "grad_norm": 0.27711357990575103, "learning_rate": 1.1738553159529126e-05, "loss": 0.0074, "step": 7452 }, { "epoch": 3.3908098271155596, "grad_norm": 0.31856896016504266, "learning_rate": 1.1732495549521413e-05, "loss": 0.0239, "step": 7453 }, { "epoch": 3.391264786169245, "grad_norm": 0.3666209639312302, "learning_rate": 1.172643902367426e-05, "loss": 0.0156, "step": 7454 }, { "epoch": 3.3917197452229297, "grad_norm": 0.25703752237124083, "learning_rate": 1.172038358248257e-05, "loss": 0.0091, "step": 7455 }, { "epoch": 3.392174704276615, "grad_norm": 0.3594974735079784, "learning_rate": 1.171432922644116e-05, "loss": 0.0228, "step": 7456 }, { "epoch": 3.3926296633303004, "grad_norm": 0.4748275330443375, "learning_rate": 1.1708275956044757e-05, "loss": 0.0328, "step": 7457 }, { "epoch": 3.3930846223839852, "grad_norm": 0.38839427576182983, "learning_rate": 1.1702223771788021e-05, "loss": 0.0123, "step": 7458 }, { "epoch": 3.3935395814376705, "grad_norm": 0.2991878086888187, "learning_rate": 1.1696172674165515e-05, "loss": 0.0101, "step": 7459 }, { "epoch": 3.393994540491356, "grad_norm": 0.37556228562797056, "learning_rate": 1.1690122663671699e-05, "loss": 0.0128, "step": 7460 }, { "epoch": 3.3944494995450407, "grad_norm": 0.2696528821159299, "learning_rate": 1.168407374080095e-05, "loss": 0.0072, "step": 7461 }, { "epoch": 3.394904458598726, "grad_norm": 0.3107194775872673, "learning_rate": 1.1678025906047551e-05, "loss": 0.0201, "step": 7462 }, { "epoch": 3.3953594176524113, "grad_norm": 0.35787446415056773, "learning_rate": 1.1671979159905724e-05, "loss": 0.0113, "step": 7463 }, { "epoch": 3.395814376706096, "grad_norm": 0.31902984686145436, "learning_rate": 1.1665933502869564e-05, "loss": 0.0098, "step": 7464 }, { "epoch": 3.3962693357597815, "grad_norm": 0.2786858778069422, "learning_rate": 1.1659888935433108e-05, "loss": 0.0101, "step": 7465 }, { "epoch": 3.396724294813467, "grad_norm": 0.2725817818240038, "learning_rate": 1.1653845458090287e-05, "loss": 0.0147, "step": 7466 }, { "epoch": 3.397179253867152, "grad_norm": 0.3411746878631725, "learning_rate": 1.1647803071334933e-05, "loss": 0.0162, "step": 7467 }, { "epoch": 3.397634212920837, "grad_norm": 0.3575079194134863, "learning_rate": 1.1641761775660825e-05, "loss": 0.0209, "step": 7468 }, { "epoch": 3.3980891719745223, "grad_norm": 0.5554341949413405, "learning_rate": 1.1635721571561619e-05, "loss": 0.0319, "step": 7469 }, { "epoch": 3.3985441310282076, "grad_norm": 0.26646470260931715, "learning_rate": 1.1629682459530897e-05, "loss": 0.0091, "step": 7470 }, { "epoch": 3.3989990900818925, "grad_norm": 0.2692497793229527, "learning_rate": 1.1623644440062131e-05, "loss": 0.0071, "step": 7471 }, { "epoch": 3.399454049135578, "grad_norm": 0.28052373758100385, "learning_rate": 1.1617607513648734e-05, "loss": 0.0115, "step": 7472 }, { "epoch": 3.399909008189263, "grad_norm": 0.3419403423756052, "learning_rate": 1.1611571680784029e-05, "loss": 0.0095, "step": 7473 }, { "epoch": 3.400363967242948, "grad_norm": 0.2871131491688801, "learning_rate": 1.1605536941961223e-05, "loss": 0.0211, "step": 7474 }, { "epoch": 3.4008189262966333, "grad_norm": 0.2552074229376785, "learning_rate": 1.159950329767345e-05, "loss": 0.0169, "step": 7475 }, { "epoch": 3.4012738853503186, "grad_norm": 0.2292873401881511, "learning_rate": 1.1593470748413748e-05, "loss": 0.009, "step": 7476 }, { "epoch": 3.4017288444040035, "grad_norm": 0.33904763526898285, "learning_rate": 1.1587439294675068e-05, "loss": 0.0074, "step": 7477 }, { "epoch": 3.402183803457689, "grad_norm": 0.2855442570793421, "learning_rate": 1.1581408936950278e-05, "loss": 0.0193, "step": 7478 }, { "epoch": 3.402638762511374, "grad_norm": 0.27547220226065405, "learning_rate": 1.1575379675732159e-05, "loss": 0.011, "step": 7479 }, { "epoch": 3.403093721565059, "grad_norm": 0.24895154609786502, "learning_rate": 1.1569351511513387e-05, "loss": 0.0067, "step": 7480 }, { "epoch": 3.4035486806187443, "grad_norm": 0.18999888219089178, "learning_rate": 1.156332444478656e-05, "loss": 0.0161, "step": 7481 }, { "epoch": 3.4040036396724296, "grad_norm": 0.3742273704566265, "learning_rate": 1.155729847604417e-05, "loss": 0.0229, "step": 7482 }, { "epoch": 3.404458598726115, "grad_norm": 0.2210540541560316, "learning_rate": 1.155127360577865e-05, "loss": 0.0065, "step": 7483 }, { "epoch": 3.4049135577797998, "grad_norm": 0.3651984503217936, "learning_rate": 1.154524983448232e-05, "loss": 0.0208, "step": 7484 }, { "epoch": 3.405368516833485, "grad_norm": 0.17238780793459638, "learning_rate": 1.1539227162647399e-05, "loss": 0.0047, "step": 7485 }, { "epoch": 3.4058234758871704, "grad_norm": 0.39740032229870376, "learning_rate": 1.1533205590766055e-05, "loss": 0.0344, "step": 7486 }, { "epoch": 3.4062784349408552, "grad_norm": 0.30295586090357846, "learning_rate": 1.1527185119330328e-05, "loss": 0.0147, "step": 7487 }, { "epoch": 3.4067333939945406, "grad_norm": 0.2906586378886499, "learning_rate": 1.15211657488322e-05, "loss": 0.0077, "step": 7488 }, { "epoch": 3.407188353048226, "grad_norm": 0.3681110533984873, "learning_rate": 1.1515147479763535e-05, "loss": 0.0056, "step": 7489 }, { "epoch": 3.4076433121019107, "grad_norm": 0.4349829401369805, "learning_rate": 1.1509130312616124e-05, "loss": 0.0335, "step": 7490 }, { "epoch": 3.408098271155596, "grad_norm": 0.4830325399954756, "learning_rate": 1.1503114247881648e-05, "loss": 0.0196, "step": 7491 }, { "epoch": 3.4085532302092814, "grad_norm": 0.3611287670724317, "learning_rate": 1.1497099286051725e-05, "loss": 0.0182, "step": 7492 }, { "epoch": 3.4090081892629662, "grad_norm": 0.4531373476552137, "learning_rate": 1.1491085427617878e-05, "loss": 0.0134, "step": 7493 }, { "epoch": 3.4094631483166515, "grad_norm": 0.3730208024423151, "learning_rate": 1.1485072673071522e-05, "loss": 0.0128, "step": 7494 }, { "epoch": 3.409918107370337, "grad_norm": 0.3821913039150987, "learning_rate": 1.1479061022904e-05, "loss": 0.0188, "step": 7495 }, { "epoch": 3.4103730664240217, "grad_norm": 0.31656332526019754, "learning_rate": 1.147305047760654e-05, "loss": 0.0079, "step": 7496 }, { "epoch": 3.410828025477707, "grad_norm": 0.23819761772467166, "learning_rate": 1.1467041037670314e-05, "loss": 0.0072, "step": 7497 }, { "epoch": 3.4112829845313923, "grad_norm": 0.43268856539204437, "learning_rate": 1.1461032703586383e-05, "loss": 0.0143, "step": 7498 }, { "epoch": 3.411737943585077, "grad_norm": 0.42292019440719664, "learning_rate": 1.1455025475845706e-05, "loss": 0.0145, "step": 7499 }, { "epoch": 3.4121929026387625, "grad_norm": 0.35968746000593577, "learning_rate": 1.1449019354939192e-05, "loss": 0.0069, "step": 7500 }, { "epoch": 3.412647861692448, "grad_norm": 0.3073830685174102, "learning_rate": 1.1443014341357609e-05, "loss": 0.0142, "step": 7501 }, { "epoch": 3.4131028207461327, "grad_norm": 0.2848290326105592, "learning_rate": 1.143701043559168e-05, "loss": 0.0101, "step": 7502 }, { "epoch": 3.413557779799818, "grad_norm": 0.40963993202433224, "learning_rate": 1.1431007638132007e-05, "loss": 0.0269, "step": 7503 }, { "epoch": 3.4140127388535033, "grad_norm": 0.3086596620346774, "learning_rate": 1.1425005949469118e-05, "loss": 0.0171, "step": 7504 }, { "epoch": 3.414467697907188, "grad_norm": 0.6598372856320515, "learning_rate": 1.1419005370093425e-05, "loss": 0.0264, "step": 7505 }, { "epoch": 3.4149226569608735, "grad_norm": 0.3415293873100661, "learning_rate": 1.1413005900495283e-05, "loss": 0.0276, "step": 7506 }, { "epoch": 3.415377616014559, "grad_norm": 0.27373311600605604, "learning_rate": 1.1407007541164949e-05, "loss": 0.0086, "step": 7507 }, { "epoch": 3.4158325750682437, "grad_norm": 0.41700957650982023, "learning_rate": 1.1401010292592573e-05, "loss": 0.0319, "step": 7508 }, { "epoch": 3.416287534121929, "grad_norm": 0.4415741542399081, "learning_rate": 1.1395014155268224e-05, "loss": 0.0335, "step": 7509 }, { "epoch": 3.4167424931756143, "grad_norm": 0.40134937647070734, "learning_rate": 1.138901912968188e-05, "loss": 0.0059, "step": 7510 }, { "epoch": 3.417197452229299, "grad_norm": 0.4099306504784986, "learning_rate": 1.1383025216323417e-05, "loss": 0.0152, "step": 7511 }, { "epoch": 3.4176524112829845, "grad_norm": 0.3061666093438862, "learning_rate": 1.1377032415682648e-05, "loss": 0.0079, "step": 7512 }, { "epoch": 3.41810737033667, "grad_norm": 0.47006824955099635, "learning_rate": 1.1371040728249258e-05, "loss": 0.0152, "step": 7513 }, { "epoch": 3.4185623293903546, "grad_norm": 0.34600043595844476, "learning_rate": 1.1365050154512883e-05, "loss": 0.0123, "step": 7514 }, { "epoch": 3.41901728844404, "grad_norm": 0.42724579892500203, "learning_rate": 1.1359060694963037e-05, "loss": 0.0156, "step": 7515 }, { "epoch": 3.4194722474977253, "grad_norm": 0.21920710197011578, "learning_rate": 1.1353072350089135e-05, "loss": 0.006, "step": 7516 }, { "epoch": 3.41992720655141, "grad_norm": 0.2541227492912572, "learning_rate": 1.1347085120380544e-05, "loss": 0.0098, "step": 7517 }, { "epoch": 3.4203821656050954, "grad_norm": 0.4632578251385366, "learning_rate": 1.13410990063265e-05, "loss": 0.0171, "step": 7518 }, { "epoch": 3.4208371246587808, "grad_norm": 0.20868676779912354, "learning_rate": 1.1335114008416161e-05, "loss": 0.0059, "step": 7519 }, { "epoch": 3.421292083712466, "grad_norm": 0.4075635977105194, "learning_rate": 1.1329130127138587e-05, "loss": 0.0235, "step": 7520 }, { "epoch": 3.421747042766151, "grad_norm": 0.3014429367541853, "learning_rate": 1.1323147362982761e-05, "loss": 0.0047, "step": 7521 }, { "epoch": 3.4222020018198362, "grad_norm": 0.2230610412558585, "learning_rate": 1.131716571643758e-05, "loss": 0.01, "step": 7522 }, { "epoch": 3.4226569608735216, "grad_norm": 0.2880935557268939, "learning_rate": 1.1311185187991824e-05, "loss": 0.0081, "step": 7523 }, { "epoch": 3.4231119199272064, "grad_norm": 0.3063148810463517, "learning_rate": 1.1305205778134195e-05, "loss": 0.0172, "step": 7524 }, { "epoch": 3.4235668789808917, "grad_norm": 0.4954073843320526, "learning_rate": 1.1299227487353297e-05, "loss": 0.0159, "step": 7525 }, { "epoch": 3.424021838034577, "grad_norm": 0.21084848903980163, "learning_rate": 1.1293250316137665e-05, "loss": 0.0039, "step": 7526 }, { "epoch": 3.424476797088262, "grad_norm": 0.36695905936266837, "learning_rate": 1.1287274264975711e-05, "loss": 0.0198, "step": 7527 }, { "epoch": 3.4249317561419472, "grad_norm": 0.4055197819640996, "learning_rate": 1.1281299334355785e-05, "loss": 0.0267, "step": 7528 }, { "epoch": 3.4253867151956325, "grad_norm": 0.2523263888777098, "learning_rate": 1.1275325524766125e-05, "loss": 0.0067, "step": 7529 }, { "epoch": 3.4258416742493174, "grad_norm": 0.26945390870717506, "learning_rate": 1.1269352836694874e-05, "loss": 0.0078, "step": 7530 }, { "epoch": 3.4262966333030027, "grad_norm": 0.2113469801675978, "learning_rate": 1.126338127063011e-05, "loss": 0.0044, "step": 7531 }, { "epoch": 3.426751592356688, "grad_norm": 0.2739282257927062, "learning_rate": 1.1257410827059794e-05, "loss": 0.0055, "step": 7532 }, { "epoch": 3.427206551410373, "grad_norm": 0.39410643796231903, "learning_rate": 1.1251441506471808e-05, "loss": 0.019, "step": 7533 }, { "epoch": 3.427661510464058, "grad_norm": 0.48479062695846786, "learning_rate": 1.124547330935392e-05, "loss": 0.0273, "step": 7534 }, { "epoch": 3.4281164695177435, "grad_norm": 0.365237516313285, "learning_rate": 1.1239506236193843e-05, "loss": 0.0061, "step": 7535 }, { "epoch": 3.4285714285714284, "grad_norm": 0.2865415668309282, "learning_rate": 1.1233540287479182e-05, "loss": 0.0158, "step": 7536 }, { "epoch": 3.4290263876251137, "grad_norm": 0.5445827583350822, "learning_rate": 1.122757546369744e-05, "loss": 0.0144, "step": 7537 }, { "epoch": 3.429481346678799, "grad_norm": 0.5214255759685154, "learning_rate": 1.1221611765336035e-05, "loss": 0.0183, "step": 7538 }, { "epoch": 3.4299363057324843, "grad_norm": 0.3369409090750568, "learning_rate": 1.1215649192882283e-05, "loss": 0.0177, "step": 7539 }, { "epoch": 3.430391264786169, "grad_norm": 0.35055911183665817, "learning_rate": 1.1209687746823442e-05, "loss": 0.0144, "step": 7540 }, { "epoch": 3.4308462238398545, "grad_norm": 0.3540905098137074, "learning_rate": 1.120372742764663e-05, "loss": 0.025, "step": 7541 }, { "epoch": 3.43130118289354, "grad_norm": 0.2358925319617691, "learning_rate": 1.1197768235838918e-05, "loss": 0.0104, "step": 7542 }, { "epoch": 3.4317561419472247, "grad_norm": 0.3972772857312809, "learning_rate": 1.1191810171887257e-05, "loss": 0.0187, "step": 7543 }, { "epoch": 3.43221110100091, "grad_norm": 0.2580349682095937, "learning_rate": 1.1185853236278513e-05, "loss": 0.0084, "step": 7544 }, { "epoch": 3.4326660600545953, "grad_norm": 0.6986750467874361, "learning_rate": 1.1179897429499448e-05, "loss": 0.012, "step": 7545 }, { "epoch": 3.43312101910828, "grad_norm": 0.3835428292295021, "learning_rate": 1.1173942752036762e-05, "loss": 0.0095, "step": 7546 }, { "epoch": 3.4335759781619655, "grad_norm": 0.5348511885287015, "learning_rate": 1.1167989204377036e-05, "loss": 0.0411, "step": 7547 }, { "epoch": 3.434030937215651, "grad_norm": 0.3241785575047505, "learning_rate": 1.116203678700676e-05, "loss": 0.0198, "step": 7548 }, { "epoch": 3.4344858962693356, "grad_norm": 0.26214646438975436, "learning_rate": 1.1156085500412356e-05, "loss": 0.0065, "step": 7549 }, { "epoch": 3.434940855323021, "grad_norm": 0.2319766055948243, "learning_rate": 1.1150135345080116e-05, "loss": 0.0066, "step": 7550 }, { "epoch": 3.4353958143767063, "grad_norm": 0.3217195954287712, "learning_rate": 1.1144186321496277e-05, "loss": 0.0114, "step": 7551 }, { "epoch": 3.435850773430391, "grad_norm": 0.36304126557059163, "learning_rate": 1.1138238430146961e-05, "loss": 0.0258, "step": 7552 }, { "epoch": 3.4363057324840764, "grad_norm": 0.2589512618512424, "learning_rate": 1.1132291671518202e-05, "loss": 0.0093, "step": 7553 }, { "epoch": 3.4367606915377618, "grad_norm": 0.2303509209389608, "learning_rate": 1.112634604609593e-05, "loss": 0.0104, "step": 7554 }, { "epoch": 3.4372156505914466, "grad_norm": 0.25633607845435313, "learning_rate": 1.112040155436601e-05, "loss": 0.0113, "step": 7555 }, { "epoch": 3.437670609645132, "grad_norm": 0.32567205404701854, "learning_rate": 1.1114458196814204e-05, "loss": 0.0233, "step": 7556 }, { "epoch": 3.4381255686988172, "grad_norm": 0.37401000985431704, "learning_rate": 1.1108515973926167e-05, "loss": 0.012, "step": 7557 }, { "epoch": 3.438580527752502, "grad_norm": 0.18818413770068326, "learning_rate": 1.110257488618747e-05, "loss": 0.0057, "step": 7558 }, { "epoch": 3.4390354868061874, "grad_norm": 0.49190332953428006, "learning_rate": 1.1096634934083585e-05, "loss": 0.0412, "step": 7559 }, { "epoch": 3.4394904458598727, "grad_norm": 0.27748934041246387, "learning_rate": 1.1090696118099914e-05, "loss": 0.0168, "step": 7560 }, { "epoch": 3.4399454049135576, "grad_norm": 0.3250361793185927, "learning_rate": 1.1084758438721743e-05, "loss": 0.011, "step": 7561 }, { "epoch": 3.440400363967243, "grad_norm": 0.5041040933792507, "learning_rate": 1.1078821896434264e-05, "loss": 0.0185, "step": 7562 }, { "epoch": 3.4408553230209282, "grad_norm": 0.23015332587588408, "learning_rate": 1.1072886491722601e-05, "loss": 0.0047, "step": 7563 }, { "epoch": 3.441310282074613, "grad_norm": 0.576898086167456, "learning_rate": 1.106695222507175e-05, "loss": 0.0194, "step": 7564 }, { "epoch": 3.4417652411282984, "grad_norm": 0.34434594716456524, "learning_rate": 1.106101909696665e-05, "loss": 0.0167, "step": 7565 }, { "epoch": 3.4422202001819837, "grad_norm": 0.3222726989756892, "learning_rate": 1.1055087107892122e-05, "loss": 0.0062, "step": 7566 }, { "epoch": 3.4426751592356686, "grad_norm": 0.43467680680242465, "learning_rate": 1.1049156258332901e-05, "loss": 0.0264, "step": 7567 }, { "epoch": 3.443130118289354, "grad_norm": 0.23429822997696165, "learning_rate": 1.1043226548773621e-05, "loss": 0.0084, "step": 7568 }, { "epoch": 3.443585077343039, "grad_norm": 0.3203757689080942, "learning_rate": 1.1037297979698837e-05, "loss": 0.0217, "step": 7569 }, { "epoch": 3.444040036396724, "grad_norm": 0.5567495355392769, "learning_rate": 1.1031370551593017e-05, "loss": 0.0207, "step": 7570 }, { "epoch": 3.4444949954504094, "grad_norm": 0.49315755057485466, "learning_rate": 1.1025444264940515e-05, "loss": 0.0134, "step": 7571 }, { "epoch": 3.4449499545040947, "grad_norm": 0.25871964117401614, "learning_rate": 1.1019519120225599e-05, "loss": 0.0088, "step": 7572 }, { "epoch": 3.4454049135577796, "grad_norm": 0.3220186506974479, "learning_rate": 1.1013595117932438e-05, "loss": 0.0157, "step": 7573 }, { "epoch": 3.445859872611465, "grad_norm": 0.31821618722115447, "learning_rate": 1.1007672258545126e-05, "loss": 0.0164, "step": 7574 }, { "epoch": 3.44631483166515, "grad_norm": 0.5971554571015194, "learning_rate": 1.100175054254765e-05, "loss": 0.0124, "step": 7575 }, { "epoch": 3.4467697907188355, "grad_norm": 0.45242164190185574, "learning_rate": 1.09958299704239e-05, "loss": 0.0343, "step": 7576 }, { "epoch": 3.4472247497725204, "grad_norm": 0.395290332730002, "learning_rate": 1.0989910542657685e-05, "loss": 0.0239, "step": 7577 }, { "epoch": 3.4476797088262057, "grad_norm": 0.4093883023855205, "learning_rate": 1.0983992259732706e-05, "loss": 0.0199, "step": 7578 }, { "epoch": 3.448134667879891, "grad_norm": 0.40379295251724673, "learning_rate": 1.0978075122132592e-05, "loss": 0.0212, "step": 7579 }, { "epoch": 3.448589626933576, "grad_norm": 0.14088416240322854, "learning_rate": 1.0972159130340857e-05, "loss": 0.0033, "step": 7580 }, { "epoch": 3.449044585987261, "grad_norm": 0.42766242390902837, "learning_rate": 1.0966244284840926e-05, "loss": 0.0153, "step": 7581 }, { "epoch": 3.4494995450409465, "grad_norm": 0.8063200699333957, "learning_rate": 1.0960330586116138e-05, "loss": 0.0378, "step": 7582 }, { "epoch": 3.4499545040946313, "grad_norm": 0.3532033604817366, "learning_rate": 1.0954418034649725e-05, "loss": 0.0083, "step": 7583 }, { "epoch": 3.4504094631483166, "grad_norm": 0.1670656094487469, "learning_rate": 1.0948506630924837e-05, "loss": 0.0054, "step": 7584 }, { "epoch": 3.450864422202002, "grad_norm": 0.34708699022830414, "learning_rate": 1.0942596375424543e-05, "loss": 0.0156, "step": 7585 }, { "epoch": 3.451319381255687, "grad_norm": 0.2905339767922927, "learning_rate": 1.093668726863179e-05, "loss": 0.0138, "step": 7586 }, { "epoch": 3.451774340309372, "grad_norm": 0.19632756857313682, "learning_rate": 1.0930779311029442e-05, "loss": 0.0042, "step": 7587 }, { "epoch": 3.4522292993630574, "grad_norm": 0.4338983970423882, "learning_rate": 1.0924872503100267e-05, "loss": 0.0132, "step": 7588 }, { "epoch": 3.4526842584167423, "grad_norm": 0.23244746362442945, "learning_rate": 1.0918966845326955e-05, "loss": 0.0075, "step": 7589 }, { "epoch": 3.4531392174704276, "grad_norm": 0.22419411016131108, "learning_rate": 1.0913062338192077e-05, "loss": 0.0152, "step": 7590 }, { "epoch": 3.453594176524113, "grad_norm": 0.32507252012222493, "learning_rate": 1.0907158982178134e-05, "loss": 0.0102, "step": 7591 }, { "epoch": 3.4540491355777982, "grad_norm": 0.26291265437313666, "learning_rate": 1.0901256777767519e-05, "loss": 0.0083, "step": 7592 }, { "epoch": 3.454504094631483, "grad_norm": 0.28221028277834753, "learning_rate": 1.0895355725442519e-05, "loss": 0.0113, "step": 7593 }, { "epoch": 3.4549590536851684, "grad_norm": 0.350864063793226, "learning_rate": 1.0889455825685362e-05, "loss": 0.0123, "step": 7594 }, { "epoch": 3.4554140127388537, "grad_norm": 0.3165335217553209, "learning_rate": 1.0883557078978154e-05, "loss": 0.0169, "step": 7595 }, { "epoch": 3.4558689717925386, "grad_norm": 0.32817283054713264, "learning_rate": 1.0877659485802913e-05, "loss": 0.0196, "step": 7596 }, { "epoch": 3.456323930846224, "grad_norm": 0.3540101549582968, "learning_rate": 1.0871763046641553e-05, "loss": 0.0252, "step": 7597 }, { "epoch": 3.4567788898999092, "grad_norm": 0.30940640773554073, "learning_rate": 1.0865867761975915e-05, "loss": 0.022, "step": 7598 }, { "epoch": 3.457233848953594, "grad_norm": 0.3026862148651228, "learning_rate": 1.0859973632287742e-05, "loss": 0.0089, "step": 7599 }, { "epoch": 3.4576888080072794, "grad_norm": 0.2937403016642639, "learning_rate": 1.0854080658058669e-05, "loss": 0.0115, "step": 7600 }, { "epoch": 3.4581437670609647, "grad_norm": 0.2735240108191262, "learning_rate": 1.084818883977024e-05, "loss": 0.0145, "step": 7601 }, { "epoch": 3.4585987261146496, "grad_norm": 0.3846993830845759, "learning_rate": 1.0842298177903903e-05, "loss": 0.0175, "step": 7602 }, { "epoch": 3.459053685168335, "grad_norm": 0.3514869939435362, "learning_rate": 1.0836408672941034e-05, "loss": 0.0164, "step": 7603 }, { "epoch": 3.45950864422202, "grad_norm": 0.29051011478532457, "learning_rate": 1.0830520325362876e-05, "loss": 0.0131, "step": 7604 }, { "epoch": 3.459963603275705, "grad_norm": 0.3469400719178307, "learning_rate": 1.0824633135650614e-05, "loss": 0.013, "step": 7605 }, { "epoch": 3.4604185623293904, "grad_norm": 0.2871039098238808, "learning_rate": 1.0818747104285321e-05, "loss": 0.0095, "step": 7606 }, { "epoch": 3.4608735213830757, "grad_norm": 0.4359875972092363, "learning_rate": 1.081286223174796e-05, "loss": 0.0285, "step": 7607 }, { "epoch": 3.4613284804367606, "grad_norm": 0.32032353308239503, "learning_rate": 1.080697851851944e-05, "loss": 0.0171, "step": 7608 }, { "epoch": 3.461783439490446, "grad_norm": 0.3689385810662938, "learning_rate": 1.0801095965080541e-05, "loss": 0.016, "step": 7609 }, { "epoch": 3.462238398544131, "grad_norm": 0.356626574377173, "learning_rate": 1.0795214571911955e-05, "loss": 0.0147, "step": 7610 }, { "epoch": 3.462693357597816, "grad_norm": 0.40552271332339523, "learning_rate": 1.0789334339494278e-05, "loss": 0.0124, "step": 7611 }, { "epoch": 3.4631483166515014, "grad_norm": 0.484824868954711, "learning_rate": 1.0783455268308026e-05, "loss": 0.031, "step": 7612 }, { "epoch": 3.4636032757051867, "grad_norm": 0.4184361772450152, "learning_rate": 1.0777577358833616e-05, "loss": 0.0277, "step": 7613 }, { "epoch": 3.4640582347588715, "grad_norm": 0.3884864209182071, "learning_rate": 1.0771700611551354e-05, "loss": 0.019, "step": 7614 }, { "epoch": 3.464513193812557, "grad_norm": 0.24092337285853055, "learning_rate": 1.0765825026941468e-05, "loss": 0.0102, "step": 7615 }, { "epoch": 3.464968152866242, "grad_norm": 0.195980178001834, "learning_rate": 1.0759950605484079e-05, "loss": 0.0123, "step": 7616 }, { "epoch": 3.465423111919927, "grad_norm": 0.2584760700836025, "learning_rate": 1.0754077347659208e-05, "loss": 0.0161, "step": 7617 }, { "epoch": 3.4658780709736123, "grad_norm": 0.34811812229773004, "learning_rate": 1.0748205253946802e-05, "loss": 0.0186, "step": 7618 }, { "epoch": 3.4663330300272976, "grad_norm": 0.21160250845437895, "learning_rate": 1.0742334324826714e-05, "loss": 0.0058, "step": 7619 }, { "epoch": 3.4667879890809825, "grad_norm": 0.38003655973634215, "learning_rate": 1.0736464560778675e-05, "loss": 0.033, "step": 7620 }, { "epoch": 3.467242948134668, "grad_norm": 0.5741168039501321, "learning_rate": 1.0730595962282339e-05, "loss": 0.0205, "step": 7621 }, { "epoch": 3.467697907188353, "grad_norm": 0.36566169656901043, "learning_rate": 1.0724728529817253e-05, "loss": 0.0152, "step": 7622 }, { "epoch": 3.468152866242038, "grad_norm": 0.2333719148147921, "learning_rate": 1.0718862263862892e-05, "loss": 0.0057, "step": 7623 }, { "epoch": 3.4686078252957233, "grad_norm": 0.39882640365377237, "learning_rate": 1.0712997164898616e-05, "loss": 0.0078, "step": 7624 }, { "epoch": 3.4690627843494086, "grad_norm": 0.1952917929805915, "learning_rate": 1.0707133233403682e-05, "loss": 0.0059, "step": 7625 }, { "epoch": 3.4695177434030935, "grad_norm": 0.2635171185949613, "learning_rate": 1.0701270469857284e-05, "loss": 0.0041, "step": 7626 }, { "epoch": 3.469972702456779, "grad_norm": 0.503038989118809, "learning_rate": 1.069540887473848e-05, "loss": 0.0116, "step": 7627 }, { "epoch": 3.470427661510464, "grad_norm": 0.2802311861639431, "learning_rate": 1.0689548448526273e-05, "loss": 0.015, "step": 7628 }, { "epoch": 3.470882620564149, "grad_norm": 0.32337538114419145, "learning_rate": 1.0683689191699545e-05, "loss": 0.013, "step": 7629 }, { "epoch": 3.4713375796178343, "grad_norm": 0.26758101188058225, "learning_rate": 1.0677831104737079e-05, "loss": 0.01, "step": 7630 }, { "epoch": 3.4717925386715196, "grad_norm": 0.34043795918609554, "learning_rate": 1.0671974188117572e-05, "loss": 0.0121, "step": 7631 }, { "epoch": 3.472247497725205, "grad_norm": 0.27998245438143077, "learning_rate": 1.0666118442319628e-05, "loss": 0.0056, "step": 7632 }, { "epoch": 3.47270245677889, "grad_norm": 0.32370280547288804, "learning_rate": 1.0660263867821762e-05, "loss": 0.0243, "step": 7633 }, { "epoch": 3.473157415832575, "grad_norm": 0.43532050303368286, "learning_rate": 1.0654410465102378e-05, "loss": 0.0184, "step": 7634 }, { "epoch": 3.4736123748862604, "grad_norm": 0.3376433309921514, "learning_rate": 1.0648558234639783e-05, "loss": 0.0123, "step": 7635 }, { "epoch": 3.4740673339399453, "grad_norm": 0.2532089350563352, "learning_rate": 1.0642707176912189e-05, "loss": 0.0286, "step": 7636 }, { "epoch": 3.4745222929936306, "grad_norm": 0.7589305862413159, "learning_rate": 1.0636857292397739e-05, "loss": 0.0514, "step": 7637 }, { "epoch": 3.474977252047316, "grad_norm": 0.3278612776374481, "learning_rate": 1.0631008581574447e-05, "loss": 0.014, "step": 7638 }, { "epoch": 3.4754322111010008, "grad_norm": 0.9646595472381455, "learning_rate": 1.0625161044920237e-05, "loss": 0.1056, "step": 7639 }, { "epoch": 3.475887170154686, "grad_norm": 0.30349505216312184, "learning_rate": 1.0619314682912956e-05, "loss": 0.0153, "step": 7640 }, { "epoch": 3.4763421292083714, "grad_norm": 0.5706325683019043, "learning_rate": 1.0613469496030328e-05, "loss": 0.0286, "step": 7641 }, { "epoch": 3.4767970882620562, "grad_norm": 0.32969892942355083, "learning_rate": 1.0607625484750013e-05, "loss": 0.0185, "step": 7642 }, { "epoch": 3.4772520473157416, "grad_norm": 0.4247881444929945, "learning_rate": 1.060178264954955e-05, "loss": 0.0252, "step": 7643 }, { "epoch": 3.477707006369427, "grad_norm": 0.24676691214650703, "learning_rate": 1.0595940990906387e-05, "loss": 0.0123, "step": 7644 }, { "epoch": 3.4781619654231117, "grad_norm": 0.11975845095116137, "learning_rate": 1.0590100509297866e-05, "loss": 0.0037, "step": 7645 }, { "epoch": 3.478616924476797, "grad_norm": 0.20854294664031586, "learning_rate": 1.058426120520126e-05, "loss": 0.004, "step": 7646 }, { "epoch": 3.4790718835304824, "grad_norm": 0.42643313511822606, "learning_rate": 1.0578423079093732e-05, "loss": 0.022, "step": 7647 }, { "epoch": 3.4795268425841677, "grad_norm": 0.4456797714942991, "learning_rate": 1.0572586131452347e-05, "loss": 0.025, "step": 7648 }, { "epoch": 3.4799818016378525, "grad_norm": 0.3168265507617392, "learning_rate": 1.0566750362754069e-05, "loss": 0.0162, "step": 7649 }, { "epoch": 3.480436760691538, "grad_norm": 0.2657594369003933, "learning_rate": 1.0560915773475761e-05, "loss": 0.0133, "step": 7650 }, { "epoch": 3.480891719745223, "grad_norm": 0.22352836990323005, "learning_rate": 1.0555082364094221e-05, "loss": 0.0055, "step": 7651 }, { "epoch": 3.481346678798908, "grad_norm": 0.4046305656050898, "learning_rate": 1.0549250135086114e-05, "loss": 0.0258, "step": 7652 }, { "epoch": 3.4818016378525933, "grad_norm": 0.2679133964688922, "learning_rate": 1.0543419086928018e-05, "loss": 0.0079, "step": 7653 }, { "epoch": 3.4822565969062786, "grad_norm": 0.4365096762935569, "learning_rate": 1.0537589220096441e-05, "loss": 0.024, "step": 7654 }, { "epoch": 3.4827115559599635, "grad_norm": 0.48228326189865567, "learning_rate": 1.0531760535067762e-05, "loss": 0.0141, "step": 7655 }, { "epoch": 3.483166515013649, "grad_norm": 0.2684309123357071, "learning_rate": 1.0525933032318263e-05, "loss": 0.0187, "step": 7656 }, { "epoch": 3.483621474067334, "grad_norm": 0.3394085942678226, "learning_rate": 1.052010671232416e-05, "loss": 0.0237, "step": 7657 }, { "epoch": 3.484076433121019, "grad_norm": 0.36160443961471556, "learning_rate": 1.0514281575561549e-05, "loss": 0.0147, "step": 7658 }, { "epoch": 3.4845313921747043, "grad_norm": 0.31185718691300707, "learning_rate": 1.050845762250643e-05, "loss": 0.0165, "step": 7659 }, { "epoch": 3.4849863512283896, "grad_norm": 0.4368899881440047, "learning_rate": 1.05026348536347e-05, "loss": 0.023, "step": 7660 }, { "epoch": 3.4854413102820745, "grad_norm": 0.29720433690869164, "learning_rate": 1.049681326942218e-05, "loss": 0.0191, "step": 7661 }, { "epoch": 3.48589626933576, "grad_norm": 0.5026646668889203, "learning_rate": 1.0490992870344593e-05, "loss": 0.01, "step": 7662 }, { "epoch": 3.486351228389445, "grad_norm": 0.3104179863217629, "learning_rate": 1.0485173656877547e-05, "loss": 0.0154, "step": 7663 }, { "epoch": 3.48680618744313, "grad_norm": 0.2500133052931248, "learning_rate": 1.0479355629496562e-05, "loss": 0.0075, "step": 7664 }, { "epoch": 3.4872611464968153, "grad_norm": 0.21661757031667644, "learning_rate": 1.047353878867705e-05, "loss": 0.0083, "step": 7665 }, { "epoch": 3.4877161055505006, "grad_norm": 0.16211667885233885, "learning_rate": 1.0467723134894358e-05, "loss": 0.0052, "step": 7666 }, { "epoch": 3.4881710646041855, "grad_norm": 0.43159590743123544, "learning_rate": 1.0461908668623696e-05, "loss": 0.0137, "step": 7667 }, { "epoch": 3.488626023657871, "grad_norm": 0.2864657358072857, "learning_rate": 1.0456095390340212e-05, "loss": 0.0111, "step": 7668 }, { "epoch": 3.489080982711556, "grad_norm": 0.31884467581246645, "learning_rate": 1.0450283300518932e-05, "loss": 0.0065, "step": 7669 }, { "epoch": 3.489535941765241, "grad_norm": 0.33567875347087706, "learning_rate": 1.0444472399634786e-05, "loss": 0.0146, "step": 7670 }, { "epoch": 3.4899909008189263, "grad_norm": 0.162301779927352, "learning_rate": 1.0438662688162634e-05, "loss": 0.0058, "step": 7671 }, { "epoch": 3.4904458598726116, "grad_norm": 0.6797317488729887, "learning_rate": 1.0432854166577209e-05, "loss": 0.0334, "step": 7672 }, { "epoch": 3.4909008189262964, "grad_norm": 0.3277366656837895, "learning_rate": 1.0427046835353155e-05, "loss": 0.0111, "step": 7673 }, { "epoch": 3.4913557779799818, "grad_norm": 0.22578829386982083, "learning_rate": 1.0421240694965013e-05, "loss": 0.0092, "step": 7674 }, { "epoch": 3.491810737033667, "grad_norm": 0.3458614726038349, "learning_rate": 1.0415435745887245e-05, "loss": 0.0169, "step": 7675 }, { "epoch": 3.492265696087352, "grad_norm": 0.360316568466508, "learning_rate": 1.0409631988594215e-05, "loss": 0.0148, "step": 7676 }, { "epoch": 3.4927206551410372, "grad_norm": 0.23094299947139318, "learning_rate": 1.0403829423560168e-05, "loss": 0.0186, "step": 7677 }, { "epoch": 3.4931756141947226, "grad_norm": 0.3732384719694616, "learning_rate": 1.0398028051259265e-05, "loss": 0.0156, "step": 7678 }, { "epoch": 3.4936305732484074, "grad_norm": 0.5367631505509675, "learning_rate": 1.0392227872165557e-05, "loss": 0.0371, "step": 7679 }, { "epoch": 3.4940855323020927, "grad_norm": 0.2760615694421554, "learning_rate": 1.038642888675303e-05, "loss": 0.0091, "step": 7680 }, { "epoch": 3.494540491355778, "grad_norm": 0.3973874704626301, "learning_rate": 1.0380631095495533e-05, "loss": 0.0268, "step": 7681 }, { "epoch": 3.494995450409463, "grad_norm": 0.2504758693258473, "learning_rate": 1.0374834498866848e-05, "loss": 0.0069, "step": 7682 }, { "epoch": 3.4954504094631482, "grad_norm": 0.3090247496314876, "learning_rate": 1.0369039097340644e-05, "loss": 0.0098, "step": 7683 }, { "epoch": 3.4959053685168335, "grad_norm": 0.29168190187890974, "learning_rate": 1.036324489139048e-05, "loss": 0.0124, "step": 7684 }, { "epoch": 3.496360327570519, "grad_norm": 0.4413735837093503, "learning_rate": 1.0357451881489858e-05, "loss": 0.0262, "step": 7685 }, { "epoch": 3.4968152866242037, "grad_norm": 0.24865356689747195, "learning_rate": 1.0351660068112137e-05, "loss": 0.0135, "step": 7686 }, { "epoch": 3.497270245677889, "grad_norm": 0.32636829355255903, "learning_rate": 1.0345869451730608e-05, "loss": 0.018, "step": 7687 }, { "epoch": 3.4977252047315743, "grad_norm": 0.3821293908538402, "learning_rate": 1.0340080032818442e-05, "loss": 0.0137, "step": 7688 }, { "epoch": 3.498180163785259, "grad_norm": 0.34796066943452086, "learning_rate": 1.0334291811848738e-05, "loss": 0.0136, "step": 7689 }, { "epoch": 3.4986351228389445, "grad_norm": 1.4584716206960722, "learning_rate": 1.0328504789294469e-05, "loss": 0.1, "step": 7690 }, { "epoch": 3.49909008189263, "grad_norm": 0.22325985005898796, "learning_rate": 1.0322718965628542e-05, "loss": 0.0116, "step": 7691 }, { "epoch": 3.4995450409463147, "grad_norm": 0.3878382261921425, "learning_rate": 1.031693434132374e-05, "loss": 0.0164, "step": 7692 }, { "epoch": 3.5, "grad_norm": 0.4048402106304265, "learning_rate": 1.0311150916852755e-05, "loss": 0.0356, "step": 7693 }, { "epoch": 3.5004549590536853, "grad_norm": 0.37723621788485906, "learning_rate": 1.0305368692688174e-05, "loss": 0.0155, "step": 7694 }, { "epoch": 3.50090991810737, "grad_norm": 0.3201590064161834, "learning_rate": 1.0299587669302502e-05, "loss": 0.0109, "step": 7695 }, { "epoch": 3.5013648771610555, "grad_norm": 0.3021571581783607, "learning_rate": 1.029380784716815e-05, "loss": 0.0102, "step": 7696 }, { "epoch": 3.501819836214741, "grad_norm": 0.5342419378004951, "learning_rate": 1.0288029226757406e-05, "loss": 0.0522, "step": 7697 }, { "epoch": 3.502274795268426, "grad_norm": 0.39773506110074064, "learning_rate": 1.0282251808542475e-05, "loss": 0.0117, "step": 7698 }, { "epoch": 3.502729754322111, "grad_norm": 0.3236132286136434, "learning_rate": 1.0276475592995454e-05, "loss": 0.0135, "step": 7699 }, { "epoch": 3.5031847133757963, "grad_norm": 0.36963151264528354, "learning_rate": 1.0270700580588366e-05, "loss": 0.0205, "step": 7700 }, { "epoch": 3.5036396724294816, "grad_norm": 0.26481471782644556, "learning_rate": 1.026492677179311e-05, "loss": 0.0074, "step": 7701 }, { "epoch": 3.5040946314831665, "grad_norm": 0.25918002367266085, "learning_rate": 1.0259154167081483e-05, "loss": 0.0064, "step": 7702 }, { "epoch": 3.5045495905368518, "grad_norm": 0.2647108999195838, "learning_rate": 1.0253382766925221e-05, "loss": 0.0065, "step": 7703 }, { "epoch": 3.505004549590537, "grad_norm": 0.21535122970639925, "learning_rate": 1.0247612571795914e-05, "loss": 0.0057, "step": 7704 }, { "epoch": 3.505459508644222, "grad_norm": 0.224407998279665, "learning_rate": 1.0241843582165094e-05, "loss": 0.0057, "step": 7705 }, { "epoch": 3.5059144676979073, "grad_norm": 0.2749538828007824, "learning_rate": 1.0236075798504172e-05, "loss": 0.0109, "step": 7706 }, { "epoch": 3.5063694267515926, "grad_norm": 0.24667417262622424, "learning_rate": 1.023030922128446e-05, "loss": 0.0136, "step": 7707 }, { "epoch": 3.5068243858052774, "grad_norm": 0.29349500360985015, "learning_rate": 1.0224543850977169e-05, "loss": 0.0139, "step": 7708 }, { "epoch": 3.5072793448589628, "grad_norm": 0.21783735213047978, "learning_rate": 1.0218779688053432e-05, "loss": 0.0082, "step": 7709 }, { "epoch": 3.507734303912648, "grad_norm": 0.17749630115700024, "learning_rate": 1.0213016732984276e-05, "loss": 0.0028, "step": 7710 }, { "epoch": 3.508189262966333, "grad_norm": 0.24798730439112018, "learning_rate": 1.0207254986240614e-05, "loss": 0.0046, "step": 7711 }, { "epoch": 3.5086442220200182, "grad_norm": 0.22919866771105293, "learning_rate": 1.0201494448293272e-05, "loss": 0.0091, "step": 7712 }, { "epoch": 3.5090991810737036, "grad_norm": 0.4246633538791263, "learning_rate": 1.0195735119612965e-05, "loss": 0.0233, "step": 7713 }, { "epoch": 3.5095541401273884, "grad_norm": 0.5668641272467383, "learning_rate": 1.0189977000670336e-05, "loss": 0.0107, "step": 7714 }, { "epoch": 3.5100090991810737, "grad_norm": 0.34658386122410456, "learning_rate": 1.0184220091935905e-05, "loss": 0.0134, "step": 7715 }, { "epoch": 3.510464058234759, "grad_norm": 0.5278207528497539, "learning_rate": 1.0178464393880094e-05, "loss": 0.0316, "step": 7716 }, { "epoch": 3.510919017288444, "grad_norm": 0.3964886338145172, "learning_rate": 1.0172709906973249e-05, "loss": 0.0139, "step": 7717 }, { "epoch": 3.511373976342129, "grad_norm": 0.2852288098927782, "learning_rate": 1.0166956631685578e-05, "loss": 0.0092, "step": 7718 }, { "epoch": 3.5118289353958145, "grad_norm": 0.2785766268503672, "learning_rate": 1.016120456848724e-05, "loss": 0.008, "step": 7719 }, { "epoch": 3.5122838944494994, "grad_norm": 0.6035995391089407, "learning_rate": 1.0155453717848249e-05, "loss": 0.0119, "step": 7720 }, { "epoch": 3.5127388535031847, "grad_norm": 0.39613083510818775, "learning_rate": 1.0149704080238543e-05, "loss": 0.0094, "step": 7721 }, { "epoch": 3.51319381255687, "grad_norm": 0.20779778854664682, "learning_rate": 1.0143955656127957e-05, "loss": 0.0052, "step": 7722 }, { "epoch": 3.513648771610555, "grad_norm": 0.3376917212610404, "learning_rate": 1.0138208445986209e-05, "loss": 0.017, "step": 7723 }, { "epoch": 3.51410373066424, "grad_norm": 0.2582575952804332, "learning_rate": 1.0132462450282967e-05, "loss": 0.0077, "step": 7724 }, { "epoch": 3.5145586897179255, "grad_norm": 0.4077725517295637, "learning_rate": 1.0126717669487754e-05, "loss": 0.0105, "step": 7725 }, { "epoch": 3.5150136487716104, "grad_norm": 0.2581875723073248, "learning_rate": 1.0120974104070006e-05, "loss": 0.0164, "step": 7726 }, { "epoch": 3.5154686078252957, "grad_norm": 0.346642905530198, "learning_rate": 1.011523175449906e-05, "loss": 0.0108, "step": 7727 }, { "epoch": 3.515923566878981, "grad_norm": 0.33196408521423826, "learning_rate": 1.0109490621244146e-05, "loss": 0.0128, "step": 7728 }, { "epoch": 3.516378525932666, "grad_norm": 0.6538298252844729, "learning_rate": 1.0103750704774426e-05, "loss": 0.0286, "step": 7729 }, { "epoch": 3.516833484986351, "grad_norm": 0.28700539611191594, "learning_rate": 1.0098012005558916e-05, "loss": 0.0188, "step": 7730 }, { "epoch": 3.5172884440400365, "grad_norm": 0.42400584050336193, "learning_rate": 1.0092274524066578e-05, "loss": 0.0209, "step": 7731 }, { "epoch": 3.5177434030937214, "grad_norm": 0.177791552438741, "learning_rate": 1.0086538260766243e-05, "loss": 0.0046, "step": 7732 }, { "epoch": 3.5181983621474067, "grad_norm": 0.28472739840279926, "learning_rate": 1.0080803216126645e-05, "loss": 0.0076, "step": 7733 }, { "epoch": 3.518653321201092, "grad_norm": 0.2853032796529881, "learning_rate": 1.007506939061644e-05, "loss": 0.0165, "step": 7734 }, { "epoch": 3.519108280254777, "grad_norm": 0.36679424705879776, "learning_rate": 1.0069336784704164e-05, "loss": 0.0202, "step": 7735 }, { "epoch": 3.519563239308462, "grad_norm": 0.860346838473344, "learning_rate": 1.0063605398858262e-05, "loss": 0.0249, "step": 7736 }, { "epoch": 3.5200181983621475, "grad_norm": 0.3193004911328957, "learning_rate": 1.0057875233547066e-05, "loss": 0.0263, "step": 7737 }, { "epoch": 3.5204731574158323, "grad_norm": 0.4210008595949589, "learning_rate": 1.0052146289238826e-05, "loss": 0.0187, "step": 7738 }, { "epoch": 3.5209281164695176, "grad_norm": 0.496576139090813, "learning_rate": 1.0046418566401698e-05, "loss": 0.0124, "step": 7739 }, { "epoch": 3.521383075523203, "grad_norm": 0.3992909442057239, "learning_rate": 1.0040692065503712e-05, "loss": 0.0132, "step": 7740 }, { "epoch": 3.521838034576888, "grad_norm": 0.19271303788193406, "learning_rate": 1.0034966787012818e-05, "loss": 0.0064, "step": 7741 }, { "epoch": 3.522292993630573, "grad_norm": 0.4089743156474746, "learning_rate": 1.0029242731396845e-05, "loss": 0.0236, "step": 7742 }, { "epoch": 3.5227479526842584, "grad_norm": 0.551316967508034, "learning_rate": 1.002351989912356e-05, "loss": 0.0545, "step": 7743 }, { "epoch": 3.5232029117379433, "grad_norm": 0.41673606822263454, "learning_rate": 1.0017798290660584e-05, "loss": 0.0184, "step": 7744 }, { "epoch": 3.5236578707916286, "grad_norm": 0.3608739626802103, "learning_rate": 1.0012077906475484e-05, "loss": 0.0236, "step": 7745 }, { "epoch": 3.524112829845314, "grad_norm": 0.24478848772876424, "learning_rate": 1.0006358747035691e-05, "loss": 0.0105, "step": 7746 }, { "epoch": 3.5245677888989992, "grad_norm": 0.27064730447897034, "learning_rate": 1.0000640812808542e-05, "loss": 0.0138, "step": 7747 }, { "epoch": 3.525022747952684, "grad_norm": 2.1219598674121203, "learning_rate": 9.9949241042613e-06, "loss": 0.0266, "step": 7748 }, { "epoch": 3.5254777070063694, "grad_norm": 0.4486519073885841, "learning_rate": 9.989208621861096e-06, "loss": 0.0307, "step": 7749 }, { "epoch": 3.5259326660600547, "grad_norm": 0.2791731709105996, "learning_rate": 9.983494366074975e-06, "loss": 0.0087, "step": 7750 }, { "epoch": 3.5263876251137396, "grad_norm": 0.3173675872692361, "learning_rate": 9.977781337369873e-06, "loss": 0.0203, "step": 7751 }, { "epoch": 3.526842584167425, "grad_norm": 0.39298017858431655, "learning_rate": 9.97206953621264e-06, "loss": 0.0202, "step": 7752 }, { "epoch": 3.52729754322111, "grad_norm": 0.4618790863948446, "learning_rate": 9.966358963070027e-06, "loss": 0.0224, "step": 7753 }, { "epoch": 3.5277525022747955, "grad_norm": 0.18328361103497984, "learning_rate": 9.960649618408668e-06, "loss": 0.0074, "step": 7754 }, { "epoch": 3.5282074613284804, "grad_norm": 0.4222545854013132, "learning_rate": 9.954941502695107e-06, "loss": 0.0206, "step": 7755 }, { "epoch": 3.5286624203821657, "grad_norm": 0.4337862580856391, "learning_rate": 9.949234616395772e-06, "loss": 0.0216, "step": 7756 }, { "epoch": 3.529117379435851, "grad_norm": 0.3341879549234914, "learning_rate": 9.943528959977027e-06, "loss": 0.0213, "step": 7757 }, { "epoch": 3.529572338489536, "grad_norm": 0.4793236785606001, "learning_rate": 9.937824533905091e-06, "loss": 0.0222, "step": 7758 }, { "epoch": 3.530027297543221, "grad_norm": 0.34882266634749803, "learning_rate": 9.932121338646122e-06, "loss": 0.0183, "step": 7759 }, { "epoch": 3.5304822565969065, "grad_norm": 0.3377211149689815, "learning_rate": 9.926419374666151e-06, "loss": 0.0134, "step": 7760 }, { "epoch": 3.5309372156505914, "grad_norm": 0.5619555614568484, "learning_rate": 9.92071864243112e-06, "loss": 0.0412, "step": 7761 }, { "epoch": 3.5313921747042767, "grad_norm": 0.2763175257993922, "learning_rate": 9.915019142406853e-06, "loss": 0.0079, "step": 7762 }, { "epoch": 3.531847133757962, "grad_norm": 0.27603981613642653, "learning_rate": 9.909320875059107e-06, "loss": 0.0325, "step": 7763 }, { "epoch": 3.532302092811647, "grad_norm": 0.24174538143392307, "learning_rate": 9.90362384085351e-06, "loss": 0.0116, "step": 7764 }, { "epoch": 3.532757051865332, "grad_norm": 0.37155751770769496, "learning_rate": 9.897928040255592e-06, "loss": 0.0189, "step": 7765 }, { "epoch": 3.5332120109190175, "grad_norm": 0.30485222219896857, "learning_rate": 9.8922334737308e-06, "loss": 0.012, "step": 7766 }, { "epoch": 3.5336669699727024, "grad_norm": 0.2433532484404869, "learning_rate": 9.886540141744456e-06, "loss": 0.0061, "step": 7767 }, { "epoch": 3.5341219290263877, "grad_norm": 0.2171488209318856, "learning_rate": 9.880848044761807e-06, "loss": 0.0121, "step": 7768 }, { "epoch": 3.534576888080073, "grad_norm": 0.31033871071400865, "learning_rate": 9.875157183247977e-06, "loss": 0.0122, "step": 7769 }, { "epoch": 3.535031847133758, "grad_norm": 0.40363649446746047, "learning_rate": 9.869467557668002e-06, "loss": 0.019, "step": 7770 }, { "epoch": 3.535486806187443, "grad_norm": 0.3108224143991828, "learning_rate": 9.863779168486798e-06, "loss": 0.0136, "step": 7771 }, { "epoch": 3.5359417652411285, "grad_norm": 0.32550231260689033, "learning_rate": 9.858092016169206e-06, "loss": 0.0157, "step": 7772 }, { "epoch": 3.5363967242948133, "grad_norm": 0.25905980126434397, "learning_rate": 9.852406101179965e-06, "loss": 0.0097, "step": 7773 }, { "epoch": 3.5368516833484986, "grad_norm": 0.47375349246837783, "learning_rate": 9.84672142398369e-06, "loss": 0.0219, "step": 7774 }, { "epoch": 3.537306642402184, "grad_norm": 0.34582762567267744, "learning_rate": 9.841037985044907e-06, "loss": 0.0283, "step": 7775 }, { "epoch": 3.537761601455869, "grad_norm": 0.37014779406734755, "learning_rate": 9.835355784828038e-06, "loss": 0.0235, "step": 7776 }, { "epoch": 3.538216560509554, "grad_norm": 0.3856874533837658, "learning_rate": 9.829674823797416e-06, "loss": 0.0198, "step": 7777 }, { "epoch": 3.5386715195632394, "grad_norm": 0.31345553603878346, "learning_rate": 9.82399510241726e-06, "loss": 0.0136, "step": 7778 }, { "epoch": 3.5391264786169243, "grad_norm": 0.39547261582401066, "learning_rate": 9.818316621151682e-06, "loss": 0.0091, "step": 7779 }, { "epoch": 3.5395814376706096, "grad_norm": 0.3319144621156922, "learning_rate": 9.81263938046472e-06, "loss": 0.0136, "step": 7780 }, { "epoch": 3.540036396724295, "grad_norm": 0.3153057222818684, "learning_rate": 9.806963380820272e-06, "loss": 0.0173, "step": 7781 }, { "epoch": 3.54049135577798, "grad_norm": 0.2587168869666795, "learning_rate": 9.801288622682172e-06, "loss": 0.015, "step": 7782 }, { "epoch": 3.540946314831665, "grad_norm": 0.21363868172264303, "learning_rate": 9.795615106514133e-06, "loss": 0.0046, "step": 7783 }, { "epoch": 3.5414012738853504, "grad_norm": 0.3963026727948137, "learning_rate": 9.789942832779764e-06, "loss": 0.0193, "step": 7784 }, { "epoch": 3.5418562329390353, "grad_norm": 0.6084007601467198, "learning_rate": 9.784271801942569e-06, "loss": 0.0273, "step": 7785 }, { "epoch": 3.5423111919927206, "grad_norm": 0.42774354104157186, "learning_rate": 9.778602014465968e-06, "loss": 0.0237, "step": 7786 }, { "epoch": 3.542766151046406, "grad_norm": 0.2669796796218277, "learning_rate": 9.77293347081328e-06, "loss": 0.009, "step": 7787 }, { "epoch": 3.5432211101000908, "grad_norm": 0.3516244500488686, "learning_rate": 9.767266171447706e-06, "loss": 0.0121, "step": 7788 }, { "epoch": 3.543676069153776, "grad_norm": 0.3423405782228961, "learning_rate": 9.761600116832349e-06, "loss": 0.0139, "step": 7789 }, { "epoch": 3.5441310282074614, "grad_norm": 0.22468816041915143, "learning_rate": 9.755935307430203e-06, "loss": 0.0084, "step": 7790 }, { "epoch": 3.5445859872611463, "grad_norm": 0.2575954967075248, "learning_rate": 9.750271743704195e-06, "loss": 0.0101, "step": 7791 }, { "epoch": 3.5450409463148316, "grad_norm": 0.21708748852997373, "learning_rate": 9.74460942611711e-06, "loss": 0.0053, "step": 7792 }, { "epoch": 3.545495905368517, "grad_norm": 0.27528652219701727, "learning_rate": 9.738948355131642e-06, "loss": 0.0078, "step": 7793 }, { "epoch": 3.5459508644222018, "grad_norm": 0.588222574903249, "learning_rate": 9.733288531210405e-06, "loss": 0.0185, "step": 7794 }, { "epoch": 3.546405823475887, "grad_norm": 0.22259279052662903, "learning_rate": 9.727629954815878e-06, "loss": 0.0052, "step": 7795 }, { "epoch": 3.5468607825295724, "grad_norm": 0.4028170033062584, "learning_rate": 9.72197262641047e-06, "loss": 0.0224, "step": 7796 }, { "epoch": 3.5473157415832572, "grad_norm": 0.37611351611265254, "learning_rate": 9.716316546456461e-06, "loss": 0.0177, "step": 7797 }, { "epoch": 3.5477707006369426, "grad_norm": 0.2086707764814849, "learning_rate": 9.710661715416047e-06, "loss": 0.0152, "step": 7798 }, { "epoch": 3.548225659690628, "grad_norm": 0.34272682885104555, "learning_rate": 9.705008133751308e-06, "loss": 0.0258, "step": 7799 }, { "epoch": 3.548680618744313, "grad_norm": 0.5071817491026714, "learning_rate": 9.699355801924229e-06, "loss": 0.0344, "step": 7800 }, { "epoch": 3.549135577797998, "grad_norm": 0.4052438993595297, "learning_rate": 9.693704720396693e-06, "loss": 0.0242, "step": 7801 }, { "epoch": 3.5495905368516834, "grad_norm": 0.23967748830018684, "learning_rate": 9.688054889630494e-06, "loss": 0.0063, "step": 7802 }, { "epoch": 3.5500454959053687, "grad_norm": 0.2608774453190942, "learning_rate": 9.682406310087302e-06, "loss": 0.0158, "step": 7803 }, { "epoch": 3.5505004549590535, "grad_norm": 0.34865991517973605, "learning_rate": 9.676758982228693e-06, "loss": 0.0224, "step": 7804 }, { "epoch": 3.550955414012739, "grad_norm": 0.40903023915530257, "learning_rate": 9.67111290651613e-06, "loss": 0.0172, "step": 7805 }, { "epoch": 3.551410373066424, "grad_norm": 0.4578284984711304, "learning_rate": 9.665468083411005e-06, "loss": 0.0069, "step": 7806 }, { "epoch": 3.5518653321201095, "grad_norm": 0.5259574181370217, "learning_rate": 9.659824513374571e-06, "loss": 0.0122, "step": 7807 }, { "epoch": 3.5523202911737943, "grad_norm": 0.47435058462477836, "learning_rate": 9.654182196868011e-06, "loss": 0.0122, "step": 7808 }, { "epoch": 3.5527752502274796, "grad_norm": 0.35715805588911226, "learning_rate": 9.64854113435238e-06, "loss": 0.0139, "step": 7809 }, { "epoch": 3.553230209281165, "grad_norm": 0.23630139392413488, "learning_rate": 9.642901326288631e-06, "loss": 0.0065, "step": 7810 }, { "epoch": 3.55368516833485, "grad_norm": 0.519380216619125, "learning_rate": 9.637262773137643e-06, "loss": 0.0273, "step": 7811 }, { "epoch": 3.554140127388535, "grad_norm": 0.2957309835654471, "learning_rate": 9.631625475360165e-06, "loss": 0.0158, "step": 7812 }, { "epoch": 3.5545950864422204, "grad_norm": 0.2869391501363754, "learning_rate": 9.62598943341685e-06, "loss": 0.0096, "step": 7813 }, { "epoch": 3.5550500454959053, "grad_norm": 0.2413688071578876, "learning_rate": 9.620354647768239e-06, "loss": 0.0082, "step": 7814 }, { "epoch": 3.5555050045495906, "grad_norm": 0.2720352997897493, "learning_rate": 9.614721118874795e-06, "loss": 0.0115, "step": 7815 }, { "epoch": 3.555959963603276, "grad_norm": 0.20837823728920887, "learning_rate": 9.609088847196868e-06, "loss": 0.0128, "step": 7816 }, { "epoch": 3.556414922656961, "grad_norm": 0.3726028531569406, "learning_rate": 9.603457833194698e-06, "loss": 0.0268, "step": 7817 }, { "epoch": 3.556869881710646, "grad_norm": 0.2651979104229254, "learning_rate": 9.597828077328423e-06, "loss": 0.0091, "step": 7818 }, { "epoch": 3.5573248407643314, "grad_norm": 0.3667790917632566, "learning_rate": 9.592199580058073e-06, "loss": 0.0156, "step": 7819 }, { "epoch": 3.5577797998180163, "grad_norm": 0.1917440695099879, "learning_rate": 9.5865723418436e-06, "loss": 0.0038, "step": 7820 }, { "epoch": 3.5582347588717016, "grad_norm": 0.1832169397271722, "learning_rate": 9.580946363144821e-06, "loss": 0.0044, "step": 7821 }, { "epoch": 3.558689717925387, "grad_norm": 0.2743477582093021, "learning_rate": 9.57532164442148e-06, "loss": 0.0209, "step": 7822 }, { "epoch": 3.5591446769790718, "grad_norm": 0.237101723240764, "learning_rate": 9.569698186133202e-06, "loss": 0.0246, "step": 7823 }, { "epoch": 3.559599636032757, "grad_norm": 0.30995207646764655, "learning_rate": 9.564075988739493e-06, "loss": 0.0199, "step": 7824 }, { "epoch": 3.5600545950864424, "grad_norm": 0.2676018889821645, "learning_rate": 9.558455052699797e-06, "loss": 0.0079, "step": 7825 }, { "epoch": 3.5605095541401273, "grad_norm": 0.3957488495972071, "learning_rate": 9.552835378473418e-06, "loss": 0.0168, "step": 7826 }, { "epoch": 3.5609645131938126, "grad_norm": 0.42243561268420143, "learning_rate": 9.547216966519577e-06, "loss": 0.0191, "step": 7827 }, { "epoch": 3.561419472247498, "grad_norm": 0.2953992359842186, "learning_rate": 9.54159981729737e-06, "loss": 0.033, "step": 7828 }, { "epoch": 3.5618744313011828, "grad_norm": 0.08763505357405911, "learning_rate": 9.535983931265815e-06, "loss": 0.0029, "step": 7829 }, { "epoch": 3.562329390354868, "grad_norm": 0.4120249392675336, "learning_rate": 9.53036930888383e-06, "loss": 0.0229, "step": 7830 }, { "epoch": 3.5627843494085534, "grad_norm": 0.251290915712708, "learning_rate": 9.524755950610204e-06, "loss": 0.0094, "step": 7831 }, { "epoch": 3.5632393084622382, "grad_norm": 0.1980365589077631, "learning_rate": 9.519143856903633e-06, "loss": 0.0109, "step": 7832 }, { "epoch": 3.5636942675159236, "grad_norm": 0.3915285958659358, "learning_rate": 9.513533028222718e-06, "loss": 0.0138, "step": 7833 }, { "epoch": 3.564149226569609, "grad_norm": 0.2491319601060259, "learning_rate": 9.507923465025939e-06, "loss": 0.0098, "step": 7834 }, { "epoch": 3.5646041856232937, "grad_norm": 0.4200388365624958, "learning_rate": 9.502315167771694e-06, "loss": 0.0111, "step": 7835 }, { "epoch": 3.565059144676979, "grad_norm": 0.3477698423071749, "learning_rate": 9.496708136918272e-06, "loss": 0.0129, "step": 7836 }, { "epoch": 3.5655141037306644, "grad_norm": 0.3482149261448949, "learning_rate": 9.491102372923851e-06, "loss": 0.0117, "step": 7837 }, { "epoch": 3.565969062784349, "grad_norm": 0.2713790264280857, "learning_rate": 9.485497876246508e-06, "loss": 0.0112, "step": 7838 }, { "epoch": 3.5664240218380345, "grad_norm": 0.2692849723163714, "learning_rate": 9.479894647344204e-06, "loss": 0.0137, "step": 7839 }, { "epoch": 3.56687898089172, "grad_norm": 0.2643156722741621, "learning_rate": 9.474292686674833e-06, "loss": 0.0072, "step": 7840 }, { "epoch": 3.5673339399454047, "grad_norm": 1.028664353915766, "learning_rate": 9.468691994696147e-06, "loss": 0.0224, "step": 7841 }, { "epoch": 3.56778889899909, "grad_norm": 0.4329450625739231, "learning_rate": 9.463092571865803e-06, "loss": 0.0261, "step": 7842 }, { "epoch": 3.5682438580527753, "grad_norm": 0.35108154943896364, "learning_rate": 9.457494418641383e-06, "loss": 0.0177, "step": 7843 }, { "epoch": 3.56869881710646, "grad_norm": 0.19073960141119367, "learning_rate": 9.451897535480316e-06, "loss": 0.0063, "step": 7844 }, { "epoch": 3.5691537761601455, "grad_norm": 0.3835928773970753, "learning_rate": 9.446301922839981e-06, "loss": 0.0166, "step": 7845 }, { "epoch": 3.569608735213831, "grad_norm": 0.4574596271998396, "learning_rate": 9.44070758117761e-06, "loss": 0.0085, "step": 7846 }, { "epoch": 3.5700636942675157, "grad_norm": 0.28581209266703145, "learning_rate": 9.435114510950352e-06, "loss": 0.0139, "step": 7847 }, { "epoch": 3.570518653321201, "grad_norm": 0.17350817666194804, "learning_rate": 9.429522712615238e-06, "loss": 0.0052, "step": 7848 }, { "epoch": 3.5709736123748863, "grad_norm": 0.6555038261035493, "learning_rate": 9.42393218662921e-06, "loss": 0.0146, "step": 7849 }, { "epoch": 3.571428571428571, "grad_norm": 0.24616724137204693, "learning_rate": 9.418342933449111e-06, "loss": 0.0074, "step": 7850 }, { "epoch": 3.5718835304822565, "grad_norm": 0.3574023065881432, "learning_rate": 9.412754953531663e-06, "loss": 0.0226, "step": 7851 }, { "epoch": 3.572338489535942, "grad_norm": 0.3861228118475803, "learning_rate": 9.40716824733349e-06, "loss": 0.024, "step": 7852 }, { "epoch": 3.5727934485896267, "grad_norm": 0.2737882474134573, "learning_rate": 9.4015828153111e-06, "loss": 0.0142, "step": 7853 }, { "epoch": 3.573248407643312, "grad_norm": 0.3892838698320413, "learning_rate": 9.395998657920932e-06, "loss": 0.0347, "step": 7854 }, { "epoch": 3.5737033666969973, "grad_norm": 0.3801613670125119, "learning_rate": 9.390415775619283e-06, "loss": 0.0158, "step": 7855 }, { "epoch": 3.5741583257506826, "grad_norm": 0.29585960125294286, "learning_rate": 9.384834168862359e-06, "loss": 0.0105, "step": 7856 }, { "epoch": 3.5746132848043675, "grad_norm": 0.39408245074260184, "learning_rate": 9.379253838106275e-06, "loss": 0.0215, "step": 7857 }, { "epoch": 3.5750682438580528, "grad_norm": 0.38716476162434393, "learning_rate": 9.373674783807019e-06, "loss": 0.0185, "step": 7858 }, { "epoch": 3.575523202911738, "grad_norm": 0.29550631383222176, "learning_rate": 9.368097006420497e-06, "loss": 0.0115, "step": 7859 }, { "epoch": 3.575978161965423, "grad_norm": 0.12345789649610424, "learning_rate": 9.362520506402497e-06, "loss": 0.003, "step": 7860 }, { "epoch": 3.5764331210191083, "grad_norm": 0.16498490890853487, "learning_rate": 9.356945284208704e-06, "loss": 0.0102, "step": 7861 }, { "epoch": 3.5768880800727936, "grad_norm": 0.2787310634844777, "learning_rate": 9.35137134029469e-06, "loss": 0.0088, "step": 7862 }, { "epoch": 3.577343039126479, "grad_norm": 0.31792738844668716, "learning_rate": 9.345798675115938e-06, "loss": 0.0109, "step": 7863 }, { "epoch": 3.5777979981801638, "grad_norm": 0.26441502615330453, "learning_rate": 9.340227289127837e-06, "loss": 0.0113, "step": 7864 }, { "epoch": 3.578252957233849, "grad_norm": 0.14448169585959428, "learning_rate": 9.334657182785642e-06, "loss": 0.0059, "step": 7865 }, { "epoch": 3.5787079162875344, "grad_norm": 0.16639231925738704, "learning_rate": 9.329088356544518e-06, "loss": 0.0078, "step": 7866 }, { "epoch": 3.5791628753412192, "grad_norm": 0.22808199874242516, "learning_rate": 9.323520810859524e-06, "loss": 0.0127, "step": 7867 }, { "epoch": 3.5796178343949046, "grad_norm": 0.20502520035380306, "learning_rate": 9.317954546185607e-06, "loss": 0.0049, "step": 7868 }, { "epoch": 3.58007279344859, "grad_norm": 0.27634261545755284, "learning_rate": 9.31238956297763e-06, "loss": 0.0061, "step": 7869 }, { "epoch": 3.5805277525022747, "grad_norm": 0.26598098228606515, "learning_rate": 9.30682586169033e-06, "loss": 0.005, "step": 7870 }, { "epoch": 3.58098271155596, "grad_norm": 0.4446940316735892, "learning_rate": 9.301263442778358e-06, "loss": 0.0241, "step": 7871 }, { "epoch": 3.5814376706096454, "grad_norm": 0.21837004298516632, "learning_rate": 9.295702306696238e-06, "loss": 0.0194, "step": 7872 }, { "epoch": 3.58189262966333, "grad_norm": 0.256275359202139, "learning_rate": 9.290142453898401e-06, "loss": 0.0054, "step": 7873 }, { "epoch": 3.5823475887170155, "grad_norm": 0.35978029380012083, "learning_rate": 9.284583884839188e-06, "loss": 0.0197, "step": 7874 }, { "epoch": 3.582802547770701, "grad_norm": 0.2968055939957511, "learning_rate": 9.279026599972806e-06, "loss": 0.015, "step": 7875 }, { "epoch": 3.5832575068243857, "grad_norm": 0.3856140020995894, "learning_rate": 9.273470599753376e-06, "loss": 0.0224, "step": 7876 }, { "epoch": 3.583712465878071, "grad_norm": 0.4094487276274529, "learning_rate": 9.2679158846349e-06, "loss": 0.0214, "step": 7877 }, { "epoch": 3.5841674249317563, "grad_norm": 0.2125498835585565, "learning_rate": 9.262362455071294e-06, "loss": 0.0101, "step": 7878 }, { "epoch": 3.584622383985441, "grad_norm": 0.42188714279300255, "learning_rate": 9.256810311516365e-06, "loss": 0.0315, "step": 7879 }, { "epoch": 3.5850773430391265, "grad_norm": 0.4659634256713715, "learning_rate": 9.2512594544238e-06, "loss": 0.0113, "step": 7880 }, { "epoch": 3.585532302092812, "grad_norm": 0.19366535206730703, "learning_rate": 9.245709884247194e-06, "loss": 0.0056, "step": 7881 }, { "epoch": 3.5859872611464967, "grad_norm": 0.23849496028476663, "learning_rate": 9.24016160144002e-06, "loss": 0.021, "step": 7882 }, { "epoch": 3.586442220200182, "grad_norm": 0.2658740558223457, "learning_rate": 9.23461460645568e-06, "loss": 0.0109, "step": 7883 }, { "epoch": 3.5868971792538673, "grad_norm": 0.2632577871832208, "learning_rate": 9.229068899747428e-06, "loss": 0.0113, "step": 7884 }, { "epoch": 3.587352138307552, "grad_norm": 0.21537348633087633, "learning_rate": 9.223524481768454e-06, "loss": 0.0087, "step": 7885 }, { "epoch": 3.5878070973612375, "grad_norm": 0.19932527239576844, "learning_rate": 9.217981352971814e-06, "loss": 0.0042, "step": 7886 }, { "epoch": 3.588262056414923, "grad_norm": 0.27203673104612064, "learning_rate": 9.212439513810455e-06, "loss": 0.0131, "step": 7887 }, { "epoch": 3.5887170154686077, "grad_norm": 0.30317509905279216, "learning_rate": 9.206898964737257e-06, "loss": 0.0092, "step": 7888 }, { "epoch": 3.589171974522293, "grad_norm": 0.3761187717715082, "learning_rate": 9.20135970620495e-06, "loss": 0.028, "step": 7889 }, { "epoch": 3.5896269335759783, "grad_norm": 0.29332856128606294, "learning_rate": 9.195821738666183e-06, "loss": 0.0084, "step": 7890 }, { "epoch": 3.590081892629663, "grad_norm": 0.24120673975722418, "learning_rate": 9.190285062573483e-06, "loss": 0.0111, "step": 7891 }, { "epoch": 3.5905368516833485, "grad_norm": 0.25652929379068734, "learning_rate": 9.184749678379295e-06, "loss": 0.0097, "step": 7892 }, { "epoch": 3.5909918107370338, "grad_norm": 0.3631671187066052, "learning_rate": 9.17921558653595e-06, "loss": 0.0088, "step": 7893 }, { "epoch": 3.5914467697907186, "grad_norm": 0.254563674556113, "learning_rate": 9.173682787495658e-06, "loss": 0.0101, "step": 7894 }, { "epoch": 3.591901728844404, "grad_norm": 0.33833080063086696, "learning_rate": 9.168151281710542e-06, "loss": 0.0128, "step": 7895 }, { "epoch": 3.5923566878980893, "grad_norm": 0.45453855334852994, "learning_rate": 9.162621069632597e-06, "loss": 0.016, "step": 7896 }, { "epoch": 3.592811646951774, "grad_norm": 0.4069472158893592, "learning_rate": 9.157092151713742e-06, "loss": 0.0139, "step": 7897 }, { "epoch": 3.5932666060054594, "grad_norm": 0.23583793503276929, "learning_rate": 9.151564528405765e-06, "loss": 0.017, "step": 7898 }, { "epoch": 3.5937215650591448, "grad_norm": 0.44386620783862213, "learning_rate": 9.146038200160373e-06, "loss": 0.014, "step": 7899 }, { "epoch": 3.5941765241128296, "grad_norm": 0.33797991726501114, "learning_rate": 9.140513167429144e-06, "loss": 0.0208, "step": 7900 }, { "epoch": 3.594631483166515, "grad_norm": 0.7462835606347256, "learning_rate": 9.13498943066355e-06, "loss": 0.0205, "step": 7901 }, { "epoch": 3.5950864422202002, "grad_norm": 0.3122440042447858, "learning_rate": 9.129466990314977e-06, "loss": 0.007, "step": 7902 }, { "epoch": 3.595541401273885, "grad_norm": 0.4300990464414375, "learning_rate": 9.123945846834696e-06, "loss": 0.0236, "step": 7903 }, { "epoch": 3.5959963603275704, "grad_norm": 0.30430043177334526, "learning_rate": 9.118426000673864e-06, "loss": 0.0068, "step": 7904 }, { "epoch": 3.5964513193812557, "grad_norm": 0.3405599447505851, "learning_rate": 9.112907452283528e-06, "loss": 0.0192, "step": 7905 }, { "epoch": 3.5969062784349406, "grad_norm": 0.24912129897930396, "learning_rate": 9.10739020211466e-06, "loss": 0.0188, "step": 7906 }, { "epoch": 3.597361237488626, "grad_norm": 0.27999645294164877, "learning_rate": 9.101874250618086e-06, "loss": 0.0157, "step": 7907 }, { "epoch": 3.597816196542311, "grad_norm": 0.21992734318709992, "learning_rate": 9.09635959824456e-06, "loss": 0.0101, "step": 7908 }, { "epoch": 3.598271155595996, "grad_norm": 0.29453759489035464, "learning_rate": 9.090846245444709e-06, "loss": 0.0164, "step": 7909 }, { "epoch": 3.5987261146496814, "grad_norm": 0.40187686897781355, "learning_rate": 9.085334192669057e-06, "loss": 0.0204, "step": 7910 }, { "epoch": 3.5991810737033667, "grad_norm": 0.39035758749220495, "learning_rate": 9.079823440368018e-06, "loss": 0.0173, "step": 7911 }, { "epoch": 3.599636032757052, "grad_norm": 0.31416988308341826, "learning_rate": 9.07431398899191e-06, "loss": 0.0117, "step": 7912 }, { "epoch": 3.600090991810737, "grad_norm": 0.24422457615142656, "learning_rate": 9.068805838990951e-06, "loss": 0.0129, "step": 7913 }, { "epoch": 3.600545950864422, "grad_norm": 0.3545114515539359, "learning_rate": 9.063298990815236e-06, "loss": 0.0181, "step": 7914 }, { "epoch": 3.6010009099181075, "grad_norm": 0.3059424270595068, "learning_rate": 9.057793444914758e-06, "loss": 0.0103, "step": 7915 }, { "epoch": 3.6014558689717924, "grad_norm": 0.28793023928357503, "learning_rate": 9.052289201739397e-06, "loss": 0.0083, "step": 7916 }, { "epoch": 3.6019108280254777, "grad_norm": 0.3044178385320273, "learning_rate": 9.046786261738952e-06, "loss": 0.0074, "step": 7917 }, { "epoch": 3.602365787079163, "grad_norm": 0.3087839433720908, "learning_rate": 9.041284625363089e-06, "loss": 0.0171, "step": 7918 }, { "epoch": 3.6028207461328483, "grad_norm": 0.34929378599126365, "learning_rate": 9.035784293061367e-06, "loss": 0.0217, "step": 7919 }, { "epoch": 3.603275705186533, "grad_norm": 0.2518442349655992, "learning_rate": 9.03028526528327e-06, "loss": 0.012, "step": 7920 }, { "epoch": 3.6037306642402185, "grad_norm": 0.25014255864610335, "learning_rate": 9.024787542478133e-06, "loss": 0.0065, "step": 7921 }, { "epoch": 3.604185623293904, "grad_norm": 0.2736898129325639, "learning_rate": 9.019291125095222e-06, "loss": 0.0211, "step": 7922 }, { "epoch": 3.6046405823475887, "grad_norm": 0.4096184788626745, "learning_rate": 9.013796013583675e-06, "loss": 0.0262, "step": 7923 }, { "epoch": 3.605095541401274, "grad_norm": 0.25857938955296345, "learning_rate": 9.008302208392521e-06, "loss": 0.0149, "step": 7924 }, { "epoch": 3.6055505004549593, "grad_norm": 0.47221149048902566, "learning_rate": 9.002809709970686e-06, "loss": 0.0234, "step": 7925 }, { "epoch": 3.606005459508644, "grad_norm": 0.29849031791505476, "learning_rate": 8.997318518767001e-06, "loss": 0.0094, "step": 7926 }, { "epoch": 3.6064604185623295, "grad_norm": 0.29323393206978826, "learning_rate": 8.991828635230184e-06, "loss": 0.0143, "step": 7927 }, { "epoch": 3.6069153776160148, "grad_norm": 0.22175524187309673, "learning_rate": 8.98634005980884e-06, "loss": 0.0109, "step": 7928 }, { "epoch": 3.6073703366696996, "grad_norm": 0.4206389328798086, "learning_rate": 8.980852792951472e-06, "loss": 0.0161, "step": 7929 }, { "epoch": 3.607825295723385, "grad_norm": 0.40885588926001176, "learning_rate": 8.97536683510646e-06, "loss": 0.0202, "step": 7930 }, { "epoch": 3.6082802547770703, "grad_norm": 0.23573463328803757, "learning_rate": 8.969882186722112e-06, "loss": 0.0065, "step": 7931 }, { "epoch": 3.608735213830755, "grad_norm": 0.3933745979880861, "learning_rate": 8.964398848246603e-06, "loss": 0.0158, "step": 7932 }, { "epoch": 3.6091901728844404, "grad_norm": 0.5096714331594061, "learning_rate": 8.958916820127995e-06, "loss": 0.0151, "step": 7933 }, { "epoch": 3.6096451319381258, "grad_norm": 0.3810858319827942, "learning_rate": 8.95343610281427e-06, "loss": 0.0076, "step": 7934 }, { "epoch": 3.6101000909918106, "grad_norm": 0.13447801683336438, "learning_rate": 8.947956696753274e-06, "loss": 0.0031, "step": 7935 }, { "epoch": 3.610555050045496, "grad_norm": 0.5847774446649852, "learning_rate": 8.942478602392774e-06, "loss": 0.0268, "step": 7936 }, { "epoch": 3.6110100090991812, "grad_norm": 0.28676314266052166, "learning_rate": 8.937001820180407e-06, "loss": 0.008, "step": 7937 }, { "epoch": 3.611464968152866, "grad_norm": 0.21212034938310814, "learning_rate": 8.931526350563713e-06, "loss": 0.0072, "step": 7938 }, { "epoch": 3.6119199272065514, "grad_norm": 0.34731407211019094, "learning_rate": 8.92605219399012e-06, "loss": 0.0173, "step": 7939 }, { "epoch": 3.6123748862602367, "grad_norm": 0.33348277316346225, "learning_rate": 8.920579350906936e-06, "loss": 0.0151, "step": 7940 }, { "epoch": 3.6128298453139216, "grad_norm": 0.24402646700875652, "learning_rate": 8.915107821761409e-06, "loss": 0.0075, "step": 7941 }, { "epoch": 3.613284804367607, "grad_norm": 0.2056320882153239, "learning_rate": 8.909637607000632e-06, "loss": 0.0037, "step": 7942 }, { "epoch": 3.613739763421292, "grad_norm": 0.2669812189014817, "learning_rate": 8.904168707071608e-06, "loss": 0.0133, "step": 7943 }, { "epoch": 3.614194722474977, "grad_norm": 0.4607162367164103, "learning_rate": 8.898701122421229e-06, "loss": 0.0187, "step": 7944 }, { "epoch": 3.6146496815286624, "grad_norm": 0.2522047182583553, "learning_rate": 8.89323485349627e-06, "loss": 0.0111, "step": 7945 }, { "epoch": 3.6151046405823477, "grad_norm": 0.4340503410391285, "learning_rate": 8.887769900743434e-06, "loss": 0.0129, "step": 7946 }, { "epoch": 3.6155595996360326, "grad_norm": 0.5927001309871789, "learning_rate": 8.882306264609269e-06, "loss": 0.0253, "step": 7947 }, { "epoch": 3.616014558689718, "grad_norm": 0.43939735746443687, "learning_rate": 8.876843945540259e-06, "loss": 0.0189, "step": 7948 }, { "epoch": 3.616469517743403, "grad_norm": 0.2095732032996094, "learning_rate": 8.871382943982751e-06, "loss": 0.0112, "step": 7949 }, { "epoch": 3.616924476797088, "grad_norm": 0.28829390604109717, "learning_rate": 8.86592326038298e-06, "loss": 0.01, "step": 7950 }, { "epoch": 3.6173794358507734, "grad_norm": 0.29619054877424184, "learning_rate": 8.860464895187112e-06, "loss": 0.0103, "step": 7951 }, { "epoch": 3.6178343949044587, "grad_norm": 0.3774260397329701, "learning_rate": 8.855007848841166e-06, "loss": 0.0162, "step": 7952 }, { "epoch": 3.6182893539581436, "grad_norm": 0.3218652558714794, "learning_rate": 8.849552121791065e-06, "loss": 0.0135, "step": 7953 }, { "epoch": 3.618744313011829, "grad_norm": 0.41621386531581567, "learning_rate": 8.844097714482624e-06, "loss": 0.0159, "step": 7954 }, { "epoch": 3.619199272065514, "grad_norm": 0.49990344721751223, "learning_rate": 8.838644627361562e-06, "loss": 0.0279, "step": 7955 }, { "epoch": 3.619654231119199, "grad_norm": 0.12730132831350513, "learning_rate": 8.83319286087348e-06, "loss": 0.0027, "step": 7956 }, { "epoch": 3.6201091901728844, "grad_norm": 0.23547693456934576, "learning_rate": 8.827742415463872e-06, "loss": 0.0123, "step": 7957 }, { "epoch": 3.6205641492265697, "grad_norm": 0.42666332182062533, "learning_rate": 8.822293291578118e-06, "loss": 0.0071, "step": 7958 }, { "epoch": 3.6210191082802545, "grad_norm": 0.41275306023019503, "learning_rate": 8.816845489661493e-06, "loss": 0.0197, "step": 7959 }, { "epoch": 3.62147406733394, "grad_norm": 0.2785425213825369, "learning_rate": 8.811399010159177e-06, "loss": 0.0094, "step": 7960 }, { "epoch": 3.621929026387625, "grad_norm": 0.33000246140705625, "learning_rate": 8.805953853516222e-06, "loss": 0.0064, "step": 7961 }, { "epoch": 3.62238398544131, "grad_norm": 0.2938831067030401, "learning_rate": 8.80051002017759e-06, "loss": 0.0135, "step": 7962 }, { "epoch": 3.6228389444949953, "grad_norm": 0.4300003530997519, "learning_rate": 8.795067510588128e-06, "loss": 0.0143, "step": 7963 }, { "epoch": 3.6232939035486806, "grad_norm": 0.25112320552619577, "learning_rate": 8.789626325192556e-06, "loss": 0.0138, "step": 7964 }, { "epoch": 3.623748862602366, "grad_norm": 0.37994051996240175, "learning_rate": 8.784186464435525e-06, "loss": 0.017, "step": 7965 }, { "epoch": 3.624203821656051, "grad_norm": 0.3984349569898789, "learning_rate": 8.77874792876155e-06, "loss": 0.0147, "step": 7966 }, { "epoch": 3.624658780709736, "grad_norm": 0.11377922797591228, "learning_rate": 8.773310718615036e-06, "loss": 0.0015, "step": 7967 }, { "epoch": 3.6251137397634214, "grad_norm": 0.2584593513697745, "learning_rate": 8.767874834440281e-06, "loss": 0.0091, "step": 7968 }, { "epoch": 3.6255686988171063, "grad_norm": 0.3566048037273826, "learning_rate": 8.762440276681493e-06, "loss": 0.0172, "step": 7969 }, { "epoch": 3.6260236578707916, "grad_norm": 1.0907054392826763, "learning_rate": 8.757007045782766e-06, "loss": 0.0246, "step": 7970 }, { "epoch": 3.626478616924477, "grad_norm": 0.2007119497727924, "learning_rate": 8.751575142188071e-06, "loss": 0.0066, "step": 7971 }, { "epoch": 3.6269335759781622, "grad_norm": 0.24131788072504542, "learning_rate": 8.746144566341276e-06, "loss": 0.0137, "step": 7972 }, { "epoch": 3.627388535031847, "grad_norm": 0.3011265633712943, "learning_rate": 8.740715318686148e-06, "loss": 0.0072, "step": 7973 }, { "epoch": 3.6278434940855324, "grad_norm": 0.30364204803355194, "learning_rate": 8.735287399666328e-06, "loss": 0.0171, "step": 7974 }, { "epoch": 3.6282984531392177, "grad_norm": 0.4357648006137466, "learning_rate": 8.72986080972537e-06, "loss": 0.014, "step": 7975 }, { "epoch": 3.6287534121929026, "grad_norm": 0.39164518003899945, "learning_rate": 8.724435549306722e-06, "loss": 0.0189, "step": 7976 }, { "epoch": 3.629208371246588, "grad_norm": 0.28797830143269515, "learning_rate": 8.7190116188537e-06, "loss": 0.0171, "step": 7977 }, { "epoch": 3.629663330300273, "grad_norm": 0.4581840612896068, "learning_rate": 8.713589018809523e-06, "loss": 0.0149, "step": 7978 }, { "epoch": 3.630118289353958, "grad_norm": 0.36492452133178555, "learning_rate": 8.708167749617296e-06, "loss": 0.0126, "step": 7979 }, { "epoch": 3.6305732484076434, "grad_norm": 0.24291607839388063, "learning_rate": 8.702747811720036e-06, "loss": 0.012, "step": 7980 }, { "epoch": 3.6310282074613287, "grad_norm": 0.3256428651065547, "learning_rate": 8.697329205560625e-06, "loss": 0.0098, "step": 7981 }, { "epoch": 3.6314831665150136, "grad_norm": 0.24168087889264586, "learning_rate": 8.691911931581842e-06, "loss": 0.0156, "step": 7982 }, { "epoch": 3.631938125568699, "grad_norm": 0.7553426463558033, "learning_rate": 8.686495990226376e-06, "loss": 0.0867, "step": 7983 }, { "epoch": 3.632393084622384, "grad_norm": 0.3174038973875936, "learning_rate": 8.681081381936779e-06, "loss": 0.0196, "step": 7984 }, { "epoch": 3.632848043676069, "grad_norm": 0.35105138336520125, "learning_rate": 8.675668107155527e-06, "loss": 0.0113, "step": 7985 }, { "epoch": 3.6333030027297544, "grad_norm": 0.23126872893358694, "learning_rate": 8.670256166324953e-06, "loss": 0.0052, "step": 7986 }, { "epoch": 3.6337579617834397, "grad_norm": 0.2883918213185846, "learning_rate": 8.664845559887303e-06, "loss": 0.0121, "step": 7987 }, { "epoch": 3.6342129208371245, "grad_norm": 0.44316293345044955, "learning_rate": 8.659436288284698e-06, "loss": 0.0156, "step": 7988 }, { "epoch": 3.63466787989081, "grad_norm": 0.1905109720780447, "learning_rate": 8.654028351959161e-06, "loss": 0.003, "step": 7989 }, { "epoch": 3.635122838944495, "grad_norm": 0.17471189246845098, "learning_rate": 8.648621751352624e-06, "loss": 0.0068, "step": 7990 }, { "epoch": 3.63557779799818, "grad_norm": 0.6212831981090899, "learning_rate": 8.643216486906872e-06, "loss": 0.0487, "step": 7991 }, { "epoch": 3.6360327570518653, "grad_norm": 0.32296996056654237, "learning_rate": 8.637812559063602e-06, "loss": 0.0145, "step": 7992 }, { "epoch": 3.6364877161055507, "grad_norm": 0.4550074955744526, "learning_rate": 8.63240996826439e-06, "loss": 0.0277, "step": 7993 }, { "epoch": 3.6369426751592355, "grad_norm": 0.3072868069769045, "learning_rate": 8.62700871495073e-06, "loss": 0.0219, "step": 7994 }, { "epoch": 3.637397634212921, "grad_norm": 0.24091305214769462, "learning_rate": 8.621608799563977e-06, "loss": 0.0107, "step": 7995 }, { "epoch": 3.637852593266606, "grad_norm": 0.243431666238325, "learning_rate": 8.616210222545382e-06, "loss": 0.0059, "step": 7996 }, { "epoch": 3.638307552320291, "grad_norm": 0.3680257011347648, "learning_rate": 8.610812984336106e-06, "loss": 0.0174, "step": 7997 }, { "epoch": 3.6387625113739763, "grad_norm": 0.22135654055704887, "learning_rate": 8.605417085377171e-06, "loss": 0.0099, "step": 7998 }, { "epoch": 3.6392174704276616, "grad_norm": 0.23491476277298598, "learning_rate": 8.600022526109522e-06, "loss": 0.0112, "step": 7999 }, { "epoch": 3.6396724294813465, "grad_norm": 0.23046168005048528, "learning_rate": 8.594629306973973e-06, "loss": 0.0104, "step": 8000 }, { "epoch": 3.640127388535032, "grad_norm": 0.30575159558634146, "learning_rate": 8.589237428411228e-06, "loss": 0.0159, "step": 8001 }, { "epoch": 3.640582347588717, "grad_norm": 0.2955595377570318, "learning_rate": 8.583846890861886e-06, "loss": 0.0092, "step": 8002 }, { "epoch": 3.641037306642402, "grad_norm": 0.38591756832221036, "learning_rate": 8.57845769476644e-06, "loss": 0.0161, "step": 8003 }, { "epoch": 3.6414922656960873, "grad_norm": 0.3495664501757952, "learning_rate": 8.57306984056528e-06, "loss": 0.0161, "step": 8004 }, { "epoch": 3.6419472247497726, "grad_norm": 0.4264086681687965, "learning_rate": 8.567683328698666e-06, "loss": 0.0173, "step": 8005 }, { "epoch": 3.6424021838034575, "grad_norm": 0.27621667878184303, "learning_rate": 8.562298159606766e-06, "loss": 0.0187, "step": 8006 }, { "epoch": 3.642857142857143, "grad_norm": 0.29349048467453065, "learning_rate": 8.55691433372962e-06, "loss": 0.0103, "step": 8007 }, { "epoch": 3.643312101910828, "grad_norm": 0.3321129215018385, "learning_rate": 8.551531851507186e-06, "loss": 0.018, "step": 8008 }, { "epoch": 3.643767060964513, "grad_norm": 0.26288147163203157, "learning_rate": 8.54615071337929e-06, "loss": 0.008, "step": 8009 }, { "epoch": 3.6442220200181983, "grad_norm": 0.31249391099415347, "learning_rate": 8.540770919785643e-06, "loss": 0.0149, "step": 8010 }, { "epoch": 3.6446769790718836, "grad_norm": 0.3169002588499963, "learning_rate": 8.535392471165876e-06, "loss": 0.0309, "step": 8011 }, { "epoch": 3.6451319381255685, "grad_norm": 0.2383124175279392, "learning_rate": 8.530015367959482e-06, "loss": 0.0091, "step": 8012 }, { "epoch": 3.6455868971792538, "grad_norm": 1.9862193181051377, "learning_rate": 8.524639610605848e-06, "loss": 0.0453, "step": 8013 }, { "epoch": 3.646041856232939, "grad_norm": 0.3337462997764011, "learning_rate": 8.519265199544268e-06, "loss": 0.0085, "step": 8014 }, { "epoch": 3.646496815286624, "grad_norm": 0.34657096959048894, "learning_rate": 8.51389213521391e-06, "loss": 0.0114, "step": 8015 }, { "epoch": 3.6469517743403093, "grad_norm": 0.16861752151149176, "learning_rate": 8.50852041805384e-06, "loss": 0.0063, "step": 8016 }, { "epoch": 3.6474067333939946, "grad_norm": 0.414133328710513, "learning_rate": 8.503150048502996e-06, "loss": 0.0196, "step": 8017 }, { "epoch": 3.6478616924476794, "grad_norm": 0.19526515284475257, "learning_rate": 8.497781027000229e-06, "loss": 0.0062, "step": 8018 }, { "epoch": 3.6483166515013647, "grad_norm": 0.3372359482584931, "learning_rate": 8.492413353984283e-06, "loss": 0.0161, "step": 8019 }, { "epoch": 3.64877161055505, "grad_norm": 0.27790443595223013, "learning_rate": 8.487047029893772e-06, "loss": 0.0113, "step": 8020 }, { "epoch": 3.6492265696087354, "grad_norm": 0.21435231505470845, "learning_rate": 8.481682055167203e-06, "loss": 0.0074, "step": 8021 }, { "epoch": 3.6496815286624202, "grad_norm": 0.3433779143129405, "learning_rate": 8.476318430242972e-06, "loss": 0.0172, "step": 8022 }, { "epoch": 3.6501364877161055, "grad_norm": 0.3370355914550519, "learning_rate": 8.47095615555939e-06, "loss": 0.0095, "step": 8023 }, { "epoch": 3.650591446769791, "grad_norm": 0.4646328812210465, "learning_rate": 8.465595231554615e-06, "loss": 0.0251, "step": 8024 }, { "epoch": 3.6510464058234757, "grad_norm": 0.27362400918738505, "learning_rate": 8.460235658666738e-06, "loss": 0.0114, "step": 8025 }, { "epoch": 3.651501364877161, "grad_norm": 0.4472313141258646, "learning_rate": 8.45487743733371e-06, "loss": 0.0214, "step": 8026 }, { "epoch": 3.6519563239308463, "grad_norm": 0.39341955331696576, "learning_rate": 8.449520567993375e-06, "loss": 0.0197, "step": 8027 }, { "epoch": 3.6524112829845317, "grad_norm": 0.3005956704142107, "learning_rate": 8.444165051083484e-06, "loss": 0.012, "step": 8028 }, { "epoch": 3.6528662420382165, "grad_norm": 0.4311799934283919, "learning_rate": 8.43881088704166e-06, "loss": 0.0183, "step": 8029 }, { "epoch": 3.653321201091902, "grad_norm": 0.20011352048120076, "learning_rate": 8.433458076305417e-06, "loss": 0.0068, "step": 8030 }, { "epoch": 3.653776160145587, "grad_norm": 0.26879790398204306, "learning_rate": 8.428106619312162e-06, "loss": 0.0087, "step": 8031 }, { "epoch": 3.654231119199272, "grad_norm": 0.34303518231314606, "learning_rate": 8.422756516499195e-06, "loss": 0.0206, "step": 8032 }, { "epoch": 3.6546860782529573, "grad_norm": 0.2105158806843412, "learning_rate": 8.417407768303712e-06, "loss": 0.0027, "step": 8033 }, { "epoch": 3.6551410373066426, "grad_norm": 0.3501272185766499, "learning_rate": 8.41206037516278e-06, "loss": 0.0108, "step": 8034 }, { "epoch": 3.6555959963603275, "grad_norm": 0.33781794867122955, "learning_rate": 8.406714337513363e-06, "loss": 0.0112, "step": 8035 }, { "epoch": 3.656050955414013, "grad_norm": 0.4302298717832941, "learning_rate": 8.401369655792307e-06, "loss": 0.029, "step": 8036 }, { "epoch": 3.656505914467698, "grad_norm": 0.4584761296552093, "learning_rate": 8.396026330436374e-06, "loss": 0.026, "step": 8037 }, { "epoch": 3.656960873521383, "grad_norm": 0.29440469921137996, "learning_rate": 8.390684361882176e-06, "loss": 0.0215, "step": 8038 }, { "epoch": 3.6574158325750683, "grad_norm": 0.5846039620964487, "learning_rate": 8.385343750566254e-06, "loss": 0.0219, "step": 8039 }, { "epoch": 3.6578707916287536, "grad_norm": 0.42180099271021, "learning_rate": 8.380004496925011e-06, "loss": 0.0268, "step": 8040 }, { "epoch": 3.6583257506824385, "grad_norm": 0.2566571782643515, "learning_rate": 8.374666601394738e-06, "loss": 0.0091, "step": 8041 }, { "epoch": 3.658780709736124, "grad_norm": 0.24184100485951218, "learning_rate": 8.369330064411635e-06, "loss": 0.0045, "step": 8042 }, { "epoch": 3.659235668789809, "grad_norm": 0.43822191218427364, "learning_rate": 8.363994886411777e-06, "loss": 0.0259, "step": 8043 }, { "epoch": 3.659690627843494, "grad_norm": 0.34072738074671544, "learning_rate": 8.35866106783113e-06, "loss": 0.0113, "step": 8044 }, { "epoch": 3.6601455868971793, "grad_norm": 0.1484997483162511, "learning_rate": 8.353328609105543e-06, "loss": 0.0026, "step": 8045 }, { "epoch": 3.6606005459508646, "grad_norm": 0.2014858894570965, "learning_rate": 8.347997510670764e-06, "loss": 0.0081, "step": 8046 }, { "epoch": 3.6610555050045495, "grad_norm": 0.21315007139539313, "learning_rate": 8.342667772962437e-06, "loss": 0.0075, "step": 8047 }, { "epoch": 3.6615104640582348, "grad_norm": 0.2070514333801964, "learning_rate": 8.337339396416075e-06, "loss": 0.0063, "step": 8048 }, { "epoch": 3.66196542311192, "grad_norm": 0.17222404067924305, "learning_rate": 8.332012381467092e-06, "loss": 0.004, "step": 8049 }, { "epoch": 3.662420382165605, "grad_norm": 0.2911080036389778, "learning_rate": 8.326686728550781e-06, "loss": 0.023, "step": 8050 }, { "epoch": 3.6628753412192903, "grad_norm": 0.18233537215678297, "learning_rate": 8.32136243810233e-06, "loss": 0.0053, "step": 8051 }, { "epoch": 3.6633303002729756, "grad_norm": 0.29762160790233505, "learning_rate": 8.31603951055682e-06, "loss": 0.0183, "step": 8052 }, { "epoch": 3.6637852593266604, "grad_norm": 0.2252424243879002, "learning_rate": 8.310717946349225e-06, "loss": 0.007, "step": 8053 }, { "epoch": 3.6642402183803457, "grad_norm": 0.2498577951757963, "learning_rate": 8.30539774591439e-06, "loss": 0.0089, "step": 8054 }, { "epoch": 3.664695177434031, "grad_norm": 0.24273000996996646, "learning_rate": 8.30007890968706e-06, "loss": 0.0093, "step": 8055 }, { "epoch": 3.665150136487716, "grad_norm": 0.34020559913840154, "learning_rate": 8.294761438101858e-06, "loss": 0.0126, "step": 8056 }, { "epoch": 3.6656050955414012, "grad_norm": 0.2264072173105761, "learning_rate": 8.289445331593318e-06, "loss": 0.0088, "step": 8057 }, { "epoch": 3.6660600545950865, "grad_norm": 0.2061917257559539, "learning_rate": 8.284130590595843e-06, "loss": 0.0055, "step": 8058 }, { "epoch": 3.6665150136487714, "grad_norm": 0.3768637082070238, "learning_rate": 8.278817215543716e-06, "loss": 0.0164, "step": 8059 }, { "epoch": 3.6669699727024567, "grad_norm": 0.3148196096987261, "learning_rate": 8.273505206871146e-06, "loss": 0.0117, "step": 8060 }, { "epoch": 3.667424931756142, "grad_norm": 0.2956732170691929, "learning_rate": 8.268194565012186e-06, "loss": 0.0098, "step": 8061 }, { "epoch": 3.667879890809827, "grad_norm": 0.24839433111609552, "learning_rate": 8.262885290400812e-06, "loss": 0.0148, "step": 8062 }, { "epoch": 3.668334849863512, "grad_norm": 0.2030190424423925, "learning_rate": 8.25757738347087e-06, "loss": 0.0087, "step": 8063 }, { "epoch": 3.6687898089171975, "grad_norm": 0.299644661678514, "learning_rate": 8.252270844656093e-06, "loss": 0.0096, "step": 8064 }, { "epoch": 3.6692447679708824, "grad_norm": 0.2503024877941189, "learning_rate": 8.246965674390106e-06, "loss": 0.0093, "step": 8065 }, { "epoch": 3.6696997270245677, "grad_norm": 0.4080221760221528, "learning_rate": 8.241661873106427e-06, "loss": 0.0185, "step": 8066 }, { "epoch": 3.670154686078253, "grad_norm": 0.33636139517533437, "learning_rate": 8.236359441238467e-06, "loss": 0.0041, "step": 8067 }, { "epoch": 3.670609645131938, "grad_norm": 0.22693687212427763, "learning_rate": 8.231058379219508e-06, "loss": 0.008, "step": 8068 }, { "epoch": 3.671064604185623, "grad_norm": 0.16276365602779921, "learning_rate": 8.225758687482732e-06, "loss": 0.0031, "step": 8069 }, { "epoch": 3.6715195632393085, "grad_norm": 0.4436995540876357, "learning_rate": 8.220460366461197e-06, "loss": 0.0272, "step": 8070 }, { "epoch": 3.6719745222929934, "grad_norm": 0.05735679782257363, "learning_rate": 8.215163416587874e-06, "loss": 0.0014, "step": 8071 }, { "epoch": 3.6724294813466787, "grad_norm": 0.30825707751567216, "learning_rate": 8.209867838295596e-06, "loss": 0.0092, "step": 8072 }, { "epoch": 3.672884440400364, "grad_norm": 0.3871145062138258, "learning_rate": 8.204573632017085e-06, "loss": 0.0179, "step": 8073 }, { "epoch": 3.673339399454049, "grad_norm": 0.23663352372866664, "learning_rate": 8.199280798184977e-06, "loss": 0.0165, "step": 8074 }, { "epoch": 3.673794358507734, "grad_norm": 0.6057171994671939, "learning_rate": 8.193989337231763e-06, "loss": 0.03, "step": 8075 }, { "epoch": 3.6742493175614195, "grad_norm": 0.42370362115063404, "learning_rate": 8.188699249589856e-06, "loss": 0.0216, "step": 8076 }, { "epoch": 3.674704276615105, "grad_norm": 0.41039243079162274, "learning_rate": 8.183410535691526e-06, "loss": 0.0228, "step": 8077 }, { "epoch": 3.6751592356687897, "grad_norm": 0.34545203179099093, "learning_rate": 8.178123195968943e-06, "loss": 0.0216, "step": 8078 }, { "epoch": 3.675614194722475, "grad_norm": 0.3179853798618908, "learning_rate": 8.172837230854158e-06, "loss": 0.0158, "step": 8079 }, { "epoch": 3.6760691537761603, "grad_norm": 0.4389515441487279, "learning_rate": 8.167552640779125e-06, "loss": 0.0402, "step": 8080 }, { "epoch": 3.676524112829845, "grad_norm": 0.23487952135206938, "learning_rate": 8.162269426175681e-06, "loss": 0.0049, "step": 8081 }, { "epoch": 3.6769790718835305, "grad_norm": 0.205528865381266, "learning_rate": 8.156987587475543e-06, "loss": 0.0059, "step": 8082 }, { "epoch": 3.6774340309372158, "grad_norm": 0.3366381322210958, "learning_rate": 8.151707125110316e-06, "loss": 0.0129, "step": 8083 }, { "epoch": 3.677888989990901, "grad_norm": 0.1900093007087898, "learning_rate": 8.146428039511497e-06, "loss": 0.0149, "step": 8084 }, { "epoch": 3.678343949044586, "grad_norm": 0.32377211016912955, "learning_rate": 8.141150331110459e-06, "loss": 0.0066, "step": 8085 }, { "epoch": 3.6787989080982713, "grad_norm": 0.2890512770348778, "learning_rate": 8.135874000338491e-06, "loss": 0.0223, "step": 8086 }, { "epoch": 3.6792538671519566, "grad_norm": 0.33116374553955513, "learning_rate": 8.130599047626735e-06, "loss": 0.0115, "step": 8087 }, { "epoch": 3.6797088262056414, "grad_norm": 0.45935421508455554, "learning_rate": 8.12532547340625e-06, "loss": 0.0242, "step": 8088 }, { "epoch": 3.6801637852593267, "grad_norm": 0.32560701060028807, "learning_rate": 8.120053278107964e-06, "loss": 0.0118, "step": 8089 }, { "epoch": 3.680618744313012, "grad_norm": 0.23684150348161273, "learning_rate": 8.114782462162685e-06, "loss": 0.0078, "step": 8090 }, { "epoch": 3.681073703366697, "grad_norm": 0.28492805664981485, "learning_rate": 8.10951302600114e-06, "loss": 0.008, "step": 8091 }, { "epoch": 3.6815286624203822, "grad_norm": 0.4250419903475021, "learning_rate": 8.104244970053912e-06, "loss": 0.0155, "step": 8092 }, { "epoch": 3.6819836214740675, "grad_norm": 0.403044297624122, "learning_rate": 8.098978294751484e-06, "loss": 0.0101, "step": 8093 }, { "epoch": 3.6824385805277524, "grad_norm": 0.2851833693905971, "learning_rate": 8.093713000524218e-06, "loss": 0.013, "step": 8094 }, { "epoch": 3.6828935395814377, "grad_norm": 0.18256127024621138, "learning_rate": 8.088449087802378e-06, "loss": 0.0056, "step": 8095 }, { "epoch": 3.683348498635123, "grad_norm": 0.37196666140231005, "learning_rate": 8.083186557016114e-06, "loss": 0.0063, "step": 8096 }, { "epoch": 3.683803457688808, "grad_norm": 0.22117840751567305, "learning_rate": 8.077925408595449e-06, "loss": 0.0035, "step": 8097 }, { "epoch": 3.684258416742493, "grad_norm": 0.5550072205995319, "learning_rate": 8.0726656429703e-06, "loss": 0.0258, "step": 8098 }, { "epoch": 3.6847133757961785, "grad_norm": 0.3432533423884399, "learning_rate": 8.067407260570465e-06, "loss": 0.014, "step": 8099 }, { "epoch": 3.6851683348498634, "grad_norm": 0.4382941517244063, "learning_rate": 8.062150261825648e-06, "loss": 0.0178, "step": 8100 }, { "epoch": 3.6856232939035487, "grad_norm": 0.306936684970163, "learning_rate": 8.056894647165414e-06, "loss": 0.0256, "step": 8101 }, { "epoch": 3.686078252957234, "grad_norm": 0.39220890116831986, "learning_rate": 8.051640417019243e-06, "loss": 0.0156, "step": 8102 }, { "epoch": 3.686533212010919, "grad_norm": 0.5225261485760592, "learning_rate": 8.04638757181648e-06, "loss": 0.0124, "step": 8103 }, { "epoch": 3.686988171064604, "grad_norm": 0.24560781148074173, "learning_rate": 8.041136111986353e-06, "loss": 0.006, "step": 8104 }, { "epoch": 3.6874431301182895, "grad_norm": 0.3437505598620353, "learning_rate": 8.035886037958007e-06, "loss": 0.0133, "step": 8105 }, { "epoch": 3.6878980891719744, "grad_norm": 0.37399406570297017, "learning_rate": 8.03063735016044e-06, "loss": 0.0164, "step": 8106 }, { "epoch": 3.6883530482256597, "grad_norm": 0.21304961588504623, "learning_rate": 8.025390049022562e-06, "loss": 0.0074, "step": 8107 }, { "epoch": 3.688808007279345, "grad_norm": 0.26408222767200484, "learning_rate": 8.020144134973143e-06, "loss": 0.0137, "step": 8108 }, { "epoch": 3.68926296633303, "grad_norm": 0.21096262222584447, "learning_rate": 8.014899608440862e-06, "loss": 0.0076, "step": 8109 }, { "epoch": 3.689717925386715, "grad_norm": 0.24554526656598982, "learning_rate": 8.009656469854294e-06, "loss": 0.009, "step": 8110 }, { "epoch": 3.6901728844404005, "grad_norm": 0.14468065697135243, "learning_rate": 8.004414719641868e-06, "loss": 0.0039, "step": 8111 }, { "epoch": 3.6906278434940853, "grad_norm": 0.6377810666567871, "learning_rate": 7.999174358231918e-06, "loss": 0.0177, "step": 8112 }, { "epoch": 3.6910828025477707, "grad_norm": 0.471212708078553, "learning_rate": 7.99393538605266e-06, "loss": 0.0201, "step": 8113 }, { "epoch": 3.691537761601456, "grad_norm": 0.2730528287394561, "learning_rate": 7.988697803532209e-06, "loss": 0.0148, "step": 8114 }, { "epoch": 3.691992720655141, "grad_norm": 0.28680880014621557, "learning_rate": 7.983461611098544e-06, "loss": 0.0064, "step": 8115 }, { "epoch": 3.692447679708826, "grad_norm": 0.29214524628405736, "learning_rate": 7.978226809179559e-06, "loss": 0.0185, "step": 8116 }, { "epoch": 3.6929026387625115, "grad_norm": 0.4072179466832113, "learning_rate": 7.972993398203008e-06, "loss": 0.0125, "step": 8117 }, { "epoch": 3.6933575978161963, "grad_norm": 0.22689992413797802, "learning_rate": 7.967761378596545e-06, "loss": 0.0144, "step": 8118 }, { "epoch": 3.6938125568698816, "grad_norm": 0.31056978761824333, "learning_rate": 7.962530750787698e-06, "loss": 0.009, "step": 8119 }, { "epoch": 3.694267515923567, "grad_norm": 0.17727729696332872, "learning_rate": 7.957301515203903e-06, "loss": 0.0042, "step": 8120 }, { "epoch": 3.694722474977252, "grad_norm": 0.22011758215727817, "learning_rate": 7.952073672272465e-06, "loss": 0.0052, "step": 8121 }, { "epoch": 3.695177434030937, "grad_norm": 0.32562630230316364, "learning_rate": 7.946847222420569e-06, "loss": 0.0298, "step": 8122 }, { "epoch": 3.6956323930846224, "grad_norm": 0.3350632969329711, "learning_rate": 7.941622166075316e-06, "loss": 0.0101, "step": 8123 }, { "epoch": 3.6960873521383073, "grad_norm": 0.21509609174991523, "learning_rate": 7.936398503663658e-06, "loss": 0.0063, "step": 8124 }, { "epoch": 3.6965423111919926, "grad_norm": 0.38156139016566604, "learning_rate": 7.93117623561246e-06, "loss": 0.0219, "step": 8125 }, { "epoch": 3.696997270245678, "grad_norm": 0.37166344324836026, "learning_rate": 7.925955362348464e-06, "loss": 0.0137, "step": 8126 }, { "epoch": 3.697452229299363, "grad_norm": 0.30553918953835413, "learning_rate": 7.920735884298286e-06, "loss": 0.0078, "step": 8127 }, { "epoch": 3.697907188353048, "grad_norm": 0.23475019394883823, "learning_rate": 7.915517801888433e-06, "loss": 0.0098, "step": 8128 }, { "epoch": 3.6983621474067334, "grad_norm": 0.4505586983406794, "learning_rate": 7.910301115545315e-06, "loss": 0.0301, "step": 8129 }, { "epoch": 3.6988171064604187, "grad_norm": 0.27045725594083747, "learning_rate": 7.905085825695222e-06, "loss": 0.0084, "step": 8130 }, { "epoch": 3.6992720655141036, "grad_norm": 0.25454589000981637, "learning_rate": 7.899871932764314e-06, "loss": 0.0105, "step": 8131 }, { "epoch": 3.699727024567789, "grad_norm": 0.5269782807051245, "learning_rate": 7.894659437178648e-06, "loss": 0.0187, "step": 8132 }, { "epoch": 3.700181983621474, "grad_norm": 0.4897360042774467, "learning_rate": 7.889448339364158e-06, "loss": 0.0259, "step": 8133 }, { "epoch": 3.700636942675159, "grad_norm": 0.5831775449047324, "learning_rate": 7.884238639746686e-06, "loss": 0.0244, "step": 8134 }, { "epoch": 3.7010919017288444, "grad_norm": 0.2464914193759277, "learning_rate": 7.879030338751939e-06, "loss": 0.0033, "step": 8135 }, { "epoch": 3.7015468607825297, "grad_norm": 0.3018898988827231, "learning_rate": 7.873823436805508e-06, "loss": 0.0111, "step": 8136 }, { "epoch": 3.702001819836215, "grad_norm": 0.2955346688049733, "learning_rate": 7.868617934332892e-06, "loss": 0.0149, "step": 8137 }, { "epoch": 3.7024567788899, "grad_norm": 0.33268772168603866, "learning_rate": 7.863413831759448e-06, "loss": 0.0204, "step": 8138 }, { "epoch": 3.702911737943585, "grad_norm": 0.2503985946439606, "learning_rate": 7.858211129510442e-06, "loss": 0.0104, "step": 8139 }, { "epoch": 3.7033666969972705, "grad_norm": 0.32977459479647486, "learning_rate": 7.853009828011012e-06, "loss": 0.0139, "step": 8140 }, { "epoch": 3.7038216560509554, "grad_norm": 0.107014075715032, "learning_rate": 7.847809927686184e-06, "loss": 0.0019, "step": 8141 }, { "epoch": 3.7042766151046407, "grad_norm": 0.5183870202040785, "learning_rate": 7.842611428960861e-06, "loss": 0.0191, "step": 8142 }, { "epoch": 3.704731574158326, "grad_norm": 0.3937567162963472, "learning_rate": 7.837414332259852e-06, "loss": 0.0232, "step": 8143 }, { "epoch": 3.705186533212011, "grad_norm": 0.2185108042659226, "learning_rate": 7.832218638007845e-06, "loss": 0.0066, "step": 8144 }, { "epoch": 3.705641492265696, "grad_norm": 0.36976688284186876, "learning_rate": 7.827024346629403e-06, "loss": 0.0169, "step": 8145 }, { "epoch": 3.7060964513193815, "grad_norm": 0.357086771385112, "learning_rate": 7.821831458548979e-06, "loss": 0.0214, "step": 8146 }, { "epoch": 3.7065514103730663, "grad_norm": 0.4129100870235662, "learning_rate": 7.816639974190901e-06, "loss": 0.0472, "step": 8147 }, { "epoch": 3.7070063694267517, "grad_norm": 0.4385617417667354, "learning_rate": 7.811449893979416e-06, "loss": 0.0133, "step": 8148 }, { "epoch": 3.707461328480437, "grad_norm": 0.4010327580886039, "learning_rate": 7.806261218338622e-06, "loss": 0.0215, "step": 8149 }, { "epoch": 3.707916287534122, "grad_norm": 0.24948590719807862, "learning_rate": 7.801073947692508e-06, "loss": 0.0066, "step": 8150 }, { "epoch": 3.708371246587807, "grad_norm": 0.33635756079062773, "learning_rate": 7.795888082464966e-06, "loss": 0.015, "step": 8151 }, { "epoch": 3.7088262056414925, "grad_norm": 0.34004871756146976, "learning_rate": 7.790703623079753e-06, "loss": 0.0142, "step": 8152 }, { "epoch": 3.7092811646951773, "grad_norm": 0.24239480935336738, "learning_rate": 7.78552056996053e-06, "loss": 0.0082, "step": 8153 }, { "epoch": 3.7097361237488626, "grad_norm": 0.24388791463793769, "learning_rate": 7.780338923530825e-06, "loss": 0.009, "step": 8154 }, { "epoch": 3.710191082802548, "grad_norm": 0.23400649611824376, "learning_rate": 7.775158684214061e-06, "loss": 0.0105, "step": 8155 }, { "epoch": 3.710646041856233, "grad_norm": 0.29115505769637645, "learning_rate": 7.769979852433542e-06, "loss": 0.0123, "step": 8156 }, { "epoch": 3.711101000909918, "grad_norm": 0.4845893702066633, "learning_rate": 7.764802428612452e-06, "loss": 0.0164, "step": 8157 }, { "epoch": 3.7115559599636034, "grad_norm": 0.29264858856682685, "learning_rate": 7.759626413173873e-06, "loss": 0.0165, "step": 8158 }, { "epoch": 3.7120109190172883, "grad_norm": 0.24543106940932347, "learning_rate": 7.754451806540777e-06, "loss": 0.0118, "step": 8159 }, { "epoch": 3.7124658780709736, "grad_norm": 0.24481469224803493, "learning_rate": 7.749278609135996e-06, "loss": 0.0051, "step": 8160 }, { "epoch": 3.712920837124659, "grad_norm": 0.2680743437846624, "learning_rate": 7.744106821382266e-06, "loss": 0.0126, "step": 8161 }, { "epoch": 3.713375796178344, "grad_norm": 0.4324980697143822, "learning_rate": 7.738936443702192e-06, "loss": 0.0208, "step": 8162 }, { "epoch": 3.713830755232029, "grad_norm": 0.36590054756405777, "learning_rate": 7.733767476518286e-06, "loss": 0.0144, "step": 8163 }, { "epoch": 3.7142857142857144, "grad_norm": 0.3500140614886988, "learning_rate": 7.728599920252925e-06, "loss": 0.0227, "step": 8164 }, { "epoch": 3.7147406733393993, "grad_norm": 0.39879942812974345, "learning_rate": 7.723433775328384e-06, "loss": 0.0167, "step": 8165 }, { "epoch": 3.7151956323930846, "grad_norm": 0.6068701888089033, "learning_rate": 7.718269042166817e-06, "loss": 0.0518, "step": 8166 }, { "epoch": 3.71565059144677, "grad_norm": 0.2465458271638738, "learning_rate": 7.713105721190256e-06, "loss": 0.0128, "step": 8167 }, { "epoch": 3.7161055505004548, "grad_norm": 0.312537626027334, "learning_rate": 7.707943812820631e-06, "loss": 0.0172, "step": 8168 }, { "epoch": 3.71656050955414, "grad_norm": 0.35607270874076463, "learning_rate": 7.70278331747975e-06, "loss": 0.0135, "step": 8169 }, { "epoch": 3.7170154686078254, "grad_norm": 0.3898180845990806, "learning_rate": 7.697624235589303e-06, "loss": 0.019, "step": 8170 }, { "epoch": 3.7174704276615103, "grad_norm": 0.40630369298080027, "learning_rate": 7.692466567570859e-06, "loss": 0.017, "step": 8171 }, { "epoch": 3.7179253867151956, "grad_norm": 0.42289304520893384, "learning_rate": 7.687310313845886e-06, "loss": 0.011, "step": 8172 }, { "epoch": 3.718380345768881, "grad_norm": 0.1691466473728522, "learning_rate": 7.68215547483574e-06, "loss": 0.0084, "step": 8173 }, { "epoch": 3.7188353048225657, "grad_norm": 0.33913141412132747, "learning_rate": 7.67700205096164e-06, "loss": 0.0218, "step": 8174 }, { "epoch": 3.719290263876251, "grad_norm": 0.27321319260081917, "learning_rate": 7.671850042644702e-06, "loss": 0.0148, "step": 8175 }, { "epoch": 3.7197452229299364, "grad_norm": 0.22724526743663181, "learning_rate": 7.66669945030592e-06, "loss": 0.0108, "step": 8176 }, { "epoch": 3.7202001819836212, "grad_norm": 0.31396655101885484, "learning_rate": 7.661550274366189e-06, "loss": 0.0193, "step": 8177 }, { "epoch": 3.7206551410373065, "grad_norm": 0.32874661482705936, "learning_rate": 7.656402515246261e-06, "loss": 0.0101, "step": 8178 }, { "epoch": 3.721110100090992, "grad_norm": 0.4172582635073624, "learning_rate": 7.651256173366806e-06, "loss": 0.0248, "step": 8179 }, { "epoch": 3.7215650591446767, "grad_norm": 0.25576205195449575, "learning_rate": 7.64611124914835e-06, "loss": 0.0089, "step": 8180 }, { "epoch": 3.722020018198362, "grad_norm": 0.25531134749334217, "learning_rate": 7.640967743011304e-06, "loss": 0.0127, "step": 8181 }, { "epoch": 3.7224749772520473, "grad_norm": 0.4991955801838791, "learning_rate": 7.635825655375989e-06, "loss": 0.0579, "step": 8182 }, { "epoch": 3.722929936305732, "grad_norm": 0.45606251374559215, "learning_rate": 7.630684986662587e-06, "loss": 0.0253, "step": 8183 }, { "epoch": 3.7233848953594175, "grad_norm": 0.3238908049170165, "learning_rate": 7.625545737291168e-06, "loss": 0.0099, "step": 8184 }, { "epoch": 3.723839854413103, "grad_norm": 0.25468552859197363, "learning_rate": 7.620407907681682e-06, "loss": 0.0065, "step": 8185 }, { "epoch": 3.724294813466788, "grad_norm": 0.30628526614764323, "learning_rate": 7.6152714982539755e-06, "loss": 0.0118, "step": 8186 }, { "epoch": 3.724749772520473, "grad_norm": 0.2101377944360839, "learning_rate": 7.610136509427782e-06, "loss": 0.0082, "step": 8187 }, { "epoch": 3.7252047315741583, "grad_norm": 0.24194302585362065, "learning_rate": 7.605002941622699e-06, "loss": 0.0078, "step": 8188 }, { "epoch": 3.7256596906278436, "grad_norm": 0.17449106015465357, "learning_rate": 7.599870795258224e-06, "loss": 0.0054, "step": 8189 }, { "epoch": 3.7261146496815285, "grad_norm": 0.36087053984264156, "learning_rate": 7.594740070753726e-06, "loss": 0.0163, "step": 8190 }, { "epoch": 3.726569608735214, "grad_norm": 0.24400634050698547, "learning_rate": 7.58961076852846e-06, "loss": 0.0087, "step": 8191 }, { "epoch": 3.727024567788899, "grad_norm": 0.8327355489269551, "learning_rate": 7.584482889001579e-06, "loss": 0.0156, "step": 8192 }, { "epoch": 3.7274795268425844, "grad_norm": 0.16369841825871537, "learning_rate": 7.579356432592116e-06, "loss": 0.0046, "step": 8193 }, { "epoch": 3.7279344858962693, "grad_norm": 0.4677815535623148, "learning_rate": 7.5742313997189755e-06, "loss": 0.0048, "step": 8194 }, { "epoch": 3.7283894449499546, "grad_norm": 0.3478988574519327, "learning_rate": 7.56910779080095e-06, "loss": 0.0132, "step": 8195 }, { "epoch": 3.72884440400364, "grad_norm": 0.34522377049196556, "learning_rate": 7.56398560625671e-06, "loss": 0.0115, "step": 8196 }, { "epoch": 3.729299363057325, "grad_norm": 0.17465366674697114, "learning_rate": 7.5588648465048345e-06, "loss": 0.006, "step": 8197 }, { "epoch": 3.72975432211101, "grad_norm": 0.2217843864648381, "learning_rate": 7.5537455119637615e-06, "loss": 0.0097, "step": 8198 }, { "epoch": 3.7302092811646954, "grad_norm": 0.29580490836167805, "learning_rate": 7.548627603051808e-06, "loss": 0.0093, "step": 8199 }, { "epoch": 3.7306642402183803, "grad_norm": 0.3942362762063202, "learning_rate": 7.543511120187207e-06, "loss": 0.0217, "step": 8200 }, { "epoch": 3.7311191992720656, "grad_norm": 0.3920310916588643, "learning_rate": 7.538396063788037e-06, "loss": 0.0148, "step": 8201 }, { "epoch": 3.731574158325751, "grad_norm": 0.17185932452560934, "learning_rate": 7.533282434272293e-06, "loss": 0.0048, "step": 8202 }, { "epoch": 3.7320291173794358, "grad_norm": 0.2614203338856091, "learning_rate": 7.528170232057827e-06, "loss": 0.0082, "step": 8203 }, { "epoch": 3.732484076433121, "grad_norm": 0.23907495565777512, "learning_rate": 7.52305945756239e-06, "loss": 0.0114, "step": 8204 }, { "epoch": 3.7329390354868064, "grad_norm": 0.3022324667496997, "learning_rate": 7.517950111203598e-06, "loss": 0.0201, "step": 8205 }, { "epoch": 3.7333939945404913, "grad_norm": 0.2699386517973602, "learning_rate": 7.512842193398978e-06, "loss": 0.0103, "step": 8206 }, { "epoch": 3.7338489535941766, "grad_norm": 0.4142070337246259, "learning_rate": 7.50773570456593e-06, "loss": 0.0244, "step": 8207 }, { "epoch": 3.734303912647862, "grad_norm": 0.35083488653817463, "learning_rate": 7.502630645121722e-06, "loss": 0.0157, "step": 8208 }, { "epoch": 3.7347588717015467, "grad_norm": 0.25411489999227443, "learning_rate": 7.497527015483525e-06, "loss": 0.0096, "step": 8209 }, { "epoch": 3.735213830755232, "grad_norm": 0.19020487427594399, "learning_rate": 7.492424816068369e-06, "loss": 0.0079, "step": 8210 }, { "epoch": 3.7356687898089174, "grad_norm": 0.16058512031748398, "learning_rate": 7.487324047293204e-06, "loss": 0.0029, "step": 8211 }, { "epoch": 3.7361237488626022, "grad_norm": 0.43399516253101644, "learning_rate": 7.482224709574829e-06, "loss": 0.0123, "step": 8212 }, { "epoch": 3.7365787079162875, "grad_norm": 0.35281791991575223, "learning_rate": 7.4771268033299345e-06, "loss": 0.0195, "step": 8213 }, { "epoch": 3.737033666969973, "grad_norm": 0.357280429731154, "learning_rate": 7.472030328975113e-06, "loss": 0.0183, "step": 8214 }, { "epoch": 3.7374886260236577, "grad_norm": 0.2846951935569525, "learning_rate": 7.466935286926807e-06, "loss": 0.0133, "step": 8215 }, { "epoch": 3.737943585077343, "grad_norm": 0.3732278073564261, "learning_rate": 7.4618416776013804e-06, "loss": 0.0119, "step": 8216 }, { "epoch": 3.7383985441310283, "grad_norm": 0.3023185757444238, "learning_rate": 7.456749501415053e-06, "loss": 0.0136, "step": 8217 }, { "epoch": 3.738853503184713, "grad_norm": 0.5246313257721538, "learning_rate": 7.451658758783928e-06, "loss": 0.0213, "step": 8218 }, { "epoch": 3.7393084622383985, "grad_norm": 0.5371108936435847, "learning_rate": 7.446569450123994e-06, "loss": 0.0086, "step": 8219 }, { "epoch": 3.739763421292084, "grad_norm": 0.16006846293523588, "learning_rate": 7.441481575851136e-06, "loss": 0.0051, "step": 8220 }, { "epoch": 3.7402183803457687, "grad_norm": 0.28568279859779006, "learning_rate": 7.436395136381116e-06, "loss": 0.0153, "step": 8221 }, { "epoch": 3.740673339399454, "grad_norm": 0.09851255054291963, "learning_rate": 7.431310132129571e-06, "loss": 0.0013, "step": 8222 }, { "epoch": 3.7411282984531393, "grad_norm": 0.17391472064874913, "learning_rate": 7.4262265635120205e-06, "loss": 0.0047, "step": 8223 }, { "epoch": 3.741583257506824, "grad_norm": 0.24379319027634888, "learning_rate": 7.421144430943866e-06, "loss": 0.009, "step": 8224 }, { "epoch": 3.7420382165605095, "grad_norm": 0.7316643799503032, "learning_rate": 7.416063734840412e-06, "loss": 0.0172, "step": 8225 }, { "epoch": 3.742493175614195, "grad_norm": 0.15873042189010853, "learning_rate": 7.410984475616819e-06, "loss": 0.0048, "step": 8226 }, { "epoch": 3.7429481346678797, "grad_norm": 0.3482069645222555, "learning_rate": 7.4059066536881355e-06, "loss": 0.0072, "step": 8227 }, { "epoch": 3.743403093721565, "grad_norm": 0.3264186324661532, "learning_rate": 7.400830269469316e-06, "loss": 0.0171, "step": 8228 }, { "epoch": 3.7438580527752503, "grad_norm": 0.3090075291698735, "learning_rate": 7.39575532337517e-06, "loss": 0.0085, "step": 8229 }, { "epoch": 3.744313011828935, "grad_norm": 0.5195339483048238, "learning_rate": 7.390681815820389e-06, "loss": 0.0221, "step": 8230 }, { "epoch": 3.7447679708826205, "grad_norm": 0.84024207491603, "learning_rate": 7.385609747219574e-06, "loss": 0.0438, "step": 8231 }, { "epoch": 3.745222929936306, "grad_norm": 0.4534125931050612, "learning_rate": 7.380539117987187e-06, "loss": 0.0403, "step": 8232 }, { "epoch": 3.7456778889899907, "grad_norm": 0.230205653930673, "learning_rate": 7.375469928537573e-06, "loss": 0.0062, "step": 8233 }, { "epoch": 3.746132848043676, "grad_norm": 0.4155692575849485, "learning_rate": 7.370402179284957e-06, "loss": 0.016, "step": 8234 }, { "epoch": 3.7465878070973613, "grad_norm": 0.450337552691653, "learning_rate": 7.365335870643461e-06, "loss": 0.0185, "step": 8235 }, { "epoch": 3.747042766151046, "grad_norm": 0.5594632399927322, "learning_rate": 7.3602710030270886e-06, "loss": 0.0166, "step": 8236 }, { "epoch": 3.7474977252047315, "grad_norm": 0.33195872765200624, "learning_rate": 7.3552075768497094e-06, "loss": 0.0219, "step": 8237 }, { "epoch": 3.7479526842584168, "grad_norm": 0.2764572237285986, "learning_rate": 7.350145592525082e-06, "loss": 0.0133, "step": 8238 }, { "epoch": 3.7484076433121016, "grad_norm": 0.268299467116559, "learning_rate": 7.345085050466846e-06, "loss": 0.0083, "step": 8239 }, { "epoch": 3.748862602365787, "grad_norm": 0.3107147812127192, "learning_rate": 7.340025951088536e-06, "loss": 0.0342, "step": 8240 }, { "epoch": 3.7493175614194723, "grad_norm": 0.37871221604371946, "learning_rate": 7.334968294803546e-06, "loss": 0.0198, "step": 8241 }, { "epoch": 3.7497725204731576, "grad_norm": 0.2520754171600191, "learning_rate": 7.329912082025181e-06, "loss": 0.0147, "step": 8242 }, { "epoch": 3.7502274795268424, "grad_norm": 0.32073444459531886, "learning_rate": 7.3248573131666025e-06, "loss": 0.015, "step": 8243 }, { "epoch": 3.7506824385805277, "grad_norm": 0.408528803353352, "learning_rate": 7.319803988640858e-06, "loss": 0.0249, "step": 8244 }, { "epoch": 3.751137397634213, "grad_norm": 0.34933848414399804, "learning_rate": 7.314752108860895e-06, "loss": 0.0091, "step": 8245 }, { "epoch": 3.7515923566878984, "grad_norm": 0.2849969642904804, "learning_rate": 7.309701674239522e-06, "loss": 0.0149, "step": 8246 }, { "epoch": 3.7520473157415832, "grad_norm": 0.31743610941904615, "learning_rate": 7.304652685189434e-06, "loss": 0.0207, "step": 8247 }, { "epoch": 3.7525022747952685, "grad_norm": 0.38363514249563474, "learning_rate": 7.299605142123225e-06, "loss": 0.0197, "step": 8248 }, { "epoch": 3.752957233848954, "grad_norm": 0.29357383668681575, "learning_rate": 7.294559045453342e-06, "loss": 0.0112, "step": 8249 }, { "epoch": 3.7534121929026387, "grad_norm": 0.2796352852292541, "learning_rate": 7.289514395592142e-06, "loss": 0.0137, "step": 8250 }, { "epoch": 3.753867151956324, "grad_norm": 0.25108586779807696, "learning_rate": 7.284471192951847e-06, "loss": 0.0109, "step": 8251 }, { "epoch": 3.7543221110100093, "grad_norm": 0.2532869859452235, "learning_rate": 7.2794294379445655e-06, "loss": 0.0066, "step": 8252 }, { "epoch": 3.754777070063694, "grad_norm": 0.19963867611209882, "learning_rate": 7.274389130982276e-06, "loss": 0.0094, "step": 8253 }, { "epoch": 3.7552320291173795, "grad_norm": 0.28645229117663146, "learning_rate": 7.269350272476858e-06, "loss": 0.0196, "step": 8254 }, { "epoch": 3.755686988171065, "grad_norm": 0.46505633309213756, "learning_rate": 7.264312862840072e-06, "loss": 0.0206, "step": 8255 }, { "epoch": 3.7561419472247497, "grad_norm": 0.2630651720745387, "learning_rate": 7.259276902483547e-06, "loss": 0.0127, "step": 8256 }, { "epoch": 3.756596906278435, "grad_norm": 0.38348826127798286, "learning_rate": 7.254242391818794e-06, "loss": 0.0181, "step": 8257 }, { "epoch": 3.7570518653321203, "grad_norm": 0.3569146040429124, "learning_rate": 7.249209331257209e-06, "loss": 0.0121, "step": 8258 }, { "epoch": 3.757506824385805, "grad_norm": 0.23668991337422368, "learning_rate": 7.244177721210083e-06, "loss": 0.0096, "step": 8259 }, { "epoch": 3.7579617834394905, "grad_norm": 0.16508659066238576, "learning_rate": 7.239147562088566e-06, "loss": 0.0051, "step": 8260 }, { "epoch": 3.758416742493176, "grad_norm": 0.27133550452051525, "learning_rate": 7.2341188543036985e-06, "loss": 0.006, "step": 8261 }, { "epoch": 3.7588717015468607, "grad_norm": 0.33305981765962966, "learning_rate": 7.229091598266416e-06, "loss": 0.0145, "step": 8262 }, { "epoch": 3.759326660600546, "grad_norm": 0.18555211843162378, "learning_rate": 7.2240657943875136e-06, "loss": 0.0045, "step": 8263 }, { "epoch": 3.7597816196542313, "grad_norm": 0.28824522861456514, "learning_rate": 7.219041443077673e-06, "loss": 0.0071, "step": 8264 }, { "epoch": 3.760236578707916, "grad_norm": 0.4545670119137953, "learning_rate": 7.214018544747472e-06, "loss": 0.0264, "step": 8265 }, { "epoch": 3.7606915377616015, "grad_norm": 0.3899925912049379, "learning_rate": 7.208997099807358e-06, "loss": 0.0253, "step": 8266 }, { "epoch": 3.761146496815287, "grad_norm": 0.7646291162829505, "learning_rate": 7.203977108667656e-06, "loss": 0.0291, "step": 8267 }, { "epoch": 3.7616014558689717, "grad_norm": 0.353735658727304, "learning_rate": 7.1989585717385735e-06, "loss": 0.0271, "step": 8268 }, { "epoch": 3.762056414922657, "grad_norm": 0.2564637727784143, "learning_rate": 7.193941489430206e-06, "loss": 0.012, "step": 8269 }, { "epoch": 3.7625113739763423, "grad_norm": 0.20645219775721993, "learning_rate": 7.1889258621525344e-06, "loss": 0.0078, "step": 8270 }, { "epoch": 3.762966333030027, "grad_norm": 0.2601995559534334, "learning_rate": 7.1839116903154094e-06, "loss": 0.0123, "step": 8271 }, { "epoch": 3.7634212920837125, "grad_norm": 0.31674792033887905, "learning_rate": 7.178898974328563e-06, "loss": 0.016, "step": 8272 }, { "epoch": 3.7638762511373978, "grad_norm": 0.3607976380655891, "learning_rate": 7.173887714601607e-06, "loss": 0.0369, "step": 8273 }, { "epoch": 3.7643312101910826, "grad_norm": 0.693713868296386, "learning_rate": 7.16887791154405e-06, "loss": 0.0234, "step": 8274 }, { "epoch": 3.764786169244768, "grad_norm": 0.401093458217833, "learning_rate": 7.1638695655652596e-06, "loss": 0.0168, "step": 8275 }, { "epoch": 3.7652411282984533, "grad_norm": 0.4488244086149554, "learning_rate": 7.1588626770745095e-06, "loss": 0.0366, "step": 8276 }, { "epoch": 3.765696087352138, "grad_norm": 0.3761315239165739, "learning_rate": 7.1538572464809304e-06, "loss": 0.007, "step": 8277 }, { "epoch": 3.7661510464058234, "grad_norm": 0.3669785936132469, "learning_rate": 7.148853274193537e-06, "loss": 0.013, "step": 8278 }, { "epoch": 3.7666060054595087, "grad_norm": 0.42716209824485957, "learning_rate": 7.143850760621246e-06, "loss": 0.0093, "step": 8279 }, { "epoch": 3.7670609645131936, "grad_norm": 0.2993195150428255, "learning_rate": 7.138849706172834e-06, "loss": 0.023, "step": 8280 }, { "epoch": 3.767515923566879, "grad_norm": 0.25832373418263616, "learning_rate": 7.133850111256965e-06, "loss": 0.0088, "step": 8281 }, { "epoch": 3.7679708826205642, "grad_norm": 0.3219771631908163, "learning_rate": 7.128851976282172e-06, "loss": 0.0177, "step": 8282 }, { "epoch": 3.768425841674249, "grad_norm": 0.30873683899523274, "learning_rate": 7.123855301656893e-06, "loss": 0.0112, "step": 8283 }, { "epoch": 3.7688808007279344, "grad_norm": 0.418533288019418, "learning_rate": 7.118860087789436e-06, "loss": 0.0454, "step": 8284 }, { "epoch": 3.7693357597816197, "grad_norm": 0.26758243475140236, "learning_rate": 7.113866335087982e-06, "loss": 0.0057, "step": 8285 }, { "epoch": 3.7697907188353046, "grad_norm": 0.30416275947906674, "learning_rate": 7.1088740439606e-06, "loss": 0.0159, "step": 8286 }, { "epoch": 3.77024567788899, "grad_norm": 0.33689446307579574, "learning_rate": 7.103883214815227e-06, "loss": 0.0102, "step": 8287 }, { "epoch": 3.770700636942675, "grad_norm": 0.3140234169319836, "learning_rate": 7.098893848059707e-06, "loss": 0.0119, "step": 8288 }, { "epoch": 3.77115559599636, "grad_norm": 0.21652836504767997, "learning_rate": 7.093905944101733e-06, "loss": 0.0091, "step": 8289 }, { "epoch": 3.7716105550500454, "grad_norm": 0.34319654627555746, "learning_rate": 7.088919503348909e-06, "loss": 0.0209, "step": 8290 }, { "epoch": 3.7720655141037307, "grad_norm": 0.253584010871255, "learning_rate": 7.083934526208699e-06, "loss": 0.0132, "step": 8291 }, { "epoch": 3.7725204731574156, "grad_norm": 0.272943376918826, "learning_rate": 7.078951013088444e-06, "loss": 0.0146, "step": 8292 }, { "epoch": 3.772975432211101, "grad_norm": 0.3038878553465418, "learning_rate": 7.073968964395389e-06, "loss": 0.0251, "step": 8293 }, { "epoch": 3.773430391264786, "grad_norm": 0.22680584515253754, "learning_rate": 7.0689883805366345e-06, "loss": 0.0093, "step": 8294 }, { "epoch": 3.7738853503184715, "grad_norm": 0.21663677395653402, "learning_rate": 7.064009261919177e-06, "loss": 0.0076, "step": 8295 }, { "epoch": 3.7743403093721564, "grad_norm": 0.2047512870116336, "learning_rate": 7.059031608949873e-06, "loss": 0.0045, "step": 8296 }, { "epoch": 3.7747952684258417, "grad_norm": 0.5238867150038256, "learning_rate": 7.054055422035488e-06, "loss": 0.0274, "step": 8297 }, { "epoch": 3.775250227479527, "grad_norm": 0.30813476728258415, "learning_rate": 7.049080701582658e-06, "loss": 0.0063, "step": 8298 }, { "epoch": 3.775705186533212, "grad_norm": 0.345106868613781, "learning_rate": 7.044107447997888e-06, "loss": 0.0143, "step": 8299 }, { "epoch": 3.776160145586897, "grad_norm": 0.1654183545545733, "learning_rate": 7.039135661687568e-06, "loss": 0.0071, "step": 8300 }, { "epoch": 3.7766151046405825, "grad_norm": 0.2502534159459541, "learning_rate": 7.0341653430579715e-06, "loss": 0.0131, "step": 8301 }, { "epoch": 3.777070063694268, "grad_norm": 0.29460531685325336, "learning_rate": 7.029196492515244e-06, "loss": 0.0138, "step": 8302 }, { "epoch": 3.7775250227479527, "grad_norm": 0.39771448000669773, "learning_rate": 7.024229110465422e-06, "loss": 0.0194, "step": 8303 }, { "epoch": 3.777979981801638, "grad_norm": 0.18475029111773167, "learning_rate": 7.019263197314427e-06, "loss": 0.0118, "step": 8304 }, { "epoch": 3.7784349408553233, "grad_norm": 0.3410942574617035, "learning_rate": 7.014298753468043e-06, "loss": 0.0242, "step": 8305 }, { "epoch": 3.778889899909008, "grad_norm": 0.16449290445367376, "learning_rate": 7.009335779331944e-06, "loss": 0.0102, "step": 8306 }, { "epoch": 3.7793448589626935, "grad_norm": 0.3713120037274859, "learning_rate": 7.00437427531167e-06, "loss": 0.0115, "step": 8307 }, { "epoch": 3.7797998180163788, "grad_norm": 0.2000412508495656, "learning_rate": 6.999414241812671e-06, "loss": 0.0071, "step": 8308 }, { "epoch": 3.7802547770700636, "grad_norm": 0.27145682897802587, "learning_rate": 6.994455679240253e-06, "loss": 0.0152, "step": 8309 }, { "epoch": 3.780709736123749, "grad_norm": 0.31251355212525905, "learning_rate": 6.9894985879995935e-06, "loss": 0.0153, "step": 8310 }, { "epoch": 3.7811646951774343, "grad_norm": 0.404627777924367, "learning_rate": 6.9845429684957834e-06, "loss": 0.0122, "step": 8311 }, { "epoch": 3.781619654231119, "grad_norm": 0.32498435830274947, "learning_rate": 6.979588821133756e-06, "loss": 0.0104, "step": 8312 }, { "epoch": 3.7820746132848044, "grad_norm": 0.33999833988840555, "learning_rate": 6.974636146318361e-06, "loss": 0.0155, "step": 8313 }, { "epoch": 3.7825295723384897, "grad_norm": 0.3209170475358678, "learning_rate": 6.969684944454297e-06, "loss": 0.0176, "step": 8314 }, { "epoch": 3.7829845313921746, "grad_norm": 0.29023488230170924, "learning_rate": 6.964735215946155e-06, "loss": 0.0124, "step": 8315 }, { "epoch": 3.78343949044586, "grad_norm": 0.38229565918374153, "learning_rate": 6.9597869611983974e-06, "loss": 0.0074, "step": 8316 }, { "epoch": 3.7838944494995452, "grad_norm": 0.198359704012706, "learning_rate": 6.95484018061538e-06, "loss": 0.0076, "step": 8317 }, { "epoch": 3.78434940855323, "grad_norm": 0.30781430588348313, "learning_rate": 6.949894874601337e-06, "loss": 0.0111, "step": 8318 }, { "epoch": 3.7848043676069154, "grad_norm": 0.3503001851935704, "learning_rate": 6.944951043560374e-06, "loss": 0.0081, "step": 8319 }, { "epoch": 3.7852593266606007, "grad_norm": 0.2943388180321233, "learning_rate": 6.940008687896476e-06, "loss": 0.0077, "step": 8320 }, { "epoch": 3.7857142857142856, "grad_norm": 0.27135624330547453, "learning_rate": 6.935067808013501e-06, "loss": 0.0108, "step": 8321 }, { "epoch": 3.786169244767971, "grad_norm": 0.16653270464345363, "learning_rate": 6.930128404315214e-06, "loss": 0.0052, "step": 8322 }, { "epoch": 3.786624203821656, "grad_norm": 0.22287507972407242, "learning_rate": 6.925190477205229e-06, "loss": 0.0065, "step": 8323 }, { "epoch": 3.787079162875341, "grad_norm": 0.23400466865949768, "learning_rate": 6.920254027087048e-06, "loss": 0.0075, "step": 8324 }, { "epoch": 3.7875341219290264, "grad_norm": 0.3009418668126487, "learning_rate": 6.915319054364064e-06, "loss": 0.018, "step": 8325 }, { "epoch": 3.7879890809827117, "grad_norm": 0.6529129387369492, "learning_rate": 6.910385559439534e-06, "loss": 0.0146, "step": 8326 }, { "epoch": 3.7884440400363966, "grad_norm": 0.2439516834522579, "learning_rate": 6.905453542716608e-06, "loss": 0.0078, "step": 8327 }, { "epoch": 3.788898999090082, "grad_norm": 0.41598928153107523, "learning_rate": 6.900523004598305e-06, "loss": 0.0221, "step": 8328 }, { "epoch": 3.789353958143767, "grad_norm": 0.17786265425301243, "learning_rate": 6.895593945487528e-06, "loss": 0.0063, "step": 8329 }, { "epoch": 3.789808917197452, "grad_norm": 0.3269029061256478, "learning_rate": 6.890666365787043e-06, "loss": 0.0163, "step": 8330 }, { "epoch": 3.7902638762511374, "grad_norm": 0.34968423873113247, "learning_rate": 6.885740265899526e-06, "loss": 0.0156, "step": 8331 }, { "epoch": 3.7907188353048227, "grad_norm": 0.22230710892684105, "learning_rate": 6.880815646227518e-06, "loss": 0.0061, "step": 8332 }, { "epoch": 3.7911737943585075, "grad_norm": 0.249734054305219, "learning_rate": 6.875892507173426e-06, "loss": 0.0272, "step": 8333 }, { "epoch": 3.791628753412193, "grad_norm": 0.39872708518356786, "learning_rate": 6.870970849139555e-06, "loss": 0.0177, "step": 8334 }, { "epoch": 3.792083712465878, "grad_norm": 0.31363816257670396, "learning_rate": 6.866050672528073e-06, "loss": 0.008, "step": 8335 }, { "epoch": 3.792538671519563, "grad_norm": 0.3228407664799665, "learning_rate": 6.861131977741034e-06, "loss": 0.0061, "step": 8336 }, { "epoch": 3.7929936305732483, "grad_norm": 0.3398912685430559, "learning_rate": 6.85621476518038e-06, "loss": 0.0103, "step": 8337 }, { "epoch": 3.7934485896269337, "grad_norm": 0.2608179436458172, "learning_rate": 6.851299035247913e-06, "loss": 0.0091, "step": 8338 }, { "epoch": 3.7939035486806185, "grad_norm": 0.361673525079947, "learning_rate": 6.846384788345337e-06, "loss": 0.0112, "step": 8339 }, { "epoch": 3.794358507734304, "grad_norm": 0.38974091343323075, "learning_rate": 6.841472024874212e-06, "loss": 0.0253, "step": 8340 }, { "epoch": 3.794813466787989, "grad_norm": 0.26948586722273477, "learning_rate": 6.836560745235987e-06, "loss": 0.008, "step": 8341 }, { "epoch": 3.795268425841674, "grad_norm": 0.22647316244330806, "learning_rate": 6.831650949831997e-06, "loss": 0.0137, "step": 8342 }, { "epoch": 3.7957233848953593, "grad_norm": 0.2534968076402494, "learning_rate": 6.8267426390634466e-06, "loss": 0.009, "step": 8343 }, { "epoch": 3.7961783439490446, "grad_norm": 0.48029050542854373, "learning_rate": 6.821835813331415e-06, "loss": 0.0117, "step": 8344 }, { "epoch": 3.7966333030027295, "grad_norm": 0.29879722679820525, "learning_rate": 6.816930473036864e-06, "loss": 0.0089, "step": 8345 }, { "epoch": 3.797088262056415, "grad_norm": 0.04695677757158915, "learning_rate": 6.8120266185806385e-06, "loss": 0.0013, "step": 8346 }, { "epoch": 3.7975432211101, "grad_norm": 0.35599047515114923, "learning_rate": 6.8071242503634696e-06, "loss": 0.0152, "step": 8347 }, { "epoch": 3.797998180163785, "grad_norm": 0.3794112565251462, "learning_rate": 6.802223368785951e-06, "loss": 0.0217, "step": 8348 }, { "epoch": 3.7984531392174703, "grad_norm": 0.3739611631301518, "learning_rate": 6.797323974248557e-06, "loss": 0.0164, "step": 8349 }, { "epoch": 3.7989080982711556, "grad_norm": 0.39554140730554593, "learning_rate": 6.792426067151636e-06, "loss": 0.015, "step": 8350 }, { "epoch": 3.799363057324841, "grad_norm": 0.27693489252069053, "learning_rate": 6.787529647895441e-06, "loss": 0.0104, "step": 8351 }, { "epoch": 3.799818016378526, "grad_norm": 0.2543046304281236, "learning_rate": 6.782634716880068e-06, "loss": 0.0131, "step": 8352 }, { "epoch": 3.800272975432211, "grad_norm": 0.27207841594502336, "learning_rate": 6.777741274505525e-06, "loss": 0.0145, "step": 8353 }, { "epoch": 3.8007279344858964, "grad_norm": 0.2983542631508661, "learning_rate": 6.772849321171676e-06, "loss": 0.0128, "step": 8354 }, { "epoch": 3.8011828935395813, "grad_norm": 0.26199683988117395, "learning_rate": 6.767958857278256e-06, "loss": 0.0082, "step": 8355 }, { "epoch": 3.8016378525932666, "grad_norm": 0.538253354374318, "learning_rate": 6.763069883224915e-06, "loss": 0.0231, "step": 8356 }, { "epoch": 3.802092811646952, "grad_norm": 0.2617047210439784, "learning_rate": 6.758182399411142e-06, "loss": 0.0068, "step": 8357 }, { "epoch": 3.802547770700637, "grad_norm": 0.2307185469438786, "learning_rate": 6.753296406236326e-06, "loss": 0.0112, "step": 8358 }, { "epoch": 3.803002729754322, "grad_norm": 0.2811565582076802, "learning_rate": 6.748411904099719e-06, "loss": 0.0087, "step": 8359 }, { "epoch": 3.8034576888080074, "grad_norm": 0.39118490394173894, "learning_rate": 6.743528893400467e-06, "loss": 0.0141, "step": 8360 }, { "epoch": 3.8039126478616927, "grad_norm": 0.2604571071196429, "learning_rate": 6.738647374537596e-06, "loss": 0.0248, "step": 8361 }, { "epoch": 3.8043676069153776, "grad_norm": 0.49760915624799534, "learning_rate": 6.733767347909995e-06, "loss": 0.0235, "step": 8362 }, { "epoch": 3.804822565969063, "grad_norm": 0.39113252444809, "learning_rate": 6.728888813916434e-06, "loss": 0.0212, "step": 8363 }, { "epoch": 3.805277525022748, "grad_norm": 0.29060809477202937, "learning_rate": 6.724011772955563e-06, "loss": 0.0118, "step": 8364 }, { "epoch": 3.805732484076433, "grad_norm": 0.18919319865720863, "learning_rate": 6.719136225425923e-06, "loss": 0.0036, "step": 8365 }, { "epoch": 3.8061874431301184, "grad_norm": 0.2551720244696452, "learning_rate": 6.714262171725905e-06, "loss": 0.0057, "step": 8366 }, { "epoch": 3.8066424021838037, "grad_norm": 0.34333955570434815, "learning_rate": 6.7093896122538165e-06, "loss": 0.0139, "step": 8367 }, { "epoch": 3.8070973612374885, "grad_norm": 0.3287022655858481, "learning_rate": 6.704518547407806e-06, "loss": 0.0132, "step": 8368 }, { "epoch": 3.807552320291174, "grad_norm": 0.29072745100378206, "learning_rate": 6.6996489775859125e-06, "loss": 0.0336, "step": 8369 }, { "epoch": 3.808007279344859, "grad_norm": 0.3652861435858056, "learning_rate": 6.694780903186066e-06, "loss": 0.0209, "step": 8370 }, { "epoch": 3.808462238398544, "grad_norm": 0.39558346265369526, "learning_rate": 6.689914324606061e-06, "loss": 0.0146, "step": 8371 }, { "epoch": 3.8089171974522293, "grad_norm": 0.1429747742250703, "learning_rate": 6.685049242243569e-06, "loss": 0.0105, "step": 8372 }, { "epoch": 3.8093721565059147, "grad_norm": 0.3814511961807319, "learning_rate": 6.680185656496135e-06, "loss": 0.0089, "step": 8373 }, { "epoch": 3.8098271155595995, "grad_norm": 0.2574597161786434, "learning_rate": 6.675323567761205e-06, "loss": 0.0149, "step": 8374 }, { "epoch": 3.810282074613285, "grad_norm": 0.27697327713596503, "learning_rate": 6.670462976436073e-06, "loss": 0.0188, "step": 8375 }, { "epoch": 3.81073703366697, "grad_norm": 0.4047458309458851, "learning_rate": 6.665603882917937e-06, "loss": 0.0285, "step": 8376 }, { "epoch": 3.811191992720655, "grad_norm": 0.44898423707894436, "learning_rate": 6.660746287603856e-06, "loss": 0.0346, "step": 8377 }, { "epoch": 3.8116469517743403, "grad_norm": 0.2976029168885108, "learning_rate": 6.655890190890768e-06, "loss": 0.0122, "step": 8378 }, { "epoch": 3.8121019108280256, "grad_norm": 0.5175791158914546, "learning_rate": 6.651035593175487e-06, "loss": 0.0087, "step": 8379 }, { "epoch": 3.8125568698817105, "grad_norm": 0.2632464180219327, "learning_rate": 6.646182494854711e-06, "loss": 0.0134, "step": 8380 }, { "epoch": 3.813011828935396, "grad_norm": 0.3668890699320141, "learning_rate": 6.6413308963250266e-06, "loss": 0.0123, "step": 8381 }, { "epoch": 3.813466787989081, "grad_norm": 0.21155167028121236, "learning_rate": 6.636480797982872e-06, "loss": 0.0068, "step": 8382 }, { "epoch": 3.813921747042766, "grad_norm": 0.3162315754497914, "learning_rate": 6.63163220022458e-06, "loss": 0.0232, "step": 8383 }, { "epoch": 3.8143767060964513, "grad_norm": 0.5223467206956328, "learning_rate": 6.626785103446345e-06, "loss": 0.003, "step": 8384 }, { "epoch": 3.8148316651501366, "grad_norm": 0.1707804471405375, "learning_rate": 6.621939508044267e-06, "loss": 0.0054, "step": 8385 }, { "epoch": 3.8152866242038215, "grad_norm": 0.3610153834017725, "learning_rate": 6.617095414414295e-06, "loss": 0.0131, "step": 8386 }, { "epoch": 3.815741583257507, "grad_norm": 0.2901930288335942, "learning_rate": 6.612252822952267e-06, "loss": 0.0078, "step": 8387 }, { "epoch": 3.816196542311192, "grad_norm": 0.22842175841953086, "learning_rate": 6.6074117340539035e-06, "loss": 0.0138, "step": 8388 }, { "epoch": 3.816651501364877, "grad_norm": 0.3173238371847938, "learning_rate": 6.602572148114786e-06, "loss": 0.0103, "step": 8389 }, { "epoch": 3.8171064604185623, "grad_norm": 0.2541662268878239, "learning_rate": 6.597734065530397e-06, "loss": 0.0116, "step": 8390 }, { "epoch": 3.8175614194722476, "grad_norm": 0.4705455255911208, "learning_rate": 6.5928974866960785e-06, "loss": 0.0308, "step": 8391 }, { "epoch": 3.8180163785259325, "grad_norm": 0.41566801765945455, "learning_rate": 6.588062412007051e-06, "loss": 0.0096, "step": 8392 }, { "epoch": 3.8184713375796178, "grad_norm": 0.2309529745778776, "learning_rate": 6.583228841858408e-06, "loss": 0.0115, "step": 8393 }, { "epoch": 3.818926296633303, "grad_norm": 0.29214222820616453, "learning_rate": 6.578396776645135e-06, "loss": 0.0259, "step": 8394 }, { "epoch": 3.819381255686988, "grad_norm": 0.2248995847685552, "learning_rate": 6.573566216762092e-06, "loss": 0.0039, "step": 8395 }, { "epoch": 3.8198362147406733, "grad_norm": 0.27301834487309884, "learning_rate": 6.568737162604005e-06, "loss": 0.0112, "step": 8396 }, { "epoch": 3.8202911737943586, "grad_norm": 0.4267512002266989, "learning_rate": 6.563909614565483e-06, "loss": 0.0248, "step": 8397 }, { "epoch": 3.8207461328480434, "grad_norm": 0.3235511825705231, "learning_rate": 6.559083573041002e-06, "loss": 0.0159, "step": 8398 }, { "epoch": 3.8212010919017287, "grad_norm": 0.2893897349212095, "learning_rate": 6.554259038424943e-06, "loss": 0.0116, "step": 8399 }, { "epoch": 3.821656050955414, "grad_norm": 0.5566042639378462, "learning_rate": 6.5494360111115345e-06, "loss": 0.0425, "step": 8400 }, { "epoch": 3.822111010009099, "grad_norm": 0.19910199645740895, "learning_rate": 6.544614491494885e-06, "loss": 0.0059, "step": 8401 }, { "epoch": 3.8225659690627842, "grad_norm": 0.262398653226174, "learning_rate": 6.539794479969003e-06, "loss": 0.0065, "step": 8402 }, { "epoch": 3.8230209281164695, "grad_norm": 0.18659739916321116, "learning_rate": 6.534975976927743e-06, "loss": 0.006, "step": 8403 }, { "epoch": 3.823475887170155, "grad_norm": 0.22495377033741046, "learning_rate": 6.530158982764867e-06, "loss": 0.0102, "step": 8404 }, { "epoch": 3.8239308462238397, "grad_norm": 0.6420496217138264, "learning_rate": 6.5253434978739895e-06, "loss": 0.0325, "step": 8405 }, { "epoch": 3.824385805277525, "grad_norm": 0.3016941267469592, "learning_rate": 6.520529522648608e-06, "loss": 0.0056, "step": 8406 }, { "epoch": 3.8248407643312103, "grad_norm": 0.37884438134980786, "learning_rate": 6.5157170574821046e-06, "loss": 0.0167, "step": 8407 }, { "epoch": 3.825295723384895, "grad_norm": 0.2768928203826967, "learning_rate": 6.510906102767722e-06, "loss": 0.0094, "step": 8408 }, { "epoch": 3.8257506824385805, "grad_norm": 0.266504654954554, "learning_rate": 6.506096658898594e-06, "loss": 0.0092, "step": 8409 }, { "epoch": 3.826205641492266, "grad_norm": 0.33913638399328405, "learning_rate": 6.501288726267738e-06, "loss": 0.017, "step": 8410 }, { "epoch": 3.826660600545951, "grad_norm": 0.3022974192284911, "learning_rate": 6.496482305268028e-06, "loss": 0.0175, "step": 8411 }, { "epoch": 3.827115559599636, "grad_norm": 0.2929885384716407, "learning_rate": 6.491677396292223e-06, "loss": 0.0303, "step": 8412 }, { "epoch": 3.8275705186533213, "grad_norm": 0.28337517845194854, "learning_rate": 6.486873999732951e-06, "loss": 0.0103, "step": 8413 }, { "epoch": 3.8280254777070066, "grad_norm": 0.43669012294157833, "learning_rate": 6.482072115982738e-06, "loss": 0.0187, "step": 8414 }, { "epoch": 3.8284804367606915, "grad_norm": 0.21970918503146392, "learning_rate": 6.477271745433958e-06, "loss": 0.0094, "step": 8415 }, { "epoch": 3.828935395814377, "grad_norm": 0.2948406314136066, "learning_rate": 6.47247288847889e-06, "loss": 0.0218, "step": 8416 }, { "epoch": 3.829390354868062, "grad_norm": 0.1944472501728174, "learning_rate": 6.467675545509669e-06, "loss": 0.0095, "step": 8417 }, { "epoch": 3.829845313921747, "grad_norm": 0.24739864699261652, "learning_rate": 6.4628797169183016e-06, "loss": 0.0071, "step": 8418 }, { "epoch": 3.8303002729754323, "grad_norm": 0.31819570527481844, "learning_rate": 6.4580854030967e-06, "loss": 0.0155, "step": 8419 }, { "epoch": 3.8307552320291176, "grad_norm": 0.3060516177482685, "learning_rate": 6.4532926044366255e-06, "loss": 0.0109, "step": 8420 }, { "epoch": 3.8312101910828025, "grad_norm": 0.23754472464622242, "learning_rate": 6.448501321329722e-06, "loss": 0.0064, "step": 8421 }, { "epoch": 3.831665150136488, "grad_norm": 0.24278041985960447, "learning_rate": 6.443711554167506e-06, "loss": 0.006, "step": 8422 }, { "epoch": 3.832120109190173, "grad_norm": 0.5859702092115721, "learning_rate": 6.438923303341382e-06, "loss": 0.0196, "step": 8423 }, { "epoch": 3.832575068243858, "grad_norm": 0.3376566920025962, "learning_rate": 6.4341365692426325e-06, "loss": 0.013, "step": 8424 }, { "epoch": 3.8330300272975433, "grad_norm": 0.3563869196365148, "learning_rate": 6.4293513522624e-06, "loss": 0.0213, "step": 8425 }, { "epoch": 3.8334849863512286, "grad_norm": 0.1500712193426035, "learning_rate": 6.42456765279171e-06, "loss": 0.003, "step": 8426 }, { "epoch": 3.8339399454049135, "grad_norm": 0.17122653165155502, "learning_rate": 6.4197854712214596e-06, "loss": 0.0027, "step": 8427 }, { "epoch": 3.8343949044585988, "grad_norm": 0.3630747256966057, "learning_rate": 6.415004807942437e-06, "loss": 0.0175, "step": 8428 }, { "epoch": 3.834849863512284, "grad_norm": 0.36890943991222586, "learning_rate": 6.4102256633452876e-06, "loss": 0.0152, "step": 8429 }, { "epoch": 3.835304822565969, "grad_norm": 0.36823218037105726, "learning_rate": 6.405448037820552e-06, "loss": 0.0111, "step": 8430 }, { "epoch": 3.8357597816196543, "grad_norm": 0.33523300332986444, "learning_rate": 6.4006719317586335e-06, "loss": 0.0151, "step": 8431 }, { "epoch": 3.8362147406733396, "grad_norm": 0.4383323304582857, "learning_rate": 6.395897345549801e-06, "loss": 0.0156, "step": 8432 }, { "epoch": 3.8366696997270244, "grad_norm": 0.36710145630127905, "learning_rate": 6.391124279584229e-06, "loss": 0.0162, "step": 8433 }, { "epoch": 3.8371246587807097, "grad_norm": 0.2815449262795785, "learning_rate": 6.386352734251946e-06, "loss": 0.0156, "step": 8434 }, { "epoch": 3.837579617834395, "grad_norm": 0.3357123412484143, "learning_rate": 6.381582709942857e-06, "loss": 0.0193, "step": 8435 }, { "epoch": 3.83803457688808, "grad_norm": 0.456156560962256, "learning_rate": 6.376814207046744e-06, "loss": 0.0374, "step": 8436 }, { "epoch": 3.8384895359417652, "grad_norm": 0.22544645622476223, "learning_rate": 6.37204722595327e-06, "loss": 0.0081, "step": 8437 }, { "epoch": 3.8389444949954505, "grad_norm": 0.27184866541294983, "learning_rate": 6.367281767051983e-06, "loss": 0.0082, "step": 8438 }, { "epoch": 3.8393994540491354, "grad_norm": 0.38048308812791826, "learning_rate": 6.362517830732284e-06, "loss": 0.013, "step": 8439 }, { "epoch": 3.8398544131028207, "grad_norm": 0.31117393708905583, "learning_rate": 6.357755417383463e-06, "loss": 0.0254, "step": 8440 }, { "epoch": 3.840309372156506, "grad_norm": 0.23456606707361885, "learning_rate": 6.3529945273946785e-06, "loss": 0.0081, "step": 8441 }, { "epoch": 3.840764331210191, "grad_norm": 0.43607227917148567, "learning_rate": 6.348235161154969e-06, "loss": 0.0228, "step": 8442 }, { "epoch": 3.841219290263876, "grad_norm": 0.3670890134524167, "learning_rate": 6.343477319053248e-06, "loss": 0.0354, "step": 8443 }, { "epoch": 3.8416742493175615, "grad_norm": 0.42464004240041653, "learning_rate": 6.338721001478318e-06, "loss": 0.0063, "step": 8444 }, { "epoch": 3.8421292083712464, "grad_norm": 0.1718672168446185, "learning_rate": 6.333966208818834e-06, "loss": 0.0058, "step": 8445 }, { "epoch": 3.8425841674249317, "grad_norm": 0.35457708484164696, "learning_rate": 6.329212941463336e-06, "loss": 0.0283, "step": 8446 }, { "epoch": 3.843039126478617, "grad_norm": 0.41782768504860096, "learning_rate": 6.324461199800233e-06, "loss": 0.0172, "step": 8447 }, { "epoch": 3.843494085532302, "grad_norm": 0.36546290931014974, "learning_rate": 6.319710984217828e-06, "loss": 0.0176, "step": 8448 }, { "epoch": 3.843949044585987, "grad_norm": 0.23382719303553673, "learning_rate": 6.314962295104285e-06, "loss": 0.0094, "step": 8449 }, { "epoch": 3.8444040036396725, "grad_norm": 0.39933055012273344, "learning_rate": 6.310215132847633e-06, "loss": 0.0277, "step": 8450 }, { "epoch": 3.8448589626933574, "grad_norm": 0.4900719253597927, "learning_rate": 6.305469497835803e-06, "loss": 0.0375, "step": 8451 }, { "epoch": 3.8453139217470427, "grad_norm": 0.27748512371678413, "learning_rate": 6.30072539045658e-06, "loss": 0.0083, "step": 8452 }, { "epoch": 3.845768880800728, "grad_norm": 0.3149057217511301, "learning_rate": 6.295982811097637e-06, "loss": 0.0264, "step": 8453 }, { "epoch": 3.846223839854413, "grad_norm": 0.2735164482456725, "learning_rate": 6.291241760146513e-06, "loss": 0.0139, "step": 8454 }, { "epoch": 3.846678798908098, "grad_norm": 0.2890537697067112, "learning_rate": 6.286502237990621e-06, "loss": 0.0144, "step": 8455 }, { "epoch": 3.8471337579617835, "grad_norm": 0.3443456910498896, "learning_rate": 6.281764245017254e-06, "loss": 0.0109, "step": 8456 }, { "epoch": 3.8475887170154683, "grad_norm": 0.2492363041669404, "learning_rate": 6.2770277816135814e-06, "loss": 0.0101, "step": 8457 }, { "epoch": 3.8480436760691537, "grad_norm": 0.30248739691747034, "learning_rate": 6.272292848166652e-06, "loss": 0.013, "step": 8458 }, { "epoch": 3.848498635122839, "grad_norm": 0.2582429681383806, "learning_rate": 6.2675594450633785e-06, "loss": 0.0082, "step": 8459 }, { "epoch": 3.8489535941765243, "grad_norm": 0.19252240217180078, "learning_rate": 6.262827572690552e-06, "loss": 0.0174, "step": 8460 }, { "epoch": 3.849408553230209, "grad_norm": 0.26322991761447134, "learning_rate": 6.258097231434831e-06, "loss": 0.0056, "step": 8461 }, { "epoch": 3.8498635122838945, "grad_norm": 0.3191715205582762, "learning_rate": 6.253368421682776e-06, "loss": 0.0123, "step": 8462 }, { "epoch": 3.8503184713375798, "grad_norm": 0.2906352362127367, "learning_rate": 6.248641143820794e-06, "loss": 0.0178, "step": 8463 }, { "epoch": 3.8507734303912646, "grad_norm": 0.46074914222804564, "learning_rate": 6.24391539823517e-06, "loss": 0.0194, "step": 8464 }, { "epoch": 3.85122838944495, "grad_norm": 0.4049676620784657, "learning_rate": 6.239191185312085e-06, "loss": 0.0138, "step": 8465 }, { "epoch": 3.8516833484986353, "grad_norm": 0.22065442229367288, "learning_rate": 6.234468505437566e-06, "loss": 0.0108, "step": 8466 }, { "epoch": 3.8521383075523206, "grad_norm": 0.2644674065919637, "learning_rate": 6.229747358997542e-06, "loss": 0.0059, "step": 8467 }, { "epoch": 3.8525932666060054, "grad_norm": 0.25333848322093644, "learning_rate": 6.2250277463778014e-06, "loss": 0.0071, "step": 8468 }, { "epoch": 3.8530482256596907, "grad_norm": 0.551614676114712, "learning_rate": 6.220309667964006e-06, "loss": 0.027, "step": 8469 }, { "epoch": 3.853503184713376, "grad_norm": 0.6816210469568231, "learning_rate": 6.215593124141686e-06, "loss": 0.026, "step": 8470 }, { "epoch": 3.853958143767061, "grad_norm": 0.2513431532844331, "learning_rate": 6.210878115296267e-06, "loss": 0.0081, "step": 8471 }, { "epoch": 3.8544131028207462, "grad_norm": 0.28487761263221634, "learning_rate": 6.206164641813047e-06, "loss": 0.0071, "step": 8472 }, { "epoch": 3.8548680618744315, "grad_norm": 0.3510301643347241, "learning_rate": 6.201452704077179e-06, "loss": 0.0281, "step": 8473 }, { "epoch": 3.8553230209281164, "grad_norm": 0.13767249653939775, "learning_rate": 6.196742302473701e-06, "loss": 0.0034, "step": 8474 }, { "epoch": 3.8557779799818017, "grad_norm": 0.21082491383059815, "learning_rate": 6.192033437387523e-06, "loss": 0.004, "step": 8475 }, { "epoch": 3.856232939035487, "grad_norm": 0.2479689792841913, "learning_rate": 6.187326109203443e-06, "loss": 0.0076, "step": 8476 }, { "epoch": 3.856687898089172, "grad_norm": 0.4010075722526762, "learning_rate": 6.182620318306115e-06, "loss": 0.0093, "step": 8477 }, { "epoch": 3.857142857142857, "grad_norm": 0.3676334630051685, "learning_rate": 6.1779160650800665e-06, "loss": 0.0132, "step": 8478 }, { "epoch": 3.8575978161965425, "grad_norm": 0.2604908741173147, "learning_rate": 6.173213349909729e-06, "loss": 0.0163, "step": 8479 }, { "epoch": 3.8580527752502274, "grad_norm": 0.35838150086943743, "learning_rate": 6.168512173179372e-06, "loss": 0.0242, "step": 8480 }, { "epoch": 3.8585077343039127, "grad_norm": 0.2756182826342139, "learning_rate": 6.163812535273153e-06, "loss": 0.0059, "step": 8481 }, { "epoch": 3.858962693357598, "grad_norm": 0.2968264529857887, "learning_rate": 6.159114436575117e-06, "loss": 0.0118, "step": 8482 }, { "epoch": 3.859417652411283, "grad_norm": 0.156854434150484, "learning_rate": 6.154417877469165e-06, "loss": 0.0043, "step": 8483 }, { "epoch": 3.859872611464968, "grad_norm": 0.628112344880025, "learning_rate": 6.149722858339077e-06, "loss": 0.0091, "step": 8484 }, { "epoch": 3.8603275705186535, "grad_norm": 0.22097336344767307, "learning_rate": 6.1450293795685035e-06, "loss": 0.0071, "step": 8485 }, { "epoch": 3.8607825295723384, "grad_norm": 0.11723444831453841, "learning_rate": 6.1403374415409805e-06, "loss": 0.0035, "step": 8486 }, { "epoch": 3.8612374886260237, "grad_norm": 0.40609966967540084, "learning_rate": 6.135647044639919e-06, "loss": 0.0275, "step": 8487 }, { "epoch": 3.861692447679709, "grad_norm": 0.21792729394984173, "learning_rate": 6.130958189248592e-06, "loss": 0.0091, "step": 8488 }, { "epoch": 3.862147406733394, "grad_norm": 0.5249422903485289, "learning_rate": 6.126270875750148e-06, "loss": 0.0193, "step": 8489 }, { "epoch": 3.862602365787079, "grad_norm": 0.31340165937423253, "learning_rate": 6.121585104527608e-06, "loss": 0.0107, "step": 8490 }, { "epoch": 3.8630573248407645, "grad_norm": 0.266778783258165, "learning_rate": 6.116900875963888e-06, "loss": 0.0163, "step": 8491 }, { "epoch": 3.8635122838944493, "grad_norm": 0.32484128619482255, "learning_rate": 6.112218190441746e-06, "loss": 0.0183, "step": 8492 }, { "epoch": 3.8639672429481347, "grad_norm": 0.30281584951393353, "learning_rate": 6.107537048343842e-06, "loss": 0.0123, "step": 8493 }, { "epoch": 3.86442220200182, "grad_norm": 0.37712305531768797, "learning_rate": 6.102857450052693e-06, "loss": 0.0223, "step": 8494 }, { "epoch": 3.864877161055505, "grad_norm": 0.37831274492071676, "learning_rate": 6.09817939595069e-06, "loss": 0.0106, "step": 8495 }, { "epoch": 3.86533212010919, "grad_norm": 0.2445612045312461, "learning_rate": 6.09350288642011e-06, "loss": 0.0121, "step": 8496 }, { "epoch": 3.8657870791628755, "grad_norm": 0.34072345279569277, "learning_rate": 6.088827921843096e-06, "loss": 0.0213, "step": 8497 }, { "epoch": 3.8662420382165603, "grad_norm": 0.2325494790367573, "learning_rate": 6.084154502601661e-06, "loss": 0.0137, "step": 8498 }, { "epoch": 3.8666969972702456, "grad_norm": 0.4058933605010855, "learning_rate": 6.07948262907769e-06, "loss": 0.0119, "step": 8499 }, { "epoch": 3.867151956323931, "grad_norm": 0.3457995948634717, "learning_rate": 6.074812301652954e-06, "loss": 0.0188, "step": 8500 }, { "epoch": 3.867606915377616, "grad_norm": 0.3429388029892456, "learning_rate": 6.070143520709101e-06, "loss": 0.0232, "step": 8501 }, { "epoch": 3.868061874431301, "grad_norm": 0.25705114001220275, "learning_rate": 6.065476286627631e-06, "loss": 0.0126, "step": 8502 }, { "epoch": 3.8685168334849864, "grad_norm": 0.34277016804749944, "learning_rate": 6.060810599789929e-06, "loss": 0.0108, "step": 8503 }, { "epoch": 3.8689717925386713, "grad_norm": 0.2335324014724469, "learning_rate": 6.056146460577253e-06, "loss": 0.0107, "step": 8504 }, { "epoch": 3.8694267515923566, "grad_norm": 0.3349378135283835, "learning_rate": 6.051483869370744e-06, "loss": 0.0206, "step": 8505 }, { "epoch": 3.869881710646042, "grad_norm": 0.26722649825486106, "learning_rate": 6.046822826551393e-06, "loss": 0.0089, "step": 8506 }, { "epoch": 3.870336669699727, "grad_norm": 0.4096491156183259, "learning_rate": 6.0421633325001e-06, "loss": 0.0161, "step": 8507 }, { "epoch": 3.870791628753412, "grad_norm": 0.3478667291259506, "learning_rate": 6.0375053875976025e-06, "loss": 0.0185, "step": 8508 }, { "epoch": 3.8712465878070974, "grad_norm": 0.2674627025642548, "learning_rate": 6.032848992224527e-06, "loss": 0.0126, "step": 8509 }, { "epoch": 3.8717015468607823, "grad_norm": 0.31648644429789685, "learning_rate": 6.028194146761384e-06, "loss": 0.0111, "step": 8510 }, { "epoch": 3.8721565059144676, "grad_norm": 0.37480892164338026, "learning_rate": 6.023540851588538e-06, "loss": 0.0196, "step": 8511 }, { "epoch": 3.872611464968153, "grad_norm": 0.34174348171885444, "learning_rate": 6.018889107086237e-06, "loss": 0.0171, "step": 8512 }, { "epoch": 3.8730664240218378, "grad_norm": 0.2114178234413295, "learning_rate": 6.014238913634593e-06, "loss": 0.0092, "step": 8513 }, { "epoch": 3.873521383075523, "grad_norm": 0.20762137012698473, "learning_rate": 6.009590271613608e-06, "loss": 0.0126, "step": 8514 }, { "epoch": 3.8739763421292084, "grad_norm": 0.2131134798661138, "learning_rate": 6.00494318140315e-06, "loss": 0.0114, "step": 8515 }, { "epoch": 3.8744313011828937, "grad_norm": 0.06265044200764931, "learning_rate": 6.000297643382957e-06, "loss": 0.001, "step": 8516 }, { "epoch": 3.8748862602365786, "grad_norm": 0.2787326375033081, "learning_rate": 5.9956536579326374e-06, "loss": 0.0219, "step": 8517 }, { "epoch": 3.875341219290264, "grad_norm": 0.2945316957640929, "learning_rate": 5.991011225431678e-06, "loss": 0.0134, "step": 8518 }, { "epoch": 3.875796178343949, "grad_norm": 0.36547568231062305, "learning_rate": 5.986370346259429e-06, "loss": 0.0166, "step": 8519 }, { "epoch": 3.876251137397634, "grad_norm": 0.291585400589764, "learning_rate": 5.981731020795131e-06, "loss": 0.0102, "step": 8520 }, { "epoch": 3.8767060964513194, "grad_norm": 0.43906512283387233, "learning_rate": 5.9770932494178976e-06, "loss": 0.0432, "step": 8521 }, { "epoch": 3.8771610555050047, "grad_norm": 0.34947429206909925, "learning_rate": 5.972457032506695e-06, "loss": 0.0124, "step": 8522 }, { "epoch": 3.87761601455869, "grad_norm": 0.05063202864770909, "learning_rate": 5.967822370440379e-06, "loss": 0.0013, "step": 8523 }, { "epoch": 3.878070973612375, "grad_norm": 0.46617711637560777, "learning_rate": 5.96318926359766e-06, "loss": 0.0151, "step": 8524 }, { "epoch": 3.87852593266606, "grad_norm": 0.36503084268845076, "learning_rate": 5.958557712357151e-06, "loss": 0.0119, "step": 8525 }, { "epoch": 3.8789808917197455, "grad_norm": 0.4639846229836878, "learning_rate": 5.9539277170973185e-06, "loss": 0.0242, "step": 8526 }, { "epoch": 3.8794358507734303, "grad_norm": 0.14186477517474497, "learning_rate": 5.949299278196493e-06, "loss": 0.0035, "step": 8527 }, { "epoch": 3.8798908098271156, "grad_norm": 0.484435960170278, "learning_rate": 5.9446723960329075e-06, "loss": 0.024, "step": 8528 }, { "epoch": 3.880345768880801, "grad_norm": 0.2984325477841494, "learning_rate": 5.940047070984631e-06, "loss": 0.0081, "step": 8529 }, { "epoch": 3.880800727934486, "grad_norm": 0.2538521769254018, "learning_rate": 5.935423303429643e-06, "loss": 0.0088, "step": 8530 }, { "epoch": 3.881255686988171, "grad_norm": 0.2110109029138733, "learning_rate": 5.9308010937457664e-06, "loss": 0.0073, "step": 8531 }, { "epoch": 3.8817106460418564, "grad_norm": 0.2665454141739439, "learning_rate": 5.926180442310708e-06, "loss": 0.0325, "step": 8532 }, { "epoch": 3.8821656050955413, "grad_norm": 0.4099096795822854, "learning_rate": 5.921561349502042e-06, "loss": 0.0124, "step": 8533 }, { "epoch": 3.8826205641492266, "grad_norm": 0.26047308387701595, "learning_rate": 5.916943815697224e-06, "loss": 0.0124, "step": 8534 }, { "epoch": 3.883075523202912, "grad_norm": 0.6606952354629013, "learning_rate": 5.912327841273588e-06, "loss": 0.038, "step": 8535 }, { "epoch": 3.883530482256597, "grad_norm": 0.416985080335716, "learning_rate": 5.907713426608319e-06, "loss": 0.0072, "step": 8536 }, { "epoch": 3.883985441310282, "grad_norm": 0.2634812862826541, "learning_rate": 5.9031005720784905e-06, "loss": 0.0105, "step": 8537 }, { "epoch": 3.8844404003639674, "grad_norm": 0.3547765364161437, "learning_rate": 5.898489278061034e-06, "loss": 0.0224, "step": 8538 }, { "epoch": 3.8848953594176523, "grad_norm": 0.3240667584874476, "learning_rate": 5.893879544932779e-06, "loss": 0.0086, "step": 8539 }, { "epoch": 3.8853503184713376, "grad_norm": 0.34156080423496116, "learning_rate": 5.889271373070407e-06, "loss": 0.0271, "step": 8540 }, { "epoch": 3.885805277525023, "grad_norm": 0.5117666891422197, "learning_rate": 5.884664762850467e-06, "loss": 0.0157, "step": 8541 }, { "epoch": 3.886260236578708, "grad_norm": 0.31379822900327187, "learning_rate": 5.880059714649405e-06, "loss": 0.0114, "step": 8542 }, { "epoch": 3.886715195632393, "grad_norm": 0.1672197909168145, "learning_rate": 5.875456228843513e-06, "loss": 0.0076, "step": 8543 }, { "epoch": 3.8871701546860784, "grad_norm": 0.25913487029365645, "learning_rate": 5.870854305808976e-06, "loss": 0.0079, "step": 8544 }, { "epoch": 3.8876251137397633, "grad_norm": 0.1897456683923431, "learning_rate": 5.8662539459218405e-06, "loss": 0.0074, "step": 8545 }, { "epoch": 3.8880800727934486, "grad_norm": 0.2575941889086302, "learning_rate": 5.861655149558026e-06, "loss": 0.0087, "step": 8546 }, { "epoch": 3.888535031847134, "grad_norm": 0.2725543933016509, "learning_rate": 5.857057917093323e-06, "loss": 0.0117, "step": 8547 }, { "epoch": 3.8889899909008188, "grad_norm": 0.26939140576359544, "learning_rate": 5.852462248903387e-06, "loss": 0.0158, "step": 8548 }, { "epoch": 3.889444949954504, "grad_norm": 0.2639710846878539, "learning_rate": 5.847868145363777e-06, "loss": 0.0033, "step": 8549 }, { "epoch": 3.8898999090081894, "grad_norm": 0.18100025452015184, "learning_rate": 5.843275606849894e-06, "loss": 0.0048, "step": 8550 }, { "epoch": 3.8903548680618742, "grad_norm": 0.27044949949716157, "learning_rate": 5.838684633737018e-06, "loss": 0.0108, "step": 8551 }, { "epoch": 3.8908098271155596, "grad_norm": 0.24877454681878888, "learning_rate": 5.834095226400302e-06, "loss": 0.0082, "step": 8552 }, { "epoch": 3.891264786169245, "grad_norm": 0.3580924575924344, "learning_rate": 5.829507385214763e-06, "loss": 0.02, "step": 8553 }, { "epoch": 3.8917197452229297, "grad_norm": 0.346631233079056, "learning_rate": 5.824921110555314e-06, "loss": 0.0268, "step": 8554 }, { "epoch": 3.892174704276615, "grad_norm": 0.19884502922461944, "learning_rate": 5.820336402796711e-06, "loss": 0.0062, "step": 8555 }, { "epoch": 3.8926296633303004, "grad_norm": 0.3179763373766143, "learning_rate": 5.815753262313611e-06, "loss": 0.0137, "step": 8556 }, { "epoch": 3.8930846223839852, "grad_norm": 0.2697122118991234, "learning_rate": 5.811171689480519e-06, "loss": 0.019, "step": 8557 }, { "epoch": 3.8935395814376705, "grad_norm": 0.32977424383025533, "learning_rate": 5.8065916846718135e-06, "loss": 0.0135, "step": 8558 }, { "epoch": 3.893994540491356, "grad_norm": 0.2884524421252767, "learning_rate": 5.802013248261767e-06, "loss": 0.0086, "step": 8559 }, { "epoch": 3.8944494995450407, "grad_norm": 0.2711299020509251, "learning_rate": 5.797436380624499e-06, "loss": 0.0091, "step": 8560 }, { "epoch": 3.894904458598726, "grad_norm": 0.17425232089083922, "learning_rate": 5.79286108213401e-06, "loss": 0.0064, "step": 8561 }, { "epoch": 3.8953594176524113, "grad_norm": 0.30550528553795375, "learning_rate": 5.7882873531641705e-06, "loss": 0.0075, "step": 8562 }, { "epoch": 3.895814376706096, "grad_norm": 0.2523361345631705, "learning_rate": 5.783715194088729e-06, "loss": 0.0058, "step": 8563 }, { "epoch": 3.8962693357597815, "grad_norm": 0.30715625651475453, "learning_rate": 5.779144605281308e-06, "loss": 0.0214, "step": 8564 }, { "epoch": 3.896724294813467, "grad_norm": 0.31328474168076675, "learning_rate": 5.774575587115388e-06, "loss": 0.0192, "step": 8565 }, { "epoch": 3.8971792538671517, "grad_norm": 0.9467669075145108, "learning_rate": 5.770008139964333e-06, "loss": 0.0713, "step": 8566 }, { "epoch": 3.897634212920837, "grad_norm": 0.2705727947135598, "learning_rate": 5.7654422642013625e-06, "loss": 0.0121, "step": 8567 }, { "epoch": 3.8980891719745223, "grad_norm": 0.4515332392519041, "learning_rate": 5.7608779601995955e-06, "loss": 0.031, "step": 8568 }, { "epoch": 3.8985441310282076, "grad_norm": 0.3274704894851274, "learning_rate": 5.756315228331988e-06, "loss": 0.0241, "step": 8569 }, { "epoch": 3.8989990900818925, "grad_norm": 0.4072535689241111, "learning_rate": 5.751754068971407e-06, "loss": 0.0137, "step": 8570 }, { "epoch": 3.899454049135578, "grad_norm": 0.5657765394716888, "learning_rate": 5.747194482490559e-06, "loss": 0.0295, "step": 8571 }, { "epoch": 3.899909008189263, "grad_norm": 0.39284815229209585, "learning_rate": 5.742636469262022e-06, "loss": 0.0169, "step": 8572 }, { "epoch": 3.900363967242948, "grad_norm": 0.3689086228659615, "learning_rate": 5.73808002965828e-06, "loss": 0.0312, "step": 8573 }, { "epoch": 3.9008189262966333, "grad_norm": 0.43853761281237635, "learning_rate": 5.733525164051648e-06, "loss": 0.0182, "step": 8574 }, { "epoch": 3.9012738853503186, "grad_norm": 0.6164924240794954, "learning_rate": 5.728971872814334e-06, "loss": 0.0292, "step": 8575 }, { "epoch": 3.901728844404004, "grad_norm": 0.14109207456624923, "learning_rate": 5.724420156318405e-06, "loss": 0.004, "step": 8576 }, { "epoch": 3.902183803457689, "grad_norm": 0.47417371756514914, "learning_rate": 5.71987001493581e-06, "loss": 0.0183, "step": 8577 }, { "epoch": 3.902638762511374, "grad_norm": 0.3020348590384773, "learning_rate": 5.71532144903838e-06, "loss": 0.012, "step": 8578 }, { "epoch": 3.9030937215650594, "grad_norm": 0.3070563877459852, "learning_rate": 5.710774458997792e-06, "loss": 0.014, "step": 8579 }, { "epoch": 3.9035486806187443, "grad_norm": 0.22052054802379184, "learning_rate": 5.706229045185604e-06, "loss": 0.0088, "step": 8580 }, { "epoch": 3.9040036396724296, "grad_norm": 0.2909700785858464, "learning_rate": 5.701685207973243e-06, "loss": 0.0108, "step": 8581 }, { "epoch": 3.904458598726115, "grad_norm": 0.43942260975313546, "learning_rate": 5.69714294773202e-06, "loss": 0.0298, "step": 8582 }, { "epoch": 3.9049135577797998, "grad_norm": 0.5078055478036392, "learning_rate": 5.692602264833103e-06, "loss": 0.0299, "step": 8583 }, { "epoch": 3.905368516833485, "grad_norm": 0.20210788333489682, "learning_rate": 5.6880631596475386e-06, "loss": 0.0116, "step": 8584 }, { "epoch": 3.9058234758871704, "grad_norm": 0.2288639528750357, "learning_rate": 5.683525632546244e-06, "loss": 0.0071, "step": 8585 }, { "epoch": 3.9062784349408552, "grad_norm": 0.2870182597503876, "learning_rate": 5.6789896839000014e-06, "loss": 0.018, "step": 8586 }, { "epoch": 3.9067333939945406, "grad_norm": 0.2380479157805101, "learning_rate": 5.6744553140794635e-06, "loss": 0.0066, "step": 8587 }, { "epoch": 3.907188353048226, "grad_norm": 0.5094417884209613, "learning_rate": 5.66992252345517e-06, "loss": 0.0237, "step": 8588 }, { "epoch": 3.9076433121019107, "grad_norm": 0.24351499328080173, "learning_rate": 5.665391312397514e-06, "loss": 0.0048, "step": 8589 }, { "epoch": 3.908098271155596, "grad_norm": 0.37906937338242597, "learning_rate": 5.6608616812767575e-06, "loss": 0.0155, "step": 8590 }, { "epoch": 3.9085532302092814, "grad_norm": 0.34453609456725115, "learning_rate": 5.656333630463059e-06, "loss": 0.0083, "step": 8591 }, { "epoch": 3.9090081892629662, "grad_norm": 0.5214536985202494, "learning_rate": 5.651807160326414e-06, "loss": 0.0296, "step": 8592 }, { "epoch": 3.9094631483166515, "grad_norm": 0.3737643546533609, "learning_rate": 5.647282271236718e-06, "loss": 0.0186, "step": 8593 }, { "epoch": 3.909918107370337, "grad_norm": 0.2519253091890226, "learning_rate": 5.642758963563718e-06, "loss": 0.0081, "step": 8594 }, { "epoch": 3.9103730664240217, "grad_norm": 0.39042882967635517, "learning_rate": 5.6382372376770385e-06, "loss": 0.0107, "step": 8595 }, { "epoch": 3.910828025477707, "grad_norm": 0.1606956496800005, "learning_rate": 5.63371709394617e-06, "loss": 0.0041, "step": 8596 }, { "epoch": 3.9112829845313923, "grad_norm": 0.11569664136481839, "learning_rate": 5.629198532740482e-06, "loss": 0.0022, "step": 8597 }, { "epoch": 3.911737943585077, "grad_norm": 0.2630102442623265, "learning_rate": 5.62468155442922e-06, "loss": 0.0145, "step": 8598 }, { "epoch": 3.9121929026387625, "grad_norm": 0.19184195847832294, "learning_rate": 5.620166159381482e-06, "loss": 0.0068, "step": 8599 }, { "epoch": 3.912647861692448, "grad_norm": 0.10865367270298706, "learning_rate": 5.615652347966246e-06, "loss": 0.0027, "step": 8600 }, { "epoch": 3.9131028207461327, "grad_norm": 0.22221948548878406, "learning_rate": 5.611140120552358e-06, "loss": 0.0053, "step": 8601 }, { "epoch": 3.913557779799818, "grad_norm": 0.3209685522827909, "learning_rate": 5.606629477508543e-06, "loss": 0.0129, "step": 8602 }, { "epoch": 3.9140127388535033, "grad_norm": 0.33069518100033307, "learning_rate": 5.602120419203391e-06, "loss": 0.0263, "step": 8603 }, { "epoch": 3.914467697907188, "grad_norm": 0.5657796543470475, "learning_rate": 5.597612946005349e-06, "loss": 0.0235, "step": 8604 }, { "epoch": 3.9149226569608735, "grad_norm": 0.21074999249686582, "learning_rate": 5.593107058282765e-06, "loss": 0.0057, "step": 8605 }, { "epoch": 3.915377616014559, "grad_norm": 0.3427466210853063, "learning_rate": 5.588602756403821e-06, "loss": 0.0079, "step": 8606 }, { "epoch": 3.9158325750682437, "grad_norm": 0.2586511493301299, "learning_rate": 5.584100040736609e-06, "loss": 0.009, "step": 8607 }, { "epoch": 3.916287534121929, "grad_norm": 0.20933037398125745, "learning_rate": 5.579598911649059e-06, "loss": 0.0095, "step": 8608 }, { "epoch": 3.9167424931756143, "grad_norm": 0.42973093896230646, "learning_rate": 5.5750993695089846e-06, "loss": 0.0196, "step": 8609 }, { "epoch": 3.917197452229299, "grad_norm": 0.25922354307243195, "learning_rate": 5.570601414684062e-06, "loss": 0.0077, "step": 8610 }, { "epoch": 3.9176524112829845, "grad_norm": 0.28045044187901447, "learning_rate": 5.566105047541847e-06, "loss": 0.0062, "step": 8611 }, { "epoch": 3.91810737033667, "grad_norm": 0.32967880074076733, "learning_rate": 5.561610268449774e-06, "loss": 0.0124, "step": 8612 }, { "epoch": 3.9185623293903546, "grad_norm": 0.23800127759008535, "learning_rate": 5.5571170777751255e-06, "loss": 0.008, "step": 8613 }, { "epoch": 3.91901728844404, "grad_norm": 0.276945119725163, "learning_rate": 5.552625475885065e-06, "loss": 0.0109, "step": 8614 }, { "epoch": 3.9194722474977253, "grad_norm": 0.21904620882690695, "learning_rate": 5.548135463146623e-06, "loss": 0.0066, "step": 8615 }, { "epoch": 3.91992720655141, "grad_norm": 0.3021372752363788, "learning_rate": 5.543647039926711e-06, "loss": 0.0149, "step": 8616 }, { "epoch": 3.9203821656050954, "grad_norm": 0.4444956845010533, "learning_rate": 5.539160206592101e-06, "loss": 0.0154, "step": 8617 }, { "epoch": 3.9208371246587808, "grad_norm": 0.273465984883891, "learning_rate": 5.534674963509429e-06, "loss": 0.018, "step": 8618 }, { "epoch": 3.9212920837124656, "grad_norm": 0.26924351123728474, "learning_rate": 5.530191311045218e-06, "loss": 0.0073, "step": 8619 }, { "epoch": 3.921747042766151, "grad_norm": 0.2621374754007488, "learning_rate": 5.525709249565841e-06, "loss": 0.0064, "step": 8620 }, { "epoch": 3.9222020018198362, "grad_norm": 0.3052735127986808, "learning_rate": 5.521228779437568e-06, "loss": 0.0218, "step": 8621 }, { "epoch": 3.922656960873521, "grad_norm": 0.24747201604764824, "learning_rate": 5.5167499010265135e-06, "loss": 0.0176, "step": 8622 }, { "epoch": 3.9231119199272064, "grad_norm": 0.3466231617830664, "learning_rate": 5.5122726146986714e-06, "loss": 0.0198, "step": 8623 }, { "epoch": 3.9235668789808917, "grad_norm": 0.22805928231796596, "learning_rate": 5.507796920819905e-06, "loss": 0.0153, "step": 8624 }, { "epoch": 3.924021838034577, "grad_norm": 0.4125785093206727, "learning_rate": 5.503322819755941e-06, "loss": 0.0229, "step": 8625 }, { "epoch": 3.924476797088262, "grad_norm": 0.3016032848057149, "learning_rate": 5.498850311872391e-06, "loss": 0.0289, "step": 8626 }, { "epoch": 3.9249317561419472, "grad_norm": 0.3291996683112376, "learning_rate": 5.494379397534733e-06, "loss": 0.0092, "step": 8627 }, { "epoch": 3.9253867151956325, "grad_norm": 0.2150787267623478, "learning_rate": 5.489910077108304e-06, "loss": 0.0119, "step": 8628 }, { "epoch": 3.9258416742493174, "grad_norm": 0.29481066275940365, "learning_rate": 5.485442350958317e-06, "loss": 0.0086, "step": 8629 }, { "epoch": 3.9262966333030027, "grad_norm": 0.3591260174547051, "learning_rate": 5.480976219449849e-06, "loss": 0.0075, "step": 8630 }, { "epoch": 3.926751592356688, "grad_norm": 0.49382614884966614, "learning_rate": 5.47651168294786e-06, "loss": 0.0164, "step": 8631 }, { "epoch": 3.9272065514103733, "grad_norm": 0.3866707359424419, "learning_rate": 5.472048741817165e-06, "loss": 0.0234, "step": 8632 }, { "epoch": 3.927661510464058, "grad_norm": 0.2506979338925986, "learning_rate": 5.467587396422466e-06, "loss": 0.0161, "step": 8633 }, { "epoch": 3.9281164695177435, "grad_norm": 0.29428100251556394, "learning_rate": 5.463127647128319e-06, "loss": 0.0116, "step": 8634 }, { "epoch": 3.928571428571429, "grad_norm": 0.2126654597440702, "learning_rate": 5.458669494299143e-06, "loss": 0.0067, "step": 8635 }, { "epoch": 3.9290263876251137, "grad_norm": 0.45157758713605906, "learning_rate": 5.454212938299255e-06, "loss": 0.0207, "step": 8636 }, { "epoch": 3.929481346678799, "grad_norm": 0.421902722295526, "learning_rate": 5.44975797949282e-06, "loss": 0.0191, "step": 8637 }, { "epoch": 3.9299363057324843, "grad_norm": 0.4221670983429178, "learning_rate": 5.445304618243874e-06, "loss": 0.0458, "step": 8638 }, { "epoch": 3.930391264786169, "grad_norm": 0.27371455171352266, "learning_rate": 5.44085285491632e-06, "loss": 0.0106, "step": 8639 }, { "epoch": 3.9308462238398545, "grad_norm": 0.2653656799138447, "learning_rate": 5.43640268987394e-06, "loss": 0.0081, "step": 8640 }, { "epoch": 3.93130118289354, "grad_norm": 0.35474635197960036, "learning_rate": 5.431954123480393e-06, "loss": 0.015, "step": 8641 }, { "epoch": 3.9317561419472247, "grad_norm": 0.34008204403176745, "learning_rate": 5.427507156099184e-06, "loss": 0.0181, "step": 8642 }, { "epoch": 3.93221110100091, "grad_norm": 0.3271904880820656, "learning_rate": 5.423061788093706e-06, "loss": 0.0188, "step": 8643 }, { "epoch": 3.9326660600545953, "grad_norm": 0.2613228649675782, "learning_rate": 5.418618019827198e-06, "loss": 0.0087, "step": 8644 }, { "epoch": 3.93312101910828, "grad_norm": 0.17552408113401605, "learning_rate": 5.414175851662806e-06, "loss": 0.0065, "step": 8645 }, { "epoch": 3.9335759781619655, "grad_norm": 0.37709179887521843, "learning_rate": 5.40973528396351e-06, "loss": 0.0254, "step": 8646 }, { "epoch": 3.934030937215651, "grad_norm": 0.1503892785741061, "learning_rate": 5.4052963170921816e-06, "loss": 0.0033, "step": 8647 }, { "epoch": 3.9344858962693356, "grad_norm": 0.30162342892918836, "learning_rate": 5.40085895141155e-06, "loss": 0.0144, "step": 8648 }, { "epoch": 3.934940855323021, "grad_norm": 0.4836616621307628, "learning_rate": 5.396423187284208e-06, "loss": 0.008, "step": 8649 }, { "epoch": 3.9353958143767063, "grad_norm": 0.2404705892199787, "learning_rate": 5.391989025072644e-06, "loss": 0.0079, "step": 8650 }, { "epoch": 3.935850773430391, "grad_norm": 0.4471614237219351, "learning_rate": 5.387556465139184e-06, "loss": 0.0196, "step": 8651 }, { "epoch": 3.9363057324840764, "grad_norm": 0.3473416655824439, "learning_rate": 5.383125507846043e-06, "loss": 0.025, "step": 8652 }, { "epoch": 3.9367606915377618, "grad_norm": 0.24065649023410002, "learning_rate": 5.378696153555291e-06, "loss": 0.0097, "step": 8653 }, { "epoch": 3.9372156505914466, "grad_norm": 0.35223774326622115, "learning_rate": 5.374268402628876e-06, "loss": 0.0116, "step": 8654 }, { "epoch": 3.937670609645132, "grad_norm": 0.2621044597236205, "learning_rate": 5.369842255428628e-06, "loss": 0.0117, "step": 8655 }, { "epoch": 3.9381255686988172, "grad_norm": 0.3468880190836337, "learning_rate": 5.365417712316223e-06, "loss": 0.0084, "step": 8656 }, { "epoch": 3.938580527752502, "grad_norm": 0.13208759096859762, "learning_rate": 5.360994773653211e-06, "loss": 0.0038, "step": 8657 }, { "epoch": 3.9390354868061874, "grad_norm": 0.3301791340916573, "learning_rate": 5.356573439801018e-06, "loss": 0.014, "step": 8658 }, { "epoch": 3.9394904458598727, "grad_norm": 0.21757708602503392, "learning_rate": 5.352153711120927e-06, "loss": 0.0078, "step": 8659 }, { "epoch": 3.9399454049135576, "grad_norm": 0.28504293001820624, "learning_rate": 5.347735587974107e-06, "loss": 0.0117, "step": 8660 }, { "epoch": 3.940400363967243, "grad_norm": 0.2730257849776017, "learning_rate": 5.343319070721592e-06, "loss": 0.0317, "step": 8661 }, { "epoch": 3.9408553230209282, "grad_norm": 0.26158239445854886, "learning_rate": 5.338904159724275e-06, "loss": 0.013, "step": 8662 }, { "epoch": 3.941310282074613, "grad_norm": 0.354200521114593, "learning_rate": 5.334490855342922e-06, "loss": 0.0234, "step": 8663 }, { "epoch": 3.9417652411282984, "grad_norm": 0.17027362286956663, "learning_rate": 5.330079157938159e-06, "loss": 0.0038, "step": 8664 }, { "epoch": 3.9422202001819837, "grad_norm": 0.2814505902694766, "learning_rate": 5.325669067870504e-06, "loss": 0.0085, "step": 8665 }, { "epoch": 3.9426751592356686, "grad_norm": 0.268599458179651, "learning_rate": 5.321260585500326e-06, "loss": 0.0079, "step": 8666 }, { "epoch": 3.943130118289354, "grad_norm": 0.3074844420126451, "learning_rate": 5.316853711187858e-06, "loss": 0.0104, "step": 8667 }, { "epoch": 3.943585077343039, "grad_norm": 0.3857247997990402, "learning_rate": 5.312448445293225e-06, "loss": 0.0164, "step": 8668 }, { "epoch": 3.944040036396724, "grad_norm": 0.34469060664854556, "learning_rate": 5.308044788176386e-06, "loss": 0.0082, "step": 8669 }, { "epoch": 3.9444949954504094, "grad_norm": 0.19218552450692059, "learning_rate": 5.3036427401972096e-06, "loss": 0.0068, "step": 8670 }, { "epoch": 3.9449499545040947, "grad_norm": 0.35517840186245525, "learning_rate": 5.299242301715399e-06, "loss": 0.0224, "step": 8671 }, { "epoch": 3.9454049135577796, "grad_norm": 0.3129884601337448, "learning_rate": 5.294843473090539e-06, "loss": 0.0112, "step": 8672 }, { "epoch": 3.945859872611465, "grad_norm": 0.2471677510153382, "learning_rate": 5.290446254682075e-06, "loss": 0.0135, "step": 8673 }, { "epoch": 3.94631483166515, "grad_norm": 0.22853570093931014, "learning_rate": 5.286050646849336e-06, "loss": 0.0122, "step": 8674 }, { "epoch": 3.946769790718835, "grad_norm": 0.2194844589550376, "learning_rate": 5.281656649951519e-06, "loss": 0.0077, "step": 8675 }, { "epoch": 3.9472247497725204, "grad_norm": 0.2320063110821586, "learning_rate": 5.2772642643476725e-06, "loss": 0.0065, "step": 8676 }, { "epoch": 3.9476797088262057, "grad_norm": 0.30031079841732794, "learning_rate": 5.272873490396723e-06, "loss": 0.0111, "step": 8677 }, { "epoch": 3.9481346678798905, "grad_norm": 0.2149837867479485, "learning_rate": 5.268484328457457e-06, "loss": 0.01, "step": 8678 }, { "epoch": 3.948589626933576, "grad_norm": 0.5154496345000517, "learning_rate": 5.2640967788885554e-06, "loss": 0.0243, "step": 8679 }, { "epoch": 3.949044585987261, "grad_norm": 0.327050150724537, "learning_rate": 5.259710842048535e-06, "loss": 0.0188, "step": 8680 }, { "epoch": 3.9494995450409465, "grad_norm": 0.2845923411320096, "learning_rate": 5.255326518295792e-06, "loss": 0.014, "step": 8681 }, { "epoch": 3.9499545040946313, "grad_norm": 0.08078096677895169, "learning_rate": 5.250943807988606e-06, "loss": 0.0017, "step": 8682 }, { "epoch": 3.9504094631483166, "grad_norm": 0.3431414631935216, "learning_rate": 5.246562711485101e-06, "loss": 0.0146, "step": 8683 }, { "epoch": 3.950864422202002, "grad_norm": 0.2954275564692978, "learning_rate": 5.242183229143294e-06, "loss": 0.0149, "step": 8684 }, { "epoch": 3.951319381255687, "grad_norm": 0.3201817383752256, "learning_rate": 5.237805361321044e-06, "loss": 0.0303, "step": 8685 }, { "epoch": 3.951774340309372, "grad_norm": 0.3227062114282842, "learning_rate": 5.233429108376098e-06, "loss": 0.0199, "step": 8686 }, { "epoch": 3.9522292993630574, "grad_norm": 0.2710371942507668, "learning_rate": 5.229054470666051e-06, "loss": 0.0123, "step": 8687 }, { "epoch": 3.9526842584167428, "grad_norm": 0.31189633321560883, "learning_rate": 5.224681448548388e-06, "loss": 0.0154, "step": 8688 }, { "epoch": 3.9531392174704276, "grad_norm": 0.2710236791170992, "learning_rate": 5.2203100423804605e-06, "loss": 0.0259, "step": 8689 }, { "epoch": 3.953594176524113, "grad_norm": 0.18147563215013932, "learning_rate": 5.215940252519472e-06, "loss": 0.0062, "step": 8690 }, { "epoch": 3.9540491355777982, "grad_norm": 0.3199009428712182, "learning_rate": 5.2115720793225e-06, "loss": 0.017, "step": 8691 }, { "epoch": 3.954504094631483, "grad_norm": 0.483218233887957, "learning_rate": 5.207205523146497e-06, "loss": 0.0214, "step": 8692 }, { "epoch": 3.9549590536851684, "grad_norm": 0.09956032245664645, "learning_rate": 5.202840584348265e-06, "loss": 0.0023, "step": 8693 }, { "epoch": 3.9554140127388537, "grad_norm": 0.35918205858259133, "learning_rate": 5.198477263284507e-06, "loss": 0.0088, "step": 8694 }, { "epoch": 3.9558689717925386, "grad_norm": 0.26681012124168263, "learning_rate": 5.194115560311754e-06, "loss": 0.0104, "step": 8695 }, { "epoch": 3.956323930846224, "grad_norm": 0.42445163102957983, "learning_rate": 5.189755475786445e-06, "loss": 0.0274, "step": 8696 }, { "epoch": 3.9567788898999092, "grad_norm": 0.40921773229105735, "learning_rate": 5.185397010064855e-06, "loss": 0.0122, "step": 8697 }, { "epoch": 3.957233848953594, "grad_norm": 0.26013540145321096, "learning_rate": 5.181040163503132e-06, "loss": 0.0071, "step": 8698 }, { "epoch": 3.9576888080072794, "grad_norm": 0.40532646824677887, "learning_rate": 5.1766849364573126e-06, "loss": 0.0229, "step": 8699 }, { "epoch": 3.9581437670609647, "grad_norm": 0.23241699491901535, "learning_rate": 5.172331329283281e-06, "loss": 0.0098, "step": 8700 }, { "epoch": 3.9585987261146496, "grad_norm": 0.2920075820265827, "learning_rate": 5.167979342336787e-06, "loss": 0.016, "step": 8701 }, { "epoch": 3.959053685168335, "grad_norm": 0.22731289650514092, "learning_rate": 5.163628975973458e-06, "loss": 0.0038, "step": 8702 }, { "epoch": 3.95950864422202, "grad_norm": 0.30280566588634816, "learning_rate": 5.159280230548788e-06, "loss": 0.0168, "step": 8703 }, { "epoch": 3.959963603275705, "grad_norm": 0.1905687913042166, "learning_rate": 5.154933106418145e-06, "loss": 0.014, "step": 8704 }, { "epoch": 3.9604185623293904, "grad_norm": 0.06401312089504042, "learning_rate": 5.150587603936746e-06, "loss": 0.0017, "step": 8705 }, { "epoch": 3.9608735213830757, "grad_norm": 0.14844960647993977, "learning_rate": 5.146243723459693e-06, "loss": 0.0042, "step": 8706 }, { "epoch": 3.9613284804367606, "grad_norm": 0.3826159445670554, "learning_rate": 5.141901465341933e-06, "loss": 0.0209, "step": 8707 }, { "epoch": 3.961783439490446, "grad_norm": 0.3694635354924652, "learning_rate": 5.1375608299383175e-06, "loss": 0.0163, "step": 8708 }, { "epoch": 3.962238398544131, "grad_norm": 0.45410199022381087, "learning_rate": 5.1332218176035255e-06, "loss": 0.0335, "step": 8709 }, { "epoch": 3.962693357597816, "grad_norm": 0.3410423091896633, "learning_rate": 5.128884428692135e-06, "loss": 0.0098, "step": 8710 }, { "epoch": 3.9631483166515014, "grad_norm": 0.31013773564581554, "learning_rate": 5.124548663558571e-06, "loss": 0.0193, "step": 8711 }, { "epoch": 3.9636032757051867, "grad_norm": 0.29406104249335263, "learning_rate": 5.120214522557129e-06, "loss": 0.0072, "step": 8712 }, { "epoch": 3.9640582347588715, "grad_norm": 0.3264163710541041, "learning_rate": 5.115882006041983e-06, "loss": 0.012, "step": 8713 }, { "epoch": 3.964513193812557, "grad_norm": 0.23444717547100158, "learning_rate": 5.1115511143671666e-06, "loss": 0.0105, "step": 8714 }, { "epoch": 3.964968152866242, "grad_norm": 0.20915412028268648, "learning_rate": 5.107221847886576e-06, "loss": 0.0083, "step": 8715 }, { "epoch": 3.965423111919927, "grad_norm": 0.23842385197051916, "learning_rate": 5.102894206953976e-06, "loss": 0.0123, "step": 8716 }, { "epoch": 3.9658780709736123, "grad_norm": 0.32946168221087646, "learning_rate": 5.098568191923006e-06, "loss": 0.0056, "step": 8717 }, { "epoch": 3.9663330300272976, "grad_norm": 0.6624205022100376, "learning_rate": 5.094243803147175e-06, "loss": 0.0205, "step": 8718 }, { "epoch": 3.9667879890809825, "grad_norm": 0.2618007687815463, "learning_rate": 5.089921040979848e-06, "loss": 0.0088, "step": 8719 }, { "epoch": 3.967242948134668, "grad_norm": 0.28273012967430555, "learning_rate": 5.08559990577426e-06, "loss": 0.0116, "step": 8720 }, { "epoch": 3.967697907188353, "grad_norm": 0.36039060431489345, "learning_rate": 5.081280397883509e-06, "loss": 0.0116, "step": 8721 }, { "epoch": 3.968152866242038, "grad_norm": 0.3859244747825221, "learning_rate": 5.076962517660577e-06, "loss": 0.0175, "step": 8722 }, { "epoch": 3.9686078252957233, "grad_norm": 0.31315313762391084, "learning_rate": 5.072646265458292e-06, "loss": 0.0084, "step": 8723 }, { "epoch": 3.9690627843494086, "grad_norm": 0.2237367468107488, "learning_rate": 5.068331641629368e-06, "loss": 0.0123, "step": 8724 }, { "epoch": 3.9695177434030935, "grad_norm": 0.33124823865224146, "learning_rate": 5.064018646526372e-06, "loss": 0.015, "step": 8725 }, { "epoch": 3.969972702456779, "grad_norm": 0.22946495571684628, "learning_rate": 5.059707280501735e-06, "loss": 0.0115, "step": 8726 }, { "epoch": 3.970427661510464, "grad_norm": 0.2738871232600508, "learning_rate": 5.055397543907778e-06, "loss": 0.0061, "step": 8727 }, { "epoch": 3.970882620564149, "grad_norm": 0.47898602720840494, "learning_rate": 5.051089437096662e-06, "loss": 0.0188, "step": 8728 }, { "epoch": 3.9713375796178343, "grad_norm": 0.39577998899641254, "learning_rate": 5.046782960420432e-06, "loss": 0.0135, "step": 8729 }, { "epoch": 3.9717925386715196, "grad_norm": 0.3656358983862938, "learning_rate": 5.04247811423098e-06, "loss": 0.0226, "step": 8730 }, { "epoch": 3.9722474977252045, "grad_norm": 0.21406872594105775, "learning_rate": 5.038174898880099e-06, "loss": 0.0076, "step": 8731 }, { "epoch": 3.97270245677889, "grad_norm": 0.38584244915798194, "learning_rate": 5.0338733147194085e-06, "loss": 0.0118, "step": 8732 }, { "epoch": 3.973157415832575, "grad_norm": 0.3663051705371014, "learning_rate": 5.029573362100434e-06, "loss": 0.0127, "step": 8733 }, { "epoch": 3.9736123748862604, "grad_norm": 0.262214222993254, "learning_rate": 5.025275041374539e-06, "loss": 0.0119, "step": 8734 }, { "epoch": 3.9740673339399453, "grad_norm": 0.2933988961452443, "learning_rate": 5.020978352892961e-06, "loss": 0.0107, "step": 8735 }, { "epoch": 3.9745222929936306, "grad_norm": 0.2581041970703221, "learning_rate": 5.016683297006802e-06, "loss": 0.0162, "step": 8736 }, { "epoch": 3.974977252047316, "grad_norm": 0.547813772176782, "learning_rate": 5.012389874067039e-06, "loss": 0.0309, "step": 8737 }, { "epoch": 3.9754322111010008, "grad_norm": 0.31603536914078173, "learning_rate": 5.00809808442452e-06, "loss": 0.0163, "step": 8738 }, { "epoch": 3.975887170154686, "grad_norm": 0.5009958511735699, "learning_rate": 5.003807928429941e-06, "loss": 0.0108, "step": 8739 }, { "epoch": 3.9763421292083714, "grad_norm": 0.2554099781049232, "learning_rate": 4.999519406433878e-06, "loss": 0.011, "step": 8740 }, { "epoch": 3.9767970882620567, "grad_norm": 0.22461809327831284, "learning_rate": 4.99523251878676e-06, "loss": 0.006, "step": 8741 }, { "epoch": 3.9772520473157416, "grad_norm": 0.4256811839126272, "learning_rate": 4.9909472658389054e-06, "loss": 0.0136, "step": 8742 }, { "epoch": 3.977707006369427, "grad_norm": 0.33395161691581693, "learning_rate": 4.986663647940482e-06, "loss": 0.0125, "step": 8743 }, { "epoch": 3.978161965423112, "grad_norm": 0.21620890544333965, "learning_rate": 4.982381665441518e-06, "loss": 0.0049, "step": 8744 }, { "epoch": 3.978616924476797, "grad_norm": 0.2725091816147469, "learning_rate": 4.978101318691936e-06, "loss": 0.0242, "step": 8745 }, { "epoch": 3.9790718835304824, "grad_norm": 0.19037005088474454, "learning_rate": 4.973822608041484e-06, "loss": 0.006, "step": 8746 }, { "epoch": 3.9795268425841677, "grad_norm": 0.2369481892827748, "learning_rate": 4.96954553383982e-06, "loss": 0.0106, "step": 8747 }, { "epoch": 3.9799818016378525, "grad_norm": 0.2348988639800447, "learning_rate": 4.965270096436439e-06, "loss": 0.0065, "step": 8748 }, { "epoch": 3.980436760691538, "grad_norm": 0.2748459274234872, "learning_rate": 4.960996296180709e-06, "loss": 0.0101, "step": 8749 }, { "epoch": 3.980891719745223, "grad_norm": 0.3278241836584118, "learning_rate": 4.956724133421861e-06, "loss": 0.0198, "step": 8750 }, { "epoch": 3.981346678798908, "grad_norm": 0.5331649702486604, "learning_rate": 4.952453608509e-06, "loss": 0.0212, "step": 8751 }, { "epoch": 3.9818016378525933, "grad_norm": 0.2532210044334437, "learning_rate": 4.9481847217911045e-06, "loss": 0.0072, "step": 8752 }, { "epoch": 3.9822565969062786, "grad_norm": 0.29052056271189164, "learning_rate": 4.943917473616999e-06, "loss": 0.0192, "step": 8753 }, { "epoch": 3.9827115559599635, "grad_norm": 0.11772651025526051, "learning_rate": 4.939651864335385e-06, "loss": 0.0025, "step": 8754 }, { "epoch": 3.983166515013649, "grad_norm": 0.2627155008751083, "learning_rate": 4.935387894294824e-06, "loss": 0.0211, "step": 8755 }, { "epoch": 3.983621474067334, "grad_norm": 0.37585522773120883, "learning_rate": 4.931125563843758e-06, "loss": 0.0196, "step": 8756 }, { "epoch": 3.984076433121019, "grad_norm": 0.18226675616625954, "learning_rate": 4.926864873330484e-06, "loss": 0.0093, "step": 8757 }, { "epoch": 3.9845313921747043, "grad_norm": 0.2804856823680643, "learning_rate": 4.922605823103152e-06, "loss": 0.0089, "step": 8758 }, { "epoch": 3.9849863512283896, "grad_norm": 0.40973484049229225, "learning_rate": 4.918348413509813e-06, "loss": 0.008, "step": 8759 }, { "epoch": 3.9854413102820745, "grad_norm": 0.29365851124293324, "learning_rate": 4.914092644898346e-06, "loss": 0.0085, "step": 8760 }, { "epoch": 3.98589626933576, "grad_norm": 0.33173035592716077, "learning_rate": 4.909838517616527e-06, "loss": 0.0124, "step": 8761 }, { "epoch": 3.986351228389445, "grad_norm": 0.15723298976178557, "learning_rate": 4.90558603201198e-06, "loss": 0.0039, "step": 8762 }, { "epoch": 3.98680618744313, "grad_norm": 0.36102186651536333, "learning_rate": 4.901335188432194e-06, "loss": 0.0153, "step": 8763 }, { "epoch": 3.9872611464968153, "grad_norm": 0.18321253545360514, "learning_rate": 4.897085987224534e-06, "loss": 0.0074, "step": 8764 }, { "epoch": 3.9877161055505006, "grad_norm": 0.21292645119582815, "learning_rate": 4.89283842873621e-06, "loss": 0.0052, "step": 8765 }, { "epoch": 3.9881710646041855, "grad_norm": 0.28152101347609454, "learning_rate": 4.8885925133143385e-06, "loss": 0.0091, "step": 8766 }, { "epoch": 3.988626023657871, "grad_norm": 0.3351503399948722, "learning_rate": 4.8843482413058635e-06, "loss": 0.0126, "step": 8767 }, { "epoch": 3.989080982711556, "grad_norm": 0.2362555402097503, "learning_rate": 4.880105613057612e-06, "loss": 0.0101, "step": 8768 }, { "epoch": 3.989535941765241, "grad_norm": 0.399696194813735, "learning_rate": 4.875864628916265e-06, "loss": 0.0187, "step": 8769 }, { "epoch": 3.9899909008189263, "grad_norm": 0.2210809351893775, "learning_rate": 4.871625289228377e-06, "loss": 0.0025, "step": 8770 }, { "epoch": 3.9904458598726116, "grad_norm": 0.3874055886393434, "learning_rate": 4.867387594340378e-06, "loss": 0.0167, "step": 8771 }, { "epoch": 3.9909008189262964, "grad_norm": 0.36225860676070304, "learning_rate": 4.8631515445985404e-06, "loss": 0.0203, "step": 8772 }, { "epoch": 3.9913557779799818, "grad_norm": 0.12206578813525332, "learning_rate": 4.858917140349026e-06, "loss": 0.0112, "step": 8773 }, { "epoch": 3.991810737033667, "grad_norm": 0.3481942737272652, "learning_rate": 4.854684381937846e-06, "loss": 0.0142, "step": 8774 }, { "epoch": 3.992265696087352, "grad_norm": 0.3865886425147189, "learning_rate": 4.850453269710878e-06, "loss": 0.0134, "step": 8775 }, { "epoch": 3.9927206551410372, "grad_norm": 0.35445825630474814, "learning_rate": 4.846223804013883e-06, "loss": 0.0113, "step": 8776 }, { "epoch": 3.9931756141947226, "grad_norm": 0.28239990713154395, "learning_rate": 4.8419959851924625e-06, "loss": 0.0112, "step": 8777 }, { "epoch": 3.9936305732484074, "grad_norm": 0.20597364551269545, "learning_rate": 4.8377698135920965e-06, "loss": 0.0114, "step": 8778 }, { "epoch": 3.9940855323020927, "grad_norm": 0.28431053068726514, "learning_rate": 4.833545289558125e-06, "loss": 0.0197, "step": 8779 }, { "epoch": 3.994540491355778, "grad_norm": 0.3245182071871113, "learning_rate": 4.829322413435761e-06, "loss": 0.0103, "step": 8780 }, { "epoch": 3.994995450409463, "grad_norm": 0.3192855747581905, "learning_rate": 4.8251011855700855e-06, "loss": 0.0121, "step": 8781 }, { "epoch": 3.9954504094631482, "grad_norm": 0.4110362005289237, "learning_rate": 4.8208816063060324e-06, "loss": 0.0104, "step": 8782 }, { "epoch": 3.9959053685168335, "grad_norm": 0.10991303048353536, "learning_rate": 4.816663675988406e-06, "loss": 0.0037, "step": 8783 }, { "epoch": 3.9963603275705184, "grad_norm": 0.28147897897865704, "learning_rate": 4.812447394961869e-06, "loss": 0.0102, "step": 8784 }, { "epoch": 3.9968152866242037, "grad_norm": 0.2557813662900227, "learning_rate": 4.808232763570972e-06, "loss": 0.0079, "step": 8785 }, { "epoch": 3.997270245677889, "grad_norm": 0.25964611223730294, "learning_rate": 4.8040197821601045e-06, "loss": 0.0112, "step": 8786 }, { "epoch": 3.997725204731574, "grad_norm": 0.37955183003930215, "learning_rate": 4.79980845107354e-06, "loss": 0.0149, "step": 8787 }, { "epoch": 3.998180163785259, "grad_norm": 0.3604041716522406, "learning_rate": 4.795598770655408e-06, "loss": 0.0231, "step": 8788 }, { "epoch": 3.9986351228389445, "grad_norm": 0.23619303518611404, "learning_rate": 4.791390741249691e-06, "loss": 0.0079, "step": 8789 }, { "epoch": 3.99909008189263, "grad_norm": 0.4096682955879523, "learning_rate": 4.787184363200273e-06, "loss": 0.0169, "step": 8790 }, { "epoch": 3.9995450409463147, "grad_norm": 0.3947014448987931, "learning_rate": 4.782979636850865e-06, "loss": 0.0112, "step": 8791 }, { "epoch": 4.0, "grad_norm": 0.25370985546020647, "learning_rate": 4.778776562545062e-06, "loss": 0.0084, "step": 8792 }, { "epoch": 4.000454959053685, "grad_norm": 1.2565376275659896, "learning_rate": 4.7745751406263165e-06, "loss": 0.0113, "step": 8793 }, { "epoch": 4.000909918107371, "grad_norm": 0.14969893914369456, "learning_rate": 4.770375371437951e-06, "loss": 0.0014, "step": 8794 }, { "epoch": 4.0013648771610555, "grad_norm": 0.06916074979417189, "learning_rate": 4.766177255323162e-06, "loss": 0.0012, "step": 8795 }, { "epoch": 4.00181983621474, "grad_norm": 0.15483948403423475, "learning_rate": 4.76198079262499e-06, "loss": 0.0076, "step": 8796 }, { "epoch": 4.002274795268426, "grad_norm": 0.09611805651273075, "learning_rate": 4.757785983686355e-06, "loss": 0.0013, "step": 8797 }, { "epoch": 4.002729754322111, "grad_norm": 0.12869692976728214, "learning_rate": 4.753592828850032e-06, "loss": 0.0035, "step": 8798 }, { "epoch": 4.003184713375796, "grad_norm": 0.015575294427033617, "learning_rate": 4.749401328458675e-06, "loss": 0.0004, "step": 8799 }, { "epoch": 4.003639672429482, "grad_norm": 0.11439174035629932, "learning_rate": 4.745211482854783e-06, "loss": 0.003, "step": 8800 }, { "epoch": 4.0040946314831665, "grad_norm": 0.13200346004463334, "learning_rate": 4.7410232923807475e-06, "loss": 0.0053, "step": 8801 }, { "epoch": 4.004549590536851, "grad_norm": 0.15876169054241482, "learning_rate": 4.736836757378799e-06, "loss": 0.0049, "step": 8802 }, { "epoch": 4.005004549590537, "grad_norm": 0.09215895446875283, "learning_rate": 4.732651878191044e-06, "loss": 0.0025, "step": 8803 }, { "epoch": 4.005459508644222, "grad_norm": 0.3842893105822144, "learning_rate": 4.728468655159443e-06, "loss": 0.0082, "step": 8804 }, { "epoch": 4.005914467697907, "grad_norm": 0.20746867732988672, "learning_rate": 4.724287088625845e-06, "loss": 0.0087, "step": 8805 }, { "epoch": 4.006369426751593, "grad_norm": 0.06791202865747527, "learning_rate": 4.720107178931943e-06, "loss": 0.0012, "step": 8806 }, { "epoch": 4.0068243858052774, "grad_norm": 0.19401495154189544, "learning_rate": 4.715928926419292e-06, "loss": 0.0044, "step": 8807 }, { "epoch": 4.007279344858962, "grad_norm": 0.14132278280712354, "learning_rate": 4.711752331429334e-06, "loss": 0.0038, "step": 8808 }, { "epoch": 4.007734303912648, "grad_norm": 0.15649824627216133, "learning_rate": 4.7075773943033475e-06, "loss": 0.0026, "step": 8809 }, { "epoch": 4.008189262966333, "grad_norm": 0.20829316736391645, "learning_rate": 4.703404115382501e-06, "loss": 0.0062, "step": 8810 }, { "epoch": 4.008644222020018, "grad_norm": 0.021655103882586946, "learning_rate": 4.699232495007816e-06, "loss": 0.0005, "step": 8811 }, { "epoch": 4.0090991810737036, "grad_norm": 0.11841418192492906, "learning_rate": 4.695062533520169e-06, "loss": 0.0062, "step": 8812 }, { "epoch": 4.009554140127388, "grad_norm": 0.058252038938096204, "learning_rate": 4.690894231260312e-06, "loss": 0.0014, "step": 8813 }, { "epoch": 4.010009099181073, "grad_norm": 0.10754331651541585, "learning_rate": 4.686727588568865e-06, "loss": 0.0031, "step": 8814 }, { "epoch": 4.010464058234759, "grad_norm": 0.11126500016301948, "learning_rate": 4.682562605786309e-06, "loss": 0.0039, "step": 8815 }, { "epoch": 4.010919017288444, "grad_norm": 0.0807335067774098, "learning_rate": 4.678399283252985e-06, "loss": 0.0025, "step": 8816 }, { "epoch": 4.011373976342129, "grad_norm": 0.19248968044088444, "learning_rate": 4.674237621309099e-06, "loss": 0.0028, "step": 8817 }, { "epoch": 4.0118289353958145, "grad_norm": 0.11376635593033657, "learning_rate": 4.6700776202947184e-06, "loss": 0.0022, "step": 8818 }, { "epoch": 4.012283894449499, "grad_norm": 0.05704169444666614, "learning_rate": 4.665919280549794e-06, "loss": 0.0008, "step": 8819 }, { "epoch": 4.012738853503185, "grad_norm": 0.20817229502601317, "learning_rate": 4.661762602414116e-06, "loss": 0.008, "step": 8820 }, { "epoch": 4.01319381255687, "grad_norm": 0.01847503344483732, "learning_rate": 4.657607586227345e-06, "loss": 0.0004, "step": 8821 }, { "epoch": 4.013648771610555, "grad_norm": 0.12086532606996418, "learning_rate": 4.6534542323290244e-06, "loss": 0.0036, "step": 8822 }, { "epoch": 4.014103730664241, "grad_norm": 0.08728589328579618, "learning_rate": 4.649302541058531e-06, "loss": 0.0043, "step": 8823 }, { "epoch": 4.0145586897179255, "grad_norm": 0.08630003581661048, "learning_rate": 4.645152512755141e-06, "loss": 0.0022, "step": 8824 }, { "epoch": 4.01501364877161, "grad_norm": 0.060021347249320574, "learning_rate": 4.641004147757963e-06, "loss": 0.0011, "step": 8825 }, { "epoch": 4.015468607825296, "grad_norm": 0.15275599985738086, "learning_rate": 4.636857446405988e-06, "loss": 0.0027, "step": 8826 }, { "epoch": 4.015923566878981, "grad_norm": 0.08338033900276674, "learning_rate": 4.632712409038054e-06, "loss": 0.0025, "step": 8827 }, { "epoch": 4.016378525932666, "grad_norm": 0.019573744139982942, "learning_rate": 4.628569035992886e-06, "loss": 0.0004, "step": 8828 }, { "epoch": 4.016833484986352, "grad_norm": 0.06321139607430466, "learning_rate": 4.624427327609066e-06, "loss": 0.0017, "step": 8829 }, { "epoch": 4.0172884440400365, "grad_norm": 0.032297723478943725, "learning_rate": 4.6202872842250275e-06, "loss": 0.0009, "step": 8830 }, { "epoch": 4.017743403093721, "grad_norm": 0.2146336232841078, "learning_rate": 4.616148906179082e-06, "loss": 0.0081, "step": 8831 }, { "epoch": 4.018198362147407, "grad_norm": 0.09104198737592605, "learning_rate": 4.612012193809387e-06, "loss": 0.0025, "step": 8832 }, { "epoch": 4.018653321201092, "grad_norm": 0.03154828705268825, "learning_rate": 4.60787714745399e-06, "loss": 0.001, "step": 8833 }, { "epoch": 4.019108280254777, "grad_norm": 0.18362139219765009, "learning_rate": 4.603743767450783e-06, "loss": 0.0063, "step": 8834 }, { "epoch": 4.019563239308463, "grad_norm": 0.13455231930970354, "learning_rate": 4.59961205413752e-06, "loss": 0.0044, "step": 8835 }, { "epoch": 4.0200181983621475, "grad_norm": 0.07334237922418727, "learning_rate": 4.59548200785184e-06, "loss": 0.0021, "step": 8836 }, { "epoch": 4.020473157415832, "grad_norm": 0.11567757534438292, "learning_rate": 4.591353628931222e-06, "loss": 0.0032, "step": 8837 }, { "epoch": 4.020928116469518, "grad_norm": 0.15269297208441301, "learning_rate": 4.587226917713017e-06, "loss": 0.0022, "step": 8838 }, { "epoch": 4.021383075523203, "grad_norm": 0.052859533977470397, "learning_rate": 4.583101874534448e-06, "loss": 0.001, "step": 8839 }, { "epoch": 4.021838034576888, "grad_norm": 0.0883236707611987, "learning_rate": 4.578978499732594e-06, "loss": 0.0021, "step": 8840 }, { "epoch": 4.022292993630574, "grad_norm": 0.06057531115528436, "learning_rate": 4.574856793644397e-06, "loss": 0.0014, "step": 8841 }, { "epoch": 4.022747952684258, "grad_norm": 0.111755462717363, "learning_rate": 4.570736756606659e-06, "loss": 0.0028, "step": 8842 }, { "epoch": 4.023202911737943, "grad_norm": 0.08625599927969245, "learning_rate": 4.5666183889560535e-06, "loss": 0.0012, "step": 8843 }, { "epoch": 4.023657870791629, "grad_norm": 0.11333929715989044, "learning_rate": 4.562501691029122e-06, "loss": 0.002, "step": 8844 }, { "epoch": 4.024112829845314, "grad_norm": 0.13311211122461936, "learning_rate": 4.558386663162259e-06, "loss": 0.0026, "step": 8845 }, { "epoch": 4.024567788898999, "grad_norm": 0.07921046618599575, "learning_rate": 4.554273305691725e-06, "loss": 0.001, "step": 8846 }, { "epoch": 4.0250227479526846, "grad_norm": 0.049186213432263676, "learning_rate": 4.550161618953636e-06, "loss": 0.0008, "step": 8847 }, { "epoch": 4.025477707006369, "grad_norm": 0.09156699940267513, "learning_rate": 4.546051603283996e-06, "loss": 0.0029, "step": 8848 }, { "epoch": 4.025932666060054, "grad_norm": 0.10849083038374703, "learning_rate": 4.541943259018644e-06, "loss": 0.0011, "step": 8849 }, { "epoch": 4.02638762511374, "grad_norm": 0.11218500278631974, "learning_rate": 4.537836586493307e-06, "loss": 0.0034, "step": 8850 }, { "epoch": 4.026842584167425, "grad_norm": 0.2476970326735346, "learning_rate": 4.533731586043557e-06, "loss": 0.0134, "step": 8851 }, { "epoch": 4.02729754322111, "grad_norm": 0.3341615513980685, "learning_rate": 4.529628258004831e-06, "loss": 0.0033, "step": 8852 }, { "epoch": 4.0277525022747955, "grad_norm": 0.15475545322269776, "learning_rate": 4.525526602712449e-06, "loss": 0.0025, "step": 8853 }, { "epoch": 4.02820746132848, "grad_norm": 0.1735877940734062, "learning_rate": 4.521426620501567e-06, "loss": 0.0041, "step": 8854 }, { "epoch": 4.028662420382165, "grad_norm": 0.019376346622633225, "learning_rate": 4.517328311707225e-06, "loss": 0.0004, "step": 8855 }, { "epoch": 4.029117379435851, "grad_norm": 0.26260901221157934, "learning_rate": 4.513231676664306e-06, "loss": 0.0053, "step": 8856 }, { "epoch": 4.029572338489536, "grad_norm": 0.05230183485722295, "learning_rate": 4.509136715707579e-06, "loss": 0.0008, "step": 8857 }, { "epoch": 4.030027297543221, "grad_norm": 0.13053417092112618, "learning_rate": 4.505043429171668e-06, "loss": 0.0021, "step": 8858 }, { "epoch": 4.0304822565969065, "grad_norm": 0.07456264163627548, "learning_rate": 4.500951817391055e-06, "loss": 0.0012, "step": 8859 }, { "epoch": 4.030937215650591, "grad_norm": 0.039074278506241435, "learning_rate": 4.4968618807000845e-06, "loss": 0.0008, "step": 8860 }, { "epoch": 4.031392174704276, "grad_norm": 0.021558935691424967, "learning_rate": 4.4927736194329665e-06, "loss": 0.0004, "step": 8861 }, { "epoch": 4.031847133757962, "grad_norm": 0.07276796847707996, "learning_rate": 4.488687033923783e-06, "loss": 0.0013, "step": 8862 }, { "epoch": 4.032302092811647, "grad_norm": 0.020234401599255356, "learning_rate": 4.4846021245064595e-06, "loss": 0.0004, "step": 8863 }, { "epoch": 4.032757051865332, "grad_norm": 0.06339693633036732, "learning_rate": 4.480518891514809e-06, "loss": 0.001, "step": 8864 }, { "epoch": 4.0332120109190175, "grad_norm": 0.13838781423860866, "learning_rate": 4.476437335282494e-06, "loss": 0.0008, "step": 8865 }, { "epoch": 4.033666969972702, "grad_norm": 0.03658747826598101, "learning_rate": 4.472357456143025e-06, "loss": 0.0018, "step": 8866 }, { "epoch": 4.034121929026387, "grad_norm": 0.18727243532915697, "learning_rate": 4.468279254429814e-06, "loss": 0.0017, "step": 8867 }, { "epoch": 4.034576888080073, "grad_norm": 0.12046751644474885, "learning_rate": 4.464202730476099e-06, "loss": 0.0013, "step": 8868 }, { "epoch": 4.035031847133758, "grad_norm": 0.09280911693886178, "learning_rate": 4.460127884614998e-06, "loss": 0.0019, "step": 8869 }, { "epoch": 4.035486806187443, "grad_norm": 0.1348228897558593, "learning_rate": 4.456054717179483e-06, "loss": 0.0015, "step": 8870 }, { "epoch": 4.0359417652411285, "grad_norm": 0.08564576502907387, "learning_rate": 4.451983228502402e-06, "loss": 0.0009, "step": 8871 }, { "epoch": 4.036396724294813, "grad_norm": 0.0928816495688832, "learning_rate": 4.447913418916464e-06, "loss": 0.0022, "step": 8872 }, { "epoch": 4.036851683348498, "grad_norm": 0.25804020082563345, "learning_rate": 4.443845288754226e-06, "loss": 0.0022, "step": 8873 }, { "epoch": 4.037306642402184, "grad_norm": 0.0654956048149071, "learning_rate": 4.439778838348121e-06, "loss": 0.0013, "step": 8874 }, { "epoch": 4.037761601455869, "grad_norm": 0.057670579446683745, "learning_rate": 4.435714068030442e-06, "loss": 0.0013, "step": 8875 }, { "epoch": 4.038216560509555, "grad_norm": 0.15275982271120692, "learning_rate": 4.431650978133337e-06, "loss": 0.0045, "step": 8876 }, { "epoch": 4.038671519563239, "grad_norm": 0.024839904123821758, "learning_rate": 4.427589568988824e-06, "loss": 0.0004, "step": 8877 }, { "epoch": 4.039126478616924, "grad_norm": 0.2450664339706814, "learning_rate": 4.423529840928797e-06, "loss": 0.0021, "step": 8878 }, { "epoch": 4.03958143767061, "grad_norm": 0.24661388600534406, "learning_rate": 4.419471794284988e-06, "loss": 0.0031, "step": 8879 }, { "epoch": 4.040036396724295, "grad_norm": 0.013073596390224923, "learning_rate": 4.415415429389e-06, "loss": 0.0002, "step": 8880 }, { "epoch": 4.04049135577798, "grad_norm": 0.05872460385490517, "learning_rate": 4.4113607465723016e-06, "loss": 0.0009, "step": 8881 }, { "epoch": 4.0409463148316656, "grad_norm": 0.18266475459575207, "learning_rate": 4.407307746166231e-06, "loss": 0.0033, "step": 8882 }, { "epoch": 4.04140127388535, "grad_norm": 0.07075863867932668, "learning_rate": 4.403256428501976e-06, "loss": 0.0017, "step": 8883 }, { "epoch": 4.041856232939035, "grad_norm": 0.2550102904139845, "learning_rate": 4.399206793910582e-06, "loss": 0.0041, "step": 8884 }, { "epoch": 4.042311191992721, "grad_norm": 0.1484474358654129, "learning_rate": 4.395158842722985e-06, "loss": 0.0022, "step": 8885 }, { "epoch": 4.042766151046406, "grad_norm": 0.1574754225440847, "learning_rate": 4.391112575269951e-06, "loss": 0.0072, "step": 8886 }, { "epoch": 4.043221110100091, "grad_norm": 0.16348577901177616, "learning_rate": 4.387067991882135e-06, "loss": 0.0018, "step": 8887 }, { "epoch": 4.0436760691537765, "grad_norm": 0.09903108741678088, "learning_rate": 4.383025092890034e-06, "loss": 0.0011, "step": 8888 }, { "epoch": 4.044131028207461, "grad_norm": 0.05500583295871154, "learning_rate": 4.378983878624019e-06, "loss": 0.0007, "step": 8889 }, { "epoch": 4.044585987261146, "grad_norm": 0.18867907710068463, "learning_rate": 4.3749443494143085e-06, "loss": 0.0022, "step": 8890 }, { "epoch": 4.045040946314832, "grad_norm": 0.04445729575165061, "learning_rate": 4.370906505591007e-06, "loss": 0.0008, "step": 8891 }, { "epoch": 4.045495905368517, "grad_norm": 0.02345085511447987, "learning_rate": 4.36687034748407e-06, "loss": 0.0004, "step": 8892 }, { "epoch": 4.045950864422202, "grad_norm": 0.42358130859638915, "learning_rate": 4.36283587542331e-06, "loss": 0.0172, "step": 8893 }, { "epoch": 4.0464058234758875, "grad_norm": 0.16135741360644054, "learning_rate": 4.358803089738405e-06, "loss": 0.0034, "step": 8894 }, { "epoch": 4.046860782529572, "grad_norm": 0.10719548747859946, "learning_rate": 4.354771990758894e-06, "loss": 0.0011, "step": 8895 }, { "epoch": 4.047315741583257, "grad_norm": 0.17446342744783003, "learning_rate": 4.350742578814185e-06, "loss": 0.0077, "step": 8896 }, { "epoch": 4.047770700636943, "grad_norm": 0.04955556106709087, "learning_rate": 4.346714854233544e-06, "loss": 0.0006, "step": 8897 }, { "epoch": 4.048225659690628, "grad_norm": 0.056009052081592334, "learning_rate": 4.342688817346089e-06, "loss": 0.0011, "step": 8898 }, { "epoch": 4.048680618744313, "grad_norm": 0.024345501890083495, "learning_rate": 4.3386644684808216e-06, "loss": 0.0004, "step": 8899 }, { "epoch": 4.0491355777979985, "grad_norm": 0.014598122340058774, "learning_rate": 4.33464180796658e-06, "loss": 0.0003, "step": 8900 }, { "epoch": 4.049590536851683, "grad_norm": 0.01923694897317042, "learning_rate": 4.330620836132096e-06, "loss": 0.0003, "step": 8901 }, { "epoch": 4.050045495905368, "grad_norm": 0.04179074961558796, "learning_rate": 4.326601553305934e-06, "loss": 0.0009, "step": 8902 }, { "epoch": 4.050500454959054, "grad_norm": 0.19943826444411872, "learning_rate": 4.322583959816531e-06, "loss": 0.0035, "step": 8903 }, { "epoch": 4.050955414012739, "grad_norm": 0.15642308163235874, "learning_rate": 4.318568055992183e-06, "loss": 0.0051, "step": 8904 }, { "epoch": 4.051410373066424, "grad_norm": 0.09180228866019377, "learning_rate": 4.314553842161057e-06, "loss": 0.0052, "step": 8905 }, { "epoch": 4.0518653321201095, "grad_norm": 0.11344785974147825, "learning_rate": 4.310541318651184e-06, "loss": 0.0034, "step": 8906 }, { "epoch": 4.052320291173794, "grad_norm": 0.05060618533946687, "learning_rate": 4.306530485790439e-06, "loss": 0.0006, "step": 8907 }, { "epoch": 4.052775250227479, "grad_norm": 0.02643355689051481, "learning_rate": 4.302521343906573e-06, "loss": 0.0003, "step": 8908 }, { "epoch": 4.053230209281165, "grad_norm": 0.1352972239453439, "learning_rate": 4.298513893327194e-06, "loss": 0.0009, "step": 8909 }, { "epoch": 4.05368516833485, "grad_norm": 0.28151729778343176, "learning_rate": 4.294508134379768e-06, "loss": 0.0016, "step": 8910 }, { "epoch": 4.054140127388535, "grad_norm": 0.21441407173292387, "learning_rate": 4.290504067391638e-06, "loss": 0.0032, "step": 8911 }, { "epoch": 4.05459508644222, "grad_norm": 0.08216122816213278, "learning_rate": 4.286501692689984e-06, "loss": 0.0016, "step": 8912 }, { "epoch": 4.055050045495905, "grad_norm": 0.17378635806701076, "learning_rate": 4.282501010601877e-06, "loss": 0.0034, "step": 8913 }, { "epoch": 4.05550500454959, "grad_norm": 0.09514283725052824, "learning_rate": 4.278502021454228e-06, "loss": 0.0035, "step": 8914 }, { "epoch": 4.055959963603276, "grad_norm": 0.08745804101438763, "learning_rate": 4.274504725573811e-06, "loss": 0.0015, "step": 8915 }, { "epoch": 4.056414922656961, "grad_norm": 0.02974247341733284, "learning_rate": 4.270509123287278e-06, "loss": 0.0005, "step": 8916 }, { "epoch": 4.056869881710646, "grad_norm": 0.15929921951438578, "learning_rate": 4.266515214921127e-06, "loss": 0.002, "step": 8917 }, { "epoch": 4.057324840764331, "grad_norm": 0.15227207523685107, "learning_rate": 4.262523000801718e-06, "loss": 0.001, "step": 8918 }, { "epoch": 4.057779799818016, "grad_norm": 0.05820771539229512, "learning_rate": 4.258532481255276e-06, "loss": 0.0012, "step": 8919 }, { "epoch": 4.058234758871701, "grad_norm": 0.1611708210919653, "learning_rate": 4.254543656607893e-06, "loss": 0.0025, "step": 8920 }, { "epoch": 4.058689717925387, "grad_norm": 0.0705350259566975, "learning_rate": 4.2505565271855205e-06, "loss": 0.0002, "step": 8921 }, { "epoch": 4.059144676979072, "grad_norm": 0.08383172185081089, "learning_rate": 4.2465710933139665e-06, "loss": 0.0021, "step": 8922 }, { "epoch": 4.059599636032757, "grad_norm": 0.15754253473499627, "learning_rate": 4.2425873553189e-06, "loss": 0.0033, "step": 8923 }, { "epoch": 4.060054595086442, "grad_norm": 0.06288112288919552, "learning_rate": 4.23860531352585e-06, "loss": 0.0013, "step": 8924 }, { "epoch": 4.060509554140127, "grad_norm": 0.07408320767564067, "learning_rate": 4.234624968260223e-06, "loss": 0.0021, "step": 8925 }, { "epoch": 4.060964513193812, "grad_norm": 0.24760226563478727, "learning_rate": 4.230646319847259e-06, "loss": 0.0029, "step": 8926 }, { "epoch": 4.061419472247498, "grad_norm": 0.05077061443386522, "learning_rate": 4.2266693686120935e-06, "loss": 0.001, "step": 8927 }, { "epoch": 4.061874431301183, "grad_norm": 0.16124362072551787, "learning_rate": 4.2226941148796935e-06, "loss": 0.0046, "step": 8928 }, { "epoch": 4.0623293903548685, "grad_norm": 0.06781310979570784, "learning_rate": 4.218720558974895e-06, "loss": 0.001, "step": 8929 }, { "epoch": 4.062784349408553, "grad_norm": 0.07822328787989337, "learning_rate": 4.214748701222412e-06, "loss": 0.0014, "step": 8930 }, { "epoch": 4.063239308462238, "grad_norm": 0.6791743696869358, "learning_rate": 4.2107785419468e-06, "loss": 0.0117, "step": 8931 }, { "epoch": 4.063694267515924, "grad_norm": 0.28693842374855705, "learning_rate": 4.2068100814724815e-06, "loss": 0.0012, "step": 8932 }, { "epoch": 4.064149226569609, "grad_norm": 0.301821920074811, "learning_rate": 4.2028433201237365e-06, "loss": 0.0044, "step": 8933 }, { "epoch": 4.064604185623294, "grad_norm": 0.2333741903145277, "learning_rate": 4.198878258224715e-06, "loss": 0.0034, "step": 8934 }, { "epoch": 4.0650591446769795, "grad_norm": 0.06315754156887961, "learning_rate": 4.194914896099433e-06, "loss": 0.001, "step": 8935 }, { "epoch": 4.065514103730664, "grad_norm": 0.17880914702564393, "learning_rate": 4.1909532340717485e-06, "loss": 0.0016, "step": 8936 }, { "epoch": 4.065969062784349, "grad_norm": 0.11798097959360136, "learning_rate": 4.186993272465395e-06, "loss": 0.0026, "step": 8937 }, { "epoch": 4.066424021838035, "grad_norm": 0.17473767566283288, "learning_rate": 4.183035011603953e-06, "loss": 0.0041, "step": 8938 }, { "epoch": 4.06687898089172, "grad_norm": 0.1932526586041605, "learning_rate": 4.179078451810889e-06, "loss": 0.0029, "step": 8939 }, { "epoch": 4.067333939945405, "grad_norm": 0.31702285240728817, "learning_rate": 4.175123593409499e-06, "loss": 0.0033, "step": 8940 }, { "epoch": 4.0677888989990905, "grad_norm": 0.15636551863663342, "learning_rate": 4.171170436722974e-06, "loss": 0.0021, "step": 8941 }, { "epoch": 4.068243858052775, "grad_norm": 0.20328442727453977, "learning_rate": 4.167218982074337e-06, "loss": 0.0049, "step": 8942 }, { "epoch": 4.06869881710646, "grad_norm": 0.13226098985414514, "learning_rate": 4.1632692297864765e-06, "loss": 0.0027, "step": 8943 }, { "epoch": 4.069153776160146, "grad_norm": 0.05277324306396963, "learning_rate": 4.159321180182166e-06, "loss": 0.0005, "step": 8944 }, { "epoch": 4.069608735213831, "grad_norm": 0.1266515441846948, "learning_rate": 4.155374833584011e-06, "loss": 0.0031, "step": 8945 }, { "epoch": 4.070063694267516, "grad_norm": 0.13160455004364516, "learning_rate": 4.1514301903144925e-06, "loss": 0.0038, "step": 8946 }, { "epoch": 4.070518653321201, "grad_norm": 0.09709402546895343, "learning_rate": 4.1474872506959416e-06, "loss": 0.0008, "step": 8947 }, { "epoch": 4.070973612374886, "grad_norm": 0.10583674295014915, "learning_rate": 4.143546015050567e-06, "loss": 0.0012, "step": 8948 }, { "epoch": 4.071428571428571, "grad_norm": 0.08492225530873129, "learning_rate": 4.139606483700423e-06, "loss": 0.0031, "step": 8949 }, { "epoch": 4.071883530482257, "grad_norm": 0.1561428008272186, "learning_rate": 4.135668656967434e-06, "loss": 0.0043, "step": 8950 }, { "epoch": 4.072338489535942, "grad_norm": 0.1705457157869751, "learning_rate": 4.131732535173382e-06, "loss": 0.0017, "step": 8951 }, { "epoch": 4.072793448589627, "grad_norm": 0.242881283703111, "learning_rate": 4.127798118639909e-06, "loss": 0.0033, "step": 8952 }, { "epoch": 4.073248407643312, "grad_norm": 0.055563689832668024, "learning_rate": 4.123865407688507e-06, "loss": 0.0006, "step": 8953 }, { "epoch": 4.073703366696997, "grad_norm": 0.07251510841089086, "learning_rate": 4.119934402640549e-06, "loss": 0.0009, "step": 8954 }, { "epoch": 4.074158325750682, "grad_norm": 0.08719141510230408, "learning_rate": 4.116005103817264e-06, "loss": 0.0018, "step": 8955 }, { "epoch": 4.074613284804368, "grad_norm": 0.15513595051339546, "learning_rate": 4.1120775115397305e-06, "loss": 0.0074, "step": 8956 }, { "epoch": 4.075068243858053, "grad_norm": 0.09070458761138783, "learning_rate": 4.108151626128895e-06, "loss": 0.0013, "step": 8957 }, { "epoch": 4.075523202911738, "grad_norm": 0.009434699932942608, "learning_rate": 4.104227447905553e-06, "loss": 0.0001, "step": 8958 }, { "epoch": 4.075978161965423, "grad_norm": 0.11591774450326751, "learning_rate": 4.100304977190389e-06, "loss": 0.0034, "step": 8959 }, { "epoch": 4.076433121019108, "grad_norm": 0.03902208473485113, "learning_rate": 4.09638421430392e-06, "loss": 0.0006, "step": 8960 }, { "epoch": 4.076888080072793, "grad_norm": 0.11175114289618086, "learning_rate": 4.092465159566525e-06, "loss": 0.0008, "step": 8961 }, { "epoch": 4.077343039126479, "grad_norm": 0.04566756393325762, "learning_rate": 4.088547813298465e-06, "loss": 0.0004, "step": 8962 }, { "epoch": 4.077797998180164, "grad_norm": 0.09393829789008445, "learning_rate": 4.084632175819836e-06, "loss": 0.0021, "step": 8963 }, { "epoch": 4.078252957233849, "grad_norm": 0.1636165800206047, "learning_rate": 4.080718247450621e-06, "loss": 0.0019, "step": 8964 }, { "epoch": 4.078707916287534, "grad_norm": 0.09993361483304636, "learning_rate": 4.076806028510638e-06, "loss": 0.0017, "step": 8965 }, { "epoch": 4.079162875341219, "grad_norm": 0.027632320395722058, "learning_rate": 4.07289551931958e-06, "loss": 0.0005, "step": 8966 }, { "epoch": 4.079617834394904, "grad_norm": 0.07978995460111445, "learning_rate": 4.068986720196988e-06, "loss": 0.0015, "step": 8967 }, { "epoch": 4.08007279344859, "grad_norm": 0.0650089577662012, "learning_rate": 4.0650796314622765e-06, "loss": 0.0005, "step": 8968 }, { "epoch": 4.080527752502275, "grad_norm": 0.1158380080376855, "learning_rate": 4.061174253434724e-06, "loss": 0.0028, "step": 8969 }, { "epoch": 4.08098271155596, "grad_norm": 0.10321156360963588, "learning_rate": 4.057270586433451e-06, "loss": 0.0009, "step": 8970 }, { "epoch": 4.081437670609645, "grad_norm": 0.1897984789527528, "learning_rate": 4.053368630777449e-06, "loss": 0.007, "step": 8971 }, { "epoch": 4.08189262966333, "grad_norm": 0.08133230781460224, "learning_rate": 4.049468386785562e-06, "loss": 0.0016, "step": 8972 }, { "epoch": 4.082347588717015, "grad_norm": 0.2924286148164867, "learning_rate": 4.045569854776515e-06, "loss": 0.0073, "step": 8973 }, { "epoch": 4.082802547770701, "grad_norm": 0.03939672965097756, "learning_rate": 4.041673035068868e-06, "loss": 0.0015, "step": 8974 }, { "epoch": 4.083257506824386, "grad_norm": 0.07938280224378423, "learning_rate": 4.037777927981049e-06, "loss": 0.0012, "step": 8975 }, { "epoch": 4.083712465878071, "grad_norm": 0.05834217155302482, "learning_rate": 4.033884533831359e-06, "loss": 0.001, "step": 8976 }, { "epoch": 4.084167424931756, "grad_norm": 0.1031075500651799, "learning_rate": 4.029992852937936e-06, "loss": 0.0017, "step": 8977 }, { "epoch": 4.084622383985441, "grad_norm": 0.11100445981611325, "learning_rate": 4.026102885618801e-06, "loss": 0.0017, "step": 8978 }, { "epoch": 4.085077343039126, "grad_norm": 0.04771372517669258, "learning_rate": 4.022214632191826e-06, "loss": 0.0006, "step": 8979 }, { "epoch": 4.085532302092812, "grad_norm": 0.05274034204782114, "learning_rate": 4.0183280929747325e-06, "loss": 0.0016, "step": 8980 }, { "epoch": 4.085987261146497, "grad_norm": 0.08766181071830724, "learning_rate": 4.014443268285117e-06, "loss": 0.0026, "step": 8981 }, { "epoch": 4.0864422202001816, "grad_norm": 0.19121770390541762, "learning_rate": 4.0105601584404214e-06, "loss": 0.0022, "step": 8982 }, { "epoch": 4.086897179253867, "grad_norm": 0.24452937321623147, "learning_rate": 4.006678763757959e-06, "loss": 0.0016, "step": 8983 }, { "epoch": 4.087352138307552, "grad_norm": 0.12788517360548207, "learning_rate": 4.002799084554915e-06, "loss": 0.0021, "step": 8984 }, { "epoch": 4.087807097361237, "grad_norm": 0.10841993337851119, "learning_rate": 3.9989211211483026e-06, "loss": 0.0015, "step": 8985 }, { "epoch": 4.088262056414923, "grad_norm": 0.029449444453340985, "learning_rate": 3.9950448738550166e-06, "loss": 0.0006, "step": 8986 }, { "epoch": 4.088717015468608, "grad_norm": 0.19931277804022257, "learning_rate": 3.991170342991801e-06, "loss": 0.0034, "step": 8987 }, { "epoch": 4.089171974522293, "grad_norm": 0.10427500572516747, "learning_rate": 3.987297528875275e-06, "loss": 0.001, "step": 8988 }, { "epoch": 4.089626933575978, "grad_norm": 0.12736930924037126, "learning_rate": 3.983426431821899e-06, "loss": 0.0056, "step": 8989 }, { "epoch": 4.090081892629663, "grad_norm": 0.1336889099389317, "learning_rate": 3.9795570521480085e-06, "loss": 0.002, "step": 8990 }, { "epoch": 4.090536851683349, "grad_norm": 0.1172073015523535, "learning_rate": 3.97568939016979e-06, "loss": 0.0014, "step": 8991 }, { "epoch": 4.090991810737034, "grad_norm": 0.25191131194980004, "learning_rate": 3.971823446203282e-06, "loss": 0.0022, "step": 8992 }, { "epoch": 4.091446769790719, "grad_norm": 0.16248710994420273, "learning_rate": 3.967959220564404e-06, "loss": 0.0031, "step": 8993 }, { "epoch": 4.091901728844404, "grad_norm": 0.16253710097266288, "learning_rate": 3.964096713568924e-06, "loss": 0.0014, "step": 8994 }, { "epoch": 4.092356687898089, "grad_norm": 0.17451934958090945, "learning_rate": 3.960235925532457e-06, "loss": 0.0091, "step": 8995 }, { "epoch": 4.092811646951774, "grad_norm": 0.05779535995100376, "learning_rate": 3.956376856770494e-06, "loss": 0.001, "step": 8996 }, { "epoch": 4.09326660600546, "grad_norm": 0.10586673035413846, "learning_rate": 3.952519507598382e-06, "loss": 0.0022, "step": 8997 }, { "epoch": 4.093721565059145, "grad_norm": 0.11832210782943538, "learning_rate": 3.948663878331329e-06, "loss": 0.0013, "step": 8998 }, { "epoch": 4.09417652411283, "grad_norm": 0.2749403029519595, "learning_rate": 3.944809969284399e-06, "loss": 0.0047, "step": 8999 }, { "epoch": 4.094631483166515, "grad_norm": 0.11149814474388163, "learning_rate": 3.940957780772514e-06, "loss": 0.0005, "step": 9000 }, { "epoch": 4.0950864422202, "grad_norm": 0.5706599712918773, "learning_rate": 3.937107313110449e-06, "loss": 0.0083, "step": 9001 }, { "epoch": 4.095541401273885, "grad_norm": 0.10156060907842474, "learning_rate": 3.933258566612863e-06, "loss": 0.0022, "step": 9002 }, { "epoch": 4.095996360327571, "grad_norm": 0.20699259361612146, "learning_rate": 3.929411541594246e-06, "loss": 0.0036, "step": 9003 }, { "epoch": 4.096451319381256, "grad_norm": 0.3591545637751722, "learning_rate": 3.925566238368969e-06, "loss": 0.0083, "step": 9004 }, { "epoch": 4.096906278434941, "grad_norm": 0.03843587010519756, "learning_rate": 3.921722657251245e-06, "loss": 0.0005, "step": 9005 }, { "epoch": 4.097361237488626, "grad_norm": 0.13085084282602671, "learning_rate": 3.917880798555154e-06, "loss": 0.0053, "step": 9006 }, { "epoch": 4.097816196542311, "grad_norm": 0.12717764875334428, "learning_rate": 3.914040662594642e-06, "loss": 0.0014, "step": 9007 }, { "epoch": 4.098271155595996, "grad_norm": 0.05755294505150601, "learning_rate": 3.910202249683506e-06, "loss": 0.0012, "step": 9008 }, { "epoch": 4.098726114649682, "grad_norm": 0.023776073972278506, "learning_rate": 3.9063655601354e-06, "loss": 0.0003, "step": 9009 }, { "epoch": 4.099181073703367, "grad_norm": 0.13096270625836554, "learning_rate": 3.9025305942638365e-06, "loss": 0.0027, "step": 9010 }, { "epoch": 4.099636032757052, "grad_norm": 0.21236259885119335, "learning_rate": 3.898697352382197e-06, "loss": 0.0039, "step": 9011 }, { "epoch": 4.100090991810737, "grad_norm": 0.192234941210357, "learning_rate": 3.8948658348037234e-06, "loss": 0.0086, "step": 9012 }, { "epoch": 4.100545950864422, "grad_norm": 0.04467181264616869, "learning_rate": 3.891036041841506e-06, "loss": 0.0007, "step": 9013 }, { "epoch": 4.101000909918107, "grad_norm": 0.04791039483366729, "learning_rate": 3.887207973808494e-06, "loss": 0.0008, "step": 9014 }, { "epoch": 4.101455868971793, "grad_norm": 0.09746672916443547, "learning_rate": 3.883381631017502e-06, "loss": 0.0008, "step": 9015 }, { "epoch": 4.101910828025478, "grad_norm": 0.07695667743165295, "learning_rate": 3.879557013781193e-06, "loss": 0.0013, "step": 9016 }, { "epoch": 4.1023657870791626, "grad_norm": 0.12216698135521835, "learning_rate": 3.8757341224121085e-06, "loss": 0.0028, "step": 9017 }, { "epoch": 4.102820746132848, "grad_norm": 0.2515919253026368, "learning_rate": 3.871912957222642e-06, "loss": 0.0035, "step": 9018 }, { "epoch": 4.103275705186533, "grad_norm": 0.011397978200521755, "learning_rate": 3.868093518525034e-06, "loss": 0.0002, "step": 9019 }, { "epoch": 4.103730664240218, "grad_norm": 0.09713431907073634, "learning_rate": 3.864275806631393e-06, "loss": 0.0011, "step": 9020 }, { "epoch": 4.104185623293904, "grad_norm": 0.05568334790713967, "learning_rate": 3.860459821853679e-06, "loss": 0.0011, "step": 9021 }, { "epoch": 4.104640582347589, "grad_norm": 0.24594327149793804, "learning_rate": 3.856645564503727e-06, "loss": 0.0023, "step": 9022 }, { "epoch": 4.1050955414012735, "grad_norm": 0.09399983405935713, "learning_rate": 3.852833034893219e-06, "loss": 0.0011, "step": 9023 }, { "epoch": 4.105550500454959, "grad_norm": 0.17047049724731764, "learning_rate": 3.849022233333691e-06, "loss": 0.0086, "step": 9024 }, { "epoch": 4.106005459508644, "grad_norm": 0.15162903558255755, "learning_rate": 3.845213160136551e-06, "loss": 0.0008, "step": 9025 }, { "epoch": 4.106460418562329, "grad_norm": 0.06510365780744384, "learning_rate": 3.841405815613056e-06, "loss": 0.0011, "step": 9026 }, { "epoch": 4.106915377616015, "grad_norm": 0.12516847857607075, "learning_rate": 3.837600200074329e-06, "loss": 0.0036, "step": 9027 }, { "epoch": 4.1073703366697, "grad_norm": 0.36073718702744834, "learning_rate": 3.833796313831345e-06, "loss": 0.0057, "step": 9028 }, { "epoch": 4.1078252957233845, "grad_norm": 0.27266134627239486, "learning_rate": 3.829994157194943e-06, "loss": 0.0034, "step": 9029 }, { "epoch": 4.10828025477707, "grad_norm": 0.09588427825091118, "learning_rate": 3.826193730475808e-06, "loss": 0.0012, "step": 9030 }, { "epoch": 4.108735213830755, "grad_norm": 0.17889158155221085, "learning_rate": 3.822395033984502e-06, "loss": 0.0043, "step": 9031 }, { "epoch": 4.10919017288444, "grad_norm": 0.5684686060363283, "learning_rate": 3.818598068031442e-06, "loss": 0.0048, "step": 9032 }, { "epoch": 4.109645131938126, "grad_norm": 0.13188820621188582, "learning_rate": 3.814802832926895e-06, "loss": 0.002, "step": 9033 }, { "epoch": 4.110100090991811, "grad_norm": 0.16734200065885457, "learning_rate": 3.8110093289809857e-06, "loss": 0.0024, "step": 9034 }, { "epoch": 4.1105550500454955, "grad_norm": 0.3027672443060466, "learning_rate": 3.8072175565037033e-06, "loss": 0.0044, "step": 9035 }, { "epoch": 4.111010009099181, "grad_norm": 0.06514761106903538, "learning_rate": 3.8034275158048997e-06, "loss": 0.0011, "step": 9036 }, { "epoch": 4.111464968152866, "grad_norm": 0.2833385777260312, "learning_rate": 3.799639207194272e-06, "loss": 0.0026, "step": 9037 }, { "epoch": 4.111919927206552, "grad_norm": 0.08954390006603763, "learning_rate": 3.7958526309813917e-06, "loss": 0.001, "step": 9038 }, { "epoch": 4.112374886260237, "grad_norm": 0.1326363423250957, "learning_rate": 3.7920677874756812e-06, "loss": 0.002, "step": 9039 }, { "epoch": 4.112829845313922, "grad_norm": 0.0838909956660388, "learning_rate": 3.788284676986409e-06, "loss": 0.0012, "step": 9040 }, { "epoch": 4.113284804367607, "grad_norm": 0.043665483972541386, "learning_rate": 3.784503299822728e-06, "loss": 0.001, "step": 9041 }, { "epoch": 4.113739763421292, "grad_norm": 0.3036781569796629, "learning_rate": 3.7807236562936277e-06, "loss": 0.0056, "step": 9042 }, { "epoch": 4.114194722474977, "grad_norm": 0.08043169705801662, "learning_rate": 3.776945746707966e-06, "loss": 0.0009, "step": 9043 }, { "epoch": 4.114649681528663, "grad_norm": 0.15429719601946978, "learning_rate": 3.773169571374449e-06, "loss": 0.0028, "step": 9044 }, { "epoch": 4.115104640582348, "grad_norm": 0.046263921173593285, "learning_rate": 3.769395130601655e-06, "loss": 0.0011, "step": 9045 }, { "epoch": 4.115559599636033, "grad_norm": 0.16526143525620351, "learning_rate": 3.7656224246980204e-06, "loss": 0.0012, "step": 9046 }, { "epoch": 4.116014558689718, "grad_norm": 0.06343237013080524, "learning_rate": 3.761851453971829e-06, "loss": 0.0009, "step": 9047 }, { "epoch": 4.116469517743403, "grad_norm": 0.2705379168295649, "learning_rate": 3.7580822187312263e-06, "loss": 0.0019, "step": 9048 }, { "epoch": 4.116924476797088, "grad_norm": 0.021282242896661405, "learning_rate": 3.754314719284208e-06, "loss": 0.0004, "step": 9049 }, { "epoch": 4.117379435850774, "grad_norm": 0.08304599128056231, "learning_rate": 3.750548955938654e-06, "loss": 0.0009, "step": 9050 }, { "epoch": 4.117834394904459, "grad_norm": 0.23124450267532665, "learning_rate": 3.7467849290022727e-06, "loss": 0.0029, "step": 9051 }, { "epoch": 4.1182893539581436, "grad_norm": 0.30733286265872717, "learning_rate": 3.7430226387826535e-06, "loss": 0.0047, "step": 9052 }, { "epoch": 4.118744313011829, "grad_norm": 0.11741803637066381, "learning_rate": 3.739262085587228e-06, "loss": 0.0014, "step": 9053 }, { "epoch": 4.119199272065514, "grad_norm": 0.146766912227471, "learning_rate": 3.7355032697232924e-06, "loss": 0.0015, "step": 9054 }, { "epoch": 4.119654231119199, "grad_norm": 0.23001256258473038, "learning_rate": 3.731746191497995e-06, "loss": 0.004, "step": 9055 }, { "epoch": 4.120109190172885, "grad_norm": 0.02557849636544926, "learning_rate": 3.7279908512183576e-06, "loss": 0.0003, "step": 9056 }, { "epoch": 4.12056414922657, "grad_norm": 0.01631036816885238, "learning_rate": 3.7242372491912454e-06, "loss": 0.0002, "step": 9057 }, { "epoch": 4.1210191082802545, "grad_norm": 0.04155973915654957, "learning_rate": 3.7204853857233768e-06, "loss": 0.0006, "step": 9058 }, { "epoch": 4.12147406733394, "grad_norm": 0.22390250606476192, "learning_rate": 3.7167352611213513e-06, "loss": 0.0026, "step": 9059 }, { "epoch": 4.121929026387625, "grad_norm": 0.022337386253851694, "learning_rate": 3.7129868756916013e-06, "loss": 0.0004, "step": 9060 }, { "epoch": 4.12238398544131, "grad_norm": 0.10327413725081745, "learning_rate": 3.709240229740435e-06, "loss": 0.0016, "step": 9061 }, { "epoch": 4.122838944494996, "grad_norm": 0.17539408651506538, "learning_rate": 3.7054953235740126e-06, "loss": 0.0032, "step": 9062 }, { "epoch": 4.123293903548681, "grad_norm": 0.12050141670499576, "learning_rate": 3.701752157498345e-06, "loss": 0.0019, "step": 9063 }, { "epoch": 4.1237488626023655, "grad_norm": 0.41598602425108844, "learning_rate": 3.698010731819304e-06, "loss": 0.0008, "step": 9064 }, { "epoch": 4.124203821656051, "grad_norm": 0.16270405593573922, "learning_rate": 3.694271046842629e-06, "loss": 0.002, "step": 9065 }, { "epoch": 4.124658780709736, "grad_norm": 0.0756162811666449, "learning_rate": 3.690533102873911e-06, "loss": 0.0016, "step": 9066 }, { "epoch": 4.125113739763421, "grad_norm": 0.19352259906165706, "learning_rate": 3.6867969002185974e-06, "loss": 0.0014, "step": 9067 }, { "epoch": 4.125568698817107, "grad_norm": 0.09556557185120605, "learning_rate": 3.683062439181992e-06, "loss": 0.0022, "step": 9068 }, { "epoch": 4.126023657870792, "grad_norm": 0.10760202981651501, "learning_rate": 3.6793297200692495e-06, "loss": 0.0011, "step": 9069 }, { "epoch": 4.1264786169244765, "grad_norm": 0.12136952535097842, "learning_rate": 3.6755987431854043e-06, "loss": 0.0009, "step": 9070 }, { "epoch": 4.126933575978162, "grad_norm": 0.021612961768180932, "learning_rate": 3.6718695088353323e-06, "loss": 0.0002, "step": 9071 }, { "epoch": 4.127388535031847, "grad_norm": 0.20157206730141614, "learning_rate": 3.6681420173237584e-06, "loss": 0.0045, "step": 9072 }, { "epoch": 4.127843494085532, "grad_norm": 0.15260562979581585, "learning_rate": 3.6644162689552924e-06, "loss": 0.0014, "step": 9073 }, { "epoch": 4.128298453139218, "grad_norm": 0.04056682325925041, "learning_rate": 3.6606922640343737e-06, "loss": 0.0006, "step": 9074 }, { "epoch": 4.128753412192903, "grad_norm": 0.046674918075318796, "learning_rate": 3.6569700028653205e-06, "loss": 0.0011, "step": 9075 }, { "epoch": 4.1292083712465875, "grad_norm": 0.014612648721727154, "learning_rate": 3.6532494857522943e-06, "loss": 0.0003, "step": 9076 }, { "epoch": 4.129663330300273, "grad_norm": 0.15774164463264453, "learning_rate": 3.649530712999319e-06, "loss": 0.0004, "step": 9077 }, { "epoch": 4.130118289353958, "grad_norm": 0.13570358757025455, "learning_rate": 3.6458136849102707e-06, "loss": 0.0009, "step": 9078 }, { "epoch": 4.130573248407643, "grad_norm": 0.02618985228584831, "learning_rate": 3.6420984017888933e-06, "loss": 0.0002, "step": 9079 }, { "epoch": 4.131028207461329, "grad_norm": 0.08571009730046032, "learning_rate": 3.6383848639387874e-06, "loss": 0.0011, "step": 9080 }, { "epoch": 4.131483166515014, "grad_norm": 0.09440133229436594, "learning_rate": 3.6346730716634025e-06, "loss": 0.0008, "step": 9081 }, { "epoch": 4.131938125568698, "grad_norm": 0.09640755371360318, "learning_rate": 3.6309630252660514e-06, "loss": 0.0015, "step": 9082 }, { "epoch": 4.132393084622384, "grad_norm": 0.1475113383264629, "learning_rate": 3.627254725049892e-06, "loss": 0.0024, "step": 9083 }, { "epoch": 4.132848043676069, "grad_norm": 0.04922258530952729, "learning_rate": 3.6235481713179644e-06, "loss": 0.0009, "step": 9084 }, { "epoch": 4.133303002729754, "grad_norm": 0.11219859468169022, "learning_rate": 3.619843364373146e-06, "loss": 0.002, "step": 9085 }, { "epoch": 4.13375796178344, "grad_norm": 0.1377850026849835, "learning_rate": 3.6161403045181704e-06, "loss": 0.001, "step": 9086 }, { "epoch": 4.1342129208371245, "grad_norm": 0.07250211990699959, "learning_rate": 3.6124389920556445e-06, "loss": 0.0006, "step": 9087 }, { "epoch": 4.134667879890809, "grad_norm": 0.05572322815056665, "learning_rate": 3.608739427288013e-06, "loss": 0.001, "step": 9088 }, { "epoch": 4.135122838944495, "grad_norm": 0.1274201958726622, "learning_rate": 3.605041610517601e-06, "loss": 0.0028, "step": 9089 }, { "epoch": 4.13557779799818, "grad_norm": 0.20625300724191728, "learning_rate": 3.601345542046569e-06, "loss": 0.0038, "step": 9090 }, { "epoch": 4.136032757051865, "grad_norm": 0.08107458261609826, "learning_rate": 3.597651222176943e-06, "loss": 0.0011, "step": 9091 }, { "epoch": 4.136487716105551, "grad_norm": 0.2587836937400572, "learning_rate": 3.593958651210608e-06, "loss": 0.0031, "step": 9092 }, { "epoch": 4.1369426751592355, "grad_norm": 0.08150080212833863, "learning_rate": 3.590267829449298e-06, "loss": 0.0016, "step": 9093 }, { "epoch": 4.13739763421292, "grad_norm": 0.08415404645854306, "learning_rate": 3.5865787571946143e-06, "loss": 0.0034, "step": 9094 }, { "epoch": 4.137852593266606, "grad_norm": 0.21940424016163185, "learning_rate": 3.5828914347480174e-06, "loss": 0.0051, "step": 9095 }, { "epoch": 4.138307552320291, "grad_norm": 0.1336141910897121, "learning_rate": 3.5792058624108142e-06, "loss": 0.0067, "step": 9096 }, { "epoch": 4.138762511373977, "grad_norm": 0.05266961409955484, "learning_rate": 3.5755220404841723e-06, "loss": 0.0005, "step": 9097 }, { "epoch": 4.139217470427662, "grad_norm": 0.12802461472262355, "learning_rate": 3.571839969269114e-06, "loss": 0.0019, "step": 9098 }, { "epoch": 4.1396724294813465, "grad_norm": 0.09316437285137552, "learning_rate": 3.568159649066527e-06, "loss": 0.0029, "step": 9099 }, { "epoch": 4.140127388535032, "grad_norm": 0.12421320881225179, "learning_rate": 3.5644810801771454e-06, "loss": 0.0017, "step": 9100 }, { "epoch": 4.140582347588717, "grad_norm": 0.21417787577430908, "learning_rate": 3.5608042629015707e-06, "loss": 0.0012, "step": 9101 }, { "epoch": 4.141037306642402, "grad_norm": 0.25311927062614276, "learning_rate": 3.5571291975402543e-06, "loss": 0.005, "step": 9102 }, { "epoch": 4.141492265696088, "grad_norm": 0.24527437649721368, "learning_rate": 3.5534558843935e-06, "loss": 0.0035, "step": 9103 }, { "epoch": 4.141947224749773, "grad_norm": 0.12394747217807611, "learning_rate": 3.5497843237614846e-06, "loss": 0.0015, "step": 9104 }, { "epoch": 4.1424021838034575, "grad_norm": 0.20071735272064875, "learning_rate": 3.5461145159442237e-06, "loss": 0.0039, "step": 9105 }, { "epoch": 4.142857142857143, "grad_norm": 0.12436172851725805, "learning_rate": 3.5424464612416026e-06, "loss": 0.0031, "step": 9106 }, { "epoch": 4.143312101910828, "grad_norm": 0.08230969088106842, "learning_rate": 3.5387801599533475e-06, "loss": 0.0015, "step": 9107 }, { "epoch": 4.143767060964513, "grad_norm": 0.05534777198683033, "learning_rate": 3.5351156123790614e-06, "loss": 0.0006, "step": 9108 }, { "epoch": 4.144222020018199, "grad_norm": 0.12497806521081695, "learning_rate": 3.5314528188181984e-06, "loss": 0.0012, "step": 9109 }, { "epoch": 4.144676979071884, "grad_norm": 0.22212702794396189, "learning_rate": 3.527791779570058e-06, "loss": 0.0011, "step": 9110 }, { "epoch": 4.1451319381255685, "grad_norm": 0.3534897558315844, "learning_rate": 3.5241324949338074e-06, "loss": 0.0014, "step": 9111 }, { "epoch": 4.145586897179254, "grad_norm": 0.13236565137036602, "learning_rate": 3.520474965208459e-06, "loss": 0.0043, "step": 9112 }, { "epoch": 4.146041856232939, "grad_norm": 0.17211754486556213, "learning_rate": 3.516819190692902e-06, "loss": 0.003, "step": 9113 }, { "epoch": 4.146496815286624, "grad_norm": 0.09735006974501408, "learning_rate": 3.513165171685856e-06, "loss": 0.0011, "step": 9114 }, { "epoch": 4.14695177434031, "grad_norm": 0.04203415138633591, "learning_rate": 3.509512908485926e-06, "loss": 0.0007, "step": 9115 }, { "epoch": 4.147406733393995, "grad_norm": 0.026842122657791664, "learning_rate": 3.505862401391552e-06, "loss": 0.0005, "step": 9116 }, { "epoch": 4.147861692447679, "grad_norm": 0.04194081312633875, "learning_rate": 3.5022136507010276e-06, "loss": 0.0005, "step": 9117 }, { "epoch": 4.148316651501365, "grad_norm": 0.18786337134507414, "learning_rate": 3.498566656712529e-06, "loss": 0.002, "step": 9118 }, { "epoch": 4.14877161055505, "grad_norm": 0.1672903194139271, "learning_rate": 3.494921419724062e-06, "loss": 0.0029, "step": 9119 }, { "epoch": 4.149226569608735, "grad_norm": 0.13057759459013607, "learning_rate": 3.4912779400334993e-06, "loss": 0.0016, "step": 9120 }, { "epoch": 4.149681528662421, "grad_norm": 0.1312075707303854, "learning_rate": 3.487636217938567e-06, "loss": 0.0025, "step": 9121 }, { "epoch": 4.1501364877161055, "grad_norm": 0.04317188802622885, "learning_rate": 3.4839962537368516e-06, "loss": 0.0005, "step": 9122 }, { "epoch": 4.15059144676979, "grad_norm": 0.1602013566521898, "learning_rate": 3.480358047725804e-06, "loss": 0.001, "step": 9123 }, { "epoch": 4.151046405823476, "grad_norm": 0.13739313614668786, "learning_rate": 3.4767216002027146e-06, "loss": 0.001, "step": 9124 }, { "epoch": 4.151501364877161, "grad_norm": 0.15601828660337755, "learning_rate": 3.47308691146474e-06, "loss": 0.0046, "step": 9125 }, { "epoch": 4.151956323930846, "grad_norm": 0.14061124738145536, "learning_rate": 3.469453981808887e-06, "loss": 0.0037, "step": 9126 }, { "epoch": 4.152411282984532, "grad_norm": 0.11112949376568539, "learning_rate": 3.4658228115320157e-06, "loss": 0.0019, "step": 9127 }, { "epoch": 4.1528662420382165, "grad_norm": 0.02652197481017562, "learning_rate": 3.4621934009308603e-06, "loss": 0.0004, "step": 9128 }, { "epoch": 4.153321201091901, "grad_norm": 0.29622890402441915, "learning_rate": 3.458565750301998e-06, "loss": 0.0075, "step": 9129 }, { "epoch": 4.153776160145587, "grad_norm": 0.1951180763337643, "learning_rate": 3.4549398599418663e-06, "loss": 0.0035, "step": 9130 }, { "epoch": 4.154231119199272, "grad_norm": 0.011679160232895324, "learning_rate": 3.451315730146751e-06, "loss": 0.0002, "step": 9131 }, { "epoch": 4.154686078252957, "grad_norm": 0.323711104436701, "learning_rate": 3.447693361212795e-06, "loss": 0.0034, "step": 9132 }, { "epoch": 4.155141037306643, "grad_norm": 0.08473351496974307, "learning_rate": 3.4440727534360144e-06, "loss": 0.0007, "step": 9133 }, { "epoch": 4.1555959963603275, "grad_norm": 0.10652270630996404, "learning_rate": 3.440453907112262e-06, "loss": 0.0019, "step": 9134 }, { "epoch": 4.156050955414012, "grad_norm": 0.2871406788316793, "learning_rate": 3.4368368225372482e-06, "loss": 0.0056, "step": 9135 }, { "epoch": 4.156505914467698, "grad_norm": 0.11951785167195003, "learning_rate": 3.4332215000065587e-06, "loss": 0.0007, "step": 9136 }, { "epoch": 4.156960873521383, "grad_norm": 0.05139817529561993, "learning_rate": 3.4296079398156074e-06, "loss": 0.0002, "step": 9137 }, { "epoch": 4.157415832575068, "grad_norm": 0.18981049853513207, "learning_rate": 3.425996142259688e-06, "loss": 0.0027, "step": 9138 }, { "epoch": 4.157870791628754, "grad_norm": 0.026269012459965057, "learning_rate": 3.4223861076339374e-06, "loss": 0.0002, "step": 9139 }, { "epoch": 4.1583257506824385, "grad_norm": 0.08736043785130142, "learning_rate": 3.4187778362333502e-06, "loss": 0.0023, "step": 9140 }, { "epoch": 4.158780709736123, "grad_norm": 0.12230080216323196, "learning_rate": 3.4151713283527732e-06, "loss": 0.001, "step": 9141 }, { "epoch": 4.159235668789809, "grad_norm": 0.09562310248906758, "learning_rate": 3.4115665842869184e-06, "loss": 0.0012, "step": 9142 }, { "epoch": 4.159690627843494, "grad_norm": 0.27568718171011697, "learning_rate": 3.4079636043303555e-06, "loss": 0.0045, "step": 9143 }, { "epoch": 4.160145586897179, "grad_norm": 0.005634397887241552, "learning_rate": 3.404362388777499e-06, "loss": 0.0001, "step": 9144 }, { "epoch": 4.160600545950865, "grad_norm": 0.11544778190199524, "learning_rate": 3.400762937922622e-06, "loss": 0.0015, "step": 9145 }, { "epoch": 4.1610555050045495, "grad_norm": 0.30863823175406946, "learning_rate": 3.397165252059853e-06, "loss": 0.0039, "step": 9146 }, { "epoch": 4.161510464058235, "grad_norm": 0.04957835207743252, "learning_rate": 3.3935693314831847e-06, "loss": 0.0006, "step": 9147 }, { "epoch": 4.16196542311192, "grad_norm": 0.10350443171913219, "learning_rate": 3.3899751764864594e-06, "loss": 0.0012, "step": 9148 }, { "epoch": 4.162420382165605, "grad_norm": 0.02796076163326969, "learning_rate": 3.386382787363365e-06, "loss": 0.0003, "step": 9149 }, { "epoch": 4.162875341219291, "grad_norm": 0.29049506749505666, "learning_rate": 3.3827921644074694e-06, "loss": 0.0058, "step": 9150 }, { "epoch": 4.163330300272976, "grad_norm": 0.023638184559030453, "learning_rate": 3.379203307912171e-06, "loss": 0.0004, "step": 9151 }, { "epoch": 4.16378525932666, "grad_norm": 0.30006193778434687, "learning_rate": 3.3756162181707433e-06, "loss": 0.0076, "step": 9152 }, { "epoch": 4.164240218380346, "grad_norm": 0.05665221146475244, "learning_rate": 3.3720308954763053e-06, "loss": 0.0008, "step": 9153 }, { "epoch": 4.164695177434031, "grad_norm": 0.15260635741015843, "learning_rate": 3.3684473401218304e-06, "loss": 0.0062, "step": 9154 }, { "epoch": 4.165150136487716, "grad_norm": 0.13373143636241022, "learning_rate": 3.3648655524001453e-06, "loss": 0.0019, "step": 9155 }, { "epoch": 4.165605095541402, "grad_norm": 0.07271378401660109, "learning_rate": 3.3612855326039446e-06, "loss": 0.001, "step": 9156 }, { "epoch": 4.1660600545950865, "grad_norm": 0.1468800550965672, "learning_rate": 3.3577072810257766e-06, "loss": 0.0023, "step": 9157 }, { "epoch": 4.166515013648771, "grad_norm": 0.5142985170280056, "learning_rate": 3.3541307979580354e-06, "loss": 0.0127, "step": 9158 }, { "epoch": 4.166969972702457, "grad_norm": 0.14427013900363214, "learning_rate": 3.350556083692971e-06, "loss": 0.003, "step": 9159 }, { "epoch": 4.167424931756142, "grad_norm": 0.030841411065227285, "learning_rate": 3.3469831385226968e-06, "loss": 0.0002, "step": 9160 }, { "epoch": 4.167879890809827, "grad_norm": 0.6149895884592742, "learning_rate": 3.343411962739168e-06, "loss": 0.0084, "step": 9161 }, { "epoch": 4.168334849863513, "grad_norm": 0.09153766825025657, "learning_rate": 3.3398425566342235e-06, "loss": 0.0027, "step": 9162 }, { "epoch": 4.1687898089171975, "grad_norm": 0.04151044580225179, "learning_rate": 3.3362749204995184e-06, "loss": 0.0003, "step": 9163 }, { "epoch": 4.169244767970882, "grad_norm": 0.049693138780278374, "learning_rate": 3.332709054626604e-06, "loss": 0.0014, "step": 9164 }, { "epoch": 4.169699727024568, "grad_norm": 0.07766711224383661, "learning_rate": 3.329144959306854e-06, "loss": 0.0002, "step": 9165 }, { "epoch": 4.170154686078253, "grad_norm": 0.08491008711284995, "learning_rate": 3.325582634831509e-06, "loss": 0.0018, "step": 9166 }, { "epoch": 4.170609645131938, "grad_norm": 0.1824705518854197, "learning_rate": 3.322022081491677e-06, "loss": 0.0019, "step": 9167 }, { "epoch": 4.171064604185624, "grad_norm": 0.17560195450521493, "learning_rate": 3.3184632995783003e-06, "loss": 0.0037, "step": 9168 }, { "epoch": 4.1715195632393085, "grad_norm": 0.04122978010994089, "learning_rate": 3.314906289382194e-06, "loss": 0.0003, "step": 9169 }, { "epoch": 4.171974522292993, "grad_norm": 0.3992008497515075, "learning_rate": 3.311351051194009e-06, "loss": 0.0043, "step": 9170 }, { "epoch": 4.172429481346679, "grad_norm": 0.24014573686450905, "learning_rate": 3.3077975853042703e-06, "loss": 0.0022, "step": 9171 }, { "epoch": 4.172884440400364, "grad_norm": 0.3519785842347105, "learning_rate": 3.3042458920033577e-06, "loss": 0.0015, "step": 9172 }, { "epoch": 4.173339399454049, "grad_norm": 0.005940438521649066, "learning_rate": 3.300695971581494e-06, "loss": 0.0001, "step": 9173 }, { "epoch": 4.173794358507735, "grad_norm": 0.19604613122644285, "learning_rate": 3.297147824328764e-06, "loss": 0.0022, "step": 9174 }, { "epoch": 4.1742493175614195, "grad_norm": 0.3186859033648062, "learning_rate": 3.293601450535097e-06, "loss": 0.0038, "step": 9175 }, { "epoch": 4.174704276615104, "grad_norm": 0.08575065181940676, "learning_rate": 3.2900568504903e-06, "loss": 0.0005, "step": 9176 }, { "epoch": 4.17515923566879, "grad_norm": 0.1062487331210009, "learning_rate": 3.286514024484011e-06, "loss": 0.0009, "step": 9177 }, { "epoch": 4.175614194722475, "grad_norm": 0.04456153231803642, "learning_rate": 3.282972972805742e-06, "loss": 0.0006, "step": 9178 }, { "epoch": 4.17606915377616, "grad_norm": 0.14917566113229452, "learning_rate": 3.279433695744852e-06, "loss": 0.0011, "step": 9179 }, { "epoch": 4.176524112829846, "grad_norm": 0.19224922910411754, "learning_rate": 3.275896193590544e-06, "loss": 0.0044, "step": 9180 }, { "epoch": 4.1769790718835305, "grad_norm": 0.1090157449410505, "learning_rate": 3.272360466631899e-06, "loss": 0.0021, "step": 9181 }, { "epoch": 4.177434030937215, "grad_norm": 0.19490750691717063, "learning_rate": 3.268826515157836e-06, "loss": 0.0021, "step": 9182 }, { "epoch": 4.177888989990901, "grad_norm": 0.5606244067681033, "learning_rate": 3.265294339457131e-06, "loss": 0.0053, "step": 9183 }, { "epoch": 4.178343949044586, "grad_norm": 0.050968829544764976, "learning_rate": 3.2617639398184185e-06, "loss": 0.0005, "step": 9184 }, { "epoch": 4.178798908098271, "grad_norm": 0.1763832433947497, "learning_rate": 3.258235316530184e-06, "loss": 0.003, "step": 9185 }, { "epoch": 4.179253867151957, "grad_norm": 0.24876484026784298, "learning_rate": 3.254708469880782e-06, "loss": 0.0048, "step": 9186 }, { "epoch": 4.179708826205641, "grad_norm": 0.19260785691125545, "learning_rate": 3.2511834001584e-06, "loss": 0.0012, "step": 9187 }, { "epoch": 4.180163785259326, "grad_norm": 0.11672597352336023, "learning_rate": 3.247660107651096e-06, "loss": 0.001, "step": 9188 }, { "epoch": 4.180618744313012, "grad_norm": 0.032963521753760675, "learning_rate": 3.24413859264677e-06, "loss": 0.0004, "step": 9189 }, { "epoch": 4.181073703366697, "grad_norm": 0.1319628021415218, "learning_rate": 3.2406188554331945e-06, "loss": 0.0016, "step": 9190 }, { "epoch": 4.181528662420382, "grad_norm": 0.13473936953532933, "learning_rate": 3.2371008962979787e-06, "loss": 0.0055, "step": 9191 }, { "epoch": 4.1819836214740675, "grad_norm": 0.11707760806515217, "learning_rate": 3.233584715528601e-06, "loss": 0.0009, "step": 9192 }, { "epoch": 4.182438580527752, "grad_norm": 0.12940520817915646, "learning_rate": 3.230070313412381e-06, "loss": 0.0015, "step": 9193 }, { "epoch": 4.182893539581437, "grad_norm": 0.06671195654512953, "learning_rate": 3.2265576902365007e-06, "loss": 0.0006, "step": 9194 }, { "epoch": 4.183348498635123, "grad_norm": 0.059747719901124575, "learning_rate": 3.2230468462880024e-06, "loss": 0.0004, "step": 9195 }, { "epoch": 4.183803457688808, "grad_norm": 0.17636830183324886, "learning_rate": 3.2195377818537736e-06, "loss": 0.0027, "step": 9196 }, { "epoch": 4.184258416742493, "grad_norm": 0.3186489763455502, "learning_rate": 3.216030497220557e-06, "loss": 0.0048, "step": 9197 }, { "epoch": 4.1847133757961785, "grad_norm": 0.09373430970245472, "learning_rate": 3.2125249926749453e-06, "loss": 0.0029, "step": 9198 }, { "epoch": 4.185168334849863, "grad_norm": 0.45899726032687554, "learning_rate": 3.2090212685034067e-06, "loss": 0.0115, "step": 9199 }, { "epoch": 4.185623293903548, "grad_norm": 0.13109446353542237, "learning_rate": 3.205519324992237e-06, "loss": 0.0014, "step": 9200 }, { "epoch": 4.186078252957234, "grad_norm": 0.20267463220745668, "learning_rate": 3.202019162427611e-06, "loss": 0.0025, "step": 9201 }, { "epoch": 4.186533212010919, "grad_norm": 0.05822637732114507, "learning_rate": 3.1985207810955405e-06, "loss": 0.0008, "step": 9202 }, { "epoch": 4.186988171064604, "grad_norm": 0.22843335475224197, "learning_rate": 3.195024181281894e-06, "loss": 0.005, "step": 9203 }, { "epoch": 4.1874431301182895, "grad_norm": 0.12364189575818568, "learning_rate": 3.1915293632723996e-06, "loss": 0.0006, "step": 9204 }, { "epoch": 4.187898089171974, "grad_norm": 0.4997072114534902, "learning_rate": 3.1880363273526363e-06, "loss": 0.0145, "step": 9205 }, { "epoch": 4.188353048225659, "grad_norm": 0.14982310172902552, "learning_rate": 3.1845450738080513e-06, "loss": 0.0037, "step": 9206 }, { "epoch": 4.188808007279345, "grad_norm": 0.1514840503489821, "learning_rate": 3.1810556029239215e-06, "loss": 0.0019, "step": 9207 }, { "epoch": 4.18926296633303, "grad_norm": 0.13413649920554996, "learning_rate": 3.1775679149853964e-06, "loss": 0.001, "step": 9208 }, { "epoch": 4.189717925386716, "grad_norm": 0.12505659319824924, "learning_rate": 3.174082010277468e-06, "loss": 0.0069, "step": 9209 }, { "epoch": 4.1901728844404005, "grad_norm": 0.15116995438998804, "learning_rate": 3.1705978890849947e-06, "loss": 0.0036, "step": 9210 }, { "epoch": 4.190627843494085, "grad_norm": 0.019969404189307285, "learning_rate": 3.1671155516926844e-06, "loss": 0.0002, "step": 9211 }, { "epoch": 4.191082802547771, "grad_norm": 0.2640505445704908, "learning_rate": 3.163634998385087e-06, "loss": 0.0027, "step": 9212 }, { "epoch": 4.191537761601456, "grad_norm": 0.053617916288259716, "learning_rate": 3.1601562294466313e-06, "loss": 0.0007, "step": 9213 }, { "epoch": 4.191992720655141, "grad_norm": 0.0073295347309463715, "learning_rate": 3.1566792451615755e-06, "loss": 0.0001, "step": 9214 }, { "epoch": 4.192447679708827, "grad_norm": 0.16931807400139032, "learning_rate": 3.1532040458140536e-06, "loss": 0.0036, "step": 9215 }, { "epoch": 4.1929026387625115, "grad_norm": 0.037031147759879535, "learning_rate": 3.149730631688039e-06, "loss": 0.0007, "step": 9216 }, { "epoch": 4.193357597816196, "grad_norm": 0.13260421622583976, "learning_rate": 3.1462590030673615e-06, "loss": 0.0016, "step": 9217 }, { "epoch": 4.193812556869882, "grad_norm": 0.03421193521035052, "learning_rate": 3.142789160235701e-06, "loss": 0.0006, "step": 9218 }, { "epoch": 4.194267515923567, "grad_norm": 0.294958544735628, "learning_rate": 3.1393211034766057e-06, "loss": 0.0051, "step": 9219 }, { "epoch": 4.194722474977252, "grad_norm": 0.23178319587553675, "learning_rate": 3.135854833073473e-06, "loss": 0.0044, "step": 9220 }, { "epoch": 4.195177434030938, "grad_norm": 0.055656244973794976, "learning_rate": 3.132390349309547e-06, "loss": 0.0005, "step": 9221 }, { "epoch": 4.195632393084622, "grad_norm": 0.022361619467579117, "learning_rate": 3.128927652467925e-06, "loss": 0.0002, "step": 9222 }, { "epoch": 4.196087352138307, "grad_norm": 0.17410972946479594, "learning_rate": 3.125466742831562e-06, "loss": 0.0024, "step": 9223 }, { "epoch": 4.196542311191993, "grad_norm": 0.2619920428891518, "learning_rate": 3.1220076206832786e-06, "loss": 0.0042, "step": 9224 }, { "epoch": 4.196997270245678, "grad_norm": 0.16601638267463475, "learning_rate": 3.1185502863057327e-06, "loss": 0.0023, "step": 9225 }, { "epoch": 4.197452229299363, "grad_norm": 0.09966973379982914, "learning_rate": 3.115094739981436e-06, "loss": 0.0013, "step": 9226 }, { "epoch": 4.1979071883530485, "grad_norm": 0.12445480625454765, "learning_rate": 3.1116409819927695e-06, "loss": 0.0025, "step": 9227 }, { "epoch": 4.198362147406733, "grad_norm": 0.11822376878204306, "learning_rate": 3.108189012621951e-06, "loss": 0.0017, "step": 9228 }, { "epoch": 4.198817106460418, "grad_norm": 0.11771066494977744, "learning_rate": 3.104738832151069e-06, "loss": 0.0033, "step": 9229 }, { "epoch": 4.199272065514104, "grad_norm": 0.14895715985534208, "learning_rate": 3.1012904408620537e-06, "loss": 0.0036, "step": 9230 }, { "epoch": 4.199727024567789, "grad_norm": 0.07943718663279944, "learning_rate": 3.0978438390366877e-06, "loss": 0.001, "step": 9231 }, { "epoch": 4.200181983621474, "grad_norm": 0.22730459359159755, "learning_rate": 3.0943990269566124e-06, "loss": 0.0028, "step": 9232 }, { "epoch": 4.2006369426751595, "grad_norm": 0.11442409199917457, "learning_rate": 3.0909560049033143e-06, "loss": 0.0016, "step": 9233 }, { "epoch": 4.201091901728844, "grad_norm": 0.0506635444498351, "learning_rate": 3.087514773158162e-06, "loss": 0.0003, "step": 9234 }, { "epoch": 4.201546860782529, "grad_norm": 0.01655383288105424, "learning_rate": 3.084075332002348e-06, "loss": 0.0002, "step": 9235 }, { "epoch": 4.202001819836215, "grad_norm": 0.06441108620452995, "learning_rate": 3.080637681716925e-06, "loss": 0.0004, "step": 9236 }, { "epoch": 4.2024567788899, "grad_norm": 0.05310665908573686, "learning_rate": 3.0772018225828036e-06, "loss": 0.0012, "step": 9237 }, { "epoch": 4.202911737943585, "grad_norm": 0.005515125376317968, "learning_rate": 3.073767754880744e-06, "loss": 0.0001, "step": 9238 }, { "epoch": 4.2033666969972705, "grad_norm": 0.11807439247541708, "learning_rate": 3.0703354788913673e-06, "loss": 0.0019, "step": 9239 }, { "epoch": 4.203821656050955, "grad_norm": 0.044036702400035696, "learning_rate": 3.0669049948951367e-06, "loss": 0.0005, "step": 9240 }, { "epoch": 4.20427661510464, "grad_norm": 0.08560729712815128, "learning_rate": 3.0634763031723882e-06, "loss": 0.0015, "step": 9241 }, { "epoch": 4.204731574158326, "grad_norm": 0.22169642441890003, "learning_rate": 3.0600494040032903e-06, "loss": 0.0037, "step": 9242 }, { "epoch": 4.205186533212011, "grad_norm": 0.09626245789570287, "learning_rate": 3.0566242976678707e-06, "loss": 0.0025, "step": 9243 }, { "epoch": 4.205641492265696, "grad_norm": 0.319595922697305, "learning_rate": 3.0532009844460226e-06, "loss": 0.0062, "step": 9244 }, { "epoch": 4.2060964513193815, "grad_norm": 0.2679357462740025, "learning_rate": 3.0497794646174803e-06, "loss": 0.0056, "step": 9245 }, { "epoch": 4.206551410373066, "grad_norm": 0.27415252910107113, "learning_rate": 3.046359738461832e-06, "loss": 0.0057, "step": 9246 }, { "epoch": 4.207006369426751, "grad_norm": 0.1499617821058508, "learning_rate": 3.0429418062585203e-06, "loss": 0.0037, "step": 9247 }, { "epoch": 4.207461328480437, "grad_norm": 0.1577679386802573, "learning_rate": 3.0395256682868467e-06, "loss": 0.0034, "step": 9248 }, { "epoch": 4.207916287534122, "grad_norm": 0.21531126110696452, "learning_rate": 3.0361113248259685e-06, "loss": 0.0051, "step": 9249 }, { "epoch": 4.208371246587807, "grad_norm": 0.18629178336261237, "learning_rate": 3.0326987761548825e-06, "loss": 0.004, "step": 9250 }, { "epoch": 4.2088262056414925, "grad_norm": 0.4657783105020774, "learning_rate": 3.0292880225524513e-06, "loss": 0.014, "step": 9251 }, { "epoch": 4.209281164695177, "grad_norm": 0.11758819773117964, "learning_rate": 3.0258790642973793e-06, "loss": 0.0026, "step": 9252 }, { "epoch": 4.209736123748862, "grad_norm": 0.2164736917139879, "learning_rate": 3.022471901668239e-06, "loss": 0.0055, "step": 9253 }, { "epoch": 4.210191082802548, "grad_norm": 0.030358969525760858, "learning_rate": 3.0190665349434435e-06, "loss": 0.0004, "step": 9254 }, { "epoch": 4.210646041856233, "grad_norm": 0.3602815984439154, "learning_rate": 3.0156629644012667e-06, "loss": 0.0018, "step": 9255 }, { "epoch": 4.211101000909918, "grad_norm": 0.10606617896211717, "learning_rate": 3.0122611903198345e-06, "loss": 0.0025, "step": 9256 }, { "epoch": 4.211555959963603, "grad_norm": 0.24520512752745224, "learning_rate": 3.0088612129771156e-06, "loss": 0.0062, "step": 9257 }, { "epoch": 4.212010919017288, "grad_norm": 0.024588642311601475, "learning_rate": 3.005463032650954e-06, "loss": 0.0003, "step": 9258 }, { "epoch": 4.212465878070974, "grad_norm": 0.45811862032354694, "learning_rate": 3.0020666496190257e-06, "loss": 0.0024, "step": 9259 }, { "epoch": 4.212920837124659, "grad_norm": 0.06285193945767818, "learning_rate": 2.9986720641588696e-06, "loss": 0.0002, "step": 9260 }, { "epoch": 4.213375796178344, "grad_norm": 0.0535965158373713, "learning_rate": 2.9952792765478716e-06, "loss": 0.0007, "step": 9261 }, { "epoch": 4.2138307552320295, "grad_norm": 0.17842320946801832, "learning_rate": 2.991888287063277e-06, "loss": 0.0009, "step": 9262 }, { "epoch": 4.214285714285714, "grad_norm": 0.1816583449493357, "learning_rate": 2.988499095982189e-06, "loss": 0.0034, "step": 9263 }, { "epoch": 4.214740673339399, "grad_norm": 0.21997652586589603, "learning_rate": 2.98511170358155e-06, "loss": 0.0027, "step": 9264 }, { "epoch": 4.215195632393085, "grad_norm": 0.12780039128122445, "learning_rate": 2.9817261101381666e-06, "loss": 0.0005, "step": 9265 }, { "epoch": 4.21565059144677, "grad_norm": 0.0716459428190585, "learning_rate": 2.9783423159286922e-06, "loss": 0.0012, "step": 9266 }, { "epoch": 4.216105550500455, "grad_norm": 0.3154503227193584, "learning_rate": 2.974960321229628e-06, "loss": 0.0052, "step": 9267 }, { "epoch": 4.2165605095541405, "grad_norm": 0.5609228930257417, "learning_rate": 2.9715801263173445e-06, "loss": 0.0131, "step": 9268 }, { "epoch": 4.217015468607825, "grad_norm": 0.010099134344421957, "learning_rate": 2.9682017314680567e-06, "loss": 0.0002, "step": 9269 }, { "epoch": 4.21747042766151, "grad_norm": 0.14923343418604162, "learning_rate": 2.96482513695783e-06, "loss": 0.0029, "step": 9270 }, { "epoch": 4.217925386715196, "grad_norm": 0.023652704273067208, "learning_rate": 2.961450343062583e-06, "loss": 0.0003, "step": 9271 }, { "epoch": 4.218380345768881, "grad_norm": 0.09529953005833494, "learning_rate": 2.9580773500580805e-06, "loss": 0.0012, "step": 9272 }, { "epoch": 4.218835304822566, "grad_norm": 0.09047639819625157, "learning_rate": 2.9547061582199664e-06, "loss": 0.0015, "step": 9273 }, { "epoch": 4.2192902638762515, "grad_norm": 0.1670398301163671, "learning_rate": 2.9513367678237066e-06, "loss": 0.0036, "step": 9274 }, { "epoch": 4.219745222929936, "grad_norm": 0.16602637725513741, "learning_rate": 2.94796917914463e-06, "loss": 0.004, "step": 9275 }, { "epoch": 4.220200181983621, "grad_norm": 0.011044954169683712, "learning_rate": 2.944603392457931e-06, "loss": 0.0001, "step": 9276 }, { "epoch": 4.220655141037307, "grad_norm": 0.04812254369734956, "learning_rate": 2.9412394080386375e-06, "loss": 0.0013, "step": 9277 }, { "epoch": 4.221110100090992, "grad_norm": 0.07522731487874075, "learning_rate": 2.937877226161648e-06, "loss": 0.0021, "step": 9278 }, { "epoch": 4.221565059144677, "grad_norm": 0.19143176488771727, "learning_rate": 2.934516847101701e-06, "loss": 0.0048, "step": 9279 }, { "epoch": 4.2220200181983625, "grad_norm": 0.23740994187360767, "learning_rate": 2.9311582711333885e-06, "loss": 0.0068, "step": 9280 }, { "epoch": 4.222474977252047, "grad_norm": 0.13267124739839442, "learning_rate": 2.9278014985311548e-06, "loss": 0.0018, "step": 9281 }, { "epoch": 4.222929936305732, "grad_norm": 0.10005710679425005, "learning_rate": 2.924446529569308e-06, "loss": 0.0006, "step": 9282 }, { "epoch": 4.223384895359418, "grad_norm": 0.13843979069811177, "learning_rate": 2.9210933645220014e-06, "loss": 0.0036, "step": 9283 }, { "epoch": 4.223839854413103, "grad_norm": 0.031180119647630865, "learning_rate": 2.9177420036632374e-06, "loss": 0.0003, "step": 9284 }, { "epoch": 4.224294813466788, "grad_norm": 0.04161278534894511, "learning_rate": 2.9143924472668753e-06, "loss": 0.0005, "step": 9285 }, { "epoch": 4.2247497725204735, "grad_norm": 0.542360785778991, "learning_rate": 2.9110446956066185e-06, "loss": 0.0028, "step": 9286 }, { "epoch": 4.225204731574158, "grad_norm": 0.2590022582744293, "learning_rate": 2.907698748956042e-06, "loss": 0.01, "step": 9287 }, { "epoch": 4.225659690627843, "grad_norm": 0.10657085927648012, "learning_rate": 2.904354607588555e-06, "loss": 0.002, "step": 9288 }, { "epoch": 4.226114649681529, "grad_norm": 0.25748628450817834, "learning_rate": 2.901012271777423e-06, "loss": 0.0049, "step": 9289 }, { "epoch": 4.226569608735214, "grad_norm": 0.19363946232088447, "learning_rate": 2.897671741795774e-06, "loss": 0.003, "step": 9290 }, { "epoch": 4.227024567788899, "grad_norm": 0.10173748434531225, "learning_rate": 2.894333017916573e-06, "loss": 0.0016, "step": 9291 }, { "epoch": 4.227479526842584, "grad_norm": 0.04870523835786692, "learning_rate": 2.8909961004126547e-06, "loss": 0.0007, "step": 9292 }, { "epoch": 4.227934485896269, "grad_norm": 0.0736103310116871, "learning_rate": 2.8876609895566897e-06, "loss": 0.0008, "step": 9293 }, { "epoch": 4.228389444949954, "grad_norm": 0.0846721931745991, "learning_rate": 2.88432768562121e-06, "loss": 0.002, "step": 9294 }, { "epoch": 4.22884440400364, "grad_norm": 0.22166360226763238, "learning_rate": 2.8809961888785957e-06, "loss": 0.0069, "step": 9295 }, { "epoch": 4.229299363057325, "grad_norm": 0.182674386596055, "learning_rate": 2.877666499601084e-06, "loss": 0.003, "step": 9296 }, { "epoch": 4.22975432211101, "grad_norm": 0.1990907930287629, "learning_rate": 2.874338618060765e-06, "loss": 0.0049, "step": 9297 }, { "epoch": 4.230209281164695, "grad_norm": 0.024988834653340584, "learning_rate": 2.8710125445295776e-06, "loss": 0.0002, "step": 9298 }, { "epoch": 4.23066424021838, "grad_norm": 0.14492264578596925, "learning_rate": 2.8676882792793125e-06, "loss": 0.0043, "step": 9299 }, { "epoch": 4.231119199272065, "grad_norm": 0.05682156271812069, "learning_rate": 2.864365822581605e-06, "loss": 0.0005, "step": 9300 }, { "epoch": 4.231574158325751, "grad_norm": 0.027322247919617043, "learning_rate": 2.8610451747079658e-06, "loss": 0.0004, "step": 9301 }, { "epoch": 4.232029117379436, "grad_norm": 0.2266570527635624, "learning_rate": 2.8577263359297335e-06, "loss": 0.0056, "step": 9302 }, { "epoch": 4.232484076433121, "grad_norm": 0.023645290476188133, "learning_rate": 2.8544093065181104e-06, "loss": 0.0002, "step": 9303 }, { "epoch": 4.232939035486806, "grad_norm": 0.07908255907756027, "learning_rate": 2.851094086744152e-06, "loss": 0.0009, "step": 9304 }, { "epoch": 4.233393994540491, "grad_norm": 0.16414635086921023, "learning_rate": 2.8477806768787614e-06, "loss": 0.0025, "step": 9305 }, { "epoch": 4.233848953594176, "grad_norm": 0.26809687428807455, "learning_rate": 2.8444690771926907e-06, "loss": 0.0063, "step": 9306 }, { "epoch": 4.234303912647862, "grad_norm": 0.01191430867328034, "learning_rate": 2.84115928795656e-06, "loss": 0.0001, "step": 9307 }, { "epoch": 4.234758871701547, "grad_norm": 0.10519638805357973, "learning_rate": 2.8378513094408225e-06, "loss": 0.0035, "step": 9308 }, { "epoch": 4.235213830755232, "grad_norm": 0.04556315383661376, "learning_rate": 2.8345451419157924e-06, "loss": 0.0007, "step": 9309 }, { "epoch": 4.235668789808917, "grad_norm": 0.14875651689983171, "learning_rate": 2.8312407856516315e-06, "loss": 0.0019, "step": 9310 }, { "epoch": 4.236123748862602, "grad_norm": 0.11187253805096027, "learning_rate": 2.8279382409183598e-06, "loss": 0.0009, "step": 9311 }, { "epoch": 4.236578707916287, "grad_norm": 0.09274361378417821, "learning_rate": 2.824637507985853e-06, "loss": 0.0018, "step": 9312 }, { "epoch": 4.237033666969973, "grad_norm": 0.21905698090384348, "learning_rate": 2.821338587123823e-06, "loss": 0.0024, "step": 9313 }, { "epoch": 4.237488626023658, "grad_norm": 0.10790559475195903, "learning_rate": 2.818041478601849e-06, "loss": 0.0012, "step": 9314 }, { "epoch": 4.237943585077343, "grad_norm": 0.01289407306077433, "learning_rate": 2.814746182689346e-06, "loss": 0.0001, "step": 9315 }, { "epoch": 4.238398544131028, "grad_norm": 0.4538860237655752, "learning_rate": 2.8114526996556007e-06, "loss": 0.0051, "step": 9316 }, { "epoch": 4.238853503184713, "grad_norm": 0.05564068134998087, "learning_rate": 2.808161029769735e-06, "loss": 0.0011, "step": 9317 }, { "epoch": 4.239308462238399, "grad_norm": 0.1164776412800317, "learning_rate": 2.8048711733007356e-06, "loss": 0.0033, "step": 9318 }, { "epoch": 4.239763421292084, "grad_norm": 0.028557344840835403, "learning_rate": 2.801583130517432e-06, "loss": 0.0003, "step": 9319 }, { "epoch": 4.240218380345769, "grad_norm": 0.08323164078505349, "learning_rate": 2.798296901688505e-06, "loss": 0.0005, "step": 9320 }, { "epoch": 4.2406733393994545, "grad_norm": 0.17516400569094398, "learning_rate": 2.7950124870824964e-06, "loss": 0.0091, "step": 9321 }, { "epoch": 4.241128298453139, "grad_norm": 0.07001665164646158, "learning_rate": 2.791729886967792e-06, "loss": 0.0007, "step": 9322 }, { "epoch": 4.241583257506824, "grad_norm": 0.10238203093836908, "learning_rate": 2.788449101612628e-06, "loss": 0.0017, "step": 9323 }, { "epoch": 4.24203821656051, "grad_norm": 0.08146823580845543, "learning_rate": 2.785170131285092e-06, "loss": 0.002, "step": 9324 }, { "epoch": 4.242493175614195, "grad_norm": 0.07779087597202666, "learning_rate": 2.7818929762531336e-06, "loss": 0.0015, "step": 9325 }, { "epoch": 4.24294813466788, "grad_norm": 0.2398485288313554, "learning_rate": 2.7786176367845473e-06, "loss": 0.0049, "step": 9326 }, { "epoch": 4.243403093721565, "grad_norm": 0.16349184699965824, "learning_rate": 2.77534411314698e-06, "loss": 0.0019, "step": 9327 }, { "epoch": 4.24385805277525, "grad_norm": 0.26369386896435637, "learning_rate": 2.7720724056079226e-06, "loss": 0.0041, "step": 9328 }, { "epoch": 4.244313011828935, "grad_norm": 0.10294419189263018, "learning_rate": 2.768802514434726e-06, "loss": 0.0041, "step": 9329 }, { "epoch": 4.244767970882621, "grad_norm": 0.04286717284777888, "learning_rate": 2.765534439894596e-06, "loss": 0.0004, "step": 9330 }, { "epoch": 4.245222929936306, "grad_norm": 0.07603318874625214, "learning_rate": 2.7622681822545764e-06, "loss": 0.0019, "step": 9331 }, { "epoch": 4.245677888989991, "grad_norm": 0.13388784901356196, "learning_rate": 2.7590037417815824e-06, "loss": 0.0016, "step": 9332 }, { "epoch": 4.246132848043676, "grad_norm": 0.13016376477024735, "learning_rate": 2.755741118742361e-06, "loss": 0.0024, "step": 9333 }, { "epoch": 4.246587807097361, "grad_norm": 0.30737775152150365, "learning_rate": 2.752480313403519e-06, "loss": 0.0029, "step": 9334 }, { "epoch": 4.247042766151046, "grad_norm": 0.15299911075368455, "learning_rate": 2.74922132603152e-06, "loss": 0.0014, "step": 9335 }, { "epoch": 4.247497725204732, "grad_norm": 0.09912734245088033, "learning_rate": 2.745964156892672e-06, "loss": 0.001, "step": 9336 }, { "epoch": 4.247952684258417, "grad_norm": 1.0731126166266525, "learning_rate": 2.7427088062531332e-06, "loss": 0.0197, "step": 9337 }, { "epoch": 4.248407643312102, "grad_norm": 0.08297305051193607, "learning_rate": 2.739455274378913e-06, "loss": 0.0034, "step": 9338 }, { "epoch": 4.248862602365787, "grad_norm": 0.3875503996137633, "learning_rate": 2.73620356153588e-06, "loss": 0.0105, "step": 9339 }, { "epoch": 4.249317561419472, "grad_norm": 0.21726516621312153, "learning_rate": 2.7329536679897572e-06, "loss": 0.0063, "step": 9340 }, { "epoch": 4.249772520473157, "grad_norm": 0.14502490971373866, "learning_rate": 2.729705594006099e-06, "loss": 0.0028, "step": 9341 }, { "epoch": 4.250227479526843, "grad_norm": 0.11118875445445243, "learning_rate": 2.7264593398503318e-06, "loss": 0.005, "step": 9342 }, { "epoch": 4.250682438580528, "grad_norm": 0.22793651825972472, "learning_rate": 2.723214905787719e-06, "loss": 0.0037, "step": 9343 }, { "epoch": 4.251137397634213, "grad_norm": 0.02294977338819591, "learning_rate": 2.719972292083378e-06, "loss": 0.0002, "step": 9344 }, { "epoch": 4.251592356687898, "grad_norm": 0.11754573375124573, "learning_rate": 2.7167314990022868e-06, "loss": 0.0013, "step": 9345 }, { "epoch": 4.252047315741583, "grad_norm": 0.042883676731459305, "learning_rate": 2.713492526809272e-06, "loss": 0.0007, "step": 9346 }, { "epoch": 4.252502274795268, "grad_norm": 0.18950463692538744, "learning_rate": 2.7102553757690023e-06, "loss": 0.0013, "step": 9347 }, { "epoch": 4.252957233848954, "grad_norm": 0.5700408968467069, "learning_rate": 2.707020046146003e-06, "loss": 0.0078, "step": 9348 }, { "epoch": 4.253412192902639, "grad_norm": 0.27897473351071145, "learning_rate": 2.7037865382046473e-06, "loss": 0.0019, "step": 9349 }, { "epoch": 4.253867151956324, "grad_norm": 0.2605180037871396, "learning_rate": 2.7005548522091694e-06, "loss": 0.0009, "step": 9350 }, { "epoch": 4.254322111010009, "grad_norm": 1.8882042908550452, "learning_rate": 2.6973249884236495e-06, "loss": 0.0127, "step": 9351 }, { "epoch": 4.254777070063694, "grad_norm": 0.12501494153884435, "learning_rate": 2.694096947112007e-06, "loss": 0.0026, "step": 9352 }, { "epoch": 4.255232029117379, "grad_norm": 0.040358749747012035, "learning_rate": 2.6908707285380337e-06, "loss": 0.0003, "step": 9353 }, { "epoch": 4.255686988171065, "grad_norm": 0.2230502919402955, "learning_rate": 2.687646332965352e-06, "loss": 0.0034, "step": 9354 }, { "epoch": 4.25614194722475, "grad_norm": 0.07610931409420552, "learning_rate": 2.6844237606574563e-06, "loss": 0.0018, "step": 9355 }, { "epoch": 4.256596906278435, "grad_norm": 0.3303064861915986, "learning_rate": 2.6812030118776755e-06, "loss": 0.0039, "step": 9356 }, { "epoch": 4.25705186533212, "grad_norm": 0.08019154235897855, "learning_rate": 2.6779840868891933e-06, "loss": 0.0018, "step": 9357 }, { "epoch": 4.257506824385805, "grad_norm": 0.038318351423988914, "learning_rate": 2.6747669859550404e-06, "loss": 0.0005, "step": 9358 }, { "epoch": 4.25796178343949, "grad_norm": 0.2199636751959495, "learning_rate": 2.6715517093381076e-06, "loss": 0.0136, "step": 9359 }, { "epoch": 4.258416742493176, "grad_norm": 0.2014008535306848, "learning_rate": 2.6683382573011423e-06, "loss": 0.0026, "step": 9360 }, { "epoch": 4.258871701546861, "grad_norm": 0.15691255129414683, "learning_rate": 2.665126630106726e-06, "loss": 0.0042, "step": 9361 }, { "epoch": 4.2593266606005455, "grad_norm": 0.45750122269816434, "learning_rate": 2.6619168280172968e-06, "loss": 0.0115, "step": 9362 }, { "epoch": 4.259781619654231, "grad_norm": 0.13952290495399614, "learning_rate": 2.6587088512951415e-06, "loss": 0.0068, "step": 9363 }, { "epoch": 4.260236578707916, "grad_norm": 0.24560293894435667, "learning_rate": 2.655502700202414e-06, "loss": 0.0023, "step": 9364 }, { "epoch": 4.260691537761602, "grad_norm": 0.2297474983188084, "learning_rate": 2.652298375001097e-06, "loss": 0.0055, "step": 9365 }, { "epoch": 4.261146496815287, "grad_norm": 0.13451436495568725, "learning_rate": 2.6490958759530283e-06, "loss": 0.0011, "step": 9366 }, { "epoch": 4.261601455868972, "grad_norm": 0.013475272530991724, "learning_rate": 2.6458952033199176e-06, "loss": 0.0001, "step": 9367 }, { "epoch": 4.262056414922657, "grad_norm": 0.08759481096597196, "learning_rate": 2.6426963573632945e-06, "loss": 0.0016, "step": 9368 }, { "epoch": 4.262511373976342, "grad_norm": 0.19660372871974954, "learning_rate": 2.6394993383445643e-06, "loss": 0.006, "step": 9369 }, { "epoch": 4.262966333030027, "grad_norm": 0.19459729845630314, "learning_rate": 2.6363041465249703e-06, "loss": 0.0037, "step": 9370 }, { "epoch": 4.263421292083713, "grad_norm": 0.08645847842285481, "learning_rate": 2.633110782165607e-06, "loss": 0.002, "step": 9371 }, { "epoch": 4.263876251137398, "grad_norm": 0.18625567576046076, "learning_rate": 2.629919245527418e-06, "loss": 0.0026, "step": 9372 }, { "epoch": 4.264331210191083, "grad_norm": 0.24828607889406035, "learning_rate": 2.6267295368712058e-06, "loss": 0.0035, "step": 9373 }, { "epoch": 4.264786169244768, "grad_norm": 0.4168395716589842, "learning_rate": 2.623541656457623e-06, "loss": 0.0065, "step": 9374 }, { "epoch": 4.265241128298453, "grad_norm": 0.08525957350627278, "learning_rate": 2.620355604547167e-06, "loss": 0.0013, "step": 9375 }, { "epoch": 4.265696087352138, "grad_norm": 0.38725134288989155, "learning_rate": 2.6171713814001824e-06, "loss": 0.0112, "step": 9376 }, { "epoch": 4.266151046405824, "grad_norm": 0.09436018387846828, "learning_rate": 2.6139889872768746e-06, "loss": 0.0006, "step": 9377 }, { "epoch": 4.266606005459509, "grad_norm": 0.23277868629237042, "learning_rate": 2.6108084224372887e-06, "loss": 0.001, "step": 9378 }, { "epoch": 4.267060964513194, "grad_norm": 0.07671605394592107, "learning_rate": 2.607629687141333e-06, "loss": 0.0014, "step": 9379 }, { "epoch": 4.267515923566879, "grad_norm": 0.11111491565193578, "learning_rate": 2.60445278164875e-06, "loss": 0.0009, "step": 9380 }, { "epoch": 4.267970882620564, "grad_norm": 0.1711583946348969, "learning_rate": 2.6012777062191547e-06, "loss": 0.0017, "step": 9381 }, { "epoch": 4.268425841674249, "grad_norm": 0.05864323826046482, "learning_rate": 2.5981044611119944e-06, "loss": 0.0008, "step": 9382 }, { "epoch": 4.268880800727935, "grad_norm": 0.11064782052078588, "learning_rate": 2.5949330465865675e-06, "loss": 0.0016, "step": 9383 }, { "epoch": 4.26933575978162, "grad_norm": 0.10052525786171324, "learning_rate": 2.591763462902033e-06, "loss": 0.0045, "step": 9384 }, { "epoch": 4.269790718835305, "grad_norm": 0.14657699788407572, "learning_rate": 2.588595710317396e-06, "loss": 0.0031, "step": 9385 }, { "epoch": 4.27024567788899, "grad_norm": 0.10010034809205073, "learning_rate": 2.585429789091509e-06, "loss": 0.0011, "step": 9386 }, { "epoch": 4.270700636942675, "grad_norm": 0.01658041554610315, "learning_rate": 2.5822656994830693e-06, "loss": 0.0003, "step": 9387 }, { "epoch": 4.27115559599636, "grad_norm": 0.03737532330251937, "learning_rate": 2.579103441750641e-06, "loss": 0.0004, "step": 9388 }, { "epoch": 4.271610555050046, "grad_norm": 0.041887157919591175, "learning_rate": 2.5759430161526323e-06, "loss": 0.0008, "step": 9389 }, { "epoch": 4.272065514103731, "grad_norm": 0.00498558044153608, "learning_rate": 2.5727844229472914e-06, "loss": 0.0, "step": 9390 }, { "epoch": 4.272520473157416, "grad_norm": 0.1409076362960076, "learning_rate": 2.569627662392729e-06, "loss": 0.0021, "step": 9391 }, { "epoch": 4.272975432211101, "grad_norm": 0.1534002519557771, "learning_rate": 2.566472734746894e-06, "loss": 0.0044, "step": 9392 }, { "epoch": 4.273430391264786, "grad_norm": 0.09505157781033312, "learning_rate": 2.563319640267606e-06, "loss": 0.0024, "step": 9393 }, { "epoch": 4.273885350318471, "grad_norm": 0.008225670694499422, "learning_rate": 2.5601683792125047e-06, "loss": 0.0001, "step": 9394 }, { "epoch": 4.274340309372157, "grad_norm": 0.08618997225803211, "learning_rate": 2.557018951839113e-06, "loss": 0.0006, "step": 9395 }, { "epoch": 4.274795268425842, "grad_norm": 0.39058783539276354, "learning_rate": 2.553871358404783e-06, "loss": 0.0027, "step": 9396 }, { "epoch": 4.2752502274795265, "grad_norm": 0.06704091862980681, "learning_rate": 2.5507255991667115e-06, "loss": 0.0009, "step": 9397 }, { "epoch": 4.275705186533212, "grad_norm": 0.1454115928799974, "learning_rate": 2.5475816743819714e-06, "loss": 0.001, "step": 9398 }, { "epoch": 4.276160145586897, "grad_norm": 0.13656705705298464, "learning_rate": 2.544439584307459e-06, "loss": 0.0041, "step": 9399 }, { "epoch": 4.276615104640582, "grad_norm": 0.13438288805339554, "learning_rate": 2.5412993291999393e-06, "loss": 0.0025, "step": 9400 }, { "epoch": 4.277070063694268, "grad_norm": 0.10823120818202385, "learning_rate": 2.538160909316009e-06, "loss": 0.0031, "step": 9401 }, { "epoch": 4.277525022747953, "grad_norm": 0.027919104747173788, "learning_rate": 2.535024324912133e-06, "loss": 0.0003, "step": 9402 }, { "epoch": 4.2779799818016375, "grad_norm": 0.06600254932262825, "learning_rate": 2.531889576244623e-06, "loss": 0.0005, "step": 9403 }, { "epoch": 4.278434940855323, "grad_norm": 0.0038670957379632527, "learning_rate": 2.528756663569631e-06, "loss": 0.0001, "step": 9404 }, { "epoch": 4.278889899909008, "grad_norm": 0.0683849173734131, "learning_rate": 2.5256255871431657e-06, "loss": 0.0013, "step": 9405 }, { "epoch": 4.279344858962693, "grad_norm": 0.16402380459409252, "learning_rate": 2.522496347221079e-06, "loss": 0.0029, "step": 9406 }, { "epoch": 4.279799818016379, "grad_norm": 0.12704361743203893, "learning_rate": 2.519368944059089e-06, "loss": 0.0022, "step": 9407 }, { "epoch": 4.280254777070064, "grad_norm": 0.2796941271574098, "learning_rate": 2.5162433779127414e-06, "loss": 0.0024, "step": 9408 }, { "epoch": 4.2807097361237485, "grad_norm": 0.08345425745032135, "learning_rate": 2.5131196490374543e-06, "loss": 0.0016, "step": 9409 }, { "epoch": 4.281164695177434, "grad_norm": 0.08489583985041961, "learning_rate": 2.5099977576884814e-06, "loss": 0.0019, "step": 9410 }, { "epoch": 4.281619654231119, "grad_norm": 0.29335749517831544, "learning_rate": 2.5068777041209246e-06, "loss": 0.0039, "step": 9411 }, { "epoch": 4.282074613284804, "grad_norm": 0.13186621000310242, "learning_rate": 2.503759488589741e-06, "loss": 0.0029, "step": 9412 }, { "epoch": 4.28252957233849, "grad_norm": 0.05765258883656439, "learning_rate": 2.500643111349746e-06, "loss": 0.0004, "step": 9413 }, { "epoch": 4.282984531392175, "grad_norm": 0.07087840917474615, "learning_rate": 2.497528572655586e-06, "loss": 0.0006, "step": 9414 }, { "epoch": 4.2834394904458595, "grad_norm": 0.07313996487797088, "learning_rate": 2.4944158727617683e-06, "loss": 0.0031, "step": 9415 }, { "epoch": 4.283894449499545, "grad_norm": 0.009124241837026655, "learning_rate": 2.4913050119226567e-06, "loss": 0.0002, "step": 9416 }, { "epoch": 4.28434940855323, "grad_norm": 0.20438522659976524, "learning_rate": 2.4881959903924446e-06, "loss": 0.0041, "step": 9417 }, { "epoch": 4.284804367606915, "grad_norm": 0.012352973440560694, "learning_rate": 2.4850888084251985e-06, "loss": 0.0001, "step": 9418 }, { "epoch": 4.285259326660601, "grad_norm": 0.051921490099953956, "learning_rate": 2.4819834662748203e-06, "loss": 0.0006, "step": 9419 }, { "epoch": 4.285714285714286, "grad_norm": 0.09562347465179219, "learning_rate": 2.4788799641950605e-06, "loss": 0.0008, "step": 9420 }, { "epoch": 4.2861692447679705, "grad_norm": 0.02791621996378701, "learning_rate": 2.475778302439524e-06, "loss": 0.0003, "step": 9421 }, { "epoch": 4.286624203821656, "grad_norm": 0.16818042782133125, "learning_rate": 2.4726784812616644e-06, "loss": 0.0039, "step": 9422 }, { "epoch": 4.287079162875341, "grad_norm": 0.11999488160390566, "learning_rate": 2.4695805009147893e-06, "loss": 0.0026, "step": 9423 }, { "epoch": 4.287534121929026, "grad_norm": 0.06838907087306083, "learning_rate": 2.4664843616520522e-06, "loss": 0.0011, "step": 9424 }, { "epoch": 4.287989080982712, "grad_norm": 0.18821489173125727, "learning_rate": 2.4633900637264507e-06, "loss": 0.0014, "step": 9425 }, { "epoch": 4.288444040036397, "grad_norm": 0.0669408860165936, "learning_rate": 2.4602976073908353e-06, "loss": 0.0011, "step": 9426 }, { "epoch": 4.288898999090081, "grad_norm": 0.04817215419276371, "learning_rate": 2.4572069928979145e-06, "loss": 0.0007, "step": 9427 }, { "epoch": 4.289353958143767, "grad_norm": 0.25370637472462765, "learning_rate": 2.4541182205002373e-06, "loss": 0.0037, "step": 9428 }, { "epoch": 4.289808917197452, "grad_norm": 0.21290998439638092, "learning_rate": 2.451031290450198e-06, "loss": 0.0021, "step": 9429 }, { "epoch": 4.290263876251138, "grad_norm": 0.21317504267349346, "learning_rate": 2.4479462030000566e-06, "loss": 0.0067, "step": 9430 }, { "epoch": 4.290718835304823, "grad_norm": 0.060552521509065316, "learning_rate": 2.4448629584019e-06, "loss": 0.0008, "step": 9431 }, { "epoch": 4.2911737943585075, "grad_norm": 0.15708965218251986, "learning_rate": 2.4417815569076937e-06, "loss": 0.0032, "step": 9432 }, { "epoch": 4.291628753412193, "grad_norm": 0.1847081510244047, "learning_rate": 2.438701998769222e-06, "loss": 0.0022, "step": 9433 }, { "epoch": 4.292083712465878, "grad_norm": 0.012140127308288505, "learning_rate": 2.4356242842381387e-06, "loss": 0.0003, "step": 9434 }, { "epoch": 4.292538671519563, "grad_norm": 0.03534683653635997, "learning_rate": 2.4325484135659355e-06, "loss": 0.0004, "step": 9435 }, { "epoch": 4.292993630573249, "grad_norm": 0.03731376686029526, "learning_rate": 2.4294743870039604e-06, "loss": 0.0007, "step": 9436 }, { "epoch": 4.293448589626934, "grad_norm": 0.09288653983212096, "learning_rate": 2.4264022048034155e-06, "loss": 0.004, "step": 9437 }, { "epoch": 4.2939035486806185, "grad_norm": 0.10559536204276233, "learning_rate": 2.4233318672153423e-06, "loss": 0.0018, "step": 9438 }, { "epoch": 4.294358507734304, "grad_norm": 0.1273325082939855, "learning_rate": 2.4202633744906335e-06, "loss": 0.0024, "step": 9439 }, { "epoch": 4.294813466787989, "grad_norm": 0.15378584748584145, "learning_rate": 2.417196726880025e-06, "loss": 0.0022, "step": 9440 }, { "epoch": 4.295268425841674, "grad_norm": 0.06251296948881598, "learning_rate": 2.414131924634125e-06, "loss": 0.0004, "step": 9441 }, { "epoch": 4.29572338489536, "grad_norm": 0.032327512681789676, "learning_rate": 2.411068968003366e-06, "loss": 0.0004, "step": 9442 }, { "epoch": 4.296178343949045, "grad_norm": 0.27324306832930373, "learning_rate": 2.408007857238037e-06, "loss": 0.0031, "step": 9443 }, { "epoch": 4.2966333030027295, "grad_norm": 0.10396440572386646, "learning_rate": 2.4049485925882827e-06, "loss": 0.0018, "step": 9444 }, { "epoch": 4.297088262056415, "grad_norm": 0.3315347516489002, "learning_rate": 2.4018911743040884e-06, "loss": 0.0063, "step": 9445 }, { "epoch": 4.2975432211101, "grad_norm": 0.10337151683570515, "learning_rate": 2.3988356026353022e-06, "loss": 0.0005, "step": 9446 }, { "epoch": 4.297998180163785, "grad_norm": 0.26704901828576394, "learning_rate": 2.3957818778316015e-06, "loss": 0.0054, "step": 9447 }, { "epoch": 4.298453139217471, "grad_norm": 0.12932715259411093, "learning_rate": 2.3927300001425264e-06, "loss": 0.0025, "step": 9448 }, { "epoch": 4.298908098271156, "grad_norm": 0.12571518539706053, "learning_rate": 2.389679969817463e-06, "loss": 0.0028, "step": 9449 }, { "epoch": 4.2993630573248405, "grad_norm": 0.07928923450290012, "learning_rate": 2.3866317871056394e-06, "loss": 0.0013, "step": 9450 }, { "epoch": 4.299818016378526, "grad_norm": 0.04039016298453127, "learning_rate": 2.383585452256146e-06, "loss": 0.0005, "step": 9451 }, { "epoch": 4.300272975432211, "grad_norm": 0.08539973340244009, "learning_rate": 2.380540965517919e-06, "loss": 0.0011, "step": 9452 }, { "epoch": 4.300727934485896, "grad_norm": 0.1399737072840955, "learning_rate": 2.377498327139735e-06, "loss": 0.0042, "step": 9453 }, { "epoch": 4.301182893539582, "grad_norm": 0.13538179580782192, "learning_rate": 2.3744575373702253e-06, "loss": 0.002, "step": 9454 }, { "epoch": 4.301637852593267, "grad_norm": 0.38386490624372277, "learning_rate": 2.3714185964578666e-06, "loss": 0.0078, "step": 9455 }, { "epoch": 4.3020928116469515, "grad_norm": 0.048232641130268296, "learning_rate": 2.3683815046509934e-06, "loss": 0.0005, "step": 9456 }, { "epoch": 4.302547770700637, "grad_norm": 0.09143423868492671, "learning_rate": 2.365346262197776e-06, "loss": 0.0012, "step": 9457 }, { "epoch": 4.303002729754322, "grad_norm": 0.2176388301537928, "learning_rate": 2.3623128693462504e-06, "loss": 0.0033, "step": 9458 }, { "epoch": 4.303457688808007, "grad_norm": 0.04036206084015176, "learning_rate": 2.359281326344287e-06, "loss": 0.0006, "step": 9459 }, { "epoch": 4.303912647861693, "grad_norm": 0.032203595599751195, "learning_rate": 2.3562516334396016e-06, "loss": 0.0006, "step": 9460 }, { "epoch": 4.304367606915378, "grad_norm": 0.27379619323323906, "learning_rate": 2.35322379087978e-06, "loss": 0.0058, "step": 9461 }, { "epoch": 4.304822565969062, "grad_norm": 0.14538064285916302, "learning_rate": 2.3501977989122403e-06, "loss": 0.0021, "step": 9462 }, { "epoch": 4.305277525022748, "grad_norm": 0.30476119974936705, "learning_rate": 2.347173657784249e-06, "loss": 0.0035, "step": 9463 }, { "epoch": 4.305732484076433, "grad_norm": 0.11924639488783816, "learning_rate": 2.3441513677429222e-06, "loss": 0.0015, "step": 9464 }, { "epoch": 4.306187443130118, "grad_norm": 0.07882317498249915, "learning_rate": 2.3411309290352346e-06, "loss": 0.0015, "step": 9465 }, { "epoch": 4.306642402183804, "grad_norm": 0.10898023094617056, "learning_rate": 2.3381123419080027e-06, "loss": 0.0024, "step": 9466 }, { "epoch": 4.3070973612374885, "grad_norm": 0.12999263521424298, "learning_rate": 2.335095606607893e-06, "loss": 0.002, "step": 9467 }, { "epoch": 4.307552320291173, "grad_norm": 0.3082605281727686, "learning_rate": 2.3320807233814136e-06, "loss": 0.0062, "step": 9468 }, { "epoch": 4.308007279344859, "grad_norm": 0.04917362366177489, "learning_rate": 2.329067692474929e-06, "loss": 0.0005, "step": 9469 }, { "epoch": 4.308462238398544, "grad_norm": 0.06839503868724311, "learning_rate": 2.326056514134653e-06, "loss": 0.0015, "step": 9470 }, { "epoch": 4.308917197452229, "grad_norm": 0.049680610479788764, "learning_rate": 2.3230471886066417e-06, "loss": 0.0004, "step": 9471 }, { "epoch": 4.309372156505915, "grad_norm": 0.1065901614931893, "learning_rate": 2.320039716136807e-06, "loss": 0.0018, "step": 9472 }, { "epoch": 4.3098271155595995, "grad_norm": 0.1421658400820097, "learning_rate": 2.3170340969709077e-06, "loss": 0.0105, "step": 9473 }, { "epoch": 4.310282074613285, "grad_norm": 0.19894899739457528, "learning_rate": 2.3140303313545415e-06, "loss": 0.0026, "step": 9474 }, { "epoch": 4.31073703366697, "grad_norm": 0.026004501410993277, "learning_rate": 2.311028419533173e-06, "loss": 0.0002, "step": 9475 }, { "epoch": 4.311191992720655, "grad_norm": 0.030244088916789306, "learning_rate": 2.308028361752099e-06, "loss": 0.0004, "step": 9476 }, { "epoch": 4.311646951774341, "grad_norm": 0.10907286804349577, "learning_rate": 2.3050301582564717e-06, "loss": 0.0024, "step": 9477 }, { "epoch": 4.312101910828026, "grad_norm": 0.2730590617752094, "learning_rate": 2.3020338092912848e-06, "loss": 0.0021, "step": 9478 }, { "epoch": 4.3125568698817105, "grad_norm": 0.302767202661406, "learning_rate": 2.2990393151013923e-06, "loss": 0.0044, "step": 9479 }, { "epoch": 4.313011828935396, "grad_norm": 0.03949129947716835, "learning_rate": 2.296046675931496e-06, "loss": 0.0004, "step": 9480 }, { "epoch": 4.313466787989081, "grad_norm": 0.21024457879291789, "learning_rate": 2.293055892026133e-06, "loss": 0.0022, "step": 9481 }, { "epoch": 4.313921747042766, "grad_norm": 0.07703310039784929, "learning_rate": 2.2900669636297e-06, "loss": 0.0007, "step": 9482 }, { "epoch": 4.314376706096452, "grad_norm": 0.21353757288741096, "learning_rate": 2.287079890986438e-06, "loss": 0.0033, "step": 9483 }, { "epoch": 4.314831665150137, "grad_norm": 0.05845025932471448, "learning_rate": 2.2840946743404313e-06, "loss": 0.0004, "step": 9484 }, { "epoch": 4.3152866242038215, "grad_norm": 0.07135837909732916, "learning_rate": 2.2811113139356245e-06, "loss": 0.0014, "step": 9485 }, { "epoch": 4.315741583257507, "grad_norm": 0.09734885928233403, "learning_rate": 2.2781298100158084e-06, "loss": 0.0009, "step": 9486 }, { "epoch": 4.316196542311192, "grad_norm": 0.08954021620936205, "learning_rate": 2.275150162824613e-06, "loss": 0.0006, "step": 9487 }, { "epoch": 4.316651501364877, "grad_norm": 0.09299221838305903, "learning_rate": 2.272172372605519e-06, "loss": 0.0014, "step": 9488 }, { "epoch": 4.317106460418563, "grad_norm": 0.15052693978675105, "learning_rate": 2.269196439601859e-06, "loss": 0.0019, "step": 9489 }, { "epoch": 4.317561419472248, "grad_norm": 0.15442497736995328, "learning_rate": 2.266222364056819e-06, "loss": 0.0023, "step": 9490 }, { "epoch": 4.3180163785259325, "grad_norm": 0.10547294081907609, "learning_rate": 2.263250146213422e-06, "loss": 0.002, "step": 9491 }, { "epoch": 4.318471337579618, "grad_norm": 0.02481191863646484, "learning_rate": 2.2602797863145397e-06, "loss": 0.0003, "step": 9492 }, { "epoch": 4.318926296633303, "grad_norm": 0.0666144488258205, "learning_rate": 2.2573112846029086e-06, "loss": 0.0028, "step": 9493 }, { "epoch": 4.319381255686988, "grad_norm": 0.11724034958503338, "learning_rate": 2.254344641321088e-06, "loss": 0.0021, "step": 9494 }, { "epoch": 4.319836214740674, "grad_norm": 0.201372817418205, "learning_rate": 2.251379856711508e-06, "loss": 0.0052, "step": 9495 }, { "epoch": 4.320291173794359, "grad_norm": 0.16486478561202808, "learning_rate": 2.2484169310164367e-06, "loss": 0.0017, "step": 9496 }, { "epoch": 4.320746132848043, "grad_norm": 0.0947643674703494, "learning_rate": 2.2454558644779857e-06, "loss": 0.0012, "step": 9497 }, { "epoch": 4.321201091901729, "grad_norm": 0.05740031100031815, "learning_rate": 2.2424966573381196e-06, "loss": 0.0009, "step": 9498 }, { "epoch": 4.321656050955414, "grad_norm": 0.014268796933542372, "learning_rate": 2.2395393098386563e-06, "loss": 0.0002, "step": 9499 }, { "epoch": 4.322111010009099, "grad_norm": 0.20417128709115126, "learning_rate": 2.236583822221258e-06, "loss": 0.0104, "step": 9500 }, { "epoch": 4.322565969062785, "grad_norm": 0.1227368635218314, "learning_rate": 2.233630194727432e-06, "loss": 0.003, "step": 9501 }, { "epoch": 4.3230209281164695, "grad_norm": 0.12359006374056707, "learning_rate": 2.230678427598534e-06, "loss": 0.0036, "step": 9502 }, { "epoch": 4.323475887170154, "grad_norm": 0.15134896173140625, "learning_rate": 2.2277285210757644e-06, "loss": 0.001, "step": 9503 }, { "epoch": 4.32393084622384, "grad_norm": 0.21244082864592329, "learning_rate": 2.2247804754001874e-06, "loss": 0.004, "step": 9504 }, { "epoch": 4.324385805277525, "grad_norm": 0.1790774914420256, "learning_rate": 2.2218342908126965e-06, "loss": 0.0035, "step": 9505 }, { "epoch": 4.32484076433121, "grad_norm": 0.08587064745300983, "learning_rate": 2.2188899675540385e-06, "loss": 0.0018, "step": 9506 }, { "epoch": 4.325295723384896, "grad_norm": 0.3781799217818971, "learning_rate": 2.215947505864818e-06, "loss": 0.0032, "step": 9507 }, { "epoch": 4.3257506824385805, "grad_norm": 0.08585513206728858, "learning_rate": 2.213006905985471e-06, "loss": 0.0008, "step": 9508 }, { "epoch": 4.326205641492265, "grad_norm": 0.016808649588346637, "learning_rate": 2.2100681681562983e-06, "loss": 0.0003, "step": 9509 }, { "epoch": 4.326660600545951, "grad_norm": 0.11298440526768963, "learning_rate": 2.207131292617437e-06, "loss": 0.0018, "step": 9510 }, { "epoch": 4.327115559599636, "grad_norm": 0.11936817804445891, "learning_rate": 2.204196279608875e-06, "loss": 0.002, "step": 9511 }, { "epoch": 4.327570518653321, "grad_norm": 0.04055173558441332, "learning_rate": 2.2012631293704426e-06, "loss": 0.0003, "step": 9512 }, { "epoch": 4.328025477707007, "grad_norm": 0.12317610323192295, "learning_rate": 2.198331842141832e-06, "loss": 0.0014, "step": 9513 }, { "epoch": 4.3284804367606915, "grad_norm": 0.17756591905809752, "learning_rate": 2.195402418162573e-06, "loss": 0.0041, "step": 9514 }, { "epoch": 4.328935395814376, "grad_norm": 0.02056169817272951, "learning_rate": 2.1924748576720443e-06, "loss": 0.0003, "step": 9515 }, { "epoch": 4.329390354868062, "grad_norm": 0.12215446290379624, "learning_rate": 2.1895491609094737e-06, "loss": 0.0027, "step": 9516 }, { "epoch": 4.329845313921747, "grad_norm": 0.03777314972038525, "learning_rate": 2.186625328113931e-06, "loss": 0.0004, "step": 9517 }, { "epoch": 4.330300272975432, "grad_norm": 0.20629514999960702, "learning_rate": 2.1837033595243443e-06, "loss": 0.0059, "step": 9518 }, { "epoch": 4.330755232029118, "grad_norm": 0.22994694419096698, "learning_rate": 2.1807832553794816e-06, "loss": 0.0057, "step": 9519 }, { "epoch": 4.3312101910828025, "grad_norm": 0.13213830076295596, "learning_rate": 2.177865015917957e-06, "loss": 0.0043, "step": 9520 }, { "epoch": 4.331665150136487, "grad_norm": 0.021745817984011442, "learning_rate": 2.1749486413782437e-06, "loss": 0.0002, "step": 9521 }, { "epoch": 4.332120109190173, "grad_norm": 0.1267800055973002, "learning_rate": 2.1720341319986514e-06, "loss": 0.0031, "step": 9522 }, { "epoch": 4.332575068243858, "grad_norm": 0.13471920604802268, "learning_rate": 2.169121488017334e-06, "loss": 0.0022, "step": 9523 }, { "epoch": 4.333030027297543, "grad_norm": 0.04743168561455213, "learning_rate": 2.1662107096723115e-06, "loss": 0.0003, "step": 9524 }, { "epoch": 4.333484986351229, "grad_norm": 0.06727963352448421, "learning_rate": 2.163301797201431e-06, "loss": 0.0018, "step": 9525 }, { "epoch": 4.3339399454049135, "grad_norm": 0.15198605292163403, "learning_rate": 2.1603947508423983e-06, "loss": 0.0042, "step": 9526 }, { "epoch": 4.334394904458598, "grad_norm": 0.1885484206834894, "learning_rate": 2.15748957083276e-06, "loss": 0.0048, "step": 9527 }, { "epoch": 4.334849863512284, "grad_norm": 0.25497767408885647, "learning_rate": 2.1545862574099183e-06, "loss": 0.0078, "step": 9528 }, { "epoch": 4.335304822565969, "grad_norm": 0.19139908264426153, "learning_rate": 2.151684810811122e-06, "loss": 0.0021, "step": 9529 }, { "epoch": 4.335759781619654, "grad_norm": 0.06171670927950103, "learning_rate": 2.148785231273462e-06, "loss": 0.0003, "step": 9530 }, { "epoch": 4.33621474067334, "grad_norm": 0.3929139986827478, "learning_rate": 2.145887519033879e-06, "loss": 0.0014, "step": 9531 }, { "epoch": 4.336669699727024, "grad_norm": 0.10525287862579194, "learning_rate": 2.1429916743291533e-06, "loss": 0.0017, "step": 9532 }, { "epoch": 4.337124658780709, "grad_norm": 0.20263157133973725, "learning_rate": 2.140097697395932e-06, "loss": 0.0014, "step": 9533 }, { "epoch": 4.337579617834395, "grad_norm": 0.1942058008555963, "learning_rate": 2.1372055884706865e-06, "loss": 0.0031, "step": 9534 }, { "epoch": 4.33803457688808, "grad_norm": 0.25788680272302045, "learning_rate": 2.1343153477897586e-06, "loss": 0.0042, "step": 9535 }, { "epoch": 4.338489535941765, "grad_norm": 0.155480277491666, "learning_rate": 2.131426975589321e-06, "loss": 0.006, "step": 9536 }, { "epoch": 4.3389444949954505, "grad_norm": 0.11288457149601983, "learning_rate": 2.128540472105389e-06, "loss": 0.0037, "step": 9537 }, { "epoch": 4.339399454049135, "grad_norm": 0.02762499271916583, "learning_rate": 2.1256558375738505e-06, "loss": 0.0003, "step": 9538 }, { "epoch": 4.339854413102821, "grad_norm": 0.21177882324063052, "learning_rate": 2.122773072230419e-06, "loss": 0.0042, "step": 9539 }, { "epoch": 4.340309372156506, "grad_norm": 0.20724637711299881, "learning_rate": 2.119892176310656e-06, "loss": 0.0033, "step": 9540 }, { "epoch": 4.340764331210191, "grad_norm": 1.472060022828267, "learning_rate": 2.117013150049976e-06, "loss": 0.0041, "step": 9541 }, { "epoch": 4.341219290263877, "grad_norm": 0.37625272707887697, "learning_rate": 2.1141359936836414e-06, "loss": 0.0031, "step": 9542 }, { "epoch": 4.3416742493175615, "grad_norm": 0.09996071922792839, "learning_rate": 2.1112607074467687e-06, "loss": 0.0017, "step": 9543 }, { "epoch": 4.342129208371246, "grad_norm": 0.2381031253093235, "learning_rate": 2.108387291574304e-06, "loss": 0.0042, "step": 9544 }, { "epoch": 4.342584167424932, "grad_norm": 0.10754369751042969, "learning_rate": 2.105515746301051e-06, "loss": 0.0044, "step": 9545 }, { "epoch": 4.343039126478617, "grad_norm": 0.1579456285825193, "learning_rate": 2.1026460718616607e-06, "loss": 0.0011, "step": 9546 }, { "epoch": 4.343494085532302, "grad_norm": 0.09020956490851739, "learning_rate": 2.099778268490632e-06, "loss": 0.0008, "step": 9547 }, { "epoch": 4.343949044585988, "grad_norm": 0.1376063421142804, "learning_rate": 2.0969123364222993e-06, "loss": 0.003, "step": 9548 }, { "epoch": 4.3444040036396725, "grad_norm": 0.15914248610557585, "learning_rate": 2.0940482758908695e-06, "loss": 0.0053, "step": 9549 }, { "epoch": 4.344858962693357, "grad_norm": 0.09125886957209443, "learning_rate": 2.0911860871303702e-06, "loss": 0.0015, "step": 9550 }, { "epoch": 4.345313921747043, "grad_norm": 0.26072993272102124, "learning_rate": 2.0883257703746855e-06, "loss": 0.0029, "step": 9551 }, { "epoch": 4.345768880800728, "grad_norm": 0.1891427694953306, "learning_rate": 2.085467325857554e-06, "loss": 0.0019, "step": 9552 }, { "epoch": 4.346223839854413, "grad_norm": 0.10354094075814312, "learning_rate": 2.082610753812553e-06, "loss": 0.0025, "step": 9553 }, { "epoch": 4.346678798908099, "grad_norm": 0.36761034658813946, "learning_rate": 2.079756054473106e-06, "loss": 0.0069, "step": 9554 }, { "epoch": 4.3471337579617835, "grad_norm": 0.07052119680564031, "learning_rate": 2.0769032280724825e-06, "loss": 0.001, "step": 9555 }, { "epoch": 4.347588717015468, "grad_norm": 0.218687708224779, "learning_rate": 2.074052274843813e-06, "loss": 0.0059, "step": 9556 }, { "epoch": 4.348043676069154, "grad_norm": 0.18688315074272566, "learning_rate": 2.0712031950200522e-06, "loss": 0.0048, "step": 9557 }, { "epoch": 4.348498635122839, "grad_norm": 0.07921796886123032, "learning_rate": 2.068355988834028e-06, "loss": 0.001, "step": 9558 }, { "epoch": 4.348953594176524, "grad_norm": 0.13462724150271851, "learning_rate": 2.0655106565183936e-06, "loss": 0.0026, "step": 9559 }, { "epoch": 4.34940855323021, "grad_norm": 0.24290253938320122, "learning_rate": 2.0626671983056565e-06, "loss": 0.0152, "step": 9560 }, { "epoch": 4.3498635122838945, "grad_norm": 0.20499737056499806, "learning_rate": 2.059825614428165e-06, "loss": 0.0098, "step": 9561 }, { "epoch": 4.350318471337579, "grad_norm": 0.05368113637276621, "learning_rate": 2.0569859051181306e-06, "loss": 0.0006, "step": 9562 }, { "epoch": 4.350773430391265, "grad_norm": 0.07595104390824933, "learning_rate": 2.0541480706076035e-06, "loss": 0.0016, "step": 9563 }, { "epoch": 4.35122838944495, "grad_norm": 0.01459185599419313, "learning_rate": 2.05131211112847e-06, "loss": 0.0003, "step": 9564 }, { "epoch": 4.351683348498635, "grad_norm": 0.1150608105101279, "learning_rate": 2.0484780269124786e-06, "loss": 0.0027, "step": 9565 }, { "epoch": 4.352138307552321, "grad_norm": 0.06952078047866662, "learning_rate": 2.045645818191208e-06, "loss": 0.0018, "step": 9566 }, { "epoch": 4.352593266606005, "grad_norm": 0.2829443005464772, "learning_rate": 2.0428154851961062e-06, "loss": 0.0026, "step": 9567 }, { "epoch": 4.35304822565969, "grad_norm": 0.15849366189158265, "learning_rate": 2.0399870281584467e-06, "loss": 0.0007, "step": 9568 }, { "epoch": 4.353503184713376, "grad_norm": 0.049719268855464224, "learning_rate": 2.037160447309358e-06, "loss": 0.0008, "step": 9569 }, { "epoch": 4.353958143767061, "grad_norm": 0.11469832538895094, "learning_rate": 2.0343357428798254e-06, "loss": 0.0014, "step": 9570 }, { "epoch": 4.354413102820746, "grad_norm": 0.12975646622249662, "learning_rate": 2.0315129151006558e-06, "loss": 0.0061, "step": 9571 }, { "epoch": 4.3548680618744315, "grad_norm": 0.11512880625759732, "learning_rate": 2.028691964202531e-06, "loss": 0.008, "step": 9572 }, { "epoch": 4.355323020928116, "grad_norm": 0.05904552112095913, "learning_rate": 2.0258728904159614e-06, "loss": 0.0012, "step": 9573 }, { "epoch": 4.355777979981801, "grad_norm": 0.3707099165160068, "learning_rate": 2.0230556939713095e-06, "loss": 0.0095, "step": 9574 }, { "epoch": 4.356232939035487, "grad_norm": 0.08523378445539274, "learning_rate": 2.0202403750987807e-06, "loss": 0.002, "step": 9575 }, { "epoch": 4.356687898089172, "grad_norm": 0.4146171279769053, "learning_rate": 2.01742693402843e-06, "loss": 0.0075, "step": 9576 }, { "epoch": 4.357142857142857, "grad_norm": 0.12591888910874904, "learning_rate": 2.0146153709901665e-06, "loss": 0.0012, "step": 9577 }, { "epoch": 4.3575978161965425, "grad_norm": 0.3316729022498448, "learning_rate": 2.0118056862137357e-06, "loss": 0.0098, "step": 9578 }, { "epoch": 4.358052775250227, "grad_norm": 0.1100298987984023, "learning_rate": 2.0089978799287286e-06, "loss": 0.0004, "step": 9579 }, { "epoch": 4.358507734303912, "grad_norm": 0.1318416577377102, "learning_rate": 2.0061919523645833e-06, "loss": 0.0013, "step": 9580 }, { "epoch": 4.358962693357598, "grad_norm": 0.1372903327302584, "learning_rate": 2.0033879037506004e-06, "loss": 0.0013, "step": 9581 }, { "epoch": 4.359417652411283, "grad_norm": 0.16441722108717619, "learning_rate": 2.0005857343159044e-06, "loss": 0.0019, "step": 9582 }, { "epoch": 4.359872611464968, "grad_norm": 0.2974905272621395, "learning_rate": 1.9977854442894713e-06, "loss": 0.0023, "step": 9583 }, { "epoch": 4.3603275705186535, "grad_norm": 0.26126512041283306, "learning_rate": 1.994987033900142e-06, "loss": 0.0029, "step": 9584 }, { "epoch": 4.360782529572338, "grad_norm": 0.08250205745209215, "learning_rate": 1.9921905033765757e-06, "loss": 0.0017, "step": 9585 }, { "epoch": 4.361237488626024, "grad_norm": 0.4842278886286901, "learning_rate": 1.989395852947304e-06, "loss": 0.0023, "step": 9586 }, { "epoch": 4.361692447679709, "grad_norm": 0.21814369232969621, "learning_rate": 1.9866030828406905e-06, "loss": 0.0068, "step": 9587 }, { "epoch": 4.362147406733394, "grad_norm": 0.09663583510278746, "learning_rate": 1.9838121932849423e-06, "loss": 0.0006, "step": 9588 }, { "epoch": 4.36260236578708, "grad_norm": 0.3152054215090005, "learning_rate": 1.981023184508124e-06, "loss": 0.01, "step": 9589 }, { "epoch": 4.3630573248407645, "grad_norm": 0.13546350321924083, "learning_rate": 1.978236056738128e-06, "loss": 0.0023, "step": 9590 }, { "epoch": 4.363512283894449, "grad_norm": 0.0512778406252701, "learning_rate": 1.975450810202725e-06, "loss": 0.0012, "step": 9591 }, { "epoch": 4.363967242948135, "grad_norm": 0.1295602851064624, "learning_rate": 1.9726674451295056e-06, "loss": 0.0032, "step": 9592 }, { "epoch": 4.36442220200182, "grad_norm": 0.20482930148939368, "learning_rate": 1.9698859617459097e-06, "loss": 0.0025, "step": 9593 }, { "epoch": 4.364877161055505, "grad_norm": 0.09483286292468811, "learning_rate": 1.9671063602792307e-06, "loss": 0.0006, "step": 9594 }, { "epoch": 4.365332120109191, "grad_norm": 0.2551129198648198, "learning_rate": 1.9643286409566e-06, "loss": 0.0018, "step": 9595 }, { "epoch": 4.3657870791628755, "grad_norm": 0.03923442969741123, "learning_rate": 1.9615528040050097e-06, "loss": 0.0006, "step": 9596 }, { "epoch": 4.36624203821656, "grad_norm": 0.35002053368806396, "learning_rate": 1.95877884965128e-06, "loss": 0.0053, "step": 9597 }, { "epoch": 4.366696997270246, "grad_norm": 0.19210856625557257, "learning_rate": 1.9560067781220907e-06, "loss": 0.0035, "step": 9598 }, { "epoch": 4.367151956323931, "grad_norm": 0.093115427390924, "learning_rate": 1.9532365896439643e-06, "loss": 0.0006, "step": 9599 }, { "epoch": 4.367606915377616, "grad_norm": 0.15842601646101492, "learning_rate": 1.9504682844432603e-06, "loss": 0.0146, "step": 9600 }, { "epoch": 4.368061874431302, "grad_norm": 0.0071632151933257995, "learning_rate": 1.947701862746201e-06, "loss": 0.0001, "step": 9601 }, { "epoch": 4.368516833484986, "grad_norm": 0.22120771269801703, "learning_rate": 1.9449373247788448e-06, "loss": 0.0064, "step": 9602 }, { "epoch": 4.368971792538671, "grad_norm": 0.0706323501791929, "learning_rate": 1.9421746707670913e-06, "loss": 0.0003, "step": 9603 }, { "epoch": 4.369426751592357, "grad_norm": 0.1694134842848472, "learning_rate": 1.939413900936693e-06, "loss": 0.0028, "step": 9604 }, { "epoch": 4.369881710646042, "grad_norm": 0.02774272905924572, "learning_rate": 1.9366550155132503e-06, "loss": 0.0004, "step": 9605 }, { "epoch": 4.370336669699727, "grad_norm": 0.1601651617699797, "learning_rate": 1.9338980147222104e-06, "loss": 0.0024, "step": 9606 }, { "epoch": 4.3707916287534125, "grad_norm": 0.23388110087742428, "learning_rate": 1.9311428987888596e-06, "loss": 0.0038, "step": 9607 }, { "epoch": 4.371246587807097, "grad_norm": 0.09379878276186941, "learning_rate": 1.9283896679383324e-06, "loss": 0.0019, "step": 9608 }, { "epoch": 4.371701546860782, "grad_norm": 0.06963931458591141, "learning_rate": 1.9256383223956065e-06, "loss": 0.001, "step": 9609 }, { "epoch": 4.372156505914468, "grad_norm": 0.10750290813752197, "learning_rate": 1.922888862385519e-06, "loss": 0.0008, "step": 9610 }, { "epoch": 4.372611464968153, "grad_norm": 0.3884908607381301, "learning_rate": 1.920141288132732e-06, "loss": 0.0147, "step": 9611 }, { "epoch": 4.373066424021838, "grad_norm": 0.028864965559940222, "learning_rate": 1.9173955998617792e-06, "loss": 0.0006, "step": 9612 }, { "epoch": 4.3735213830755235, "grad_norm": 0.31189645114340164, "learning_rate": 1.9146517977970152e-06, "loss": 0.0106, "step": 9613 }, { "epoch": 4.373976342129208, "grad_norm": 0.08128982824694116, "learning_rate": 1.9119098821626493e-06, "loss": 0.0008, "step": 9614 }, { "epoch": 4.374431301182893, "grad_norm": 0.23508967963819233, "learning_rate": 1.909169853182749e-06, "loss": 0.0029, "step": 9615 }, { "epoch": 4.374886260236579, "grad_norm": 0.07077882200668092, "learning_rate": 1.9064317110812108e-06, "loss": 0.0013, "step": 9616 }, { "epoch": 4.375341219290264, "grad_norm": 0.13107684032231082, "learning_rate": 1.9036954560817805e-06, "loss": 0.0107, "step": 9617 }, { "epoch": 4.375796178343949, "grad_norm": 0.24245965980277087, "learning_rate": 1.9009610884080543e-06, "loss": 0.004, "step": 9618 }, { "epoch": 4.3762511373976345, "grad_norm": 0.1103625106249074, "learning_rate": 1.8982286082834726e-06, "loss": 0.0018, "step": 9619 }, { "epoch": 4.376706096451319, "grad_norm": 0.06901438039945429, "learning_rate": 1.895498015931327e-06, "loss": 0.0023, "step": 9620 }, { "epoch": 4.377161055505004, "grad_norm": 0.16169821083958305, "learning_rate": 1.8927693115747464e-06, "loss": 0.0015, "step": 9621 }, { "epoch": 4.37761601455869, "grad_norm": 0.26749497392134153, "learning_rate": 1.8900424954367029e-06, "loss": 0.0016, "step": 9622 }, { "epoch": 4.378070973612375, "grad_norm": 0.041330916842243305, "learning_rate": 1.8873175677400211e-06, "loss": 0.0004, "step": 9623 }, { "epoch": 4.37852593266606, "grad_norm": 0.02611942847586466, "learning_rate": 1.8845945287073758e-06, "loss": 0.0003, "step": 9624 }, { "epoch": 4.3789808917197455, "grad_norm": 0.21018463114158978, "learning_rate": 1.881873378561272e-06, "loss": 0.0041, "step": 9625 }, { "epoch": 4.37943585077343, "grad_norm": 0.23412057919511853, "learning_rate": 1.879154117524079e-06, "loss": 0.0026, "step": 9626 }, { "epoch": 4.379890809827115, "grad_norm": 0.04614931776962354, "learning_rate": 1.876436745818e-06, "loss": 0.0004, "step": 9627 }, { "epoch": 4.380345768880801, "grad_norm": 0.23563894177488817, "learning_rate": 1.8737212636650847e-06, "loss": 0.0042, "step": 9628 }, { "epoch": 4.380800727934486, "grad_norm": 0.09060525436182772, "learning_rate": 1.8710076712872254e-06, "loss": 0.0026, "step": 9629 }, { "epoch": 4.381255686988171, "grad_norm": 0.20067544466865533, "learning_rate": 1.8682959689061753e-06, "loss": 0.0036, "step": 9630 }, { "epoch": 4.3817106460418564, "grad_norm": 0.07029586154012599, "learning_rate": 1.865586156743515e-06, "loss": 0.0003, "step": 9631 }, { "epoch": 4.382165605095541, "grad_norm": 0.006599379643982433, "learning_rate": 1.8628782350206764e-06, "loss": 0.0001, "step": 9632 }, { "epoch": 4.382620564149226, "grad_norm": 0.3229214673817089, "learning_rate": 1.8601722039589487e-06, "loss": 0.0052, "step": 9633 }, { "epoch": 4.383075523202912, "grad_norm": 0.1497357245451766, "learning_rate": 1.857468063779441e-06, "loss": 0.0026, "step": 9634 }, { "epoch": 4.383530482256597, "grad_norm": 0.2962859173821663, "learning_rate": 1.854765814703141e-06, "loss": 0.0047, "step": 9635 }, { "epoch": 4.383985441310282, "grad_norm": 0.03974348715446527, "learning_rate": 1.8520654569508522e-06, "loss": 0.0006, "step": 9636 }, { "epoch": 4.384440400363967, "grad_norm": 0.1122656730779324, "learning_rate": 1.8493669907432425e-06, "loss": 0.0015, "step": 9637 }, { "epoch": 4.384895359417652, "grad_norm": 0.11414033604635362, "learning_rate": 1.846670416300808e-06, "loss": 0.001, "step": 9638 }, { "epoch": 4.385350318471337, "grad_norm": 0.174313472518668, "learning_rate": 1.8439757338439085e-06, "loss": 0.0024, "step": 9639 }, { "epoch": 4.385805277525023, "grad_norm": 0.13857542930319, "learning_rate": 1.8412829435927453e-06, "loss": 0.001, "step": 9640 }, { "epoch": 4.386260236578708, "grad_norm": 0.24703785087324162, "learning_rate": 1.838592045767354e-06, "loss": 0.0034, "step": 9641 }, { "epoch": 4.386715195632393, "grad_norm": 0.10581312511324667, "learning_rate": 1.8359030405876277e-06, "loss": 0.0006, "step": 9642 }, { "epoch": 4.387170154686078, "grad_norm": 0.17352073968430973, "learning_rate": 1.8332159282732908e-06, "loss": 0.0045, "step": 9643 }, { "epoch": 4.387625113739763, "grad_norm": 0.1661646688060215, "learning_rate": 1.8305307090439316e-06, "loss": 0.0014, "step": 9644 }, { "epoch": 4.388080072793448, "grad_norm": 0.05391682996624345, "learning_rate": 1.8278473831189718e-06, "loss": 0.0006, "step": 9645 }, { "epoch": 4.388535031847134, "grad_norm": 0.044559688282410276, "learning_rate": 1.825165950717675e-06, "loss": 0.0007, "step": 9646 }, { "epoch": 4.388989990900819, "grad_norm": 0.1554244113001205, "learning_rate": 1.8224864120591629e-06, "loss": 0.0091, "step": 9647 }, { "epoch": 4.389444949954504, "grad_norm": 0.14091273596457754, "learning_rate": 1.8198087673623908e-06, "loss": 0.0006, "step": 9648 }, { "epoch": 4.389899909008189, "grad_norm": 0.03633382674977563, "learning_rate": 1.8171330168461675e-06, "loss": 0.0003, "step": 9649 }, { "epoch": 4.390354868061874, "grad_norm": 0.17895155886785405, "learning_rate": 1.8144591607291427e-06, "loss": 0.0072, "step": 9650 }, { "epoch": 4.39080982711556, "grad_norm": 0.11136760184833629, "learning_rate": 1.8117871992298085e-06, "loss": 0.0007, "step": 9651 }, { "epoch": 4.391264786169245, "grad_norm": 0.06981080739782981, "learning_rate": 1.809117132566504e-06, "loss": 0.0005, "step": 9652 }, { "epoch": 4.39171974522293, "grad_norm": 0.09208634629885863, "learning_rate": 1.8064489609574186e-06, "loss": 0.0008, "step": 9653 }, { "epoch": 4.3921747042766155, "grad_norm": 0.15247569652885112, "learning_rate": 1.8037826846205864e-06, "loss": 0.0042, "step": 9654 }, { "epoch": 4.3926296633303, "grad_norm": 0.10040933244605955, "learning_rate": 1.80111830377388e-06, "loss": 0.0011, "step": 9655 }, { "epoch": 4.393084622383985, "grad_norm": 0.2259836707934932, "learning_rate": 1.7984558186350226e-06, "loss": 0.0101, "step": 9656 }, { "epoch": 4.393539581437671, "grad_norm": 0.05576794508262986, "learning_rate": 1.7957952294215708e-06, "loss": 0.0004, "step": 9657 }, { "epoch": 4.393994540491356, "grad_norm": 0.1400906900961105, "learning_rate": 1.7931365363509505e-06, "loss": 0.003, "step": 9658 }, { "epoch": 4.394449499545041, "grad_norm": 0.08379055897152425, "learning_rate": 1.7904797396404104e-06, "loss": 0.0019, "step": 9659 }, { "epoch": 4.3949044585987265, "grad_norm": 0.2374297206116016, "learning_rate": 1.787824839507049e-06, "loss": 0.0043, "step": 9660 }, { "epoch": 4.395359417652411, "grad_norm": 0.042904701437084705, "learning_rate": 1.7851718361678205e-06, "loss": 0.0003, "step": 9661 }, { "epoch": 4.395814376706096, "grad_norm": 0.18434090657884233, "learning_rate": 1.7825207298395068e-06, "loss": 0.0032, "step": 9662 }, { "epoch": 4.396269335759782, "grad_norm": 0.006326181476174392, "learning_rate": 1.7798715207387545e-06, "loss": 0.0001, "step": 9663 }, { "epoch": 4.396724294813467, "grad_norm": 0.07468680683383822, "learning_rate": 1.7772242090820402e-06, "loss": 0.0005, "step": 9664 }, { "epoch": 4.397179253867152, "grad_norm": 0.15436253911198644, "learning_rate": 1.7745787950856907e-06, "loss": 0.0092, "step": 9665 }, { "epoch": 4.3976342129208374, "grad_norm": 0.11610474690713124, "learning_rate": 1.7719352789658778e-06, "loss": 0.005, "step": 9666 }, { "epoch": 4.398089171974522, "grad_norm": 0.12992190966523878, "learning_rate": 1.7692936609386118e-06, "loss": 0.0022, "step": 9667 }, { "epoch": 4.398544131028207, "grad_norm": 0.08177947772172703, "learning_rate": 1.766653941219762e-06, "loss": 0.0007, "step": 9668 }, { "epoch": 4.398999090081893, "grad_norm": 0.06956749756749611, "learning_rate": 1.7640161200250305e-06, "loss": 0.0015, "step": 9669 }, { "epoch": 4.399454049135578, "grad_norm": 0.17716384343834113, "learning_rate": 1.761380197569973e-06, "loss": 0.0024, "step": 9670 }, { "epoch": 4.399909008189263, "grad_norm": 0.041286698586374036, "learning_rate": 1.7587461740699778e-06, "loss": 0.0004, "step": 9671 }, { "epoch": 4.400363967242948, "grad_norm": 0.14254462568612392, "learning_rate": 1.7561140497402874e-06, "loss": 0.0015, "step": 9672 }, { "epoch": 4.400818926296633, "grad_norm": 0.061844787507069066, "learning_rate": 1.7534838247959901e-06, "loss": 0.0042, "step": 9673 }, { "epoch": 4.401273885350318, "grad_norm": 0.17027772628738116, "learning_rate": 1.7508554994520116e-06, "loss": 0.0019, "step": 9674 }, { "epoch": 4.401728844404004, "grad_norm": 0.16037621850471812, "learning_rate": 1.7482290739231327e-06, "loss": 0.0039, "step": 9675 }, { "epoch": 4.402183803457689, "grad_norm": 0.2380339509983766, "learning_rate": 1.7456045484239707e-06, "loss": 0.0081, "step": 9676 }, { "epoch": 4.402638762511374, "grad_norm": 0.1256275565711439, "learning_rate": 1.7429819231689848e-06, "loss": 0.0016, "step": 9677 }, { "epoch": 4.403093721565059, "grad_norm": 0.14517511664224353, "learning_rate": 1.7403611983724922e-06, "loss": 0.0013, "step": 9678 }, { "epoch": 4.403548680618744, "grad_norm": 0.2689363084112735, "learning_rate": 1.7377423742486436e-06, "loss": 0.0012, "step": 9679 }, { "epoch": 4.404003639672429, "grad_norm": 0.15754965835266818, "learning_rate": 1.7351254510114352e-06, "loss": 0.0037, "step": 9680 }, { "epoch": 4.404458598726115, "grad_norm": 0.05157900836252286, "learning_rate": 1.732510428874709e-06, "loss": 0.0008, "step": 9681 }, { "epoch": 4.4049135577798, "grad_norm": 0.10590304902025194, "learning_rate": 1.729897308052153e-06, "loss": 0.0031, "step": 9682 }, { "epoch": 4.405368516833485, "grad_norm": 0.19922784646415231, "learning_rate": 1.72728608875731e-06, "loss": 0.0017, "step": 9683 }, { "epoch": 4.40582347588717, "grad_norm": 0.22602535414906988, "learning_rate": 1.7246767712035455e-06, "loss": 0.0009, "step": 9684 }, { "epoch": 4.406278434940855, "grad_norm": 0.09500124390821042, "learning_rate": 1.7220693556040862e-06, "loss": 0.0026, "step": 9685 }, { "epoch": 4.40673339399454, "grad_norm": 0.14313015920099487, "learning_rate": 1.719463842171995e-06, "loss": 0.0026, "step": 9686 }, { "epoch": 4.407188353048226, "grad_norm": 0.12611780765450636, "learning_rate": 1.7168602311201875e-06, "loss": 0.0052, "step": 9687 }, { "epoch": 4.407643312101911, "grad_norm": 0.10983287847371884, "learning_rate": 1.7142585226614105e-06, "loss": 0.0018, "step": 9688 }, { "epoch": 4.408098271155596, "grad_norm": 0.06628548906409854, "learning_rate": 1.7116587170082744e-06, "loss": 0.0015, "step": 9689 }, { "epoch": 4.408553230209281, "grad_norm": 0.14187710349289384, "learning_rate": 1.70906081437322e-06, "loss": 0.0019, "step": 9690 }, { "epoch": 4.409008189262966, "grad_norm": 0.18692841567164611, "learning_rate": 1.7064648149685309e-06, "loss": 0.0017, "step": 9691 }, { "epoch": 4.409463148316651, "grad_norm": 0.07036082453938743, "learning_rate": 1.7038707190063452e-06, "loss": 0.001, "step": 9692 }, { "epoch": 4.409918107370337, "grad_norm": 0.018623575894167226, "learning_rate": 1.7012785266986437e-06, "loss": 0.0004, "step": 9693 }, { "epoch": 4.410373066424022, "grad_norm": 0.14373459641158096, "learning_rate": 1.6986882382572427e-06, "loss": 0.0041, "step": 9694 }, { "epoch": 4.4108280254777075, "grad_norm": 0.083827440170615, "learning_rate": 1.6960998538938094e-06, "loss": 0.0015, "step": 9695 }, { "epoch": 4.411282984531392, "grad_norm": 0.06681330726493037, "learning_rate": 1.693513373819855e-06, "loss": 0.0007, "step": 9696 }, { "epoch": 4.411737943585077, "grad_norm": 0.0770581236766075, "learning_rate": 1.6909287982467382e-06, "loss": 0.0009, "step": 9697 }, { "epoch": 4.412192902638763, "grad_norm": 0.07666670330404203, "learning_rate": 1.688346127385662e-06, "loss": 0.0006, "step": 9698 }, { "epoch": 4.412647861692448, "grad_norm": 0.12443624520276371, "learning_rate": 1.685765361447661e-06, "loss": 0.0021, "step": 9699 }, { "epoch": 4.413102820746133, "grad_norm": 0.11289198981735765, "learning_rate": 1.6831865006436326e-06, "loss": 0.0024, "step": 9700 }, { "epoch": 4.4135577797998184, "grad_norm": 0.2651313786077712, "learning_rate": 1.6806095451843002e-06, "loss": 0.0015, "step": 9701 }, { "epoch": 4.414012738853503, "grad_norm": 0.03887946482639536, "learning_rate": 1.6780344952802452e-06, "loss": 0.0004, "step": 9702 }, { "epoch": 4.414467697907188, "grad_norm": 0.13617808554841643, "learning_rate": 1.6754613511418937e-06, "loss": 0.0024, "step": 9703 }, { "epoch": 4.414922656960874, "grad_norm": 0.0540208580090244, "learning_rate": 1.6728901129795083e-06, "loss": 0.002, "step": 9704 }, { "epoch": 4.415377616014559, "grad_norm": 0.15600771798534577, "learning_rate": 1.6703207810032012e-06, "loss": 0.0042, "step": 9705 }, { "epoch": 4.415832575068244, "grad_norm": 0.08066909187947238, "learning_rate": 1.6677533554229185e-06, "loss": 0.0025, "step": 9706 }, { "epoch": 4.416287534121929, "grad_norm": 0.09904181711955044, "learning_rate": 1.6651878364484674e-06, "loss": 0.004, "step": 9707 }, { "epoch": 4.416742493175614, "grad_norm": 0.07711601668592709, "learning_rate": 1.6626242242894858e-06, "loss": 0.0015, "step": 9708 }, { "epoch": 4.417197452229299, "grad_norm": 0.2079248711602121, "learning_rate": 1.6600625191554613e-06, "loss": 0.0031, "step": 9709 }, { "epoch": 4.417652411282985, "grad_norm": 0.24248486909288425, "learning_rate": 1.6575027212557271e-06, "loss": 0.0087, "step": 9710 }, { "epoch": 4.41810737033667, "grad_norm": 0.10405531782358038, "learning_rate": 1.654944830799454e-06, "loss": 0.0013, "step": 9711 }, { "epoch": 4.418562329390355, "grad_norm": 0.08114146545495662, "learning_rate": 1.6523888479956673e-06, "loss": 0.0018, "step": 9712 }, { "epoch": 4.41901728844404, "grad_norm": 0.1993478291400216, "learning_rate": 1.649834773053227e-06, "loss": 0.0036, "step": 9713 }, { "epoch": 4.419472247497725, "grad_norm": 0.2527845663409545, "learning_rate": 1.6472826061808416e-06, "loss": 0.0074, "step": 9714 }, { "epoch": 4.41992720655141, "grad_norm": 0.19290140418216895, "learning_rate": 1.6447323475870553e-06, "loss": 0.0052, "step": 9715 }, { "epoch": 4.420382165605096, "grad_norm": 0.030781496375059066, "learning_rate": 1.6421839974802734e-06, "loss": 0.0003, "step": 9716 }, { "epoch": 4.420837124658781, "grad_norm": 0.15691148881574288, "learning_rate": 1.6396375560687348e-06, "loss": 0.0018, "step": 9717 }, { "epoch": 4.421292083712466, "grad_norm": 0.2729958832020698, "learning_rate": 1.637093023560518e-06, "loss": 0.0035, "step": 9718 }, { "epoch": 4.421747042766151, "grad_norm": 0.2697372346728951, "learning_rate": 1.6345504001635564e-06, "loss": 0.0035, "step": 9719 }, { "epoch": 4.422202001819836, "grad_norm": 0.14819490882782943, "learning_rate": 1.6320096860856143e-06, "loss": 0.0016, "step": 9720 }, { "epoch": 4.422656960873521, "grad_norm": 0.1875252913542141, "learning_rate": 1.6294708815343174e-06, "loss": 0.005, "step": 9721 }, { "epoch": 4.423111919927207, "grad_norm": 0.12301619651965105, "learning_rate": 1.6269339867171162e-06, "loss": 0.0012, "step": 9722 }, { "epoch": 4.423566878980892, "grad_norm": 0.23572887198830766, "learning_rate": 1.6243990018413146e-06, "loss": 0.0023, "step": 9723 }, { "epoch": 4.424021838034577, "grad_norm": 0.07732896839951235, "learning_rate": 1.6218659271140691e-06, "loss": 0.0009, "step": 9724 }, { "epoch": 4.424476797088262, "grad_norm": 0.01619749923697963, "learning_rate": 1.619334762742361e-06, "loss": 0.0002, "step": 9725 }, { "epoch": 4.424931756141947, "grad_norm": 0.06079715460663139, "learning_rate": 1.6168055089330337e-06, "loss": 0.0013, "step": 9726 }, { "epoch": 4.425386715195632, "grad_norm": 0.20571174283991883, "learning_rate": 1.6142781658927603e-06, "loss": 0.0031, "step": 9727 }, { "epoch": 4.425841674249318, "grad_norm": 0.17991869983915604, "learning_rate": 1.6117527338280674e-06, "loss": 0.0034, "step": 9728 }, { "epoch": 4.426296633303003, "grad_norm": 0.005518698865338883, "learning_rate": 1.6092292129453179e-06, "loss": 0.0001, "step": 9729 }, { "epoch": 4.426751592356688, "grad_norm": 0.11494522857718313, "learning_rate": 1.6067076034507244e-06, "loss": 0.0022, "step": 9730 }, { "epoch": 4.427206551410373, "grad_norm": 0.0069723774568210715, "learning_rate": 1.6041879055503473e-06, "loss": 0.0001, "step": 9731 }, { "epoch": 4.427661510464058, "grad_norm": 0.0835901306451555, "learning_rate": 1.60167011945008e-06, "loss": 0.0007, "step": 9732 }, { "epoch": 4.428116469517743, "grad_norm": 0.011737656478022947, "learning_rate": 1.5991542453556634e-06, "loss": 0.0001, "step": 9733 }, { "epoch": 4.428571428571429, "grad_norm": 0.013385900680512021, "learning_rate": 1.5966402834726863e-06, "loss": 0.0002, "step": 9734 }, { "epoch": 4.429026387625114, "grad_norm": 0.17838048482924154, "learning_rate": 1.59412823400657e-06, "loss": 0.0039, "step": 9735 }, { "epoch": 4.429481346678799, "grad_norm": 0.17593794012308814, "learning_rate": 1.5916180971626005e-06, "loss": 0.0026, "step": 9736 }, { "epoch": 4.429936305732484, "grad_norm": 0.022144613319979526, "learning_rate": 1.589109873145883e-06, "loss": 0.0003, "step": 9737 }, { "epoch": 4.430391264786169, "grad_norm": 0.321805174837575, "learning_rate": 1.58660356216139e-06, "loss": 0.0153, "step": 9738 }, { "epoch": 4.430846223839854, "grad_norm": 0.05751253462267621, "learning_rate": 1.5840991644139187e-06, "loss": 0.0004, "step": 9739 }, { "epoch": 4.43130118289354, "grad_norm": 0.30586134004510246, "learning_rate": 1.5815966801081162e-06, "loss": 0.0012, "step": 9740 }, { "epoch": 4.431756141947225, "grad_norm": 0.14015550608198968, "learning_rate": 1.5790961094484802e-06, "loss": 0.006, "step": 9741 }, { "epoch": 4.4322111010009095, "grad_norm": 0.2574229574946534, "learning_rate": 1.576597452639339e-06, "loss": 0.0052, "step": 9742 }, { "epoch": 4.432666060054595, "grad_norm": 0.026332610884638927, "learning_rate": 1.5741007098848793e-06, "loss": 0.0002, "step": 9743 }, { "epoch": 4.43312101910828, "grad_norm": 0.18449244462483727, "learning_rate": 1.5716058813891127e-06, "loss": 0.002, "step": 9744 }, { "epoch": 4.433575978161965, "grad_norm": 0.32784015058795396, "learning_rate": 1.5691129673559095e-06, "loss": 0.0058, "step": 9745 }, { "epoch": 4.434030937215651, "grad_norm": 0.07901585519294539, "learning_rate": 1.5666219679889905e-06, "loss": 0.0015, "step": 9746 }, { "epoch": 4.434485896269336, "grad_norm": 0.2197986470879584, "learning_rate": 1.5641328834918978e-06, "loss": 0.0031, "step": 9747 }, { "epoch": 4.4349408553230205, "grad_norm": 0.09888070257849622, "learning_rate": 1.5616457140680301e-06, "loss": 0.0016, "step": 9748 }, { "epoch": 4.435395814376706, "grad_norm": 0.2619906711625467, "learning_rate": 1.5591604599206222e-06, "loss": 0.004, "step": 9749 }, { "epoch": 4.435850773430391, "grad_norm": 0.1631417488183739, "learning_rate": 1.5566771212527699e-06, "loss": 0.013, "step": 9750 }, { "epoch": 4.436305732484076, "grad_norm": 0.17325989122872978, "learning_rate": 1.554195698267391e-06, "loss": 0.0032, "step": 9751 }, { "epoch": 4.436760691537762, "grad_norm": 0.20082391604067096, "learning_rate": 1.5517161911672628e-06, "loss": 0.005, "step": 9752 }, { "epoch": 4.437215650591447, "grad_norm": 0.04937882610422314, "learning_rate": 1.5492386001549951e-06, "loss": 0.0004, "step": 9753 }, { "epoch": 4.4376706096451315, "grad_norm": 0.1922566963996528, "learning_rate": 1.5467629254330401e-06, "loss": 0.0089, "step": 9754 }, { "epoch": 4.438125568698817, "grad_norm": 0.1019241305143775, "learning_rate": 1.5442891672037135e-06, "loss": 0.0019, "step": 9755 }, { "epoch": 4.438580527752502, "grad_norm": 0.08543130663584785, "learning_rate": 1.5418173256691483e-06, "loss": 0.0025, "step": 9756 }, { "epoch": 4.439035486806187, "grad_norm": 0.009931122826260078, "learning_rate": 1.5393474010313353e-06, "loss": 0.0001, "step": 9757 }, { "epoch": 4.439490445859873, "grad_norm": 0.06863948798286429, "learning_rate": 1.5368793934921022e-06, "loss": 0.0007, "step": 9758 }, { "epoch": 4.439945404913558, "grad_norm": 0.1986212914579758, "learning_rate": 1.5344133032531266e-06, "loss": 0.0133, "step": 9759 }, { "epoch": 4.440400363967243, "grad_norm": 0.08228493228543915, "learning_rate": 1.5319491305159278e-06, "loss": 0.0022, "step": 9760 }, { "epoch": 4.440855323020928, "grad_norm": 0.025031441713954022, "learning_rate": 1.529486875481867e-06, "loss": 0.0002, "step": 9761 }, { "epoch": 4.441310282074613, "grad_norm": 0.1320372849910236, "learning_rate": 1.527026538352147e-06, "loss": 0.0017, "step": 9762 }, { "epoch": 4.441765241128299, "grad_norm": 0.02109946280156141, "learning_rate": 1.5245681193278128e-06, "loss": 0.0002, "step": 9763 }, { "epoch": 4.442220200181984, "grad_norm": 0.14506071938294707, "learning_rate": 1.522111618609759e-06, "loss": 0.0018, "step": 9764 }, { "epoch": 4.442675159235669, "grad_norm": 0.036132278958130135, "learning_rate": 1.5196570363987167e-06, "loss": 0.0002, "step": 9765 }, { "epoch": 4.443130118289354, "grad_norm": 0.07762024505096526, "learning_rate": 1.5172043728952673e-06, "loss": 0.002, "step": 9766 }, { "epoch": 4.443585077343039, "grad_norm": 0.24151297636350819, "learning_rate": 1.5147536282998308e-06, "loss": 0.0018, "step": 9767 }, { "epoch": 4.444040036396724, "grad_norm": 0.1472313370285413, "learning_rate": 1.5123048028126641e-06, "loss": 0.0026, "step": 9768 }, { "epoch": 4.44449499545041, "grad_norm": 0.9985349220689882, "learning_rate": 1.5098578966338844e-06, "loss": 0.0058, "step": 9769 }, { "epoch": 4.444949954504095, "grad_norm": 0.014622483724041017, "learning_rate": 1.5074129099634348e-06, "loss": 0.0001, "step": 9770 }, { "epoch": 4.44540491355778, "grad_norm": 0.09471397346945291, "learning_rate": 1.5049698430011138e-06, "loss": 0.0004, "step": 9771 }, { "epoch": 4.445859872611465, "grad_norm": 0.2368071713552508, "learning_rate": 1.502528695946548e-06, "loss": 0.0063, "step": 9772 }, { "epoch": 4.44631483166515, "grad_norm": 0.37579450920199153, "learning_rate": 1.5000894689992274e-06, "loss": 0.0019, "step": 9773 }, { "epoch": 4.446769790718835, "grad_norm": 0.2608132671464073, "learning_rate": 1.4976521623584677e-06, "loss": 0.0043, "step": 9774 }, { "epoch": 4.447224749772521, "grad_norm": 0.16873491986210987, "learning_rate": 1.4952167762234432e-06, "loss": 0.0039, "step": 9775 }, { "epoch": 4.447679708826206, "grad_norm": 0.17446193338212584, "learning_rate": 1.4927833107931555e-06, "loss": 0.0013, "step": 9776 }, { "epoch": 4.4481346678798905, "grad_norm": 0.23902500389058787, "learning_rate": 1.4903517662664568e-06, "loss": 0.0027, "step": 9777 }, { "epoch": 4.448589626933576, "grad_norm": 0.06521062475958056, "learning_rate": 1.487922142842041e-06, "loss": 0.0011, "step": 9778 }, { "epoch": 4.449044585987261, "grad_norm": 0.3520667302302507, "learning_rate": 1.4854944407184463e-06, "loss": 0.008, "step": 9779 }, { "epoch": 4.449499545040946, "grad_norm": 0.03097565904169709, "learning_rate": 1.4830686600940614e-06, "loss": 0.0003, "step": 9780 }, { "epoch": 4.449954504094632, "grad_norm": 0.28247286996264565, "learning_rate": 1.4806448011671025e-06, "loss": 0.0104, "step": 9781 }, { "epoch": 4.450409463148317, "grad_norm": 0.5445920190853444, "learning_rate": 1.4782228641356394e-06, "loss": 0.0039, "step": 9782 }, { "epoch": 4.4508644222020015, "grad_norm": 0.03966342209026813, "learning_rate": 1.4758028491975744e-06, "loss": 0.0002, "step": 9783 }, { "epoch": 4.451319381255687, "grad_norm": 0.17100684968495597, "learning_rate": 1.473384756550672e-06, "loss": 0.0015, "step": 9784 }, { "epoch": 4.451774340309372, "grad_norm": 0.07841396704152849, "learning_rate": 1.470968586392521e-06, "loss": 0.0005, "step": 9785 }, { "epoch": 4.452229299363057, "grad_norm": 0.14991867670604925, "learning_rate": 1.4685543389205581e-06, "loss": 0.0033, "step": 9786 }, { "epoch": 4.452684258416743, "grad_norm": 0.16681365070681115, "learning_rate": 1.4661420143320725e-06, "loss": 0.0033, "step": 9787 }, { "epoch": 4.453139217470428, "grad_norm": 0.20709670850142256, "learning_rate": 1.4637316128241764e-06, "loss": 0.0023, "step": 9788 }, { "epoch": 4.4535941765241125, "grad_norm": 0.11417369283289, "learning_rate": 1.4613231345938505e-06, "loss": 0.001, "step": 9789 }, { "epoch": 4.454049135577798, "grad_norm": 0.15645252988392191, "learning_rate": 1.4589165798378962e-06, "loss": 0.0009, "step": 9790 }, { "epoch": 4.454504094631483, "grad_norm": 0.03555193987229084, "learning_rate": 1.45651194875297e-06, "loss": 0.0004, "step": 9791 }, { "epoch": 4.454959053685168, "grad_norm": 0.23429076448016803, "learning_rate": 1.4541092415355617e-06, "loss": 0.0015, "step": 9792 }, { "epoch": 4.455414012738854, "grad_norm": 0.34009162165012835, "learning_rate": 1.4517084583820146e-06, "loss": 0.0057, "step": 9793 }, { "epoch": 4.455868971792539, "grad_norm": 0.013432405642903066, "learning_rate": 1.4493095994885132e-06, "loss": 0.0002, "step": 9794 }, { "epoch": 4.4563239308462235, "grad_norm": 0.08753841210745382, "learning_rate": 1.4469126650510756e-06, "loss": 0.0006, "step": 9795 }, { "epoch": 4.456778889899909, "grad_norm": 0.18483880559851573, "learning_rate": 1.4445176552655703e-06, "loss": 0.0024, "step": 9796 }, { "epoch": 4.457233848953594, "grad_norm": 0.043787336422445554, "learning_rate": 1.4421245703277047e-06, "loss": 0.0005, "step": 9797 }, { "epoch": 4.457688808007279, "grad_norm": 0.24584599440638127, "learning_rate": 1.4397334104330335e-06, "loss": 0.005, "step": 9798 }, { "epoch": 4.458143767060965, "grad_norm": 0.014339337762489949, "learning_rate": 1.4373441757769529e-06, "loss": 0.0003, "step": 9799 }, { "epoch": 4.45859872611465, "grad_norm": 0.09867096338320214, "learning_rate": 1.4349568665546931e-06, "loss": 0.0022, "step": 9800 }, { "epoch": 4.4590536851683344, "grad_norm": 0.1561895283174515, "learning_rate": 1.432571482961345e-06, "loss": 0.0026, "step": 9801 }, { "epoch": 4.45950864422202, "grad_norm": 0.01109512048659328, "learning_rate": 1.4301880251918226e-06, "loss": 0.0002, "step": 9802 }, { "epoch": 4.459963603275705, "grad_norm": 0.19176322714766278, "learning_rate": 1.4278064934408947e-06, "loss": 0.0014, "step": 9803 }, { "epoch": 4.460418562329391, "grad_norm": 0.20679061753794342, "learning_rate": 1.4254268879031723e-06, "loss": 0.0043, "step": 9804 }, { "epoch": 4.460873521383076, "grad_norm": 0.12777120732590608, "learning_rate": 1.4230492087731028e-06, "loss": 0.0027, "step": 9805 }, { "epoch": 4.461328480436761, "grad_norm": 0.08401267154751348, "learning_rate": 1.420673456244978e-06, "loss": 0.0014, "step": 9806 }, { "epoch": 4.461783439490446, "grad_norm": 0.03833584797876776, "learning_rate": 1.4182996305129259e-06, "loss": 0.0003, "step": 9807 }, { "epoch": 4.462238398544131, "grad_norm": 0.21104352435108367, "learning_rate": 1.4159277317709442e-06, "loss": 0.0019, "step": 9808 }, { "epoch": 4.462693357597816, "grad_norm": 0.03645673236537885, "learning_rate": 1.4135577602128414e-06, "loss": 0.0004, "step": 9809 }, { "epoch": 4.463148316651502, "grad_norm": 0.044530590083739725, "learning_rate": 1.4111897160322852e-06, "loss": 0.0004, "step": 9810 }, { "epoch": 4.463603275705187, "grad_norm": 0.4114364827962077, "learning_rate": 1.4088235994227788e-06, "loss": 0.0071, "step": 9811 }, { "epoch": 4.4640582347588715, "grad_norm": 0.2625142575296416, "learning_rate": 1.406459410577668e-06, "loss": 0.0015, "step": 9812 }, { "epoch": 4.464513193812557, "grad_norm": 0.20265353661888647, "learning_rate": 1.4040971496901478e-06, "loss": 0.0041, "step": 9813 }, { "epoch": 4.464968152866242, "grad_norm": 0.21329025292987633, "learning_rate": 1.4017368169532475e-06, "loss": 0.0049, "step": 9814 }, { "epoch": 4.465423111919927, "grad_norm": 0.2802728103604153, "learning_rate": 1.3993784125598514e-06, "loss": 0.003, "step": 9815 }, { "epoch": 4.465878070973613, "grad_norm": 0.03248989128140333, "learning_rate": 1.3970219367026694e-06, "loss": 0.0003, "step": 9816 }, { "epoch": 4.466333030027298, "grad_norm": 0.24945744097648145, "learning_rate": 1.394667389574264e-06, "loss": 0.0053, "step": 9817 }, { "epoch": 4.4667879890809825, "grad_norm": 0.21523480801336828, "learning_rate": 1.3923147713670398e-06, "loss": 0.0032, "step": 9818 }, { "epoch": 4.467242948134668, "grad_norm": 0.12536833091269134, "learning_rate": 1.38996408227324e-06, "loss": 0.0036, "step": 9819 }, { "epoch": 4.467697907188353, "grad_norm": 0.6682144123934282, "learning_rate": 1.387615322484953e-06, "loss": 0.0021, "step": 9820 }, { "epoch": 4.468152866242038, "grad_norm": 0.16770836336728587, "learning_rate": 1.385268492194111e-06, "loss": 0.0053, "step": 9821 }, { "epoch": 4.468607825295724, "grad_norm": 0.04854832771430486, "learning_rate": 1.3829235915924833e-06, "loss": 0.0006, "step": 9822 }, { "epoch": 4.469062784349409, "grad_norm": 0.09561554520999256, "learning_rate": 1.3805806208716854e-06, "loss": 0.002, "step": 9823 }, { "epoch": 4.4695177434030935, "grad_norm": 0.8957367303672126, "learning_rate": 1.3782395802231784e-06, "loss": 0.004, "step": 9824 }, { "epoch": 4.469972702456779, "grad_norm": 0.07473758368860776, "learning_rate": 1.3759004698382566e-06, "loss": 0.0019, "step": 9825 }, { "epoch": 4.470427661510464, "grad_norm": 0.21877986690209472, "learning_rate": 1.3735632899080587e-06, "loss": 0.0038, "step": 9826 }, { "epoch": 4.470882620564149, "grad_norm": 0.10006493932094755, "learning_rate": 1.3712280406235733e-06, "loss": 0.0011, "step": 9827 }, { "epoch": 4.471337579617835, "grad_norm": 0.0470321330056212, "learning_rate": 1.3688947221756315e-06, "loss": 0.0009, "step": 9828 }, { "epoch": 4.47179253867152, "grad_norm": 0.16055732006487455, "learning_rate": 1.3665633347548946e-06, "loss": 0.0036, "step": 9829 }, { "epoch": 4.4722474977252045, "grad_norm": 0.041717985081074174, "learning_rate": 1.3642338785518738e-06, "loss": 0.0006, "step": 9830 }, { "epoch": 4.47270245677889, "grad_norm": 0.09645351046768072, "learning_rate": 1.361906353756917e-06, "loss": 0.0025, "step": 9831 }, { "epoch": 4.473157415832575, "grad_norm": 0.13910491848985945, "learning_rate": 1.3595807605602307e-06, "loss": 0.0008, "step": 9832 }, { "epoch": 4.47361237488626, "grad_norm": 0.03342944959694652, "learning_rate": 1.357257099151843e-06, "loss": 0.0003, "step": 9833 }, { "epoch": 4.474067333939946, "grad_norm": 0.19926665309632427, "learning_rate": 1.3549353697216327e-06, "loss": 0.0105, "step": 9834 }, { "epoch": 4.474522292993631, "grad_norm": 0.1684624167731519, "learning_rate": 1.3526155724593287e-06, "loss": 0.0042, "step": 9835 }, { "epoch": 4.4749772520473154, "grad_norm": 0.10673544753185275, "learning_rate": 1.3502977075544848e-06, "loss": 0.0016, "step": 9836 }, { "epoch": 4.475432211101001, "grad_norm": 0.30900400130972316, "learning_rate": 1.3479817751965162e-06, "loss": 0.0012, "step": 9837 }, { "epoch": 4.475887170154686, "grad_norm": 0.18089632658797497, "learning_rate": 1.3456677755746633e-06, "loss": 0.0019, "step": 9838 }, { "epoch": 4.476342129208371, "grad_norm": 0.03185726985946676, "learning_rate": 1.343355708878019e-06, "loss": 0.0005, "step": 9839 }, { "epoch": 4.476797088262057, "grad_norm": 0.1436730983177701, "learning_rate": 1.341045575295513e-06, "loss": 0.0034, "step": 9840 }, { "epoch": 4.477252047315742, "grad_norm": 0.24895681032892045, "learning_rate": 1.338737375015911e-06, "loss": 0.0035, "step": 9841 }, { "epoch": 4.477707006369426, "grad_norm": 0.013926628829899913, "learning_rate": 1.336431108227848e-06, "loss": 0.0001, "step": 9842 }, { "epoch": 4.478161965423112, "grad_norm": 0.1549745027606713, "learning_rate": 1.3341267751197678e-06, "loss": 0.0014, "step": 9843 }, { "epoch": 4.478616924476797, "grad_norm": 0.14429285471232547, "learning_rate": 1.3318243758799752e-06, "loss": 0.0086, "step": 9844 }, { "epoch": 4.479071883530482, "grad_norm": 0.11592091121827917, "learning_rate": 1.3295239106966118e-06, "loss": 0.0017, "step": 9845 }, { "epoch": 4.479526842584168, "grad_norm": 0.1769928623876725, "learning_rate": 1.3272253797576517e-06, "loss": 0.0016, "step": 9846 }, { "epoch": 4.4799818016378525, "grad_norm": 0.20782151852884753, "learning_rate": 1.3249287832509366e-06, "loss": 0.0018, "step": 9847 }, { "epoch": 4.480436760691537, "grad_norm": 0.11258576984483909, "learning_rate": 1.3226341213641192e-06, "loss": 0.0049, "step": 9848 }, { "epoch": 4.480891719745223, "grad_norm": 0.08023158906123598, "learning_rate": 1.3203413942847188e-06, "loss": 0.0004, "step": 9849 }, { "epoch": 4.481346678798908, "grad_norm": 0.15639608729665705, "learning_rate": 1.3180506022000826e-06, "loss": 0.001, "step": 9850 }, { "epoch": 4.481801637852593, "grad_norm": 0.029754741538609224, "learning_rate": 1.315761745297403e-06, "loss": 0.0005, "step": 9851 }, { "epoch": 4.482256596906279, "grad_norm": 0.07529345491284832, "learning_rate": 1.313474823763719e-06, "loss": 0.0019, "step": 9852 }, { "epoch": 4.4827115559599635, "grad_norm": 0.04725490521535136, "learning_rate": 1.311189837785906e-06, "loss": 0.0004, "step": 9853 }, { "epoch": 4.483166515013648, "grad_norm": 0.02890602120040976, "learning_rate": 1.3089067875506788e-06, "loss": 0.0003, "step": 9854 }, { "epoch": 4.483621474067334, "grad_norm": 0.021516928232824896, "learning_rate": 1.306625673244602e-06, "loss": 0.0003, "step": 9855 }, { "epoch": 4.484076433121019, "grad_norm": 0.08419750065731749, "learning_rate": 1.3043464950540735e-06, "loss": 0.0014, "step": 9856 }, { "epoch": 4.484531392174704, "grad_norm": 0.20442853076069453, "learning_rate": 1.3020692531653444e-06, "loss": 0.0018, "step": 9857 }, { "epoch": 4.48498635122839, "grad_norm": 0.060616126182557226, "learning_rate": 1.2997939477644966e-06, "loss": 0.0011, "step": 9858 }, { "epoch": 4.4854413102820745, "grad_norm": 0.1525426049564198, "learning_rate": 1.2975205790374618e-06, "loss": 0.0015, "step": 9859 }, { "epoch": 4.485896269335759, "grad_norm": 0.09304094012936787, "learning_rate": 1.2952491471699995e-06, "loss": 0.0023, "step": 9860 }, { "epoch": 4.486351228389445, "grad_norm": 0.1731354973208225, "learning_rate": 1.2929796523477312e-06, "loss": 0.0021, "step": 9861 }, { "epoch": 4.48680618744313, "grad_norm": 0.19398114013410767, "learning_rate": 1.2907120947561025e-06, "loss": 0.0027, "step": 9862 }, { "epoch": 4.487261146496815, "grad_norm": 0.1984619417762699, "learning_rate": 1.2884464745804126e-06, "loss": 0.0037, "step": 9863 }, { "epoch": 4.487716105550501, "grad_norm": 0.37315271259138605, "learning_rate": 1.2861827920057995e-06, "loss": 0.0021, "step": 9864 }, { "epoch": 4.4881710646041855, "grad_norm": 0.14788250117818816, "learning_rate": 1.2839210472172319e-06, "loss": 0.0026, "step": 9865 }, { "epoch": 4.48862602365787, "grad_norm": 0.13677534229985513, "learning_rate": 1.2816612403995399e-06, "loss": 0.0032, "step": 9866 }, { "epoch": 4.489080982711556, "grad_norm": 0.08967471133496469, "learning_rate": 1.279403371737381e-06, "loss": 0.0009, "step": 9867 }, { "epoch": 4.489535941765241, "grad_norm": 0.039214606600340214, "learning_rate": 1.2771474414152552e-06, "loss": 0.0003, "step": 9868 }, { "epoch": 4.489990900818927, "grad_norm": 0.013050799607138328, "learning_rate": 1.2748934496175092e-06, "loss": 0.0002, "step": 9869 }, { "epoch": 4.490445859872612, "grad_norm": 0.26105423816061535, "learning_rate": 1.2726413965283263e-06, "loss": 0.003, "step": 9870 }, { "epoch": 4.4909008189262964, "grad_norm": 0.06801084145398165, "learning_rate": 1.27039128233174e-06, "loss": 0.0008, "step": 9871 }, { "epoch": 4.491355777979982, "grad_norm": 0.4709565557466841, "learning_rate": 1.2681431072116167e-06, "loss": 0.0154, "step": 9872 }, { "epoch": 4.491810737033667, "grad_norm": 0.07049191175223801, "learning_rate": 1.2658968713516656e-06, "loss": 0.0005, "step": 9873 }, { "epoch": 4.492265696087352, "grad_norm": 0.030167556624245875, "learning_rate": 1.2636525749354395e-06, "loss": 0.0004, "step": 9874 }, { "epoch": 4.492720655141038, "grad_norm": 0.16057704384651836, "learning_rate": 1.2614102181463333e-06, "loss": 0.0092, "step": 9875 }, { "epoch": 4.4931756141947226, "grad_norm": 0.3886069341525445, "learning_rate": 1.2591698011675785e-06, "loss": 0.0069, "step": 9876 }, { "epoch": 4.493630573248407, "grad_norm": 0.06804736605212046, "learning_rate": 1.256931324182259e-06, "loss": 0.0006, "step": 9877 }, { "epoch": 4.494085532302093, "grad_norm": 0.12250701151734102, "learning_rate": 1.2546947873732894e-06, "loss": 0.0012, "step": 9878 }, { "epoch": 4.494540491355778, "grad_norm": 0.12828038489548396, "learning_rate": 1.2524601909234267e-06, "loss": 0.0013, "step": 9879 }, { "epoch": 4.494995450409463, "grad_norm": 0.3133308436580378, "learning_rate": 1.250227535015272e-06, "loss": 0.0082, "step": 9880 }, { "epoch": 4.495450409463149, "grad_norm": 0.19067719191069366, "learning_rate": 1.2479968198312736e-06, "loss": 0.0045, "step": 9881 }, { "epoch": 4.4959053685168335, "grad_norm": 0.07394889750016374, "learning_rate": 1.2457680455537135e-06, "loss": 0.0009, "step": 9882 }, { "epoch": 4.496360327570518, "grad_norm": 0.06494560671776266, "learning_rate": 1.24354121236471e-06, "loss": 0.001, "step": 9883 }, { "epoch": 4.496815286624204, "grad_norm": 0.06344099408648193, "learning_rate": 1.2413163204462396e-06, "loss": 0.0006, "step": 9884 }, { "epoch": 4.497270245677889, "grad_norm": 0.11639825866494977, "learning_rate": 1.2390933699801015e-06, "loss": 0.0026, "step": 9885 }, { "epoch": 4.497725204731574, "grad_norm": 0.6384568565402093, "learning_rate": 1.236872361147956e-06, "loss": 0.0135, "step": 9886 }, { "epoch": 4.49818016378526, "grad_norm": 0.1865742934325774, "learning_rate": 1.2346532941312854e-06, "loss": 0.0022, "step": 9887 }, { "epoch": 4.4986351228389445, "grad_norm": 0.07710070881357588, "learning_rate": 1.2324361691114256e-06, "loss": 0.0003, "step": 9888 }, { "epoch": 4.499090081892629, "grad_norm": 0.5536949994508512, "learning_rate": 1.2302209862695451e-06, "loss": 0.0043, "step": 9889 }, { "epoch": 4.499545040946315, "grad_norm": 0.06873048017505853, "learning_rate": 1.2280077457866634e-06, "loss": 0.0003, "step": 9890 }, { "epoch": 4.5, "grad_norm": 0.010914944926367735, "learning_rate": 1.2257964478436356e-06, "loss": 0.0002, "step": 9891 }, { "epoch": 4.500454959053685, "grad_norm": 0.14773336221771177, "learning_rate": 1.2235870926211619e-06, "loss": 0.0027, "step": 9892 }, { "epoch": 4.500909918107371, "grad_norm": 0.3536443615136479, "learning_rate": 1.2213796802997752e-06, "loss": 0.0057, "step": 9893 }, { "epoch": 4.5013648771610555, "grad_norm": 0.1546223468980637, "learning_rate": 1.2191742110598565e-06, "loss": 0.0024, "step": 9894 }, { "epoch": 4.50181983621474, "grad_norm": 0.019359902249828042, "learning_rate": 1.216970685081631e-06, "loss": 0.0003, "step": 9895 }, { "epoch": 4.502274795268426, "grad_norm": 0.07844077902000825, "learning_rate": 1.2147691025451573e-06, "loss": 0.001, "step": 9896 }, { "epoch": 4.502729754322111, "grad_norm": 0.24951367201217453, "learning_rate": 1.2125694636303337e-06, "loss": 0.0045, "step": 9897 }, { "epoch": 4.503184713375796, "grad_norm": 0.27790288773998495, "learning_rate": 1.2103717685169185e-06, "loss": 0.003, "step": 9898 }, { "epoch": 4.503639672429482, "grad_norm": 0.047360434907777985, "learning_rate": 1.2081760173844825e-06, "loss": 0.0003, "step": 9899 }, { "epoch": 4.5040946314831665, "grad_norm": 0.04938298966398533, "learning_rate": 1.2059822104124623e-06, "loss": 0.0002, "step": 9900 }, { "epoch": 4.504549590536851, "grad_norm": 0.5042657030126894, "learning_rate": 1.203790347780126e-06, "loss": 0.0087, "step": 9901 }, { "epoch": 4.505004549590537, "grad_norm": 0.15144470048614903, "learning_rate": 1.2016004296665773e-06, "loss": 0.005, "step": 9902 }, { "epoch": 4.505459508644222, "grad_norm": 0.155239931553453, "learning_rate": 1.1994124562507675e-06, "loss": 0.0038, "step": 9903 }, { "epoch": 4.505914467697907, "grad_norm": 0.5813059969135023, "learning_rate": 1.1972264277114897e-06, "loss": 0.0127, "step": 9904 }, { "epoch": 4.506369426751593, "grad_norm": 0.010262104000740374, "learning_rate": 1.1950423442273761e-06, "loss": 0.0002, "step": 9905 }, { "epoch": 4.5068243858052774, "grad_norm": 0.12535996894227192, "learning_rate": 1.1928602059769007e-06, "loss": 0.0022, "step": 9906 }, { "epoch": 4.507279344858962, "grad_norm": 0.12446528864125579, "learning_rate": 1.1906800131383788e-06, "loss": 0.0055, "step": 9907 }, { "epoch": 4.507734303912648, "grad_norm": 0.12049067045360057, "learning_rate": 1.18850176588996e-06, "loss": 0.0004, "step": 9908 }, { "epoch": 4.508189262966333, "grad_norm": 0.2654800681301145, "learning_rate": 1.1863254644096489e-06, "loss": 0.0033, "step": 9909 }, { "epoch": 4.508644222020019, "grad_norm": 0.08485320861161638, "learning_rate": 1.1841511088752782e-06, "loss": 0.0027, "step": 9910 }, { "epoch": 4.5090991810737036, "grad_norm": 0.36844309896211475, "learning_rate": 1.1819786994645254e-06, "loss": 0.0022, "step": 9911 }, { "epoch": 4.509554140127388, "grad_norm": 0.04469506582264362, "learning_rate": 1.1798082363549152e-06, "loss": 0.0008, "step": 9912 }, { "epoch": 4.510009099181074, "grad_norm": 0.06213051302272569, "learning_rate": 1.1776397197238025e-06, "loss": 0.0005, "step": 9913 }, { "epoch": 4.510464058234759, "grad_norm": 0.015015040684397456, "learning_rate": 1.1754731497483933e-06, "loss": 0.0002, "step": 9914 }, { "epoch": 4.510919017288444, "grad_norm": 0.15133932416153867, "learning_rate": 1.1733085266057265e-06, "loss": 0.0021, "step": 9915 }, { "epoch": 4.51137397634213, "grad_norm": 0.23017755933718193, "learning_rate": 1.171145850472688e-06, "loss": 0.0025, "step": 9916 }, { "epoch": 4.5118289353958145, "grad_norm": 0.04679028173615272, "learning_rate": 1.1689851215260005e-06, "loss": 0.001, "step": 9917 }, { "epoch": 4.512283894449499, "grad_norm": 0.12121836255449009, "learning_rate": 1.1668263399422258e-06, "loss": 0.0024, "step": 9918 }, { "epoch": 4.512738853503185, "grad_norm": 0.12882444847913851, "learning_rate": 1.1646695058977697e-06, "loss": 0.0015, "step": 9919 }, { "epoch": 4.51319381255687, "grad_norm": 0.11181343163178084, "learning_rate": 1.1625146195688886e-06, "loss": 0.001, "step": 9920 }, { "epoch": 4.513648771610555, "grad_norm": 0.09082024852934126, "learning_rate": 1.1603616811316636e-06, "loss": 0.0005, "step": 9921 }, { "epoch": 4.514103730664241, "grad_norm": 0.16671795576959575, "learning_rate": 1.1582106907620239e-06, "loss": 0.0015, "step": 9922 }, { "epoch": 4.5145586897179255, "grad_norm": 0.16411945952880141, "learning_rate": 1.156061648635734e-06, "loss": 0.0016, "step": 9923 }, { "epoch": 4.51501364877161, "grad_norm": 0.09856546963885113, "learning_rate": 1.1539145549284092e-06, "loss": 0.0022, "step": 9924 }, { "epoch": 4.515468607825296, "grad_norm": 0.17858287209915122, "learning_rate": 1.151769409815498e-06, "loss": 0.0063, "step": 9925 }, { "epoch": 4.515923566878981, "grad_norm": 0.04146093928430012, "learning_rate": 1.1496262134722936e-06, "loss": 0.0006, "step": 9926 }, { "epoch": 4.516378525932666, "grad_norm": 0.16869201577282963, "learning_rate": 1.1474849660739306e-06, "loss": 0.0032, "step": 9927 }, { "epoch": 4.516833484986352, "grad_norm": 0.07334843592305999, "learning_rate": 1.145345667795375e-06, "loss": 0.0004, "step": 9928 }, { "epoch": 4.5172884440400365, "grad_norm": 0.11429103511145443, "learning_rate": 1.1432083188114478e-06, "loss": 0.0056, "step": 9929 }, { "epoch": 4.517743403093721, "grad_norm": 0.0964441261097998, "learning_rate": 1.141072919296801e-06, "loss": 0.003, "step": 9930 }, { "epoch": 4.518198362147407, "grad_norm": 0.061383039639926154, "learning_rate": 1.1389394694259288e-06, "loss": 0.0005, "step": 9931 }, { "epoch": 4.518653321201092, "grad_norm": 0.036855172287831146, "learning_rate": 1.1368079693731632e-06, "loss": 0.0004, "step": 9932 }, { "epoch": 4.519108280254777, "grad_norm": 0.04609651420567755, "learning_rate": 1.1346784193126874e-06, "loss": 0.0006, "step": 9933 }, { "epoch": 4.519563239308463, "grad_norm": 0.24415950759699429, "learning_rate": 1.132550819418518e-06, "loss": 0.0094, "step": 9934 }, { "epoch": 4.5200181983621475, "grad_norm": 0.11087210254743173, "learning_rate": 1.1304251698645101e-06, "loss": 0.0019, "step": 9935 }, { "epoch": 4.520473157415832, "grad_norm": 0.12079668516616295, "learning_rate": 1.1283014708243667e-06, "loss": 0.0015, "step": 9936 }, { "epoch": 4.520928116469518, "grad_norm": 0.269340241883687, "learning_rate": 1.126179722471618e-06, "loss": 0.0157, "step": 9937 }, { "epoch": 4.521383075523203, "grad_norm": 0.11990755293151871, "learning_rate": 1.1240599249796536e-06, "loss": 0.0051, "step": 9938 }, { "epoch": 4.521838034576888, "grad_norm": 0.07761779341545415, "learning_rate": 1.1219420785216844e-06, "loss": 0.0012, "step": 9939 }, { "epoch": 4.522292993630574, "grad_norm": 0.19133529369747815, "learning_rate": 1.1198261832707806e-06, "loss": 0.0029, "step": 9940 }, { "epoch": 4.522747952684258, "grad_norm": 0.247535633579819, "learning_rate": 1.1177122393998374e-06, "loss": 0.0074, "step": 9941 }, { "epoch": 4.523202911737943, "grad_norm": 0.05158534147398957, "learning_rate": 1.115600247081597e-06, "loss": 0.0009, "step": 9942 }, { "epoch": 4.523657870791629, "grad_norm": 0.10812711142285944, "learning_rate": 1.113490206488646e-06, "loss": 0.0014, "step": 9943 }, { "epoch": 4.524112829845314, "grad_norm": 0.11699539349138156, "learning_rate": 1.1113821177934053e-06, "loss": 0.0012, "step": 9944 }, { "epoch": 4.524567788898999, "grad_norm": 0.253247115559082, "learning_rate": 1.109275981168137e-06, "loss": 0.0069, "step": 9945 }, { "epoch": 4.5250227479526846, "grad_norm": 0.30042981954824227, "learning_rate": 1.107171796784945e-06, "loss": 0.0032, "step": 9946 }, { "epoch": 4.525477707006369, "grad_norm": 0.11502876845226166, "learning_rate": 1.105069564815772e-06, "loss": 0.0006, "step": 9947 }, { "epoch": 4.525932666060054, "grad_norm": 0.013356140881844532, "learning_rate": 1.1029692854324092e-06, "loss": 0.0002, "step": 9948 }, { "epoch": 4.52638762511374, "grad_norm": 0.155451074996255, "learning_rate": 1.1008709588064798e-06, "loss": 0.0024, "step": 9949 }, { "epoch": 4.526842584167425, "grad_norm": 0.3055750525971655, "learning_rate": 1.0987745851094494e-06, "loss": 0.0069, "step": 9950 }, { "epoch": 4.52729754322111, "grad_norm": 0.050304268904633166, "learning_rate": 1.0966801645126202e-06, "loss": 0.0005, "step": 9951 }, { "epoch": 4.5277525022747955, "grad_norm": 0.1536430881596253, "learning_rate": 1.0945876971871382e-06, "loss": 0.0015, "step": 9952 }, { "epoch": 4.52820746132848, "grad_norm": 0.09782171478333429, "learning_rate": 1.0924971833039948e-06, "loss": 0.0004, "step": 9953 }, { "epoch": 4.528662420382165, "grad_norm": 0.33363155170861597, "learning_rate": 1.0904086230340198e-06, "loss": 0.0089, "step": 9954 }, { "epoch": 4.529117379435851, "grad_norm": 0.04308743079070622, "learning_rate": 1.0883220165478768e-06, "loss": 0.0002, "step": 9955 }, { "epoch": 4.529572338489536, "grad_norm": 0.15581557181724753, "learning_rate": 1.0862373640160738e-06, "loss": 0.0022, "step": 9956 }, { "epoch": 4.530027297543221, "grad_norm": 0.16318742292230703, "learning_rate": 1.084154665608958e-06, "loss": 0.0029, "step": 9957 }, { "epoch": 4.5304822565969065, "grad_norm": 0.18916310405663936, "learning_rate": 1.0820739214967208e-06, "loss": 0.0032, "step": 9958 }, { "epoch": 4.530937215650591, "grad_norm": 0.1366996743850715, "learning_rate": 1.0799951318493928e-06, "loss": 0.0006, "step": 9959 }, { "epoch": 4.531392174704276, "grad_norm": 0.23262659384664222, "learning_rate": 1.0779182968368357e-06, "loss": 0.0049, "step": 9960 }, { "epoch": 4.531847133757962, "grad_norm": 0.1895280224470744, "learning_rate": 1.075843416628769e-06, "loss": 0.0043, "step": 9961 }, { "epoch": 4.532302092811647, "grad_norm": 0.31299138534291593, "learning_rate": 1.073770491394735e-06, "loss": 0.0012, "step": 9962 }, { "epoch": 4.532757051865332, "grad_norm": 0.08208145125608492, "learning_rate": 1.0716995213041286e-06, "loss": 0.0014, "step": 9963 }, { "epoch": 4.5332120109190175, "grad_norm": 0.13947742752481732, "learning_rate": 1.0696305065261786e-06, "loss": 0.003, "step": 9964 }, { "epoch": 4.533666969972702, "grad_norm": 0.02171151994968608, "learning_rate": 1.067563447229955e-06, "loss": 0.0003, "step": 9965 }, { "epoch": 4.534121929026387, "grad_norm": 0.18895521078581395, "learning_rate": 1.0654983435843647e-06, "loss": 0.0063, "step": 9966 }, { "epoch": 4.534576888080073, "grad_norm": 0.21907882907919196, "learning_rate": 1.0634351957581611e-06, "loss": 0.0054, "step": 9967 }, { "epoch": 4.535031847133758, "grad_norm": 0.1147025247659386, "learning_rate": 1.0613740039199432e-06, "loss": 0.0022, "step": 9968 }, { "epoch": 4.535486806187443, "grad_norm": 0.05625480583543634, "learning_rate": 1.0593147682381344e-06, "loss": 0.0002, "step": 9969 }, { "epoch": 4.5359417652411285, "grad_norm": 0.2954742859910265, "learning_rate": 1.0572574888810055e-06, "loss": 0.0087, "step": 9970 }, { "epoch": 4.536396724294813, "grad_norm": 0.08364770763416655, "learning_rate": 1.0552021660166695e-06, "loss": 0.001, "step": 9971 }, { "epoch": 4.536851683348498, "grad_norm": 0.043413567476762345, "learning_rate": 1.0531487998130807e-06, "loss": 0.0014, "step": 9972 }, { "epoch": 4.537306642402184, "grad_norm": 0.1276441992182644, "learning_rate": 1.05109739043803e-06, "loss": 0.0016, "step": 9973 }, { "epoch": 4.537761601455869, "grad_norm": 0.1663512759804656, "learning_rate": 1.0490479380591446e-06, "loss": 0.003, "step": 9974 }, { "epoch": 4.538216560509554, "grad_norm": 0.2673281807620466, "learning_rate": 1.047000442843904e-06, "loss": 0.0041, "step": 9975 }, { "epoch": 4.538671519563239, "grad_norm": 0.10374459008758058, "learning_rate": 1.0449549049596136e-06, "loss": 0.002, "step": 9976 }, { "epoch": 4.539126478616924, "grad_norm": 0.026584269404800978, "learning_rate": 1.0429113245734284e-06, "loss": 0.0001, "step": 9977 }, { "epoch": 4.539581437670609, "grad_norm": 0.1416213835352749, "learning_rate": 1.0408697018523427e-06, "loss": 0.0037, "step": 9978 }, { "epoch": 4.540036396724295, "grad_norm": 0.06586289414025318, "learning_rate": 1.038830036963187e-06, "loss": 0.0004, "step": 9979 }, { "epoch": 4.54049135577798, "grad_norm": 0.34754354951157584, "learning_rate": 1.0367923300726306e-06, "loss": 0.0038, "step": 9980 }, { "epoch": 4.540946314831665, "grad_norm": 0.059762858919457894, "learning_rate": 1.0347565813471848e-06, "loss": 0.0036, "step": 9981 }, { "epoch": 4.54140127388535, "grad_norm": 0.08496623503714322, "learning_rate": 1.032722790953211e-06, "loss": 0.0003, "step": 9982 }, { "epoch": 4.541856232939035, "grad_norm": 0.17080668122368528, "learning_rate": 1.030690959056896e-06, "loss": 0.0022, "step": 9983 }, { "epoch": 4.542311191992721, "grad_norm": 0.0693251832020811, "learning_rate": 1.0286610858242702e-06, "loss": 0.0013, "step": 9984 }, { "epoch": 4.542766151046406, "grad_norm": 0.17899130845504072, "learning_rate": 1.0266331714212068e-06, "loss": 0.0021, "step": 9985 }, { "epoch": 4.543221110100091, "grad_norm": 0.12989838969912768, "learning_rate": 1.0246072160134124e-06, "loss": 0.0034, "step": 9986 }, { "epoch": 4.5436760691537765, "grad_norm": 0.10995309140143168, "learning_rate": 1.0225832197664515e-06, "loss": 0.0011, "step": 9987 }, { "epoch": 4.544131028207461, "grad_norm": 0.07926335397290192, "learning_rate": 1.020561182845703e-06, "loss": 0.0011, "step": 9988 }, { "epoch": 4.544585987261146, "grad_norm": 0.23406687144980592, "learning_rate": 1.0185411054164097e-06, "loss": 0.0045, "step": 9989 }, { "epoch": 4.545040946314832, "grad_norm": 0.16306229124851868, "learning_rate": 1.0165229876436366e-06, "loss": 0.0027, "step": 9990 }, { "epoch": 4.545495905368517, "grad_norm": 0.032232945997096404, "learning_rate": 1.0145068296922911e-06, "loss": 0.0005, "step": 9991 }, { "epoch": 4.545950864422202, "grad_norm": 0.02841630395967451, "learning_rate": 1.0124926317271355e-06, "loss": 0.0005, "step": 9992 }, { "epoch": 4.5464058234758875, "grad_norm": 0.08783408276697392, "learning_rate": 1.0104803939127577e-06, "loss": 0.002, "step": 9993 }, { "epoch": 4.546860782529572, "grad_norm": 0.19044176087757855, "learning_rate": 1.0084701164135818e-06, "loss": 0.0052, "step": 9994 }, { "epoch": 4.547315741583257, "grad_norm": 0.127296615879073, "learning_rate": 1.0064617993938845e-06, "loss": 0.0012, "step": 9995 }, { "epoch": 4.547770700636943, "grad_norm": 0.10162041990408781, "learning_rate": 1.004455443017771e-06, "loss": 0.0013, "step": 9996 }, { "epoch": 4.548225659690628, "grad_norm": 0.20586656547314952, "learning_rate": 1.0024510474492016e-06, "loss": 0.002, "step": 9997 }, { "epoch": 4.548680618744313, "grad_norm": 0.17477639568207054, "learning_rate": 1.0004486128519592e-06, "loss": 0.0027, "step": 9998 }, { "epoch": 4.5491355777979985, "grad_norm": 0.5720510054656637, "learning_rate": 9.984481393896765e-07, "loss": 0.0089, "step": 9999 }, { "epoch": 4.549590536851683, "grad_norm": 1.1147464425990026, "learning_rate": 9.964496272258177e-07, "loss": 0.0173, "step": 10000 }, { "epoch": 4.550045495905368, "grad_norm": 0.1950561617103243, "learning_rate": 9.944530765236993e-07, "loss": 0.0025, "step": 10001 }, { "epoch": 4.550500454959054, "grad_norm": 0.0239383771637316, "learning_rate": 9.924584874464655e-07, "loss": 0.0003, "step": 10002 }, { "epoch": 4.550955414012739, "grad_norm": 0.1327670421428825, "learning_rate": 9.904658601571082e-07, "loss": 0.0026, "step": 10003 }, { "epoch": 4.551410373066424, "grad_norm": 0.3683932528977685, "learning_rate": 9.884751948184585e-07, "loss": 0.005, "step": 10004 }, { "epoch": 4.5518653321201095, "grad_norm": 0.12720226435710702, "learning_rate": 9.864864915931749e-07, "loss": 0.0015, "step": 10005 }, { "epoch": 4.552320291173794, "grad_norm": 0.04227125246561786, "learning_rate": 9.84499750643772e-07, "loss": 0.0004, "step": 10006 }, { "epoch": 4.552775250227479, "grad_norm": 0.05313302624913452, "learning_rate": 9.825149721326005e-07, "loss": 0.0005, "step": 10007 }, { "epoch": 4.553230209281165, "grad_norm": 0.02985428180213061, "learning_rate": 9.805321562218417e-07, "loss": 0.0005, "step": 10008 }, { "epoch": 4.55368516833485, "grad_norm": 0.031212381101965506, "learning_rate": 9.785513030735215e-07, "loss": 0.0005, "step": 10009 }, { "epoch": 4.554140127388535, "grad_norm": 0.05300735346015249, "learning_rate": 9.76572412849508e-07, "loss": 0.0007, "step": 10010 }, { "epoch": 4.55459508644222, "grad_norm": 0.17904940009259643, "learning_rate": 9.745954857115102e-07, "loss": 0.0061, "step": 10011 }, { "epoch": 4.555050045495905, "grad_norm": 0.066356784497393, "learning_rate": 9.726205218210744e-07, "loss": 0.0005, "step": 10012 }, { "epoch": 4.55550500454959, "grad_norm": 0.03633444735142661, "learning_rate": 9.70647521339582e-07, "loss": 0.0005, "step": 10013 }, { "epoch": 4.555959963603276, "grad_norm": 0.0284407518350107, "learning_rate": 9.686764844282547e-07, "loss": 0.0005, "step": 10014 }, { "epoch": 4.556414922656961, "grad_norm": 0.09887910167689709, "learning_rate": 9.667074112481634e-07, "loss": 0.0021, "step": 10015 }, { "epoch": 4.556869881710646, "grad_norm": 0.029770228087334316, "learning_rate": 9.64740301960207e-07, "loss": 0.0002, "step": 10016 }, { "epoch": 4.557324840764331, "grad_norm": 0.06508964810342856, "learning_rate": 9.62775156725132e-07, "loss": 0.0014, "step": 10017 }, { "epoch": 4.557779799818016, "grad_norm": 0.1977677674214991, "learning_rate": 9.608119757035211e-07, "loss": 0.0007, "step": 10018 }, { "epoch": 4.558234758871702, "grad_norm": 0.1567979618191839, "learning_rate": 9.588507590557933e-07, "loss": 0.0024, "step": 10019 }, { "epoch": 4.558689717925387, "grad_norm": 0.010733000412750597, "learning_rate": 9.568915069422147e-07, "loss": 0.0002, "step": 10020 }, { "epoch": 4.559144676979072, "grad_norm": 0.1149221455968976, "learning_rate": 9.54934219522885e-07, "loss": 0.0037, "step": 10021 }, { "epoch": 4.5595996360327575, "grad_norm": 0.012075730642245949, "learning_rate": 9.529788969577458e-07, "loss": 0.0002, "step": 10022 }, { "epoch": 4.560054595086442, "grad_norm": 0.028160348708211602, "learning_rate": 9.510255394065692e-07, "loss": 0.0005, "step": 10023 }, { "epoch": 4.560509554140127, "grad_norm": 0.0455468456634696, "learning_rate": 9.49074147028986e-07, "loss": 0.0004, "step": 10024 }, { "epoch": 4.560964513193813, "grad_norm": 0.3821234584961276, "learning_rate": 9.471247199844491e-07, "loss": 0.0045, "step": 10025 }, { "epoch": 4.561419472247498, "grad_norm": 0.034930269195671666, "learning_rate": 9.451772584322588e-07, "loss": 0.0004, "step": 10026 }, { "epoch": 4.561874431301183, "grad_norm": 0.18462307122469807, "learning_rate": 9.432317625315545e-07, "loss": 0.0053, "step": 10027 }, { "epoch": 4.5623293903548685, "grad_norm": 0.27259621390667976, "learning_rate": 9.412882324413091e-07, "loss": 0.0053, "step": 10028 }, { "epoch": 4.562784349408553, "grad_norm": 0.07979365779800364, "learning_rate": 9.393466683203401e-07, "loss": 0.0007, "step": 10029 }, { "epoch": 4.563239308462238, "grad_norm": 0.0816397030207032, "learning_rate": 9.374070703273035e-07, "loss": 0.0013, "step": 10030 }, { "epoch": 4.563694267515924, "grad_norm": 0.01137443750098277, "learning_rate": 9.35469438620698e-07, "loss": 0.0002, "step": 10031 }, { "epoch": 4.564149226569609, "grad_norm": 0.1897327103916492, "learning_rate": 9.335337733588551e-07, "loss": 0.0042, "step": 10032 }, { "epoch": 4.564604185623294, "grad_norm": 0.14706511120379778, "learning_rate": 9.316000746999509e-07, "loss": 0.0029, "step": 10033 }, { "epoch": 4.5650591446769795, "grad_norm": 0.009817981136245765, "learning_rate": 9.296683428019925e-07, "loss": 0.0002, "step": 10034 }, { "epoch": 4.565514103730664, "grad_norm": 0.015520321638060915, "learning_rate": 9.277385778228398e-07, "loss": 0.0002, "step": 10035 }, { "epoch": 4.565969062784349, "grad_norm": 0.343072433310277, "learning_rate": 9.258107799201804e-07, "loss": 0.0074, "step": 10036 }, { "epoch": 4.566424021838035, "grad_norm": 0.3557412086166228, "learning_rate": 9.238849492515439e-07, "loss": 0.0063, "step": 10037 }, { "epoch": 4.56687898089172, "grad_norm": 0.12986945509491696, "learning_rate": 9.219610859743044e-07, "loss": 0.0006, "step": 10038 }, { "epoch": 4.567333939945405, "grad_norm": 0.019703490066375853, "learning_rate": 9.200391902456667e-07, "loss": 0.0001, "step": 10039 }, { "epoch": 4.5677888989990905, "grad_norm": 0.447631385571906, "learning_rate": 9.181192622226858e-07, "loss": 0.0038, "step": 10040 }, { "epoch": 4.568243858052775, "grad_norm": 0.07782679671288832, "learning_rate": 9.162013020622473e-07, "loss": 0.0015, "step": 10041 }, { "epoch": 4.56869881710646, "grad_norm": 0.1079449554404958, "learning_rate": 9.142853099210757e-07, "loss": 0.0013, "step": 10042 }, { "epoch": 4.569153776160146, "grad_norm": 0.10404620955989836, "learning_rate": 9.123712859557349e-07, "loss": 0.0015, "step": 10043 }, { "epoch": 4.569608735213831, "grad_norm": 0.04487016502800224, "learning_rate": 9.104592303226356e-07, "loss": 0.0006, "step": 10044 }, { "epoch": 4.570063694267516, "grad_norm": 0.22806492397891073, "learning_rate": 9.085491431780224e-07, "loss": 0.0034, "step": 10045 }, { "epoch": 4.570518653321201, "grad_norm": 0.14352597967399866, "learning_rate": 9.066410246779761e-07, "loss": 0.0012, "step": 10046 }, { "epoch": 4.570973612374886, "grad_norm": 0.10622884630636131, "learning_rate": 9.047348749784218e-07, "loss": 0.0017, "step": 10047 }, { "epoch": 4.571428571428571, "grad_norm": 0.07423641527111391, "learning_rate": 9.028306942351156e-07, "loss": 0.001, "step": 10048 }, { "epoch": 4.571883530482257, "grad_norm": 0.08296510713245119, "learning_rate": 9.009284826036691e-07, "loss": 0.0017, "step": 10049 }, { "epoch": 4.572338489535942, "grad_norm": 0.15444100619850124, "learning_rate": 8.990282402395133e-07, "loss": 0.0028, "step": 10050 }, { "epoch": 4.572793448589627, "grad_norm": 0.13583638256809305, "learning_rate": 8.9712996729793e-07, "loss": 0.0006, "step": 10051 }, { "epoch": 4.573248407643312, "grad_norm": 0.07178319399733116, "learning_rate": 8.952336639340419e-07, "loss": 0.0007, "step": 10052 }, { "epoch": 4.573703366696997, "grad_norm": 0.1762076506993859, "learning_rate": 8.933393303027976e-07, "loss": 0.0009, "step": 10053 }, { "epoch": 4.574158325750682, "grad_norm": 0.015668127243043145, "learning_rate": 8.914469665590036e-07, "loss": 0.0001, "step": 10054 }, { "epoch": 4.574613284804368, "grad_norm": 0.09730302077789024, "learning_rate": 8.895565728572863e-07, "loss": 0.0008, "step": 10055 }, { "epoch": 4.575068243858053, "grad_norm": 0.0708906902209445, "learning_rate": 8.876681493521278e-07, "loss": 0.001, "step": 10056 }, { "epoch": 4.575523202911738, "grad_norm": 0.05374228805817584, "learning_rate": 8.857816961978377e-07, "loss": 0.0007, "step": 10057 }, { "epoch": 4.575978161965423, "grad_norm": 0.21717428345501127, "learning_rate": 8.838972135485596e-07, "loss": 0.002, "step": 10058 }, { "epoch": 4.576433121019108, "grad_norm": 0.013200750743322578, "learning_rate": 8.820147015583036e-07, "loss": 0.0002, "step": 10059 }, { "epoch": 4.576888080072793, "grad_norm": 0.25952940782702816, "learning_rate": 8.801341603808883e-07, "loss": 0.0061, "step": 10060 }, { "epoch": 4.577343039126479, "grad_norm": 0.5939021107816302, "learning_rate": 8.782555901699852e-07, "loss": 0.0103, "step": 10061 }, { "epoch": 4.577797998180164, "grad_norm": 0.11769068666798187, "learning_rate": 8.763789910791021e-07, "loss": 0.0016, "step": 10062 }, { "epoch": 4.578252957233849, "grad_norm": 0.36379193969634194, "learning_rate": 8.745043632615829e-07, "loss": 0.0017, "step": 10063 }, { "epoch": 4.578707916287534, "grad_norm": 0.04422763980651582, "learning_rate": 8.726317068706219e-07, "loss": 0.0005, "step": 10064 }, { "epoch": 4.579162875341219, "grad_norm": 0.06862992830340889, "learning_rate": 8.707610220592355e-07, "loss": 0.0011, "step": 10065 }, { "epoch": 4.579617834394904, "grad_norm": 0.13849148852414525, "learning_rate": 8.688923089802958e-07, "loss": 0.0011, "step": 10066 }, { "epoch": 4.58007279344859, "grad_norm": 0.1251638345997584, "learning_rate": 8.670255677865003e-07, "loss": 0.0009, "step": 10067 }, { "epoch": 4.580527752502275, "grad_norm": 0.4399777864681725, "learning_rate": 8.651607986303906e-07, "loss": 0.0045, "step": 10068 }, { "epoch": 4.58098271155596, "grad_norm": 0.10718666692238776, "learning_rate": 8.632980016643505e-07, "loss": 0.0006, "step": 10069 }, { "epoch": 4.581437670609645, "grad_norm": 0.34648679840824537, "learning_rate": 8.614371770405971e-07, "loss": 0.0081, "step": 10070 }, { "epoch": 4.58189262966333, "grad_norm": 0.24460979025791052, "learning_rate": 8.595783249111894e-07, "loss": 0.0037, "step": 10071 }, { "epoch": 4.582347588717015, "grad_norm": 0.1098605725385065, "learning_rate": 8.577214454280197e-07, "loss": 0.0014, "step": 10072 }, { "epoch": 4.582802547770701, "grad_norm": 0.1782560414080342, "learning_rate": 8.558665387428278e-07, "loss": 0.0031, "step": 10073 }, { "epoch": 4.583257506824386, "grad_norm": 0.14995869088478447, "learning_rate": 8.540136050071923e-07, "loss": 0.0024, "step": 10074 }, { "epoch": 4.583712465878071, "grad_norm": 0.10283587706089872, "learning_rate": 8.521626443725229e-07, "loss": 0.0016, "step": 10075 }, { "epoch": 4.584167424931756, "grad_norm": 0.034662717214945064, "learning_rate": 8.503136569900705e-07, "loss": 0.0006, "step": 10076 }, { "epoch": 4.584622383985441, "grad_norm": 0.02813301788219779, "learning_rate": 8.484666430109256e-07, "loss": 0.0003, "step": 10077 }, { "epoch": 4.585077343039126, "grad_norm": 0.12621634422820888, "learning_rate": 8.466216025860201e-07, "loss": 0.0026, "step": 10078 }, { "epoch": 4.585532302092812, "grad_norm": 0.18396817439731894, "learning_rate": 8.447785358661198e-07, "loss": 0.0013, "step": 10079 }, { "epoch": 4.585987261146497, "grad_norm": 0.0030256574626545732, "learning_rate": 8.429374430018372e-07, "loss": 0.0, "step": 10080 }, { "epoch": 4.5864422202001816, "grad_norm": 0.13367224022654534, "learning_rate": 8.410983241436132e-07, "loss": 0.0022, "step": 10081 }, { "epoch": 4.586897179253867, "grad_norm": 0.07666189720954562, "learning_rate": 8.392611794417305e-07, "loss": 0.0014, "step": 10082 }, { "epoch": 4.587352138307552, "grad_norm": 0.13849202114609982, "learning_rate": 8.374260090463188e-07, "loss": 0.0013, "step": 10083 }, { "epoch": 4.587807097361237, "grad_norm": 0.04813252096279367, "learning_rate": 8.35592813107336e-07, "loss": 0.0009, "step": 10084 }, { "epoch": 4.588262056414923, "grad_norm": 0.33864043084610895, "learning_rate": 8.337615917745845e-07, "loss": 0.0023, "step": 10085 }, { "epoch": 4.588717015468608, "grad_norm": 0.27466729843913057, "learning_rate": 8.319323451976973e-07, "loss": 0.0081, "step": 10086 }, { "epoch": 4.5891719745222925, "grad_norm": 0.0571275666961662, "learning_rate": 8.301050735261579e-07, "loss": 0.0005, "step": 10087 }, { "epoch": 4.589626933575978, "grad_norm": 0.08474125036658108, "learning_rate": 8.282797769092854e-07, "loss": 0.0028, "step": 10088 }, { "epoch": 4.590081892629663, "grad_norm": 0.009702616939912546, "learning_rate": 8.264564554962273e-07, "loss": 0.0002, "step": 10089 }, { "epoch": 4.590536851683348, "grad_norm": 0.0032030913594421027, "learning_rate": 8.246351094359838e-07, "loss": 0.0001, "step": 10090 }, { "epoch": 4.590991810737034, "grad_norm": 0.1630506103994533, "learning_rate": 8.228157388773806e-07, "loss": 0.0025, "step": 10091 }, { "epoch": 4.591446769790719, "grad_norm": 0.1504813426508862, "learning_rate": 8.209983439690955e-07, "loss": 0.0023, "step": 10092 }, { "epoch": 4.5919017288444035, "grad_norm": 0.07953115538381676, "learning_rate": 8.191829248596322e-07, "loss": 0.0011, "step": 10093 }, { "epoch": 4.592356687898089, "grad_norm": 0.08401757956451573, "learning_rate": 8.173694816973415e-07, "loss": 0.0005, "step": 10094 }, { "epoch": 4.592811646951774, "grad_norm": 0.2960947979179957, "learning_rate": 8.155580146304103e-07, "loss": 0.0023, "step": 10095 }, { "epoch": 4.59326660600546, "grad_norm": 0.08972612189819684, "learning_rate": 8.137485238068621e-07, "loss": 0.0024, "step": 10096 }, { "epoch": 4.593721565059145, "grad_norm": 0.06036147892690665, "learning_rate": 8.119410093745561e-07, "loss": 0.0007, "step": 10097 }, { "epoch": 4.59417652411283, "grad_norm": 0.159573809506605, "learning_rate": 8.101354714812021e-07, "loss": 0.0024, "step": 10098 }, { "epoch": 4.594631483166515, "grad_norm": 0.09270239739120559, "learning_rate": 8.083319102743375e-07, "loss": 0.0016, "step": 10099 }, { "epoch": 4.5950864422202, "grad_norm": 0.22356422390954298, "learning_rate": 8.065303259013363e-07, "loss": 0.0035, "step": 10100 }, { "epoch": 4.595541401273885, "grad_norm": 0.008459426827987972, "learning_rate": 8.047307185094249e-07, "loss": 0.0001, "step": 10101 }, { "epoch": 4.595996360327571, "grad_norm": 0.20523987346273642, "learning_rate": 8.029330882456499e-07, "loss": 0.0016, "step": 10102 }, { "epoch": 4.596451319381256, "grad_norm": 0.09569264214089968, "learning_rate": 8.011374352569157e-07, "loss": 0.0026, "step": 10103 }, { "epoch": 4.596906278434941, "grad_norm": 0.07932205794658174, "learning_rate": 7.993437596899467e-07, "loss": 0.0017, "step": 10104 }, { "epoch": 4.597361237488626, "grad_norm": 0.2329615844180905, "learning_rate": 7.975520616913173e-07, "loss": 0.0071, "step": 10105 }, { "epoch": 4.597816196542311, "grad_norm": 0.1024547168726971, "learning_rate": 7.957623414074327e-07, "loss": 0.0036, "step": 10106 }, { "epoch": 4.598271155595996, "grad_norm": 0.015139890933865408, "learning_rate": 7.939745989845426e-07, "loss": 0.0002, "step": 10107 }, { "epoch": 4.598726114649682, "grad_norm": 0.02684738134807264, "learning_rate": 7.921888345687412e-07, "loss": 0.0003, "step": 10108 }, { "epoch": 4.599181073703367, "grad_norm": 0.010826280980767464, "learning_rate": 7.904050483059422e-07, "loss": 0.0002, "step": 10109 }, { "epoch": 4.599636032757052, "grad_norm": 0.14504937050962582, "learning_rate": 7.886232403419181e-07, "loss": 0.0011, "step": 10110 }, { "epoch": 4.600090991810737, "grad_norm": 0.05505121189632612, "learning_rate": 7.868434108222577e-07, "loss": 0.0006, "step": 10111 }, { "epoch": 4.600545950864422, "grad_norm": 0.20348488685110924, "learning_rate": 7.850655598924144e-07, "loss": 0.004, "step": 10112 }, { "epoch": 4.601000909918107, "grad_norm": 0.16305466494129758, "learning_rate": 7.832896876976581e-07, "loss": 0.0048, "step": 10113 }, { "epoch": 4.601455868971793, "grad_norm": 0.08339214736664555, "learning_rate": 7.815157943831059e-07, "loss": 0.0012, "step": 10114 }, { "epoch": 4.601910828025478, "grad_norm": 0.0687417943229193, "learning_rate": 7.79743880093714e-07, "loss": 0.0007, "step": 10115 }, { "epoch": 4.6023657870791626, "grad_norm": 0.020313325780918972, "learning_rate": 7.779739449742724e-07, "loss": 0.0003, "step": 10116 }, { "epoch": 4.602820746132848, "grad_norm": 0.06703120068201773, "learning_rate": 7.762059891694179e-07, "loss": 0.0005, "step": 10117 }, { "epoch": 4.603275705186533, "grad_norm": 0.1597649516987392, "learning_rate": 7.744400128236157e-07, "loss": 0.0053, "step": 10118 }, { "epoch": 4.603730664240218, "grad_norm": 0.08325240120154886, "learning_rate": 7.726760160811725e-07, "loss": 0.0026, "step": 10119 }, { "epoch": 4.604185623293904, "grad_norm": 0.07444503856583752, "learning_rate": 7.709139990862341e-07, "loss": 0.0018, "step": 10120 }, { "epoch": 4.604640582347589, "grad_norm": 0.06522500686713735, "learning_rate": 7.691539619827881e-07, "loss": 0.0011, "step": 10121 }, { "epoch": 4.6050955414012735, "grad_norm": 0.09273468379724172, "learning_rate": 7.673959049146556e-07, "loss": 0.0019, "step": 10122 }, { "epoch": 4.605550500454959, "grad_norm": 0.055279095148612636, "learning_rate": 7.656398280254967e-07, "loss": 0.0005, "step": 10123 }, { "epoch": 4.606005459508644, "grad_norm": 0.008030929277056117, "learning_rate": 7.638857314588077e-07, "loss": 0.0001, "step": 10124 }, { "epoch": 4.606460418562329, "grad_norm": 0.21113841500449967, "learning_rate": 7.621336153579267e-07, "loss": 0.0015, "step": 10125 }, { "epoch": 4.606915377616015, "grad_norm": 0.13002517953626083, "learning_rate": 7.603834798660309e-07, "loss": 0.0028, "step": 10126 }, { "epoch": 4.6073703366697, "grad_norm": 0.016187947488675573, "learning_rate": 7.586353251261335e-07, "loss": 0.0001, "step": 10127 }, { "epoch": 4.607825295723385, "grad_norm": 0.16126715593196264, "learning_rate": 7.568891512810816e-07, "loss": 0.0015, "step": 10128 }, { "epoch": 4.60828025477707, "grad_norm": 0.10583843899417877, "learning_rate": 7.551449584735692e-07, "loss": 0.0009, "step": 10129 }, { "epoch": 4.608735213830755, "grad_norm": 0.16645752344327858, "learning_rate": 7.534027468461213e-07, "loss": 0.0013, "step": 10130 }, { "epoch": 4.609190172884441, "grad_norm": 0.06009874666362457, "learning_rate": 7.516625165411017e-07, "loss": 0.002, "step": 10131 }, { "epoch": 4.609645131938126, "grad_norm": 0.34253134776756833, "learning_rate": 7.499242677007218e-07, "loss": 0.0018, "step": 10132 }, { "epoch": 4.610100090991811, "grad_norm": 0.025707317425818096, "learning_rate": 7.48188000467015e-07, "loss": 0.0003, "step": 10133 }, { "epoch": 4.610555050045496, "grad_norm": 0.17988390741703478, "learning_rate": 7.464537149818679e-07, "loss": 0.0093, "step": 10134 }, { "epoch": 4.611010009099181, "grad_norm": 0.220218047877593, "learning_rate": 7.447214113869893e-07, "loss": 0.0052, "step": 10135 }, { "epoch": 4.611464968152866, "grad_norm": 0.31800990151162495, "learning_rate": 7.42991089823944e-07, "loss": 0.0117, "step": 10136 }, { "epoch": 4.611919927206552, "grad_norm": 1.4106114053462053, "learning_rate": 7.412627504341241e-07, "loss": 0.0172, "step": 10137 }, { "epoch": 4.612374886260237, "grad_norm": 0.21198213326711354, "learning_rate": 7.395363933587613e-07, "loss": 0.0024, "step": 10138 }, { "epoch": 4.612829845313922, "grad_norm": 0.0554908697906455, "learning_rate": 7.378120187389231e-07, "loss": 0.0006, "step": 10139 }, { "epoch": 4.613284804367607, "grad_norm": 0.056013321188814906, "learning_rate": 7.360896267155193e-07, "loss": 0.0008, "step": 10140 }, { "epoch": 4.613739763421292, "grad_norm": 0.0632760752916946, "learning_rate": 7.343692174292982e-07, "loss": 0.0011, "step": 10141 }, { "epoch": 4.614194722474977, "grad_norm": 0.03909030710735446, "learning_rate": 7.326507910208419e-07, "loss": 0.0002, "step": 10142 }, { "epoch": 4.614649681528663, "grad_norm": 0.1803823532878193, "learning_rate": 7.309343476305714e-07, "loss": 0.0047, "step": 10143 }, { "epoch": 4.615104640582348, "grad_norm": 0.20236115435615395, "learning_rate": 7.292198873987493e-07, "loss": 0.002, "step": 10144 }, { "epoch": 4.615559599636033, "grad_norm": 0.29274080317501555, "learning_rate": 7.275074104654694e-07, "loss": 0.0054, "step": 10145 }, { "epoch": 4.616014558689718, "grad_norm": 0.09031645284889524, "learning_rate": 7.257969169706752e-07, "loss": 0.0026, "step": 10146 }, { "epoch": 4.616469517743403, "grad_norm": 0.1819384715106888, "learning_rate": 7.240884070541326e-07, "loss": 0.0011, "step": 10147 }, { "epoch": 4.616924476797088, "grad_norm": 0.17100748006061392, "learning_rate": 7.223818808554577e-07, "loss": 0.0047, "step": 10148 }, { "epoch": 4.617379435850774, "grad_norm": 0.14247362157194812, "learning_rate": 7.206773385140947e-07, "loss": 0.0012, "step": 10149 }, { "epoch": 4.617834394904459, "grad_norm": 0.053102302858615606, "learning_rate": 7.189747801693375e-07, "loss": 0.0007, "step": 10150 }, { "epoch": 4.6182893539581436, "grad_norm": 0.03653378271100313, "learning_rate": 7.17274205960311e-07, "loss": 0.0002, "step": 10151 }, { "epoch": 4.618744313011829, "grad_norm": 0.11373061599690246, "learning_rate": 7.155756160259763e-07, "loss": 0.0024, "step": 10152 }, { "epoch": 4.619199272065514, "grad_norm": 0.04110572343048809, "learning_rate": 7.138790105051335e-07, "loss": 0.0005, "step": 10153 }, { "epoch": 4.619654231119199, "grad_norm": 0.03358178271300377, "learning_rate": 7.121843895364217e-07, "loss": 0.0006, "step": 10154 }, { "epoch": 4.620109190172885, "grad_norm": 0.231039756944268, "learning_rate": 7.104917532583216e-07, "loss": 0.0053, "step": 10155 }, { "epoch": 4.62056414922657, "grad_norm": 0.01457236617744336, "learning_rate": 7.088011018091395e-07, "loss": 0.0003, "step": 10156 }, { "epoch": 4.6210191082802545, "grad_norm": 0.03173256495608151, "learning_rate": 7.071124353270397e-07, "loss": 0.0003, "step": 10157 }, { "epoch": 4.62147406733394, "grad_norm": 0.20509452936542136, "learning_rate": 7.054257539500037e-07, "loss": 0.005, "step": 10158 }, { "epoch": 4.621929026387625, "grad_norm": 0.22951116495152596, "learning_rate": 7.037410578158599e-07, "loss": 0.0034, "step": 10159 }, { "epoch": 4.62238398544131, "grad_norm": 0.15939400282500393, "learning_rate": 7.020583470622787e-07, "loss": 0.0044, "step": 10160 }, { "epoch": 4.622838944494996, "grad_norm": 0.06533784821729764, "learning_rate": 7.003776218267588e-07, "loss": 0.0004, "step": 10161 }, { "epoch": 4.623293903548681, "grad_norm": 0.1960483838973409, "learning_rate": 6.986988822466456e-07, "loss": 0.005, "step": 10162 }, { "epoch": 4.6237488626023655, "grad_norm": 0.20793854403425296, "learning_rate": 6.970221284591128e-07, "loss": 0.0076, "step": 10163 }, { "epoch": 4.624203821656051, "grad_norm": 0.11089880571041037, "learning_rate": 6.953473606011812e-07, "loss": 0.0023, "step": 10164 }, { "epoch": 4.624658780709736, "grad_norm": 0.17798597814150224, "learning_rate": 6.936745788097083e-07, "loss": 0.0047, "step": 10165 }, { "epoch": 4.625113739763421, "grad_norm": 0.23553638034023813, "learning_rate": 6.920037832213788e-07, "loss": 0.0084, "step": 10166 }, { "epoch": 4.625568698817107, "grad_norm": 0.405828376930348, "learning_rate": 6.903349739727283e-07, "loss": 0.0019, "step": 10167 }, { "epoch": 4.626023657870792, "grad_norm": 0.13201790512982833, "learning_rate": 6.886681512001225e-07, "loss": 0.0032, "step": 10168 }, { "epoch": 4.6264786169244765, "grad_norm": 0.1292565600926503, "learning_rate": 6.870033150397636e-07, "loss": 0.0011, "step": 10169 }, { "epoch": 4.626933575978162, "grad_norm": 0.17782998147137738, "learning_rate": 6.853404656276957e-07, "loss": 0.0028, "step": 10170 }, { "epoch": 4.627388535031847, "grad_norm": 0.04154040501540236, "learning_rate": 6.836796030998043e-07, "loss": 0.0004, "step": 10171 }, { "epoch": 4.627843494085532, "grad_norm": 0.04926708032291024, "learning_rate": 6.820207275918061e-07, "loss": 0.0004, "step": 10172 }, { "epoch": 4.628298453139218, "grad_norm": 0.3829220745430216, "learning_rate": 6.803638392392537e-07, "loss": 0.003, "step": 10173 }, { "epoch": 4.628753412192903, "grad_norm": 0.11956130266680932, "learning_rate": 6.787089381775385e-07, "loss": 0.0017, "step": 10174 }, { "epoch": 4.6292083712465875, "grad_norm": 0.17626955905875577, "learning_rate": 6.770560245418972e-07, "loss": 0.0019, "step": 10175 }, { "epoch": 4.629663330300273, "grad_norm": 0.09244764040456697, "learning_rate": 6.754050984673993e-07, "loss": 0.0012, "step": 10176 }, { "epoch": 4.630118289353958, "grad_norm": 0.2520819299377568, "learning_rate": 6.737561600889425e-07, "loss": 0.002, "step": 10177 }, { "epoch": 4.630573248407643, "grad_norm": 0.18251942300033183, "learning_rate": 6.721092095412773e-07, "loss": 0.0052, "step": 10178 }, { "epoch": 4.631028207461329, "grad_norm": 0.02961550464206996, "learning_rate": 6.704642469589851e-07, "loss": 0.0004, "step": 10179 }, { "epoch": 4.631483166515014, "grad_norm": 0.14958591178383526, "learning_rate": 6.688212724764831e-07, "loss": 0.0005, "step": 10180 }, { "epoch": 4.631938125568698, "grad_norm": 0.1232483079553489, "learning_rate": 6.671802862280279e-07, "loss": 0.0066, "step": 10181 }, { "epoch": 4.632393084622384, "grad_norm": 0.03865594402227279, "learning_rate": 6.655412883477153e-07, "loss": 0.0015, "step": 10182 }, { "epoch": 4.632848043676069, "grad_norm": 0.1755542092878715, "learning_rate": 6.639042789694711e-07, "loss": 0.0009, "step": 10183 }, { "epoch": 4.633303002729754, "grad_norm": 0.09496200489731496, "learning_rate": 6.62269258227069e-07, "loss": 0.002, "step": 10184 }, { "epoch": 4.63375796178344, "grad_norm": 0.10535940434296182, "learning_rate": 6.606362262541188e-07, "loss": 0.0022, "step": 10185 }, { "epoch": 4.6342129208371245, "grad_norm": 0.13369135627284545, "learning_rate": 6.590051831840583e-07, "loss": 0.0011, "step": 10186 }, { "epoch": 4.634667879890809, "grad_norm": 0.06563013612071518, "learning_rate": 6.573761291501723e-07, "loss": 0.0004, "step": 10187 }, { "epoch": 4.635122838944495, "grad_norm": 0.06638586108284085, "learning_rate": 6.557490642855768e-07, "loss": 0.0009, "step": 10188 }, { "epoch": 4.63557779799818, "grad_norm": 0.008948345060896349, "learning_rate": 6.54123988723232e-07, "loss": 0.0001, "step": 10189 }, { "epoch": 4.636032757051865, "grad_norm": 0.11847107404285924, "learning_rate": 6.525009025959289e-07, "loss": 0.003, "step": 10190 }, { "epoch": 4.636487716105551, "grad_norm": 0.011574867152686614, "learning_rate": 6.508798060362975e-07, "loss": 0.0001, "step": 10191 }, { "epoch": 4.6369426751592355, "grad_norm": 0.010257892665758493, "learning_rate": 6.492606991768124e-07, "loss": 0.0001, "step": 10192 }, { "epoch": 4.63739763421292, "grad_norm": 0.16651629389281028, "learning_rate": 6.476435821497734e-07, "loss": 0.0096, "step": 10193 }, { "epoch": 4.637852593266606, "grad_norm": 0.159035069382405, "learning_rate": 6.460284550873274e-07, "loss": 0.0033, "step": 10194 }, { "epoch": 4.638307552320291, "grad_norm": 0.058120244220319126, "learning_rate": 6.44415318121458e-07, "loss": 0.0018, "step": 10195 }, { "epoch": 4.638762511373976, "grad_norm": 0.11605631450564409, "learning_rate": 6.428041713839761e-07, "loss": 0.002, "step": 10196 }, { "epoch": 4.639217470427662, "grad_norm": 0.019382770315054358, "learning_rate": 6.411950150065405e-07, "loss": 0.0003, "step": 10197 }, { "epoch": 4.6396724294813465, "grad_norm": 0.11238364669798893, "learning_rate": 6.395878491206458e-07, "loss": 0.005, "step": 10198 }, { "epoch": 4.640127388535031, "grad_norm": 0.3355211300211452, "learning_rate": 6.379826738576205e-07, "loss": 0.0038, "step": 10199 }, { "epoch": 4.640582347588717, "grad_norm": 0.20149301317828544, "learning_rate": 6.363794893486375e-07, "loss": 0.0014, "step": 10200 }, { "epoch": 4.641037306642402, "grad_norm": 0.019980201640206037, "learning_rate": 6.347782957246945e-07, "loss": 0.0002, "step": 10201 }, { "epoch": 4.641492265696087, "grad_norm": 0.2999415474749564, "learning_rate": 6.331790931166371e-07, "loss": 0.0061, "step": 10202 }, { "epoch": 4.641947224749773, "grad_norm": 0.13511674337241578, "learning_rate": 6.315818816551439e-07, "loss": 0.0043, "step": 10203 }, { "epoch": 4.6424021838034575, "grad_norm": 0.06896642845077222, "learning_rate": 6.299866614707328e-07, "loss": 0.0004, "step": 10204 }, { "epoch": 4.642857142857143, "grad_norm": 0.1893848174182741, "learning_rate": 6.28393432693758e-07, "loss": 0.0012, "step": 10205 }, { "epoch": 4.643312101910828, "grad_norm": 0.2595084508396542, "learning_rate": 6.268021954544096e-07, "loss": 0.0009, "step": 10206 }, { "epoch": 4.643767060964513, "grad_norm": 0.009852032445709228, "learning_rate": 6.252129498827197e-07, "loss": 0.0001, "step": 10207 }, { "epoch": 4.644222020018199, "grad_norm": 0.0663354563967578, "learning_rate": 6.236256961085485e-07, "loss": 0.0013, "step": 10208 }, { "epoch": 4.644676979071884, "grad_norm": 0.1898038198315653, "learning_rate": 6.22040434261606e-07, "loss": 0.0064, "step": 10209 }, { "epoch": 4.6451319381255685, "grad_norm": 0.08833883969196711, "learning_rate": 6.204571644714302e-07, "loss": 0.0021, "step": 10210 }, { "epoch": 4.645586897179254, "grad_norm": 0.10949178299464513, "learning_rate": 6.188758868673955e-07, "loss": 0.0012, "step": 10211 }, { "epoch": 4.646041856232939, "grad_norm": 0.049059118511559724, "learning_rate": 6.17296601578718e-07, "loss": 0.0002, "step": 10212 }, { "epoch": 4.646496815286624, "grad_norm": 0.20813842414706749, "learning_rate": 6.157193087344526e-07, "loss": 0.005, "step": 10213 }, { "epoch": 4.64695177434031, "grad_norm": 0.31296299096796465, "learning_rate": 6.141440084634853e-07, "loss": 0.0035, "step": 10214 }, { "epoch": 4.647406733393995, "grad_norm": 0.35114195051702485, "learning_rate": 6.125707008945464e-07, "loss": 0.0064, "step": 10215 }, { "epoch": 4.647861692447679, "grad_norm": 0.011370890791420242, "learning_rate": 6.109993861561968e-07, "loss": 0.0001, "step": 10216 }, { "epoch": 4.648316651501365, "grad_norm": 0.2096506699143916, "learning_rate": 6.09430064376837e-07, "loss": 0.0038, "step": 10217 }, { "epoch": 4.64877161055505, "grad_norm": 0.042043285640550584, "learning_rate": 6.078627356847055e-07, "loss": 0.0012, "step": 10218 }, { "epoch": 4.649226569608735, "grad_norm": 0.22254662752571655, "learning_rate": 6.062974002078752e-07, "loss": 0.0033, "step": 10219 }, { "epoch": 4.649681528662421, "grad_norm": 0.05895492300893225, "learning_rate": 6.047340580742633e-07, "loss": 0.0004, "step": 10220 }, { "epoch": 4.6501364877161055, "grad_norm": 0.13643047441836545, "learning_rate": 6.031727094116175e-07, "loss": 0.0032, "step": 10221 }, { "epoch": 4.65059144676979, "grad_norm": 0.12928736888223044, "learning_rate": 6.016133543475188e-07, "loss": 0.0007, "step": 10222 }, { "epoch": 4.651046405823476, "grad_norm": 0.11942784372772594, "learning_rate": 6.000559930093963e-07, "loss": 0.0026, "step": 10223 }, { "epoch": 4.651501364877161, "grad_norm": 0.5624364178261728, "learning_rate": 5.985006255245091e-07, "loss": 0.005, "step": 10224 }, { "epoch": 4.651956323930846, "grad_norm": 0.2625783702617602, "learning_rate": 5.969472520199554e-07, "loss": 0.0035, "step": 10225 }, { "epoch": 4.652411282984532, "grad_norm": 0.03662945480864635, "learning_rate": 5.953958726226672e-07, "loss": 0.0005, "step": 10226 }, { "epoch": 4.6528662420382165, "grad_norm": 0.19253076609489267, "learning_rate": 5.93846487459418e-07, "loss": 0.0027, "step": 10227 }, { "epoch": 4.653321201091901, "grad_norm": 0.12501736499372934, "learning_rate": 5.922990966568176e-07, "loss": 0.0014, "step": 10228 }, { "epoch": 4.653776160145587, "grad_norm": 0.04256483602327633, "learning_rate": 5.907537003413121e-07, "loss": 0.0004, "step": 10229 }, { "epoch": 4.654231119199272, "grad_norm": 0.20614401433203716, "learning_rate": 5.89210298639184e-07, "loss": 0.0029, "step": 10230 }, { "epoch": 4.654686078252957, "grad_norm": 0.1638471577612174, "learning_rate": 5.87668891676546e-07, "loss": 0.0034, "step": 10231 }, { "epoch": 4.655141037306643, "grad_norm": 0.1057664679119025, "learning_rate": 5.86129479579367e-07, "loss": 0.0009, "step": 10232 }, { "epoch": 4.6555959963603275, "grad_norm": 0.1145966768689946, "learning_rate": 5.845920624734324e-07, "loss": 0.0016, "step": 10233 }, { "epoch": 4.656050955414012, "grad_norm": 0.09708763253754527, "learning_rate": 5.830566404843752e-07, "loss": 0.0023, "step": 10234 }, { "epoch": 4.656505914467698, "grad_norm": 0.019426275724592417, "learning_rate": 5.815232137376641e-07, "loss": 0.0003, "step": 10235 }, { "epoch": 4.656960873521383, "grad_norm": 0.23019749539571505, "learning_rate": 5.799917823586021e-07, "loss": 0.0039, "step": 10236 }, { "epoch": 4.657415832575068, "grad_norm": 0.05293514630558273, "learning_rate": 5.784623464723333e-07, "loss": 0.0004, "step": 10237 }, { "epoch": 4.657870791628754, "grad_norm": 0.06780861309095582, "learning_rate": 5.769349062038354e-07, "loss": 0.0007, "step": 10238 }, { "epoch": 4.6583257506824385, "grad_norm": 0.027531187552135416, "learning_rate": 5.754094616779227e-07, "loss": 0.0003, "step": 10239 }, { "epoch": 4.658780709736124, "grad_norm": 0.12913794690942743, "learning_rate": 5.738860130192481e-07, "loss": 0.0075, "step": 10240 }, { "epoch": 4.659235668789809, "grad_norm": 0.11152201838534015, "learning_rate": 5.723645603523009e-07, "loss": 0.0008, "step": 10241 }, { "epoch": 4.659690627843494, "grad_norm": 0.1619244206282666, "learning_rate": 5.708451038014068e-07, "loss": 0.003, "step": 10242 }, { "epoch": 4.66014558689718, "grad_norm": 0.1286166963546077, "learning_rate": 5.693276434907302e-07, "loss": 0.0023, "step": 10243 }, { "epoch": 4.660600545950865, "grad_norm": 0.14577060804148223, "learning_rate": 5.67812179544272e-07, "loss": 0.0033, "step": 10244 }, { "epoch": 4.6610555050045495, "grad_norm": 0.11717107970159384, "learning_rate": 5.662987120858693e-07, "loss": 0.0005, "step": 10245 }, { "epoch": 4.661510464058235, "grad_norm": 0.10785153844729863, "learning_rate": 5.647872412391897e-07, "loss": 0.0017, "step": 10246 }, { "epoch": 4.66196542311192, "grad_norm": 0.18184161003635377, "learning_rate": 5.632777671277483e-07, "loss": 0.0099, "step": 10247 }, { "epoch": 4.662420382165605, "grad_norm": 0.09492423142509111, "learning_rate": 5.617702898748967e-07, "loss": 0.0032, "step": 10248 }, { "epoch": 4.662875341219291, "grad_norm": 0.10289798574474028, "learning_rate": 5.602648096038138e-07, "loss": 0.0013, "step": 10249 }, { "epoch": 4.663330300272976, "grad_norm": 0.27691119688328186, "learning_rate": 5.587613264375207e-07, "loss": 0.0015, "step": 10250 }, { "epoch": 4.66378525932666, "grad_norm": 0.6025423848487722, "learning_rate": 5.572598404988749e-07, "loss": 0.0059, "step": 10251 }, { "epoch": 4.664240218380346, "grad_norm": 0.15149346607952485, "learning_rate": 5.557603519105753e-07, "loss": 0.0045, "step": 10252 }, { "epoch": 4.664695177434031, "grad_norm": 0.14819416681992767, "learning_rate": 5.542628607951489e-07, "loss": 0.0022, "step": 10253 }, { "epoch": 4.665150136487716, "grad_norm": 0.3053132312899647, "learning_rate": 5.527673672749645e-07, "loss": 0.0007, "step": 10254 }, { "epoch": 4.665605095541402, "grad_norm": 0.09768367655454663, "learning_rate": 5.512738714722299e-07, "loss": 0.0062, "step": 10255 }, { "epoch": 4.6660600545950865, "grad_norm": 0.11799170007041135, "learning_rate": 5.497823735089835e-07, "loss": 0.0049, "step": 10256 }, { "epoch": 4.666515013648771, "grad_norm": 0.17840014857402195, "learning_rate": 5.482928735071086e-07, "loss": 0.0006, "step": 10257 }, { "epoch": 4.666969972702457, "grad_norm": 0.023392068056816186, "learning_rate": 5.468053715883159e-07, "loss": 0.0003, "step": 10258 }, { "epoch": 4.667424931756142, "grad_norm": 0.05454318744097796, "learning_rate": 5.453198678741584e-07, "loss": 0.0008, "step": 10259 }, { "epoch": 4.667879890809827, "grad_norm": 0.09005245385873314, "learning_rate": 5.438363624860221e-07, "loss": 0.0017, "step": 10260 }, { "epoch": 4.668334849863513, "grad_norm": 0.21015789882649813, "learning_rate": 5.423548555451352e-07, "loss": 0.0017, "step": 10261 }, { "epoch": 4.6687898089171975, "grad_norm": 0.08831181921532413, "learning_rate": 5.408753471725619e-07, "loss": 0.0005, "step": 10262 }, { "epoch": 4.669244767970882, "grad_norm": 0.11531307062215468, "learning_rate": 5.393978374892e-07, "loss": 0.0031, "step": 10263 }, { "epoch": 4.669699727024568, "grad_norm": 0.1057940244887325, "learning_rate": 5.379223266157835e-07, "loss": 0.0018, "step": 10264 }, { "epoch": 4.670154686078253, "grad_norm": 0.23982675676316595, "learning_rate": 5.364488146728824e-07, "loss": 0.0058, "step": 10265 }, { "epoch": 4.670609645131938, "grad_norm": 0.0819417593780071, "learning_rate": 5.34977301780909e-07, "loss": 0.0007, "step": 10266 }, { "epoch": 4.671064604185624, "grad_norm": 0.08128971902154497, "learning_rate": 5.335077880601086e-07, "loss": 0.0014, "step": 10267 }, { "epoch": 4.6715195632393085, "grad_norm": 0.0076564775710460916, "learning_rate": 5.320402736305602e-07, "loss": 0.0001, "step": 10268 }, { "epoch": 4.671974522292993, "grad_norm": 0.15548740325285504, "learning_rate": 5.305747586121846e-07, "loss": 0.0033, "step": 10269 }, { "epoch": 4.672429481346679, "grad_norm": 0.05311941234084467, "learning_rate": 5.291112431247358e-07, "loss": 0.0008, "step": 10270 }, { "epoch": 4.672884440400364, "grad_norm": 0.08440776169619879, "learning_rate": 5.2764972728781e-07, "loss": 0.0009, "step": 10271 }, { "epoch": 4.673339399454049, "grad_norm": 0.030739190788324558, "learning_rate": 5.261902112208311e-07, "loss": 0.0002, "step": 10272 }, { "epoch": 4.673794358507735, "grad_norm": 0.17428005066752308, "learning_rate": 5.247326950430648e-07, "loss": 0.0046, "step": 10273 }, { "epoch": 4.6742493175614195, "grad_norm": 0.15058526663958427, "learning_rate": 5.232771788736157e-07, "loss": 0.0008, "step": 10274 }, { "epoch": 4.674704276615104, "grad_norm": 0.19585879015239527, "learning_rate": 5.218236628314139e-07, "loss": 0.0047, "step": 10275 }, { "epoch": 4.67515923566879, "grad_norm": 0.040404700240429615, "learning_rate": 5.203721470352419e-07, "loss": 0.0003, "step": 10276 }, { "epoch": 4.675614194722475, "grad_norm": 0.18249828209368688, "learning_rate": 5.189226316037105e-07, "loss": 0.0013, "step": 10277 }, { "epoch": 4.67606915377616, "grad_norm": 0.18177112994379646, "learning_rate": 5.174751166552666e-07, "loss": 0.0083, "step": 10278 }, { "epoch": 4.676524112829846, "grad_norm": 0.17157275492541654, "learning_rate": 5.160296023081961e-07, "loss": 0.0015, "step": 10279 }, { "epoch": 4.6769790718835305, "grad_norm": 0.1846190366719386, "learning_rate": 5.14586088680613e-07, "loss": 0.0027, "step": 10280 }, { "epoch": 4.677434030937215, "grad_norm": 0.2359719282263591, "learning_rate": 5.131445758904812e-07, "loss": 0.0022, "step": 10281 }, { "epoch": 4.677888989990901, "grad_norm": 0.19227290301980313, "learning_rate": 5.117050640555926e-07, "loss": 0.0013, "step": 10282 }, { "epoch": 4.678343949044586, "grad_norm": 0.05304295471162265, "learning_rate": 5.102675532935808e-07, "loss": 0.0004, "step": 10283 }, { "epoch": 4.678798908098271, "grad_norm": 0.08173850380465304, "learning_rate": 5.088320437219074e-07, "loss": 0.0013, "step": 10284 }, { "epoch": 4.679253867151957, "grad_norm": 0.1809168196732078, "learning_rate": 5.073985354578787e-07, "loss": 0.006, "step": 10285 }, { "epoch": 4.679708826205641, "grad_norm": 0.2045234642150784, "learning_rate": 5.05967028618634e-07, "loss": 0.0012, "step": 10286 }, { "epoch": 4.680163785259326, "grad_norm": 0.07825070554914272, "learning_rate": 5.045375233211496e-07, "loss": 0.0023, "step": 10287 }, { "epoch": 4.680618744313012, "grad_norm": 0.13768202744580224, "learning_rate": 5.031100196822403e-07, "loss": 0.0007, "step": 10288 }, { "epoch": 4.681073703366697, "grad_norm": 0.09853317674389175, "learning_rate": 5.016845178185487e-07, "loss": 0.0014, "step": 10289 }, { "epoch": 4.681528662420382, "grad_norm": 0.057032338480099636, "learning_rate": 5.002610178465678e-07, "loss": 0.001, "step": 10290 }, { "epoch": 4.6819836214740675, "grad_norm": 0.24879257162455687, "learning_rate": 4.988395198826157e-07, "loss": 0.0091, "step": 10291 }, { "epoch": 4.682438580527752, "grad_norm": 0.12558038303157934, "learning_rate": 4.97420024042855e-07, "loss": 0.0027, "step": 10292 }, { "epoch": 4.682893539581437, "grad_norm": 0.13063711946489473, "learning_rate": 4.960025304432736e-07, "loss": 0.0021, "step": 10293 }, { "epoch": 4.683348498635123, "grad_norm": 0.048472756729408144, "learning_rate": 4.945870391997065e-07, "loss": 0.0005, "step": 10294 }, { "epoch": 4.683803457688808, "grad_norm": 0.08879566745838997, "learning_rate": 4.931735504278223e-07, "loss": 0.0004, "step": 10295 }, { "epoch": 4.684258416742493, "grad_norm": 0.1461095850378091, "learning_rate": 4.91762064243123e-07, "loss": 0.0007, "step": 10296 }, { "epoch": 4.6847133757961785, "grad_norm": 0.17574476213337215, "learning_rate": 4.903525807609499e-07, "loss": 0.0019, "step": 10297 }, { "epoch": 4.685168334849863, "grad_norm": 0.25025312378986, "learning_rate": 4.889451000964801e-07, "loss": 0.0083, "step": 10298 }, { "epoch": 4.685623293903548, "grad_norm": 0.21860389086070864, "learning_rate": 4.875396223647244e-07, "loss": 0.004, "step": 10299 }, { "epoch": 4.686078252957234, "grad_norm": 0.02323981134957793, "learning_rate": 4.861361476805354e-07, "loss": 0.0003, "step": 10300 }, { "epoch": 4.686533212010919, "grad_norm": 0.18119475838506557, "learning_rate": 4.847346761585963e-07, "loss": 0.0022, "step": 10301 }, { "epoch": 4.686988171064604, "grad_norm": 0.12271565870594524, "learning_rate": 4.833352079134295e-07, "loss": 0.0015, "step": 10302 }, { "epoch": 4.6874431301182895, "grad_norm": 0.08071575658674235, "learning_rate": 4.819377430593908e-07, "loss": 0.0005, "step": 10303 }, { "epoch": 4.687898089171974, "grad_norm": 0.005723485060460332, "learning_rate": 4.805422817106775e-07, "loss": 0.0001, "step": 10304 }, { "epoch": 4.688353048225659, "grad_norm": 0.06801481689039654, "learning_rate": 4.791488239813235e-07, "loss": 0.0005, "step": 10305 }, { "epoch": 4.688808007279345, "grad_norm": 0.05231618805276353, "learning_rate": 4.777573699851906e-07, "loss": 0.0002, "step": 10306 }, { "epoch": 4.68926296633303, "grad_norm": 0.08258551341606023, "learning_rate": 4.7636791983598493e-07, "loss": 0.0008, "step": 10307 }, { "epoch": 4.689717925386715, "grad_norm": 0.29226512161968565, "learning_rate": 4.749804736472435e-07, "loss": 0.0089, "step": 10308 }, { "epoch": 4.6901728844404005, "grad_norm": 0.013504810726690653, "learning_rate": 4.735950315323423e-07, "loss": 0.0002, "step": 10309 }, { "epoch": 4.690627843494085, "grad_norm": 0.12666264920091733, "learning_rate": 4.722115936044963e-07, "loss": 0.0038, "step": 10310 }, { "epoch": 4.69108280254777, "grad_norm": 0.10327091129117204, "learning_rate": 4.7083015997675396e-07, "loss": 0.0009, "step": 10311 }, { "epoch": 4.691537761601456, "grad_norm": 0.24547326123555954, "learning_rate": 4.694507307619972e-07, "loss": 0.0033, "step": 10312 }, { "epoch": 4.691992720655141, "grad_norm": 0.1781893917003564, "learning_rate": 4.6807330607294696e-07, "loss": 0.0071, "step": 10313 }, { "epoch": 4.692447679708827, "grad_norm": 0.028732596552669645, "learning_rate": 4.6669788602216047e-07, "loss": 0.0002, "step": 10314 }, { "epoch": 4.6929026387625115, "grad_norm": 0.2726465367760285, "learning_rate": 4.653244707220339e-07, "loss": 0.002, "step": 10315 }, { "epoch": 4.693357597816196, "grad_norm": 0.08960159162728422, "learning_rate": 4.6395306028479133e-07, "loss": 0.0023, "step": 10316 }, { "epoch": 4.693812556869882, "grad_norm": 0.12999248485483222, "learning_rate": 4.625836548225015e-07, "loss": 0.0016, "step": 10317 }, { "epoch": 4.694267515923567, "grad_norm": 0.36318846501542607, "learning_rate": 4.612162544470666e-07, "loss": 0.0145, "step": 10318 }, { "epoch": 4.694722474977252, "grad_norm": 0.15188883844599596, "learning_rate": 4.598508592702222e-07, "loss": 0.0025, "step": 10319 }, { "epoch": 4.695177434030938, "grad_norm": 0.26810431303562565, "learning_rate": 4.584874694035429e-07, "loss": 0.0022, "step": 10320 }, { "epoch": 4.695632393084622, "grad_norm": 0.2545357062123537, "learning_rate": 4.571260849584397e-07, "loss": 0.0029, "step": 10321 }, { "epoch": 4.696087352138307, "grad_norm": 0.022678849059299504, "learning_rate": 4.5576670604615956e-07, "loss": 0.0002, "step": 10322 }, { "epoch": 4.696542311191993, "grad_norm": 0.13421121819485432, "learning_rate": 4.544093327777804e-07, "loss": 0.0075, "step": 10323 }, { "epoch": 4.696997270245678, "grad_norm": 0.27709319536885796, "learning_rate": 4.530539652642246e-07, "loss": 0.0149, "step": 10324 }, { "epoch": 4.697452229299363, "grad_norm": 0.14040312725016021, "learning_rate": 4.51700603616248e-07, "loss": 0.0051, "step": 10325 }, { "epoch": 4.6979071883530485, "grad_norm": 0.27506004527915223, "learning_rate": 4.503492479444371e-07, "loss": 0.0184, "step": 10326 }, { "epoch": 4.698362147406733, "grad_norm": 0.08612422960552815, "learning_rate": 4.489998983592231e-07, "loss": 0.0009, "step": 10327 }, { "epoch": 4.698817106460418, "grad_norm": 0.03508444511853369, "learning_rate": 4.4765255497086214e-07, "loss": 0.0005, "step": 10328 }, { "epoch": 4.699272065514104, "grad_norm": 0.37599791241593494, "learning_rate": 4.4630721788945786e-07, "loss": 0.0104, "step": 10329 }, { "epoch": 4.699727024567789, "grad_norm": 0.02079750519659065, "learning_rate": 4.4496388722494453e-07, "loss": 0.0002, "step": 10330 }, { "epoch": 4.700181983621474, "grad_norm": 0.07019145073772873, "learning_rate": 4.436225630870927e-07, "loss": 0.0023, "step": 10331 }, { "epoch": 4.7006369426751595, "grad_norm": 0.29143962774954135, "learning_rate": 4.4228324558551193e-07, "loss": 0.018, "step": 10332 }, { "epoch": 4.701091901728844, "grad_norm": 0.16310714845492874, "learning_rate": 4.4094593482963686e-07, "loss": 0.0017, "step": 10333 }, { "epoch": 4.701546860782529, "grad_norm": 0.2751212784877831, "learning_rate": 4.396106309287579e-07, "loss": 0.0041, "step": 10334 }, { "epoch": 4.702001819836215, "grad_norm": 0.15014934775230873, "learning_rate": 4.3827733399198214e-07, "loss": 0.0015, "step": 10335 }, { "epoch": 4.7024567788899, "grad_norm": 0.019772590352813126, "learning_rate": 4.3694604412826414e-07, "loss": 0.0003, "step": 10336 }, { "epoch": 4.702911737943585, "grad_norm": 0.04195953680297014, "learning_rate": 4.356167614463891e-07, "loss": 0.0008, "step": 10337 }, { "epoch": 4.7033666969972705, "grad_norm": 0.06287729304413195, "learning_rate": 4.342894860549784e-07, "loss": 0.002, "step": 10338 }, { "epoch": 4.703821656050955, "grad_norm": 0.13311443684774643, "learning_rate": 4.3296421806249545e-07, "loss": 0.0037, "step": 10339 }, { "epoch": 4.70427661510464, "grad_norm": 0.1406125867306676, "learning_rate": 4.31640957577234e-07, "loss": 0.0033, "step": 10340 }, { "epoch": 4.704731574158326, "grad_norm": 0.1872394734624398, "learning_rate": 4.303197047073215e-07, "loss": 0.0006, "step": 10341 }, { "epoch": 4.705186533212011, "grad_norm": 0.17649297033664277, "learning_rate": 4.290004595607272e-07, "loss": 0.0023, "step": 10342 }, { "epoch": 4.705641492265696, "grad_norm": 0.19980904845729808, "learning_rate": 4.276832222452537e-07, "loss": 0.0012, "step": 10343 }, { "epoch": 4.7060964513193815, "grad_norm": 0.06682871618196755, "learning_rate": 4.263679928685399e-07, "loss": 0.0012, "step": 10344 }, { "epoch": 4.706551410373066, "grad_norm": 0.2951413579509484, "learning_rate": 4.2505477153806096e-07, "loss": 0.0052, "step": 10345 }, { "epoch": 4.707006369426751, "grad_norm": 0.11029949175324028, "learning_rate": 4.2374355836112545e-07, "loss": 0.0012, "step": 10346 }, { "epoch": 4.707461328480437, "grad_norm": 0.32259463562449997, "learning_rate": 4.224343534448838e-07, "loss": 0.0013, "step": 10347 }, { "epoch": 4.707916287534122, "grad_norm": 0.21299058706634946, "learning_rate": 4.2112715689631154e-07, "loss": 0.0005, "step": 10348 }, { "epoch": 4.708371246587808, "grad_norm": 0.013979147966921983, "learning_rate": 4.1982196882223156e-07, "loss": 0.0001, "step": 10349 }, { "epoch": 4.7088262056414925, "grad_norm": 0.22933031886348204, "learning_rate": 4.185187893293002e-07, "loss": 0.004, "step": 10350 }, { "epoch": 4.709281164695177, "grad_norm": 0.08406456579193529, "learning_rate": 4.1721761852400175e-07, "loss": 0.0016, "step": 10351 }, { "epoch": 4.709736123748863, "grad_norm": 0.0793439199858124, "learning_rate": 4.1591845651266504e-07, "loss": 0.0013, "step": 10352 }, { "epoch": 4.710191082802548, "grad_norm": 0.04314963979307407, "learning_rate": 4.146213034014496e-07, "loss": 0.0005, "step": 10353 }, { "epoch": 4.710646041856233, "grad_norm": 0.15207791516256172, "learning_rate": 4.1332615929635666e-07, "loss": 0.0046, "step": 10354 }, { "epoch": 4.711101000909919, "grad_norm": 0.23188562482432448, "learning_rate": 4.1203302430321834e-07, "loss": 0.0096, "step": 10355 }, { "epoch": 4.711555959963603, "grad_norm": 0.11943159333705765, "learning_rate": 4.1074189852770284e-07, "loss": 0.0005, "step": 10356 }, { "epoch": 4.712010919017288, "grad_norm": 0.03242772163281809, "learning_rate": 4.0945278207531466e-07, "loss": 0.0005, "step": 10357 }, { "epoch": 4.712465878070974, "grad_norm": 0.10555755151928152, "learning_rate": 4.081656750513946e-07, "loss": 0.0013, "step": 10358 }, { "epoch": 4.712920837124659, "grad_norm": 0.08038350978154657, "learning_rate": 4.0688057756111955e-07, "loss": 0.0017, "step": 10359 }, { "epoch": 4.713375796178344, "grad_norm": 0.09817404546149928, "learning_rate": 4.055974897095027e-07, "loss": 0.0015, "step": 10360 }, { "epoch": 4.7138307552320295, "grad_norm": 0.0406101637691374, "learning_rate": 4.0431641160139367e-07, "loss": 0.0008, "step": 10361 }, { "epoch": 4.714285714285714, "grad_norm": 0.22438774309875423, "learning_rate": 4.030373433414697e-07, "loss": 0.0048, "step": 10362 }, { "epoch": 4.714740673339399, "grad_norm": 0.27463618099565296, "learning_rate": 4.0176028503425835e-07, "loss": 0.0083, "step": 10363 }, { "epoch": 4.715195632393085, "grad_norm": 0.26983566964314526, "learning_rate": 4.004852367841122e-07, "loss": 0.0044, "step": 10364 }, { "epoch": 4.71565059144677, "grad_norm": 0.09280641491267719, "learning_rate": 3.9921219869522296e-07, "loss": 0.0033, "step": 10365 }, { "epoch": 4.716105550500455, "grad_norm": 0.15935833407746638, "learning_rate": 3.97941170871613e-07, "loss": 0.0041, "step": 10366 }, { "epoch": 4.7165605095541405, "grad_norm": 0.20532494564795414, "learning_rate": 3.9667215341714916e-07, "loss": 0.0026, "step": 10367 }, { "epoch": 4.717015468607825, "grad_norm": 0.1589951898632376, "learning_rate": 3.9540514643553183e-07, "loss": 0.0021, "step": 10368 }, { "epoch": 4.71747042766151, "grad_norm": 0.14277665620343358, "learning_rate": 3.9414015003029214e-07, "loss": 0.0011, "step": 10369 }, { "epoch": 4.717925386715196, "grad_norm": 0.07238891675716498, "learning_rate": 3.9287716430480014e-07, "loss": 0.0016, "step": 10370 }, { "epoch": 4.718380345768881, "grad_norm": 0.2574572641993727, "learning_rate": 3.916161893622594e-07, "loss": 0.0015, "step": 10371 }, { "epoch": 4.718835304822566, "grad_norm": 0.069742272387847, "learning_rate": 3.9035722530571526e-07, "loss": 0.0015, "step": 10372 }, { "epoch": 4.7192902638762515, "grad_norm": 0.09505818517447963, "learning_rate": 3.89100272238041e-07, "loss": 0.0004, "step": 10373 }, { "epoch": 4.719745222929936, "grad_norm": 0.2347183090275007, "learning_rate": 3.878453302619545e-07, "loss": 0.0037, "step": 10374 }, { "epoch": 4.720200181983621, "grad_norm": 0.19037165283305713, "learning_rate": 3.865923994799958e-07, "loss": 0.0058, "step": 10375 }, { "epoch": 4.720655141037307, "grad_norm": 0.20450442165013288, "learning_rate": 3.853414799945554e-07, "loss": 0.0079, "step": 10376 }, { "epoch": 4.721110100090992, "grad_norm": 0.22267879920954006, "learning_rate": 3.8409257190784864e-07, "loss": 0.0031, "step": 10377 }, { "epoch": 4.721565059144677, "grad_norm": 0.19406435405183914, "learning_rate": 3.828456753219356e-07, "loss": 0.0017, "step": 10378 }, { "epoch": 4.7220200181983625, "grad_norm": 0.3401982677119803, "learning_rate": 3.8160079033870146e-07, "loss": 0.004, "step": 10379 }, { "epoch": 4.722474977252047, "grad_norm": 0.15188200172473615, "learning_rate": 3.803579170598731e-07, "loss": 0.0015, "step": 10380 }, { "epoch": 4.722929936305732, "grad_norm": 0.04810447603330177, "learning_rate": 3.791170555870166e-07, "loss": 0.0004, "step": 10381 }, { "epoch": 4.723384895359418, "grad_norm": 0.020346648902682524, "learning_rate": 3.7787820602152856e-07, "loss": 0.0003, "step": 10382 }, { "epoch": 4.723839854413103, "grad_norm": 0.047201053399878005, "learning_rate": 3.7664136846463913e-07, "loss": 0.0003, "step": 10383 }, { "epoch": 4.724294813466788, "grad_norm": 0.05506360025336808, "learning_rate": 3.754065430174203e-07, "loss": 0.0005, "step": 10384 }, { "epoch": 4.7247497725204735, "grad_norm": 0.2715111444342005, "learning_rate": 3.741737297807746e-07, "loss": 0.0101, "step": 10385 }, { "epoch": 4.725204731574158, "grad_norm": 0.03045750515881791, "learning_rate": 3.729429288554409e-07, "loss": 0.0002, "step": 10386 }, { "epoch": 4.725659690627843, "grad_norm": 0.15758264455870988, "learning_rate": 3.7171414034199714e-07, "loss": 0.002, "step": 10387 }, { "epoch": 4.726114649681529, "grad_norm": 0.32882854078924456, "learning_rate": 3.7048736434085463e-07, "loss": 0.0274, "step": 10388 }, { "epoch": 4.726569608735214, "grad_norm": 0.11599121374236972, "learning_rate": 3.692626009522582e-07, "loss": 0.0032, "step": 10389 }, { "epoch": 4.727024567788899, "grad_norm": 0.006941105113202984, "learning_rate": 3.680398502762916e-07, "loss": 0.0001, "step": 10390 }, { "epoch": 4.727479526842584, "grad_norm": 0.11055002278210595, "learning_rate": 3.668191124128695e-07, "loss": 0.0014, "step": 10391 }, { "epoch": 4.727934485896269, "grad_norm": 0.08468460391859083, "learning_rate": 3.6560038746174807e-07, "loss": 0.0025, "step": 10392 }, { "epoch": 4.728389444949954, "grad_norm": 0.03696420425501596, "learning_rate": 3.643836755225172e-07, "loss": 0.0003, "step": 10393 }, { "epoch": 4.72884440400364, "grad_norm": 0.03817319637323982, "learning_rate": 3.6316897669459447e-07, "loss": 0.0003, "step": 10394 }, { "epoch": 4.729299363057325, "grad_norm": 0.13358559931050215, "learning_rate": 3.619562910772478e-07, "loss": 0.0016, "step": 10395 }, { "epoch": 4.72975432211101, "grad_norm": 0.17635623697673777, "learning_rate": 3.6074561876956457e-07, "loss": 0.001, "step": 10396 }, { "epoch": 4.730209281164695, "grad_norm": 0.018515640128029105, "learning_rate": 3.595369598704823e-07, "loss": 0.0004, "step": 10397 }, { "epoch": 4.73066424021838, "grad_norm": 0.0737346096592794, "learning_rate": 3.5833031447876365e-07, "loss": 0.0018, "step": 10398 }, { "epoch": 4.731119199272065, "grad_norm": 0.03517720325253116, "learning_rate": 3.5712568269301306e-07, "loss": 0.0002, "step": 10399 }, { "epoch": 4.731574158325751, "grad_norm": 0.0935574185247646, "learning_rate": 3.559230646116629e-07, "loss": 0.0009, "step": 10400 }, { "epoch": 4.732029117379436, "grad_norm": 0.21058823005593671, "learning_rate": 3.547224603329874e-07, "loss": 0.0081, "step": 10401 }, { "epoch": 4.732484076433121, "grad_norm": 0.05770471795585066, "learning_rate": 3.535238699550969e-07, "loss": 0.0008, "step": 10402 }, { "epoch": 4.732939035486806, "grad_norm": 0.009024728451283848, "learning_rate": 3.5232729357593254e-07, "loss": 0.0001, "step": 10403 }, { "epoch": 4.733393994540491, "grad_norm": 0.09149315069034124, "learning_rate": 3.5113273129327717e-07, "loss": 0.001, "step": 10404 }, { "epoch": 4.733848953594176, "grad_norm": 0.00667836280944964, "learning_rate": 3.499401832047361e-07, "loss": 0.0001, "step": 10405 }, { "epoch": 4.734303912647862, "grad_norm": 0.12649941716096877, "learning_rate": 3.4874964940777023e-07, "loss": 0.0009, "step": 10406 }, { "epoch": 4.734758871701547, "grad_norm": 0.17192102955608024, "learning_rate": 3.475611299996545e-07, "loss": 0.002, "step": 10407 }, { "epoch": 4.735213830755232, "grad_norm": 0.005692073301989358, "learning_rate": 3.4637462507751404e-07, "loss": 0.0001, "step": 10408 }, { "epoch": 4.735668789808917, "grad_norm": 0.07907162948251555, "learning_rate": 3.451901347383074e-07, "loss": 0.0006, "step": 10409 }, { "epoch": 4.736123748862602, "grad_norm": 0.06616334164902647, "learning_rate": 3.44007659078821e-07, "loss": 0.0004, "step": 10410 }, { "epoch": 4.736578707916287, "grad_norm": 0.15264377560560466, "learning_rate": 3.428271981956832e-07, "loss": 0.0023, "step": 10411 }, { "epoch": 4.737033666969973, "grad_norm": 0.2721521847687883, "learning_rate": 3.416487521853584e-07, "loss": 0.006, "step": 10412 }, { "epoch": 4.737488626023658, "grad_norm": 0.04274301654128207, "learning_rate": 3.4047232114413905e-07, "loss": 0.0004, "step": 10413 }, { "epoch": 4.737943585077343, "grad_norm": 0.11936812638319245, "learning_rate": 3.392979051681622e-07, "loss": 0.0024, "step": 10414 }, { "epoch": 4.738398544131028, "grad_norm": 0.488656211032015, "learning_rate": 3.381255043533871e-07, "loss": 0.0069, "step": 10415 }, { "epoch": 4.738853503184713, "grad_norm": 0.13869413711385692, "learning_rate": 3.369551187956288e-07, "loss": 0.0011, "step": 10416 }, { "epoch": 4.739308462238398, "grad_norm": 0.033548680463391614, "learning_rate": 3.3578674859052196e-07, "loss": 0.0002, "step": 10417 }, { "epoch": 4.739763421292084, "grad_norm": 0.21478066489433126, "learning_rate": 3.3462039383354015e-07, "loss": 0.004, "step": 10418 }, { "epoch": 4.740218380345769, "grad_norm": 0.07924274902421959, "learning_rate": 3.3345605461999053e-07, "loss": 0.0004, "step": 10419 }, { "epoch": 4.740673339399454, "grad_norm": 0.48732324477101674, "learning_rate": 3.3229373104501636e-07, "loss": 0.0122, "step": 10420 }, { "epoch": 4.741128298453139, "grad_norm": 0.010540424797806586, "learning_rate": 3.3113342320360285e-07, "loss": 0.0001, "step": 10421 }, { "epoch": 4.741583257506824, "grad_norm": 0.015423679591025743, "learning_rate": 3.2997513119056025e-07, "loss": 0.0002, "step": 10422 }, { "epoch": 4.742038216560509, "grad_norm": 0.1830275218236165, "learning_rate": 3.288188551005433e-07, "loss": 0.0038, "step": 10423 }, { "epoch": 4.742493175614195, "grad_norm": 0.06411120712156267, "learning_rate": 3.2766459502803494e-07, "loss": 0.0012, "step": 10424 }, { "epoch": 4.74294813466788, "grad_norm": 0.13210194805751113, "learning_rate": 3.2651235106735403e-07, "loss": 0.0014, "step": 10425 }, { "epoch": 4.743403093721565, "grad_norm": 0.1857370043256521, "learning_rate": 3.2536212331266413e-07, "loss": 0.0022, "step": 10426 }, { "epoch": 4.74385805277525, "grad_norm": 0.043181948313797715, "learning_rate": 3.2421391185794846e-07, "loss": 0.0003, "step": 10427 }, { "epoch": 4.744313011828935, "grad_norm": 0.007758509217299212, "learning_rate": 3.230677167970403e-07, "loss": 0.0001, "step": 10428 }, { "epoch": 4.744767970882621, "grad_norm": 0.03480048354166414, "learning_rate": 3.2192353822359243e-07, "loss": 0.0008, "step": 10429 }, { "epoch": 4.745222929936306, "grad_norm": 0.4233034383003463, "learning_rate": 3.207813762311107e-07, "loss": 0.0054, "step": 10430 }, { "epoch": 4.745677888989991, "grad_norm": 0.18459436083555678, "learning_rate": 3.1964123091292595e-07, "loss": 0.0022, "step": 10431 }, { "epoch": 4.746132848043676, "grad_norm": 0.07091357877111099, "learning_rate": 3.185031023622026e-07, "loss": 0.0008, "step": 10432 }, { "epoch": 4.746587807097361, "grad_norm": 0.1521410310901387, "learning_rate": 3.1736699067194677e-07, "loss": 0.0024, "step": 10433 }, { "epoch": 4.747042766151046, "grad_norm": 0.14592853456489405, "learning_rate": 3.162328959349925e-07, "loss": 0.0011, "step": 10434 }, { "epoch": 4.747497725204732, "grad_norm": 0.02313389379198112, "learning_rate": 3.151008182440185e-07, "loss": 0.0002, "step": 10435 }, { "epoch": 4.747952684258417, "grad_norm": 0.17684113059337647, "learning_rate": 3.1397075769152574e-07, "loss": 0.0012, "step": 10436 }, { "epoch": 4.748407643312102, "grad_norm": 0.20670275536175894, "learning_rate": 3.128427143698626e-07, "loss": 0.0034, "step": 10437 }, { "epoch": 4.748862602365787, "grad_norm": 0.05147670945286942, "learning_rate": 3.1171668837120805e-07, "loss": 0.001, "step": 10438 }, { "epoch": 4.749317561419472, "grad_norm": 0.06497750267471597, "learning_rate": 3.105926797875747e-07, "loss": 0.0007, "step": 10439 }, { "epoch": 4.749772520473157, "grad_norm": 0.10385655314287064, "learning_rate": 3.0947068871080844e-07, "loss": 0.0022, "step": 10440 }, { "epoch": 4.750227479526843, "grad_norm": 0.18643037449179334, "learning_rate": 3.0835071523259983e-07, "loss": 0.005, "step": 10441 }, { "epoch": 4.750682438580528, "grad_norm": 0.02799950115272164, "learning_rate": 3.0723275944446183e-07, "loss": 0.0002, "step": 10442 }, { "epoch": 4.751137397634213, "grad_norm": 0.07696743519706353, "learning_rate": 3.0611682143775187e-07, "loss": 0.0028, "step": 10443 }, { "epoch": 4.751592356687898, "grad_norm": 0.3717573552647083, "learning_rate": 3.0500290130365536e-07, "loss": 0.0132, "step": 10444 }, { "epoch": 4.752047315741583, "grad_norm": 0.3305443289192515, "learning_rate": 3.0389099913320506e-07, "loss": 0.0066, "step": 10445 }, { "epoch": 4.752502274795268, "grad_norm": 0.12775184060875724, "learning_rate": 3.0278111501725325e-07, "loss": 0.0026, "step": 10446 }, { "epoch": 4.752957233848954, "grad_norm": 0.05913784271402879, "learning_rate": 3.0167324904649963e-07, "loss": 0.001, "step": 10447 }, { "epoch": 4.753412192902639, "grad_norm": 0.11870742014628942, "learning_rate": 3.0056740131146624e-07, "loss": 0.0005, "step": 10448 }, { "epoch": 4.753867151956324, "grad_norm": 0.07465720903391744, "learning_rate": 2.9946357190252794e-07, "loss": 0.0005, "step": 10449 }, { "epoch": 4.754322111010009, "grad_norm": 0.16687207340069157, "learning_rate": 2.9836176090987656e-07, "loss": 0.0011, "step": 10450 }, { "epoch": 4.754777070063694, "grad_norm": 0.12237571118765003, "learning_rate": 2.972619684235539e-07, "loss": 0.0039, "step": 10451 }, { "epoch": 4.755232029117379, "grad_norm": 0.19709528252136088, "learning_rate": 2.9616419453342423e-07, "loss": 0.0044, "step": 10452 }, { "epoch": 4.755686988171065, "grad_norm": 0.05067009264333823, "learning_rate": 2.9506843932919635e-07, "loss": 0.0006, "step": 10453 }, { "epoch": 4.75614194722475, "grad_norm": 0.26785323015154133, "learning_rate": 2.9397470290040697e-07, "loss": 0.0072, "step": 10454 }, { "epoch": 4.756596906278435, "grad_norm": 0.18751158280068442, "learning_rate": 2.9288298533643454e-07, "loss": 0.0012, "step": 10455 }, { "epoch": 4.75705186533212, "grad_norm": 0.21650618867925642, "learning_rate": 2.917932867264911e-07, "loss": 0.003, "step": 10456 }, { "epoch": 4.757506824385805, "grad_norm": 0.07979117064531524, "learning_rate": 2.907056071596137e-07, "loss": 0.001, "step": 10457 }, { "epoch": 4.757961783439491, "grad_norm": 0.023904582977913435, "learning_rate": 2.896199467246924e-07, "loss": 0.0003, "step": 10458 }, { "epoch": 4.758416742493176, "grad_norm": 0.29620667976379333, "learning_rate": 2.8853630551043397e-07, "loss": 0.0027, "step": 10459 }, { "epoch": 4.758871701546861, "grad_norm": 0.053445453568019996, "learning_rate": 2.874546836053954e-07, "loss": 0.0006, "step": 10460 }, { "epoch": 4.759326660600546, "grad_norm": 0.13719304671534233, "learning_rate": 2.863750810979587e-07, "loss": 0.0004, "step": 10461 }, { "epoch": 4.759781619654231, "grad_norm": 0.20544778211028542, "learning_rate": 2.852974980763451e-07, "loss": 0.0019, "step": 10462 }, { "epoch": 4.760236578707916, "grad_norm": 0.20726746161590984, "learning_rate": 2.84221934628609e-07, "loss": 0.0027, "step": 10463 }, { "epoch": 4.760691537761602, "grad_norm": 0.1114645888233743, "learning_rate": 2.8314839084263857e-07, "loss": 0.0028, "step": 10464 }, { "epoch": 4.761146496815287, "grad_norm": 0.16225943904305654, "learning_rate": 2.820768668061635e-07, "loss": 0.0007, "step": 10465 }, { "epoch": 4.761601455868972, "grad_norm": 0.1755806902758662, "learning_rate": 2.8100736260674443e-07, "loss": 0.0032, "step": 10466 }, { "epoch": 4.762056414922657, "grad_norm": 0.10336485243073132, "learning_rate": 2.799398783317697e-07, "loss": 0.0027, "step": 10467 }, { "epoch": 4.762511373976342, "grad_norm": 0.07067883957772193, "learning_rate": 2.7887441406847513e-07, "loss": 0.001, "step": 10468 }, { "epoch": 4.762966333030027, "grad_norm": 0.010224316958447142, "learning_rate": 2.7781096990392444e-07, "loss": 0.0001, "step": 10469 }, { "epoch": 4.763421292083713, "grad_norm": 0.0023692097389952124, "learning_rate": 2.767495459250147e-07, "loss": 0.0, "step": 10470 }, { "epoch": 4.763876251137398, "grad_norm": 0.04240309640167185, "learning_rate": 2.7569014221848213e-07, "loss": 0.0003, "step": 10471 }, { "epoch": 4.764331210191083, "grad_norm": 0.1243436950710241, "learning_rate": 2.746327588709019e-07, "loss": 0.0027, "step": 10472 }, { "epoch": 4.764786169244768, "grad_norm": 0.04928612976228861, "learning_rate": 2.735773959686688e-07, "loss": 0.0006, "step": 10473 }, { "epoch": 4.765241128298453, "grad_norm": 0.02917306959494334, "learning_rate": 2.7252405359803056e-07, "loss": 0.0004, "step": 10474 }, { "epoch": 4.765696087352138, "grad_norm": 0.1214395681444921, "learning_rate": 2.714727318450572e-07, "loss": 0.0003, "step": 10475 }, { "epoch": 4.766151046405824, "grad_norm": 0.12108754307298725, "learning_rate": 2.7042343079566046e-07, "loss": 0.0017, "step": 10476 }, { "epoch": 4.766606005459509, "grad_norm": 0.010455091535691937, "learning_rate": 2.693761505355802e-07, "loss": 0.0001, "step": 10477 }, { "epoch": 4.767060964513194, "grad_norm": 0.29112238436732796, "learning_rate": 2.6833089115039787e-07, "loss": 0.0005, "step": 10478 }, { "epoch": 4.767515923566879, "grad_norm": 0.07325697967069078, "learning_rate": 2.672876527255314e-07, "loss": 0.0011, "step": 10479 }, { "epoch": 4.767970882620564, "grad_norm": 0.12983506342283346, "learning_rate": 2.662464353462263e-07, "loss": 0.0019, "step": 10480 }, { "epoch": 4.768425841674249, "grad_norm": 0.20150736584149284, "learning_rate": 2.652072390975646e-07, "loss": 0.0171, "step": 10481 }, { "epoch": 4.768880800727935, "grad_norm": 0.0640056521560067, "learning_rate": 2.6417006406446454e-07, "loss": 0.0007, "step": 10482 }, { "epoch": 4.76933575978162, "grad_norm": 0.04270289517664768, "learning_rate": 2.631349103316805e-07, "loss": 0.0005, "step": 10483 }, { "epoch": 4.769790718835305, "grad_norm": 0.11458337200769761, "learning_rate": 2.621017779838031e-07, "loss": 0.0023, "step": 10484 }, { "epoch": 4.77024567788899, "grad_norm": 0.17699439258187302, "learning_rate": 2.6107066710525097e-07, "loss": 0.0022, "step": 10485 }, { "epoch": 4.770700636942675, "grad_norm": 0.1399800005383537, "learning_rate": 2.600415777802873e-07, "loss": 0.0043, "step": 10486 }, { "epoch": 4.77115559599636, "grad_norm": 0.15910276773854046, "learning_rate": 2.590145100929975e-07, "loss": 0.0032, "step": 10487 }, { "epoch": 4.771610555050046, "grad_norm": 0.20252353991732552, "learning_rate": 2.579894641273145e-07, "loss": 0.0027, "step": 10488 }, { "epoch": 4.772065514103731, "grad_norm": 0.1703731409673555, "learning_rate": 2.569664399669991e-07, "loss": 0.0029, "step": 10489 }, { "epoch": 4.772520473157416, "grad_norm": 0.06789399665579297, "learning_rate": 2.559454376956483e-07, "loss": 0.0015, "step": 10490 }, { "epoch": 4.772975432211101, "grad_norm": 0.3344633982952753, "learning_rate": 2.549264573966925e-07, "loss": 0.0034, "step": 10491 }, { "epoch": 4.773430391264786, "grad_norm": 0.16773730513845453, "learning_rate": 2.5390949915339577e-07, "loss": 0.0037, "step": 10492 }, { "epoch": 4.773885350318471, "grad_norm": 0.1725114929666816, "learning_rate": 2.528945630488638e-07, "loss": 0.0022, "step": 10493 }, { "epoch": 4.774340309372157, "grad_norm": 0.1821565803691434, "learning_rate": 2.5188164916603307e-07, "loss": 0.0059, "step": 10494 }, { "epoch": 4.774795268425842, "grad_norm": 0.28099019940138603, "learning_rate": 2.5087075758767063e-07, "loss": 0.0021, "step": 10495 }, { "epoch": 4.7752502274795265, "grad_norm": 0.016326497699111063, "learning_rate": 2.498618883963855e-07, "loss": 0.0002, "step": 10496 }, { "epoch": 4.775705186533212, "grad_norm": 0.10458015970230355, "learning_rate": 2.488550416746144e-07, "loss": 0.001, "step": 10497 }, { "epoch": 4.776160145586897, "grad_norm": 0.16442176439675704, "learning_rate": 2.47850217504636e-07, "loss": 0.002, "step": 10498 }, { "epoch": 4.776615104640582, "grad_norm": 0.3360128109015967, "learning_rate": 2.468474159685569e-07, "loss": 0.0034, "step": 10499 }, { "epoch": 4.777070063694268, "grad_norm": 0.10220250914611825, "learning_rate": 2.458466371483226e-07, "loss": 0.001, "step": 10500 }, { "epoch": 4.777525022747953, "grad_norm": 0.06413576112272924, "learning_rate": 2.448478811257149e-07, "loss": 0.0007, "step": 10501 }, { "epoch": 4.7779799818016375, "grad_norm": 0.22213333111999387, "learning_rate": 2.438511479823408e-07, "loss": 0.0034, "step": 10502 }, { "epoch": 4.778434940855323, "grad_norm": 0.04668819045352645, "learning_rate": 2.428564377996545e-07, "loss": 0.0004, "step": 10503 }, { "epoch": 4.778889899909008, "grad_norm": 0.17108748961921083, "learning_rate": 2.4186375065894104e-07, "loss": 0.0046, "step": 10504 }, { "epoch": 4.779344858962693, "grad_norm": 0.029553898527738924, "learning_rate": 2.4087308664131335e-07, "loss": 0.0002, "step": 10505 }, { "epoch": 4.779799818016379, "grad_norm": 0.07916465640611325, "learning_rate": 2.398844458277233e-07, "loss": 0.0005, "step": 10506 }, { "epoch": 4.780254777070064, "grad_norm": 0.022116765687175216, "learning_rate": 2.3889782829896177e-07, "loss": 0.0002, "step": 10507 }, { "epoch": 4.7807097361237485, "grad_norm": 0.1470549656664803, "learning_rate": 2.3791323413565047e-07, "loss": 0.0024, "step": 10508 }, { "epoch": 4.781164695177434, "grad_norm": 0.09699373680080088, "learning_rate": 2.3693066341824445e-07, "loss": 0.0032, "step": 10509 }, { "epoch": 4.781619654231119, "grad_norm": 0.02137480429022692, "learning_rate": 2.3595011622703777e-07, "loss": 0.0004, "step": 10510 }, { "epoch": 4.782074613284804, "grad_norm": 0.20128755021754863, "learning_rate": 2.3497159264214973e-07, "loss": 0.0049, "step": 10511 }, { "epoch": 4.78252957233849, "grad_norm": 0.2734289021801442, "learning_rate": 2.339950927435497e-07, "loss": 0.001, "step": 10512 }, { "epoch": 4.782984531392175, "grad_norm": 0.1737805051581366, "learning_rate": 2.330206166110238e-07, "loss": 0.0056, "step": 10513 }, { "epoch": 4.7834394904458595, "grad_norm": 0.07758490645698386, "learning_rate": 2.3204816432421118e-07, "loss": 0.0013, "step": 10514 }, { "epoch": 4.783894449499545, "grad_norm": 0.12251350532515728, "learning_rate": 2.3107773596257043e-07, "loss": 0.0032, "step": 10515 }, { "epoch": 4.78434940855323, "grad_norm": 0.021360914341323537, "learning_rate": 2.3010933160539927e-07, "loss": 0.0004, "step": 10516 }, { "epoch": 4.784804367606915, "grad_norm": 0.2791778322058837, "learning_rate": 2.2914295133183438e-07, "loss": 0.007, "step": 10517 }, { "epoch": 4.785259326660601, "grad_norm": 0.27802383036695705, "learning_rate": 2.2817859522084596e-07, "loss": 0.0122, "step": 10518 }, { "epoch": 4.785714285714286, "grad_norm": 0.03161529481514898, "learning_rate": 2.2721626335123202e-07, "loss": 0.0004, "step": 10519 }, { "epoch": 4.7861692447679705, "grad_norm": 0.13478734624432054, "learning_rate": 2.262559558016325e-07, "loss": 0.003, "step": 10520 }, { "epoch": 4.786624203821656, "grad_norm": 0.03902348134850618, "learning_rate": 2.2529767265051793e-07, "loss": 0.0003, "step": 10521 }, { "epoch": 4.787079162875341, "grad_norm": 0.01025973685770281, "learning_rate": 2.2434141397619512e-07, "loss": 0.0001, "step": 10522 }, { "epoch": 4.787534121929026, "grad_norm": 0.1764176278896413, "learning_rate": 2.233871798568099e-07, "loss": 0.0056, "step": 10523 }, { "epoch": 4.787989080982712, "grad_norm": 0.009108121812903235, "learning_rate": 2.2243497037033322e-07, "loss": 0.0001, "step": 10524 }, { "epoch": 4.788444040036397, "grad_norm": 0.2873676017066391, "learning_rate": 2.2148478559457508e-07, "loss": 0.0038, "step": 10525 }, { "epoch": 4.788898999090081, "grad_norm": 0.11076412775152178, "learning_rate": 2.205366256071817e-07, "loss": 0.0018, "step": 10526 }, { "epoch": 4.789353958143767, "grad_norm": 0.28920452153780873, "learning_rate": 2.1959049048562996e-07, "loss": 0.0031, "step": 10527 }, { "epoch": 4.789808917197452, "grad_norm": 0.2331656380598316, "learning_rate": 2.186463803072386e-07, "loss": 0.0036, "step": 10528 }, { "epoch": 4.790263876251137, "grad_norm": 0.07561233018516547, "learning_rate": 2.1770429514915426e-07, "loss": 0.0024, "step": 10529 }, { "epoch": 4.790718835304823, "grad_norm": 0.021277632591686656, "learning_rate": 2.1676423508835697e-07, "loss": 0.0002, "step": 10530 }, { "epoch": 4.7911737943585075, "grad_norm": 0.11407744377016579, "learning_rate": 2.158262002016659e-07, "loss": 0.0013, "step": 10531 }, { "epoch": 4.791628753412192, "grad_norm": 0.07367511824804707, "learning_rate": 2.1489019056573634e-07, "loss": 0.0013, "step": 10532 }, { "epoch": 4.792083712465878, "grad_norm": 0.024591264437256653, "learning_rate": 2.1395620625704882e-07, "loss": 0.0002, "step": 10533 }, { "epoch": 4.792538671519563, "grad_norm": 0.39299114495991455, "learning_rate": 2.1302424735192838e-07, "loss": 0.0036, "step": 10534 }, { "epoch": 4.792993630573249, "grad_norm": 0.19002889094485884, "learning_rate": 2.120943139265308e-07, "loss": 0.0057, "step": 10535 }, { "epoch": 4.793448589626934, "grad_norm": 0.0987294624206646, "learning_rate": 2.1116640605684247e-07, "loss": 0.0018, "step": 10536 }, { "epoch": 4.7939035486806185, "grad_norm": 0.0260139008145231, "learning_rate": 2.1024052381869165e-07, "loss": 0.0002, "step": 10537 }, { "epoch": 4.794358507734304, "grad_norm": 0.31762630098242783, "learning_rate": 2.0931666728773448e-07, "loss": 0.0059, "step": 10538 }, { "epoch": 4.794813466787989, "grad_norm": 0.04461232052913572, "learning_rate": 2.0839483653946612e-07, "loss": 0.0007, "step": 10539 }, { "epoch": 4.795268425841674, "grad_norm": 0.10515279835067716, "learning_rate": 2.0747503164921523e-07, "loss": 0.011, "step": 10540 }, { "epoch": 4.79572338489536, "grad_norm": 0.026410477839585993, "learning_rate": 2.0655725269213834e-07, "loss": 0.0003, "step": 10541 }, { "epoch": 4.796178343949045, "grad_norm": 0.015849868740162854, "learning_rate": 2.056414997432421e-07, "loss": 0.0001, "step": 10542 }, { "epoch": 4.7966333030027295, "grad_norm": 0.10233976330465434, "learning_rate": 2.0472777287735e-07, "loss": 0.0009, "step": 10543 }, { "epoch": 4.797088262056415, "grad_norm": 0.2604863988797712, "learning_rate": 2.038160721691301e-07, "loss": 0.0041, "step": 10544 }, { "epoch": 4.7975432211101, "grad_norm": 0.17582645438587519, "learning_rate": 2.029063976930784e-07, "loss": 0.0013, "step": 10545 }, { "epoch": 4.797998180163785, "grad_norm": 0.0578538278414178, "learning_rate": 2.0199874952353814e-07, "loss": 0.0007, "step": 10546 }, { "epoch": 4.798453139217471, "grad_norm": 0.18041864967940646, "learning_rate": 2.0109312773467225e-07, "loss": 0.0018, "step": 10547 }, { "epoch": 4.798908098271156, "grad_norm": 0.18232785213739436, "learning_rate": 2.0018953240048265e-07, "loss": 0.004, "step": 10548 }, { "epoch": 4.7993630573248405, "grad_norm": 0.07756970940522714, "learning_rate": 1.9928796359481306e-07, "loss": 0.0007, "step": 10549 }, { "epoch": 4.799818016378526, "grad_norm": 0.09876276159987472, "learning_rate": 1.9838842139132952e-07, "loss": 0.0021, "step": 10550 }, { "epoch": 4.800272975432211, "grad_norm": 0.028445315211383618, "learning_rate": 1.974909058635399e-07, "loss": 0.0005, "step": 10551 }, { "epoch": 4.800727934485896, "grad_norm": 0.08122551733172939, "learning_rate": 1.9659541708478834e-07, "loss": 0.0018, "step": 10552 }, { "epoch": 4.801182893539582, "grad_norm": 0.03444632506769072, "learning_rate": 1.957019551282496e-07, "loss": 0.0003, "step": 10553 }, { "epoch": 4.801637852593267, "grad_norm": 0.15605521992477542, "learning_rate": 1.9481052006692922e-07, "loss": 0.002, "step": 10554 }, { "epoch": 4.8020928116469515, "grad_norm": 0.05143719137968244, "learning_rate": 1.9392111197367447e-07, "loss": 0.0007, "step": 10555 }, { "epoch": 4.802547770700637, "grad_norm": 0.26907059426218655, "learning_rate": 1.930337309211633e-07, "loss": 0.0038, "step": 10556 }, { "epoch": 4.803002729754322, "grad_norm": 0.21607506774370797, "learning_rate": 1.9214837698190992e-07, "loss": 0.0044, "step": 10557 }, { "epoch": 4.803457688808007, "grad_norm": 0.10620527339633083, "learning_rate": 1.9126505022825924e-07, "loss": 0.0029, "step": 10558 }, { "epoch": 4.803912647861693, "grad_norm": 0.08972865704205184, "learning_rate": 1.9038375073239245e-07, "loss": 0.0008, "step": 10559 }, { "epoch": 4.804367606915378, "grad_norm": 0.1487344907594945, "learning_rate": 1.895044785663269e-07, "loss": 0.0005, "step": 10560 }, { "epoch": 4.804822565969062, "grad_norm": 0.09607812832079513, "learning_rate": 1.8862723380191072e-07, "loss": 0.0027, "step": 10561 }, { "epoch": 4.805277525022748, "grad_norm": 0.007786215930126035, "learning_rate": 1.8775201651083096e-07, "loss": 0.0001, "step": 10562 }, { "epoch": 4.805732484076433, "grad_norm": 0.15674332692089316, "learning_rate": 1.8687882676460544e-07, "loss": 0.0077, "step": 10563 }, { "epoch": 4.806187443130118, "grad_norm": 0.057133576389738976, "learning_rate": 1.860076646345882e-07, "loss": 0.0004, "step": 10564 }, { "epoch": 4.806642402183804, "grad_norm": 0.21197988688812427, "learning_rate": 1.8513853019196391e-07, "loss": 0.003, "step": 10565 }, { "epoch": 4.8070973612374885, "grad_norm": 0.016718193394488167, "learning_rate": 1.8427142350775638e-07, "loss": 0.0002, "step": 10566 }, { "epoch": 4.807552320291173, "grad_norm": 0.15447242413038464, "learning_rate": 1.834063446528228e-07, "loss": 0.0019, "step": 10567 }, { "epoch": 4.808007279344859, "grad_norm": 0.07556198719956644, "learning_rate": 1.8254329369785107e-07, "loss": 0.0007, "step": 10568 }, { "epoch": 4.808462238398544, "grad_norm": 0.21780577109892177, "learning_rate": 1.816822707133653e-07, "loss": 0.0025, "step": 10569 }, { "epoch": 4.80891719745223, "grad_norm": 0.13302261967023926, "learning_rate": 1.8082327576972591e-07, "loss": 0.0016, "step": 10570 }, { "epoch": 4.809372156505915, "grad_norm": 0.07121711895488701, "learning_rate": 1.7996630893712675e-07, "loss": 0.001, "step": 10571 }, { "epoch": 4.8098271155595995, "grad_norm": 0.16816957891166923, "learning_rate": 1.791113702855951e-07, "loss": 0.0017, "step": 10572 }, { "epoch": 4.810282074613285, "grad_norm": 0.43919040971097584, "learning_rate": 1.7825845988499178e-07, "loss": 0.0092, "step": 10573 }, { "epoch": 4.81073703366697, "grad_norm": 0.04304974259224572, "learning_rate": 1.7740757780501384e-07, "loss": 0.0004, "step": 10574 }, { "epoch": 4.811191992720655, "grad_norm": 0.16407511902849187, "learning_rate": 1.7655872411518893e-07, "loss": 0.0024, "step": 10575 }, { "epoch": 4.811646951774341, "grad_norm": 0.06693827699500107, "learning_rate": 1.757118988848838e-07, "loss": 0.0006, "step": 10576 }, { "epoch": 4.812101910828026, "grad_norm": 0.09188084046396829, "learning_rate": 1.7486710218329872e-07, "loss": 0.0012, "step": 10577 }, { "epoch": 4.8125568698817105, "grad_norm": 0.11309941486117173, "learning_rate": 1.740243340794645e-07, "loss": 0.001, "step": 10578 }, { "epoch": 4.813011828935396, "grad_norm": 0.23453441231357808, "learning_rate": 1.7318359464224553e-07, "loss": 0.0009, "step": 10579 }, { "epoch": 4.813466787989081, "grad_norm": 0.11154737322756579, "learning_rate": 1.7234488394034797e-07, "loss": 0.005, "step": 10580 }, { "epoch": 4.813921747042766, "grad_norm": 0.17723145645643418, "learning_rate": 1.7150820204230865e-07, "loss": 0.003, "step": 10581 }, { "epoch": 4.814376706096452, "grad_norm": 0.04502235431854445, "learning_rate": 1.7067354901649236e-07, "loss": 0.0006, "step": 10582 }, { "epoch": 4.814831665150137, "grad_norm": 0.15410305129562593, "learning_rate": 1.6984092493110283e-07, "loss": 0.0032, "step": 10583 }, { "epoch": 4.8152866242038215, "grad_norm": 0.1069065900123098, "learning_rate": 1.6901032985418286e-07, "loss": 0.0015, "step": 10584 }, { "epoch": 4.815741583257507, "grad_norm": 0.08026159081047679, "learning_rate": 1.6818176385360317e-07, "loss": 0.001, "step": 10585 }, { "epoch": 4.816196542311192, "grad_norm": 0.048925190255150734, "learning_rate": 1.6735522699707073e-07, "loss": 0.0004, "step": 10586 }, { "epoch": 4.816651501364877, "grad_norm": 0.041801940206804875, "learning_rate": 1.665307193521287e-07, "loss": 0.0004, "step": 10587 }, { "epoch": 4.817106460418563, "grad_norm": 0.07062626055005944, "learning_rate": 1.6570824098614547e-07, "loss": 0.0024, "step": 10588 }, { "epoch": 4.817561419472248, "grad_norm": 0.04164324928542172, "learning_rate": 1.6488779196633386e-07, "loss": 0.0007, "step": 10589 }, { "epoch": 4.8180163785259325, "grad_norm": 0.12959816770185956, "learning_rate": 1.6406937235973752e-07, "loss": 0.0008, "step": 10590 }, { "epoch": 4.818471337579618, "grad_norm": 0.16765106896728782, "learning_rate": 1.6325298223323627e-07, "loss": 0.001, "step": 10591 }, { "epoch": 4.818926296633303, "grad_norm": 0.14134503362412396, "learning_rate": 1.6243862165353784e-07, "loss": 0.0024, "step": 10592 }, { "epoch": 4.819381255686988, "grad_norm": 0.0852730693639794, "learning_rate": 1.6162629068718903e-07, "loss": 0.0013, "step": 10593 }, { "epoch": 4.819836214740674, "grad_norm": 0.14057825807758473, "learning_rate": 1.6081598940057285e-07, "loss": 0.0097, "step": 10594 }, { "epoch": 4.820291173794359, "grad_norm": 0.18249578661507643, "learning_rate": 1.6000771785990022e-07, "loss": 0.0056, "step": 10595 }, { "epoch": 4.820746132848043, "grad_norm": 0.12095789138367243, "learning_rate": 1.5920147613122105e-07, "loss": 0.0036, "step": 10596 }, { "epoch": 4.821201091901729, "grad_norm": 0.05803996257644538, "learning_rate": 1.5839726428041602e-07, "loss": 0.0003, "step": 10597 }, { "epoch": 4.821656050955414, "grad_norm": 0.2700515774312671, "learning_rate": 1.5759508237320474e-07, "loss": 0.0074, "step": 10598 }, { "epoch": 4.822111010009099, "grad_norm": 0.2581403949707326, "learning_rate": 1.5679493047513483e-07, "loss": 0.0028, "step": 10599 }, { "epoch": 4.822565969062785, "grad_norm": 0.24770924582156326, "learning_rate": 1.5599680865159284e-07, "loss": 0.0045, "step": 10600 }, { "epoch": 4.8230209281164695, "grad_norm": 0.17268389841952356, "learning_rate": 1.5520071696779604e-07, "loss": 0.0016, "step": 10601 }, { "epoch": 4.823475887170154, "grad_norm": 0.1587181931044445, "learning_rate": 1.5440665548879794e-07, "loss": 0.0015, "step": 10602 }, { "epoch": 4.82393084622384, "grad_norm": 0.2409665488083132, "learning_rate": 1.5361462427948837e-07, "loss": 0.0059, "step": 10603 }, { "epoch": 4.824385805277525, "grad_norm": 0.015845977679135197, "learning_rate": 1.5282462340458492e-07, "loss": 0.0002, "step": 10604 }, { "epoch": 4.82484076433121, "grad_norm": 0.1600431566990502, "learning_rate": 1.5203665292864434e-07, "loss": 0.002, "step": 10605 }, { "epoch": 4.825295723384896, "grad_norm": 0.09824295944566014, "learning_rate": 1.5125071291605676e-07, "loss": 0.0019, "step": 10606 }, { "epoch": 4.8257506824385805, "grad_norm": 0.012150203797153617, "learning_rate": 1.504668034310458e-07, "loss": 0.0001, "step": 10607 }, { "epoch": 4.826205641492265, "grad_norm": 0.28637564322086045, "learning_rate": 1.496849245376658e-07, "loss": 0.003, "step": 10608 }, { "epoch": 4.826660600545951, "grad_norm": 0.0996500513034018, "learning_rate": 1.4890507629981288e-07, "loss": 0.0008, "step": 10609 }, { "epoch": 4.827115559599636, "grad_norm": 0.007921803195493663, "learning_rate": 1.4812725878120827e-07, "loss": 0.0001, "step": 10610 }, { "epoch": 4.827570518653321, "grad_norm": 0.21064172373787104, "learning_rate": 1.47351472045415e-07, "loss": 0.0038, "step": 10611 }, { "epoch": 4.828025477707007, "grad_norm": 0.010433422620730602, "learning_rate": 1.4657771615582683e-07, "loss": 0.0002, "step": 10612 }, { "epoch": 4.8284804367606915, "grad_norm": 0.09640699239800463, "learning_rate": 1.4580599117567095e-07, "loss": 0.0007, "step": 10613 }, { "epoch": 4.828935395814376, "grad_norm": 0.12487106728472631, "learning_rate": 1.4503629716800803e-07, "loss": 0.0014, "step": 10614 }, { "epoch": 4.829390354868062, "grad_norm": 0.05815076566129801, "learning_rate": 1.44268634195735e-07, "loss": 0.0003, "step": 10615 }, { "epoch": 4.829845313921747, "grad_norm": 0.14258146184671405, "learning_rate": 1.435030023215822e-07, "loss": 0.0014, "step": 10616 }, { "epoch": 4.830300272975432, "grad_norm": 0.046457174817505435, "learning_rate": 1.4273940160811073e-07, "loss": 0.0005, "step": 10617 }, { "epoch": 4.830755232029118, "grad_norm": 0.09213058191597528, "learning_rate": 1.4197783211772342e-07, "loss": 0.0008, "step": 10618 }, { "epoch": 4.8312101910828025, "grad_norm": 0.17645734218950263, "learning_rate": 1.412182939126483e-07, "loss": 0.0006, "step": 10619 }, { "epoch": 4.831665150136487, "grad_norm": 0.07684373559705708, "learning_rate": 1.4046078705495512e-07, "loss": 0.0011, "step": 10620 }, { "epoch": 4.832120109190173, "grad_norm": 0.1032441143337584, "learning_rate": 1.3970531160654167e-07, "loss": 0.0009, "step": 10621 }, { "epoch": 4.832575068243858, "grad_norm": 0.35188798091886536, "learning_rate": 1.3895186762913902e-07, "loss": 0.0034, "step": 10622 }, { "epoch": 4.833030027297543, "grad_norm": 0.10686985938628109, "learning_rate": 1.3820045518432024e-07, "loss": 0.006, "step": 10623 }, { "epoch": 4.833484986351229, "grad_norm": 0.11472086928571278, "learning_rate": 1.3745107433348614e-07, "loss": 0.0015, "step": 10624 }, { "epoch": 4.8339399454049135, "grad_norm": 0.09659086005024256, "learning_rate": 1.3670372513787112e-07, "loss": 0.0021, "step": 10625 }, { "epoch": 4.834394904458598, "grad_norm": 0.029068160075943532, "learning_rate": 1.3595840765854572e-07, "loss": 0.0003, "step": 10626 }, { "epoch": 4.834849863512284, "grad_norm": 0.12053130155737073, "learning_rate": 1.3521512195641407e-07, "loss": 0.002, "step": 10627 }, { "epoch": 4.835304822565969, "grad_norm": 0.05843095534608275, "learning_rate": 1.3447386809221363e-07, "loss": 0.0004, "step": 10628 }, { "epoch": 4.835759781619654, "grad_norm": 0.15573909327030339, "learning_rate": 1.337346461265182e-07, "loss": 0.0027, "step": 10629 }, { "epoch": 4.83621474067334, "grad_norm": 0.4557519216954905, "learning_rate": 1.3299745611973223e-07, "loss": 0.004, "step": 10630 }, { "epoch": 4.836669699727024, "grad_norm": 0.0988701021646587, "learning_rate": 1.3226229813209645e-07, "loss": 0.0006, "step": 10631 }, { "epoch": 4.837124658780709, "grad_norm": 0.0795657155483214, "learning_rate": 1.315291722236822e-07, "loss": 0.0014, "step": 10632 }, { "epoch": 4.837579617834395, "grad_norm": 0.12102890444362675, "learning_rate": 1.3079807845439995e-07, "loss": 0.0023, "step": 10633 }, { "epoch": 4.83803457688808, "grad_norm": 0.028300118482779125, "learning_rate": 1.3006901688399075e-07, "loss": 0.0003, "step": 10634 }, { "epoch": 4.838489535941765, "grad_norm": 0.19183993860755924, "learning_rate": 1.293419875720292e-07, "loss": 0.0013, "step": 10635 }, { "epoch": 4.8389444949954505, "grad_norm": 0.6525366597224099, "learning_rate": 1.2861699057792887e-07, "loss": 0.0614, "step": 10636 }, { "epoch": 4.839399454049135, "grad_norm": 0.18942363351559127, "learning_rate": 1.278940259609257e-07, "loss": 0.0008, "step": 10637 }, { "epoch": 4.83985441310282, "grad_norm": 0.5499495067218343, "learning_rate": 1.2717309378010023e-07, "loss": 0.0017, "step": 10638 }, { "epoch": 4.840309372156506, "grad_norm": 0.3155667781601065, "learning_rate": 1.264541940943692e-07, "loss": 0.0097, "step": 10639 }, { "epoch": 4.840764331210191, "grad_norm": 0.12088928637269838, "learning_rate": 1.2573732696247176e-07, "loss": 0.0009, "step": 10640 }, { "epoch": 4.841219290263876, "grad_norm": 0.08002837318155073, "learning_rate": 1.250224924429888e-07, "loss": 0.0034, "step": 10641 }, { "epoch": 4.8416742493175615, "grad_norm": 0.03561725678106418, "learning_rate": 1.2430969059433196e-07, "loss": 0.0006, "step": 10642 }, { "epoch": 4.842129208371246, "grad_norm": 0.015246263456580925, "learning_rate": 1.2359892147474905e-07, "loss": 0.0003, "step": 10643 }, { "epoch": 4.842584167424932, "grad_norm": 0.09472649998665403, "learning_rate": 1.2289018514232419e-07, "loss": 0.0008, "step": 10644 }, { "epoch": 4.843039126478617, "grad_norm": 0.08188473892936972, "learning_rate": 1.221834816549666e-07, "loss": 0.0009, "step": 10645 }, { "epoch": 4.843494085532302, "grad_norm": 0.05653587403343606, "learning_rate": 1.2147881107043013e-07, "loss": 0.0004, "step": 10646 }, { "epoch": 4.843949044585988, "grad_norm": 0.09544651381731793, "learning_rate": 1.2077617344629365e-07, "loss": 0.0015, "step": 10647 }, { "epoch": 4.8444040036396725, "grad_norm": 0.12061361946343101, "learning_rate": 1.2007556883997517e-07, "loss": 0.0002, "step": 10648 }, { "epoch": 4.844858962693357, "grad_norm": 0.13061040408843488, "learning_rate": 1.1937699730872608e-07, "loss": 0.0003, "step": 10649 }, { "epoch": 4.845313921747043, "grad_norm": 0.32888716377955396, "learning_rate": 1.1868045890962575e-07, "loss": 0.0026, "step": 10650 }, { "epoch": 4.845768880800728, "grad_norm": 0.03524049566762271, "learning_rate": 1.179859536995953e-07, "loss": 0.0006, "step": 10651 }, { "epoch": 4.846223839854413, "grad_norm": 0.09311542847301098, "learning_rate": 1.1729348173538935e-07, "loss": 0.0018, "step": 10652 }, { "epoch": 4.846678798908099, "grad_norm": 0.10055980091361089, "learning_rate": 1.166030430735876e-07, "loss": 0.0006, "step": 10653 }, { "epoch": 4.8471337579617835, "grad_norm": 0.2501918864234568, "learning_rate": 1.159146377706144e-07, "loss": 0.0054, "step": 10654 }, { "epoch": 4.847588717015468, "grad_norm": 0.08995790572480673, "learning_rate": 1.1522826588272195e-07, "loss": 0.0019, "step": 10655 }, { "epoch": 4.848043676069154, "grad_norm": 0.3438458467536681, "learning_rate": 1.1454392746599595e-07, "loss": 0.0044, "step": 10656 }, { "epoch": 4.848498635122839, "grad_norm": 0.17184355213301344, "learning_rate": 1.1386162257636113e-07, "loss": 0.0012, "step": 10657 }, { "epoch": 4.848953594176524, "grad_norm": 0.15319272831062714, "learning_rate": 1.131813512695673e-07, "loss": 0.0029, "step": 10658 }, { "epoch": 4.84940855323021, "grad_norm": 0.06415891384614687, "learning_rate": 1.1250311360120336e-07, "loss": 0.0014, "step": 10659 }, { "epoch": 4.8498635122838945, "grad_norm": 0.018979902873822542, "learning_rate": 1.1182690962669718e-07, "loss": 0.0002, "step": 10660 }, { "epoch": 4.850318471337579, "grad_norm": 0.03250235096723667, "learning_rate": 1.1115273940130177e-07, "loss": 0.0003, "step": 10661 }, { "epoch": 4.850773430391265, "grad_norm": 0.021137550382576264, "learning_rate": 1.1048060298010642e-07, "loss": 0.0001, "step": 10662 }, { "epoch": 4.85122838944495, "grad_norm": 0.1841122657807046, "learning_rate": 1.0981050041803665e-07, "loss": 0.0025, "step": 10663 }, { "epoch": 4.851683348498635, "grad_norm": 0.14078960612115685, "learning_rate": 1.091424317698514e-07, "loss": 0.0004, "step": 10664 }, { "epoch": 4.852138307552321, "grad_norm": 0.2906612581062857, "learning_rate": 1.0847639709013758e-07, "loss": 0.0018, "step": 10665 }, { "epoch": 4.852593266606005, "grad_norm": 0.04218828459035705, "learning_rate": 1.0781239643332386e-07, "loss": 0.0005, "step": 10666 }, { "epoch": 4.85304822565969, "grad_norm": 0.267417619496183, "learning_rate": 1.0715042985366963e-07, "loss": 0.0082, "step": 10667 }, { "epoch": 4.853503184713376, "grad_norm": 0.297020537829627, "learning_rate": 1.0649049740526773e-07, "loss": 0.0104, "step": 10668 }, { "epoch": 4.853958143767061, "grad_norm": 0.08487572776022889, "learning_rate": 1.0583259914204447e-07, "loss": 0.0049, "step": 10669 }, { "epoch": 4.854413102820746, "grad_norm": 0.1563195277481319, "learning_rate": 1.0517673511775961e-07, "loss": 0.0021, "step": 10670 }, { "epoch": 4.8548680618744315, "grad_norm": 0.9766401261516076, "learning_rate": 1.045229053860064e-07, "loss": 0.0077, "step": 10671 }, { "epoch": 4.855323020928116, "grad_norm": 0.03760697400659421, "learning_rate": 1.038711100002171e-07, "loss": 0.0006, "step": 10672 }, { "epoch": 4.855777979981801, "grad_norm": 0.03595697572567338, "learning_rate": 1.032213490136491e-07, "loss": 0.0006, "step": 10673 }, { "epoch": 4.856232939035487, "grad_norm": 0.14267666821568087, "learning_rate": 1.0257362247939883e-07, "loss": 0.0012, "step": 10674 }, { "epoch": 4.856687898089172, "grad_norm": 0.44847379003449034, "learning_rate": 1.0192793045039894e-07, "loss": 0.0071, "step": 10675 }, { "epoch": 4.857142857142857, "grad_norm": 0.10693939081890279, "learning_rate": 1.0128427297940723e-07, "loss": 0.0015, "step": 10676 }, { "epoch": 4.8575978161965425, "grad_norm": 0.02687260853915465, "learning_rate": 1.006426501190233e-07, "loss": 0.0003, "step": 10677 }, { "epoch": 4.858052775250227, "grad_norm": 0.05601246300136848, "learning_rate": 1.0000306192168018e-07, "loss": 0.0009, "step": 10678 }, { "epoch": 4.858507734303913, "grad_norm": 0.07822341908779579, "learning_rate": 9.936550843963888e-08, "loss": 0.0006, "step": 10679 }, { "epoch": 4.858962693357598, "grad_norm": 0.03699013661449186, "learning_rate": 9.872998972499381e-08, "loss": 0.0007, "step": 10680 }, { "epoch": 4.859417652411283, "grad_norm": 0.15098663920224625, "learning_rate": 9.809650582968399e-08, "loss": 0.0004, "step": 10681 }, { "epoch": 4.859872611464969, "grad_norm": 0.037474621251392684, "learning_rate": 9.746505680547358e-08, "loss": 0.0004, "step": 10682 }, { "epoch": 4.8603275705186535, "grad_norm": 0.13265361243631826, "learning_rate": 9.68356427039574e-08, "loss": 0.0033, "step": 10683 }, { "epoch": 4.860782529572338, "grad_norm": 0.09024872449147321, "learning_rate": 9.620826357657208e-08, "loss": 0.0014, "step": 10684 }, { "epoch": 4.861237488626024, "grad_norm": 0.33550925548183436, "learning_rate": 9.558291947457942e-08, "loss": 0.0051, "step": 10685 }, { "epoch": 4.861692447679709, "grad_norm": 0.10031265437625429, "learning_rate": 9.495961044908852e-08, "loss": 0.0018, "step": 10686 }, { "epoch": 4.862147406733394, "grad_norm": 0.16343482561911532, "learning_rate": 9.433833655102254e-08, "loss": 0.0037, "step": 10687 }, { "epoch": 4.86260236578708, "grad_norm": 0.08648344886432427, "learning_rate": 9.371909783116028e-08, "loss": 0.0019, "step": 10688 }, { "epoch": 4.8630573248407645, "grad_norm": 0.08366660984342085, "learning_rate": 9.310189434009464e-08, "loss": 0.0013, "step": 10689 }, { "epoch": 4.863512283894449, "grad_norm": 0.297924317597371, "learning_rate": 9.248672612826303e-08, "loss": 0.0036, "step": 10690 }, { "epoch": 4.863967242948135, "grad_norm": 0.12752691023290058, "learning_rate": 9.187359324593636e-08, "loss": 0.0036, "step": 10691 }, { "epoch": 4.86442220200182, "grad_norm": 0.07106248491162631, "learning_rate": 9.126249574321344e-08, "loss": 0.0005, "step": 10692 }, { "epoch": 4.864877161055505, "grad_norm": 0.23424050567943253, "learning_rate": 9.065343367003487e-08, "loss": 0.0045, "step": 10693 }, { "epoch": 4.865332120109191, "grad_norm": 0.21315202594084065, "learning_rate": 9.004640707616641e-08, "loss": 0.0039, "step": 10694 }, { "epoch": 4.8657870791628755, "grad_norm": 0.14208714342437775, "learning_rate": 8.944141601121559e-08, "loss": 0.0024, "step": 10695 }, { "epoch": 4.86624203821656, "grad_norm": 0.06465461070175489, "learning_rate": 8.883846052461509e-08, "loss": 0.0013, "step": 10696 }, { "epoch": 4.866696997270246, "grad_norm": 0.3553047665368998, "learning_rate": 8.82375406656366e-08, "loss": 0.0031, "step": 10697 }, { "epoch": 4.867151956323931, "grad_norm": 0.11588261334857902, "learning_rate": 8.763865648338809e-08, "loss": 0.0028, "step": 10698 }, { "epoch": 4.867606915377616, "grad_norm": 0.26849101227822963, "learning_rate": 8.704180802680539e-08, "loss": 0.0048, "step": 10699 }, { "epoch": 4.868061874431302, "grad_norm": 0.28977917068961856, "learning_rate": 8.64469953446606e-08, "loss": 0.0059, "step": 10700 }, { "epoch": 4.868516833484986, "grad_norm": 0.08090616372298345, "learning_rate": 8.585421848555652e-08, "loss": 0.001, "step": 10701 }, { "epoch": 4.868971792538671, "grad_norm": 0.2566908838361502, "learning_rate": 8.526347749793495e-08, "loss": 0.0028, "step": 10702 }, { "epoch": 4.869426751592357, "grad_norm": 0.17143957307162444, "learning_rate": 8.467477243006838e-08, "loss": 0.0025, "step": 10703 }, { "epoch": 4.869881710646042, "grad_norm": 0.12566818830783938, "learning_rate": 8.408810333006278e-08, "loss": 0.0018, "step": 10704 }, { "epoch": 4.870336669699727, "grad_norm": 0.014736695634504396, "learning_rate": 8.350347024586037e-08, "loss": 0.0002, "step": 10705 }, { "epoch": 4.8707916287534125, "grad_norm": 0.19992990896358798, "learning_rate": 8.292087322522845e-08, "loss": 0.0013, "step": 10706 }, { "epoch": 4.871246587807097, "grad_norm": 0.023553155723867278, "learning_rate": 8.234031231578176e-08, "loss": 0.0003, "step": 10707 }, { "epoch": 4.871701546860782, "grad_norm": 0.16997569500057436, "learning_rate": 8.176178756495456e-08, "loss": 0.009, "step": 10708 }, { "epoch": 4.872156505914468, "grad_norm": 0.12631805122262788, "learning_rate": 8.118529902002569e-08, "loss": 0.0007, "step": 10709 }, { "epoch": 4.872611464968153, "grad_norm": 0.26196893523322806, "learning_rate": 8.061084672810193e-08, "loss": 0.0079, "step": 10710 }, { "epoch": 4.873066424021838, "grad_norm": 0.1392780540191059, "learning_rate": 8.003843073612627e-08, "loss": 0.0022, "step": 10711 }, { "epoch": 4.8735213830755235, "grad_norm": 0.08719417965841042, "learning_rate": 7.946805109086964e-08, "loss": 0.0018, "step": 10712 }, { "epoch": 4.873976342129208, "grad_norm": 0.04405726289133016, "learning_rate": 7.889970783894751e-08, "loss": 0.0007, "step": 10713 }, { "epoch": 4.874431301182893, "grad_norm": 0.025945618689239426, "learning_rate": 7.833340102679499e-08, "loss": 0.0002, "step": 10714 }, { "epoch": 4.874886260236579, "grad_norm": 0.19237124678016695, "learning_rate": 7.77691307006917e-08, "loss": 0.0029, "step": 10715 }, { "epoch": 4.875341219290264, "grad_norm": 0.16856001732557752, "learning_rate": 7.720689690674798e-08, "loss": 0.0026, "step": 10716 }, { "epoch": 4.875796178343949, "grad_norm": 0.4513681090655039, "learning_rate": 7.664669969090765e-08, "loss": 0.0107, "step": 10717 }, { "epoch": 4.8762511373976345, "grad_norm": 0.11092806981814085, "learning_rate": 7.60885390989452e-08, "loss": 0.0011, "step": 10718 }, { "epoch": 4.876706096451319, "grad_norm": 0.015017808436034988, "learning_rate": 7.553241517647136e-08, "loss": 0.0004, "step": 10719 }, { "epoch": 4.877161055505004, "grad_norm": 0.23912263989603436, "learning_rate": 7.497832796893312e-08, "loss": 0.0059, "step": 10720 }, { "epoch": 4.87761601455869, "grad_norm": 0.17337682274671398, "learning_rate": 7.442627752160258e-08, "loss": 0.0044, "step": 10721 }, { "epoch": 4.878070973612375, "grad_norm": 0.0391021360130019, "learning_rate": 7.387626387959368e-08, "loss": 0.0005, "step": 10722 }, { "epoch": 4.87852593266606, "grad_norm": 0.10775807553445817, "learning_rate": 7.332828708785377e-08, "loss": 0.0016, "step": 10723 }, { "epoch": 4.8789808917197455, "grad_norm": 0.14114310886217743, "learning_rate": 7.27823471911554e-08, "loss": 0.0026, "step": 10724 }, { "epoch": 4.87943585077343, "grad_norm": 0.14000943548644076, "learning_rate": 7.223844423411563e-08, "loss": 0.0013, "step": 10725 }, { "epoch": 4.879890809827115, "grad_norm": 0.06722787509955716, "learning_rate": 7.169657826117671e-08, "loss": 0.0012, "step": 10726 }, { "epoch": 4.880345768880801, "grad_norm": 0.05065869053682349, "learning_rate": 7.115674931661987e-08, "loss": 0.0004, "step": 10727 }, { "epoch": 4.880800727934486, "grad_norm": 0.2398208972233308, "learning_rate": 7.061895744455149e-08, "loss": 0.0027, "step": 10728 }, { "epoch": 4.881255686988171, "grad_norm": 0.1363464922820646, "learning_rate": 7.008320268892533e-08, "loss": 0.0026, "step": 10729 }, { "epoch": 4.8817106460418564, "grad_norm": 0.2652407604401157, "learning_rate": 6.954948509351467e-08, "loss": 0.0049, "step": 10730 }, { "epoch": 4.882165605095541, "grad_norm": 0.029132115768571776, "learning_rate": 6.901780470193741e-08, "loss": 0.0004, "step": 10731 }, { "epoch": 4.882620564149226, "grad_norm": 0.12189333069063153, "learning_rate": 6.848816155763938e-08, "loss": 0.0008, "step": 10732 }, { "epoch": 4.883075523202912, "grad_norm": 0.14898563119926697, "learning_rate": 6.796055570389426e-08, "loss": 0.0018, "step": 10733 }, { "epoch": 4.883530482256597, "grad_norm": 0.4229643207680149, "learning_rate": 6.743498718382591e-08, "loss": 0.013, "step": 10734 }, { "epoch": 4.883985441310282, "grad_norm": 0.3092024882876222, "learning_rate": 6.69114560403722e-08, "loss": 0.0042, "step": 10735 }, { "epoch": 4.884440400363967, "grad_norm": 0.09164478501860437, "learning_rate": 6.638996231631834e-08, "loss": 0.002, "step": 10736 }, { "epoch": 4.884895359417652, "grad_norm": 0.08209094580211325, "learning_rate": 6.587050605427746e-08, "loss": 0.0009, "step": 10737 }, { "epoch": 4.885350318471337, "grad_norm": 0.17231760016054984, "learning_rate": 6.53530872966962e-08, "loss": 0.0012, "step": 10738 }, { "epoch": 4.885805277525023, "grad_norm": 0.06507806670868144, "learning_rate": 6.483770608586015e-08, "loss": 0.0013, "step": 10739 }, { "epoch": 4.886260236578708, "grad_norm": 0.11682745614293723, "learning_rate": 6.432436246387729e-08, "loss": 0.0012, "step": 10740 }, { "epoch": 4.886715195632393, "grad_norm": 0.30083601240044255, "learning_rate": 6.381305647270019e-08, "loss": 0.003, "step": 10741 }, { "epoch": 4.887170154686078, "grad_norm": 0.17938603003687806, "learning_rate": 6.330378815410931e-08, "loss": 0.0026, "step": 10742 }, { "epoch": 4.887625113739763, "grad_norm": 0.09120222582753867, "learning_rate": 6.279655754971858e-08, "loss": 0.0005, "step": 10743 }, { "epoch": 4.888080072793448, "grad_norm": 0.043612022504023004, "learning_rate": 6.229136470098096e-08, "loss": 0.0004, "step": 10744 }, { "epoch": 4.888535031847134, "grad_norm": 0.3340356440882775, "learning_rate": 6.178820964917176e-08, "loss": 0.0035, "step": 10745 }, { "epoch": 4.888989990900819, "grad_norm": 0.0980120699980474, "learning_rate": 6.128709243541363e-08, "loss": 0.0006, "step": 10746 }, { "epoch": 4.889444949954504, "grad_norm": 0.03709710687060849, "learning_rate": 6.078801310064886e-08, "loss": 0.0005, "step": 10747 }, { "epoch": 4.889899909008189, "grad_norm": 0.02443638537690225, "learning_rate": 6.029097168566422e-08, "loss": 0.0003, "step": 10748 }, { "epoch": 4.890354868061874, "grad_norm": 0.1721419726392737, "learning_rate": 5.979596823107447e-08, "loss": 0.0034, "step": 10749 }, { "epoch": 4.890809827115559, "grad_norm": 0.16405201647965917, "learning_rate": 5.930300277732781e-08, "loss": 0.002, "step": 10750 }, { "epoch": 4.891264786169245, "grad_norm": 0.07444139436629493, "learning_rate": 5.881207536471145e-08, "loss": 0.0018, "step": 10751 }, { "epoch": 4.89171974522293, "grad_norm": 0.06733986731409869, "learning_rate": 5.832318603333775e-08, "loss": 0.0005, "step": 10752 }, { "epoch": 4.892174704276615, "grad_norm": 0.08865529435269566, "learning_rate": 5.783633482315809e-08, "loss": 0.0005, "step": 10753 }, { "epoch": 4.8926296633303, "grad_norm": 0.1294945147507927, "learning_rate": 5.735152177395453e-08, "loss": 0.0057, "step": 10754 }, { "epoch": 4.893084622383985, "grad_norm": 0.10925951515418621, "learning_rate": 5.686874692534538e-08, "loss": 0.001, "step": 10755 }, { "epoch": 4.893539581437671, "grad_norm": 0.3002743538573031, "learning_rate": 5.6388010316779647e-08, "loss": 0.0056, "step": 10756 }, { "epoch": 4.893994540491356, "grad_norm": 0.06392666616018194, "learning_rate": 5.590931198753979e-08, "loss": 0.0012, "step": 10757 }, { "epoch": 4.894449499545041, "grad_norm": 0.07739478384385921, "learning_rate": 5.54326519767473e-08, "loss": 0.0007, "step": 10758 }, { "epoch": 4.8949044585987265, "grad_norm": 0.1346606239144257, "learning_rate": 5.4958030323348784e-08, "loss": 0.0043, "step": 10759 }, { "epoch": 4.895359417652411, "grad_norm": 0.1376023177146858, "learning_rate": 5.448544706612713e-08, "loss": 0.0021, "step": 10760 }, { "epoch": 4.895814376706096, "grad_norm": 0.0776932878791795, "learning_rate": 5.40149022437042e-08, "loss": 0.0017, "step": 10761 }, { "epoch": 4.896269335759782, "grad_norm": 0.07292519978902176, "learning_rate": 5.3546395894527036e-08, "loss": 0.0004, "step": 10762 }, { "epoch": 4.896724294813467, "grad_norm": 0.11089128086157904, "learning_rate": 5.307992805688444e-08, "loss": 0.001, "step": 10763 }, { "epoch": 4.897179253867152, "grad_norm": 0.01728605874090701, "learning_rate": 5.2615498768887605e-08, "loss": 0.0002, "step": 10764 }, { "epoch": 4.8976342129208374, "grad_norm": 0.08235523973194787, "learning_rate": 5.21531080684895e-08, "loss": 0.001, "step": 10765 }, { "epoch": 4.898089171974522, "grad_norm": 0.2310905140402235, "learning_rate": 5.1692755993479335e-08, "loss": 0.0011, "step": 10766 }, { "epoch": 4.898544131028207, "grad_norm": 0.14013557418250824, "learning_rate": 5.12344425814687e-08, "loss": 0.005, "step": 10767 }, { "epoch": 4.898999090081893, "grad_norm": 0.14820125192944494, "learning_rate": 5.077816786991374e-08, "loss": 0.0022, "step": 10768 }, { "epoch": 4.899454049135578, "grad_norm": 0.038799809965536766, "learning_rate": 5.032393189609852e-08, "loss": 0.0007, "step": 10769 }, { "epoch": 4.899909008189263, "grad_norm": 0.08889649968544007, "learning_rate": 4.9871734697137794e-08, "loss": 0.0022, "step": 10770 }, { "epoch": 4.900363967242948, "grad_norm": 0.09583906130178063, "learning_rate": 4.942157630998534e-08, "loss": 0.0017, "step": 10771 }, { "epoch": 4.900818926296633, "grad_norm": 0.12745591451269933, "learning_rate": 4.897345677142562e-08, "loss": 0.0042, "step": 10772 }, { "epoch": 4.901273885350318, "grad_norm": 0.15708209101876175, "learning_rate": 4.852737611807656e-08, "loss": 0.0024, "step": 10773 }, { "epoch": 4.901728844404004, "grad_norm": 0.005699695829996438, "learning_rate": 4.8083334386392345e-08, "loss": 0.0001, "step": 10774 }, { "epoch": 4.902183803457689, "grad_norm": 0.08984461491557448, "learning_rate": 4.764133161265505e-08, "loss": 0.0009, "step": 10775 }, { "epoch": 4.902638762511374, "grad_norm": 0.06299114366412108, "learning_rate": 4.720136783298579e-08, "loss": 0.0006, "step": 10776 }, { "epoch": 4.903093721565059, "grad_norm": 0.030643652321759137, "learning_rate": 4.67634430833308e-08, "loss": 0.0002, "step": 10777 }, { "epoch": 4.903548680618744, "grad_norm": 0.01628114964748512, "learning_rate": 4.632755739948369e-08, "loss": 0.0003, "step": 10778 }, { "epoch": 4.904003639672429, "grad_norm": 0.04140338500008665, "learning_rate": 4.589371081705762e-08, "loss": 0.0004, "step": 10779 }, { "epoch": 4.904458598726115, "grad_norm": 0.34447993769071733, "learning_rate": 4.54619033715048e-08, "loss": 0.0053, "step": 10780 }, { "epoch": 4.9049135577798, "grad_norm": 0.40080541282262716, "learning_rate": 4.503213509811088e-08, "loss": 0.0021, "step": 10781 }, { "epoch": 4.905368516833485, "grad_norm": 0.13675790273957922, "learning_rate": 4.460440603199778e-08, "loss": 0.0007, "step": 10782 }, { "epoch": 4.90582347588717, "grad_norm": 0.039673766962575734, "learning_rate": 4.417871620811254e-08, "loss": 0.0006, "step": 10783 }, { "epoch": 4.906278434940855, "grad_norm": 0.10153102722996561, "learning_rate": 4.375506566124677e-08, "loss": 0.0016, "step": 10784 }, { "epoch": 4.90673339399454, "grad_norm": 0.15990036580745742, "learning_rate": 4.333345442601167e-08, "loss": 0.0027, "step": 10785 }, { "epoch": 4.907188353048226, "grad_norm": 0.1715838787409154, "learning_rate": 4.291388253686579e-08, "loss": 0.0024, "step": 10786 }, { "epoch": 4.907643312101911, "grad_norm": 0.09373957835611876, "learning_rate": 4.2496350028090046e-08, "loss": 0.0007, "step": 10787 }, { "epoch": 4.9080982711555965, "grad_norm": 0.08066907300764217, "learning_rate": 4.2080856933807143e-08, "loss": 0.0012, "step": 10788 }, { "epoch": 4.908553230209281, "grad_norm": 0.12683288271205398, "learning_rate": 4.16674032879677e-08, "loss": 0.0029, "step": 10789 }, { "epoch": 4.909008189262966, "grad_norm": 0.21733507806796823, "learning_rate": 4.12559891243558e-08, "loss": 0.0031, "step": 10790 }, { "epoch": 4.909463148316652, "grad_norm": 0.16653780426586615, "learning_rate": 4.0846614476591774e-08, "loss": 0.0016, "step": 10791 }, { "epoch": 4.909918107370337, "grad_norm": 0.13275263674027402, "learning_rate": 4.043927937812941e-08, "loss": 0.0023, "step": 10792 }, { "epoch": 4.910373066424022, "grad_norm": 0.23452885720172165, "learning_rate": 4.0033983862253214e-08, "loss": 0.0023, "step": 10793 }, { "epoch": 4.9108280254777075, "grad_norm": 0.1565993694707464, "learning_rate": 3.963072796208111e-08, "loss": 0.0066, "step": 10794 }, { "epoch": 4.911282984531392, "grad_norm": 0.11974782268274262, "learning_rate": 3.9229511710564545e-08, "loss": 0.0012, "step": 10795 }, { "epoch": 4.911737943585077, "grad_norm": 0.10165361150994169, "learning_rate": 3.883033514049117e-08, "loss": 0.0036, "step": 10796 }, { "epoch": 4.912192902638763, "grad_norm": 0.06902383944386685, "learning_rate": 3.843319828447933e-08, "loss": 0.0009, "step": 10797 }, { "epoch": 4.912647861692448, "grad_norm": 0.1910417254962274, "learning_rate": 3.8038101174980856e-08, "loss": 0.005, "step": 10798 }, { "epoch": 4.913102820746133, "grad_norm": 0.23504982013142253, "learning_rate": 3.7645043844281024e-08, "loss": 0.0051, "step": 10799 }, { "epoch": 4.9135577797998184, "grad_norm": 0.03582560797535683, "learning_rate": 3.7254026324501365e-08, "loss": 0.0004, "step": 10800 }, { "epoch": 4.914012738853503, "grad_norm": 0.275353669929974, "learning_rate": 3.686504864758855e-08, "loss": 0.0062, "step": 10801 }, { "epoch": 4.914467697907188, "grad_norm": 0.13796631927055783, "learning_rate": 3.647811084533381e-08, "loss": 0.0021, "step": 10802 }, { "epoch": 4.914922656960874, "grad_norm": 0.10744317013469155, "learning_rate": 3.609321294935353e-08, "loss": 0.0021, "step": 10803 }, { "epoch": 4.915377616014559, "grad_norm": 0.1841309408437785, "learning_rate": 3.5710354991100315e-08, "loss": 0.0015, "step": 10804 }, { "epoch": 4.915832575068244, "grad_norm": 0.19730349942432282, "learning_rate": 3.5329537001857484e-08, "loss": 0.0033, "step": 10805 }, { "epoch": 4.916287534121929, "grad_norm": 0.01193513712821032, "learning_rate": 3.4950759012747355e-08, "loss": 0.0001, "step": 10806 }, { "epoch": 4.916742493175614, "grad_norm": 0.026773800862820912, "learning_rate": 3.457402105471741e-08, "loss": 0.0002, "step": 10807 }, { "epoch": 4.917197452229299, "grad_norm": 0.2540567888399968, "learning_rate": 3.419932315855689e-08, "loss": 0.0066, "step": 10808 }, { "epoch": 4.917652411282985, "grad_norm": 0.044039815037810265, "learning_rate": 3.3826665354882994e-08, "loss": 0.0003, "step": 10809 }, { "epoch": 4.91810737033667, "grad_norm": 0.21812088667199309, "learning_rate": 3.345604767414911e-08, "loss": 0.0019, "step": 10810 }, { "epoch": 4.918562329390355, "grad_norm": 0.014229530675519696, "learning_rate": 3.308747014663938e-08, "loss": 0.0001, "step": 10811 }, { "epoch": 4.91901728844404, "grad_norm": 0.17430041867275936, "learning_rate": 3.2720932802468576e-08, "loss": 0.0067, "step": 10812 }, { "epoch": 4.919472247497725, "grad_norm": 0.016429280570336845, "learning_rate": 3.235643567159608e-08, "loss": 0.0002, "step": 10813 }, { "epoch": 4.91992720655141, "grad_norm": 0.17095755372035437, "learning_rate": 3.199397878380084e-08, "loss": 0.001, "step": 10814 }, { "epoch": 4.920382165605096, "grad_norm": 0.10390148415157356, "learning_rate": 3.1633562168700836e-08, "loss": 0.0027, "step": 10815 }, { "epoch": 4.920837124658781, "grad_norm": 0.09135914474086046, "learning_rate": 3.127518585575306e-08, "loss": 0.0018, "step": 10816 }, { "epoch": 4.921292083712466, "grad_norm": 0.13425882004391956, "learning_rate": 3.0918849874239654e-08, "loss": 0.0045, "step": 10817 }, { "epoch": 4.921747042766151, "grad_norm": 0.09138346001481137, "learning_rate": 3.05645542532762e-08, "loss": 0.0053, "step": 10818 }, { "epoch": 4.922202001819836, "grad_norm": 0.08813400357511168, "learning_rate": 3.0212299021817324e-08, "loss": 0.0005, "step": 10819 }, { "epoch": 4.922656960873521, "grad_norm": 0.09222750254042376, "learning_rate": 2.9862084208648335e-08, "loss": 0.0009, "step": 10820 }, { "epoch": 4.923111919927207, "grad_norm": 0.10361322001851041, "learning_rate": 2.9513909842382448e-08, "loss": 0.0018, "step": 10821 }, { "epoch": 4.923566878980892, "grad_norm": 0.23153795058046756, "learning_rate": 2.916777595147746e-08, "loss": 0.0032, "step": 10822 }, { "epoch": 4.924021838034577, "grad_norm": 0.12579642972513386, "learning_rate": 2.8823682564210752e-08, "loss": 0.0024, "step": 10823 }, { "epoch": 4.924476797088262, "grad_norm": 0.21503129956808062, "learning_rate": 2.8481629708707048e-08, "loss": 0.0031, "step": 10824 }, { "epoch": 4.924931756141947, "grad_norm": 0.011600951634620795, "learning_rate": 2.8141617412913436e-08, "loss": 0.0002, "step": 10825 }, { "epoch": 4.925386715195632, "grad_norm": 0.12543322812942242, "learning_rate": 2.7803645704616022e-08, "loss": 0.0033, "step": 10826 }, { "epoch": 4.925841674249318, "grad_norm": 0.18625883604844873, "learning_rate": 2.746771461142883e-08, "loss": 0.0043, "step": 10827 }, { "epoch": 4.926296633303003, "grad_norm": 0.12527550304121493, "learning_rate": 2.71338241608049e-08, "loss": 0.0045, "step": 10828 }, { "epoch": 4.926751592356688, "grad_norm": 0.24134233488699697, "learning_rate": 2.6801974380030738e-08, "loss": 0.004, "step": 10829 }, { "epoch": 4.927206551410373, "grad_norm": 0.09855340870174914, "learning_rate": 2.6472165296220762e-08, "loss": 0.0018, "step": 10830 }, { "epoch": 4.927661510464058, "grad_norm": 0.2449540888257841, "learning_rate": 2.6144396936325645e-08, "loss": 0.0054, "step": 10831 }, { "epoch": 4.928116469517743, "grad_norm": 0.22932513889533543, "learning_rate": 2.581866932712951e-08, "loss": 0.0033, "step": 10832 }, { "epoch": 4.928571428571429, "grad_norm": 0.09483552023058454, "learning_rate": 2.5494982495249952e-08, "loss": 0.002, "step": 10833 }, { "epoch": 4.929026387625114, "grad_norm": 0.08894566500479553, "learning_rate": 2.5173336467135267e-08, "loss": 0.0011, "step": 10834 }, { "epoch": 4.929481346678799, "grad_norm": 0.15536189555924976, "learning_rate": 2.485373126906998e-08, "loss": 0.0025, "step": 10835 }, { "epoch": 4.929936305732484, "grad_norm": 0.12413666917251878, "learning_rate": 2.453616692717209e-08, "loss": 0.0022, "step": 10836 }, { "epoch": 4.930391264786169, "grad_norm": 0.10779158541367458, "learning_rate": 2.4220643467387504e-08, "loss": 0.0035, "step": 10837 }, { "epoch": 4.930846223839854, "grad_norm": 0.07675022421691624, "learning_rate": 2.390716091550671e-08, "loss": 0.0011, "step": 10838 }, { "epoch": 4.93130118289354, "grad_norm": 0.1578312530443844, "learning_rate": 2.3595719297139775e-08, "loss": 0.006, "step": 10839 }, { "epoch": 4.931756141947225, "grad_norm": 0.3075715199947801, "learning_rate": 2.3286318637738557e-08, "loss": 0.002, "step": 10840 }, { "epoch": 4.9322111010009095, "grad_norm": 0.08076684268408983, "learning_rate": 2.2978958962582843e-08, "loss": 0.0007, "step": 10841 }, { "epoch": 4.932666060054595, "grad_norm": 0.15746579293513607, "learning_rate": 2.2673640296794197e-08, "loss": 0.0016, "step": 10842 }, { "epoch": 4.93312101910828, "grad_norm": 0.0896969283728765, "learning_rate": 2.2370362665319334e-08, "loss": 0.0011, "step": 10843 }, { "epoch": 4.933575978161965, "grad_norm": 0.17824020616606787, "learning_rate": 2.206912609293843e-08, "loss": 0.0034, "step": 10844 }, { "epoch": 4.934030937215651, "grad_norm": 0.008253282865366432, "learning_rate": 2.1769930604270684e-08, "loss": 0.0001, "step": 10845 }, { "epoch": 4.934485896269336, "grad_norm": 0.18076075148074514, "learning_rate": 2.1472776223763203e-08, "loss": 0.0048, "step": 10846 }, { "epoch": 4.9349408553230205, "grad_norm": 0.10683124832661306, "learning_rate": 2.117766297569934e-08, "loss": 0.0015, "step": 10847 }, { "epoch": 4.935395814376706, "grad_norm": 0.22138679892672156, "learning_rate": 2.0884590884193146e-08, "loss": 0.0046, "step": 10848 }, { "epoch": 4.935850773430391, "grad_norm": 0.03592251618087141, "learning_rate": 2.0593559973192123e-08, "loss": 0.001, "step": 10849 }, { "epoch": 4.936305732484076, "grad_norm": 0.3681283176194575, "learning_rate": 2.0304570266480027e-08, "loss": 0.0066, "step": 10850 }, { "epoch": 4.936760691537762, "grad_norm": 0.19435811680570209, "learning_rate": 2.0017621787671303e-08, "loss": 0.0056, "step": 10851 }, { "epoch": 4.937215650591447, "grad_norm": 0.05610961298018971, "learning_rate": 1.973271456021386e-08, "loss": 0.0004, "step": 10852 }, { "epoch": 4.9376706096451315, "grad_norm": 0.2069352175153229, "learning_rate": 1.944984860739185e-08, "loss": 0.0011, "step": 10853 }, { "epoch": 4.938125568698817, "grad_norm": 0.015727300150374234, "learning_rate": 1.9169023952311792e-08, "loss": 0.0002, "step": 10854 }, { "epoch": 4.938580527752502, "grad_norm": 0.07805943125441327, "learning_rate": 1.889024061793032e-08, "loss": 0.001, "step": 10855 }, { "epoch": 4.939035486806187, "grad_norm": 0.1448021826525604, "learning_rate": 1.8613498627023662e-08, "loss": 0.004, "step": 10856 }, { "epoch": 4.939490445859873, "grad_norm": 0.2878644468673347, "learning_rate": 1.8338798002207058e-08, "loss": 0.0053, "step": 10857 }, { "epoch": 4.939945404913558, "grad_norm": 0.5231292296084831, "learning_rate": 1.8066138765926444e-08, "loss": 0.0203, "step": 10858 }, { "epoch": 4.9404003639672425, "grad_norm": 0.04780678254626667, "learning_rate": 1.779552094046677e-08, "loss": 0.0007, "step": 10859 }, { "epoch": 4.940855323020928, "grad_norm": 0.020954255249558497, "learning_rate": 1.7526944547935354e-08, "loss": 0.0004, "step": 10860 }, { "epoch": 4.941310282074613, "grad_norm": 0.6202957337205551, "learning_rate": 1.7260409610284077e-08, "loss": 0.0037, "step": 10861 }, { "epoch": 4.941765241128298, "grad_norm": 0.09380124858450323, "learning_rate": 1.699591614928997e-08, "loss": 0.0015, "step": 10862 }, { "epoch": 4.942220200181984, "grad_norm": 0.2643536461766595, "learning_rate": 1.6733464186566293e-08, "loss": 0.0059, "step": 10863 }, { "epoch": 4.942675159235669, "grad_norm": 0.07217510045390763, "learning_rate": 1.647305374356256e-08, "loss": 0.0006, "step": 10864 }, { "epoch": 4.943130118289354, "grad_norm": 0.09847458208533033, "learning_rate": 1.6214684841556193e-08, "loss": 0.0016, "step": 10865 }, { "epoch": 4.943585077343039, "grad_norm": 0.28123712132563866, "learning_rate": 1.5958357501658083e-08, "loss": 0.0019, "step": 10866 }, { "epoch": 4.944040036396724, "grad_norm": 0.09585237001021371, "learning_rate": 1.570407174481814e-08, "loss": 0.001, "step": 10867 }, { "epoch": 4.94449499545041, "grad_norm": 0.31075571288714926, "learning_rate": 1.5451827591811408e-08, "loss": 0.0146, "step": 10868 }, { "epoch": 4.944949954504095, "grad_norm": 0.137106268287295, "learning_rate": 1.5201625063251956e-08, "loss": 0.0021, "step": 10869 }, { "epoch": 4.94540491355778, "grad_norm": 0.3335769488431826, "learning_rate": 1.4953464179587317e-08, "loss": 0.002, "step": 10870 }, { "epoch": 4.945859872611465, "grad_norm": 0.1679528498208494, "learning_rate": 1.470734496109294e-08, "loss": 0.0028, "step": 10871 }, { "epoch": 4.94631483166515, "grad_norm": 0.05595747452520695, "learning_rate": 1.4463267427883287e-08, "loss": 0.0007, "step": 10872 }, { "epoch": 4.946769790718835, "grad_norm": 0.44551862034031126, "learning_rate": 1.4221231599900741e-08, "loss": 0.0124, "step": 10873 }, { "epoch": 4.947224749772521, "grad_norm": 0.04933082062534708, "learning_rate": 1.3981237496923926e-08, "loss": 0.0009, "step": 10874 }, { "epoch": 4.947679708826206, "grad_norm": 0.15221001831158015, "learning_rate": 1.3743285138564931e-08, "loss": 0.0018, "step": 10875 }, { "epoch": 4.9481346678798905, "grad_norm": 0.1257887450050944, "learning_rate": 1.3507374544266537e-08, "loss": 0.0005, "step": 10876 }, { "epoch": 4.948589626933576, "grad_norm": 0.04710776767739244, "learning_rate": 1.3273505733310543e-08, "loss": 0.0002, "step": 10877 }, { "epoch": 4.949044585987261, "grad_norm": 0.1939653362435116, "learning_rate": 1.304167872480111e-08, "loss": 0.0061, "step": 10878 }, { "epoch": 4.949499545040946, "grad_norm": 0.18185097653712462, "learning_rate": 1.2811893537686969e-08, "loss": 0.0035, "step": 10879 }, { "epoch": 4.949954504094632, "grad_norm": 0.2668416652372025, "learning_rate": 1.258415019074477e-08, "loss": 0.0035, "step": 10880 }, { "epoch": 4.950409463148317, "grad_norm": 0.25228142145636184, "learning_rate": 1.2358448702581849e-08, "loss": 0.0051, "step": 10881 }, { "epoch": 4.9508644222020015, "grad_norm": 0.07461147083186881, "learning_rate": 1.213478909164456e-08, "loss": 0.0004, "step": 10882 }, { "epoch": 4.951319381255687, "grad_norm": 0.11945684325452638, "learning_rate": 1.1913171376207178e-08, "loss": 0.0014, "step": 10883 }, { "epoch": 4.951774340309372, "grad_norm": 0.2908849285701655, "learning_rate": 1.1693595574382988e-08, "loss": 0.0046, "step": 10884 }, { "epoch": 4.952229299363057, "grad_norm": 0.06377326332600376, "learning_rate": 1.1476061704107643e-08, "loss": 0.0006, "step": 10885 }, { "epoch": 4.952684258416743, "grad_norm": 0.1376755402557293, "learning_rate": 1.1260569783164142e-08, "loss": 0.0021, "step": 10886 }, { "epoch": 4.953139217470428, "grad_norm": 0.2922194312498041, "learning_rate": 1.1047119829157848e-08, "loss": 0.0072, "step": 10887 }, { "epoch": 4.9535941765241125, "grad_norm": 0.29443116367191624, "learning_rate": 1.0835711859533138e-08, "loss": 0.003, "step": 10888 }, { "epoch": 4.954049135577798, "grad_norm": 0.06163365673773304, "learning_rate": 1.0626345891562306e-08, "loss": 0.0007, "step": 10889 }, { "epoch": 4.954504094631483, "grad_norm": 0.19347611900004966, "learning_rate": 1.0419021942356666e-08, "loss": 0.0022, "step": 10890 }, { "epoch": 4.954959053685168, "grad_norm": 0.07672155979465402, "learning_rate": 1.0213740028855446e-08, "loss": 0.0005, "step": 10891 }, { "epoch": 4.955414012738854, "grad_norm": 0.08258362865059853, "learning_rate": 1.0010500167836889e-08, "loss": 0.0012, "step": 10892 }, { "epoch": 4.955868971792539, "grad_norm": 0.1510981110954366, "learning_rate": 9.809302375904383e-09, "loss": 0.0032, "step": 10893 }, { "epoch": 4.9563239308462235, "grad_norm": 0.27643788586772255, "learning_rate": 9.61014666950033e-09, "loss": 0.0086, "step": 10894 }, { "epoch": 4.956778889899909, "grad_norm": 0.0961366536679579, "learning_rate": 9.413033064900601e-09, "loss": 0.0015, "step": 10895 }, { "epoch": 4.957233848953594, "grad_norm": 0.11550395239074633, "learning_rate": 9.217961578211753e-09, "loss": 0.0033, "step": 10896 }, { "epoch": 4.957688808007279, "grad_norm": 0.04035182389742982, "learning_rate": 9.024932225371042e-09, "loss": 0.0007, "step": 10897 }, { "epoch": 4.958143767060965, "grad_norm": 0.129281017565914, "learning_rate": 8.833945022157509e-09, "loss": 0.0014, "step": 10898 }, { "epoch": 4.95859872611465, "grad_norm": 0.07104921895572715, "learning_rate": 8.644999984175339e-09, "loss": 0.0011, "step": 10899 }, { "epoch": 4.959053685168335, "grad_norm": 0.1655504180844965, "learning_rate": 8.458097126862186e-09, "loss": 0.0016, "step": 10900 }, { "epoch": 4.95950864422202, "grad_norm": 0.1383686648377542, "learning_rate": 8.27323646549194e-09, "loss": 0.0042, "step": 10901 }, { "epoch": 4.959963603275705, "grad_norm": 0.22239214644196614, "learning_rate": 8.090418015171964e-09, "loss": 0.01, "step": 10902 }, { "epoch": 4.960418562329391, "grad_norm": 0.11932934685171441, "learning_rate": 7.909641790840306e-09, "loss": 0.0002, "step": 10903 }, { "epoch": 4.960873521383076, "grad_norm": 0.09582142786373489, "learning_rate": 7.730907807271259e-09, "loss": 0.0015, "step": 10904 }, { "epoch": 4.961328480436761, "grad_norm": 0.18034466819157222, "learning_rate": 7.554216079067034e-09, "loss": 0.0027, "step": 10905 }, { "epoch": 4.961783439490446, "grad_norm": 0.08177614902176109, "learning_rate": 7.379566620666078e-09, "loss": 0.0021, "step": 10906 }, { "epoch": 4.962238398544131, "grad_norm": 0.041070890301713, "learning_rate": 7.2069594463430866e-09, "loss": 0.0006, "step": 10907 }, { "epoch": 4.962693357597816, "grad_norm": 0.0210143814102111, "learning_rate": 7.036394570200666e-09, "loss": 0.0002, "step": 10908 }, { "epoch": 4.963148316651502, "grad_norm": 0.033966929951542865, "learning_rate": 6.867872006174891e-09, "loss": 0.0004, "step": 10909 }, { "epoch": 4.963603275705187, "grad_norm": 0.15243702212204666, "learning_rate": 6.7013917680408545e-09, "loss": 0.0033, "step": 10910 }, { "epoch": 4.9640582347588715, "grad_norm": 0.12679556680213377, "learning_rate": 6.536953869398788e-09, "loss": 0.0023, "step": 10911 }, { "epoch": 4.964513193812557, "grad_norm": 0.16061846210537462, "learning_rate": 6.3745583236879404e-09, "loss": 0.0029, "step": 10912 }, { "epoch": 4.964968152866242, "grad_norm": 0.20178787804717138, "learning_rate": 6.2142051441782535e-09, "loss": 0.0045, "step": 10913 }, { "epoch": 4.965423111919927, "grad_norm": 0.06542005220842903, "learning_rate": 6.055894343973134e-09, "loss": 0.0009, "step": 10914 }, { "epoch": 4.965878070973613, "grad_norm": 0.2780140415702618, "learning_rate": 5.8996259360094565e-09, "loss": 0.0049, "step": 10915 }, { "epoch": 4.966333030027298, "grad_norm": 0.1079638889309393, "learning_rate": 5.7453999330547845e-09, "loss": 0.0065, "step": 10916 }, { "epoch": 4.9667879890809825, "grad_norm": 0.03490017663981264, "learning_rate": 5.593216347712926e-09, "loss": 0.0006, "step": 10917 }, { "epoch": 4.967242948134668, "grad_norm": 0.008867860092868947, "learning_rate": 5.443075192418379e-09, "loss": 0.0001, "step": 10918 }, { "epoch": 4.967697907188353, "grad_norm": 0.1746853521525859, "learning_rate": 5.294976479441882e-09, "loss": 0.0046, "step": 10919 }, { "epoch": 4.968152866242038, "grad_norm": 0.0378487387023087, "learning_rate": 5.148920220887643e-09, "loss": 0.0005, "step": 10920 }, { "epoch": 4.968607825295724, "grad_norm": 0.15524993255341651, "learning_rate": 5.004906428685008e-09, "loss": 0.0029, "step": 10921 }, { "epoch": 4.969062784349409, "grad_norm": 0.11859942320735456, "learning_rate": 4.862935114605116e-09, "loss": 0.0033, "step": 10922 }, { "epoch": 4.9695177434030935, "grad_norm": 0.04797060412825457, "learning_rate": 4.723006290249799e-09, "loss": 0.0011, "step": 10923 }, { "epoch": 4.969972702456779, "grad_norm": 0.11555913364198374, "learning_rate": 4.585119967054352e-09, "loss": 0.0006, "step": 10924 }, { "epoch": 4.970427661510464, "grad_norm": 0.04109601817861989, "learning_rate": 4.4492761562819894e-09, "loss": 0.0006, "step": 10925 }, { "epoch": 4.970882620564149, "grad_norm": 0.09915069494091096, "learning_rate": 4.3154748690377166e-09, "loss": 0.0021, "step": 10926 }, { "epoch": 4.971337579617835, "grad_norm": 0.12634249127834998, "learning_rate": 4.183716116251679e-09, "loss": 0.0019, "step": 10927 }, { "epoch": 4.97179253867152, "grad_norm": 0.12798785006444596, "learning_rate": 4.053999908693041e-09, "loss": 0.0021, "step": 10928 }, { "epoch": 4.9722474977252045, "grad_norm": 0.1531835088205956, "learning_rate": 3.926326256961654e-09, "loss": 0.0008, "step": 10929 }, { "epoch": 4.97270245677889, "grad_norm": 0.2934111539452304, "learning_rate": 3.800695171488067e-09, "loss": 0.0036, "step": 10930 }, { "epoch": 4.973157415832575, "grad_norm": 0.039356017992787144, "learning_rate": 3.6771066625418403e-09, "loss": 0.0006, "step": 10931 }, { "epoch": 4.97361237488626, "grad_norm": 0.02460361147902038, "learning_rate": 3.5555607402176784e-09, "loss": 0.0004, "step": 10932 }, { "epoch": 4.974067333939946, "grad_norm": 0.23737035365549689, "learning_rate": 3.4360574144520764e-09, "loss": 0.0069, "step": 10933 }, { "epoch": 4.974522292993631, "grad_norm": 0.09136554444950756, "learning_rate": 3.3185966950066703e-09, "loss": 0.0002, "step": 10934 }, { "epoch": 4.9749772520473154, "grad_norm": 0.12670248179343493, "learning_rate": 3.20317859148489e-09, "loss": 0.0016, "step": 10935 }, { "epoch": 4.975432211101001, "grad_norm": 0.020126006588775107, "learning_rate": 3.089803113312528e-09, "loss": 0.0003, "step": 10936 }, { "epoch": 4.975887170154686, "grad_norm": 0.22925549191065994, "learning_rate": 2.9784702697543965e-09, "loss": 0.0035, "step": 10937 }, { "epoch": 4.976342129208371, "grad_norm": 0.14081791353664336, "learning_rate": 2.869180069911548e-09, "loss": 0.002, "step": 10938 }, { "epoch": 4.976797088262057, "grad_norm": 0.21940184063723672, "learning_rate": 2.761932522715727e-09, "loss": 0.0041, "step": 10939 }, { "epoch": 4.977252047315742, "grad_norm": 0.20552280426120825, "learning_rate": 2.6567276369265927e-09, "loss": 0.0137, "step": 10940 }, { "epoch": 4.977707006369426, "grad_norm": 0.16435092395042064, "learning_rate": 2.5535654211400474e-09, "loss": 0.0065, "step": 10941 }, { "epoch": 4.978161965423112, "grad_norm": 0.6545092449020498, "learning_rate": 2.45244588379101e-09, "loss": 0.0087, "step": 10942 }, { "epoch": 4.978616924476797, "grad_norm": 0.11053431958448881, "learning_rate": 2.353369033142316e-09, "loss": 0.0016, "step": 10943 }, { "epoch": 4.979071883530482, "grad_norm": 0.20741669585098857, "learning_rate": 2.256334877284716e-09, "loss": 0.0014, "step": 10944 }, { "epoch": 4.979526842584168, "grad_norm": 0.036500195588817765, "learning_rate": 2.1613434241507524e-09, "loss": 0.0005, "step": 10945 }, { "epoch": 4.9799818016378525, "grad_norm": 0.3655389787322379, "learning_rate": 2.0683946815036604e-09, "loss": 0.0108, "step": 10946 }, { "epoch": 4.980436760691537, "grad_norm": 0.1919782929858497, "learning_rate": 1.9774886569373652e-09, "loss": 0.0016, "step": 10947 }, { "epoch": 4.980891719745223, "grad_norm": 0.1272312897919515, "learning_rate": 1.8886253578820346e-09, "loss": 0.0014, "step": 10948 }, { "epoch": 4.981346678798908, "grad_norm": 0.012589593325644214, "learning_rate": 1.8018047915957515e-09, "loss": 0.0002, "step": 10949 }, { "epoch": 4.981801637852593, "grad_norm": 0.12196542506644788, "learning_rate": 1.7170269651756165e-09, "loss": 0.0018, "step": 10950 }, { "epoch": 4.982256596906279, "grad_norm": 0.22464822372012802, "learning_rate": 1.6342918855494216e-09, "loss": 0.0121, "step": 10951 }, { "epoch": 4.9827115559599635, "grad_norm": 0.033590983047350635, "learning_rate": 1.5535995594756492e-09, "loss": 0.0004, "step": 10952 }, { "epoch": 4.983166515013648, "grad_norm": 0.15543020374818675, "learning_rate": 1.4749499935517997e-09, "loss": 0.001, "step": 10953 }, { "epoch": 4.983621474067334, "grad_norm": 0.15016693095597902, "learning_rate": 1.3983431942005132e-09, "loss": 0.0011, "step": 10954 }, { "epoch": 4.984076433121019, "grad_norm": 0.1737887876728062, "learning_rate": 1.3237791676862232e-09, "loss": 0.0016, "step": 10955 }, { "epoch": 4.984531392174704, "grad_norm": 0.21020825886967742, "learning_rate": 1.251257920098503e-09, "loss": 0.0047, "step": 10956 }, { "epoch": 4.98498635122839, "grad_norm": 0.023783008434112397, "learning_rate": 1.1807794573659436e-09, "loss": 0.0002, "step": 10957 }, { "epoch": 4.9854413102820745, "grad_norm": 0.03290331051343001, "learning_rate": 1.1123437852450513e-09, "loss": 0.0004, "step": 10958 }, { "epoch": 4.985896269335759, "grad_norm": 0.16588611361782968, "learning_rate": 1.0459509093285746e-09, "loss": 0.0023, "step": 10959 }, { "epoch": 4.986351228389445, "grad_norm": 0.09274810858801887, "learning_rate": 9.816008350455041e-10, "loss": 0.0013, "step": 10960 }, { "epoch": 4.98680618744313, "grad_norm": 0.1460170480615193, "learning_rate": 9.192935676499703e-10, "loss": 0.0034, "step": 10961 }, { "epoch": 4.987261146496815, "grad_norm": 0.0978891574169675, "learning_rate": 8.590291122323458e-10, "loss": 0.0009, "step": 10962 }, { "epoch": 4.987716105550501, "grad_norm": 0.10981327136642707, "learning_rate": 8.008074737220206e-10, "loss": 0.0022, "step": 10963 }, { "epoch": 4.9881710646041855, "grad_norm": 0.08887677073085437, "learning_rate": 7.446286568763006e-10, "loss": 0.0014, "step": 10964 }, { "epoch": 4.98862602365787, "grad_norm": 0.02807567089872963, "learning_rate": 6.904926662804068e-10, "loss": 0.0004, "step": 10965 }, { "epoch": 4.989080982711556, "grad_norm": 0.14131092528845743, "learning_rate": 6.38399506364129e-10, "loss": 0.0029, "step": 10966 }, { "epoch": 4.989535941765241, "grad_norm": 0.15184829605903719, "learning_rate": 5.883491813796216e-10, "loss": 0.0008, "step": 10967 }, { "epoch": 4.989990900818926, "grad_norm": 0.30861659005673864, "learning_rate": 5.403416954208318e-10, "loss": 0.0031, "step": 10968 }, { "epoch": 4.990445859872612, "grad_norm": 0.08791108426301178, "learning_rate": 4.943770524068469e-10, "loss": 0.0006, "step": 10969 }, { "epoch": 4.9909008189262964, "grad_norm": 0.03355868000929393, "learning_rate": 4.5045525609854757e-10, "loss": 0.0004, "step": 10970 }, { "epoch": 4.991355777979981, "grad_norm": 0.12153480744047956, "learning_rate": 4.085763100791784e-10, "loss": 0.0013, "step": 10971 }, { "epoch": 4.991810737033667, "grad_norm": 0.14047277915242168, "learning_rate": 3.6874021777377755e-10, "loss": 0.0034, "step": 10972 }, { "epoch": 4.992265696087352, "grad_norm": 0.1763768290836994, "learning_rate": 3.3094698244084953e-10, "loss": 0.0015, "step": 10973 }, { "epoch": 4.992720655141038, "grad_norm": 0.17801380607450504, "learning_rate": 2.951966071612633e-10, "loss": 0.0046, "step": 10974 }, { "epoch": 4.9931756141947226, "grad_norm": 0.039157260437406395, "learning_rate": 2.6148909486323204e-10, "loss": 0.0005, "step": 10975 }, { "epoch": 4.993630573248407, "grad_norm": 0.22528583052638396, "learning_rate": 2.298244482973333e-10, "loss": 0.0045, "step": 10976 }, { "epoch": 4.994085532302093, "grad_norm": 0.16491477895663378, "learning_rate": 2.002026700531623e-10, "loss": 0.0011, "step": 10977 }, { "epoch": 4.994540491355778, "grad_norm": 0.2545849326287858, "learning_rate": 1.7262376254822965e-10, "loss": 0.0018, "step": 10978 }, { "epoch": 4.994995450409463, "grad_norm": 0.08166610603163324, "learning_rate": 1.4708772804183922e-10, "loss": 0.0011, "step": 10979 }, { "epoch": 4.995450409463149, "grad_norm": 0.06535598365538212, "learning_rate": 1.2359456861565922e-10, "loss": 0.0016, "step": 10980 }, { "epoch": 4.9959053685168335, "grad_norm": 0.17874761703561548, "learning_rate": 1.0214428618759986e-10, "loss": 0.0019, "step": 10981 }, { "epoch": 4.996360327570518, "grad_norm": 0.07797339004314222, "learning_rate": 8.273688251736466e-11, "loss": 0.0004, "step": 10982 }, { "epoch": 4.996815286624204, "grad_norm": 0.2826981000454031, "learning_rate": 6.537235918702145e-11, "loss": 0.002, "step": 10983 }, { "epoch": 4.997270245677889, "grad_norm": 0.005179394107741166, "learning_rate": 5.005071761488012e-11, "loss": 0.0001, "step": 10984 }, { "epoch": 4.997725204731574, "grad_norm": 0.07960018530236426, "learning_rate": 3.677195905271713e-11, "loss": 0.0004, "step": 10985 }, { "epoch": 4.99818016378526, "grad_norm": 0.17958235281065213, "learning_rate": 2.5536084588551056e-11, "loss": 0.0033, "step": 10986 }, { "epoch": 4.9986351228389445, "grad_norm": 0.11066527649799185, "learning_rate": 1.634309513831589e-11, "loss": 0.0044, "step": 10987 }, { "epoch": 4.999090081892629, "grad_norm": 0.10693983057891679, "learning_rate": 9.192991454187727e-12, "loss": 0.0016, "step": 10988 }, { "epoch": 4.999545040946315, "grad_norm": 0.0691199273857346, "learning_rate": 4.085774119033658e-12, "loss": 0.0014, "step": 10989 }, { "epoch": 5.0, "grad_norm": 0.01591841757894161, "learning_rate": 1.0214435491873176e-12, "loss": 0.0002, "step": 10990 }, { "epoch": 5.0, "step": 10990, "total_flos": 97535837306880.0, "train_loss": 0.08569821962617845, "train_runtime": 16167.7852, "train_samples_per_second": 2.718, "train_steps_per_second": 0.68 } ], "logging_steps": 1, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 555, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 97535837306880.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }