diff --git "a/contextlm_gpt2_xl/trainer_state.json" "b/contextlm_gpt2_xl/trainer_state.json" new file mode 100644--- /dev/null +++ "b/contextlm_gpt2_xl/trainer_state.json" @@ -0,0 +1,12240 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999854940017697, + "eval_steps": 1000, + "global_step": 17234, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005802399292107286, + "grad_norm": 9.509501457214355, + "learning_rate": 4.176334106728538e-06, + "loss": 10.385, + "step": 10 + }, + { + "epoch": 0.0011604798584214572, + "grad_norm": 2.0677127838134766, + "learning_rate": 8.816705336426914e-06, + "loss": 8.9985, + "step": 20 + }, + { + "epoch": 0.001740719787632186, + "grad_norm": 2.0270631313323975, + "learning_rate": 1.345707656612529e-05, + "loss": 8.659, + "step": 30 + }, + { + "epoch": 0.0023209597168429145, + "grad_norm": 4.9875383377075195, + "learning_rate": 1.8097447795823665e-05, + "loss": 8.3344, + "step": 40 + }, + { + "epoch": 0.002901199646053643, + "grad_norm": 2.1933412551879883, + "learning_rate": 2.273781902552204e-05, + "loss": 8.0255, + "step": 50 + }, + { + "epoch": 0.003481439575264372, + "grad_norm": 3.3649749755859375, + "learning_rate": 2.737819025522042e-05, + "loss": 7.7313, + "step": 60 + }, + { + "epoch": 0.0040616795044751, + "grad_norm": 3.5209426879882812, + "learning_rate": 3.20185614849188e-05, + "loss": 7.4545, + "step": 70 + }, + { + "epoch": 0.004641919433685829, + "grad_norm": 1.8262568712234497, + "learning_rate": 3.665893271461717e-05, + "loss": 7.1915, + "step": 80 + }, + { + "epoch": 0.005222159362896558, + "grad_norm": 1.9789323806762695, + "learning_rate": 4.129930394431555e-05, + "loss": 6.9657, + "step": 90 + }, + { + "epoch": 0.005802399292107286, + "grad_norm": 1.8022093772888184, + "learning_rate": 4.593967517401392e-05, + "loss": 6.8231, + "step": 100 + }, + { + "epoch": 0.006382639221318015, + "grad_norm": 2.8330202102661133, + "learning_rate": 5.05800464037123e-05, + "loss": 6.7133, + "step": 110 + }, + { + "epoch": 0.006962879150528744, + "grad_norm": 1.8549864292144775, + "learning_rate": 5.522041763341067e-05, + "loss": 6.6013, + "step": 120 + }, + { + "epoch": 0.007543119079739472, + "grad_norm": 1.6776927709579468, + "learning_rate": 5.986078886310905e-05, + "loss": 6.5068, + "step": 130 + }, + { + "epoch": 0.0081233590089502, + "grad_norm": 1.2742928266525269, + "learning_rate": 6.450116009280742e-05, + "loss": 6.4149, + "step": 140 + }, + { + "epoch": 0.008703598938160929, + "grad_norm": 1.3102290630340576, + "learning_rate": 6.91415313225058e-05, + "loss": 6.326, + "step": 150 + }, + { + "epoch": 0.009283838867371658, + "grad_norm": 1.5298289060592651, + "learning_rate": 7.378190255220419e-05, + "loss": 6.2688, + "step": 160 + }, + { + "epoch": 0.009864078796582387, + "grad_norm": 1.2847580909729004, + "learning_rate": 7.842227378190256e-05, + "loss": 6.2044, + "step": 170 + }, + { + "epoch": 0.010444318725793116, + "grad_norm": 0.9664031863212585, + "learning_rate": 8.306264501160093e-05, + "loss": 6.1479, + "step": 180 + }, + { + "epoch": 0.011024558655003845, + "grad_norm": 1.668137550354004, + "learning_rate": 8.77030162412993e-05, + "loss": 6.1069, + "step": 190 + }, + { + "epoch": 0.011604798584214572, + "grad_norm": 1.3086931705474854, + "learning_rate": 9.234338747099769e-05, + "loss": 6.0787, + "step": 200 + }, + { + "epoch": 0.012185038513425301, + "grad_norm": 1.773193120956421, + "learning_rate": 9.698375870069606e-05, + "loss": 6.0132, + "step": 210 + }, + { + "epoch": 0.01276527844263603, + "grad_norm": 0.8364600539207458, + "learning_rate": 0.00010162412993039443, + "loss": 5.9783, + "step": 220 + }, + { + "epoch": 0.013345518371846759, + "grad_norm": 1.0191259384155273, + "learning_rate": 0.0001062645011600928, + "loss": 5.9312, + "step": 230 + }, + { + "epoch": 0.013925758301057488, + "grad_norm": 0.904647946357727, + "learning_rate": 0.00011090487238979119, + "loss": 5.924, + "step": 240 + }, + { + "epoch": 0.014505998230268215, + "grad_norm": 0.6738490462303162, + "learning_rate": 0.00011554524361948958, + "loss": 5.8692, + "step": 250 + }, + { + "epoch": 0.015086238159478944, + "grad_norm": 0.7382608652114868, + "learning_rate": 0.00012018561484918794, + "loss": 5.8497, + "step": 260 + }, + { + "epoch": 0.015666478088689675, + "grad_norm": 0.7634956240653992, + "learning_rate": 0.0001248259860788863, + "loss": 5.8059, + "step": 270 + }, + { + "epoch": 0.0162467180179004, + "grad_norm": 0.6887519955635071, + "learning_rate": 0.0001294663573085847, + "loss": 5.7723, + "step": 280 + }, + { + "epoch": 0.01682695794711113, + "grad_norm": 1.0721409320831299, + "learning_rate": 0.00013410672853828308, + "loss": 5.7549, + "step": 290 + }, + { + "epoch": 0.017407197876321858, + "grad_norm": 0.8256045579910278, + "learning_rate": 0.00013874709976798144, + "loss": 5.7243, + "step": 300 + }, + { + "epoch": 0.017987437805532587, + "grad_norm": 0.5584640502929688, + "learning_rate": 0.00014338747099767982, + "loss": 5.6909, + "step": 310 + }, + { + "epoch": 0.018567677734743316, + "grad_norm": 0.6018552780151367, + "learning_rate": 0.0001480278422273782, + "loss": 5.6591, + "step": 320 + }, + { + "epoch": 0.019147917663954045, + "grad_norm": 0.7498115301132202, + "learning_rate": 0.00015266821345707657, + "loss": 5.6199, + "step": 330 + }, + { + "epoch": 0.019728157593164774, + "grad_norm": 0.6623280048370361, + "learning_rate": 0.00015730858468677495, + "loss": 5.5741, + "step": 340 + }, + { + "epoch": 0.020308397522375503, + "grad_norm": 0.5316705703735352, + "learning_rate": 0.0001619489559164733, + "loss": 5.5444, + "step": 350 + }, + { + "epoch": 0.020888637451586232, + "grad_norm": 0.5719560980796814, + "learning_rate": 0.0001665893271461717, + "loss": 5.501, + "step": 360 + }, + { + "epoch": 0.02146887738079696, + "grad_norm": 0.8803657293319702, + "learning_rate": 0.00017122969837587008, + "loss": 5.4982, + "step": 370 + }, + { + "epoch": 0.02204911731000769, + "grad_norm": 0.46674010157585144, + "learning_rate": 0.00017587006960556844, + "loss": 5.4638, + "step": 380 + }, + { + "epoch": 0.022629357239218415, + "grad_norm": 0.9015390276908875, + "learning_rate": 0.00018051044083526683, + "loss": 5.4292, + "step": 390 + }, + { + "epoch": 0.023209597168429144, + "grad_norm": 0.5593010783195496, + "learning_rate": 0.0001851508120649652, + "loss": 5.4009, + "step": 400 + }, + { + "epoch": 0.023789837097639873, + "grad_norm": 0.4915124177932739, + "learning_rate": 0.0001897911832946636, + "loss": 5.3654, + "step": 410 + }, + { + "epoch": 0.024370077026850602, + "grad_norm": 0.6447678208351135, + "learning_rate": 0.00019443155452436196, + "loss": 5.3323, + "step": 420 + }, + { + "epoch": 0.02495031695606133, + "grad_norm": 0.43100541830062866, + "learning_rate": 0.00019907192575406032, + "loss": 5.3044, + "step": 430 + }, + { + "epoch": 0.02553055688527206, + "grad_norm": 0.5044266581535339, + "learning_rate": 0.00020371229698375873, + "loss": 5.2817, + "step": 440 + }, + { + "epoch": 0.02611079681448279, + "grad_norm": 0.44544899463653564, + "learning_rate": 0.00020835266821345706, + "loss": 5.2623, + "step": 450 + }, + { + "epoch": 0.026691036743693518, + "grad_norm": 0.6966679096221924, + "learning_rate": 0.00021299303944315545, + "loss": 5.2158, + "step": 460 + }, + { + "epoch": 0.027271276672904247, + "grad_norm": 0.6552466154098511, + "learning_rate": 0.00021763341067285383, + "loss": 5.1987, + "step": 470 + }, + { + "epoch": 0.027851516602114976, + "grad_norm": 0.3580920398235321, + "learning_rate": 0.00022227378190255222, + "loss": 5.1516, + "step": 480 + }, + { + "epoch": 0.028431756531325705, + "grad_norm": 0.6580965518951416, + "learning_rate": 0.0002269141531322506, + "loss": 5.1102, + "step": 490 + }, + { + "epoch": 0.02901199646053643, + "grad_norm": 0.40035542845726013, + "learning_rate": 0.000231554524361949, + "loss": 5.086, + "step": 500 + }, + { + "epoch": 0.02959223638974716, + "grad_norm": 0.46560588479042053, + "learning_rate": 0.00023619489559164735, + "loss": 5.0338, + "step": 510 + }, + { + "epoch": 0.030172476318957888, + "grad_norm": 0.610862135887146, + "learning_rate": 0.00024083526682134573, + "loss": 5.0033, + "step": 520 + }, + { + "epoch": 0.030752716248168617, + "grad_norm": 0.43752819299697876, + "learning_rate": 0.00024547563805104406, + "loss": 4.9667, + "step": 530 + }, + { + "epoch": 0.03133295617737935, + "grad_norm": 0.38704100251197815, + "learning_rate": 0.00025011600928074245, + "loss": 4.9253, + "step": 540 + }, + { + "epoch": 0.031913196106590075, + "grad_norm": 0.4786874055862427, + "learning_rate": 0.00025475638051044084, + "loss": 4.9177, + "step": 550 + }, + { + "epoch": 0.0324934360358008, + "grad_norm": 0.36762577295303345, + "learning_rate": 0.0002593967517401392, + "loss": 4.8879, + "step": 560 + }, + { + "epoch": 0.03307367596501153, + "grad_norm": 0.407249391078949, + "learning_rate": 0.0002640371229698376, + "loss": 4.8372, + "step": 570 + }, + { + "epoch": 0.03365391589422226, + "grad_norm": 0.4547578692436218, + "learning_rate": 0.000268677494199536, + "loss": 4.8013, + "step": 580 + }, + { + "epoch": 0.03423415582343299, + "grad_norm": 0.34854525327682495, + "learning_rate": 0.0002733178654292344, + "loss": 4.7609, + "step": 590 + }, + { + "epoch": 0.034814395752643716, + "grad_norm": 0.33431318402290344, + "learning_rate": 0.00027795823665893276, + "loss": 4.7366, + "step": 600 + }, + { + "epoch": 0.03539463568185445, + "grad_norm": 0.4790738523006439, + "learning_rate": 0.0002825986078886311, + "loss": 4.7052, + "step": 610 + }, + { + "epoch": 0.035974875611065174, + "grad_norm": 0.4698006510734558, + "learning_rate": 0.0002872389791183295, + "loss": 4.7075, + "step": 620 + }, + { + "epoch": 0.036555115540275906, + "grad_norm": 0.32214587926864624, + "learning_rate": 0.00029187935034802787, + "loss": 4.6669, + "step": 630 + }, + { + "epoch": 0.03713535546948663, + "grad_norm": 0.29147374629974365, + "learning_rate": 0.00029651972157772625, + "loss": 4.6087, + "step": 640 + }, + { + "epoch": 0.037715595398697364, + "grad_norm": 0.37832900881767273, + "learning_rate": 0.0003011600928074246, + "loss": 4.5891, + "step": 650 + }, + { + "epoch": 0.03829583532790809, + "grad_norm": 0.4138408899307251, + "learning_rate": 0.00030580046403712297, + "loss": 4.5831, + "step": 660 + }, + { + "epoch": 0.038876075257118815, + "grad_norm": 0.3426309823989868, + "learning_rate": 0.00031044083526682135, + "loss": 4.5467, + "step": 670 + }, + { + "epoch": 0.03945631518632955, + "grad_norm": 0.3773086369037628, + "learning_rate": 0.00031508120649651974, + "loss": 4.5238, + "step": 680 + }, + { + "epoch": 0.04003655511554027, + "grad_norm": 0.39989471435546875, + "learning_rate": 0.00031972157772621807, + "loss": 4.5152, + "step": 690 + }, + { + "epoch": 0.040616795044751006, + "grad_norm": 0.40890148282051086, + "learning_rate": 0.00032436194895591646, + "loss": 4.46, + "step": 700 + }, + { + "epoch": 0.04119703497396173, + "grad_norm": 0.38713014125823975, + "learning_rate": 0.00032900232018561484, + "loss": 4.4458, + "step": 710 + }, + { + "epoch": 0.041777274903172464, + "grad_norm": 0.3816765248775482, + "learning_rate": 0.00033364269141531323, + "loss": 4.4313, + "step": 720 + }, + { + "epoch": 0.04235751483238319, + "grad_norm": 0.2601516842842102, + "learning_rate": 0.0003382830626450116, + "loss": 4.3935, + "step": 730 + }, + { + "epoch": 0.04293775476159392, + "grad_norm": 0.5483155250549316, + "learning_rate": 0.00034292343387471, + "loss": 4.3647, + "step": 740 + }, + { + "epoch": 0.04351799469080465, + "grad_norm": 0.41322359442710876, + "learning_rate": 0.0003475638051044084, + "loss": 4.3641, + "step": 750 + }, + { + "epoch": 0.04409823462001538, + "grad_norm": 0.2739952802658081, + "learning_rate": 0.00035220417633410677, + "loss": 4.3179, + "step": 760 + }, + { + "epoch": 0.044678474549226105, + "grad_norm": 0.311780720949173, + "learning_rate": 0.00035684454756380516, + "loss": 4.2882, + "step": 770 + }, + { + "epoch": 0.04525871447843683, + "grad_norm": 0.34148845076560974, + "learning_rate": 0.0003614849187935035, + "loss": 4.2675, + "step": 780 + }, + { + "epoch": 0.04583895440764756, + "grad_norm": 0.3027192950248718, + "learning_rate": 0.0003661252900232019, + "loss": 4.2453, + "step": 790 + }, + { + "epoch": 0.04641919433685829, + "grad_norm": 0.31899961829185486, + "learning_rate": 0.00037076566125290026, + "loss": 4.2297, + "step": 800 + }, + { + "epoch": 0.04699943426606902, + "grad_norm": 0.29658201336860657, + "learning_rate": 0.00037540603248259865, + "loss": 4.2002, + "step": 810 + }, + { + "epoch": 0.047579674195279746, + "grad_norm": 0.3228875696659088, + "learning_rate": 0.00038004640371229703, + "loss": 4.1797, + "step": 820 + }, + { + "epoch": 0.04815991412449048, + "grad_norm": 0.3000330328941345, + "learning_rate": 0.00038468677494199536, + "loss": 4.1557, + "step": 830 + }, + { + "epoch": 0.048740154053701204, + "grad_norm": 0.3490864336490631, + "learning_rate": 0.00038932714617169375, + "loss": 4.1553, + "step": 840 + }, + { + "epoch": 0.049320393982911936, + "grad_norm": 0.3470998704433441, + "learning_rate": 0.00039396751740139213, + "loss": 4.1359, + "step": 850 + }, + { + "epoch": 0.04990063391212266, + "grad_norm": 0.28483402729034424, + "learning_rate": 0.00039860788863109047, + "loss": 4.1102, + "step": 860 + }, + { + "epoch": 0.050480873841333394, + "grad_norm": 0.2661000192165375, + "learning_rate": 0.0003999998195768387, + "loss": 4.089, + "step": 870 + }, + { + "epoch": 0.05106111377054412, + "grad_norm": 0.28349924087524414, + "learning_rate": 0.0003999989358723423, + "loss": 4.0595, + "step": 880 + }, + { + "epoch": 0.051641353699754845, + "grad_norm": 0.31661197543144226, + "learning_rate": 0.0003999973157508127, + "loss": 4.059, + "step": 890 + }, + { + "epoch": 0.05222159362896558, + "grad_norm": 0.28986701369285583, + "learning_rate": 0.0003999949592182153, + "loss": 4.0408, + "step": 900 + }, + { + "epoch": 0.0528018335581763, + "grad_norm": 0.21339298784732819, + "learning_rate": 0.0003999918662832272, + "loss": 4.0218, + "step": 910 + }, + { + "epoch": 0.053382073487387036, + "grad_norm": 0.3597091734409332, + "learning_rate": 0.00039998803695723685, + "loss": 3.9897, + "step": 920 + }, + { + "epoch": 0.05396231341659776, + "grad_norm": 0.3721710443496704, + "learning_rate": 0.0003999834712543442, + "loss": 3.9904, + "step": 930 + }, + { + "epoch": 0.054542553345808494, + "grad_norm": 0.2166401445865631, + "learning_rate": 0.0003999781691913607, + "loss": 3.9665, + "step": 940 + }, + { + "epoch": 0.05512279327501922, + "grad_norm": 0.24356764554977417, + "learning_rate": 0.00039997213078780903, + "loss": 3.9508, + "step": 950 + }, + { + "epoch": 0.05570303320422995, + "grad_norm": 0.21014730632305145, + "learning_rate": 0.00039996535606592334, + "loss": 3.9392, + "step": 960 + }, + { + "epoch": 0.05628327313344068, + "grad_norm": 0.26985248923301697, + "learning_rate": 0.0003999578450506487, + "loss": 3.9171, + "step": 970 + }, + { + "epoch": 0.05686351306265141, + "grad_norm": 0.28635284304618835, + "learning_rate": 0.00039994959776964165, + "loss": 3.9142, + "step": 980 + }, + { + "epoch": 0.057443752991862135, + "grad_norm": 0.26228785514831543, + "learning_rate": 0.0003999406142532694, + "loss": 3.8858, + "step": 990 + }, + { + "epoch": 0.05802399292107286, + "grad_norm": 0.24055221676826477, + "learning_rate": 0.00039993089453461023, + "loss": 3.886, + "step": 1000 + }, + { + "epoch": 0.05802399292107286, + "eval_loss": 3.8423373699188232, + "eval_runtime": 5.4409, + "eval_samples_per_second": 795.825, + "eval_steps_per_second": 1.654, + "step": 1000 + }, + { + "epoch": 0.05860423285028359, + "grad_norm": 0.23449479043483734, + "learning_rate": 0.00039992043864945325, + "loss": 3.8626, + "step": 1010 + }, + { + "epoch": 0.05918447277949432, + "grad_norm": 0.24179957807064056, + "learning_rate": 0.00039990924663629797, + "loss": 3.847, + "step": 1020 + }, + { + "epoch": 0.05976471270870505, + "grad_norm": 0.23391254246234894, + "learning_rate": 0.00039989731853635465, + "loss": 3.8457, + "step": 1030 + }, + { + "epoch": 0.060344952637915776, + "grad_norm": 0.3046986758708954, + "learning_rate": 0.0003998846543935438, + "loss": 3.8457, + "step": 1040 + }, + { + "epoch": 0.06092519256712651, + "grad_norm": 0.2446785569190979, + "learning_rate": 0.00039987125425449603, + "loss": 3.8428, + "step": 1050 + }, + { + "epoch": 0.061505432496337234, + "grad_norm": 0.21244779229164124, + "learning_rate": 0.00039985711816855224, + "loss": 3.8053, + "step": 1060 + }, + { + "epoch": 0.062085672425547966, + "grad_norm": 0.23864899575710297, + "learning_rate": 0.00039984224618776285, + "loss": 3.8097, + "step": 1070 + }, + { + "epoch": 0.0626659123547587, + "grad_norm": 0.2297637015581131, + "learning_rate": 0.0003998266383668881, + "loss": 3.7986, + "step": 1080 + }, + { + "epoch": 0.06324615228396942, + "grad_norm": 0.21897932887077332, + "learning_rate": 0.0003998102947633975, + "loss": 3.7788, + "step": 1090 + }, + { + "epoch": 0.06382639221318015, + "grad_norm": 0.2325911521911621, + "learning_rate": 0.0003997932154374701, + "loss": 3.778, + "step": 1100 + }, + { + "epoch": 0.06440663214239088, + "grad_norm": 0.2542111277580261, + "learning_rate": 0.0003997754004519936, + "loss": 3.7512, + "step": 1110 + }, + { + "epoch": 0.0649868720716016, + "grad_norm": 0.22122596204280853, + "learning_rate": 0.00039975684987256476, + "loss": 3.7449, + "step": 1120 + }, + { + "epoch": 0.06556711200081233, + "grad_norm": 0.21252059936523438, + "learning_rate": 0.00039973756376748875, + "loss": 3.7379, + "step": 1130 + }, + { + "epoch": 0.06614735193002307, + "grad_norm": 0.2249770313501358, + "learning_rate": 0.0003997175422077789, + "loss": 3.7369, + "step": 1140 + }, + { + "epoch": 0.0667275918592338, + "grad_norm": 0.1798800528049469, + "learning_rate": 0.00039969678526715686, + "loss": 3.7269, + "step": 1150 + }, + { + "epoch": 0.06730783178844452, + "grad_norm": 0.22175556421279907, + "learning_rate": 0.0003996752930220518, + "loss": 3.7057, + "step": 1160 + }, + { + "epoch": 0.06788807171765525, + "grad_norm": 0.1973702758550644, + "learning_rate": 0.0003996530655516003, + "loss": 3.6938, + "step": 1170 + }, + { + "epoch": 0.06846831164686598, + "grad_norm": 0.17350292205810547, + "learning_rate": 0.00039963010293764646, + "loss": 3.6902, + "step": 1180 + }, + { + "epoch": 0.06904855157607671, + "grad_norm": 0.44916781783103943, + "learning_rate": 0.0003996064052647408, + "loss": 3.7085, + "step": 1190 + }, + { + "epoch": 0.06962879150528743, + "grad_norm": 0.17633803188800812, + "learning_rate": 0.0003995819726201408, + "loss": 3.7162, + "step": 1200 + }, + { + "epoch": 0.07020903143449816, + "grad_norm": 0.18631690740585327, + "learning_rate": 0.00039955680509380995, + "loss": 3.6841, + "step": 1210 + }, + { + "epoch": 0.0707892713637089, + "grad_norm": 0.2087530940771103, + "learning_rate": 0.0003995309027784177, + "loss": 3.6755, + "step": 1220 + }, + { + "epoch": 0.07136951129291962, + "grad_norm": 0.2354528307914734, + "learning_rate": 0.0003995042657693391, + "loss": 3.6574, + "step": 1230 + }, + { + "epoch": 0.07194975122213035, + "grad_norm": 0.20092642307281494, + "learning_rate": 0.00039947689416465444, + "loss": 3.6568, + "step": 1240 + }, + { + "epoch": 0.07252999115134108, + "grad_norm": 0.17386938631534576, + "learning_rate": 0.00039944878806514884, + "loss": 3.6434, + "step": 1250 + }, + { + "epoch": 0.07311023108055181, + "grad_norm": 0.22579102218151093, + "learning_rate": 0.0003994199475743119, + "loss": 3.6382, + "step": 1260 + }, + { + "epoch": 0.07369047100976253, + "grad_norm": 0.1960754096508026, + "learning_rate": 0.0003993903727983373, + "loss": 3.6343, + "step": 1270 + }, + { + "epoch": 0.07427071093897326, + "grad_norm": 0.35644450783729553, + "learning_rate": 0.00039936006384612237, + "loss": 3.6366, + "step": 1280 + }, + { + "epoch": 0.074850950868184, + "grad_norm": 0.15999135375022888, + "learning_rate": 0.000399329020829268, + "loss": 3.6337, + "step": 1290 + }, + { + "epoch": 0.07543119079739473, + "grad_norm": 0.1807032823562622, + "learning_rate": 0.00039929724386207784, + "loss": 3.6182, + "step": 1300 + }, + { + "epoch": 0.07601143072660545, + "grad_norm": 0.19118531048297882, + "learning_rate": 0.00039926473306155794, + "loss": 3.6138, + "step": 1310 + }, + { + "epoch": 0.07659167065581618, + "grad_norm": 0.20418357849121094, + "learning_rate": 0.00039923148854741644, + "loss": 3.6009, + "step": 1320 + }, + { + "epoch": 0.07717191058502691, + "grad_norm": 0.17397375404834747, + "learning_rate": 0.0003991975104420632, + "loss": 3.5977, + "step": 1330 + }, + { + "epoch": 0.07775215051423763, + "grad_norm": 0.18904046714305878, + "learning_rate": 0.0003991627988706091, + "loss": 3.5917, + "step": 1340 + }, + { + "epoch": 0.07833239044344836, + "grad_norm": 0.1725725531578064, + "learning_rate": 0.0003991273539608658, + "loss": 3.5848, + "step": 1350 + }, + { + "epoch": 0.0789126303726591, + "grad_norm": 0.1786000281572342, + "learning_rate": 0.0003990911758433452, + "loss": 3.589, + "step": 1360 + }, + { + "epoch": 0.07949287030186983, + "grad_norm": 0.1812940239906311, + "learning_rate": 0.00039905426465125895, + "loss": 3.5774, + "step": 1370 + }, + { + "epoch": 0.08007311023108055, + "grad_norm": 0.17902950942516327, + "learning_rate": 0.00039901662052051787, + "loss": 3.5683, + "step": 1380 + }, + { + "epoch": 0.08065335016029128, + "grad_norm": 0.25934261083602905, + "learning_rate": 0.0003989782435897316, + "loss": 3.555, + "step": 1390 + }, + { + "epoch": 0.08123359008950201, + "grad_norm": 0.16889074444770813, + "learning_rate": 0.00039893913400020797, + "loss": 3.5638, + "step": 1400 + }, + { + "epoch": 0.08181383001871274, + "grad_norm": 0.1863388568162918, + "learning_rate": 0.00039889929189595264, + "loss": 3.5548, + "step": 1410 + }, + { + "epoch": 0.08239406994792346, + "grad_norm": 0.19880317151546478, + "learning_rate": 0.00039885871742366847, + "loss": 3.5539, + "step": 1420 + }, + { + "epoch": 0.0829743098771342, + "grad_norm": 0.2024102807044983, + "learning_rate": 0.00039881741073275476, + "loss": 3.5467, + "step": 1430 + }, + { + "epoch": 0.08355454980634493, + "grad_norm": 0.18990029394626617, + "learning_rate": 0.0003987753719753073, + "loss": 3.5297, + "step": 1440 + }, + { + "epoch": 0.08413478973555565, + "grad_norm": 0.15605872869491577, + "learning_rate": 0.00039873260130611694, + "loss": 3.5173, + "step": 1450 + }, + { + "epoch": 0.08471502966476638, + "grad_norm": 0.19115321338176727, + "learning_rate": 0.00039868909888267, + "loss": 3.5208, + "step": 1460 + }, + { + "epoch": 0.08529526959397711, + "grad_norm": 0.20802512764930725, + "learning_rate": 0.0003986448648651468, + "loss": 3.5295, + "step": 1470 + }, + { + "epoch": 0.08587550952318784, + "grad_norm": 0.207048699259758, + "learning_rate": 0.0003985998994164216, + "loss": 3.5266, + "step": 1480 + }, + { + "epoch": 0.08645574945239856, + "grad_norm": 0.21221551299095154, + "learning_rate": 0.00039855420270206213, + "loss": 3.5165, + "step": 1490 + }, + { + "epoch": 0.0870359893816093, + "grad_norm": 0.17720042169094086, + "learning_rate": 0.0003985077748903282, + "loss": 3.4996, + "step": 1500 + }, + { + "epoch": 0.08761622931082003, + "grad_norm": 0.1978517323732376, + "learning_rate": 0.0003984606161521721, + "loss": 3.4972, + "step": 1510 + }, + { + "epoch": 0.08819646924003076, + "grad_norm": 0.21575991809368134, + "learning_rate": 0.00039841272666123705, + "loss": 3.4846, + "step": 1520 + }, + { + "epoch": 0.08877670916924148, + "grad_norm": 0.16001969575881958, + "learning_rate": 0.0003983641065938573, + "loss": 3.4937, + "step": 1530 + }, + { + "epoch": 0.08935694909845221, + "grad_norm": 0.18556085228919983, + "learning_rate": 0.00039831475612905697, + "loss": 3.4797, + "step": 1540 + }, + { + "epoch": 0.08993718902766294, + "grad_norm": 0.193649560213089, + "learning_rate": 0.00039826467544854975, + "loss": 3.4808, + "step": 1550 + }, + { + "epoch": 0.09051742895687366, + "grad_norm": 0.18965889513492584, + "learning_rate": 0.00039821386473673775, + "loss": 3.4831, + "step": 1560 + }, + { + "epoch": 0.09109766888608439, + "grad_norm": 0.19180066883563995, + "learning_rate": 0.00039816232418071155, + "loss": 3.4768, + "step": 1570 + }, + { + "epoch": 0.09167790881529513, + "grad_norm": 0.18388795852661133, + "learning_rate": 0.0003981100539702487, + "loss": 3.4739, + "step": 1580 + }, + { + "epoch": 0.09225814874450586, + "grad_norm": 0.18927408754825592, + "learning_rate": 0.00039805705429781375, + "loss": 3.4572, + "step": 1590 + }, + { + "epoch": 0.09283838867371658, + "grad_norm": 0.1485849916934967, + "learning_rate": 0.00039800332535855695, + "loss": 3.4651, + "step": 1600 + }, + { + "epoch": 0.09341862860292731, + "grad_norm": 0.15955494344234467, + "learning_rate": 0.0003979488673503138, + "loss": 3.4644, + "step": 1610 + }, + { + "epoch": 0.09399886853213804, + "grad_norm": 0.17274880409240723, + "learning_rate": 0.0003978936804736046, + "loss": 3.4548, + "step": 1620 + }, + { + "epoch": 0.09457910846134877, + "grad_norm": 0.19138464331626892, + "learning_rate": 0.00039783776493163307, + "loss": 3.4504, + "step": 1630 + }, + { + "epoch": 0.09515934839055949, + "grad_norm": 0.19574107229709625, + "learning_rate": 0.0003977811209302861, + "loss": 3.4505, + "step": 1640 + }, + { + "epoch": 0.09573958831977022, + "grad_norm": 0.14684069156646729, + "learning_rate": 0.0003977237486781329, + "loss": 3.4536, + "step": 1650 + }, + { + "epoch": 0.09631982824898096, + "grad_norm": 0.17721091210842133, + "learning_rate": 0.00039766564838642404, + "loss": 3.4438, + "step": 1660 + }, + { + "epoch": 0.09690006817819168, + "grad_norm": 0.19373926520347595, + "learning_rate": 0.00039760682026909093, + "loss": 3.4303, + "step": 1670 + }, + { + "epoch": 0.09748030810740241, + "grad_norm": 0.15782880783081055, + "learning_rate": 0.00039754726454274485, + "loss": 3.4287, + "step": 1680 + }, + { + "epoch": 0.09806054803661314, + "grad_norm": 0.15952658653259277, + "learning_rate": 0.00039748698142667616, + "loss": 3.4409, + "step": 1690 + }, + { + "epoch": 0.09864078796582387, + "grad_norm": 0.1631140410900116, + "learning_rate": 0.00039742597114285377, + "loss": 3.4218, + "step": 1700 + }, + { + "epoch": 0.09922102789503459, + "grad_norm": 0.20951348543167114, + "learning_rate": 0.0003973642339159237, + "loss": 3.4259, + "step": 1710 + }, + { + "epoch": 0.09980126782424532, + "grad_norm": 0.18334078788757324, + "learning_rate": 0.0003973017699732092, + "loss": 3.4249, + "step": 1720 + }, + { + "epoch": 0.10038150775345606, + "grad_norm": 0.15507562458515167, + "learning_rate": 0.0003972385795447087, + "loss": 3.4177, + "step": 1730 + }, + { + "epoch": 0.10096174768266679, + "grad_norm": 0.19322511553764343, + "learning_rate": 0.0003971746628630962, + "loss": 3.4167, + "step": 1740 + }, + { + "epoch": 0.10154198761187751, + "grad_norm": 0.17755462229251862, + "learning_rate": 0.0003971100201637196, + "loss": 3.4152, + "step": 1750 + }, + { + "epoch": 0.10212222754108824, + "grad_norm": 0.1856836974620819, + "learning_rate": 0.0003970446516846, + "loss": 3.3932, + "step": 1760 + }, + { + "epoch": 0.10270246747029897, + "grad_norm": 0.19244609773159027, + "learning_rate": 0.0003969785576664311, + "loss": 3.4051, + "step": 1770 + }, + { + "epoch": 0.10328270739950969, + "grad_norm": 0.181436225771904, + "learning_rate": 0.0003969117383525779, + "loss": 3.3995, + "step": 1780 + }, + { + "epoch": 0.10386294732872042, + "grad_norm": 0.1531616598367691, + "learning_rate": 0.0003968441939890762, + "loss": 3.4055, + "step": 1790 + }, + { + "epoch": 0.10444318725793116, + "grad_norm": 0.1904386729001999, + "learning_rate": 0.00039677592482463135, + "loss": 3.3906, + "step": 1800 + }, + { + "epoch": 0.10502342718714189, + "grad_norm": 0.17353110015392303, + "learning_rate": 0.0003967069311106176, + "loss": 3.3925, + "step": 1810 + }, + { + "epoch": 0.1056036671163526, + "grad_norm": 0.15772603452205658, + "learning_rate": 0.000396637213101077, + "loss": 3.392, + "step": 1820 + }, + { + "epoch": 0.10618390704556334, + "grad_norm": 0.18432147800922394, + "learning_rate": 0.00039656677105271863, + "loss": 3.3907, + "step": 1830 + }, + { + "epoch": 0.10676414697477407, + "grad_norm": 0.16060298681259155, + "learning_rate": 0.0003964956052249174, + "loss": 3.3801, + "step": 1840 + }, + { + "epoch": 0.1073443869039848, + "grad_norm": 0.17931994795799255, + "learning_rate": 0.00039642371587971344, + "loss": 3.3788, + "step": 1850 + }, + { + "epoch": 0.10792462683319552, + "grad_norm": 0.15293100476264954, + "learning_rate": 0.0003963511032818108, + "loss": 3.3751, + "step": 1860 + }, + { + "epoch": 0.10850486676240625, + "grad_norm": 0.17011559009552002, + "learning_rate": 0.0003962777676985767, + "loss": 3.3794, + "step": 1870 + }, + { + "epoch": 0.10908510669161699, + "grad_norm": 0.13994856178760529, + "learning_rate": 0.00039620370940004037, + "loss": 3.3718, + "step": 1880 + }, + { + "epoch": 0.1096653466208277, + "grad_norm": 0.17306166887283325, + "learning_rate": 0.0003961289286588923, + "loss": 3.3733, + "step": 1890 + }, + { + "epoch": 0.11024558655003844, + "grad_norm": 0.19824837148189545, + "learning_rate": 0.000396053425750483, + "loss": 3.3625, + "step": 1900 + }, + { + "epoch": 0.11082582647924917, + "grad_norm": 0.16269779205322266, + "learning_rate": 0.00039597720095282203, + "loss": 3.3623, + "step": 1910 + }, + { + "epoch": 0.1114060664084599, + "grad_norm": 0.16221415996551514, + "learning_rate": 0.00039590025454657715, + "loss": 3.3556, + "step": 1920 + }, + { + "epoch": 0.11198630633767062, + "grad_norm": 0.16675835847854614, + "learning_rate": 0.000395822586815073, + "loss": 3.3626, + "step": 1930 + }, + { + "epoch": 0.11256654626688135, + "grad_norm": 0.1594047099351883, + "learning_rate": 0.0003957441980442904, + "loss": 3.3613, + "step": 1940 + }, + { + "epoch": 0.11314678619609209, + "grad_norm": 0.17084339261054993, + "learning_rate": 0.000395665088522865, + "loss": 3.3418, + "step": 1950 + }, + { + "epoch": 0.11372702612530282, + "grad_norm": 0.17646485567092896, + "learning_rate": 0.00039558525854208634, + "loss": 3.3425, + "step": 1960 + }, + { + "epoch": 0.11430726605451354, + "grad_norm": 0.16079320013523102, + "learning_rate": 0.00039550470839589666, + "loss": 3.3522, + "step": 1970 + }, + { + "epoch": 0.11488750598372427, + "grad_norm": 0.16689667105674744, + "learning_rate": 0.00039542343838089024, + "loss": 3.3528, + "step": 1980 + }, + { + "epoch": 0.115467745912935, + "grad_norm": 0.15823589265346527, + "learning_rate": 0.00039534144879631165, + "loss": 3.3392, + "step": 1990 + }, + { + "epoch": 0.11604798584214572, + "grad_norm": 0.1432604044675827, + "learning_rate": 0.00039525873994405514, + "loss": 3.3459, + "step": 2000 + }, + { + "epoch": 0.11604798584214572, + "eval_loss": 3.3026654720306396, + "eval_runtime": 5.3961, + "eval_samples_per_second": 802.435, + "eval_steps_per_second": 1.668, + "step": 2000 + }, + { + "epoch": 0.11662822577135645, + "grad_norm": 0.21145105361938477, + "learning_rate": 0.0003951753121286634, + "loss": 3.3382, + "step": 2010 + }, + { + "epoch": 0.11720846570056719, + "grad_norm": 0.1759408414363861, + "learning_rate": 0.00039509116565732643, + "loss": 3.3362, + "step": 2020 + }, + { + "epoch": 0.11778870562977792, + "grad_norm": 0.15327772498130798, + "learning_rate": 0.0003950063008398802, + "loss": 3.3375, + "step": 2030 + }, + { + "epoch": 0.11836894555898864, + "grad_norm": 0.16556823253631592, + "learning_rate": 0.000394920717988806, + "loss": 3.329, + "step": 2040 + }, + { + "epoch": 0.11894918548819937, + "grad_norm": 0.16622750461101532, + "learning_rate": 0.0003948344174192288, + "loss": 3.3216, + "step": 2050 + }, + { + "epoch": 0.1195294254174101, + "grad_norm": 0.17805641889572144, + "learning_rate": 0.00039474739944891636, + "loss": 3.3326, + "step": 2060 + }, + { + "epoch": 0.12010966534662083, + "grad_norm": 0.15195246040821075, + "learning_rate": 0.000394659664398278, + "loss": 3.3334, + "step": 2070 + }, + { + "epoch": 0.12068990527583155, + "grad_norm": 0.16342763602733612, + "learning_rate": 0.0003945712125903632, + "loss": 3.3203, + "step": 2080 + }, + { + "epoch": 0.12127014520504228, + "grad_norm": 0.15589125454425812, + "learning_rate": 0.00039448204435086096, + "loss": 3.3184, + "step": 2090 + }, + { + "epoch": 0.12185038513425302, + "grad_norm": 0.1632692962884903, + "learning_rate": 0.000394392160008098, + "loss": 3.3093, + "step": 2100 + }, + { + "epoch": 0.12243062506346374, + "grad_norm": 0.17584744095802307, + "learning_rate": 0.000394301559893038, + "loss": 3.3245, + "step": 2110 + }, + { + "epoch": 0.12301086499267447, + "grad_norm": 0.17610713839530945, + "learning_rate": 0.0003942102443392799, + "loss": 3.3209, + "step": 2120 + }, + { + "epoch": 0.1235911049218852, + "grad_norm": 0.17064009606838226, + "learning_rate": 0.00039411821368305725, + "loss": 3.3037, + "step": 2130 + }, + { + "epoch": 0.12417134485109593, + "grad_norm": 0.1743483990430832, + "learning_rate": 0.00039402546826323645, + "loss": 3.3111, + "step": 2140 + }, + { + "epoch": 0.12475158478030665, + "grad_norm": 0.15367501974105835, + "learning_rate": 0.000393932008421316, + "loss": 3.3087, + "step": 2150 + }, + { + "epoch": 0.1253318247095174, + "grad_norm": 0.21368132531642914, + "learning_rate": 0.00039383783450142474, + "loss": 3.3015, + "step": 2160 + }, + { + "epoch": 0.12591206463872812, + "grad_norm": 0.15691019594669342, + "learning_rate": 0.00039374294685032095, + "loss": 3.3067, + "step": 2170 + }, + { + "epoch": 0.12649230456793883, + "grad_norm": 0.164012610912323, + "learning_rate": 0.00039364734581739084, + "loss": 3.3069, + "step": 2180 + }, + { + "epoch": 0.12707254449714958, + "grad_norm": 0.14891429245471954, + "learning_rate": 0.0003935510317546475, + "loss": 3.3027, + "step": 2190 + }, + { + "epoch": 0.1276527844263603, + "grad_norm": 0.17328360676765442, + "learning_rate": 0.0003934540050167294, + "loss": 3.3122, + "step": 2200 + }, + { + "epoch": 0.12823302435557102, + "grad_norm": 0.16395007073879242, + "learning_rate": 0.00039335626596089906, + "loss": 3.2821, + "step": 2210 + }, + { + "epoch": 0.12881326428478176, + "grad_norm": 0.1645556390285492, + "learning_rate": 0.00039325781494704197, + "loss": 3.2988, + "step": 2220 + }, + { + "epoch": 0.12939350421399248, + "grad_norm": 0.22876814007759094, + "learning_rate": 0.0003931586523376652, + "loss": 3.3028, + "step": 2230 + }, + { + "epoch": 0.1299737441432032, + "grad_norm": 0.14323562383651733, + "learning_rate": 0.00039305877849789565, + "loss": 3.2887, + "step": 2240 + }, + { + "epoch": 0.13055398407241395, + "grad_norm": 0.15895529091358185, + "learning_rate": 0.0003929581937954794, + "loss": 3.2914, + "step": 2250 + }, + { + "epoch": 0.13113422400162467, + "grad_norm": 0.1550154834985733, + "learning_rate": 0.0003928568986007798, + "loss": 3.2814, + "step": 2260 + }, + { + "epoch": 0.1317144639308354, + "grad_norm": 0.19095434248447418, + "learning_rate": 0.00039275489328677646, + "loss": 3.2766, + "step": 2270 + }, + { + "epoch": 0.13229470386004613, + "grad_norm": 0.15697865188121796, + "learning_rate": 0.0003926521782290635, + "loss": 3.2826, + "step": 2280 + }, + { + "epoch": 0.13287494378925685, + "grad_norm": 0.16411994397640228, + "learning_rate": 0.00039254875380584863, + "loss": 3.2737, + "step": 2290 + }, + { + "epoch": 0.1334551837184676, + "grad_norm": 0.18466068804264069, + "learning_rate": 0.00039244462039795137, + "loss": 3.2768, + "step": 2300 + }, + { + "epoch": 0.13403542364767831, + "grad_norm": 0.15912406146526337, + "learning_rate": 0.00039233977838880183, + "loss": 3.2842, + "step": 2310 + }, + { + "epoch": 0.13461566357688903, + "grad_norm": 0.16722336411476135, + "learning_rate": 0.0003922342281644393, + "loss": 3.2664, + "step": 2320 + }, + { + "epoch": 0.13519590350609978, + "grad_norm": 0.1424356997013092, + "learning_rate": 0.00039212797011351066, + "loss": 3.2694, + "step": 2330 + }, + { + "epoch": 0.1357761434353105, + "grad_norm": 0.16306522488594055, + "learning_rate": 0.0003920210046272693, + "loss": 3.2668, + "step": 2340 + }, + { + "epoch": 0.13635638336452122, + "grad_norm": 0.18254053592681885, + "learning_rate": 0.00039191333209957335, + "loss": 3.2637, + "step": 2350 + }, + { + "epoch": 0.13693662329373196, + "grad_norm": 0.1430349200963974, + "learning_rate": 0.0003918049529268843, + "loss": 3.2579, + "step": 2360 + }, + { + "epoch": 0.13751686322294268, + "grad_norm": 0.15292882919311523, + "learning_rate": 0.00039169586750826564, + "loss": 3.2661, + "step": 2370 + }, + { + "epoch": 0.13809710315215343, + "grad_norm": 0.20139199495315552, + "learning_rate": 0.00039158607624538124, + "loss": 3.2595, + "step": 2380 + }, + { + "epoch": 0.13867734308136415, + "grad_norm": 0.14147156476974487, + "learning_rate": 0.0003914755795424941, + "loss": 3.2484, + "step": 2390 + }, + { + "epoch": 0.13925758301057486, + "grad_norm": 0.14732101559638977, + "learning_rate": 0.0003913643778064646, + "loss": 3.2564, + "step": 2400 + }, + { + "epoch": 0.1398378229397856, + "grad_norm": 0.18487505614757538, + "learning_rate": 0.00039125247144674923, + "loss": 3.245, + "step": 2410 + }, + { + "epoch": 0.14041806286899633, + "grad_norm": 0.1768069863319397, + "learning_rate": 0.0003911398608753989, + "loss": 3.2564, + "step": 2420 + }, + { + "epoch": 0.14099830279820705, + "grad_norm": 0.15146300196647644, + "learning_rate": 0.0003910265465070576, + "loss": 3.2616, + "step": 2430 + }, + { + "epoch": 0.1415785427274178, + "grad_norm": 0.15523262321949005, + "learning_rate": 0.00039091252875896054, + "loss": 3.2485, + "step": 2440 + }, + { + "epoch": 0.1421587826566285, + "grad_norm": 0.18047015368938446, + "learning_rate": 0.0003907978080509332, + "loss": 3.25, + "step": 2450 + }, + { + "epoch": 0.14273902258583923, + "grad_norm": 0.15500716865062714, + "learning_rate": 0.00039068238480538916, + "loss": 3.2571, + "step": 2460 + }, + { + "epoch": 0.14331926251504998, + "grad_norm": 0.1513693481683731, + "learning_rate": 0.0003905662594473289, + "loss": 3.2598, + "step": 2470 + }, + { + "epoch": 0.1438995024442607, + "grad_norm": 0.15952032804489136, + "learning_rate": 0.00039044943240433815, + "loss": 3.2426, + "step": 2480 + }, + { + "epoch": 0.14447974237347144, + "grad_norm": 0.1477235108613968, + "learning_rate": 0.0003903319041065863, + "loss": 3.2344, + "step": 2490 + }, + { + "epoch": 0.14505998230268216, + "grad_norm": 0.16264913976192474, + "learning_rate": 0.00039021367498682494, + "loss": 3.2407, + "step": 2500 + }, + { + "epoch": 0.14564022223189288, + "grad_norm": 0.16381001472473145, + "learning_rate": 0.000390094745480386, + "loss": 3.2408, + "step": 2510 + }, + { + "epoch": 0.14622046216110363, + "grad_norm": 0.1747412234544754, + "learning_rate": 0.00038997511602518044, + "loss": 3.2469, + "step": 2520 + }, + { + "epoch": 0.14680070209031434, + "grad_norm": 0.1646890491247177, + "learning_rate": 0.00038985478706169633, + "loss": 3.2368, + "step": 2530 + }, + { + "epoch": 0.14738094201952506, + "grad_norm": 0.18659403920173645, + "learning_rate": 0.00038973375903299766, + "loss": 3.229, + "step": 2540 + }, + { + "epoch": 0.1479611819487358, + "grad_norm": 0.15826112031936646, + "learning_rate": 0.0003896120323847222, + "loss": 3.2389, + "step": 2550 + }, + { + "epoch": 0.14854142187794653, + "grad_norm": 0.1658063381910324, + "learning_rate": 0.00038948960756508025, + "loss": 3.2326, + "step": 2560 + }, + { + "epoch": 0.14912166180715725, + "grad_norm": 0.18063782155513763, + "learning_rate": 0.0003893664850248529, + "loss": 3.2328, + "step": 2570 + }, + { + "epoch": 0.149701901736368, + "grad_norm": 0.16473062336444855, + "learning_rate": 0.0003892426652173901, + "loss": 3.2275, + "step": 2580 + }, + { + "epoch": 0.1502821416655787, + "grad_norm": 0.1553630381822586, + "learning_rate": 0.00038911814859860953, + "loss": 3.225, + "step": 2590 + }, + { + "epoch": 0.15086238159478946, + "grad_norm": 0.1639242172241211, + "learning_rate": 0.00038899293562699423, + "loss": 3.2222, + "step": 2600 + }, + { + "epoch": 0.15144262152400018, + "grad_norm": 0.16087086498737335, + "learning_rate": 0.00038886702676359166, + "loss": 3.2169, + "step": 2610 + }, + { + "epoch": 0.1520228614532109, + "grad_norm": 0.15263104438781738, + "learning_rate": 0.0003887404224720113, + "loss": 3.2239, + "step": 2620 + }, + { + "epoch": 0.15260310138242164, + "grad_norm": 0.16660773754119873, + "learning_rate": 0.0003886131232184235, + "loss": 3.2197, + "step": 2630 + }, + { + "epoch": 0.15318334131163236, + "grad_norm": 0.16617350280284882, + "learning_rate": 0.00038848512947155744, + "loss": 3.2181, + "step": 2640 + }, + { + "epoch": 0.15376358124084308, + "grad_norm": 0.1546907275915146, + "learning_rate": 0.00038835644170269945, + "loss": 3.2182, + "step": 2650 + }, + { + "epoch": 0.15434382117005382, + "grad_norm": 0.16329720616340637, + "learning_rate": 0.0003882270603856914, + "loss": 3.2224, + "step": 2660 + }, + { + "epoch": 0.15492406109926454, + "grad_norm": 0.1458432823419571, + "learning_rate": 0.00038809698599692884, + "loss": 3.217, + "step": 2670 + }, + { + "epoch": 0.15550430102847526, + "grad_norm": 0.17481467127799988, + "learning_rate": 0.00038796621901535935, + "loss": 3.2046, + "step": 2680 + }, + { + "epoch": 0.156084540957686, + "grad_norm": 0.15761947631835938, + "learning_rate": 0.00038783475992248067, + "loss": 3.2087, + "step": 2690 + }, + { + "epoch": 0.15666478088689673, + "grad_norm": 0.14478649199008942, + "learning_rate": 0.0003877026092023388, + "loss": 3.1956, + "step": 2700 + }, + { + "epoch": 0.15724502081610747, + "grad_norm": 0.16423922777175903, + "learning_rate": 0.00038756976734152673, + "loss": 3.21, + "step": 2710 + }, + { + "epoch": 0.1578252607453182, + "grad_norm": 0.16854038834571838, + "learning_rate": 0.000387436234829182, + "loss": 3.2031, + "step": 2720 + }, + { + "epoch": 0.1584055006745289, + "grad_norm": 0.1525936722755432, + "learning_rate": 0.00038730201215698534, + "loss": 3.2038, + "step": 2730 + }, + { + "epoch": 0.15898574060373966, + "grad_norm": 0.15923213958740234, + "learning_rate": 0.00038716709981915864, + "loss": 3.2074, + "step": 2740 + }, + { + "epoch": 0.15956598053295037, + "grad_norm": 0.1662934571504593, + "learning_rate": 0.0003870314983124633, + "loss": 3.1966, + "step": 2750 + }, + { + "epoch": 0.1601462204621611, + "grad_norm": 0.17807535827159882, + "learning_rate": 0.0003868952081361983, + "loss": 3.2076, + "step": 2760 + }, + { + "epoch": 0.16072646039137184, + "grad_norm": 0.1638825237751007, + "learning_rate": 0.0003867582297921983, + "loss": 3.2042, + "step": 2770 + }, + { + "epoch": 0.16130670032058256, + "grad_norm": 0.15023517608642578, + "learning_rate": 0.0003866205637848319, + "loss": 3.1912, + "step": 2780 + }, + { + "epoch": 0.16188694024979328, + "grad_norm": 0.17124484479427338, + "learning_rate": 0.00038648221062099987, + "loss": 3.2016, + "step": 2790 + }, + { + "epoch": 0.16246718017900402, + "grad_norm": 0.15234492719173431, + "learning_rate": 0.0003863431708101329, + "loss": 3.1892, + "step": 2800 + }, + { + "epoch": 0.16304742010821474, + "grad_norm": 0.16199001669883728, + "learning_rate": 0.0003862034448641902, + "loss": 3.1906, + "step": 2810 + }, + { + "epoch": 0.1636276600374255, + "grad_norm": 0.17630840837955475, + "learning_rate": 0.0003860630332976574, + "loss": 3.1921, + "step": 2820 + }, + { + "epoch": 0.1642078999666362, + "grad_norm": 0.16295549273490906, + "learning_rate": 0.0003859219366275445, + "loss": 3.1942, + "step": 2830 + }, + { + "epoch": 0.16478813989584692, + "grad_norm": 0.14281953871250153, + "learning_rate": 0.0003857801553733843, + "loss": 3.1869, + "step": 2840 + }, + { + "epoch": 0.16536837982505767, + "grad_norm": 0.16084730625152588, + "learning_rate": 0.00038563769005723025, + "loss": 3.1836, + "step": 2850 + }, + { + "epoch": 0.1659486197542684, + "grad_norm": 0.17616966366767883, + "learning_rate": 0.00038549454120365443, + "loss": 3.1909, + "step": 2860 + }, + { + "epoch": 0.1665288596834791, + "grad_norm": 0.16749773919582367, + "learning_rate": 0.00038535070933974603, + "loss": 3.1856, + "step": 2870 + }, + { + "epoch": 0.16710909961268985, + "grad_norm": 0.1418871134519577, + "learning_rate": 0.00038520619499510906, + "loss": 3.1804, + "step": 2880 + }, + { + "epoch": 0.16768933954190057, + "grad_norm": 0.1656585931777954, + "learning_rate": 0.00038506099870186036, + "loss": 3.1827, + "step": 2890 + }, + { + "epoch": 0.1682695794711113, + "grad_norm": 0.17027276754379272, + "learning_rate": 0.000384915120994628, + "loss": 3.1773, + "step": 2900 + }, + { + "epoch": 0.16884981940032204, + "grad_norm": 0.14748819172382355, + "learning_rate": 0.0003847685624105489, + "loss": 3.1852, + "step": 2910 + }, + { + "epoch": 0.16943005932953276, + "grad_norm": 0.14570088684558868, + "learning_rate": 0.00038462132348926725, + "loss": 3.1734, + "step": 2920 + }, + { + "epoch": 0.1700102992587435, + "grad_norm": 0.15523803234100342, + "learning_rate": 0.000384473404772932, + "loss": 3.1766, + "step": 2930 + }, + { + "epoch": 0.17059053918795422, + "grad_norm": 0.1739635467529297, + "learning_rate": 0.00038432480680619544, + "loss": 3.1826, + "step": 2940 + }, + { + "epoch": 0.17117077911716494, + "grad_norm": 0.171515554189682, + "learning_rate": 0.0003841755301362109, + "loss": 3.1771, + "step": 2950 + }, + { + "epoch": 0.17175101904637569, + "grad_norm": 0.15663164854049683, + "learning_rate": 0.00038402557531263073, + "loss": 3.1713, + "step": 2960 + }, + { + "epoch": 0.1723312589755864, + "grad_norm": 0.17138132452964783, + "learning_rate": 0.0003838749428876042, + "loss": 3.1771, + "step": 2970 + }, + { + "epoch": 0.17291149890479712, + "grad_norm": 0.1759568601846695, + "learning_rate": 0.0003837236334157757, + "loss": 3.1656, + "step": 2980 + }, + { + "epoch": 0.17349173883400787, + "grad_norm": 0.15324968099594116, + "learning_rate": 0.0003835716474542826, + "loss": 3.1619, + "step": 2990 + }, + { + "epoch": 0.1740719787632186, + "grad_norm": 0.1392621397972107, + "learning_rate": 0.00038341898556275316, + "loss": 3.1809, + "step": 3000 + }, + { + "epoch": 0.1740719787632186, + "eval_loss": 3.133876085281372, + "eval_runtime": 5.4039, + "eval_samples_per_second": 801.268, + "eval_steps_per_second": 1.665, + "step": 3000 + }, + { + "epoch": 0.1746522186924293, + "grad_norm": 0.1699080765247345, + "learning_rate": 0.00038326564830330436, + "loss": 3.161, + "step": 3010 + }, + { + "epoch": 0.17523245862164005, + "grad_norm": 0.15621140599250793, + "learning_rate": 0.0003831116362405401, + "loss": 3.1638, + "step": 3020 + }, + { + "epoch": 0.17581269855085077, + "grad_norm": 0.1653863489627838, + "learning_rate": 0.000382956949941549, + "loss": 3.166, + "step": 3030 + }, + { + "epoch": 0.17639293848006152, + "grad_norm": 0.16162124276161194, + "learning_rate": 0.0003828015899759021, + "loss": 3.1626, + "step": 3040 + }, + { + "epoch": 0.17697317840927224, + "grad_norm": 0.15676504373550415, + "learning_rate": 0.0003826455569156511, + "loss": 3.1747, + "step": 3050 + }, + { + "epoch": 0.17755341833848295, + "grad_norm": 0.15469352900981903, + "learning_rate": 0.00038248885133532613, + "loss": 3.1647, + "step": 3060 + }, + { + "epoch": 0.1781336582676937, + "grad_norm": 0.14125365018844604, + "learning_rate": 0.00038233147381193345, + "loss": 3.1577, + "step": 3070 + }, + { + "epoch": 0.17871389819690442, + "grad_norm": 0.1607305109500885, + "learning_rate": 0.00038217342492495376, + "loss": 3.155, + "step": 3080 + }, + { + "epoch": 0.17929413812611514, + "grad_norm": 0.15099143981933594, + "learning_rate": 0.0003820147052563394, + "loss": 3.156, + "step": 3090 + }, + { + "epoch": 0.17987437805532588, + "grad_norm": 0.1593606024980545, + "learning_rate": 0.00038185531539051303, + "loss": 3.1629, + "step": 3100 + }, + { + "epoch": 0.1804546179845366, + "grad_norm": 0.1826634258031845, + "learning_rate": 0.00038169525591436466, + "loss": 3.1433, + "step": 3110 + }, + { + "epoch": 0.18103485791374732, + "grad_norm": 0.15707820653915405, + "learning_rate": 0.00038153452741725017, + "loss": 3.1562, + "step": 3120 + }, + { + "epoch": 0.18161509784295807, + "grad_norm": 0.15769994258880615, + "learning_rate": 0.0003813731304909887, + "loss": 3.152, + "step": 3130 + }, + { + "epoch": 0.18219533777216879, + "grad_norm": 0.15145163238048553, + "learning_rate": 0.0003812110657298605, + "loss": 3.1527, + "step": 3140 + }, + { + "epoch": 0.18277557770137953, + "grad_norm": 0.15320977568626404, + "learning_rate": 0.0003810483337306052, + "loss": 3.1437, + "step": 3150 + }, + { + "epoch": 0.18335581763059025, + "grad_norm": 0.16322501003742218, + "learning_rate": 0.0003808849350924189, + "loss": 3.1564, + "step": 3160 + }, + { + "epoch": 0.18393605755980097, + "grad_norm": 0.14782309532165527, + "learning_rate": 0.0003807208704169527, + "loss": 3.1436, + "step": 3170 + }, + { + "epoch": 0.18451629748901172, + "grad_norm": 0.17319355905056, + "learning_rate": 0.0003805561403083097, + "loss": 3.1473, + "step": 3180 + }, + { + "epoch": 0.18509653741822243, + "grad_norm": 0.15451788902282715, + "learning_rate": 0.0003803907453730436, + "loss": 3.1395, + "step": 3190 + }, + { + "epoch": 0.18567677734743315, + "grad_norm": 0.17169536650180817, + "learning_rate": 0.00038022468622015576, + "loss": 3.1458, + "step": 3200 + }, + { + "epoch": 0.1862570172766439, + "grad_norm": 0.1508411318063736, + "learning_rate": 0.0003800579634610934, + "loss": 3.1425, + "step": 3210 + }, + { + "epoch": 0.18683725720585462, + "grad_norm": 0.17760975658893585, + "learning_rate": 0.00037989057770974725, + "loss": 3.144, + "step": 3220 + }, + { + "epoch": 0.18741749713506534, + "grad_norm": 0.15805186331272125, + "learning_rate": 0.0003797225295824491, + "loss": 3.1457, + "step": 3230 + }, + { + "epoch": 0.18799773706427608, + "grad_norm": 0.15923067927360535, + "learning_rate": 0.0003795538196979698, + "loss": 3.1366, + "step": 3240 + }, + { + "epoch": 0.1885779769934868, + "grad_norm": 0.16095119714736938, + "learning_rate": 0.00037938444867751677, + "loss": 3.1417, + "step": 3250 + }, + { + "epoch": 0.18915821692269755, + "grad_norm": 0.1521240770816803, + "learning_rate": 0.00037921441714473196, + "loss": 3.1393, + "step": 3260 + }, + { + "epoch": 0.18973845685190827, + "grad_norm": 0.16122375428676605, + "learning_rate": 0.0003790437257256892, + "loss": 3.1197, + "step": 3270 + }, + { + "epoch": 0.19031869678111898, + "grad_norm": 0.16436485946178436, + "learning_rate": 0.0003788723750488922, + "loss": 3.1349, + "step": 3280 + }, + { + "epoch": 0.19089893671032973, + "grad_norm": 0.15300863981246948, + "learning_rate": 0.00037870036574527206, + "loss": 3.1362, + "step": 3290 + }, + { + "epoch": 0.19147917663954045, + "grad_norm": 0.16245847940444946, + "learning_rate": 0.00037852769844818506, + "loss": 3.1323, + "step": 3300 + }, + { + "epoch": 0.19205941656875117, + "grad_norm": 0.14438757300376892, + "learning_rate": 0.00037835437379341036, + "loss": 3.1291, + "step": 3310 + }, + { + "epoch": 0.19263965649796191, + "grad_norm": 0.17143815755844116, + "learning_rate": 0.0003781803924191474, + "loss": 3.1266, + "step": 3320 + }, + { + "epoch": 0.19321989642717263, + "grad_norm": 0.15372861921787262, + "learning_rate": 0.0003780057549660139, + "loss": 3.141, + "step": 3330 + }, + { + "epoch": 0.19380013635638335, + "grad_norm": 0.15773558616638184, + "learning_rate": 0.0003778304620770432, + "loss": 3.1219, + "step": 3340 + }, + { + "epoch": 0.1943803762855941, + "grad_norm": 0.144402414560318, + "learning_rate": 0.0003776545143976821, + "loss": 3.129, + "step": 3350 + }, + { + "epoch": 0.19496061621480482, + "grad_norm": 0.14592647552490234, + "learning_rate": 0.00037747791257578846, + "loss": 3.117, + "step": 3360 + }, + { + "epoch": 0.19554085614401556, + "grad_norm": 0.146622434258461, + "learning_rate": 0.0003773006572616286, + "loss": 3.1317, + "step": 3370 + }, + { + "epoch": 0.19612109607322628, + "grad_norm": 0.14668260514736176, + "learning_rate": 0.00037712274910787515, + "loss": 3.1243, + "step": 3380 + }, + { + "epoch": 0.196701336002437, + "grad_norm": 0.166849285364151, + "learning_rate": 0.0003769441887696046, + "loss": 3.1273, + "step": 3390 + }, + { + "epoch": 0.19728157593164775, + "grad_norm": 0.16720564663410187, + "learning_rate": 0.0003767649769042948, + "loss": 3.118, + "step": 3400 + }, + { + "epoch": 0.19786181586085846, + "grad_norm": 0.17589610815048218, + "learning_rate": 0.00037658511417182263, + "loss": 3.1216, + "step": 3410 + }, + { + "epoch": 0.19844205579006918, + "grad_norm": 0.16391609609127045, + "learning_rate": 0.00037640460123446146, + "loss": 3.1152, + "step": 3420 + }, + { + "epoch": 0.19902229571927993, + "grad_norm": 0.14618930220603943, + "learning_rate": 0.000376223438756879, + "loss": 3.1129, + "step": 3430 + }, + { + "epoch": 0.19960253564849065, + "grad_norm": 0.1584542989730835, + "learning_rate": 0.0003760416274061343, + "loss": 3.1122, + "step": 3440 + }, + { + "epoch": 0.20018277557770137, + "grad_norm": 0.14501620829105377, + "learning_rate": 0.00037585916785167584, + "loss": 3.122, + "step": 3450 + }, + { + "epoch": 0.2007630155069121, + "grad_norm": 0.15992431342601776, + "learning_rate": 0.0003756760607653388, + "loss": 3.1164, + "step": 3460 + }, + { + "epoch": 0.20134325543612283, + "grad_norm": 0.15558482706546783, + "learning_rate": 0.0003754923068213428, + "loss": 3.1118, + "step": 3470 + }, + { + "epoch": 0.20192349536533358, + "grad_norm": 0.1663794219493866, + "learning_rate": 0.00037530790669628887, + "loss": 3.1149, + "step": 3480 + }, + { + "epoch": 0.2025037352945443, + "grad_norm": 0.1476125866174698, + "learning_rate": 0.0003751228610691578, + "loss": 3.1124, + "step": 3490 + }, + { + "epoch": 0.20308397522375501, + "grad_norm": 0.14965076744556427, + "learning_rate": 0.00037493717062130684, + "loss": 3.1167, + "step": 3500 + }, + { + "epoch": 0.20366421515296576, + "grad_norm": 0.17828093469142914, + "learning_rate": 0.0003747508360364677, + "loss": 3.1022, + "step": 3510 + }, + { + "epoch": 0.20424445508217648, + "grad_norm": 0.18014219403266907, + "learning_rate": 0.0003745638580007439, + "loss": 3.1002, + "step": 3520 + }, + { + "epoch": 0.2048246950113872, + "grad_norm": 0.1711190640926361, + "learning_rate": 0.0003743762372026081, + "loss": 3.1091, + "step": 3530 + }, + { + "epoch": 0.20540493494059794, + "grad_norm": 0.14033159613609314, + "learning_rate": 0.00037418797433289974, + "loss": 3.1023, + "step": 3540 + }, + { + "epoch": 0.20598517486980866, + "grad_norm": 0.15103302896022797, + "learning_rate": 0.00037399907008482246, + "loss": 3.0971, + "step": 3550 + }, + { + "epoch": 0.20656541479901938, + "grad_norm": 0.14676210284233093, + "learning_rate": 0.00037380952515394145, + "loss": 3.1013, + "step": 3560 + }, + { + "epoch": 0.20714565472823013, + "grad_norm": 0.15218688547611237, + "learning_rate": 0.000373619340238181, + "loss": 3.1072, + "step": 3570 + }, + { + "epoch": 0.20772589465744085, + "grad_norm": 0.16119298338890076, + "learning_rate": 0.00037342851603782193, + "loss": 3.1034, + "step": 3580 + }, + { + "epoch": 0.2083061345866516, + "grad_norm": 0.18104802072048187, + "learning_rate": 0.0003732370532554989, + "loss": 3.1047, + "step": 3590 + }, + { + "epoch": 0.2088863745158623, + "grad_norm": 0.16622230410575867, + "learning_rate": 0.00037304495259619794, + "loss": 3.0968, + "step": 3600 + }, + { + "epoch": 0.20946661444507303, + "grad_norm": 0.1655486822128296, + "learning_rate": 0.0003728522147672538, + "loss": 3.0997, + "step": 3610 + }, + { + "epoch": 0.21004685437428378, + "grad_norm": 0.14865244925022125, + "learning_rate": 0.0003726588404783474, + "loss": 3.1107, + "step": 3620 + }, + { + "epoch": 0.2106270943034945, + "grad_norm": 0.1397307813167572, + "learning_rate": 0.00037246483044150314, + "loss": 3.0949, + "step": 3630 + }, + { + "epoch": 0.2112073342327052, + "grad_norm": 0.15176993608474731, + "learning_rate": 0.0003722701853710862, + "loss": 3.0983, + "step": 3640 + }, + { + "epoch": 0.21178757416191596, + "grad_norm": 0.15583378076553345, + "learning_rate": 0.0003720749059838002, + "loss": 3.1001, + "step": 3650 + }, + { + "epoch": 0.21236781409112668, + "grad_norm": 0.14740754663944244, + "learning_rate": 0.0003718789929986843, + "loss": 3.0963, + "step": 3660 + }, + { + "epoch": 0.2129480540203374, + "grad_norm": 0.14856573939323425, + "learning_rate": 0.0003716824471371105, + "loss": 3.0926, + "step": 3670 + }, + { + "epoch": 0.21352829394954814, + "grad_norm": 0.1573585569858551, + "learning_rate": 0.0003714852691227814, + "loss": 3.0958, + "step": 3680 + }, + { + "epoch": 0.21410853387875886, + "grad_norm": 0.18037015199661255, + "learning_rate": 0.00037128745968172713, + "loss": 3.0936, + "step": 3690 + }, + { + "epoch": 0.2146887738079696, + "grad_norm": 0.15116359293460846, + "learning_rate": 0.00037108901954230263, + "loss": 3.0875, + "step": 3700 + }, + { + "epoch": 0.21526901373718033, + "grad_norm": 0.16249211132526398, + "learning_rate": 0.0003708899494351854, + "loss": 3.0927, + "step": 3710 + }, + { + "epoch": 0.21584925366639104, + "grad_norm": 0.1532929241657257, + "learning_rate": 0.00037069025009337246, + "loss": 3.0924, + "step": 3720 + }, + { + "epoch": 0.2164294935956018, + "grad_norm": 0.16931064426898956, + "learning_rate": 0.00037048992225217756, + "loss": 3.0961, + "step": 3730 + }, + { + "epoch": 0.2170097335248125, + "grad_norm": 0.15800388157367706, + "learning_rate": 0.0003702889666492289, + "loss": 3.0853, + "step": 3740 + }, + { + "epoch": 0.21758997345402323, + "grad_norm": 0.15049238502979279, + "learning_rate": 0.00037008738402446604, + "loss": 3.0863, + "step": 3750 + }, + { + "epoch": 0.21817021338323397, + "grad_norm": 0.1531439870595932, + "learning_rate": 0.0003698851751201373, + "loss": 3.0868, + "step": 3760 + }, + { + "epoch": 0.2187504533124447, + "grad_norm": 0.14501748979091644, + "learning_rate": 0.000369682340680797, + "loss": 3.0793, + "step": 3770 + }, + { + "epoch": 0.2193306932416554, + "grad_norm": 0.15761543810367584, + "learning_rate": 0.00036947888145330294, + "loss": 3.0912, + "step": 3780 + }, + { + "epoch": 0.21991093317086616, + "grad_norm": 0.15072381496429443, + "learning_rate": 0.00036927479818681325, + "loss": 3.0839, + "step": 3790 + }, + { + "epoch": 0.22049117310007688, + "grad_norm": 0.1547534018754959, + "learning_rate": 0.0003690700916327838, + "loss": 3.0793, + "step": 3800 + }, + { + "epoch": 0.22107141302928762, + "grad_norm": 0.16465094685554504, + "learning_rate": 0.0003688647625449657, + "loss": 3.0857, + "step": 3810 + }, + { + "epoch": 0.22165165295849834, + "grad_norm": 0.15939410030841827, + "learning_rate": 0.00036865881167940214, + "loss": 3.0748, + "step": 3820 + }, + { + "epoch": 0.22223189288770906, + "grad_norm": 0.1516985446214676, + "learning_rate": 0.00036845223979442565, + "loss": 3.0824, + "step": 3830 + }, + { + "epoch": 0.2228121328169198, + "grad_norm": 0.157265767455101, + "learning_rate": 0.00036824504765065573, + "loss": 3.0839, + "step": 3840 + }, + { + "epoch": 0.22339237274613052, + "grad_norm": 0.1470394879579544, + "learning_rate": 0.0003680372360109954, + "loss": 3.0831, + "step": 3850 + }, + { + "epoch": 0.22397261267534124, + "grad_norm": 0.16063229739665985, + "learning_rate": 0.000367828805640629, + "loss": 3.0752, + "step": 3860 + }, + { + "epoch": 0.224552852604552, + "grad_norm": 0.1397065967321396, + "learning_rate": 0.0003676197573070189, + "loss": 3.0689, + "step": 3870 + }, + { + "epoch": 0.2251330925337627, + "grad_norm": 0.16024722158908844, + "learning_rate": 0.0003674100917799031, + "loss": 3.0718, + "step": 3880 + }, + { + "epoch": 0.22571333246297343, + "grad_norm": 0.15182538330554962, + "learning_rate": 0.0003671998098312919, + "loss": 3.07, + "step": 3890 + }, + { + "epoch": 0.22629357239218417, + "grad_norm": 0.17511528730392456, + "learning_rate": 0.0003669889122354655, + "loss": 3.0768, + "step": 3900 + }, + { + "epoch": 0.2268738123213949, + "grad_norm": 0.1410207599401474, + "learning_rate": 0.00036677739976897095, + "loss": 3.0727, + "step": 3910 + }, + { + "epoch": 0.22745405225060564, + "grad_norm": 0.14334461092948914, + "learning_rate": 0.00036656527321061934, + "loss": 3.0677, + "step": 3920 + }, + { + "epoch": 0.22803429217981636, + "grad_norm": 0.16972528398036957, + "learning_rate": 0.0003663525333414828, + "loss": 3.0662, + "step": 3930 + }, + { + "epoch": 0.22861453210902707, + "grad_norm": 0.13627688586711884, + "learning_rate": 0.0003661391809448919, + "loss": 3.0743, + "step": 3940 + }, + { + "epoch": 0.22919477203823782, + "grad_norm": 0.14983052015304565, + "learning_rate": 0.0003659252168064325, + "loss": 3.0657, + "step": 3950 + }, + { + "epoch": 0.22977501196744854, + "grad_norm": 0.16689680516719818, + "learning_rate": 0.00036571064171394294, + "loss": 3.078, + "step": 3960 + }, + { + "epoch": 0.23035525189665926, + "grad_norm": 0.16488981246948242, + "learning_rate": 0.00036549545645751124, + "loss": 3.0733, + "step": 3970 + }, + { + "epoch": 0.23093549182587, + "grad_norm": 0.15926587581634521, + "learning_rate": 0.0003652796618294719, + "loss": 3.0602, + "step": 3980 + }, + { + "epoch": 0.23151573175508072, + "grad_norm": 0.16738209128379822, + "learning_rate": 0.0003650632586244036, + "loss": 3.0589, + "step": 3990 + }, + { + "epoch": 0.23209597168429144, + "grad_norm": 0.16227886080741882, + "learning_rate": 0.00036484624763912535, + "loss": 3.0677, + "step": 4000 + }, + { + "epoch": 0.23209597168429144, + "eval_loss": 3.035964250564575, + "eval_runtime": 5.3945, + "eval_samples_per_second": 802.668, + "eval_steps_per_second": 1.668, + "step": 4000 + }, + { + "epoch": 0.2326762116135022, + "grad_norm": 0.15232205390930176, + "learning_rate": 0.00036462862967269455, + "loss": 3.0652, + "step": 4010 + }, + { + "epoch": 0.2332564515427129, + "grad_norm": 0.16944393515586853, + "learning_rate": 0.0003644104055264032, + "loss": 3.0726, + "step": 4020 + }, + { + "epoch": 0.23383669147192365, + "grad_norm": 0.16069863736629486, + "learning_rate": 0.00036419157600377553, + "loss": 3.0671, + "step": 4030 + }, + { + "epoch": 0.23441693140113437, + "grad_norm": 0.13939155638217926, + "learning_rate": 0.00036397214191056474, + "loss": 3.0597, + "step": 4040 + }, + { + "epoch": 0.2349971713303451, + "grad_norm": 0.14409920573234558, + "learning_rate": 0.0003637521040547502, + "loss": 3.0608, + "step": 4050 + }, + { + "epoch": 0.23557741125955584, + "grad_norm": 0.144365593791008, + "learning_rate": 0.0003635314632465343, + "loss": 3.0722, + "step": 4060 + }, + { + "epoch": 0.23615765118876655, + "grad_norm": 0.15792323648929596, + "learning_rate": 0.00036331022029833967, + "loss": 3.056, + "step": 4070 + }, + { + "epoch": 0.23673789111797727, + "grad_norm": 0.1579774171113968, + "learning_rate": 0.00036308837602480593, + "loss": 3.0596, + "step": 4080 + }, + { + "epoch": 0.23731813104718802, + "grad_norm": 0.14511054754257202, + "learning_rate": 0.00036286593124278696, + "loss": 3.0552, + "step": 4090 + }, + { + "epoch": 0.23789837097639874, + "grad_norm": 0.14825165271759033, + "learning_rate": 0.0003626428867713478, + "loss": 3.0444, + "step": 4100 + }, + { + "epoch": 0.23847861090560946, + "grad_norm": 0.16554515063762665, + "learning_rate": 0.00036241924343176146, + "loss": 3.0519, + "step": 4110 + }, + { + "epoch": 0.2390588508348202, + "grad_norm": 0.13520917296409607, + "learning_rate": 0.00036219500204750626, + "loss": 3.0523, + "step": 4120 + }, + { + "epoch": 0.23963909076403092, + "grad_norm": 0.16214175522327423, + "learning_rate": 0.00036197016344426244, + "loss": 3.056, + "step": 4130 + }, + { + "epoch": 0.24021933069324167, + "grad_norm": 0.15843996405601501, + "learning_rate": 0.0003617447284499093, + "loss": 3.0582, + "step": 4140 + }, + { + "epoch": 0.24079957062245239, + "grad_norm": 0.1621207445859909, + "learning_rate": 0.000361518697894522, + "loss": 3.0547, + "step": 4150 + }, + { + "epoch": 0.2413798105516631, + "grad_norm": 0.16439932584762573, + "learning_rate": 0.0003612920726103688, + "loss": 3.0564, + "step": 4160 + }, + { + "epoch": 0.24196005048087385, + "grad_norm": 0.168155238032341, + "learning_rate": 0.00036106485343190785, + "loss": 3.0524, + "step": 4170 + }, + { + "epoch": 0.24254029041008457, + "grad_norm": 0.17214582860469818, + "learning_rate": 0.0003608370411957838, + "loss": 3.0549, + "step": 4180 + }, + { + "epoch": 0.2431205303392953, + "grad_norm": 0.14759258925914764, + "learning_rate": 0.00036060863674082516, + "loss": 3.0482, + "step": 4190 + }, + { + "epoch": 0.24370077026850603, + "grad_norm": 0.15004561841487885, + "learning_rate": 0.00036037964090804113, + "loss": 3.0485, + "step": 4200 + }, + { + "epoch": 0.24428101019771675, + "grad_norm": 0.15239353477954865, + "learning_rate": 0.0003601500545406184, + "loss": 3.0451, + "step": 4210 + }, + { + "epoch": 0.24486125012692747, + "grad_norm": 0.17232055962085724, + "learning_rate": 0.00035991987848391793, + "loss": 3.0633, + "step": 4220 + }, + { + "epoch": 0.24544149005613822, + "grad_norm": 0.15132233500480652, + "learning_rate": 0.0003596891135854722, + "loss": 3.0469, + "step": 4230 + }, + { + "epoch": 0.24602172998534894, + "grad_norm": 0.15383832156658173, + "learning_rate": 0.00035945776069498154, + "loss": 3.043, + "step": 4240 + }, + { + "epoch": 0.24660196991455968, + "grad_norm": 0.1648101806640625, + "learning_rate": 0.0003592258206643117, + "loss": 3.0494, + "step": 4250 + }, + { + "epoch": 0.2471822098437704, + "grad_norm": 0.14788667857646942, + "learning_rate": 0.0003589932943474901, + "loss": 3.051, + "step": 4260 + }, + { + "epoch": 0.24776244977298112, + "grad_norm": 0.15840385854244232, + "learning_rate": 0.00035876018260070307, + "loss": 3.04, + "step": 4270 + }, + { + "epoch": 0.24834268970219187, + "grad_norm": 0.1488913893699646, + "learning_rate": 0.0003585264862822924, + "loss": 3.0482, + "step": 4280 + }, + { + "epoch": 0.24892292963140258, + "grad_norm": 0.1538337618112564, + "learning_rate": 0.00035829220625275247, + "loss": 3.0438, + "step": 4290 + }, + { + "epoch": 0.2495031695606133, + "grad_norm": 0.15288054943084717, + "learning_rate": 0.00035805734337472677, + "loss": 3.047, + "step": 4300 + }, + { + "epoch": 0.25008340948982405, + "grad_norm": 0.14262276887893677, + "learning_rate": 0.0003578218985130052, + "loss": 3.0327, + "step": 4310 + }, + { + "epoch": 0.2506636494190348, + "grad_norm": 0.1732674241065979, + "learning_rate": 0.0003575858725345203, + "loss": 3.04, + "step": 4320 + }, + { + "epoch": 0.2512438893482455, + "grad_norm": 0.14097104966640472, + "learning_rate": 0.00035734926630834443, + "loss": 3.0479, + "step": 4330 + }, + { + "epoch": 0.25182412927745623, + "grad_norm": 0.1385360062122345, + "learning_rate": 0.0003571120807056866, + "loss": 3.0382, + "step": 4340 + }, + { + "epoch": 0.252404369206667, + "grad_norm": 0.14046671986579895, + "learning_rate": 0.0003568743165998889, + "loss": 3.0426, + "step": 4350 + }, + { + "epoch": 0.25298460913587767, + "grad_norm": 0.17887942492961884, + "learning_rate": 0.0003566359748664238, + "loss": 3.0435, + "step": 4360 + }, + { + "epoch": 0.2535648490650884, + "grad_norm": 0.14854948222637177, + "learning_rate": 0.00035639705638289054, + "loss": 3.0453, + "step": 4370 + }, + { + "epoch": 0.25414508899429916, + "grad_norm": 0.15000031888484955, + "learning_rate": 0.0003561575620290119, + "loss": 3.044, + "step": 4380 + }, + { + "epoch": 0.25472532892350985, + "grad_norm": 0.1506197601556778, + "learning_rate": 0.0003559174926866312, + "loss": 3.0411, + "step": 4390 + }, + { + "epoch": 0.2553055688527206, + "grad_norm": 0.1484203040599823, + "learning_rate": 0.000355676849239709, + "loss": 3.0383, + "step": 4400 + }, + { + "epoch": 0.25588580878193135, + "grad_norm": 0.15664157271385193, + "learning_rate": 0.00035543563257431967, + "loss": 3.03, + "step": 4410 + }, + { + "epoch": 0.25646604871114204, + "grad_norm": 0.1449301540851593, + "learning_rate": 0.00035519384357864814, + "loss": 3.0264, + "step": 4420 + }, + { + "epoch": 0.2570462886403528, + "grad_norm": 0.15928883850574493, + "learning_rate": 0.0003549514831429869, + "loss": 3.0256, + "step": 4430 + }, + { + "epoch": 0.25762652856956353, + "grad_norm": 0.15556131303310394, + "learning_rate": 0.0003547085521597324, + "loss": 3.042, + "step": 4440 + }, + { + "epoch": 0.2582067684987742, + "grad_norm": 0.15970775485038757, + "learning_rate": 0.00035446505152338196, + "loss": 3.0259, + "step": 4450 + }, + { + "epoch": 0.25878700842798497, + "grad_norm": 0.16376914083957672, + "learning_rate": 0.00035422098213053053, + "loss": 3.0263, + "step": 4460 + }, + { + "epoch": 0.2593672483571957, + "grad_norm": 0.15761475265026093, + "learning_rate": 0.00035397634487986716, + "loss": 3.0231, + "step": 4470 + }, + { + "epoch": 0.2599474882864064, + "grad_norm": 0.1443856954574585, + "learning_rate": 0.00035373114067217175, + "loss": 3.0262, + "step": 4480 + }, + { + "epoch": 0.26052772821561715, + "grad_norm": 0.16176897287368774, + "learning_rate": 0.000353485370410312, + "loss": 3.0209, + "step": 4490 + }, + { + "epoch": 0.2611079681448279, + "grad_norm": 0.15875308215618134, + "learning_rate": 0.00035323903499923966, + "loss": 3.0182, + "step": 4500 + }, + { + "epoch": 0.26168820807403864, + "grad_norm": 0.16142487525939941, + "learning_rate": 0.0003529921353459875, + "loss": 3.0303, + "step": 4510 + }, + { + "epoch": 0.26226844800324933, + "grad_norm": 0.1423715204000473, + "learning_rate": 0.00035274467235966604, + "loss": 3.027, + "step": 4520 + }, + { + "epoch": 0.2628486879324601, + "grad_norm": 0.1590990573167801, + "learning_rate": 0.0003524966469514598, + "loss": 3.0175, + "step": 4530 + }, + { + "epoch": 0.2634289278616708, + "grad_norm": 0.16158755123615265, + "learning_rate": 0.0003522480600346244, + "loss": 3.0201, + "step": 4540 + }, + { + "epoch": 0.2640091677908815, + "grad_norm": 0.14409056305885315, + "learning_rate": 0.00035199891252448286, + "loss": 3.0249, + "step": 4550 + }, + { + "epoch": 0.26458940772009226, + "grad_norm": 0.1601254642009735, + "learning_rate": 0.0003517492053384224, + "loss": 3.0259, + "step": 4560 + }, + { + "epoch": 0.265169647649303, + "grad_norm": 0.1478479653596878, + "learning_rate": 0.00035149893939589105, + "loss": 3.0304, + "step": 4570 + }, + { + "epoch": 0.2657498875785137, + "grad_norm": 0.15490871667861938, + "learning_rate": 0.0003512481156183943, + "loss": 3.0322, + "step": 4580 + }, + { + "epoch": 0.26633012750772445, + "grad_norm": 0.16943325102329254, + "learning_rate": 0.00035099673492949135, + "loss": 3.0206, + "step": 4590 + }, + { + "epoch": 0.2669103674369352, + "grad_norm": 0.14153891801834106, + "learning_rate": 0.00035074479825479256, + "loss": 3.0222, + "step": 4600 + }, + { + "epoch": 0.2674906073661459, + "grad_norm": 0.16155371069908142, + "learning_rate": 0.0003504923065219549, + "loss": 3.0235, + "step": 4610 + }, + { + "epoch": 0.26807084729535663, + "grad_norm": 0.16925618052482605, + "learning_rate": 0.0003502392606606795, + "loss": 3.023, + "step": 4620 + }, + { + "epoch": 0.2686510872245674, + "grad_norm": 0.1443454921245575, + "learning_rate": 0.00034998566160270765, + "loss": 3.0171, + "step": 4630 + }, + { + "epoch": 0.26923132715377807, + "grad_norm": 0.15553627908229828, + "learning_rate": 0.0003497315102818177, + "loss": 3.0249, + "step": 4640 + }, + { + "epoch": 0.2698115670829888, + "grad_norm": 0.15279638767242432, + "learning_rate": 0.00034947680763382146, + "loss": 3.0172, + "step": 4650 + }, + { + "epoch": 0.27039180701219956, + "grad_norm": 0.17935776710510254, + "learning_rate": 0.00034922155459656077, + "loss": 3.0227, + "step": 4660 + }, + { + "epoch": 0.27097204694141025, + "grad_norm": 0.15077771246433258, + "learning_rate": 0.000348965752109904, + "loss": 3.0012, + "step": 4670 + }, + { + "epoch": 0.271552286870621, + "grad_norm": 0.17899669706821442, + "learning_rate": 0.0003487094011157427, + "loss": 3.0186, + "step": 4680 + }, + { + "epoch": 0.27213252679983174, + "grad_norm": 0.1435423046350479, + "learning_rate": 0.00034845250255798813, + "loss": 3.0165, + "step": 4690 + }, + { + "epoch": 0.27271276672904243, + "grad_norm": 0.15444038808345795, + "learning_rate": 0.0003481950573825676, + "loss": 3.0207, + "step": 4700 + }, + { + "epoch": 0.2732930066582532, + "grad_norm": 0.14248330891132355, + "learning_rate": 0.0003479370665374213, + "loss": 3.0107, + "step": 4710 + }, + { + "epoch": 0.2738732465874639, + "grad_norm": 0.14584462344646454, + "learning_rate": 0.0003476785309724986, + "loss": 3.0094, + "step": 4720 + }, + { + "epoch": 0.27445348651667467, + "grad_norm": 0.1478513926267624, + "learning_rate": 0.0003474194516397544, + "loss": 3.0145, + "step": 4730 + }, + { + "epoch": 0.27503372644588536, + "grad_norm": 0.16704440116882324, + "learning_rate": 0.00034715982949314603, + "loss": 3.0119, + "step": 4740 + }, + { + "epoch": 0.2756139663750961, + "grad_norm": 0.1426423341035843, + "learning_rate": 0.0003468996654886294, + "loss": 3.0093, + "step": 4750 + }, + { + "epoch": 0.27619420630430686, + "grad_norm": 0.14460447430610657, + "learning_rate": 0.00034663896058415565, + "loss": 3.0113, + "step": 4760 + }, + { + "epoch": 0.27677444623351755, + "grad_norm": 0.1554524302482605, + "learning_rate": 0.0003463777157396676, + "loss": 3.0009, + "step": 4770 + }, + { + "epoch": 0.2773546861627283, + "grad_norm": 0.14673510193824768, + "learning_rate": 0.00034611593191709593, + "loss": 3.0107, + "step": 4780 + }, + { + "epoch": 0.27793492609193904, + "grad_norm": 0.15779437124729156, + "learning_rate": 0.0003458536100803564, + "loss": 3.0242, + "step": 4790 + }, + { + "epoch": 0.27851516602114973, + "grad_norm": 0.15662328898906708, + "learning_rate": 0.0003455907511953452, + "loss": 3.0155, + "step": 4800 + }, + { + "epoch": 0.2790954059503605, + "grad_norm": 0.1433698534965515, + "learning_rate": 0.00034532735622993643, + "loss": 3.0027, + "step": 4810 + }, + { + "epoch": 0.2796756458795712, + "grad_norm": 0.13105028867721558, + "learning_rate": 0.000345063426153978, + "loss": 3.0059, + "step": 4820 + }, + { + "epoch": 0.2802558858087819, + "grad_norm": 0.1466207504272461, + "learning_rate": 0.00034479896193928794, + "loss": 3.0132, + "step": 4830 + }, + { + "epoch": 0.28083612573799266, + "grad_norm": 0.1647462099790573, + "learning_rate": 0.00034453396455965134, + "loss": 3.0014, + "step": 4840 + }, + { + "epoch": 0.2814163656672034, + "grad_norm": 0.14212268590927124, + "learning_rate": 0.0003442684349908162, + "loss": 3.0069, + "step": 4850 + }, + { + "epoch": 0.2819966055964141, + "grad_norm": 0.15203924477100372, + "learning_rate": 0.00034400237421049033, + "loss": 3.0011, + "step": 4860 + }, + { + "epoch": 0.28257684552562484, + "grad_norm": 0.15818612277507782, + "learning_rate": 0.0003437357831983373, + "loss": 3.0014, + "step": 4870 + }, + { + "epoch": 0.2831570854548356, + "grad_norm": 0.15356150269508362, + "learning_rate": 0.0003434686629359732, + "loss": 3.0052, + "step": 4880 + }, + { + "epoch": 0.2837373253840463, + "grad_norm": 0.14168018102645874, + "learning_rate": 0.0003432010144069628, + "loss": 3.0062, + "step": 4890 + }, + { + "epoch": 0.284317565313257, + "grad_norm": 0.1359260231256485, + "learning_rate": 0.0003429328385968159, + "loss": 2.9985, + "step": 4900 + }, + { + "epoch": 0.28489780524246777, + "grad_norm": 0.1524299532175064, + "learning_rate": 0.00034266413649298414, + "loss": 3.0019, + "step": 4910 + }, + { + "epoch": 0.28547804517167846, + "grad_norm": 0.16266267001628876, + "learning_rate": 0.00034239490908485664, + "loss": 2.9963, + "step": 4920 + }, + { + "epoch": 0.2860582851008892, + "grad_norm": 0.18476419150829315, + "learning_rate": 0.00034212515736375704, + "loss": 3.0011, + "step": 4930 + }, + { + "epoch": 0.28663852503009996, + "grad_norm": 0.14814235270023346, + "learning_rate": 0.00034185488232293937, + "loss": 3.0012, + "step": 4940 + }, + { + "epoch": 0.2872187649593107, + "grad_norm": 0.13690084218978882, + "learning_rate": 0.00034158408495758467, + "loss": 3.0005, + "step": 4950 + }, + { + "epoch": 0.2877990048885214, + "grad_norm": 0.1411154866218567, + "learning_rate": 0.00034131276626479714, + "loss": 2.9906, + "step": 4960 + }, + { + "epoch": 0.28837924481773214, + "grad_norm": 0.1581835001707077, + "learning_rate": 0.0003410409272436008, + "loss": 3.0064, + "step": 4970 + }, + { + "epoch": 0.2889594847469429, + "grad_norm": 0.1560392826795578, + "learning_rate": 0.0003407685688949352, + "loss": 3.002, + "step": 4980 + }, + { + "epoch": 0.2895397246761536, + "grad_norm": 0.14799857139587402, + "learning_rate": 0.0003404956922216524, + "loss": 2.9964, + "step": 4990 + }, + { + "epoch": 0.2901199646053643, + "grad_norm": 0.14189326763153076, + "learning_rate": 0.000340222298228513, + "loss": 2.9938, + "step": 5000 + }, + { + "epoch": 0.2901199646053643, + "eval_loss": 2.9654901027679443, + "eval_runtime": 5.3907, + "eval_samples_per_second": 803.242, + "eval_steps_per_second": 1.67, + "step": 5000 + }, + { + "epoch": 0.29070020453457507, + "grad_norm": 0.14645496010780334, + "learning_rate": 0.00033994838792218213, + "loss": 2.9949, + "step": 5010 + }, + { + "epoch": 0.29128044446378576, + "grad_norm": 0.16313683986663818, + "learning_rate": 0.00033967396231122634, + "loss": 2.9859, + "step": 5020 + }, + { + "epoch": 0.2918606843929965, + "grad_norm": 0.14886920154094696, + "learning_rate": 0.00033939902240610946, + "loss": 2.993, + "step": 5030 + }, + { + "epoch": 0.29244092432220725, + "grad_norm": 0.15066947042942047, + "learning_rate": 0.0003391235692191891, + "loss": 2.995, + "step": 5040 + }, + { + "epoch": 0.29302116425141794, + "grad_norm": 0.14852523803710938, + "learning_rate": 0.0003388476037647125, + "loss": 2.9928, + "step": 5050 + }, + { + "epoch": 0.2936014041806287, + "grad_norm": 0.14773225784301758, + "learning_rate": 0.0003385711270588137, + "loss": 2.9887, + "step": 5060 + }, + { + "epoch": 0.29418164410983944, + "grad_norm": 0.1471126675605774, + "learning_rate": 0.0003382941401195087, + "loss": 2.9868, + "step": 5070 + }, + { + "epoch": 0.2947618840390501, + "grad_norm": 0.17213530838489532, + "learning_rate": 0.00033801664396669254, + "loss": 2.9993, + "step": 5080 + }, + { + "epoch": 0.2953421239682609, + "grad_norm": 0.17217926681041718, + "learning_rate": 0.00033773863962213496, + "loss": 2.9852, + "step": 5090 + }, + { + "epoch": 0.2959223638974716, + "grad_norm": 0.14535531401634216, + "learning_rate": 0.0003374601281094771, + "loss": 2.9892, + "step": 5100 + }, + { + "epoch": 0.2965026038266823, + "grad_norm": 0.15037214756011963, + "learning_rate": 0.0003371811104542277, + "loss": 2.9831, + "step": 5110 + }, + { + "epoch": 0.29708284375589306, + "grad_norm": 0.15636031329631805, + "learning_rate": 0.00033690158768375894, + "loss": 2.982, + "step": 5120 + }, + { + "epoch": 0.2976630836851038, + "grad_norm": 0.1443617343902588, + "learning_rate": 0.0003366215608273028, + "loss": 2.9893, + "step": 5130 + }, + { + "epoch": 0.2982433236143145, + "grad_norm": 0.15526960790157318, + "learning_rate": 0.0003363410309159477, + "loss": 2.9988, + "step": 5140 + }, + { + "epoch": 0.29882356354352524, + "grad_norm": 0.1502479910850525, + "learning_rate": 0.00033605999898263396, + "loss": 2.9802, + "step": 5150 + }, + { + "epoch": 0.299403803472736, + "grad_norm": 0.15904352068901062, + "learning_rate": 0.0003357784660621507, + "loss": 2.9866, + "step": 5160 + }, + { + "epoch": 0.29998404340194673, + "grad_norm": 0.1367734670639038, + "learning_rate": 0.00033549643319113163, + "loss": 2.992, + "step": 5170 + }, + { + "epoch": 0.3005642833311574, + "grad_norm": 0.1586538702249527, + "learning_rate": 0.00033521390140805134, + "loss": 2.9801, + "step": 5180 + }, + { + "epoch": 0.30114452326036817, + "grad_norm": 0.13850967586040497, + "learning_rate": 0.00033493087175322147, + "loss": 2.9865, + "step": 5190 + }, + { + "epoch": 0.3017247631895789, + "grad_norm": 0.15822730958461761, + "learning_rate": 0.00033464734526878674, + "loss": 2.9862, + "step": 5200 + }, + { + "epoch": 0.3023050031187896, + "grad_norm": 0.15228135883808136, + "learning_rate": 0.00033436332299872153, + "loss": 2.9831, + "step": 5210 + }, + { + "epoch": 0.30288524304800035, + "grad_norm": 0.15129725635051727, + "learning_rate": 0.00033407880598882545, + "loss": 2.9803, + "step": 5220 + }, + { + "epoch": 0.3034654829772111, + "grad_norm": 0.14200474321842194, + "learning_rate": 0.00033379379528672, + "loss": 2.9857, + "step": 5230 + }, + { + "epoch": 0.3040457229064218, + "grad_norm": 0.14871156215667725, + "learning_rate": 0.00033350829194184444, + "loss": 2.9838, + "step": 5240 + }, + { + "epoch": 0.30462596283563254, + "grad_norm": 0.157440185546875, + "learning_rate": 0.00033322229700545196, + "loss": 2.9818, + "step": 5250 + }, + { + "epoch": 0.3052062027648433, + "grad_norm": 0.13436777889728546, + "learning_rate": 0.000332935811530606, + "loss": 2.9785, + "step": 5260 + }, + { + "epoch": 0.305786442694054, + "grad_norm": 0.1572754830121994, + "learning_rate": 0.0003326488365721759, + "loss": 2.9882, + "step": 5270 + }, + { + "epoch": 0.3063666826232647, + "grad_norm": 0.13976529240608215, + "learning_rate": 0.00033236137318683363, + "loss": 2.9742, + "step": 5280 + }, + { + "epoch": 0.30694692255247547, + "grad_norm": 0.15709726512432098, + "learning_rate": 0.0003320734224330495, + "loss": 2.9786, + "step": 5290 + }, + { + "epoch": 0.30752716248168616, + "grad_norm": 0.16040396690368652, + "learning_rate": 0.00033178498537108833, + "loss": 2.9863, + "step": 5300 + }, + { + "epoch": 0.3081074024108969, + "grad_norm": 0.1624976396560669, + "learning_rate": 0.0003314960630630056, + "loss": 2.981, + "step": 5310 + }, + { + "epoch": 0.30868764234010765, + "grad_norm": 0.1401677429676056, + "learning_rate": 0.00033120665657264344, + "loss": 2.9798, + "step": 5320 + }, + { + "epoch": 0.30926788226931834, + "grad_norm": 0.15790094435214996, + "learning_rate": 0.00033091676696562697, + "loss": 2.97, + "step": 5330 + }, + { + "epoch": 0.3098481221985291, + "grad_norm": 0.13581514358520508, + "learning_rate": 0.0003306263953093601, + "loss": 2.9696, + "step": 5340 + }, + { + "epoch": 0.31042836212773983, + "grad_norm": 0.15074850618839264, + "learning_rate": 0.00033033554267302155, + "loss": 2.9666, + "step": 5350 + }, + { + "epoch": 0.3110086020569505, + "grad_norm": 0.1477966606616974, + "learning_rate": 0.0003300442101275614, + "loss": 2.976, + "step": 5360 + }, + { + "epoch": 0.31158884198616127, + "grad_norm": 0.16635243594646454, + "learning_rate": 0.00032975239874569645, + "loss": 2.9843, + "step": 5370 + }, + { + "epoch": 0.312169081915372, + "grad_norm": 0.1615062654018402, + "learning_rate": 0.00032946010960190677, + "loss": 2.9767, + "step": 5380 + }, + { + "epoch": 0.31274932184458276, + "grad_norm": 0.14296725392341614, + "learning_rate": 0.0003291673437724317, + "loss": 2.9669, + "step": 5390 + }, + { + "epoch": 0.31332956177379345, + "grad_norm": 0.14347875118255615, + "learning_rate": 0.0003288741023352656, + "loss": 2.9744, + "step": 5400 + }, + { + "epoch": 0.3139098017030042, + "grad_norm": 0.14779694378376007, + "learning_rate": 0.0003285803863701542, + "loss": 2.973, + "step": 5410 + }, + { + "epoch": 0.31449004163221495, + "grad_norm": 0.15234588086605072, + "learning_rate": 0.00032828619695859045, + "loss": 2.973, + "step": 5420 + }, + { + "epoch": 0.31507028156142564, + "grad_norm": 0.13710370659828186, + "learning_rate": 0.00032799153518381065, + "loss": 2.9752, + "step": 5430 + }, + { + "epoch": 0.3156505214906364, + "grad_norm": 0.16290375590324402, + "learning_rate": 0.00032769640213079024, + "loss": 2.9683, + "step": 5440 + }, + { + "epoch": 0.31623076141984713, + "grad_norm": 0.14308467507362366, + "learning_rate": 0.00032740079888624014, + "loss": 2.9705, + "step": 5450 + }, + { + "epoch": 0.3168110013490578, + "grad_norm": 0.14017410576343536, + "learning_rate": 0.0003271047265386022, + "loss": 2.9737, + "step": 5460 + }, + { + "epoch": 0.31739124127826857, + "grad_norm": 0.14751465618610382, + "learning_rate": 0.00032680818617804617, + "loss": 2.9683, + "step": 5470 + }, + { + "epoch": 0.3179714812074793, + "grad_norm": 0.15221184492111206, + "learning_rate": 0.0003265111788964645, + "loss": 2.9676, + "step": 5480 + }, + { + "epoch": 0.31855172113669, + "grad_norm": 0.15240968763828278, + "learning_rate": 0.00032621370578746916, + "loss": 2.9666, + "step": 5490 + }, + { + "epoch": 0.31913196106590075, + "grad_norm": 0.1354389488697052, + "learning_rate": 0.00032591576794638733, + "loss": 2.967, + "step": 5500 + }, + { + "epoch": 0.3197122009951115, + "grad_norm": 0.16345831751823425, + "learning_rate": 0.0003256173664702573, + "loss": 2.9655, + "step": 5510 + }, + { + "epoch": 0.3202924409243222, + "grad_norm": 0.14933979511260986, + "learning_rate": 0.00032531850245782465, + "loss": 2.9572, + "step": 5520 + }, + { + "epoch": 0.32087268085353293, + "grad_norm": 0.15477211773395538, + "learning_rate": 0.0003250191770095379, + "loss": 2.9679, + "step": 5530 + }, + { + "epoch": 0.3214529207827437, + "grad_norm": 0.1461821049451828, + "learning_rate": 0.0003247193912275448, + "loss": 2.9648, + "step": 5540 + }, + { + "epoch": 0.32203316071195437, + "grad_norm": 0.14192292094230652, + "learning_rate": 0.00032441914621568783, + "loss": 2.9637, + "step": 5550 + }, + { + "epoch": 0.3226134006411651, + "grad_norm": 0.15110647678375244, + "learning_rate": 0.00032411844307950074, + "loss": 2.9644, + "step": 5560 + }, + { + "epoch": 0.32319364057037586, + "grad_norm": 0.14680705964565277, + "learning_rate": 0.0003238172829262039, + "loss": 2.9645, + "step": 5570 + }, + { + "epoch": 0.32377388049958655, + "grad_norm": 0.14200454950332642, + "learning_rate": 0.00032351566686470064, + "loss": 2.9482, + "step": 5580 + }, + { + "epoch": 0.3243541204287973, + "grad_norm": 0.1601497381925583, + "learning_rate": 0.00032321359600557273, + "loss": 2.9551, + "step": 5590 + }, + { + "epoch": 0.32493436035800805, + "grad_norm": 0.14178360998630524, + "learning_rate": 0.00032291107146107686, + "loss": 2.9588, + "step": 5600 + }, + { + "epoch": 0.3255146002872188, + "grad_norm": 0.14087755978107452, + "learning_rate": 0.00032260809434514004, + "loss": 2.9599, + "step": 5610 + }, + { + "epoch": 0.3260948402164295, + "grad_norm": 0.14895635843276978, + "learning_rate": 0.00032230466577335575, + "loss": 2.9638, + "step": 5620 + }, + { + "epoch": 0.32667508014564023, + "grad_norm": 0.14719648659229279, + "learning_rate": 0.00032200078686297985, + "loss": 2.9506, + "step": 5630 + }, + { + "epoch": 0.327255320074851, + "grad_norm": 0.15351636707782745, + "learning_rate": 0.00032169645873292616, + "loss": 2.9642, + "step": 5640 + }, + { + "epoch": 0.32783556000406167, + "grad_norm": 0.14748744666576385, + "learning_rate": 0.0003213916825037629, + "loss": 2.9635, + "step": 5650 + }, + { + "epoch": 0.3284157999332724, + "grad_norm": 0.15472351014614105, + "learning_rate": 0.000321086459297708, + "loss": 2.9561, + "step": 5660 + }, + { + "epoch": 0.32899603986248316, + "grad_norm": 0.16757521033287048, + "learning_rate": 0.0003207807902386252, + "loss": 2.9488, + "step": 5670 + }, + { + "epoch": 0.32957627979169385, + "grad_norm": 0.13843347132205963, + "learning_rate": 0.00032047467645202017, + "loss": 2.9595, + "step": 5680 + }, + { + "epoch": 0.3301565197209046, + "grad_norm": 0.1418449878692627, + "learning_rate": 0.0003201681190650358, + "loss": 2.9732, + "step": 5690 + }, + { + "epoch": 0.33073675965011534, + "grad_norm": 0.14602024853229523, + "learning_rate": 0.00031986111920644854, + "loss": 2.9641, + "step": 5700 + }, + { + "epoch": 0.33131699957932603, + "grad_norm": 0.14498759806156158, + "learning_rate": 0.0003195536780066641, + "loss": 2.9616, + "step": 5710 + }, + { + "epoch": 0.3318972395085368, + "grad_norm": 0.1715255081653595, + "learning_rate": 0.0003192457965977131, + "loss": 2.9466, + "step": 5720 + }, + { + "epoch": 0.3324774794377475, + "grad_norm": 0.154770627617836, + "learning_rate": 0.0003189374761132472, + "loss": 2.9502, + "step": 5730 + }, + { + "epoch": 0.3330577193669582, + "grad_norm": 0.1444711685180664, + "learning_rate": 0.00031862871768853463, + "loss": 2.9541, + "step": 5740 + }, + { + "epoch": 0.33363795929616896, + "grad_norm": 0.1380070149898529, + "learning_rate": 0.0003183195224604563, + "loss": 2.9603, + "step": 5750 + }, + { + "epoch": 0.3342181992253797, + "grad_norm": 0.14697673916816711, + "learning_rate": 0.00031800989156750153, + "loss": 2.9534, + "step": 5760 + }, + { + "epoch": 0.3347984391545904, + "grad_norm": 0.16386087238788605, + "learning_rate": 0.00031769982614976357, + "loss": 2.9613, + "step": 5770 + }, + { + "epoch": 0.33537867908380115, + "grad_norm": 0.14437103271484375, + "learning_rate": 0.0003173893273489358, + "loss": 2.9544, + "step": 5780 + }, + { + "epoch": 0.3359589190130119, + "grad_norm": 0.14022108912467957, + "learning_rate": 0.00031707839630830734, + "loss": 2.9623, + "step": 5790 + }, + { + "epoch": 0.3365391589422226, + "grad_norm": 0.14068008959293365, + "learning_rate": 0.0003167670341727589, + "loss": 2.9499, + "step": 5800 + }, + { + "epoch": 0.33711939887143333, + "grad_norm": 0.14008371531963348, + "learning_rate": 0.00031645524208875843, + "loss": 2.9505, + "step": 5810 + }, + { + "epoch": 0.3376996388006441, + "grad_norm": 0.14702627062797546, + "learning_rate": 0.0003161430212043571, + "loss": 2.9456, + "step": 5820 + }, + { + "epoch": 0.3382798787298548, + "grad_norm": 0.13805069029331207, + "learning_rate": 0.0003158303726691848, + "loss": 2.9483, + "step": 5830 + }, + { + "epoch": 0.3388601186590655, + "grad_norm": 0.14876188337802887, + "learning_rate": 0.0003155172976344463, + "loss": 2.9478, + "step": 5840 + }, + { + "epoch": 0.33944035858827626, + "grad_norm": 0.15237033367156982, + "learning_rate": 0.0003152037972529167, + "loss": 2.9565, + "step": 5850 + }, + { + "epoch": 0.340020598517487, + "grad_norm": 0.14114312827587128, + "learning_rate": 0.0003148898726789371, + "loss": 2.9498, + "step": 5860 + }, + { + "epoch": 0.3406008384466977, + "grad_norm": 0.14159853756427765, + "learning_rate": 0.0003145755250684107, + "loss": 2.9576, + "step": 5870 + }, + { + "epoch": 0.34118107837590844, + "grad_norm": 0.143374502658844, + "learning_rate": 0.00031426075557879844, + "loss": 2.9532, + "step": 5880 + }, + { + "epoch": 0.3417613183051192, + "grad_norm": 0.14989744126796722, + "learning_rate": 0.0003139455653691146, + "loss": 2.9493, + "step": 5890 + }, + { + "epoch": 0.3423415582343299, + "grad_norm": 0.15993821620941162, + "learning_rate": 0.0003136299555999223, + "loss": 2.946, + "step": 5900 + }, + { + "epoch": 0.3429217981635406, + "grad_norm": 0.15719273686408997, + "learning_rate": 0.00031331392743333, + "loss": 2.9489, + "step": 5910 + }, + { + "epoch": 0.34350203809275137, + "grad_norm": 0.15121470391750336, + "learning_rate": 0.00031299748203298647, + "loss": 2.94, + "step": 5920 + }, + { + "epoch": 0.34408227802196206, + "grad_norm": 0.14775414764881134, + "learning_rate": 0.0003126806205640767, + "loss": 2.9317, + "step": 5930 + }, + { + "epoch": 0.3446625179511728, + "grad_norm": 0.158615842461586, + "learning_rate": 0.0003123633441933179, + "loss": 2.943, + "step": 5940 + }, + { + "epoch": 0.34524275788038356, + "grad_norm": 0.1479695737361908, + "learning_rate": 0.0003120456540889549, + "loss": 2.9435, + "step": 5950 + }, + { + "epoch": 0.34582299780959425, + "grad_norm": 0.13843965530395508, + "learning_rate": 0.00031172755142075604, + "loss": 2.9422, + "step": 5960 + }, + { + "epoch": 0.346403237738805, + "grad_norm": 0.1605447232723236, + "learning_rate": 0.00031140903736000855, + "loss": 2.9473, + "step": 5970 + }, + { + "epoch": 0.34698347766801574, + "grad_norm": 0.15243135392665863, + "learning_rate": 0.0003110901130795146, + "loss": 2.9397, + "step": 5980 + }, + { + "epoch": 0.34756371759722643, + "grad_norm": 0.14652499556541443, + "learning_rate": 0.00031077077975358677, + "loss": 2.9353, + "step": 5990 + }, + { + "epoch": 0.3481439575264372, + "grad_norm": 0.1344570368528366, + "learning_rate": 0.0003104510385580438, + "loss": 2.954, + "step": 6000 + }, + { + "epoch": 0.3481439575264372, + "eval_loss": 2.910837173461914, + "eval_runtime": 5.406, + "eval_samples_per_second": 800.965, + "eval_steps_per_second": 1.665, + "step": 6000 + }, + { + "epoch": 0.3487241974556479, + "grad_norm": 0.1439635157585144, + "learning_rate": 0.0003101308906702064, + "loss": 2.9376, + "step": 6010 + }, + { + "epoch": 0.3493044373848586, + "grad_norm": 0.14717575907707214, + "learning_rate": 0.00030981033726889255, + "loss": 2.946, + "step": 6020 + }, + { + "epoch": 0.34988467731406936, + "grad_norm": 0.1374998688697815, + "learning_rate": 0.00030948937953441337, + "loss": 2.9403, + "step": 6030 + }, + { + "epoch": 0.3504649172432801, + "grad_norm": 0.1412826031446457, + "learning_rate": 0.000309168018648569, + "loss": 2.9355, + "step": 6040 + }, + { + "epoch": 0.35104515717249085, + "grad_norm": 0.14428307116031647, + "learning_rate": 0.0003088462557946438, + "loss": 2.9445, + "step": 6050 + }, + { + "epoch": 0.35162539710170154, + "grad_norm": 0.1365126520395279, + "learning_rate": 0.00030852409215740233, + "loss": 2.9444, + "step": 6060 + }, + { + "epoch": 0.3522056370309123, + "grad_norm": 0.14436426758766174, + "learning_rate": 0.0003082015289230848, + "loss": 2.9283, + "step": 6070 + }, + { + "epoch": 0.35278587696012303, + "grad_norm": 0.14028888940811157, + "learning_rate": 0.000307878567279403, + "loss": 2.9419, + "step": 6080 + }, + { + "epoch": 0.3533661168893337, + "grad_norm": 0.15606454014778137, + "learning_rate": 0.0003075552084155354, + "loss": 2.9475, + "step": 6090 + }, + { + "epoch": 0.35394635681854447, + "grad_norm": 0.15795260667800903, + "learning_rate": 0.00030723145352212316, + "loss": 2.9387, + "step": 6100 + }, + { + "epoch": 0.3545265967477552, + "grad_norm": 0.14786306023597717, + "learning_rate": 0.0003069073037912658, + "loss": 2.9329, + "step": 6110 + }, + { + "epoch": 0.3551068366769659, + "grad_norm": 0.14278551936149597, + "learning_rate": 0.00030658276041651655, + "loss": 2.9327, + "step": 6120 + }, + { + "epoch": 0.35568707660617666, + "grad_norm": 0.15043289959430695, + "learning_rate": 0.0003062578245928782, + "loss": 2.9364, + "step": 6130 + }, + { + "epoch": 0.3562673165353874, + "grad_norm": 0.15945866703987122, + "learning_rate": 0.0003059324975167984, + "loss": 2.9343, + "step": 6140 + }, + { + "epoch": 0.3568475564645981, + "grad_norm": 0.14819258451461792, + "learning_rate": 0.00030560678038616545, + "loss": 2.9362, + "step": 6150 + }, + { + "epoch": 0.35742779639380884, + "grad_norm": 0.14421693980693817, + "learning_rate": 0.00030528067440030416, + "loss": 2.9339, + "step": 6160 + }, + { + "epoch": 0.3580080363230196, + "grad_norm": 0.1377636194229126, + "learning_rate": 0.00030495418075997076, + "loss": 2.9405, + "step": 6170 + }, + { + "epoch": 0.3585882762522303, + "grad_norm": 0.15939627587795258, + "learning_rate": 0.0003046273006673491, + "loss": 2.9381, + "step": 6180 + }, + { + "epoch": 0.359168516181441, + "grad_norm": 0.15114201605319977, + "learning_rate": 0.00030430003532604593, + "loss": 2.9364, + "step": 6190 + }, + { + "epoch": 0.35974875611065177, + "grad_norm": 0.1577521562576294, + "learning_rate": 0.0003039723859410865, + "loss": 2.9261, + "step": 6200 + }, + { + "epoch": 0.36032899603986246, + "grad_norm": 0.1542629450559616, + "learning_rate": 0.00030364435371891017, + "loss": 2.9255, + "step": 6210 + }, + { + "epoch": 0.3609092359690732, + "grad_norm": 0.1399134397506714, + "learning_rate": 0.000303315939867366, + "loss": 2.9322, + "step": 6220 + }, + { + "epoch": 0.36148947589828395, + "grad_norm": 0.1502622365951538, + "learning_rate": 0.0003029871455957081, + "loss": 2.9259, + "step": 6230 + }, + { + "epoch": 0.36206971582749464, + "grad_norm": 0.152298703789711, + "learning_rate": 0.00030265797211459137, + "loss": 2.9391, + "step": 6240 + }, + { + "epoch": 0.3626499557567054, + "grad_norm": 0.14043064415454865, + "learning_rate": 0.0003023284206360673, + "loss": 2.9278, + "step": 6250 + }, + { + "epoch": 0.36323019568591614, + "grad_norm": 0.1389763206243515, + "learning_rate": 0.0003019984923735787, + "loss": 2.9297, + "step": 6260 + }, + { + "epoch": 0.3638104356151269, + "grad_norm": 0.13958598673343658, + "learning_rate": 0.0003016681885419562, + "loss": 2.9278, + "step": 6270 + }, + { + "epoch": 0.36439067554433757, + "grad_norm": 0.1489957571029663, + "learning_rate": 0.00030133751035741295, + "loss": 2.9351, + "step": 6280 + }, + { + "epoch": 0.3649709154735483, + "grad_norm": 0.13442069292068481, + "learning_rate": 0.0003010064590375407, + "loss": 2.932, + "step": 6290 + }, + { + "epoch": 0.36555115540275906, + "grad_norm": 0.13820981979370117, + "learning_rate": 0.00030067503580130515, + "loss": 2.9297, + "step": 6300 + }, + { + "epoch": 0.36613139533196976, + "grad_norm": 0.1524563729763031, + "learning_rate": 0.00030034324186904135, + "loss": 2.9296, + "step": 6310 + }, + { + "epoch": 0.3667116352611805, + "grad_norm": 0.1481136530637741, + "learning_rate": 0.0003000110784624493, + "loss": 2.9181, + "step": 6320 + }, + { + "epoch": 0.36729187519039125, + "grad_norm": 0.1579747349023819, + "learning_rate": 0.00029967854680458945, + "loss": 2.9283, + "step": 6330 + }, + { + "epoch": 0.36787211511960194, + "grad_norm": 0.15123365819454193, + "learning_rate": 0.0002993456481198783, + "loss": 2.9284, + "step": 6340 + }, + { + "epoch": 0.3684523550488127, + "grad_norm": 0.14494162797927856, + "learning_rate": 0.00029901238363408357, + "loss": 2.9292, + "step": 6350 + }, + { + "epoch": 0.36903259497802343, + "grad_norm": 0.15171104669570923, + "learning_rate": 0.00029867875457431994, + "loss": 2.9232, + "step": 6360 + }, + { + "epoch": 0.3696128349072341, + "grad_norm": 0.13486771285533905, + "learning_rate": 0.0002983447621690447, + "loss": 2.9239, + "step": 6370 + }, + { + "epoch": 0.37019307483644487, + "grad_norm": 0.12901608645915985, + "learning_rate": 0.0002980104076480528, + "loss": 2.9247, + "step": 6380 + }, + { + "epoch": 0.3707733147656556, + "grad_norm": 0.14679357409477234, + "learning_rate": 0.00029767569224247267, + "loss": 2.9212, + "step": 6390 + }, + { + "epoch": 0.3713535546948663, + "grad_norm": 0.148614764213562, + "learning_rate": 0.0002973406171847615, + "loss": 2.9303, + "step": 6400 + }, + { + "epoch": 0.37193379462407705, + "grad_norm": 0.1406886726617813, + "learning_rate": 0.0002970051837087007, + "loss": 2.9228, + "step": 6410 + }, + { + "epoch": 0.3725140345532878, + "grad_norm": 0.1530819684267044, + "learning_rate": 0.00029666939304939143, + "loss": 2.9251, + "step": 6420 + }, + { + "epoch": 0.3730942744824985, + "grad_norm": 0.14401894807815552, + "learning_rate": 0.0002963332464432502, + "loss": 2.9211, + "step": 6430 + }, + { + "epoch": 0.37367451441170924, + "grad_norm": 0.14230068027973175, + "learning_rate": 0.000295996745128004, + "loss": 2.9283, + "step": 6440 + }, + { + "epoch": 0.37425475434092, + "grad_norm": 0.14690563082695007, + "learning_rate": 0.00029565989034268584, + "loss": 2.9243, + "step": 6450 + }, + { + "epoch": 0.3748349942701307, + "grad_norm": 0.15261906385421753, + "learning_rate": 0.0002953226833276304, + "loss": 2.9246, + "step": 6460 + }, + { + "epoch": 0.3754152341993414, + "grad_norm": 0.14708411693572998, + "learning_rate": 0.0002949851253244691, + "loss": 2.9203, + "step": 6470 + }, + { + "epoch": 0.37599547412855217, + "grad_norm": 0.13880537450313568, + "learning_rate": 0.0002946472175761261, + "loss": 2.9217, + "step": 6480 + }, + { + "epoch": 0.3765757140577629, + "grad_norm": 0.14952170848846436, + "learning_rate": 0.00029430896132681293, + "loss": 2.9137, + "step": 6490 + }, + { + "epoch": 0.3771559539869736, + "grad_norm": 0.14160752296447754, + "learning_rate": 0.0002939703578220246, + "loss": 2.9172, + "step": 6500 + }, + { + "epoch": 0.37773619391618435, + "grad_norm": 0.1431347280740738, + "learning_rate": 0.0002936314083085348, + "loss": 2.9132, + "step": 6510 + }, + { + "epoch": 0.3783164338453951, + "grad_norm": 0.1614401489496231, + "learning_rate": 0.0002932921140343909, + "loss": 2.9216, + "step": 6520 + }, + { + "epoch": 0.3788966737746058, + "grad_norm": 0.1511864811182022, + "learning_rate": 0.0002929524762489102, + "loss": 2.9178, + "step": 6530 + }, + { + "epoch": 0.37947691370381653, + "grad_norm": 0.15035977959632874, + "learning_rate": 0.0002926124962026744, + "loss": 2.9144, + "step": 6540 + }, + { + "epoch": 0.3800571536330273, + "grad_norm": 0.1370215266942978, + "learning_rate": 0.0002922721751475259, + "loss": 2.9157, + "step": 6550 + }, + { + "epoch": 0.38063739356223797, + "grad_norm": 0.1499612033367157, + "learning_rate": 0.00029193151433656227, + "loss": 2.915, + "step": 6560 + }, + { + "epoch": 0.3812176334914487, + "grad_norm": 0.15834592282772064, + "learning_rate": 0.00029159051502413233, + "loss": 2.9155, + "step": 6570 + }, + { + "epoch": 0.38179787342065946, + "grad_norm": 0.13816189765930176, + "learning_rate": 0.0002912491784658313, + "loss": 2.9242, + "step": 6580 + }, + { + "epoch": 0.38237811334987015, + "grad_norm": 0.16079404950141907, + "learning_rate": 0.00029090750591849614, + "loss": 2.9132, + "step": 6590 + }, + { + "epoch": 0.3829583532790809, + "grad_norm": 0.14459337294101715, + "learning_rate": 0.000290565498640201, + "loss": 2.9124, + "step": 6600 + }, + { + "epoch": 0.38353859320829164, + "grad_norm": 0.14939634501934052, + "learning_rate": 0.00029022315789025246, + "loss": 2.9052, + "step": 6610 + }, + { + "epoch": 0.38411883313750234, + "grad_norm": 0.1586650013923645, + "learning_rate": 0.0002898804849291851, + "loss": 2.914, + "step": 6620 + }, + { + "epoch": 0.3846990730667131, + "grad_norm": 0.14331689476966858, + "learning_rate": 0.0002895374810187565, + "loss": 2.9082, + "step": 6630 + }, + { + "epoch": 0.38527931299592383, + "grad_norm": 0.13959959149360657, + "learning_rate": 0.00028919414742194314, + "loss": 2.9198, + "step": 6640 + }, + { + "epoch": 0.3858595529251345, + "grad_norm": 0.13745518028736115, + "learning_rate": 0.00028885048540293524, + "loss": 2.9066, + "step": 6650 + }, + { + "epoch": 0.38643979285434527, + "grad_norm": 0.1419799029827118, + "learning_rate": 0.00028850649622713236, + "loss": 2.9121, + "step": 6660 + }, + { + "epoch": 0.387020032783556, + "grad_norm": 0.14384348690509796, + "learning_rate": 0.00028816218116113867, + "loss": 2.9162, + "step": 6670 + }, + { + "epoch": 0.3876002727127667, + "grad_norm": 0.15276901423931122, + "learning_rate": 0.0002878175414727583, + "loss": 2.9058, + "step": 6680 + }, + { + "epoch": 0.38818051264197745, + "grad_norm": 0.14799462258815765, + "learning_rate": 0.00028747257843099076, + "loss": 2.9062, + "step": 6690 + }, + { + "epoch": 0.3887607525711882, + "grad_norm": 0.14701081812381744, + "learning_rate": 0.000287127293306026, + "loss": 2.9033, + "step": 6700 + }, + { + "epoch": 0.38934099250039894, + "grad_norm": 0.14179976284503937, + "learning_rate": 0.0002867816873692401, + "loss": 2.915, + "step": 6710 + }, + { + "epoch": 0.38992123242960963, + "grad_norm": 0.15130779147148132, + "learning_rate": 0.0002864357618931902, + "loss": 2.9104, + "step": 6720 + }, + { + "epoch": 0.3905014723588204, + "grad_norm": 0.1438591182231903, + "learning_rate": 0.00028608951815161033, + "loss": 2.9072, + "step": 6730 + }, + { + "epoch": 0.3910817122880311, + "grad_norm": 0.13668999075889587, + "learning_rate": 0.0002857429574194061, + "loss": 2.914, + "step": 6740 + }, + { + "epoch": 0.3916619522172418, + "grad_norm": 0.14742673933506012, + "learning_rate": 0.00028539608097265056, + "loss": 2.911, + "step": 6750 + }, + { + "epoch": 0.39224219214645256, + "grad_norm": 0.1358884572982788, + "learning_rate": 0.0002850488900885789, + "loss": 2.9041, + "step": 6760 + }, + { + "epoch": 0.3928224320756633, + "grad_norm": 0.1417282521724701, + "learning_rate": 0.0002847013860455845, + "loss": 2.9038, + "step": 6770 + }, + { + "epoch": 0.393402672004874, + "grad_norm": 0.14336806535720825, + "learning_rate": 0.00028435357012321355, + "loss": 2.8981, + "step": 6780 + }, + { + "epoch": 0.39398291193408475, + "grad_norm": 0.1505221426486969, + "learning_rate": 0.00028400544360216074, + "loss": 2.9099, + "step": 6790 + }, + { + "epoch": 0.3945631518632955, + "grad_norm": 0.13609366118907928, + "learning_rate": 0.0002836570077642644, + "loss": 2.91, + "step": 6800 + }, + { + "epoch": 0.3951433917925062, + "grad_norm": 0.14644168317317963, + "learning_rate": 0.00028330826389250195, + "loss": 2.9037, + "step": 6810 + }, + { + "epoch": 0.39572363172171693, + "grad_norm": 0.13570183515548706, + "learning_rate": 0.0002829592132709846, + "loss": 2.9081, + "step": 6820 + }, + { + "epoch": 0.3963038716509277, + "grad_norm": 0.13495145738124847, + "learning_rate": 0.0002826098571849534, + "loss": 2.9046, + "step": 6830 + }, + { + "epoch": 0.39688411158013837, + "grad_norm": 0.155324786901474, + "learning_rate": 0.00028226019692077406, + "loss": 2.8966, + "step": 6840 + }, + { + "epoch": 0.3974643515093491, + "grad_norm": 0.15253056585788727, + "learning_rate": 0.0002819102337659323, + "loss": 2.8914, + "step": 6850 + }, + { + "epoch": 0.39804459143855986, + "grad_norm": 0.1406160295009613, + "learning_rate": 0.0002815599690090291, + "loss": 2.9042, + "step": 6860 + }, + { + "epoch": 0.39862483136777055, + "grad_norm": 0.152736097574234, + "learning_rate": 0.00028120940393977614, + "loss": 2.9106, + "step": 6870 + }, + { + "epoch": 0.3992050712969813, + "grad_norm": 0.13760948181152344, + "learning_rate": 0.00028085853984899053, + "loss": 2.9015, + "step": 6880 + }, + { + "epoch": 0.39978531122619204, + "grad_norm": 0.14538171887397766, + "learning_rate": 0.0002805073780285906, + "loss": 2.9018, + "step": 6890 + }, + { + "epoch": 0.40036555115540273, + "grad_norm": 0.14871671795845032, + "learning_rate": 0.0002801559197715911, + "loss": 2.9011, + "step": 6900 + }, + { + "epoch": 0.4009457910846135, + "grad_norm": 0.1533951461315155, + "learning_rate": 0.0002798041663720981, + "loss": 2.8993, + "step": 6910 + }, + { + "epoch": 0.4015260310138242, + "grad_norm": 0.14396968483924866, + "learning_rate": 0.0002794521191253045, + "loss": 2.8938, + "step": 6920 + }, + { + "epoch": 0.40210627094303497, + "grad_norm": 0.16459780931472778, + "learning_rate": 0.000279099779327485, + "loss": 2.8966, + "step": 6930 + }, + { + "epoch": 0.40268651087224566, + "grad_norm": 0.14155222475528717, + "learning_rate": 0.0002787471482759918, + "loss": 2.8944, + "step": 6940 + }, + { + "epoch": 0.4032667508014564, + "grad_norm": 0.14908990263938904, + "learning_rate": 0.0002783942272692493, + "loss": 2.9055, + "step": 6950 + }, + { + "epoch": 0.40384699073066715, + "grad_norm": 0.14655596017837524, + "learning_rate": 0.0002780410176067496, + "loss": 2.8998, + "step": 6960 + }, + { + "epoch": 0.40442723065987785, + "grad_norm": 0.1373496949672699, + "learning_rate": 0.00027768752058904777, + "loss": 2.9049, + "step": 6970 + }, + { + "epoch": 0.4050074705890886, + "grad_norm": 0.14998483657836914, + "learning_rate": 0.0002773337375177568, + "loss": 2.892, + "step": 6980 + }, + { + "epoch": 0.40558771051829934, + "grad_norm": 0.14310023188591003, + "learning_rate": 0.00027697966969554295, + "loss": 2.8971, + "step": 6990 + }, + { + "epoch": 0.40616795044751003, + "grad_norm": 0.15001101791858673, + "learning_rate": 0.00027662531842612115, + "loss": 2.8931, + "step": 7000 + }, + { + "epoch": 0.40616795044751003, + "eval_loss": 2.8682682514190674, + "eval_runtime": 5.3865, + "eval_samples_per_second": 803.854, + "eval_steps_per_second": 1.671, + "step": 7000 + }, + { + "epoch": 0.4067481903767208, + "grad_norm": 0.14453168213367462, + "learning_rate": 0.0002762706850142498, + "loss": 2.8964, + "step": 7010 + }, + { + "epoch": 0.4073284303059315, + "grad_norm": 0.15523375570774078, + "learning_rate": 0.0002759157707657264, + "loss": 2.8987, + "step": 7020 + }, + { + "epoch": 0.4079086702351422, + "grad_norm": 0.14011220633983612, + "learning_rate": 0.0002755605769873823, + "loss": 2.9069, + "step": 7030 + }, + { + "epoch": 0.40848891016435296, + "grad_norm": 0.1405903846025467, + "learning_rate": 0.0002752051049870782, + "loss": 2.8941, + "step": 7040 + }, + { + "epoch": 0.4090691500935637, + "grad_norm": 0.13568729162216187, + "learning_rate": 0.00027484935607369925, + "loss": 2.8864, + "step": 7050 + }, + { + "epoch": 0.4096493900227744, + "grad_norm": 0.13538648188114166, + "learning_rate": 0.00027449333155715023, + "loss": 2.9006, + "step": 7060 + }, + { + "epoch": 0.41022962995198514, + "grad_norm": 0.15839791297912598, + "learning_rate": 0.00027413703274835067, + "loss": 2.8905, + "step": 7070 + }, + { + "epoch": 0.4108098698811959, + "grad_norm": 0.14201544225215912, + "learning_rate": 0.0002737804609592302, + "loss": 2.9017, + "step": 7080 + }, + { + "epoch": 0.4113901098104066, + "grad_norm": 0.135043665766716, + "learning_rate": 0.0002734236175027234, + "loss": 2.8998, + "step": 7090 + }, + { + "epoch": 0.4119703497396173, + "grad_norm": 0.13998910784721375, + "learning_rate": 0.00027306650369276526, + "loss": 2.8953, + "step": 7100 + }, + { + "epoch": 0.41255058966882807, + "grad_norm": 0.13548001646995544, + "learning_rate": 0.0002727091208442864, + "loss": 2.89, + "step": 7110 + }, + { + "epoch": 0.41313082959803876, + "grad_norm": 0.14130084216594696, + "learning_rate": 0.0002723514702732077, + "loss": 2.8918, + "step": 7120 + }, + { + "epoch": 0.4137110695272495, + "grad_norm": 0.13718628883361816, + "learning_rate": 0.0002719935532964361, + "loss": 2.8879, + "step": 7130 + }, + { + "epoch": 0.41429130945646025, + "grad_norm": 0.1469535529613495, + "learning_rate": 0.00027163537123185943, + "loss": 2.8788, + "step": 7140 + }, + { + "epoch": 0.414871549385671, + "grad_norm": 0.13813258707523346, + "learning_rate": 0.0002712769253983416, + "loss": 2.8904, + "step": 7150 + }, + { + "epoch": 0.4154517893148817, + "grad_norm": 0.14960864186286926, + "learning_rate": 0.0002709182171157176, + "loss": 2.8982, + "step": 7160 + }, + { + "epoch": 0.41603202924409244, + "grad_norm": 0.13153991103172302, + "learning_rate": 0.00027055924770478905, + "loss": 2.8803, + "step": 7170 + }, + { + "epoch": 0.4166122691733032, + "grad_norm": 0.15670733153820038, + "learning_rate": 0.0002702000184873189, + "loss": 2.8961, + "step": 7180 + }, + { + "epoch": 0.4171925091025139, + "grad_norm": 0.14150060713291168, + "learning_rate": 0.00026984053078602665, + "loss": 2.8943, + "step": 7190 + }, + { + "epoch": 0.4177727490317246, + "grad_norm": 0.13610132038593292, + "learning_rate": 0.0002694807859245837, + "loss": 2.8932, + "step": 7200 + }, + { + "epoch": 0.41835298896093537, + "grad_norm": 0.13310842216014862, + "learning_rate": 0.0002691207852276084, + "loss": 2.8876, + "step": 7210 + }, + { + "epoch": 0.41893322889014606, + "grad_norm": 0.135100319981575, + "learning_rate": 0.00026876053002066104, + "loss": 2.894, + "step": 7220 + }, + { + "epoch": 0.4195134688193568, + "grad_norm": 0.13600456714630127, + "learning_rate": 0.00026840002163023896, + "loss": 2.8777, + "step": 7230 + }, + { + "epoch": 0.42009370874856755, + "grad_norm": 0.1357976198196411, + "learning_rate": 0.00026803926138377186, + "loss": 2.8869, + "step": 7240 + }, + { + "epoch": 0.42067394867777824, + "grad_norm": 0.13665033876895905, + "learning_rate": 0.00026767825060961664, + "loss": 2.8824, + "step": 7250 + }, + { + "epoch": 0.421254188606989, + "grad_norm": 0.14291678369045258, + "learning_rate": 0.00026731699063705294, + "loss": 2.8981, + "step": 7260 + }, + { + "epoch": 0.42183442853619973, + "grad_norm": 0.14100785553455353, + "learning_rate": 0.0002669554827962778, + "loss": 2.8894, + "step": 7270 + }, + { + "epoch": 0.4224146684654104, + "grad_norm": 0.1521817296743393, + "learning_rate": 0.000266593728418401, + "loss": 2.8745, + "step": 7280 + }, + { + "epoch": 0.42299490839462117, + "grad_norm": 0.16641181707382202, + "learning_rate": 0.0002662317288354399, + "loss": 2.8901, + "step": 7290 + }, + { + "epoch": 0.4235751483238319, + "grad_norm": 0.14584742486476898, + "learning_rate": 0.000265869485380315, + "loss": 2.8804, + "step": 7300 + }, + { + "epoch": 0.4241553882530426, + "grad_norm": 0.14091430604457855, + "learning_rate": 0.00026550699938684454, + "loss": 2.8814, + "step": 7310 + }, + { + "epoch": 0.42473562818225336, + "grad_norm": 0.13544070720672607, + "learning_rate": 0.0002651442721897401, + "loss": 2.8865, + "step": 7320 + }, + { + "epoch": 0.4253158681114641, + "grad_norm": 0.13583482801914215, + "learning_rate": 0.0002647813051246011, + "loss": 2.8687, + "step": 7330 + }, + { + "epoch": 0.4258961080406748, + "grad_norm": 0.15594419836997986, + "learning_rate": 0.0002644180995279103, + "loss": 2.8812, + "step": 7340 + }, + { + "epoch": 0.42647634796988554, + "grad_norm": 0.1415625512599945, + "learning_rate": 0.0002640546567370288, + "loss": 2.8922, + "step": 7350 + }, + { + "epoch": 0.4270565878990963, + "grad_norm": 0.13848547637462616, + "learning_rate": 0.000263690978090191, + "loss": 2.8816, + "step": 7360 + }, + { + "epoch": 0.42763682782830703, + "grad_norm": 0.1387799084186554, + "learning_rate": 0.00026332706492649977, + "loss": 2.8866, + "step": 7370 + }, + { + "epoch": 0.4282170677575177, + "grad_norm": 0.13819080591201782, + "learning_rate": 0.0002629629185859215, + "loss": 2.8838, + "step": 7380 + }, + { + "epoch": 0.42879730768672847, + "grad_norm": 0.14040718972682953, + "learning_rate": 0.00026259854040928124, + "loss": 2.8766, + "step": 7390 + }, + { + "epoch": 0.4293775476159392, + "grad_norm": 0.14268594980239868, + "learning_rate": 0.0002622339317382575, + "loss": 2.8778, + "step": 7400 + }, + { + "epoch": 0.4299577875451499, + "grad_norm": 0.13744668662548065, + "learning_rate": 0.00026186909391537767, + "loss": 2.8843, + "step": 7410 + }, + { + "epoch": 0.43053802747436065, + "grad_norm": 0.1380259245634079, + "learning_rate": 0.0002615040282840128, + "loss": 2.8819, + "step": 7420 + }, + { + "epoch": 0.4311182674035714, + "grad_norm": 0.15185829997062683, + "learning_rate": 0.00026113873618837275, + "loss": 2.8734, + "step": 7430 + }, + { + "epoch": 0.4316985073327821, + "grad_norm": 0.1312202364206314, + "learning_rate": 0.00026077321897350134, + "loss": 2.8769, + "step": 7440 + }, + { + "epoch": 0.43227874726199284, + "grad_norm": 0.15877887606620789, + "learning_rate": 0.0002604074779852713, + "loss": 2.8784, + "step": 7450 + }, + { + "epoch": 0.4328589871912036, + "grad_norm": 0.15031276643276215, + "learning_rate": 0.0002600415145703791, + "loss": 2.8942, + "step": 7460 + }, + { + "epoch": 0.43343922712041427, + "grad_norm": 0.14033295214176178, + "learning_rate": 0.00025967533007634056, + "loss": 2.8734, + "step": 7470 + }, + { + "epoch": 0.434019467049625, + "grad_norm": 0.13741450011730194, + "learning_rate": 0.00025930892585148525, + "loss": 2.8772, + "step": 7480 + }, + { + "epoch": 0.43459970697883576, + "grad_norm": 0.13238540291786194, + "learning_rate": 0.0002589423032449519, + "loss": 2.8872, + "step": 7490 + }, + { + "epoch": 0.43517994690804646, + "grad_norm": 0.15299426019191742, + "learning_rate": 0.0002585754636066833, + "loss": 2.8773, + "step": 7500 + }, + { + "epoch": 0.4357601868372572, + "grad_norm": 0.1423303186893463, + "learning_rate": 0.00025820840828742156, + "loss": 2.8814, + "step": 7510 + }, + { + "epoch": 0.43634042676646795, + "grad_norm": 0.135323166847229, + "learning_rate": 0.0002578411386387027, + "loss": 2.868, + "step": 7520 + }, + { + "epoch": 0.43692066669567864, + "grad_norm": 0.14203716814517975, + "learning_rate": 0.00025747365601285215, + "loss": 2.8856, + "step": 7530 + }, + { + "epoch": 0.4375009066248894, + "grad_norm": 0.14312061667442322, + "learning_rate": 0.00025710596176297936, + "loss": 2.8793, + "step": 7540 + }, + { + "epoch": 0.43808114655410013, + "grad_norm": 0.13628032803535461, + "learning_rate": 0.0002567380572429731, + "loss": 2.8727, + "step": 7550 + }, + { + "epoch": 0.4386613864833108, + "grad_norm": 0.13479942083358765, + "learning_rate": 0.00025636994380749635, + "loss": 2.8826, + "step": 7560 + }, + { + "epoch": 0.43924162641252157, + "grad_norm": 0.13710373640060425, + "learning_rate": 0.0002560016228119814, + "loss": 2.8741, + "step": 7570 + }, + { + "epoch": 0.4398218663417323, + "grad_norm": 0.14385974407196045, + "learning_rate": 0.0002556330956126246, + "loss": 2.8721, + "step": 7580 + }, + { + "epoch": 0.44040210627094306, + "grad_norm": 0.13590388000011444, + "learning_rate": 0.0002552643635663818, + "loss": 2.8751, + "step": 7590 + }, + { + "epoch": 0.44098234620015375, + "grad_norm": 0.14734290540218353, + "learning_rate": 0.000254895428030963, + "loss": 2.8657, + "step": 7600 + }, + { + "epoch": 0.4415625861293645, + "grad_norm": 0.14358721673488617, + "learning_rate": 0.00025452629036482754, + "loss": 2.8658, + "step": 7610 + }, + { + "epoch": 0.44214282605857524, + "grad_norm": 0.1496707648038864, + "learning_rate": 0.00025415695192717886, + "loss": 2.8647, + "step": 7620 + }, + { + "epoch": 0.44272306598778594, + "grad_norm": 0.14276018738746643, + "learning_rate": 0.0002537874140779599, + "loss": 2.8746, + "step": 7630 + }, + { + "epoch": 0.4433033059169967, + "grad_norm": 0.14037010073661804, + "learning_rate": 0.0002534176781778477, + "loss": 2.8721, + "step": 7640 + }, + { + "epoch": 0.44388354584620743, + "grad_norm": 0.13529075682163239, + "learning_rate": 0.00025304774558824854, + "loss": 2.8778, + "step": 7650 + }, + { + "epoch": 0.4444637857754181, + "grad_norm": 0.14173325896263123, + "learning_rate": 0.000252677617671293, + "loss": 2.8699, + "step": 7660 + }, + { + "epoch": 0.44504402570462887, + "grad_norm": 0.1350284367799759, + "learning_rate": 0.0002523072957898308, + "loss": 2.874, + "step": 7670 + }, + { + "epoch": 0.4456242656338396, + "grad_norm": 0.13940376043319702, + "learning_rate": 0.00025193678130742595, + "loss": 2.8709, + "step": 7680 + }, + { + "epoch": 0.4462045055630503, + "grad_norm": 0.13375508785247803, + "learning_rate": 0.00025156607558835155, + "loss": 2.8789, + "step": 7690 + }, + { + "epoch": 0.44678474549226105, + "grad_norm": 0.13682647049427032, + "learning_rate": 0.000251195179997585, + "loss": 2.8637, + "step": 7700 + }, + { + "epoch": 0.4473649854214718, + "grad_norm": 0.15215632319450378, + "learning_rate": 0.00025082409590080257, + "loss": 2.8656, + "step": 7710 + }, + { + "epoch": 0.4479452253506825, + "grad_norm": 0.13502159714698792, + "learning_rate": 0.0002504528246643749, + "loss": 2.8723, + "step": 7720 + }, + { + "epoch": 0.44852546527989323, + "grad_norm": 0.1432695984840393, + "learning_rate": 0.00025008136765536143, + "loss": 2.8769, + "step": 7730 + }, + { + "epoch": 0.449105705209104, + "grad_norm": 0.1322629153728485, + "learning_rate": 0.0002497097262415058, + "loss": 2.8718, + "step": 7740 + }, + { + "epoch": 0.44968594513831467, + "grad_norm": 0.14167630672454834, + "learning_rate": 0.00024933790179123086, + "loss": 2.876, + "step": 7750 + }, + { + "epoch": 0.4502661850675254, + "grad_norm": 0.1397087574005127, + "learning_rate": 0.000248965895673633, + "loss": 2.859, + "step": 7760 + }, + { + "epoch": 0.45084642499673616, + "grad_norm": 0.1338258534669876, + "learning_rate": 0.00024859370925847766, + "loss": 2.8832, + "step": 7770 + }, + { + "epoch": 0.45142666492594685, + "grad_norm": 0.13774670660495758, + "learning_rate": 0.0002482213439161943, + "loss": 2.8647, + "step": 7780 + }, + { + "epoch": 0.4520069048551576, + "grad_norm": 0.13594569265842438, + "learning_rate": 0.0002478488010178711, + "loss": 2.8695, + "step": 7790 + }, + { + "epoch": 0.45258714478436834, + "grad_norm": 0.13634726405143738, + "learning_rate": 0.0002474760819352501, + "loss": 2.8559, + "step": 7800 + }, + { + "epoch": 0.4531673847135791, + "grad_norm": 0.15008622407913208, + "learning_rate": 0.0002471031880407219, + "loss": 2.8595, + "step": 7810 + }, + { + "epoch": 0.4537476246427898, + "grad_norm": 0.132884681224823, + "learning_rate": 0.000246730120707321, + "loss": 2.8614, + "step": 7820 + }, + { + "epoch": 0.45432786457200053, + "grad_norm": 0.14270992577075958, + "learning_rate": 0.00024635688130872027, + "loss": 2.8676, + "step": 7830 + }, + { + "epoch": 0.4549081045012113, + "grad_norm": 0.13346141576766968, + "learning_rate": 0.00024598347121922636, + "loss": 2.8663, + "step": 7840 + }, + { + "epoch": 0.45548834443042197, + "grad_norm": 0.1440788060426712, + "learning_rate": 0.00024560989181377434, + "loss": 2.8742, + "step": 7850 + }, + { + "epoch": 0.4560685843596327, + "grad_norm": 0.13537128269672394, + "learning_rate": 0.00024523614446792267, + "loss": 2.8677, + "step": 7860 + }, + { + "epoch": 0.45664882428884346, + "grad_norm": 0.1451151967048645, + "learning_rate": 0.0002448622305578483, + "loss": 2.855, + "step": 7870 + }, + { + "epoch": 0.45722906421805415, + "grad_norm": 0.13695837557315826, + "learning_rate": 0.00024448815146034135, + "loss": 2.8736, + "step": 7880 + }, + { + "epoch": 0.4578093041472649, + "grad_norm": 0.13232262432575226, + "learning_rate": 0.00024411390855280023, + "loss": 2.865, + "step": 7890 + }, + { + "epoch": 0.45838954407647564, + "grad_norm": 0.1366211622953415, + "learning_rate": 0.00024373950321322663, + "loss": 2.86, + "step": 7900 + }, + { + "epoch": 0.45896978400568633, + "grad_norm": 0.14180238544940948, + "learning_rate": 0.00024336493682022012, + "loss": 2.8601, + "step": 7910 + }, + { + "epoch": 0.4595500239348971, + "grad_norm": 0.1388852894306183, + "learning_rate": 0.00024299021075297343, + "loss": 2.86, + "step": 7920 + }, + { + "epoch": 0.4601302638641078, + "grad_norm": 0.1402927190065384, + "learning_rate": 0.0002426153263912673, + "loss": 2.8718, + "step": 7930 + }, + { + "epoch": 0.4607105037933185, + "grad_norm": 0.1446109563112259, + "learning_rate": 0.00024224028511546505, + "loss": 2.8632, + "step": 7940 + }, + { + "epoch": 0.46129074372252926, + "grad_norm": 0.138419508934021, + "learning_rate": 0.00024186508830650806, + "loss": 2.8598, + "step": 7950 + }, + { + "epoch": 0.46187098365174, + "grad_norm": 0.14222180843353271, + "learning_rate": 0.00024148973734591027, + "loss": 2.861, + "step": 7960 + }, + { + "epoch": 0.4624512235809507, + "grad_norm": 0.12518581748008728, + "learning_rate": 0.00024111423361575322, + "loss": 2.8533, + "step": 7970 + }, + { + "epoch": 0.46303146351016145, + "grad_norm": 0.13554495573043823, + "learning_rate": 0.00024073857849868092, + "loss": 2.8599, + "step": 7980 + }, + { + "epoch": 0.4636117034393722, + "grad_norm": 0.13020184636116028, + "learning_rate": 0.000240362773377895, + "loss": 2.8551, + "step": 7990 + }, + { + "epoch": 0.4641919433685829, + "grad_norm": 0.14233264327049255, + "learning_rate": 0.00023998681963714914, + "loss": 2.8584, + "step": 8000 + }, + { + "epoch": 0.4641919433685829, + "eval_loss": 2.8317151069641113, + "eval_runtime": 5.391, + "eval_samples_per_second": 803.193, + "eval_steps_per_second": 1.669, + "step": 8000 + }, + { + "epoch": 0.46477218329779363, + "grad_norm": 0.1347561776638031, + "learning_rate": 0.00023961071866074454, + "loss": 2.8583, + "step": 8010 + }, + { + "epoch": 0.4653524232270044, + "grad_norm": 0.13999909162521362, + "learning_rate": 0.0002392344718335243, + "loss": 2.8618, + "step": 8020 + }, + { + "epoch": 0.4659326631562151, + "grad_norm": 0.15302954614162445, + "learning_rate": 0.00023885808054086867, + "loss": 2.857, + "step": 8030 + }, + { + "epoch": 0.4665129030854258, + "grad_norm": 0.1391817033290863, + "learning_rate": 0.00023848154616868978, + "loss": 2.855, + "step": 8040 + }, + { + "epoch": 0.46709314301463656, + "grad_norm": 0.13904796540737152, + "learning_rate": 0.0002381048701034267, + "loss": 2.8585, + "step": 8050 + }, + { + "epoch": 0.4676733829438473, + "grad_norm": 0.14073707163333893, + "learning_rate": 0.00023772805373204018, + "loss": 2.8599, + "step": 8060 + }, + { + "epoch": 0.468253622873058, + "grad_norm": 0.1362558752298355, + "learning_rate": 0.0002373510984420075, + "loss": 2.8605, + "step": 8070 + }, + { + "epoch": 0.46883386280226874, + "grad_norm": 0.14300206303596497, + "learning_rate": 0.00023697400562131758, + "loss": 2.8514, + "step": 8080 + }, + { + "epoch": 0.4694141027314795, + "grad_norm": 0.15007098019123077, + "learning_rate": 0.00023659677665846562, + "loss": 2.8581, + "step": 8090 + }, + { + "epoch": 0.4699943426606902, + "grad_norm": 0.1375647932291031, + "learning_rate": 0.00023621941294244824, + "loss": 2.851, + "step": 8100 + }, + { + "epoch": 0.4705745825899009, + "grad_norm": 0.14102703332901, + "learning_rate": 0.0002358419158627582, + "loss": 2.853, + "step": 8110 + }, + { + "epoch": 0.47115482251911167, + "grad_norm": 0.14791908860206604, + "learning_rate": 0.00023546428680937926, + "loss": 2.8586, + "step": 8120 + }, + { + "epoch": 0.47173506244832236, + "grad_norm": 0.1465589553117752, + "learning_rate": 0.00023508652717278111, + "loss": 2.856, + "step": 8130 + }, + { + "epoch": 0.4723153023775331, + "grad_norm": 0.1365519016981125, + "learning_rate": 0.00023470863834391438, + "loss": 2.8643, + "step": 8140 + }, + { + "epoch": 0.47289554230674385, + "grad_norm": 0.13424411416053772, + "learning_rate": 0.00023433062171420522, + "loss": 2.8562, + "step": 8150 + }, + { + "epoch": 0.47347578223595455, + "grad_norm": 0.15107622742652893, + "learning_rate": 0.00023395247867555053, + "loss": 2.857, + "step": 8160 + }, + { + "epoch": 0.4740560221651653, + "grad_norm": 0.13523408770561218, + "learning_rate": 0.00023357421062031265, + "loss": 2.853, + "step": 8170 + }, + { + "epoch": 0.47463626209437604, + "grad_norm": 0.1367097645998001, + "learning_rate": 0.0002331958189413141, + "loss": 2.853, + "step": 8180 + }, + { + "epoch": 0.47521650202358673, + "grad_norm": 0.139958456158638, + "learning_rate": 0.00023281730503183274, + "loss": 2.8529, + "step": 8190 + }, + { + "epoch": 0.4757967419527975, + "grad_norm": 0.1375078707933426, + "learning_rate": 0.00023243867028559633, + "loss": 2.8492, + "step": 8200 + }, + { + "epoch": 0.4763769818820082, + "grad_norm": 0.1325850486755371, + "learning_rate": 0.0002320599160967778, + "loss": 2.8555, + "step": 8210 + }, + { + "epoch": 0.4769572218112189, + "grad_norm": 0.13762585818767548, + "learning_rate": 0.00023168104385998963, + "loss": 2.8443, + "step": 8220 + }, + { + "epoch": 0.47753746174042966, + "grad_norm": 0.13820673525333405, + "learning_rate": 0.0002313020549702792, + "loss": 2.8495, + "step": 8230 + }, + { + "epoch": 0.4781177016696404, + "grad_norm": 0.13856127858161926, + "learning_rate": 0.00023092295082312325, + "loss": 2.8484, + "step": 8240 + }, + { + "epoch": 0.47869794159885115, + "grad_norm": 0.13684526085853577, + "learning_rate": 0.000230543732814423, + "loss": 2.8481, + "step": 8250 + }, + { + "epoch": 0.47927818152806184, + "grad_norm": 0.13395436108112335, + "learning_rate": 0.0002301644023404988, + "loss": 2.8482, + "step": 8260 + }, + { + "epoch": 0.4798584214572726, + "grad_norm": 0.14342832565307617, + "learning_rate": 0.00022978496079808526, + "loss": 2.8523, + "step": 8270 + }, + { + "epoch": 0.48043866138648333, + "grad_norm": 0.14550542831420898, + "learning_rate": 0.00022940540958432584, + "loss": 2.8512, + "step": 8280 + }, + { + "epoch": 0.481018901315694, + "grad_norm": 0.14272627234458923, + "learning_rate": 0.00022902575009676795, + "loss": 2.8479, + "step": 8290 + }, + { + "epoch": 0.48159914124490477, + "grad_norm": 0.13808168470859528, + "learning_rate": 0.00022864598373335753, + "loss": 2.8539, + "step": 8300 + }, + { + "epoch": 0.4821793811741155, + "grad_norm": 0.13078927993774414, + "learning_rate": 0.00022826611189243407, + "loss": 2.8494, + "step": 8310 + }, + { + "epoch": 0.4827596211033262, + "grad_norm": 0.12881894409656525, + "learning_rate": 0.0002278861359727256, + "loss": 2.8447, + "step": 8320 + }, + { + "epoch": 0.48333986103253695, + "grad_norm": 0.12993864715099335, + "learning_rate": 0.00022750605737334323, + "loss": 2.8411, + "step": 8330 + }, + { + "epoch": 0.4839201009617477, + "grad_norm": 0.13983088731765747, + "learning_rate": 0.00022712587749377608, + "loss": 2.8414, + "step": 8340 + }, + { + "epoch": 0.4845003408909584, + "grad_norm": 0.1545930653810501, + "learning_rate": 0.0002267455977338864, + "loss": 2.8432, + "step": 8350 + }, + { + "epoch": 0.48508058082016914, + "grad_norm": 0.13540688157081604, + "learning_rate": 0.00022636521949390406, + "loss": 2.8493, + "step": 8360 + }, + { + "epoch": 0.4856608207493799, + "grad_norm": 0.15347440540790558, + "learning_rate": 0.0002259847441744216, + "loss": 2.8424, + "step": 8370 + }, + { + "epoch": 0.4862410606785906, + "grad_norm": 0.1298235058784485, + "learning_rate": 0.00022560417317638907, + "loss": 2.8462, + "step": 8380 + }, + { + "epoch": 0.4868213006078013, + "grad_norm": 0.15003375709056854, + "learning_rate": 0.00022522350790110863, + "loss": 2.845, + "step": 8390 + }, + { + "epoch": 0.48740154053701207, + "grad_norm": 0.15155071020126343, + "learning_rate": 0.00022484274975022973, + "loss": 2.8421, + "step": 8400 + }, + { + "epoch": 0.48798178046622276, + "grad_norm": 0.12803903222084045, + "learning_rate": 0.0002244619001257438, + "loss": 2.8456, + "step": 8410 + }, + { + "epoch": 0.4885620203954335, + "grad_norm": 0.14884670078754425, + "learning_rate": 0.00022408096042997905, + "loss": 2.8433, + "step": 8420 + }, + { + "epoch": 0.48914226032464425, + "grad_norm": 0.14629362523555756, + "learning_rate": 0.00022369993206559533, + "loss": 2.8419, + "step": 8430 + }, + { + "epoch": 0.48972250025385494, + "grad_norm": 0.14142775535583496, + "learning_rate": 0.00022331881643557905, + "loss": 2.8509, + "step": 8440 + }, + { + "epoch": 0.4903027401830657, + "grad_norm": 0.14854149520397186, + "learning_rate": 0.00022293761494323783, + "loss": 2.8364, + "step": 8450 + }, + { + "epoch": 0.49088298011227643, + "grad_norm": 0.14419108629226685, + "learning_rate": 0.00022255632899219547, + "loss": 2.8466, + "step": 8460 + }, + { + "epoch": 0.4914632200414872, + "grad_norm": 0.13738323748111725, + "learning_rate": 0.0002221749599863868, + "loss": 2.844, + "step": 8470 + }, + { + "epoch": 0.49204345997069787, + "grad_norm": 0.14036454260349274, + "learning_rate": 0.00022179350933005255, + "loss": 2.8401, + "step": 8480 + }, + { + "epoch": 0.4926236998999086, + "grad_norm": 0.1321595460176468, + "learning_rate": 0.00022141197842773385, + "loss": 2.848, + "step": 8490 + }, + { + "epoch": 0.49320393982911936, + "grad_norm": 0.13454943895339966, + "learning_rate": 0.0002210303686842676, + "loss": 2.8359, + "step": 8500 + }, + { + "epoch": 0.49378417975833006, + "grad_norm": 0.13193447887897491, + "learning_rate": 0.00022064868150478066, + "loss": 2.8477, + "step": 8510 + }, + { + "epoch": 0.4943644196875408, + "grad_norm": 0.13707976043224335, + "learning_rate": 0.0002202669182946854, + "loss": 2.8433, + "step": 8520 + }, + { + "epoch": 0.49494465961675155, + "grad_norm": 0.1408550888299942, + "learning_rate": 0.00021988508045967385, + "loss": 2.8376, + "step": 8530 + }, + { + "epoch": 0.49552489954596224, + "grad_norm": 0.13284894824028015, + "learning_rate": 0.00021950316940571294, + "loss": 2.8442, + "step": 8540 + }, + { + "epoch": 0.496105139475173, + "grad_norm": 0.1373133808374405, + "learning_rate": 0.0002191211865390392, + "loss": 2.8333, + "step": 8550 + }, + { + "epoch": 0.49668537940438373, + "grad_norm": 0.1252400428056717, + "learning_rate": 0.00021873913326615356, + "loss": 2.8383, + "step": 8560 + }, + { + "epoch": 0.4972656193335944, + "grad_norm": 0.13226158916950226, + "learning_rate": 0.0002183570109938161, + "loss": 2.8334, + "step": 8570 + }, + { + "epoch": 0.49784585926280517, + "grad_norm": 0.1358543038368225, + "learning_rate": 0.00021797482112904118, + "loss": 2.8407, + "step": 8580 + }, + { + "epoch": 0.4984260991920159, + "grad_norm": 0.14613795280456543, + "learning_rate": 0.00021759256507909185, + "loss": 2.8415, + "step": 8590 + }, + { + "epoch": 0.4990063391212266, + "grad_norm": 0.14075227081775665, + "learning_rate": 0.00021721024425147496, + "loss": 2.8347, + "step": 8600 + }, + { + "epoch": 0.49958657905043735, + "grad_norm": 0.1412448287010193, + "learning_rate": 0.00021682786005393587, + "loss": 2.8366, + "step": 8610 + }, + { + "epoch": 0.5001668189796481, + "grad_norm": 0.13218757510185242, + "learning_rate": 0.00021644541389445317, + "loss": 2.8401, + "step": 8620 + }, + { + "epoch": 0.5007470589088588, + "grad_norm": 0.1320735365152359, + "learning_rate": 0.00021606290718123377, + "loss": 2.8443, + "step": 8630 + }, + { + "epoch": 0.5013272988380696, + "grad_norm": 0.13078896701335907, + "learning_rate": 0.0002156803413227074, + "loss": 2.8471, + "step": 8640 + }, + { + "epoch": 0.5019075387672802, + "grad_norm": 0.13882210850715637, + "learning_rate": 0.00021529771772752163, + "loss": 2.8499, + "step": 8650 + }, + { + "epoch": 0.502487778696491, + "grad_norm": 0.1350562423467636, + "learning_rate": 0.00021491503780453672, + "loss": 2.8324, + "step": 8660 + }, + { + "epoch": 0.5030680186257017, + "grad_norm": 0.1424356997013092, + "learning_rate": 0.0002145323029628201, + "loss": 2.8423, + "step": 8670 + }, + { + "epoch": 0.5036482585549125, + "grad_norm": 0.13661132752895355, + "learning_rate": 0.0002141495146116416, + "loss": 2.8403, + "step": 8680 + }, + { + "epoch": 0.5042284984841232, + "grad_norm": 0.13870751857757568, + "learning_rate": 0.00021376667416046806, + "loss": 2.8355, + "step": 8690 + }, + { + "epoch": 0.504808738413334, + "grad_norm": 0.1345418244600296, + "learning_rate": 0.0002133837830189581, + "loss": 2.8396, + "step": 8700 + }, + { + "epoch": 0.5053889783425446, + "grad_norm": 0.13897638022899628, + "learning_rate": 0.00021300084259695697, + "loss": 2.8376, + "step": 8710 + }, + { + "epoch": 0.5059692182717553, + "grad_norm": 0.14594705402851105, + "learning_rate": 0.00021261785430449153, + "loss": 2.8382, + "step": 8720 + }, + { + "epoch": 0.5065494582009661, + "grad_norm": 0.13326287269592285, + "learning_rate": 0.00021223481955176467, + "loss": 2.8332, + "step": 8730 + }, + { + "epoch": 0.5071296981301768, + "grad_norm": 0.1389443427324295, + "learning_rate": 0.00021185173974915057, + "loss": 2.8356, + "step": 8740 + }, + { + "epoch": 0.5077099380593876, + "grad_norm": 0.14216211438179016, + "learning_rate": 0.0002114686163071892, + "loss": 2.8374, + "step": 8750 + }, + { + "epoch": 0.5082901779885983, + "grad_norm": 0.13781870901584625, + "learning_rate": 0.00021108545063658113, + "loss": 2.8358, + "step": 8760 + }, + { + "epoch": 0.508870417917809, + "grad_norm": 0.13731315732002258, + "learning_rate": 0.00021070224414818247, + "loss": 2.824, + "step": 8770 + }, + { + "epoch": 0.5094506578470197, + "grad_norm": 0.13154597580432892, + "learning_rate": 0.00021031899825299974, + "loss": 2.8328, + "step": 8780 + }, + { + "epoch": 0.5100308977762305, + "grad_norm": 0.1381417214870453, + "learning_rate": 0.00020993571436218452, + "loss": 2.8275, + "step": 8790 + }, + { + "epoch": 0.5106111377054412, + "grad_norm": 0.14278624951839447, + "learning_rate": 0.00020955239388702817, + "loss": 2.8339, + "step": 8800 + }, + { + "epoch": 0.5111913776346519, + "grad_norm": 0.13169626891613007, + "learning_rate": 0.00020916903823895683, + "loss": 2.8297, + "step": 8810 + }, + { + "epoch": 0.5117716175638627, + "grad_norm": 0.12725697457790375, + "learning_rate": 0.0002087856488295262, + "loss": 2.8334, + "step": 8820 + }, + { + "epoch": 0.5123518574930734, + "grad_norm": 0.1367608606815338, + "learning_rate": 0.00020840222707041616, + "loss": 2.8345, + "step": 8830 + }, + { + "epoch": 0.5129320974222841, + "grad_norm": 0.13655100762844086, + "learning_rate": 0.00020801877437342584, + "loss": 2.8295, + "step": 8840 + }, + { + "epoch": 0.5135123373514948, + "grad_norm": 0.13129231333732605, + "learning_rate": 0.00020763529215046827, + "loss": 2.8403, + "step": 8850 + }, + { + "epoch": 0.5140925772807056, + "grad_norm": 0.1390671730041504, + "learning_rate": 0.0002072517818135652, + "loss": 2.8419, + "step": 8860 + }, + { + "epoch": 0.5146728172099163, + "grad_norm": 0.13736438751220703, + "learning_rate": 0.00020686824477484178, + "loss": 2.8326, + "step": 8870 + }, + { + "epoch": 0.5152530571391271, + "grad_norm": 0.1363706886768341, + "learning_rate": 0.0002064846824465216, + "loss": 2.8308, + "step": 8880 + }, + { + "epoch": 0.5158332970683378, + "grad_norm": 0.1408400982618332, + "learning_rate": 0.00020610109624092133, + "loss": 2.8328, + "step": 8890 + }, + { + "epoch": 0.5164135369975484, + "grad_norm": 0.13489697873592377, + "learning_rate": 0.00020571748757044556, + "loss": 2.8258, + "step": 8900 + }, + { + "epoch": 0.5169937769267592, + "grad_norm": 0.13712669909000397, + "learning_rate": 0.00020533385784758163, + "loss": 2.8357, + "step": 8910 + }, + { + "epoch": 0.5175740168559699, + "grad_norm": 0.13442225754261017, + "learning_rate": 0.00020495020848489438, + "loss": 2.8213, + "step": 8920 + }, + { + "epoch": 0.5181542567851807, + "grad_norm": 0.13573797047138214, + "learning_rate": 0.00020456654089502085, + "loss": 2.8359, + "step": 8930 + }, + { + "epoch": 0.5187344967143914, + "grad_norm": 0.12967585027217865, + "learning_rate": 0.0002041828564906654, + "loss": 2.8287, + "step": 8940 + }, + { + "epoch": 0.5193147366436022, + "grad_norm": 0.13563676178455353, + "learning_rate": 0.00020379915668459412, + "loss": 2.8245, + "step": 8950 + }, + { + "epoch": 0.5198949765728128, + "grad_norm": 0.14425864815711975, + "learning_rate": 0.00020341544288963, + "loss": 2.8288, + "step": 8960 + }, + { + "epoch": 0.5204752165020236, + "grad_norm": 0.13940462470054626, + "learning_rate": 0.00020303171651864737, + "loss": 2.8248, + "step": 8970 + }, + { + "epoch": 0.5210554564312343, + "grad_norm": 0.13967497646808624, + "learning_rate": 0.00020264797898456692, + "loss": 2.8302, + "step": 8980 + }, + { + "epoch": 0.521635696360445, + "grad_norm": 0.14502641558647156, + "learning_rate": 0.00020226423170035043, + "loss": 2.8241, + "step": 8990 + }, + { + "epoch": 0.5222159362896558, + "grad_norm": 0.12917964160442352, + "learning_rate": 0.00020188047607899563, + "loss": 2.8266, + "step": 9000 + }, + { + "epoch": 0.5222159362896558, + "eval_loss": 2.798532724380493, + "eval_runtime": 5.3934, + "eval_samples_per_second": 802.828, + "eval_steps_per_second": 1.669, + "step": 9000 + }, + { + "epoch": 0.5227961762188665, + "grad_norm": 0.131534606218338, + "learning_rate": 0.00020149671353353088, + "loss": 2.8277, + "step": 9010 + }, + { + "epoch": 0.5233764161480773, + "grad_norm": 0.13375796377658844, + "learning_rate": 0.00020111294547701017, + "loss": 2.8293, + "step": 9020 + }, + { + "epoch": 0.5239566560772879, + "grad_norm": 0.13121195137500763, + "learning_rate": 0.00020072917332250757, + "loss": 2.8292, + "step": 9030 + }, + { + "epoch": 0.5245368960064987, + "grad_norm": 0.12965995073318481, + "learning_rate": 0.0002003453984831124, + "loss": 2.8317, + "step": 9040 + }, + { + "epoch": 0.5251171359357094, + "grad_norm": 0.13647259771823883, + "learning_rate": 0.0001999616223719239, + "loss": 2.83, + "step": 9050 + }, + { + "epoch": 0.5256973758649202, + "grad_norm": 0.13376837968826294, + "learning_rate": 0.0001995778464020458, + "loss": 2.8333, + "step": 9060 + }, + { + "epoch": 0.5262776157941309, + "grad_norm": 0.13384296000003815, + "learning_rate": 0.00019919407198658155, + "loss": 2.8144, + "step": 9070 + }, + { + "epoch": 0.5268578557233417, + "grad_norm": 0.13470128178596497, + "learning_rate": 0.00019881030053862857, + "loss": 2.8216, + "step": 9080 + }, + { + "epoch": 0.5274380956525523, + "grad_norm": 0.14076776802539825, + "learning_rate": 0.0001984265334712737, + "loss": 2.8259, + "step": 9090 + }, + { + "epoch": 0.528018335581763, + "grad_norm": 0.12356127053499222, + "learning_rate": 0.00019804277219758737, + "loss": 2.825, + "step": 9100 + }, + { + "epoch": 0.5285985755109738, + "grad_norm": 0.13532927632331848, + "learning_rate": 0.00019765901813061882, + "loss": 2.8228, + "step": 9110 + }, + { + "epoch": 0.5291788154401845, + "grad_norm": 0.14467062056064606, + "learning_rate": 0.00019727527268339088, + "loss": 2.8316, + "step": 9120 + }, + { + "epoch": 0.5297590553693953, + "grad_norm": 0.14286072552204132, + "learning_rate": 0.00019689153726889423, + "loss": 2.824, + "step": 9130 + }, + { + "epoch": 0.530339295298606, + "grad_norm": 0.13074620068073273, + "learning_rate": 0.00019650781330008305, + "loss": 2.823, + "step": 9140 + }, + { + "epoch": 0.5309195352278167, + "grad_norm": 0.12763041257858276, + "learning_rate": 0.00019612410218986908, + "loss": 2.8227, + "step": 9150 + }, + { + "epoch": 0.5314997751570274, + "grad_norm": 0.1378396898508072, + "learning_rate": 0.0001957404053511169, + "loss": 2.8189, + "step": 9160 + }, + { + "epoch": 0.5320800150862381, + "grad_norm": 0.1301429569721222, + "learning_rate": 0.0001953567241966385, + "loss": 2.8173, + "step": 9170 + }, + { + "epoch": 0.5326602550154489, + "grad_norm": 0.13029593229293823, + "learning_rate": 0.00019497306013918793, + "loss": 2.8295, + "step": 9180 + }, + { + "epoch": 0.5332404949446596, + "grad_norm": 0.13020405173301697, + "learning_rate": 0.00019458941459145657, + "loss": 2.8249, + "step": 9190 + }, + { + "epoch": 0.5338207348738704, + "grad_norm": 0.1313934177160263, + "learning_rate": 0.00019420578896606747, + "loss": 2.8262, + "step": 9200 + }, + { + "epoch": 0.534400974803081, + "grad_norm": 0.13342641294002533, + "learning_rate": 0.00019382218467557048, + "loss": 2.8289, + "step": 9210 + }, + { + "epoch": 0.5349812147322918, + "grad_norm": 0.13609477877616882, + "learning_rate": 0.00019343860313243659, + "loss": 2.8192, + "step": 9220 + }, + { + "epoch": 0.5355614546615025, + "grad_norm": 0.1358291357755661, + "learning_rate": 0.00019305504574905328, + "loss": 2.8275, + "step": 9230 + }, + { + "epoch": 0.5361416945907133, + "grad_norm": 0.1305118352174759, + "learning_rate": 0.00019267151393771918, + "loss": 2.8214, + "step": 9240 + }, + { + "epoch": 0.536721934519924, + "grad_norm": 0.12923507392406464, + "learning_rate": 0.0001922880091106384, + "loss": 2.8266, + "step": 9250 + }, + { + "epoch": 0.5373021744491348, + "grad_norm": 0.13450828194618225, + "learning_rate": 0.00019190453267991598, + "loss": 2.8187, + "step": 9260 + }, + { + "epoch": 0.5378824143783455, + "grad_norm": 0.1334317922592163, + "learning_rate": 0.00019152108605755222, + "loss": 2.819, + "step": 9270 + }, + { + "epoch": 0.5384626543075561, + "grad_norm": 0.13587439060211182, + "learning_rate": 0.0001911376706554379, + "loss": 2.8167, + "step": 9280 + }, + { + "epoch": 0.5390428942367669, + "grad_norm": 0.13316716253757477, + "learning_rate": 0.00019075428788534863, + "loss": 2.8143, + "step": 9290 + }, + { + "epoch": 0.5396231341659776, + "grad_norm": 0.13156826794147491, + "learning_rate": 0.00019037093915893986, + "loss": 2.816, + "step": 9300 + }, + { + "epoch": 0.5402033740951884, + "grad_norm": 0.1294325441122055, + "learning_rate": 0.00018998762588774188, + "loss": 2.815, + "step": 9310 + }, + { + "epoch": 0.5407836140243991, + "grad_norm": 0.1401001513004303, + "learning_rate": 0.0001896043494831542, + "loss": 2.813, + "step": 9320 + }, + { + "epoch": 0.5413638539536099, + "grad_norm": 0.13030453026294708, + "learning_rate": 0.00018922111135644083, + "loss": 2.8207, + "step": 9330 + }, + { + "epoch": 0.5419440938828205, + "grad_norm": 0.13054735958576202, + "learning_rate": 0.00018883791291872452, + "loss": 2.8208, + "step": 9340 + }, + { + "epoch": 0.5425243338120312, + "grad_norm": 0.13646872341632843, + "learning_rate": 0.00018845475558098215, + "loss": 2.814, + "step": 9350 + }, + { + "epoch": 0.543104573741242, + "grad_norm": 0.12413671612739563, + "learning_rate": 0.00018807164075403923, + "loss": 2.8181, + "step": 9360 + }, + { + "epoch": 0.5436848136704527, + "grad_norm": 0.12649095058441162, + "learning_rate": 0.0001876885698485646, + "loss": 2.8199, + "step": 9370 + }, + { + "epoch": 0.5442650535996635, + "grad_norm": 0.12739893794059753, + "learning_rate": 0.00018730554427506558, + "loss": 2.8129, + "step": 9380 + }, + { + "epoch": 0.5448452935288742, + "grad_norm": 0.14016854763031006, + "learning_rate": 0.00018692256544388227, + "loss": 2.8023, + "step": 9390 + }, + { + "epoch": 0.5454255334580849, + "grad_norm": 0.1389675885438919, + "learning_rate": 0.00018653963476518296, + "loss": 2.8174, + "step": 9400 + }, + { + "epoch": 0.5460057733872956, + "grad_norm": 0.13839608430862427, + "learning_rate": 0.00018615675364895857, + "loss": 2.8153, + "step": 9410 + }, + { + "epoch": 0.5465860133165064, + "grad_norm": 0.14397138357162476, + "learning_rate": 0.00018577392350501736, + "loss": 2.8256, + "step": 9420 + }, + { + "epoch": 0.5471662532457171, + "grad_norm": 0.13635268807411194, + "learning_rate": 0.0001853911457429802, + "loss": 2.8074, + "step": 9430 + }, + { + "epoch": 0.5477464931749279, + "grad_norm": 0.14737777411937714, + "learning_rate": 0.0001850084217722747, + "loss": 2.8211, + "step": 9440 + }, + { + "epoch": 0.5483267331041386, + "grad_norm": 0.13884200155735016, + "learning_rate": 0.00018462575300213076, + "loss": 2.8142, + "step": 9450 + }, + { + "epoch": 0.5489069730333493, + "grad_norm": 0.13363014161586761, + "learning_rate": 0.0001842431408415748, + "loss": 2.8046, + "step": 9460 + }, + { + "epoch": 0.54948721296256, + "grad_norm": 0.13794805109500885, + "learning_rate": 0.00018386058669942487, + "loss": 2.8011, + "step": 9470 + }, + { + "epoch": 0.5500674528917707, + "grad_norm": 0.13166405260562897, + "learning_rate": 0.00018347809198428555, + "loss": 2.8138, + "step": 9480 + }, + { + "epoch": 0.5506476928209815, + "grad_norm": 0.13902068138122559, + "learning_rate": 0.00018309565810454222, + "loss": 2.8204, + "step": 9490 + }, + { + "epoch": 0.5512279327501922, + "grad_norm": 0.12628786265850067, + "learning_rate": 0.00018271328646835672, + "loss": 2.8203, + "step": 9500 + }, + { + "epoch": 0.551808172679403, + "grad_norm": 0.1412256807088852, + "learning_rate": 0.00018233097848366125, + "loss": 2.8169, + "step": 9510 + }, + { + "epoch": 0.5523884126086137, + "grad_norm": 0.13156931102275848, + "learning_rate": 0.00018194873555815394, + "loss": 2.8215, + "step": 9520 + }, + { + "epoch": 0.5529686525378243, + "grad_norm": 0.14922703802585602, + "learning_rate": 0.0001815665590992934, + "loss": 2.8131, + "step": 9530 + }, + { + "epoch": 0.5535488924670351, + "grad_norm": 0.13142195343971252, + "learning_rate": 0.0001811844505142932, + "loss": 2.8113, + "step": 9540 + }, + { + "epoch": 0.5541291323962458, + "grad_norm": 0.13327111303806305, + "learning_rate": 0.0001808024112101174, + "loss": 2.8157, + "step": 9550 + }, + { + "epoch": 0.5547093723254566, + "grad_norm": 0.12934935092926025, + "learning_rate": 0.0001804204425934745, + "loss": 2.8038, + "step": 9560 + }, + { + "epoch": 0.5552896122546673, + "grad_norm": 0.12223649024963379, + "learning_rate": 0.0001800385460708131, + "loss": 2.8231, + "step": 9570 + }, + { + "epoch": 0.5558698521838781, + "grad_norm": 0.13266624510288239, + "learning_rate": 0.00017965672304831614, + "loss": 2.8154, + "step": 9580 + }, + { + "epoch": 0.5564500921130887, + "grad_norm": 0.12882035970687866, + "learning_rate": 0.00017927497493189603, + "loss": 2.8011, + "step": 9590 + }, + { + "epoch": 0.5570303320422995, + "grad_norm": 0.14030689001083374, + "learning_rate": 0.0001788933031271894, + "loss": 2.8148, + "step": 9600 + }, + { + "epoch": 0.5576105719715102, + "grad_norm": 0.14994706213474274, + "learning_rate": 0.00017851170903955167, + "loss": 2.8033, + "step": 9610 + }, + { + "epoch": 0.558190811900721, + "grad_norm": 0.13254228234291077, + "learning_rate": 0.00017813019407405232, + "loss": 2.8123, + "step": 9620 + }, + { + "epoch": 0.5587710518299317, + "grad_norm": 0.13098488748073578, + "learning_rate": 0.0001777487596354694, + "loss": 2.81, + "step": 9630 + }, + { + "epoch": 0.5593512917591424, + "grad_norm": 0.13294130563735962, + "learning_rate": 0.00017736740712828443, + "loss": 2.8191, + "step": 9640 + }, + { + "epoch": 0.5599315316883531, + "grad_norm": 0.13288183510303497, + "learning_rate": 0.00017698613795667746, + "loss": 2.8131, + "step": 9650 + }, + { + "epoch": 0.5605117716175638, + "grad_norm": 0.12782664597034454, + "learning_rate": 0.00017660495352452132, + "loss": 2.8103, + "step": 9660 + }, + { + "epoch": 0.5610920115467746, + "grad_norm": 0.1320827752351761, + "learning_rate": 0.00017622385523537713, + "loss": 2.807, + "step": 9670 + }, + { + "epoch": 0.5616722514759853, + "grad_norm": 0.13161487877368927, + "learning_rate": 0.00017584284449248864, + "loss": 2.8104, + "step": 9680 + }, + { + "epoch": 0.5622524914051961, + "grad_norm": 0.1345384418964386, + "learning_rate": 0.00017546192269877748, + "loss": 2.812, + "step": 9690 + }, + { + "epoch": 0.5628327313344068, + "grad_norm": 0.1259540617465973, + "learning_rate": 0.00017508109125683737, + "loss": 2.8013, + "step": 9700 + }, + { + "epoch": 0.5634129712636176, + "grad_norm": 0.12676270306110382, + "learning_rate": 0.00017470035156892972, + "loss": 2.8109, + "step": 9710 + }, + { + "epoch": 0.5639932111928282, + "grad_norm": 0.13035669922828674, + "learning_rate": 0.00017431970503697795, + "loss": 2.8096, + "step": 9720 + }, + { + "epoch": 0.5645734511220389, + "grad_norm": 0.13196563720703125, + "learning_rate": 0.00017393915306256237, + "loss": 2.8044, + "step": 9730 + }, + { + "epoch": 0.5651536910512497, + "grad_norm": 0.12951119244098663, + "learning_rate": 0.00017355869704691537, + "loss": 2.8023, + "step": 9740 + }, + { + "epoch": 0.5657339309804604, + "grad_norm": 0.13105542957782745, + "learning_rate": 0.00017317833839091567, + "loss": 2.806, + "step": 9750 + }, + { + "epoch": 0.5663141709096712, + "grad_norm": 0.1464497148990631, + "learning_rate": 0.00017279807849508377, + "loss": 2.8131, + "step": 9760 + }, + { + "epoch": 0.5668944108388819, + "grad_norm": 0.14261843264102936, + "learning_rate": 0.00017241791875957657, + "loss": 2.812, + "step": 9770 + }, + { + "epoch": 0.5674746507680926, + "grad_norm": 0.1395425796508789, + "learning_rate": 0.0001720378605841818, + "loss": 2.8075, + "step": 9780 + }, + { + "epoch": 0.5680548906973033, + "grad_norm": 0.13989108800888062, + "learning_rate": 0.00017165790536831366, + "loss": 2.8076, + "step": 9790 + }, + { + "epoch": 0.568635130626514, + "grad_norm": 0.14962467551231384, + "learning_rate": 0.00017127805451100692, + "loss": 2.8027, + "step": 9800 + }, + { + "epoch": 0.5692153705557248, + "grad_norm": 0.13398703932762146, + "learning_rate": 0.0001708983094109124, + "loss": 2.8068, + "step": 9810 + }, + { + "epoch": 0.5697956104849355, + "grad_norm": 0.12953974306583405, + "learning_rate": 0.00017051867146629116, + "loss": 2.8114, + "step": 9820 + }, + { + "epoch": 0.5703758504141463, + "grad_norm": 0.14943692088127136, + "learning_rate": 0.00017013914207501, + "loss": 2.8009, + "step": 9830 + }, + { + "epoch": 0.5709560903433569, + "grad_norm": 0.13319486379623413, + "learning_rate": 0.00016975972263453585, + "loss": 2.8085, + "step": 9840 + }, + { + "epoch": 0.5715363302725677, + "grad_norm": 0.13063213229179382, + "learning_rate": 0.00016938041454193082, + "loss": 2.803, + "step": 9850 + }, + { + "epoch": 0.5721165702017784, + "grad_norm": 0.1308039277791977, + "learning_rate": 0.00016900121919384716, + "loss": 2.8039, + "step": 9860 + }, + { + "epoch": 0.5726968101309892, + "grad_norm": 0.12958461046218872, + "learning_rate": 0.0001686221379865217, + "loss": 2.8047, + "step": 9870 + }, + { + "epoch": 0.5732770500601999, + "grad_norm": 0.12382495403289795, + "learning_rate": 0.0001682431723157712, + "loss": 2.819, + "step": 9880 + }, + { + "epoch": 0.5738572899894107, + "grad_norm": 0.1312320977449417, + "learning_rate": 0.00016786432357698708, + "loss": 2.7955, + "step": 9890 + }, + { + "epoch": 0.5744375299186214, + "grad_norm": 0.12677842378616333, + "learning_rate": 0.00016748559316512993, + "loss": 2.7956, + "step": 9900 + }, + { + "epoch": 0.575017769847832, + "grad_norm": 0.130837082862854, + "learning_rate": 0.00016710698247472493, + "loss": 2.7972, + "step": 9910 + }, + { + "epoch": 0.5755980097770428, + "grad_norm": 0.13894180953502655, + "learning_rate": 0.0001667284928998562, + "loss": 2.8055, + "step": 9920 + }, + { + "epoch": 0.5761782497062535, + "grad_norm": 0.14453203976154327, + "learning_rate": 0.00016635012583416205, + "loss": 2.8029, + "step": 9930 + }, + { + "epoch": 0.5767584896354643, + "grad_norm": 0.12817683815956116, + "learning_rate": 0.0001659718826708296, + "loss": 2.7971, + "step": 9940 + }, + { + "epoch": 0.577338729564675, + "grad_norm": 0.1308722198009491, + "learning_rate": 0.00016559376480258987, + "loss": 2.7924, + "step": 9950 + }, + { + "epoch": 0.5779189694938858, + "grad_norm": 0.12425903975963593, + "learning_rate": 0.00016521577362171253, + "loss": 2.795, + "step": 9960 + }, + { + "epoch": 0.5784992094230964, + "grad_norm": 0.13473841547966003, + "learning_rate": 0.0001648379105200005, + "loss": 2.8023, + "step": 9970 + }, + { + "epoch": 0.5790794493523072, + "grad_norm": 0.13867899775505066, + "learning_rate": 0.00016446017688878547, + "loss": 2.803, + "step": 9980 + }, + { + "epoch": 0.5796596892815179, + "grad_norm": 0.1321575790643692, + "learning_rate": 0.00016408257411892215, + "loss": 2.7967, + "step": 9990 + }, + { + "epoch": 0.5802399292107286, + "grad_norm": 0.12522001564502716, + "learning_rate": 0.00016370510360078354, + "loss": 2.7939, + "step": 10000 + }, + { + "epoch": 0.5802399292107286, + "eval_loss": 2.7709856033325195, + "eval_runtime": 5.4112, + "eval_samples_per_second": 800.188, + "eval_steps_per_second": 1.663, + "step": 10000 + }, + { + "epoch": 0.5808201691399394, + "grad_norm": 0.13485607504844666, + "learning_rate": 0.0001633277667242557, + "loss": 2.7988, + "step": 10010 + }, + { + "epoch": 0.5814004090691501, + "grad_norm": 0.1311897337436676, + "learning_rate": 0.00016295056487873242, + "loss": 2.8103, + "step": 10020 + }, + { + "epoch": 0.5819806489983608, + "grad_norm": 0.13095928728580475, + "learning_rate": 0.00016257349945311044, + "loss": 2.7913, + "step": 10030 + }, + { + "epoch": 0.5825608889275715, + "grad_norm": 0.13543324172496796, + "learning_rate": 0.00016219657183578416, + "loss": 2.793, + "step": 10040 + }, + { + "epoch": 0.5831411288567823, + "grad_norm": 0.13157446682453156, + "learning_rate": 0.00016181978341464064, + "loss": 2.8001, + "step": 10050 + }, + { + "epoch": 0.583721368785993, + "grad_norm": 0.12915684282779694, + "learning_rate": 0.00016144313557705416, + "loss": 2.7973, + "step": 10060 + }, + { + "epoch": 0.5843016087152038, + "grad_norm": 0.12583227455615997, + "learning_rate": 0.0001610666297098816, + "loss": 2.7976, + "step": 10070 + }, + { + "epoch": 0.5848818486444145, + "grad_norm": 0.12779027223587036, + "learning_rate": 0.00016069026719945707, + "loss": 2.794, + "step": 10080 + }, + { + "epoch": 0.5854620885736251, + "grad_norm": 0.12447459995746613, + "learning_rate": 0.0001603140494315866, + "loss": 2.797, + "step": 10090 + }, + { + "epoch": 0.5860423285028359, + "grad_norm": 0.128029927611351, + "learning_rate": 0.00015993797779154356, + "loss": 2.8017, + "step": 10100 + }, + { + "epoch": 0.5866225684320466, + "grad_norm": 0.1264951229095459, + "learning_rate": 0.00015956205366406305, + "loss": 2.7931, + "step": 10110 + }, + { + "epoch": 0.5872028083612574, + "grad_norm": 0.13305509090423584, + "learning_rate": 0.0001591862784333371, + "loss": 2.7975, + "step": 10120 + }, + { + "epoch": 0.5877830482904681, + "grad_norm": 0.13381877541542053, + "learning_rate": 0.00015881065348300957, + "loss": 2.803, + "step": 10130 + }, + { + "epoch": 0.5883632882196789, + "grad_norm": 0.12950487434864044, + "learning_rate": 0.00015843518019617074, + "loss": 2.7968, + "step": 10140 + }, + { + "epoch": 0.5889435281488896, + "grad_norm": 0.12617862224578857, + "learning_rate": 0.0001580598599553527, + "loss": 2.7882, + "step": 10150 + }, + { + "epoch": 0.5895237680781003, + "grad_norm": 0.13075517117977142, + "learning_rate": 0.00015768469414252376, + "loss": 2.8059, + "step": 10160 + }, + { + "epoch": 0.590104008007311, + "grad_norm": 0.13265134394168854, + "learning_rate": 0.0001573096841390839, + "loss": 2.7919, + "step": 10170 + }, + { + "epoch": 0.5906842479365217, + "grad_norm": 0.12705199420452118, + "learning_rate": 0.00015693483132585908, + "loss": 2.7983, + "step": 10180 + }, + { + "epoch": 0.5912644878657325, + "grad_norm": 0.13738813996315002, + "learning_rate": 0.00015656013708309672, + "loss": 2.7949, + "step": 10190 + }, + { + "epoch": 0.5918447277949432, + "grad_norm": 0.13547635078430176, + "learning_rate": 0.0001561856027904603, + "loss": 2.7971, + "step": 10200 + }, + { + "epoch": 0.592424967724154, + "grad_norm": 0.12752999365329742, + "learning_rate": 0.00015581122982702425, + "loss": 2.798, + "step": 10210 + }, + { + "epoch": 0.5930052076533646, + "grad_norm": 0.12572290003299713, + "learning_rate": 0.00015543701957126916, + "loss": 2.7963, + "step": 10220 + }, + { + "epoch": 0.5935854475825754, + "grad_norm": 0.13692601025104523, + "learning_rate": 0.0001550629734010762, + "loss": 2.7906, + "step": 10230 + }, + { + "epoch": 0.5941656875117861, + "grad_norm": 0.12763461470603943, + "learning_rate": 0.00015468909269372266, + "loss": 2.7867, + "step": 10240 + }, + { + "epoch": 0.5947459274409969, + "grad_norm": 0.12994034588336945, + "learning_rate": 0.00015431537882587649, + "loss": 2.792, + "step": 10250 + }, + { + "epoch": 0.5953261673702076, + "grad_norm": 0.13037025928497314, + "learning_rate": 0.00015394183317359126, + "loss": 2.7964, + "step": 10260 + }, + { + "epoch": 0.5959064072994184, + "grad_norm": 0.13446973264217377, + "learning_rate": 0.00015356845711230128, + "loss": 2.7919, + "step": 10270 + }, + { + "epoch": 0.596486647228629, + "grad_norm": 0.1517745852470398, + "learning_rate": 0.00015319525201681617, + "loss": 2.7862, + "step": 10280 + }, + { + "epoch": 0.5970668871578397, + "grad_norm": 0.13981299102306366, + "learning_rate": 0.00015282221926131632, + "loss": 2.783, + "step": 10290 + }, + { + "epoch": 0.5976471270870505, + "grad_norm": 0.12882374227046967, + "learning_rate": 0.00015244936021934733, + "loss": 2.794, + "step": 10300 + }, + { + "epoch": 0.5982273670162612, + "grad_norm": 0.12829270958900452, + "learning_rate": 0.00015207667626381528, + "loss": 2.7928, + "step": 10310 + }, + { + "epoch": 0.598807606945472, + "grad_norm": 0.13218654692173004, + "learning_rate": 0.0001517041687669816, + "loss": 2.7832, + "step": 10320 + }, + { + "epoch": 0.5993878468746827, + "grad_norm": 0.13118650019168854, + "learning_rate": 0.0001513318391004578, + "loss": 2.7923, + "step": 10330 + }, + { + "epoch": 0.5999680868038935, + "grad_norm": 0.12444707006216049, + "learning_rate": 0.00015095968863520088, + "loss": 2.7941, + "step": 10340 + }, + { + "epoch": 0.6005483267331041, + "grad_norm": 0.13274915516376495, + "learning_rate": 0.00015058771874150762, + "loss": 2.7907, + "step": 10350 + }, + { + "epoch": 0.6011285666623148, + "grad_norm": 0.1304982304573059, + "learning_rate": 0.00015021593078901025, + "loss": 2.7849, + "step": 10360 + }, + { + "epoch": 0.6017088065915256, + "grad_norm": 0.12944455444812775, + "learning_rate": 0.000149844326146671, + "loss": 2.7902, + "step": 10370 + }, + { + "epoch": 0.6022890465207363, + "grad_norm": 0.13214483857154846, + "learning_rate": 0.000149472906182777, + "loss": 2.7986, + "step": 10380 + }, + { + "epoch": 0.6028692864499471, + "grad_norm": 0.1335555911064148, + "learning_rate": 0.00014910167226493562, + "loss": 2.7839, + "step": 10390 + }, + { + "epoch": 0.6034495263791578, + "grad_norm": 0.12958268821239471, + "learning_rate": 0.0001487306257600688, + "loss": 2.7837, + "step": 10400 + }, + { + "epoch": 0.6040297663083685, + "grad_norm": 0.13215544819831848, + "learning_rate": 0.00014835976803440886, + "loss": 2.7761, + "step": 10410 + }, + { + "epoch": 0.6046100062375792, + "grad_norm": 0.12011713534593582, + "learning_rate": 0.00014798910045349265, + "loss": 2.7899, + "step": 10420 + }, + { + "epoch": 0.60519024616679, + "grad_norm": 0.1265355348587036, + "learning_rate": 0.00014761862438215708, + "loss": 2.7856, + "step": 10430 + }, + { + "epoch": 0.6057704860960007, + "grad_norm": 0.12620458006858826, + "learning_rate": 0.0001472483411845339, + "loss": 2.795, + "step": 10440 + }, + { + "epoch": 0.6063507260252115, + "grad_norm": 0.1332314908504486, + "learning_rate": 0.0001468782522240446, + "loss": 2.7867, + "step": 10450 + }, + { + "epoch": 0.6069309659544222, + "grad_norm": 0.13342437148094177, + "learning_rate": 0.0001465083588633955, + "loss": 2.7831, + "step": 10460 + }, + { + "epoch": 0.6075112058836328, + "grad_norm": 0.13332657516002655, + "learning_rate": 0.00014613866246457265, + "loss": 2.7936, + "step": 10470 + }, + { + "epoch": 0.6080914458128436, + "grad_norm": 0.13002225756645203, + "learning_rate": 0.00014576916438883698, + "loss": 2.7875, + "step": 10480 + }, + { + "epoch": 0.6086716857420543, + "grad_norm": 0.13596920669078827, + "learning_rate": 0.0001453998659967192, + "loss": 2.7938, + "step": 10490 + }, + { + "epoch": 0.6092519256712651, + "grad_norm": 0.13465642929077148, + "learning_rate": 0.00014503076864801447, + "loss": 2.7886, + "step": 10500 + }, + { + "epoch": 0.6098321656004758, + "grad_norm": 0.13074541091918945, + "learning_rate": 0.00014466187370177806, + "loss": 2.7876, + "step": 10510 + }, + { + "epoch": 0.6104124055296866, + "grad_norm": 0.12734749913215637, + "learning_rate": 0.00014429318251631972, + "loss": 2.7756, + "step": 10520 + }, + { + "epoch": 0.6109926454588972, + "grad_norm": 0.12566278874874115, + "learning_rate": 0.0001439246964491991, + "loss": 2.7902, + "step": 10530 + }, + { + "epoch": 0.611572885388108, + "grad_norm": 0.13200882077217102, + "learning_rate": 0.0001435564168572204, + "loss": 2.781, + "step": 10540 + }, + { + "epoch": 0.6121531253173187, + "grad_norm": 0.13373680412769318, + "learning_rate": 0.00014318834509642766, + "loss": 2.7913, + "step": 10550 + }, + { + "epoch": 0.6127333652465294, + "grad_norm": 0.12818191945552826, + "learning_rate": 0.0001428204825220998, + "loss": 2.786, + "step": 10560 + }, + { + "epoch": 0.6133136051757402, + "grad_norm": 0.12577253580093384, + "learning_rate": 0.00014245283048874518, + "loss": 2.785, + "step": 10570 + }, + { + "epoch": 0.6138938451049509, + "grad_norm": 0.1309729665517807, + "learning_rate": 0.0001420853903500973, + "loss": 2.787, + "step": 10580 + }, + { + "epoch": 0.6144740850341617, + "grad_norm": 0.13818277418613434, + "learning_rate": 0.00014171816345910903, + "loss": 2.7856, + "step": 10590 + }, + { + "epoch": 0.6150543249633723, + "grad_norm": 0.13121308386325836, + "learning_rate": 0.00014135115116794834, + "loss": 2.7882, + "step": 10600 + }, + { + "epoch": 0.6156345648925831, + "grad_norm": 0.12498753517866135, + "learning_rate": 0.00014098435482799303, + "loss": 2.7865, + "step": 10610 + }, + { + "epoch": 0.6162148048217938, + "grad_norm": 0.12569987773895264, + "learning_rate": 0.00014061777578982547, + "loss": 2.7822, + "step": 10620 + }, + { + "epoch": 0.6167950447510046, + "grad_norm": 0.1260175108909607, + "learning_rate": 0.0001402514154032282, + "loss": 2.7785, + "step": 10630 + }, + { + "epoch": 0.6173752846802153, + "grad_norm": 0.12860171496868134, + "learning_rate": 0.00013988527501717848, + "loss": 2.787, + "step": 10640 + }, + { + "epoch": 0.617955524609426, + "grad_norm": 0.13064344227313995, + "learning_rate": 0.0001395193559798437, + "loss": 2.7959, + "step": 10650 + }, + { + "epoch": 0.6185357645386367, + "grad_norm": 0.12624002993106842, + "learning_rate": 0.0001391536596385759, + "loss": 2.7757, + "step": 10660 + }, + { + "epoch": 0.6191160044678474, + "grad_norm": 0.1249968633055687, + "learning_rate": 0.00013878818733990738, + "loss": 2.7928, + "step": 10670 + }, + { + "epoch": 0.6196962443970582, + "grad_norm": 0.1293274611234665, + "learning_rate": 0.00013842294042954554, + "loss": 2.7823, + "step": 10680 + }, + { + "epoch": 0.6202764843262689, + "grad_norm": 0.1331176459789276, + "learning_rate": 0.0001380579202523676, + "loss": 2.7898, + "step": 10690 + }, + { + "epoch": 0.6208567242554797, + "grad_norm": 0.12228766083717346, + "learning_rate": 0.00013769312815241626, + "loss": 2.7947, + "step": 10700 + }, + { + "epoch": 0.6214369641846904, + "grad_norm": 0.12410891056060791, + "learning_rate": 0.0001373285654728941, + "loss": 2.7706, + "step": 10710 + }, + { + "epoch": 0.622017204113901, + "grad_norm": 0.1261039823293686, + "learning_rate": 0.00013696423355615914, + "loss": 2.7705, + "step": 10720 + }, + { + "epoch": 0.6225974440431118, + "grad_norm": 0.13225802779197693, + "learning_rate": 0.00013660013374371973, + "loss": 2.7883, + "step": 10730 + }, + { + "epoch": 0.6231776839723225, + "grad_norm": 0.12750951945781708, + "learning_rate": 0.00013623626737622942, + "loss": 2.7895, + "step": 10740 + }, + { + "epoch": 0.6237579239015333, + "grad_norm": 0.1364651620388031, + "learning_rate": 0.00013587263579348239, + "loss": 2.7772, + "step": 10750 + }, + { + "epoch": 0.624338163830744, + "grad_norm": 0.13347889482975006, + "learning_rate": 0.00013550924033440813, + "loss": 2.787, + "step": 10760 + }, + { + "epoch": 0.6249184037599548, + "grad_norm": 0.1324913650751114, + "learning_rate": 0.0001351460823370669, + "loss": 2.786, + "step": 10770 + }, + { + "epoch": 0.6254986436891655, + "grad_norm": 0.1301848590373993, + "learning_rate": 0.00013478316313864433, + "loss": 2.7848, + "step": 10780 + }, + { + "epoch": 0.6260788836183762, + "grad_norm": 0.1281544417142868, + "learning_rate": 0.00013442048407544705, + "loss": 2.7803, + "step": 10790 + }, + { + "epoch": 0.6266591235475869, + "grad_norm": 0.1410907804965973, + "learning_rate": 0.0001340580464828974, + "loss": 2.7809, + "step": 10800 + }, + { + "epoch": 0.6272393634767977, + "grad_norm": 0.1318545639514923, + "learning_rate": 0.0001336958516955284, + "loss": 2.773, + "step": 10810 + }, + { + "epoch": 0.6278196034060084, + "grad_norm": 0.12339744716882706, + "learning_rate": 0.00013333390104697937, + "loss": 2.774, + "step": 10820 + }, + { + "epoch": 0.6283998433352191, + "grad_norm": 0.13089622557163239, + "learning_rate": 0.0001329721958699904, + "loss": 2.7764, + "step": 10830 + }, + { + "epoch": 0.6289800832644299, + "grad_norm": 0.1368187516927719, + "learning_rate": 0.00013261073749639785, + "loss": 2.7855, + "step": 10840 + }, + { + "epoch": 0.6295603231936405, + "grad_norm": 0.1409740447998047, + "learning_rate": 0.00013224952725712948, + "loss": 2.7771, + "step": 10850 + }, + { + "epoch": 0.6301405631228513, + "grad_norm": 0.1319727897644043, + "learning_rate": 0.000131888566482199, + "loss": 2.7745, + "step": 10860 + }, + { + "epoch": 0.630720803052062, + "grad_norm": 0.1281840205192566, + "learning_rate": 0.00013152785650070198, + "loss": 2.7799, + "step": 10870 + }, + { + "epoch": 0.6313010429812728, + "grad_norm": 0.1265205293893814, + "learning_rate": 0.00013116739864081018, + "loss": 2.7767, + "step": 10880 + }, + { + "epoch": 0.6318812829104835, + "grad_norm": 0.1335509866476059, + "learning_rate": 0.00013080719422976732, + "loss": 2.7734, + "step": 10890 + }, + { + "epoch": 0.6324615228396943, + "grad_norm": 0.1291695237159729, + "learning_rate": 0.00013044724459388375, + "loss": 2.7717, + "step": 10900 + }, + { + "epoch": 0.6330417627689049, + "grad_norm": 0.1341160535812378, + "learning_rate": 0.00013008755105853174, + "loss": 2.7797, + "step": 10910 + }, + { + "epoch": 0.6336220026981156, + "grad_norm": 0.13332979381084442, + "learning_rate": 0.00012972811494814062, + "loss": 2.7757, + "step": 10920 + }, + { + "epoch": 0.6342022426273264, + "grad_norm": 0.1305907964706421, + "learning_rate": 0.00012936893758619172, + "loss": 2.7826, + "step": 10930 + }, + { + "epoch": 0.6347824825565371, + "grad_norm": 0.1261076033115387, + "learning_rate": 0.00012901002029521377, + "loss": 2.7736, + "step": 10940 + }, + { + "epoch": 0.6353627224857479, + "grad_norm": 0.12599441409111023, + "learning_rate": 0.00012865136439677772, + "loss": 2.7678, + "step": 10950 + }, + { + "epoch": 0.6359429624149586, + "grad_norm": 0.13536348938941956, + "learning_rate": 0.0001282929712114923, + "loss": 2.7736, + "step": 10960 + }, + { + "epoch": 0.6365232023441693, + "grad_norm": 0.1305113434791565, + "learning_rate": 0.00012793484205899874, + "loss": 2.7856, + "step": 10970 + }, + { + "epoch": 0.63710344227338, + "grad_norm": 0.1301414519548416, + "learning_rate": 0.00012757697825796602, + "loss": 2.7801, + "step": 10980 + }, + { + "epoch": 0.6376836822025908, + "grad_norm": 0.12686558067798615, + "learning_rate": 0.00012721938112608623, + "loss": 2.7767, + "step": 10990 + }, + { + "epoch": 0.6382639221318015, + "grad_norm": 0.1347103714942932, + "learning_rate": 0.00012686205198006938, + "loss": 2.7718, + "step": 11000 + }, + { + "epoch": 0.6382639221318015, + "eval_loss": 2.7470815181732178, + "eval_runtime": 5.3884, + "eval_samples_per_second": 803.575, + "eval_steps_per_second": 1.67, + "step": 11000 + }, + { + "epoch": 0.6388441620610122, + "grad_norm": 0.1285264790058136, + "learning_rate": 0.00012650499213563894, + "loss": 2.7835, + "step": 11010 + }, + { + "epoch": 0.639424401990223, + "grad_norm": 0.13870897889137268, + "learning_rate": 0.00012614820290752653, + "loss": 2.771, + "step": 11020 + }, + { + "epoch": 0.6400046419194337, + "grad_norm": 0.12220434844493866, + "learning_rate": 0.0001257916856094675, + "loss": 2.7725, + "step": 11030 + }, + { + "epoch": 0.6405848818486444, + "grad_norm": 0.12159667909145355, + "learning_rate": 0.00012543544155419598, + "loss": 2.7679, + "step": 11040 + }, + { + "epoch": 0.6411651217778551, + "grad_norm": 0.1229986697435379, + "learning_rate": 0.0001250794720534398, + "loss": 2.7774, + "step": 11050 + }, + { + "epoch": 0.6417453617070659, + "grad_norm": 0.12473072856664658, + "learning_rate": 0.00012472377841791604, + "loss": 2.7729, + "step": 11060 + }, + { + "epoch": 0.6423256016362766, + "grad_norm": 0.12544026970863342, + "learning_rate": 0.0001243683619573258, + "loss": 2.7581, + "step": 11070 + }, + { + "epoch": 0.6429058415654874, + "grad_norm": 0.13708697259426117, + "learning_rate": 0.0001240132239803498, + "loss": 2.7734, + "step": 11080 + }, + { + "epoch": 0.6434860814946981, + "grad_norm": 0.13081100583076477, + "learning_rate": 0.00012365836579464332, + "loss": 2.7763, + "step": 11090 + }, + { + "epoch": 0.6440663214239087, + "grad_norm": 0.13071762025356293, + "learning_rate": 0.00012330378870683124, + "loss": 2.7725, + "step": 11100 + }, + { + "epoch": 0.6446465613531195, + "grad_norm": 0.12263841927051544, + "learning_rate": 0.00012294949402250378, + "loss": 2.7615, + "step": 11110 + }, + { + "epoch": 0.6452268012823302, + "grad_norm": 0.12744446098804474, + "learning_rate": 0.00012259548304621078, + "loss": 2.773, + "step": 11120 + }, + { + "epoch": 0.645807041211541, + "grad_norm": 0.12339978665113449, + "learning_rate": 0.00012224175708145797, + "loss": 2.7672, + "step": 11130 + }, + { + "epoch": 0.6463872811407517, + "grad_norm": 0.12739062309265137, + "learning_rate": 0.00012188831743070125, + "loss": 2.7768, + "step": 11140 + }, + { + "epoch": 0.6469675210699625, + "grad_norm": 0.1270364671945572, + "learning_rate": 0.00012153516539534253, + "loss": 2.7759, + "step": 11150 + }, + { + "epoch": 0.6475477609991731, + "grad_norm": 0.12549492716789246, + "learning_rate": 0.00012118230227572467, + "loss": 2.7727, + "step": 11160 + }, + { + "epoch": 0.6481280009283839, + "grad_norm": 0.13069987297058105, + "learning_rate": 0.00012082972937112646, + "loss": 2.7671, + "step": 11170 + }, + { + "epoch": 0.6487082408575946, + "grad_norm": 0.1319432556629181, + "learning_rate": 0.00012047744797975848, + "loss": 2.7725, + "step": 11180 + }, + { + "epoch": 0.6492884807868053, + "grad_norm": 0.1310606598854065, + "learning_rate": 0.00012012545939875747, + "loss": 2.7731, + "step": 11190 + }, + { + "epoch": 0.6498687207160161, + "grad_norm": 0.12737832963466644, + "learning_rate": 0.00011977376492418245, + "loss": 2.7681, + "step": 11200 + }, + { + "epoch": 0.6504489606452268, + "grad_norm": 0.12653803825378418, + "learning_rate": 0.00011942236585100926, + "loss": 2.7698, + "step": 11210 + }, + { + "epoch": 0.6510292005744376, + "grad_norm": 0.12586897611618042, + "learning_rate": 0.00011907126347312605, + "loss": 2.7767, + "step": 11220 + }, + { + "epoch": 0.6516094405036482, + "grad_norm": 0.12253769487142563, + "learning_rate": 0.0001187204590833287, + "loss": 2.7547, + "step": 11230 + }, + { + "epoch": 0.652189680432859, + "grad_norm": 0.12751977145671844, + "learning_rate": 0.00011836995397331554, + "loss": 2.7699, + "step": 11240 + }, + { + "epoch": 0.6527699203620697, + "grad_norm": 0.12828411161899567, + "learning_rate": 0.00011801974943368321, + "loss": 2.7704, + "step": 11250 + }, + { + "epoch": 0.6533501602912805, + "grad_norm": 0.12598510086536407, + "learning_rate": 0.00011766984675392147, + "loss": 2.7734, + "step": 11260 + }, + { + "epoch": 0.6539304002204912, + "grad_norm": 0.12886223196983337, + "learning_rate": 0.00011732024722240869, + "loss": 2.7621, + "step": 11270 + }, + { + "epoch": 0.654510640149702, + "grad_norm": 0.12585338950157166, + "learning_rate": 0.00011697095212640699, + "loss": 2.7658, + "step": 11280 + }, + { + "epoch": 0.6550908800789126, + "grad_norm": 0.1305324286222458, + "learning_rate": 0.00011662196275205736, + "loss": 2.7719, + "step": 11290 + }, + { + "epoch": 0.6556711200081233, + "grad_norm": 0.12806616723537445, + "learning_rate": 0.00011627328038437537, + "loss": 2.7749, + "step": 11300 + }, + { + "epoch": 0.6562513599373341, + "grad_norm": 0.13020043075084686, + "learning_rate": 0.00011592490630724602, + "loss": 2.7611, + "step": 11310 + }, + { + "epoch": 0.6568315998665448, + "grad_norm": 0.13302190601825714, + "learning_rate": 0.00011557684180341901, + "loss": 2.7708, + "step": 11320 + }, + { + "epoch": 0.6574118397957556, + "grad_norm": 0.1290377378463745, + "learning_rate": 0.00011522908815450448, + "loss": 2.7743, + "step": 11330 + }, + { + "epoch": 0.6579920797249663, + "grad_norm": 0.12511534988880157, + "learning_rate": 0.00011488164664096777, + "loss": 2.7712, + "step": 11340 + }, + { + "epoch": 0.658572319654177, + "grad_norm": 0.12530122697353363, + "learning_rate": 0.00011453451854212489, + "loss": 2.7757, + "step": 11350 + }, + { + "epoch": 0.6591525595833877, + "grad_norm": 0.13579903542995453, + "learning_rate": 0.00011418770513613783, + "loss": 2.7633, + "step": 11360 + }, + { + "epoch": 0.6597327995125984, + "grad_norm": 0.12218291312456131, + "learning_rate": 0.00011384120770000997, + "loss": 2.7704, + "step": 11370 + }, + { + "epoch": 0.6603130394418092, + "grad_norm": 0.12762351334095, + "learning_rate": 0.00011349502750958101, + "loss": 2.7602, + "step": 11380 + }, + { + "epoch": 0.6608932793710199, + "grad_norm": 0.1221364438533783, + "learning_rate": 0.00011314916583952287, + "loss": 2.7779, + "step": 11390 + }, + { + "epoch": 0.6614735193002307, + "grad_norm": 0.1309012919664383, + "learning_rate": 0.00011280362396333433, + "loss": 2.7678, + "step": 11400 + }, + { + "epoch": 0.6620537592294413, + "grad_norm": 0.12953191995620728, + "learning_rate": 0.00011245840315333685, + "loss": 2.7707, + "step": 11410 + }, + { + "epoch": 0.6626339991586521, + "grad_norm": 0.12538152933120728, + "learning_rate": 0.00011211350468066954, + "loss": 2.7684, + "step": 11420 + }, + { + "epoch": 0.6632142390878628, + "grad_norm": 0.1254589706659317, + "learning_rate": 0.00011176892981528478, + "loss": 2.7689, + "step": 11430 + }, + { + "epoch": 0.6637944790170736, + "grad_norm": 0.12923027575016022, + "learning_rate": 0.00011142467982594316, + "loss": 2.7776, + "step": 11440 + }, + { + "epoch": 0.6643747189462843, + "grad_norm": 0.13753275573253632, + "learning_rate": 0.00011108075598020944, + "loss": 2.7567, + "step": 11450 + }, + { + "epoch": 0.664954958875495, + "grad_norm": 0.1266188770532608, + "learning_rate": 0.00011073715954444712, + "loss": 2.7705, + "step": 11460 + }, + { + "epoch": 0.6655351988047058, + "grad_norm": 0.12759283185005188, + "learning_rate": 0.00011039389178381427, + "loss": 2.7604, + "step": 11470 + }, + { + "epoch": 0.6661154387339164, + "grad_norm": 0.1300540566444397, + "learning_rate": 0.0001100509539622588, + "loss": 2.7636, + "step": 11480 + }, + { + "epoch": 0.6666956786631272, + "grad_norm": 0.12630164623260498, + "learning_rate": 0.00010970834734251363, + "loss": 2.7766, + "step": 11490 + }, + { + "epoch": 0.6672759185923379, + "grad_norm": 0.1265067309141159, + "learning_rate": 0.00010936607318609218, + "loss": 2.7604, + "step": 11500 + }, + { + "epoch": 0.6678561585215487, + "grad_norm": 0.12710276246070862, + "learning_rate": 0.00010902413275328389, + "loss": 2.7562, + "step": 11510 + }, + { + "epoch": 0.6684363984507594, + "grad_norm": 0.12407544255256653, + "learning_rate": 0.00010868252730314918, + "loss": 2.7669, + "step": 11520 + }, + { + "epoch": 0.6690166383799702, + "grad_norm": 0.12599880993366241, + "learning_rate": 0.00010834125809351512, + "loss": 2.772, + "step": 11530 + }, + { + "epoch": 0.6695968783091808, + "grad_norm": 0.12807178497314453, + "learning_rate": 0.00010800032638097067, + "loss": 2.759, + "step": 11540 + }, + { + "epoch": 0.6701771182383915, + "grad_norm": 0.13285909593105316, + "learning_rate": 0.00010765973342086204, + "loss": 2.7591, + "step": 11550 + }, + { + "epoch": 0.6707573581676023, + "grad_norm": 0.1285742074251175, + "learning_rate": 0.00010731948046728834, + "loss": 2.7567, + "step": 11560 + }, + { + "epoch": 0.671337598096813, + "grad_norm": 0.1229885146021843, + "learning_rate": 0.00010697956877309651, + "loss": 2.7585, + "step": 11570 + }, + { + "epoch": 0.6719178380260238, + "grad_norm": 0.12948161363601685, + "learning_rate": 0.00010663999958987702, + "loss": 2.76, + "step": 11580 + }, + { + "epoch": 0.6724980779552345, + "grad_norm": 0.12574850022792816, + "learning_rate": 0.00010630077416795919, + "loss": 2.7581, + "step": 11590 + }, + { + "epoch": 0.6730783178844452, + "grad_norm": 0.12991534173488617, + "learning_rate": 0.00010596189375640646, + "loss": 2.7543, + "step": 11600 + }, + { + "epoch": 0.6736585578136559, + "grad_norm": 0.12388639152050018, + "learning_rate": 0.00010562335960301225, + "loss": 2.7503, + "step": 11610 + }, + { + "epoch": 0.6742387977428667, + "grad_norm": 0.13579106330871582, + "learning_rate": 0.00010528517295429445, + "loss": 2.7579, + "step": 11620 + }, + { + "epoch": 0.6748190376720774, + "grad_norm": 0.1378217339515686, + "learning_rate": 0.00010494733505549197, + "loss": 2.7699, + "step": 11630 + }, + { + "epoch": 0.6753992776012882, + "grad_norm": 0.13772232830524445, + "learning_rate": 0.0001046098471505593, + "loss": 2.7658, + "step": 11640 + }, + { + "epoch": 0.6759795175304989, + "grad_norm": 0.13989433646202087, + "learning_rate": 0.00010427271048216214, + "loss": 2.767, + "step": 11650 + }, + { + "epoch": 0.6765597574597096, + "grad_norm": 0.13066108524799347, + "learning_rate": 0.00010393592629167326, + "loss": 2.7671, + "step": 11660 + }, + { + "epoch": 0.6771399973889203, + "grad_norm": 0.12652446329593658, + "learning_rate": 0.00010359949581916701, + "loss": 2.7602, + "step": 11670 + }, + { + "epoch": 0.677720237318131, + "grad_norm": 0.12439344823360443, + "learning_rate": 0.00010326342030341591, + "loss": 2.7597, + "step": 11680 + }, + { + "epoch": 0.6783004772473418, + "grad_norm": 0.12505538761615753, + "learning_rate": 0.00010292770098188511, + "loss": 2.7552, + "step": 11690 + }, + { + "epoch": 0.6788807171765525, + "grad_norm": 0.12654612958431244, + "learning_rate": 0.00010259233909072823, + "loss": 2.7624, + "step": 11700 + }, + { + "epoch": 0.6794609571057633, + "grad_norm": 0.12148457765579224, + "learning_rate": 0.00010225733586478315, + "loss": 2.7619, + "step": 11710 + }, + { + "epoch": 0.680041197034974, + "grad_norm": 0.12959624826908112, + "learning_rate": 0.00010192269253756648, + "loss": 2.7636, + "step": 11720 + }, + { + "epoch": 0.6806214369641846, + "grad_norm": 0.13695235550403595, + "learning_rate": 0.00010158841034127035, + "loss": 2.7599, + "step": 11730 + }, + { + "epoch": 0.6812016768933954, + "grad_norm": 0.1335146725177765, + "learning_rate": 0.00010125449050675655, + "loss": 2.7629, + "step": 11740 + }, + { + "epoch": 0.6817819168226061, + "grad_norm": 0.1221490204334259, + "learning_rate": 0.00010092093426355307, + "loss": 2.7578, + "step": 11750 + }, + { + "epoch": 0.6823621567518169, + "grad_norm": 0.12168210744857788, + "learning_rate": 0.00010058774283984887, + "loss": 2.7605, + "step": 11760 + }, + { + "epoch": 0.6829423966810276, + "grad_norm": 0.12987020611763, + "learning_rate": 0.00010025491746248963, + "loss": 2.7601, + "step": 11770 + }, + { + "epoch": 0.6835226366102384, + "grad_norm": 0.12631021440029144, + "learning_rate": 9.992245935697346e-05, + "loss": 2.7466, + "step": 11780 + }, + { + "epoch": 0.684102876539449, + "grad_norm": 0.12719561159610748, + "learning_rate": 9.959036974744562e-05, + "loss": 2.7544, + "step": 11790 + }, + { + "epoch": 0.6846831164686598, + "grad_norm": 0.12334798276424408, + "learning_rate": 9.925864985669509e-05, + "loss": 2.7511, + "step": 11800 + }, + { + "epoch": 0.6852633563978705, + "grad_norm": 0.12944762408733368, + "learning_rate": 9.892730090614917e-05, + "loss": 2.7651, + "step": 11810 + }, + { + "epoch": 0.6858435963270813, + "grad_norm": 0.1283605396747589, + "learning_rate": 9.859632411586935e-05, + "loss": 2.7533, + "step": 11820 + }, + { + "epoch": 0.686423836256292, + "grad_norm": 0.12465015053749084, + "learning_rate": 9.826572070454702e-05, + "loss": 2.7572, + "step": 11830 + }, + { + "epoch": 0.6870040761855027, + "grad_norm": 0.13003583252429962, + "learning_rate": 9.793549188949835e-05, + "loss": 2.7584, + "step": 11840 + }, + { + "epoch": 0.6875843161147134, + "grad_norm": 0.12272375077009201, + "learning_rate": 9.760563888666059e-05, + "loss": 2.7473, + "step": 11850 + }, + { + "epoch": 0.6881645560439241, + "grad_norm": 0.12487037479877472, + "learning_rate": 9.7276162910587e-05, + "loss": 2.7501, + "step": 11860 + }, + { + "epoch": 0.6887447959731349, + "grad_norm": 0.12201700359582901, + "learning_rate": 9.694706517444256e-05, + "loss": 2.7487, + "step": 11870 + }, + { + "epoch": 0.6893250359023456, + "grad_norm": 0.1306881606578827, + "learning_rate": 9.661834688999987e-05, + "loss": 2.7551, + "step": 11880 + }, + { + "epoch": 0.6899052758315564, + "grad_norm": 0.123641736805439, + "learning_rate": 9.629000926763371e-05, + "loss": 2.7461, + "step": 11890 + }, + { + "epoch": 0.6904855157607671, + "grad_norm": 0.13387881219387054, + "learning_rate": 9.596205351631791e-05, + "loss": 2.7595, + "step": 11900 + }, + { + "epoch": 0.6910657556899779, + "grad_norm": 0.12325596064329147, + "learning_rate": 9.563448084361979e-05, + "loss": 2.7546, + "step": 11910 + }, + { + "epoch": 0.6916459956191885, + "grad_norm": 0.12377669662237167, + "learning_rate": 9.530729245569614e-05, + "loss": 2.7551, + "step": 11920 + }, + { + "epoch": 0.6922262355483992, + "grad_norm": 0.12587293982505798, + "learning_rate": 9.498048955728917e-05, + "loss": 2.7536, + "step": 11930 + }, + { + "epoch": 0.69280647547761, + "grad_norm": 0.12992674112319946, + "learning_rate": 9.465407335172102e-05, + "loss": 2.7633, + "step": 11940 + }, + { + "epoch": 0.6933867154068207, + "grad_norm": 0.1415330022573471, + "learning_rate": 9.432804504089065e-05, + "loss": 2.7563, + "step": 11950 + }, + { + "epoch": 0.6939669553360315, + "grad_norm": 0.13113632798194885, + "learning_rate": 9.400240582526834e-05, + "loss": 2.7571, + "step": 11960 + }, + { + "epoch": 0.6945471952652422, + "grad_norm": 0.12729544937610626, + "learning_rate": 9.367715690389178e-05, + "loss": 2.753, + "step": 11970 + }, + { + "epoch": 0.6951274351944529, + "grad_norm": 0.12055703997612, + "learning_rate": 9.335229947436157e-05, + "loss": 2.7618, + "step": 11980 + }, + { + "epoch": 0.6957076751236636, + "grad_norm": 0.12297140061855316, + "learning_rate": 9.302783473283676e-05, + "loss": 2.7526, + "step": 11990 + }, + { + "epoch": 0.6962879150528744, + "grad_norm": 0.12334892898797989, + "learning_rate": 9.270376387403073e-05, + "loss": 2.7557, + "step": 12000 + }, + { + "epoch": 0.6962879150528744, + "eval_loss": 2.7262585163116455, + "eval_runtime": 5.3974, + "eval_samples_per_second": 802.231, + "eval_steps_per_second": 1.667, + "step": 12000 + }, + { + "epoch": 0.6968681549820851, + "grad_norm": 0.12369856983423233, + "learning_rate": 9.238008809120602e-05, + "loss": 2.7586, + "step": 12010 + }, + { + "epoch": 0.6974483949112958, + "grad_norm": 0.12686264514923096, + "learning_rate": 9.205680857617099e-05, + "loss": 2.7587, + "step": 12020 + }, + { + "epoch": 0.6980286348405066, + "grad_norm": 0.12067441642284393, + "learning_rate": 9.173392651927462e-05, + "loss": 2.7581, + "step": 12030 + }, + { + "epoch": 0.6986088747697172, + "grad_norm": 0.12240596115589142, + "learning_rate": 9.141144310940237e-05, + "loss": 2.7504, + "step": 12040 + }, + { + "epoch": 0.699189114698928, + "grad_norm": 0.12386554479598999, + "learning_rate": 9.10893595339722e-05, + "loss": 2.7533, + "step": 12050 + }, + { + "epoch": 0.6997693546281387, + "grad_norm": 0.1218453124165535, + "learning_rate": 9.076767697892923e-05, + "loss": 2.7528, + "step": 12060 + }, + { + "epoch": 0.7003495945573495, + "grad_norm": 0.12844803929328918, + "learning_rate": 9.04463966287426e-05, + "loss": 2.7485, + "step": 12070 + }, + { + "epoch": 0.7009298344865602, + "grad_norm": 0.1264135241508484, + "learning_rate": 9.01255196664001e-05, + "loss": 2.7659, + "step": 12080 + }, + { + "epoch": 0.701510074415771, + "grad_norm": 0.12528979778289795, + "learning_rate": 8.980504727340433e-05, + "loss": 2.751, + "step": 12090 + }, + { + "epoch": 0.7020903143449817, + "grad_norm": 0.1233081966638565, + "learning_rate": 8.948498062976825e-05, + "loss": 2.7523, + "step": 12100 + }, + { + "epoch": 0.7026705542741923, + "grad_norm": 0.1261734664440155, + "learning_rate": 8.916532091401065e-05, + "loss": 2.7573, + "step": 12110 + }, + { + "epoch": 0.7032507942034031, + "grad_norm": 0.12680375576019287, + "learning_rate": 8.884606930315223e-05, + "loss": 2.7523, + "step": 12120 + }, + { + "epoch": 0.7038310341326138, + "grad_norm": 0.12935955822467804, + "learning_rate": 8.852722697271084e-05, + "loss": 2.7474, + "step": 12130 + }, + { + "epoch": 0.7044112740618246, + "grad_norm": 0.12751561403274536, + "learning_rate": 8.820879509669731e-05, + "loss": 2.7596, + "step": 12140 + }, + { + "epoch": 0.7049915139910353, + "grad_norm": 0.11992467194795609, + "learning_rate": 8.789077484761116e-05, + "loss": 2.7521, + "step": 12150 + }, + { + "epoch": 0.7055717539202461, + "grad_norm": 0.12174970656633377, + "learning_rate": 8.757316739643621e-05, + "loss": 2.749, + "step": 12160 + }, + { + "epoch": 0.7061519938494567, + "grad_norm": 0.12190863490104675, + "learning_rate": 8.725597391263651e-05, + "loss": 2.7512, + "step": 12170 + }, + { + "epoch": 0.7067322337786675, + "grad_norm": 0.1286507099866867, + "learning_rate": 8.69391955641516e-05, + "loss": 2.7448, + "step": 12180 + }, + { + "epoch": 0.7073124737078782, + "grad_norm": 0.12180124223232269, + "learning_rate": 8.662283351739257e-05, + "loss": 2.7594, + "step": 12190 + }, + { + "epoch": 0.7078927136370889, + "grad_norm": 0.12321013957262039, + "learning_rate": 8.630688893723762e-05, + "loss": 2.748, + "step": 12200 + }, + { + "epoch": 0.7084729535662997, + "grad_norm": 0.12682393193244934, + "learning_rate": 8.599136298702776e-05, + "loss": 2.7451, + "step": 12210 + }, + { + "epoch": 0.7090531934955104, + "grad_norm": 0.12241560220718384, + "learning_rate": 8.567625682856255e-05, + "loss": 2.7472, + "step": 12220 + }, + { + "epoch": 0.7096334334247211, + "grad_norm": 0.1252165287733078, + "learning_rate": 8.536157162209601e-05, + "loss": 2.7484, + "step": 12230 + }, + { + "epoch": 0.7102136733539318, + "grad_norm": 0.12328600883483887, + "learning_rate": 8.504730852633197e-05, + "loss": 2.7518, + "step": 12240 + }, + { + "epoch": 0.7107939132831426, + "grad_norm": 0.12212226539850235, + "learning_rate": 8.473346869842003e-05, + "loss": 2.7409, + "step": 12250 + }, + { + "epoch": 0.7113741532123533, + "grad_norm": 0.12457285821437836, + "learning_rate": 8.442005329395137e-05, + "loss": 2.7466, + "step": 12260 + }, + { + "epoch": 0.7119543931415641, + "grad_norm": 0.13215821981430054, + "learning_rate": 8.410706346695432e-05, + "loss": 2.7566, + "step": 12270 + }, + { + "epoch": 0.7125346330707748, + "grad_norm": 0.12475377321243286, + "learning_rate": 8.379450036989014e-05, + "loss": 2.7524, + "step": 12280 + }, + { + "epoch": 0.7131148729999854, + "grad_norm": 0.12629540264606476, + "learning_rate": 8.348236515364903e-05, + "loss": 2.7526, + "step": 12290 + }, + { + "epoch": 0.7136951129291962, + "grad_norm": 0.1277417689561844, + "learning_rate": 8.317065896754548e-05, + "loss": 2.745, + "step": 12300 + }, + { + "epoch": 0.7142753528584069, + "grad_norm": 0.136353999376297, + "learning_rate": 8.285938295931435e-05, + "loss": 2.7577, + "step": 12310 + }, + { + "epoch": 0.7148555927876177, + "grad_norm": 0.12383802980184555, + "learning_rate": 8.254853827510646e-05, + "loss": 2.7461, + "step": 12320 + }, + { + "epoch": 0.7154358327168284, + "grad_norm": 0.12012789398431778, + "learning_rate": 8.223812605948458e-05, + "loss": 2.7471, + "step": 12330 + }, + { + "epoch": 0.7160160726460392, + "grad_norm": 0.12090951204299927, + "learning_rate": 8.192814745541884e-05, + "loss": 2.743, + "step": 12340 + }, + { + "epoch": 0.7165963125752499, + "grad_norm": 0.12389807403087616, + "learning_rate": 8.161860360428315e-05, + "loss": 2.7397, + "step": 12350 + }, + { + "epoch": 0.7171765525044606, + "grad_norm": 0.12754105031490326, + "learning_rate": 8.130949564585028e-05, + "loss": 2.7405, + "step": 12360 + }, + { + "epoch": 0.7177567924336713, + "grad_norm": 0.12422367185354233, + "learning_rate": 8.100082471828813e-05, + "loss": 2.7499, + "step": 12370 + }, + { + "epoch": 0.718337032362882, + "grad_norm": 0.12758466601371765, + "learning_rate": 8.069259195815542e-05, + "loss": 2.749, + "step": 12380 + }, + { + "epoch": 0.7189172722920928, + "grad_norm": 0.12679055333137512, + "learning_rate": 8.038479850039735e-05, + "loss": 2.7423, + "step": 12390 + }, + { + "epoch": 0.7194975122213035, + "grad_norm": 0.12068577855825424, + "learning_rate": 8.007744547834182e-05, + "loss": 2.745, + "step": 12400 + }, + { + "epoch": 0.7200777521505143, + "grad_norm": 0.1262856423854828, + "learning_rate": 7.977053402369482e-05, + "loss": 2.7466, + "step": 12410 + }, + { + "epoch": 0.7206579920797249, + "grad_norm": 0.12377514690160751, + "learning_rate": 7.946406526653641e-05, + "loss": 2.738, + "step": 12420 + }, + { + "epoch": 0.7212382320089357, + "grad_norm": 0.12496384978294373, + "learning_rate": 7.915804033531673e-05, + "loss": 2.7468, + "step": 12430 + }, + { + "epoch": 0.7218184719381464, + "grad_norm": 0.12281250953674316, + "learning_rate": 7.885246035685153e-05, + "loss": 2.7477, + "step": 12440 + }, + { + "epoch": 0.7223987118673572, + "grad_norm": 0.12681566178798676, + "learning_rate": 7.85473264563185e-05, + "loss": 2.7382, + "step": 12450 + }, + { + "epoch": 0.7229789517965679, + "grad_norm": 0.12231052666902542, + "learning_rate": 7.824263975725238e-05, + "loss": 2.7501, + "step": 12460 + }, + { + "epoch": 0.7235591917257786, + "grad_norm": 0.1255701333284378, + "learning_rate": 7.793840138154172e-05, + "loss": 2.7444, + "step": 12470 + }, + { + "epoch": 0.7241394316549893, + "grad_norm": 0.11985606700181961, + "learning_rate": 7.763461244942398e-05, + "loss": 2.7464, + "step": 12480 + }, + { + "epoch": 0.7247196715842, + "grad_norm": 0.12225638329982758, + "learning_rate": 7.733127407948182e-05, + "loss": 2.7449, + "step": 12490 + }, + { + "epoch": 0.7252999115134108, + "grad_norm": 0.12188901752233505, + "learning_rate": 7.702838738863907e-05, + "loss": 2.7308, + "step": 12500 + }, + { + "epoch": 0.7258801514426215, + "grad_norm": 0.12092640995979309, + "learning_rate": 7.672595349215597e-05, + "loss": 2.7393, + "step": 12510 + }, + { + "epoch": 0.7264603913718323, + "grad_norm": 0.13984154164791107, + "learning_rate": 7.642397350362604e-05, + "loss": 2.7399, + "step": 12520 + }, + { + "epoch": 0.727040631301043, + "grad_norm": 0.12072645872831345, + "learning_rate": 7.612244853497114e-05, + "loss": 2.7361, + "step": 12530 + }, + { + "epoch": 0.7276208712302538, + "grad_norm": 0.12388517707586288, + "learning_rate": 7.582137969643775e-05, + "loss": 2.7512, + "step": 12540 + }, + { + "epoch": 0.7282011111594644, + "grad_norm": 0.1254212111234665, + "learning_rate": 7.552076809659308e-05, + "loss": 2.755, + "step": 12550 + }, + { + "epoch": 0.7287813510886751, + "grad_norm": 0.12324492633342743, + "learning_rate": 7.522061484232022e-05, + "loss": 2.7484, + "step": 12560 + }, + { + "epoch": 0.7293615910178859, + "grad_norm": 0.12253236025571823, + "learning_rate": 7.492092103881518e-05, + "loss": 2.7395, + "step": 12570 + }, + { + "epoch": 0.7299418309470966, + "grad_norm": 0.1230064406991005, + "learning_rate": 7.462168778958169e-05, + "loss": 2.7499, + "step": 12580 + }, + { + "epoch": 0.7305220708763074, + "grad_norm": 0.1225818321108818, + "learning_rate": 7.43229161964281e-05, + "loss": 2.7338, + "step": 12590 + }, + { + "epoch": 0.7311023108055181, + "grad_norm": 0.12051333487033844, + "learning_rate": 7.402460735946269e-05, + "loss": 2.742, + "step": 12600 + }, + { + "epoch": 0.7316825507347288, + "grad_norm": 0.12291798740625381, + "learning_rate": 7.372676237708973e-05, + "loss": 2.7379, + "step": 12610 + }, + { + "epoch": 0.7322627906639395, + "grad_norm": 0.12274395674467087, + "learning_rate": 7.342938234600587e-05, + "loss": 2.7474, + "step": 12620 + }, + { + "epoch": 0.7328430305931503, + "grad_norm": 0.12541697919368744, + "learning_rate": 7.313246836119525e-05, + "loss": 2.7451, + "step": 12630 + }, + { + "epoch": 0.733423270522361, + "grad_norm": 0.1245257779955864, + "learning_rate": 7.28360215159265e-05, + "loss": 2.7523, + "step": 12640 + }, + { + "epoch": 0.7340035104515718, + "grad_norm": 0.12861104309558868, + "learning_rate": 7.254004290174788e-05, + "loss": 2.7459, + "step": 12650 + }, + { + "epoch": 0.7345837503807825, + "grad_norm": 0.11994564533233643, + "learning_rate": 7.224453360848358e-05, + "loss": 2.7383, + "step": 12660 + }, + { + "epoch": 0.7351639903099931, + "grad_norm": 0.11946084350347519, + "learning_rate": 7.194949472422998e-05, + "loss": 2.7489, + "step": 12670 + }, + { + "epoch": 0.7357442302392039, + "grad_norm": 0.129042848944664, + "learning_rate": 7.165492733535086e-05, + "loss": 2.7329, + "step": 12680 + }, + { + "epoch": 0.7363244701684146, + "grad_norm": 0.12452303618192673, + "learning_rate": 7.136083252647447e-05, + "loss": 2.7441, + "step": 12690 + }, + { + "epoch": 0.7369047100976254, + "grad_norm": 0.12241894006729126, + "learning_rate": 7.10672113804886e-05, + "loss": 2.7388, + "step": 12700 + }, + { + "epoch": 0.7374849500268361, + "grad_norm": 0.12106358259916306, + "learning_rate": 7.077406497853698e-05, + "loss": 2.7303, + "step": 12710 + }, + { + "epoch": 0.7380651899560469, + "grad_norm": 0.11599821597337723, + "learning_rate": 7.04813944000156e-05, + "loss": 2.7378, + "step": 12720 + }, + { + "epoch": 0.7386454298852576, + "grad_norm": 0.12181167304515839, + "learning_rate": 7.018920072256792e-05, + "loss": 2.745, + "step": 12730 + }, + { + "epoch": 0.7392256698144682, + "grad_norm": 0.12115510553121567, + "learning_rate": 6.989748502208186e-05, + "loss": 2.7309, + "step": 12740 + }, + { + "epoch": 0.739805909743679, + "grad_norm": 0.12130045890808105, + "learning_rate": 6.960624837268514e-05, + "loss": 2.7432, + "step": 12750 + }, + { + "epoch": 0.7403861496728897, + "grad_norm": 0.13140781223773956, + "learning_rate": 6.931549184674153e-05, + "loss": 2.7451, + "step": 12760 + }, + { + "epoch": 0.7409663896021005, + "grad_norm": 0.1199527308344841, + "learning_rate": 6.902521651484724e-05, + "loss": 2.7439, + "step": 12770 + }, + { + "epoch": 0.7415466295313112, + "grad_norm": 0.1253666877746582, + "learning_rate": 6.873542344582616e-05, + "loss": 2.7375, + "step": 12780 + }, + { + "epoch": 0.742126869460522, + "grad_norm": 0.12600301206111908, + "learning_rate": 6.844611370672691e-05, + "loss": 2.7401, + "step": 12790 + }, + { + "epoch": 0.7427071093897326, + "grad_norm": 0.12067416310310364, + "learning_rate": 6.815728836281823e-05, + "loss": 2.7335, + "step": 12800 + }, + { + "epoch": 0.7432873493189434, + "grad_norm": 0.12302874028682709, + "learning_rate": 6.786894847758527e-05, + "loss": 2.7447, + "step": 12810 + }, + { + "epoch": 0.7438675892481541, + "grad_norm": 0.12169067561626434, + "learning_rate": 6.75810951127257e-05, + "loss": 2.7416, + "step": 12820 + }, + { + "epoch": 0.7444478291773649, + "grad_norm": 0.12148208916187286, + "learning_rate": 6.729372932814571e-05, + "loss": 2.7341, + "step": 12830 + }, + { + "epoch": 0.7450280691065756, + "grad_norm": 0.1290818154811859, + "learning_rate": 6.700685218195639e-05, + "loss": 2.7445, + "step": 12840 + }, + { + "epoch": 0.7456083090357863, + "grad_norm": 0.12071933597326279, + "learning_rate": 6.672046473046921e-05, + "loss": 2.7398, + "step": 12850 + }, + { + "epoch": 0.746188548964997, + "grad_norm": 0.12640851736068726, + "learning_rate": 6.643456802819294e-05, + "loss": 2.7411, + "step": 12860 + }, + { + "epoch": 0.7467687888942077, + "grad_norm": 0.13351675868034363, + "learning_rate": 6.614916312782915e-05, + "loss": 2.728, + "step": 12870 + }, + { + "epoch": 0.7473490288234185, + "grad_norm": 0.12772291898727417, + "learning_rate": 6.58642510802685e-05, + "loss": 2.7412, + "step": 12880 + }, + { + "epoch": 0.7479292687526292, + "grad_norm": 0.12297698110342026, + "learning_rate": 6.55798329345872e-05, + "loss": 2.7494, + "step": 12890 + }, + { + "epoch": 0.74850950868184, + "grad_norm": 0.12183728814125061, + "learning_rate": 6.529590973804238e-05, + "loss": 2.7399, + "step": 12900 + }, + { + "epoch": 0.7490897486110507, + "grad_norm": 0.12221384793519974, + "learning_rate": 6.50124825360692e-05, + "loss": 2.7397, + "step": 12910 + }, + { + "epoch": 0.7496699885402613, + "grad_norm": 0.11953077465295792, + "learning_rate": 6.472955237227625e-05, + "loss": 2.7346, + "step": 12920 + }, + { + "epoch": 0.7502502284694721, + "grad_norm": 0.1241031214594841, + "learning_rate": 6.444712028844202e-05, + "loss": 2.7376, + "step": 12930 + }, + { + "epoch": 0.7508304683986828, + "grad_norm": 0.12113183736801147, + "learning_rate": 6.416518732451103e-05, + "loss": 2.7425, + "step": 12940 + }, + { + "epoch": 0.7514107083278936, + "grad_norm": 0.12162219732999802, + "learning_rate": 6.388375451858993e-05, + "loss": 2.7403, + "step": 12950 + }, + { + "epoch": 0.7519909482571043, + "grad_norm": 0.12234422564506531, + "learning_rate": 6.36028229069439e-05, + "loss": 2.7377, + "step": 12960 + }, + { + "epoch": 0.7525711881863151, + "grad_norm": 0.123825304210186, + "learning_rate": 6.332239352399254e-05, + "loss": 2.7276, + "step": 12970 + }, + { + "epoch": 0.7531514281155258, + "grad_norm": 0.12295151501893997, + "learning_rate": 6.304246740230619e-05, + "loss": 2.7404, + "step": 12980 + }, + { + "epoch": 0.7537316680447365, + "grad_norm": 0.12254820764064789, + "learning_rate": 6.276304557260215e-05, + "loss": 2.7373, + "step": 12990 + }, + { + "epoch": 0.7543119079739472, + "grad_norm": 0.12362895160913467, + "learning_rate": 6.248412906374082e-05, + "loss": 2.7418, + "step": 13000 + }, + { + "epoch": 0.7543119079739472, + "eval_loss": 2.70877742767334, + "eval_runtime": 5.3922, + "eval_samples_per_second": 803.007, + "eval_steps_per_second": 1.669, + "step": 13000 + }, + { + "epoch": 0.754892147903158, + "grad_norm": 0.12260004132986069, + "learning_rate": 6.220571890272213e-05, + "loss": 2.7352, + "step": 13010 + }, + { + "epoch": 0.7554723878323687, + "grad_norm": 0.12325233221054077, + "learning_rate": 6.192781611468137e-05, + "loss": 2.7369, + "step": 13020 + }, + { + "epoch": 0.7560526277615794, + "grad_norm": 0.12207638472318649, + "learning_rate": 6.165042172288576e-05, + "loss": 2.7365, + "step": 13030 + }, + { + "epoch": 0.7566328676907902, + "grad_norm": 0.1201200932264328, + "learning_rate": 6.137353674873046e-05, + "loss": 2.736, + "step": 13040 + }, + { + "epoch": 0.7572131076200008, + "grad_norm": 0.11834630370140076, + "learning_rate": 6.109716221173499e-05, + "loss": 2.7374, + "step": 13050 + }, + { + "epoch": 0.7577933475492116, + "grad_norm": 0.12760621309280396, + "learning_rate": 6.0821299129539267e-05, + "loss": 2.7297, + "step": 13060 + }, + { + "epoch": 0.7583735874784223, + "grad_norm": 0.12132176011800766, + "learning_rate": 6.0545948517900186e-05, + "loss": 2.7318, + "step": 13070 + }, + { + "epoch": 0.7589538274076331, + "grad_norm": 0.1261221319437027, + "learning_rate": 6.0271111390687506e-05, + "loss": 2.734, + "step": 13080 + }, + { + "epoch": 0.7595340673368438, + "grad_norm": 0.11981641501188278, + "learning_rate": 5.9996788759880265e-05, + "loss": 2.7343, + "step": 13090 + }, + { + "epoch": 0.7601143072660546, + "grad_norm": 0.12327931821346283, + "learning_rate": 5.972298163556318e-05, + "loss": 2.7309, + "step": 13100 + }, + { + "epoch": 0.7606945471952652, + "grad_norm": 0.12560048699378967, + "learning_rate": 5.944969102592275e-05, + "loss": 2.7368, + "step": 13110 + }, + { + "epoch": 0.7612747871244759, + "grad_norm": 0.12151816487312317, + "learning_rate": 5.9176917937243534e-05, + "loss": 2.7315, + "step": 13120 + }, + { + "epoch": 0.7618550270536867, + "grad_norm": 0.127033993601799, + "learning_rate": 5.890466337390481e-05, + "loss": 2.7405, + "step": 13130 + }, + { + "epoch": 0.7624352669828974, + "grad_norm": 0.11884118616580963, + "learning_rate": 5.863292833837628e-05, + "loss": 2.7324, + "step": 13140 + }, + { + "epoch": 0.7630155069121082, + "grad_norm": 0.12274018675088882, + "learning_rate": 5.836171383121483e-05, + "loss": 2.7327, + "step": 13150 + }, + { + "epoch": 0.7635957468413189, + "grad_norm": 0.12369240075349808, + "learning_rate": 5.809102085106071e-05, + "loss": 2.7322, + "step": 13160 + }, + { + "epoch": 0.7641759867705297, + "grad_norm": 0.121719129383564, + "learning_rate": 5.7820850394633786e-05, + "loss": 2.7368, + "step": 13170 + }, + { + "epoch": 0.7647562266997403, + "grad_norm": 0.12220139056444168, + "learning_rate": 5.755120345672995e-05, + "loss": 2.7283, + "step": 13180 + }, + { + "epoch": 0.765336466628951, + "grad_norm": 0.12299592047929764, + "learning_rate": 5.7282081030217595e-05, + "loss": 2.7314, + "step": 13190 + }, + { + "epoch": 0.7659167065581618, + "grad_norm": 0.13223952054977417, + "learning_rate": 5.70134841060336e-05, + "loss": 2.7392, + "step": 13200 + }, + { + "epoch": 0.7664969464873725, + "grad_norm": 0.12204018235206604, + "learning_rate": 5.674541367318003e-05, + "loss": 2.7342, + "step": 13210 + }, + { + "epoch": 0.7670771864165833, + "grad_norm": 0.12159973382949829, + "learning_rate": 5.647787071872024e-05, + "loss": 2.7378, + "step": 13220 + }, + { + "epoch": 0.767657426345794, + "grad_norm": 0.1208658516407013, + "learning_rate": 5.62108562277754e-05, + "loss": 2.7387, + "step": 13230 + }, + { + "epoch": 0.7682376662750047, + "grad_norm": 0.12024756520986557, + "learning_rate": 5.5944371183520964e-05, + "loss": 2.734, + "step": 13240 + }, + { + "epoch": 0.7688179062042154, + "grad_norm": 0.12415524572134018, + "learning_rate": 5.567841656718267e-05, + "loss": 2.7347, + "step": 13250 + }, + { + "epoch": 0.7693981461334262, + "grad_norm": 0.1189967542886734, + "learning_rate": 5.541299335803332e-05, + "loss": 2.7382, + "step": 13260 + }, + { + "epoch": 0.7699783860626369, + "grad_norm": 0.12029008567333221, + "learning_rate": 5.514810253338896e-05, + "loss": 2.7386, + "step": 13270 + }, + { + "epoch": 0.7705586259918477, + "grad_norm": 0.11998321861028671, + "learning_rate": 5.48837450686053e-05, + "loss": 2.7352, + "step": 13280 + }, + { + "epoch": 0.7711388659210584, + "grad_norm": 0.12730252742767334, + "learning_rate": 5.461992193707439e-05, + "loss": 2.7345, + "step": 13290 + }, + { + "epoch": 0.771719105850269, + "grad_norm": 0.12133444845676422, + "learning_rate": 5.4356634110220386e-05, + "loss": 2.7282, + "step": 13300 + }, + { + "epoch": 0.7722993457794798, + "grad_norm": 0.12256559729576111, + "learning_rate": 5.409388255749688e-05, + "loss": 2.7312, + "step": 13310 + }, + { + "epoch": 0.7728795857086905, + "grad_norm": 0.12941910326480865, + "learning_rate": 5.3831668246382485e-05, + "loss": 2.7344, + "step": 13320 + }, + { + "epoch": 0.7734598256379013, + "grad_norm": 0.12094937264919281, + "learning_rate": 5.356999214237777e-05, + "loss": 2.7331, + "step": 13330 + }, + { + "epoch": 0.774040065567112, + "grad_norm": 0.12196173518896103, + "learning_rate": 5.3308855209001684e-05, + "loss": 2.737, + "step": 13340 + }, + { + "epoch": 0.7746203054963228, + "grad_norm": 0.12579815089702606, + "learning_rate": 5.304825840778758e-05, + "loss": 2.7229, + "step": 13350 + }, + { + "epoch": 0.7752005454255334, + "grad_norm": 0.1212158054113388, + "learning_rate": 5.278820269828031e-05, + "loss": 2.7314, + "step": 13360 + }, + { + "epoch": 0.7757807853547442, + "grad_norm": 0.12120276689529419, + "learning_rate": 5.252868903803223e-05, + "loss": 2.7444, + "step": 13370 + }, + { + "epoch": 0.7763610252839549, + "grad_norm": 0.12396235018968582, + "learning_rate": 5.2269718382599796e-05, + "loss": 2.7397, + "step": 13380 + }, + { + "epoch": 0.7769412652131656, + "grad_norm": 0.11907251924276352, + "learning_rate": 5.201129168554009e-05, + "loss": 2.7282, + "step": 13390 + }, + { + "epoch": 0.7775215051423764, + "grad_norm": 0.12151432782411575, + "learning_rate": 5.1753409898407226e-05, + "loss": 2.7332, + "step": 13400 + }, + { + "epoch": 0.7781017450715871, + "grad_norm": 0.11936885118484497, + "learning_rate": 5.149607397074911e-05, + "loss": 2.7374, + "step": 13410 + }, + { + "epoch": 0.7786819850007979, + "grad_norm": 0.12219711393117905, + "learning_rate": 5.1239284850103407e-05, + "loss": 2.7287, + "step": 13420 + }, + { + "epoch": 0.7792622249300085, + "grad_norm": 0.11801363527774811, + "learning_rate": 5.098304348199472e-05, + "loss": 2.7224, + "step": 13430 + }, + { + "epoch": 0.7798424648592193, + "grad_norm": 0.11896246671676636, + "learning_rate": 5.072735080993052e-05, + "loss": 2.7295, + "step": 13440 + }, + { + "epoch": 0.78042270478843, + "grad_norm": 0.11604252457618713, + "learning_rate": 5.047220777539796e-05, + "loss": 2.7228, + "step": 13450 + }, + { + "epoch": 0.7810029447176408, + "grad_norm": 0.12236449867486954, + "learning_rate": 5.021761531786062e-05, + "loss": 2.7322, + "step": 13460 + }, + { + "epoch": 0.7815831846468515, + "grad_norm": 0.12057973444461823, + "learning_rate": 4.996357437475434e-05, + "loss": 2.73, + "step": 13470 + }, + { + "epoch": 0.7821634245760622, + "grad_norm": 0.12264855206012726, + "learning_rate": 4.9710085881484694e-05, + "loss": 2.7251, + "step": 13480 + }, + { + "epoch": 0.7827436645052729, + "grad_norm": 0.12081897258758545, + "learning_rate": 4.945715077142277e-05, + "loss": 2.7271, + "step": 13490 + }, + { + "epoch": 0.7833239044344836, + "grad_norm": 0.12207679450511932, + "learning_rate": 4.920476997590211e-05, + "loss": 2.7319, + "step": 13500 + }, + { + "epoch": 0.7839041443636944, + "grad_norm": 0.120822474360466, + "learning_rate": 4.895294442421541e-05, + "loss": 2.7308, + "step": 13510 + }, + { + "epoch": 0.7844843842929051, + "grad_norm": 0.12106281518936157, + "learning_rate": 4.8701675043610474e-05, + "loss": 2.7258, + "step": 13520 + }, + { + "epoch": 0.7850646242221159, + "grad_norm": 0.12197288870811462, + "learning_rate": 4.845096275928769e-05, + "loss": 2.7309, + "step": 13530 + }, + { + "epoch": 0.7856448641513266, + "grad_norm": 0.12526744604110718, + "learning_rate": 4.82008084943959e-05, + "loss": 2.7293, + "step": 13540 + }, + { + "epoch": 0.7862251040805373, + "grad_norm": 0.1226550042629242, + "learning_rate": 4.795121317002922e-05, + "loss": 2.7231, + "step": 13550 + }, + { + "epoch": 0.786805344009748, + "grad_norm": 0.11998672783374786, + "learning_rate": 4.770217770522398e-05, + "loss": 2.7271, + "step": 13560 + }, + { + "epoch": 0.7873855839389587, + "grad_norm": 0.122990183532238, + "learning_rate": 4.745370301695462e-05, + "loss": 2.7322, + "step": 13570 + }, + { + "epoch": 0.7879658238681695, + "grad_norm": 0.12234378606081009, + "learning_rate": 4.720579002013115e-05, + "loss": 2.7224, + "step": 13580 + }, + { + "epoch": 0.7885460637973802, + "grad_norm": 0.12186608463525772, + "learning_rate": 4.69584396275951e-05, + "loss": 2.7183, + "step": 13590 + }, + { + "epoch": 0.789126303726591, + "grad_norm": 0.12713122367858887, + "learning_rate": 4.6711652750116505e-05, + "loss": 2.7299, + "step": 13600 + }, + { + "epoch": 0.7897065436558017, + "grad_norm": 0.12040334939956665, + "learning_rate": 4.646543029639068e-05, + "loss": 2.7274, + "step": 13610 + }, + { + "epoch": 0.7902867835850124, + "grad_norm": 0.11610428988933563, + "learning_rate": 4.621977317303423e-05, + "loss": 2.7225, + "step": 13620 + }, + { + "epoch": 0.7908670235142231, + "grad_norm": 0.12505796551704407, + "learning_rate": 4.5974682284582656e-05, + "loss": 2.7291, + "step": 13630 + }, + { + "epoch": 0.7914472634434339, + "grad_norm": 0.12457990646362305, + "learning_rate": 4.573015853348608e-05, + "loss": 2.7279, + "step": 13640 + }, + { + "epoch": 0.7920275033726446, + "grad_norm": 0.12435191869735718, + "learning_rate": 4.5486202820106695e-05, + "loss": 2.7387, + "step": 13650 + }, + { + "epoch": 0.7926077433018553, + "grad_norm": 0.11536847054958344, + "learning_rate": 4.524281604271499e-05, + "loss": 2.7206, + "step": 13660 + }, + { + "epoch": 0.7931879832310661, + "grad_norm": 0.12171947956085205, + "learning_rate": 4.499999909748649e-05, + "loss": 2.717, + "step": 13670 + }, + { + "epoch": 0.7937682231602767, + "grad_norm": 0.11659212410449982, + "learning_rate": 4.4757752878498794e-05, + "loss": 2.729, + "step": 13680 + }, + { + "epoch": 0.7943484630894875, + "grad_norm": 0.1195807233452797, + "learning_rate": 4.4516078277727635e-05, + "loss": 2.7286, + "step": 13690 + }, + { + "epoch": 0.7949287030186982, + "grad_norm": 0.12461701035499573, + "learning_rate": 4.427497618504439e-05, + "loss": 2.7313, + "step": 13700 + }, + { + "epoch": 0.795508942947909, + "grad_norm": 0.11976811289787292, + "learning_rate": 4.403444748821215e-05, + "loss": 2.7217, + "step": 13710 + }, + { + "epoch": 0.7960891828771197, + "grad_norm": 0.12124411016702652, + "learning_rate": 4.37944930728827e-05, + "loss": 2.7247, + "step": 13720 + }, + { + "epoch": 0.7966694228063305, + "grad_norm": 0.11961103975772858, + "learning_rate": 4.355511382259356e-05, + "loss": 2.7238, + "step": 13730 + }, + { + "epoch": 0.7972496627355411, + "grad_norm": 0.12003281712532043, + "learning_rate": 4.3316310618763936e-05, + "loss": 2.7336, + "step": 13740 + }, + { + "epoch": 0.7978299026647518, + "grad_norm": 0.11927200108766556, + "learning_rate": 4.3078084340692406e-05, + "loss": 2.7312, + "step": 13750 + }, + { + "epoch": 0.7984101425939626, + "grad_norm": 0.12134095281362534, + "learning_rate": 4.2840435865553065e-05, + "loss": 2.7319, + "step": 13760 + }, + { + "epoch": 0.7989903825231733, + "grad_norm": 0.12080392986536026, + "learning_rate": 4.2603366068392455e-05, + "loss": 2.7316, + "step": 13770 + }, + { + "epoch": 0.7995706224523841, + "grad_norm": 0.12257473915815353, + "learning_rate": 4.236687582212642e-05, + "loss": 2.7358, + "step": 13780 + }, + { + "epoch": 0.8001508623815948, + "grad_norm": 0.11869396269321442, + "learning_rate": 4.213096599753676e-05, + "loss": 2.7313, + "step": 13790 + }, + { + "epoch": 0.8007311023108055, + "grad_norm": 0.11939482390880585, + "learning_rate": 4.189563746326828e-05, + "loss": 2.7261, + "step": 13800 + }, + { + "epoch": 0.8013113422400162, + "grad_norm": 0.11985889077186584, + "learning_rate": 4.166089108582523e-05, + "loss": 2.7359, + "step": 13810 + }, + { + "epoch": 0.801891582169227, + "grad_norm": 0.12536443769931793, + "learning_rate": 4.142672772956837e-05, + "loss": 2.7209, + "step": 13820 + }, + { + "epoch": 0.8024718220984377, + "grad_norm": 0.1199636310338974, + "learning_rate": 4.119314825671172e-05, + "loss": 2.7336, + "step": 13830 + }, + { + "epoch": 0.8030520620276485, + "grad_norm": 0.12134099006652832, + "learning_rate": 4.0960153527319276e-05, + "loss": 2.7214, + "step": 13840 + }, + { + "epoch": 0.8036323019568592, + "grad_norm": 0.1173613891005516, + "learning_rate": 4.07277443993022e-05, + "loss": 2.7255, + "step": 13850 + }, + { + "epoch": 0.8042125418860699, + "grad_norm": 0.1184094101190567, + "learning_rate": 4.049592172841516e-05, + "loss": 2.7238, + "step": 13860 + }, + { + "epoch": 0.8047927818152806, + "grad_norm": 0.11648872494697571, + "learning_rate": 4.026468636825351e-05, + "loss": 2.7161, + "step": 13870 + }, + { + "epoch": 0.8053730217444913, + "grad_norm": 0.12471094727516174, + "learning_rate": 4.00340391702501e-05, + "loss": 2.7194, + "step": 13880 + }, + { + "epoch": 0.8059532616737021, + "grad_norm": 0.12097521126270294, + "learning_rate": 3.980398098367206e-05, + "loss": 2.7344, + "step": 13890 + }, + { + "epoch": 0.8065335016029128, + "grad_norm": 0.11875994503498077, + "learning_rate": 3.957451265561767e-05, + "loss": 2.7282, + "step": 13900 + }, + { + "epoch": 0.8071137415321236, + "grad_norm": 0.12130565941333771, + "learning_rate": 3.934563503101345e-05, + "loss": 2.7285, + "step": 13910 + }, + { + "epoch": 0.8076939814613343, + "grad_norm": 0.12270623445510864, + "learning_rate": 3.911734895261079e-05, + "loss": 2.7338, + "step": 13920 + }, + { + "epoch": 0.808274221390545, + "grad_norm": 0.12415996193885803, + "learning_rate": 3.888965526098287e-05, + "loss": 2.7266, + "step": 13930 + }, + { + "epoch": 0.8088544613197557, + "grad_norm": 0.11743751913309097, + "learning_rate": 3.866255479452177e-05, + "loss": 2.7303, + "step": 13940 + }, + { + "epoch": 0.8094347012489664, + "grad_norm": 0.11905871331691742, + "learning_rate": 3.8436048389435196e-05, + "loss": 2.7235, + "step": 13950 + }, + { + "epoch": 0.8100149411781772, + "grad_norm": 0.11726722121238708, + "learning_rate": 3.8210136879743375e-05, + "loss": 2.7297, + "step": 13960 + }, + { + "epoch": 0.8105951811073879, + "grad_norm": 0.12093175947666168, + "learning_rate": 3.798482109727628e-05, + "loss": 2.7272, + "step": 13970 + }, + { + "epoch": 0.8111754210365987, + "grad_norm": 0.11959103494882584, + "learning_rate": 3.776010187167016e-05, + "loss": 2.7226, + "step": 13980 + }, + { + "epoch": 0.8117556609658093, + "grad_norm": 0.12149166315793991, + "learning_rate": 3.753598003036476e-05, + "loss": 2.7244, + "step": 13990 + }, + { + "epoch": 0.8123359008950201, + "grad_norm": 0.12182975560426712, + "learning_rate": 3.731245639860017e-05, + "loss": 2.7167, + "step": 14000 + }, + { + "epoch": 0.8123359008950201, + "eval_loss": 2.696218490600586, + "eval_runtime": 5.3955, + "eval_samples_per_second": 802.521, + "eval_steps_per_second": 1.668, + "step": 14000 + }, + { + "epoch": 0.8129161408242308, + "grad_norm": 0.12197499722242355, + "learning_rate": 3.7089531799413815e-05, + "loss": 2.7234, + "step": 14010 + }, + { + "epoch": 0.8134963807534416, + "grad_norm": 0.11665521562099457, + "learning_rate": 3.6867207053637376e-05, + "loss": 2.7232, + "step": 14020 + }, + { + "epoch": 0.8140766206826523, + "grad_norm": 0.11618278175592422, + "learning_rate": 3.6645482979893966e-05, + "loss": 2.7195, + "step": 14030 + }, + { + "epoch": 0.814656860611863, + "grad_norm": 0.11773023754358292, + "learning_rate": 3.642436039459478e-05, + "loss": 2.7262, + "step": 14040 + }, + { + "epoch": 0.8152371005410738, + "grad_norm": 0.12087783962488174, + "learning_rate": 3.620384011193636e-05, + "loss": 2.7267, + "step": 14050 + }, + { + "epoch": 0.8158173404702844, + "grad_norm": 0.11980880051851273, + "learning_rate": 3.598392294389747e-05, + "loss": 2.723, + "step": 14060 + }, + { + "epoch": 0.8163975803994952, + "grad_norm": 0.11848059296607971, + "learning_rate": 3.576460970023614e-05, + "loss": 2.7269, + "step": 14070 + }, + { + "epoch": 0.8169778203287059, + "grad_norm": 0.11839718371629715, + "learning_rate": 3.5545901188486776e-05, + "loss": 2.7135, + "step": 14080 + }, + { + "epoch": 0.8175580602579167, + "grad_norm": 0.11868947744369507, + "learning_rate": 3.5327798213957e-05, + "loss": 2.7271, + "step": 14090 + }, + { + "epoch": 0.8181383001871274, + "grad_norm": 0.11662468314170837, + "learning_rate": 3.511030157972479e-05, + "loss": 2.7257, + "step": 14100 + }, + { + "epoch": 0.8187185401163382, + "grad_norm": 0.11749114841222763, + "learning_rate": 3.4893412086635566e-05, + "loss": 2.7173, + "step": 14110 + }, + { + "epoch": 0.8192987800455488, + "grad_norm": 0.11833590269088745, + "learning_rate": 3.467713053329911e-05, + "loss": 2.7217, + "step": 14120 + }, + { + "epoch": 0.8198790199747595, + "grad_norm": 0.11520706862211227, + "learning_rate": 3.446145771608689e-05, + "loss": 2.7179, + "step": 14130 + }, + { + "epoch": 0.8204592599039703, + "grad_norm": 0.12385083734989166, + "learning_rate": 3.4246394429128604e-05, + "loss": 2.7162, + "step": 14140 + }, + { + "epoch": 0.821039499833181, + "grad_norm": 0.12348052859306335, + "learning_rate": 3.403194146430997e-05, + "loss": 2.7184, + "step": 14150 + }, + { + "epoch": 0.8216197397623918, + "grad_norm": 0.11942430585622787, + "learning_rate": 3.381809961126925e-05, + "loss": 2.7215, + "step": 14160 + }, + { + "epoch": 0.8221999796916025, + "grad_norm": 0.12090287357568741, + "learning_rate": 3.360486965739444e-05, + "loss": 2.7094, + "step": 14170 + }, + { + "epoch": 0.8227802196208132, + "grad_norm": 0.11702247709035873, + "learning_rate": 3.3392252387820754e-05, + "loss": 2.7298, + "step": 14180 + }, + { + "epoch": 0.8233604595500239, + "grad_norm": 0.12156783044338226, + "learning_rate": 3.3180248585427054e-05, + "loss": 2.7238, + "step": 14190 + }, + { + "epoch": 0.8239406994792347, + "grad_norm": 0.11878366768360138, + "learning_rate": 3.296885903083366e-05, + "loss": 2.7175, + "step": 14200 + }, + { + "epoch": 0.8245209394084454, + "grad_norm": 0.11734297126531601, + "learning_rate": 3.275808450239908e-05, + "loss": 2.7271, + "step": 14210 + }, + { + "epoch": 0.8251011793376561, + "grad_norm": 0.11686153709888458, + "learning_rate": 3.2547925776217126e-05, + "loss": 2.7206, + "step": 14220 + }, + { + "epoch": 0.8256814192668669, + "grad_norm": 0.12111202627420425, + "learning_rate": 3.23383836261143e-05, + "loss": 2.7179, + "step": 14230 + }, + { + "epoch": 0.8262616591960775, + "grad_norm": 0.12011639773845673, + "learning_rate": 3.212945882364666e-05, + "loss": 2.7172, + "step": 14240 + }, + { + "epoch": 0.8268418991252883, + "grad_norm": 0.11470998078584671, + "learning_rate": 3.192115213809741e-05, + "loss": 2.7249, + "step": 14250 + }, + { + "epoch": 0.827422139054499, + "grad_norm": 0.12129820883274078, + "learning_rate": 3.171346433647335e-05, + "loss": 2.7175, + "step": 14260 + }, + { + "epoch": 0.8280023789837098, + "grad_norm": 0.11826759576797485, + "learning_rate": 3.150639618350289e-05, + "loss": 2.7346, + "step": 14270 + }, + { + "epoch": 0.8285826189129205, + "grad_norm": 0.12145520746707916, + "learning_rate": 3.12999484416326e-05, + "loss": 2.7271, + "step": 14280 + }, + { + "epoch": 0.8291628588421313, + "grad_norm": 0.11638975143432617, + "learning_rate": 3.1094121871024676e-05, + "loss": 2.7148, + "step": 14290 + }, + { + "epoch": 0.829743098771342, + "grad_norm": 0.11663084477186203, + "learning_rate": 3.0888917229554204e-05, + "loss": 2.7238, + "step": 14300 + }, + { + "epoch": 0.8303233387005526, + "grad_norm": 0.11826130747795105, + "learning_rate": 3.068433527280601e-05, + "loss": 2.7233, + "step": 14310 + }, + { + "epoch": 0.8309035786297634, + "grad_norm": 0.11905242502689362, + "learning_rate": 3.0480376754072448e-05, + "loss": 2.7111, + "step": 14320 + }, + { + "epoch": 0.8314838185589741, + "grad_norm": 0.11812193691730499, + "learning_rate": 3.0277042424350076e-05, + "loss": 2.7146, + "step": 14330 + }, + { + "epoch": 0.8320640584881849, + "grad_norm": 0.1194220557808876, + "learning_rate": 3.0074333032337154e-05, + "loss": 2.7197, + "step": 14340 + }, + { + "epoch": 0.8326442984173956, + "grad_norm": 0.12228541076183319, + "learning_rate": 2.9872249324431046e-05, + "loss": 2.7239, + "step": 14350 + }, + { + "epoch": 0.8332245383466064, + "grad_norm": 0.11680544167757034, + "learning_rate": 2.9670792044724937e-05, + "loss": 2.7152, + "step": 14360 + }, + { + "epoch": 0.833804778275817, + "grad_norm": 0.117955781519413, + "learning_rate": 2.9469961935005797e-05, + "loss": 2.7177, + "step": 14370 + }, + { + "epoch": 0.8343850182050278, + "grad_norm": 0.11889876425266266, + "learning_rate": 2.9269759734751056e-05, + "loss": 2.7169, + "step": 14380 + }, + { + "epoch": 0.8349652581342385, + "grad_norm": 0.11606640368700027, + "learning_rate": 2.907018618112618e-05, + "loss": 2.7232, + "step": 14390 + }, + { + "epoch": 0.8355454980634492, + "grad_norm": 0.11701922863721848, + "learning_rate": 2.8871242008981992e-05, + "loss": 2.7254, + "step": 14400 + }, + { + "epoch": 0.83612573799266, + "grad_norm": 0.11705347150564194, + "learning_rate": 2.8672927950851612e-05, + "loss": 2.7296, + "step": 14410 + }, + { + "epoch": 0.8367059779218707, + "grad_norm": 0.121455118060112, + "learning_rate": 2.8475244736948315e-05, + "loss": 2.717, + "step": 14420 + }, + { + "epoch": 0.8372862178510814, + "grad_norm": 0.1162460669875145, + "learning_rate": 2.8278193095162353e-05, + "loss": 2.7161, + "step": 14430 + }, + { + "epoch": 0.8378664577802921, + "grad_norm": 0.11314946413040161, + "learning_rate": 2.8081773751058516e-05, + "loss": 2.7217, + "step": 14440 + }, + { + "epoch": 0.8384466977095029, + "grad_norm": 0.11895614117383957, + "learning_rate": 2.7885987427873406e-05, + "loss": 2.7165, + "step": 14450 + }, + { + "epoch": 0.8390269376387136, + "grad_norm": 0.12004794180393219, + "learning_rate": 2.7690834846512736e-05, + "loss": 2.7311, + "step": 14460 + }, + { + "epoch": 0.8396071775679244, + "grad_norm": 0.11621776968240738, + "learning_rate": 2.7496316725548887e-05, + "loss": 2.7114, + "step": 14470 + }, + { + "epoch": 0.8401874174971351, + "grad_norm": 0.11720835417509079, + "learning_rate": 2.7302433781217774e-05, + "loss": 2.7241, + "step": 14480 + }, + { + "epoch": 0.8407676574263458, + "grad_norm": 0.11709107458591461, + "learning_rate": 2.7109186727416824e-05, + "loss": 2.7181, + "step": 14490 + }, + { + "epoch": 0.8413478973555565, + "grad_norm": 0.11878082901239395, + "learning_rate": 2.691657627570192e-05, + "loss": 2.7206, + "step": 14500 + }, + { + "epoch": 0.8419281372847672, + "grad_norm": 0.11816674470901489, + "learning_rate": 2.6724603135284887e-05, + "loss": 2.7191, + "step": 14510 + }, + { + "epoch": 0.842508377213978, + "grad_norm": 0.11694590002298355, + "learning_rate": 2.653326801303102e-05, + "loss": 2.7187, + "step": 14520 + }, + { + "epoch": 0.8430886171431887, + "grad_norm": 0.11519923061132431, + "learning_rate": 2.6342571613456146e-05, + "loss": 2.7212, + "step": 14530 + }, + { + "epoch": 0.8436688570723995, + "grad_norm": 0.11877317726612091, + "learning_rate": 2.6152514638724522e-05, + "loss": 2.7245, + "step": 14540 + }, + { + "epoch": 0.8442490970016102, + "grad_norm": 0.11796387284994125, + "learning_rate": 2.5963097788645764e-05, + "loss": 2.7243, + "step": 14550 + }, + { + "epoch": 0.8448293369308209, + "grad_norm": 0.117183618247509, + "learning_rate": 2.577432176067258e-05, + "loss": 2.7266, + "step": 14560 + }, + { + "epoch": 0.8454095768600316, + "grad_norm": 0.12094635516405106, + "learning_rate": 2.5586187249898074e-05, + "loss": 2.719, + "step": 14570 + }, + { + "epoch": 0.8459898167892423, + "grad_norm": 0.11904298514127731, + "learning_rate": 2.539869494905318e-05, + "loss": 2.716, + "step": 14580 + }, + { + "epoch": 0.8465700567184531, + "grad_norm": 0.11641751229763031, + "learning_rate": 2.5211845548504264e-05, + "loss": 2.7237, + "step": 14590 + }, + { + "epoch": 0.8471502966476638, + "grad_norm": 0.12003939598798752, + "learning_rate": 2.5025639736250382e-05, + "loss": 2.7194, + "step": 14600 + }, + { + "epoch": 0.8477305365768746, + "grad_norm": 0.12015482783317566, + "learning_rate": 2.48400781979208e-05, + "loss": 2.7133, + "step": 14610 + }, + { + "epoch": 0.8483107765060852, + "grad_norm": 0.11893987655639648, + "learning_rate": 2.4655161616772594e-05, + "loss": 2.7217, + "step": 14620 + }, + { + "epoch": 0.848891016435296, + "grad_norm": 0.11744572222232819, + "learning_rate": 2.4470890673687884e-05, + "loss": 2.7173, + "step": 14630 + }, + { + "epoch": 0.8494712563645067, + "grad_norm": 0.12006239593029022, + "learning_rate": 2.428726604717173e-05, + "loss": 2.7304, + "step": 14640 + }, + { + "epoch": 0.8500514962937175, + "grad_norm": 0.11663104593753815, + "learning_rate": 2.410428841334915e-05, + "loss": 2.7114, + "step": 14650 + }, + { + "epoch": 0.8506317362229282, + "grad_norm": 0.11702371388673782, + "learning_rate": 2.392195844596299e-05, + "loss": 2.7135, + "step": 14660 + }, + { + "epoch": 0.851211976152139, + "grad_norm": 0.11572497338056564, + "learning_rate": 2.3740276816371278e-05, + "loss": 2.726, + "step": 14670 + }, + { + "epoch": 0.8517922160813496, + "grad_norm": 0.11705095320940018, + "learning_rate": 2.3559244193544806e-05, + "loss": 2.7128, + "step": 14680 + }, + { + "epoch": 0.8523724560105603, + "grad_norm": 0.11780356615781784, + "learning_rate": 2.337886124406461e-05, + "loss": 2.7184, + "step": 14690 + }, + { + "epoch": 0.8529526959397711, + "grad_norm": 0.11437036097049713, + "learning_rate": 2.3199128632119705e-05, + "loss": 2.713, + "step": 14700 + }, + { + "epoch": 0.8535329358689818, + "grad_norm": 0.11990920454263687, + "learning_rate": 2.3020047019504355e-05, + "loss": 2.7161, + "step": 14710 + }, + { + "epoch": 0.8541131757981926, + "grad_norm": 0.11468629539012909, + "learning_rate": 2.2841617065615805e-05, + "loss": 2.7223, + "step": 14720 + }, + { + "epoch": 0.8546934157274033, + "grad_norm": 0.11571449041366577, + "learning_rate": 2.266383942745185e-05, + "loss": 2.7133, + "step": 14730 + }, + { + "epoch": 0.8552736556566141, + "grad_norm": 0.11584048718214035, + "learning_rate": 2.2486714759608306e-05, + "loss": 2.7218, + "step": 14740 + }, + { + "epoch": 0.8558538955858247, + "grad_norm": 0.11709973216056824, + "learning_rate": 2.231024371427688e-05, + "loss": 2.7084, + "step": 14750 + }, + { + "epoch": 0.8564341355150354, + "grad_norm": 0.11297730356454849, + "learning_rate": 2.213442694124239e-05, + "loss": 2.7061, + "step": 14760 + }, + { + "epoch": 0.8570143754442462, + "grad_norm": 0.11450552940368652, + "learning_rate": 2.19592650878806e-05, + "loss": 2.7242, + "step": 14770 + }, + { + "epoch": 0.8575946153734569, + "grad_norm": 0.11598943918943405, + "learning_rate": 2.1784758799155803e-05, + "loss": 2.7153, + "step": 14780 + }, + { + "epoch": 0.8581748553026677, + "grad_norm": 0.11803678423166275, + "learning_rate": 2.161090871761846e-05, + "loss": 2.7207, + "step": 14790 + }, + { + "epoch": 0.8587550952318784, + "grad_norm": 0.11763158440589905, + "learning_rate": 2.1437715483402764e-05, + "loss": 2.7191, + "step": 14800 + }, + { + "epoch": 0.8593353351610891, + "grad_norm": 0.12016081064939499, + "learning_rate": 2.1265179734224307e-05, + "loss": 2.719, + "step": 14810 + }, + { + "epoch": 0.8599155750902998, + "grad_norm": 0.11358082294464111, + "learning_rate": 2.1093302105377877e-05, + "loss": 2.7062, + "step": 14820 + }, + { + "epoch": 0.8604958150195106, + "grad_norm": 0.11611097306013107, + "learning_rate": 2.0922083229734855e-05, + "loss": 2.7122, + "step": 14830 + }, + { + "epoch": 0.8610760549487213, + "grad_norm": 0.11881374567747116, + "learning_rate": 2.0751523737741095e-05, + "loss": 2.718, + "step": 14840 + }, + { + "epoch": 0.861656294877932, + "grad_norm": 0.11680326610803604, + "learning_rate": 2.058162425741452e-05, + "loss": 2.7115, + "step": 14850 + }, + { + "epoch": 0.8622365348071428, + "grad_norm": 0.11934536695480347, + "learning_rate": 2.041238541434276e-05, + "loss": 2.7155, + "step": 14860 + }, + { + "epoch": 0.8628167747363534, + "grad_norm": 0.11599881947040558, + "learning_rate": 2.0243807831681027e-05, + "loss": 2.715, + "step": 14870 + }, + { + "epoch": 0.8633970146655642, + "grad_norm": 0.11855433881282806, + "learning_rate": 2.007589213014964e-05, + "loss": 2.719, + "step": 14880 + }, + { + "epoch": 0.8639772545947749, + "grad_norm": 0.1187380775809288, + "learning_rate": 1.9908638928031765e-05, + "loss": 2.7197, + "step": 14890 + }, + { + "epoch": 0.8645574945239857, + "grad_norm": 0.12009769678115845, + "learning_rate": 1.9742048841171255e-05, + "loss": 2.7124, + "step": 14900 + }, + { + "epoch": 0.8651377344531964, + "grad_norm": 0.1180645152926445, + "learning_rate": 1.9576122482970184e-05, + "loss": 2.7175, + "step": 14910 + }, + { + "epoch": 0.8657179743824072, + "grad_norm": 0.11391542106866837, + "learning_rate": 1.9410860464386916e-05, + "loss": 2.7138, + "step": 14920 + }, + { + "epoch": 0.8662982143116179, + "grad_norm": 0.11604123562574387, + "learning_rate": 1.924626339393336e-05, + "loss": 2.7171, + "step": 14930 + }, + { + "epoch": 0.8668784542408285, + "grad_norm": 0.1202244833111763, + "learning_rate": 1.9082331877673277e-05, + "loss": 2.7213, + "step": 14940 + }, + { + "epoch": 0.8674586941700393, + "grad_norm": 0.11324458569288254, + "learning_rate": 1.8919066519219664e-05, + "loss": 2.7155, + "step": 14950 + }, + { + "epoch": 0.86803893409925, + "grad_norm": 0.11484729498624802, + "learning_rate": 1.8756467919732645e-05, + "loss": 2.7248, + "step": 14960 + }, + { + "epoch": 0.8686191740284608, + "grad_norm": 0.11329852789640427, + "learning_rate": 1.8594536677917373e-05, + "loss": 2.717, + "step": 14970 + }, + { + "epoch": 0.8691994139576715, + "grad_norm": 0.11292688548564911, + "learning_rate": 1.8433273390021523e-05, + "loss": 2.7189, + "step": 14980 + }, + { + "epoch": 0.8697796538868823, + "grad_norm": 0.11284969747066498, + "learning_rate": 1.8272678649833508e-05, + "loss": 2.7048, + "step": 14990 + }, + { + "epoch": 0.8703598938160929, + "grad_norm": 0.11553023010492325, + "learning_rate": 1.8112753048679965e-05, + "loss": 2.7161, + "step": 15000 + }, + { + "epoch": 0.8703598938160929, + "eval_loss": 2.687220335006714, + "eval_runtime": 5.3901, + "eval_samples_per_second": 803.32, + "eval_steps_per_second": 1.67, + "step": 15000 + }, + { + "epoch": 0.8709401337453037, + "grad_norm": 0.1153511255979538, + "learning_rate": 1.7953497175423673e-05, + "loss": 2.7144, + "step": 15010 + }, + { + "epoch": 0.8715203736745144, + "grad_norm": 0.11592899262905121, + "learning_rate": 1.7794911616461517e-05, + "loss": 2.7263, + "step": 15020 + }, + { + "epoch": 0.8721006136037251, + "grad_norm": 0.11693062633275986, + "learning_rate": 1.763699695572203e-05, + "loss": 2.7125, + "step": 15030 + }, + { + "epoch": 0.8726808535329359, + "grad_norm": 0.11720691621303558, + "learning_rate": 1.747975377466369e-05, + "loss": 2.7207, + "step": 15040 + }, + { + "epoch": 0.8732610934621466, + "grad_norm": 0.11742518097162247, + "learning_rate": 1.7323182652272173e-05, + "loss": 2.7137, + "step": 15050 + }, + { + "epoch": 0.8738413333913573, + "grad_norm": 0.1138685792684555, + "learning_rate": 1.7167284165058885e-05, + "loss": 2.7092, + "step": 15060 + }, + { + "epoch": 0.874421573320568, + "grad_norm": 0.11519136279821396, + "learning_rate": 1.701205888705837e-05, + "loss": 2.7266, + "step": 15070 + }, + { + "epoch": 0.8750018132497788, + "grad_norm": 0.11361874639987946, + "learning_rate": 1.68575073898263e-05, + "loss": 2.7219, + "step": 15080 + }, + { + "epoch": 0.8755820531789895, + "grad_norm": 0.11410374194383621, + "learning_rate": 1.6703630242437573e-05, + "loss": 2.724, + "step": 15090 + }, + { + "epoch": 0.8761622931082003, + "grad_norm": 0.11599191278219223, + "learning_rate": 1.6550428011483876e-05, + "loss": 2.7236, + "step": 15100 + }, + { + "epoch": 0.876742533037411, + "grad_norm": 0.11562803387641907, + "learning_rate": 1.6397901261071923e-05, + "loss": 2.7228, + "step": 15110 + }, + { + "epoch": 0.8773227729666216, + "grad_norm": 0.11603621393442154, + "learning_rate": 1.624605055282118e-05, + "loss": 2.7152, + "step": 15120 + }, + { + "epoch": 0.8779030128958324, + "grad_norm": 0.11416131258010864, + "learning_rate": 1.6094876445861828e-05, + "loss": 2.7124, + "step": 15130 + }, + { + "epoch": 0.8784832528250431, + "grad_norm": 0.11274771392345428, + "learning_rate": 1.5944379496832873e-05, + "loss": 2.7235, + "step": 15140 + }, + { + "epoch": 0.8790634927542539, + "grad_norm": 0.1134885847568512, + "learning_rate": 1.5794560259879686e-05, + "loss": 2.701, + "step": 15150 + }, + { + "epoch": 0.8796437326834646, + "grad_norm": 0.11914920806884766, + "learning_rate": 1.5645419286652507e-05, + "loss": 2.7114, + "step": 15160 + }, + { + "epoch": 0.8802239726126754, + "grad_norm": 0.11518207937479019, + "learning_rate": 1.5496957126304013e-05, + "loss": 2.7111, + "step": 15170 + }, + { + "epoch": 0.8808042125418861, + "grad_norm": 0.1119842380285263, + "learning_rate": 1.534917432548735e-05, + "loss": 2.7136, + "step": 15180 + }, + { + "epoch": 0.8813844524710968, + "grad_norm": 0.11328744888305664, + "learning_rate": 1.5202071428354414e-05, + "loss": 2.7128, + "step": 15190 + }, + { + "epoch": 0.8819646924003075, + "grad_norm": 0.11526224762201309, + "learning_rate": 1.5055648976553338e-05, + "loss": 2.7206, + "step": 15200 + }, + { + "epoch": 0.8825449323295183, + "grad_norm": 0.11353620141744614, + "learning_rate": 1.4909907509227006e-05, + "loss": 2.7275, + "step": 15210 + }, + { + "epoch": 0.883125172258729, + "grad_norm": 0.11482030898332596, + "learning_rate": 1.4764847563010753e-05, + "loss": 2.7176, + "step": 15220 + }, + { + "epoch": 0.8837054121879397, + "grad_norm": 0.11562719196081161, + "learning_rate": 1.4620469672030479e-05, + "loss": 2.7166, + "step": 15230 + }, + { + "epoch": 0.8842856521171505, + "grad_norm": 0.11470736563205719, + "learning_rate": 1.447677436790078e-05, + "loss": 2.7194, + "step": 15240 + }, + { + "epoch": 0.8848658920463611, + "grad_norm": 0.1143270805478096, + "learning_rate": 1.4333762179722688e-05, + "loss": 2.7086, + "step": 15250 + }, + { + "epoch": 0.8854461319755719, + "grad_norm": 0.11537613719701767, + "learning_rate": 1.4191433634082152e-05, + "loss": 2.7165, + "step": 15260 + }, + { + "epoch": 0.8860263719047826, + "grad_norm": 0.11545541882514954, + "learning_rate": 1.4049789255047786e-05, + "loss": 2.7135, + "step": 15270 + }, + { + "epoch": 0.8866066118339934, + "grad_norm": 0.11168920993804932, + "learning_rate": 1.3908829564169013e-05, + "loss": 2.7101, + "step": 15280 + }, + { + "epoch": 0.8871868517632041, + "grad_norm": 0.11263593286275864, + "learning_rate": 1.3768555080474189e-05, + "loss": 2.7157, + "step": 15290 + }, + { + "epoch": 0.8877670916924149, + "grad_norm": 0.1139439269900322, + "learning_rate": 1.3628966320468595e-05, + "loss": 2.7095, + "step": 15300 + }, + { + "epoch": 0.8883473316216255, + "grad_norm": 0.11641982942819595, + "learning_rate": 1.3490063798132802e-05, + "loss": 2.7105, + "step": 15310 + }, + { + "epoch": 0.8889275715508362, + "grad_norm": 0.11295609176158905, + "learning_rate": 1.335184802492031e-05, + "loss": 2.7104, + "step": 15320 + }, + { + "epoch": 0.889507811480047, + "grad_norm": 0.11365869641304016, + "learning_rate": 1.3214319509756158e-05, + "loss": 2.7151, + "step": 15330 + }, + { + "epoch": 0.8900880514092577, + "grad_norm": 0.11353792250156403, + "learning_rate": 1.3077478759034733e-05, + "loss": 2.7207, + "step": 15340 + }, + { + "epoch": 0.8906682913384685, + "grad_norm": 0.11343677341938019, + "learning_rate": 1.294132627661797e-05, + "loss": 2.7095, + "step": 15350 + }, + { + "epoch": 0.8912485312676792, + "grad_norm": 0.11483877897262573, + "learning_rate": 1.280586256383367e-05, + "loss": 2.7138, + "step": 15360 + }, + { + "epoch": 0.89182877119689, + "grad_norm": 0.11700621247291565, + "learning_rate": 1.2671088119473284e-05, + "loss": 2.7164, + "step": 15370 + }, + { + "epoch": 0.8924090111261006, + "grad_norm": 0.11624756455421448, + "learning_rate": 1.253700343979054e-05, + "loss": 2.7064, + "step": 15380 + }, + { + "epoch": 0.8929892510553114, + "grad_norm": 0.1139611005783081, + "learning_rate": 1.2403609018499219e-05, + "loss": 2.7125, + "step": 15390 + }, + { + "epoch": 0.8935694909845221, + "grad_norm": 0.11446714401245117, + "learning_rate": 1.2270905346771577e-05, + "loss": 2.7072, + "step": 15400 + }, + { + "epoch": 0.8941497309137328, + "grad_norm": 0.11538238823413849, + "learning_rate": 1.2138892913236444e-05, + "loss": 2.718, + "step": 15410 + }, + { + "epoch": 0.8947299708429436, + "grad_norm": 0.11417897045612335, + "learning_rate": 1.2007572203977369e-05, + "loss": 2.7022, + "step": 15420 + }, + { + "epoch": 0.8953102107721543, + "grad_norm": 0.11221955716609955, + "learning_rate": 1.1876943702531052e-05, + "loss": 2.7063, + "step": 15430 + }, + { + "epoch": 0.895890450701365, + "grad_norm": 0.11339222639799118, + "learning_rate": 1.1747007889885252e-05, + "loss": 2.7063, + "step": 15440 + }, + { + "epoch": 0.8964706906305757, + "grad_norm": 0.11208420246839523, + "learning_rate": 1.1617765244477285e-05, + "loss": 2.7113, + "step": 15450 + }, + { + "epoch": 0.8970509305597865, + "grad_norm": 0.11443324387073517, + "learning_rate": 1.148921624219208e-05, + "loss": 2.7151, + "step": 15460 + }, + { + "epoch": 0.8976311704889972, + "grad_norm": 0.1121024414896965, + "learning_rate": 1.1361361356360523e-05, + "loss": 2.7105, + "step": 15470 + }, + { + "epoch": 0.898211410418208, + "grad_norm": 0.11098407953977585, + "learning_rate": 1.1234201057757743e-05, + "loss": 2.7157, + "step": 15480 + }, + { + "epoch": 0.8987916503474187, + "grad_norm": 0.11558841168880463, + "learning_rate": 1.110773581460125e-05, + "loss": 2.7207, + "step": 15490 + }, + { + "epoch": 0.8993718902766293, + "grad_norm": 0.11299290508031845, + "learning_rate": 1.0981966092549311e-05, + "loss": 2.7231, + "step": 15500 + }, + { + "epoch": 0.8999521302058401, + "grad_norm": 0.11334340274333954, + "learning_rate": 1.0856892354699222e-05, + "loss": 2.7113, + "step": 15510 + }, + { + "epoch": 0.9005323701350508, + "grad_norm": 0.11435014754533768, + "learning_rate": 1.0732515061585613e-05, + "loss": 2.7142, + "step": 15520 + }, + { + "epoch": 0.9011126100642616, + "grad_norm": 0.11135628074407578, + "learning_rate": 1.0608834671178635e-05, + "loss": 2.7064, + "step": 15530 + }, + { + "epoch": 0.9016928499934723, + "grad_norm": 0.11612440645694733, + "learning_rate": 1.0485851638882537e-05, + "loss": 2.7138, + "step": 15540 + }, + { + "epoch": 0.9022730899226831, + "grad_norm": 0.1127479076385498, + "learning_rate": 1.0363566417533687e-05, + "loss": 2.7149, + "step": 15550 + }, + { + "epoch": 0.9028533298518937, + "grad_norm": 0.11329977214336395, + "learning_rate": 1.0241979457399064e-05, + "loss": 2.7056, + "step": 15560 + }, + { + "epoch": 0.9034335697811045, + "grad_norm": 0.1118236631155014, + "learning_rate": 1.0121091206174615e-05, + "loss": 2.7131, + "step": 15570 + }, + { + "epoch": 0.9040138097103152, + "grad_norm": 0.11316058784723282, + "learning_rate": 1.0000902108983523e-05, + "loss": 2.7104, + "step": 15580 + }, + { + "epoch": 0.9045940496395259, + "grad_norm": 0.11303921043872833, + "learning_rate": 9.881412608374629e-06, + "loss": 2.7026, + "step": 15590 + }, + { + "epoch": 0.9051742895687367, + "grad_norm": 0.11121919751167297, + "learning_rate": 9.762623144320838e-06, + "loss": 2.7049, + "step": 15600 + }, + { + "epoch": 0.9057545294979474, + "grad_norm": 0.11181233078241348, + "learning_rate": 9.644534154217354e-06, + "loss": 2.7145, + "step": 15610 + }, + { + "epoch": 0.9063347694271582, + "grad_norm": 0.11229850351810455, + "learning_rate": 9.527146072880254e-06, + "loss": 2.7089, + "step": 15620 + }, + { + "epoch": 0.9069150093563688, + "grad_norm": 0.11836584657430649, + "learning_rate": 9.410459332544697e-06, + "loss": 2.7143, + "step": 15630 + }, + { + "epoch": 0.9074952492855796, + "grad_norm": 0.11599016189575195, + "learning_rate": 9.294474362863525e-06, + "loss": 2.7071, + "step": 15640 + }, + { + "epoch": 0.9080754892147903, + "grad_norm": 0.11280685663223267, + "learning_rate": 9.179191590905523e-06, + "loss": 2.7099, + "step": 15650 + }, + { + "epoch": 0.9086557291440011, + "grad_norm": 0.11349350959062576, + "learning_rate": 9.064611441153935e-06, + "loss": 2.7031, + "step": 15660 + }, + { + "epoch": 0.9092359690732118, + "grad_norm": 0.1137542724609375, + "learning_rate": 8.950734335504907e-06, + "loss": 2.6978, + "step": 15670 + }, + { + "epoch": 0.9098162090024225, + "grad_norm": 0.11099807173013687, + "learning_rate": 8.837560693265844e-06, + "loss": 2.7104, + "step": 15680 + }, + { + "epoch": 0.9103964489316332, + "grad_norm": 0.11295212060213089, + "learning_rate": 8.725090931153968e-06, + "loss": 2.7144, + "step": 15690 + }, + { + "epoch": 0.9109766888608439, + "grad_norm": 0.11119643598794937, + "learning_rate": 8.613325463294675e-06, + "loss": 2.7047, + "step": 15700 + }, + { + "epoch": 0.9115569287900547, + "grad_norm": 0.11346277594566345, + "learning_rate": 8.502264701220198e-06, + "loss": 2.7137, + "step": 15710 + }, + { + "epoch": 0.9121371687192654, + "grad_norm": 0.11389115452766418, + "learning_rate": 8.391909053867863e-06, + "loss": 2.7077, + "step": 15720 + }, + { + "epoch": 0.9127174086484762, + "grad_norm": 0.11094717681407928, + "learning_rate": 8.282258927578723e-06, + "loss": 2.707, + "step": 15730 + }, + { + "epoch": 0.9132976485776869, + "grad_norm": 0.1122366338968277, + "learning_rate": 8.173314726096038e-06, + "loss": 2.7167, + "step": 15740 + }, + { + "epoch": 0.9138778885068976, + "grad_norm": 0.11315654963254929, + "learning_rate": 8.065076850563746e-06, + "loss": 2.7104, + "step": 15750 + }, + { + "epoch": 0.9144581284361083, + "grad_norm": 0.1108250766992569, + "learning_rate": 7.957545699525093e-06, + "loss": 2.7144, + "step": 15760 + }, + { + "epoch": 0.915038368365319, + "grad_norm": 0.11187420040369034, + "learning_rate": 7.85072166892098e-06, + "loss": 2.7163, + "step": 15770 + }, + { + "epoch": 0.9156186082945298, + "grad_norm": 0.11263624578714371, + "learning_rate": 7.744605152088724e-06, + "loss": 2.7184, + "step": 15780 + }, + { + "epoch": 0.9161988482237405, + "grad_norm": 0.11211346089839935, + "learning_rate": 7.639196539760462e-06, + "loss": 2.7106, + "step": 15790 + }, + { + "epoch": 0.9167790881529513, + "grad_norm": 0.11310411244630814, + "learning_rate": 7.534496220061682e-06, + "loss": 2.711, + "step": 15800 + }, + { + "epoch": 0.917359328082162, + "grad_norm": 0.11148010939359665, + "learning_rate": 7.430504578510023e-06, + "loss": 2.716, + "step": 15810 + }, + { + "epoch": 0.9179395680113727, + "grad_norm": 0.11117308586835861, + "learning_rate": 7.327221998013522e-06, + "loss": 2.7111, + "step": 15820 + }, + { + "epoch": 0.9185198079405834, + "grad_norm": 0.1113533303141594, + "learning_rate": 7.224648858869487e-06, + "loss": 2.7152, + "step": 15830 + }, + { + "epoch": 0.9191000478697942, + "grad_norm": 0.1133064553141594, + "learning_rate": 7.122785538762999e-06, + "loss": 2.7071, + "step": 15840 + }, + { + "epoch": 0.9196802877990049, + "grad_norm": 0.11209236830472946, + "learning_rate": 7.021632412765411e-06, + "loss": 2.7113, + "step": 15850 + }, + { + "epoch": 0.9202605277282156, + "grad_norm": 0.11444966495037079, + "learning_rate": 6.9211898533331874e-06, + "loss": 2.7072, + "step": 15860 + }, + { + "epoch": 0.9208407676574264, + "grad_norm": 0.1107277199625969, + "learning_rate": 6.821458230306288e-06, + "loss": 2.7075, + "step": 15870 + }, + { + "epoch": 0.921421007586637, + "grad_norm": 0.11145395040512085, + "learning_rate": 6.722437910907098e-06, + "loss": 2.7063, + "step": 15880 + }, + { + "epoch": 0.9220012475158478, + "grad_norm": 0.11115839332342148, + "learning_rate": 6.6241292597386764e-06, + "loss": 2.7071, + "step": 15890 + }, + { + "epoch": 0.9225814874450585, + "grad_norm": 0.114626444876194, + "learning_rate": 6.5265326387838885e-06, + "loss": 2.7121, + "step": 15900 + }, + { + "epoch": 0.9231617273742693, + "grad_norm": 0.11295740306377411, + "learning_rate": 6.429648407403655e-06, + "loss": 2.7101, + "step": 15910 + }, + { + "epoch": 0.92374196730348, + "grad_norm": 0.11063504219055176, + "learning_rate": 6.333476922335857e-06, + "loss": 2.7043, + "step": 15920 + }, + { + "epoch": 0.9243222072326908, + "grad_norm": 0.11329693347215652, + "learning_rate": 6.238018537694057e-06, + "loss": 2.7148, + "step": 15930 + }, + { + "epoch": 0.9249024471619014, + "grad_norm": 0.11279025673866272, + "learning_rate": 6.143273604965915e-06, + "loss": 2.7225, + "step": 15940 + }, + { + "epoch": 0.9254826870911121, + "grad_norm": 0.11234597116708755, + "learning_rate": 6.049242473012284e-06, + "loss": 2.71, + "step": 15950 + }, + { + "epoch": 0.9260629270203229, + "grad_norm": 0.11071603745222092, + "learning_rate": 5.955925488065605e-06, + "loss": 2.7065, + "step": 15960 + }, + { + "epoch": 0.9266431669495336, + "grad_norm": 0.11233749240636826, + "learning_rate": 5.863322993728781e-06, + "loss": 2.7108, + "step": 15970 + }, + { + "epoch": 0.9272234068787444, + "grad_norm": 0.1108274981379509, + "learning_rate": 5.771435330973973e-06, + "loss": 2.6998, + "step": 15980 + }, + { + "epoch": 0.9278036468079551, + "grad_norm": 0.11140532791614532, + "learning_rate": 5.6802628381410705e-06, + "loss": 2.7102, + "step": 15990 + }, + { + "epoch": 0.9283838867371658, + "grad_norm": 0.110744908452034, + "learning_rate": 5.5898058509368245e-06, + "loss": 2.7094, + "step": 16000 + }, + { + "epoch": 0.9283838867371658, + "eval_loss": 2.6828300952911377, + "eval_runtime": 5.3865, + "eval_samples_per_second": 803.858, + "eval_steps_per_second": 1.671, + "step": 16000 + }, + { + "epoch": 0.9289641266663765, + "grad_norm": 0.11381204426288605, + "learning_rate": 5.500064702433294e-06, + "loss": 2.7111, + "step": 16010 + }, + { + "epoch": 0.9295443665955873, + "grad_norm": 0.11151892691850662, + "learning_rate": 5.411039723066802e-06, + "loss": 2.714, + "step": 16020 + }, + { + "epoch": 0.930124606524798, + "grad_norm": 0.1121894121170044, + "learning_rate": 5.3227312406366915e-06, + "loss": 2.7176, + "step": 16030 + }, + { + "epoch": 0.9307048464540087, + "grad_norm": 0.11182450503110886, + "learning_rate": 5.235139580303949e-06, + "loss": 2.7077, + "step": 16040 + }, + { + "epoch": 0.9312850863832195, + "grad_norm": 0.11129864305257797, + "learning_rate": 5.148265064590341e-06, + "loss": 2.7088, + "step": 16050 + }, + { + "epoch": 0.9318653263124302, + "grad_norm": 0.1094869002699852, + "learning_rate": 5.062108013376876e-06, + "loss": 2.7157, + "step": 16060 + }, + { + "epoch": 0.9324455662416409, + "grad_norm": 0.1108771562576294, + "learning_rate": 4.976668743902857e-06, + "loss": 2.7095, + "step": 16070 + }, + { + "epoch": 0.9330258061708516, + "grad_norm": 0.1098882406949997, + "learning_rate": 4.891947570764655e-06, + "loss": 2.7085, + "step": 16080 + }, + { + "epoch": 0.9336060461000624, + "grad_norm": 0.11074597388505936, + "learning_rate": 4.807944805914444e-06, + "loss": 2.7089, + "step": 16090 + }, + { + "epoch": 0.9341862860292731, + "grad_norm": 0.11346107721328735, + "learning_rate": 4.724660758659272e-06, + "loss": 2.6967, + "step": 16100 + }, + { + "epoch": 0.9347665259584839, + "grad_norm": 0.11357604712247849, + "learning_rate": 4.64209573565968e-06, + "loss": 2.7057, + "step": 16110 + }, + { + "epoch": 0.9353467658876946, + "grad_norm": 0.11271534860134125, + "learning_rate": 4.560250040928748e-06, + "loss": 2.7033, + "step": 16120 + }, + { + "epoch": 0.9359270058169052, + "grad_norm": 0.11321555078029633, + "learning_rate": 4.479123975830879e-06, + "loss": 2.7137, + "step": 16130 + }, + { + "epoch": 0.936507245746116, + "grad_norm": 0.11405043303966522, + "learning_rate": 4.398717839080746e-06, + "loss": 2.7191, + "step": 16140 + }, + { + "epoch": 0.9370874856753267, + "grad_norm": 0.11109951883554459, + "learning_rate": 4.319031926742234e-06, + "loss": 2.7126, + "step": 16150 + }, + { + "epoch": 0.9376677256045375, + "grad_norm": 0.11525629460811615, + "learning_rate": 4.240066532227105e-06, + "loss": 2.7033, + "step": 16160 + }, + { + "epoch": 0.9382479655337482, + "grad_norm": 0.1110944077372551, + "learning_rate": 4.161821946294309e-06, + "loss": 2.7133, + "step": 16170 + }, + { + "epoch": 0.938828205462959, + "grad_norm": 0.1128690242767334, + "learning_rate": 4.084298457048563e-06, + "loss": 2.7172, + "step": 16180 + }, + { + "epoch": 0.9394084453921696, + "grad_norm": 0.11161357909440994, + "learning_rate": 4.007496349939466e-06, + "loss": 2.7159, + "step": 16190 + }, + { + "epoch": 0.9399886853213804, + "grad_norm": 0.11112505942583084, + "learning_rate": 3.931415907760494e-06, + "loss": 2.7015, + "step": 16200 + }, + { + "epoch": 0.9405689252505911, + "grad_norm": 0.11174745112657547, + "learning_rate": 3.856057410647695e-06, + "loss": 2.7141, + "step": 16210 + }, + { + "epoch": 0.9411491651798018, + "grad_norm": 0.11169620603322983, + "learning_rate": 3.781421136079044e-06, + "loss": 2.7126, + "step": 16220 + }, + { + "epoch": 0.9417294051090126, + "grad_norm": 0.11062929034233093, + "learning_rate": 3.707507358873086e-06, + "loss": 2.7104, + "step": 16230 + }, + { + "epoch": 0.9423096450382233, + "grad_norm": 0.11094637215137482, + "learning_rate": 3.634316351188094e-06, + "loss": 2.7094, + "step": 16240 + }, + { + "epoch": 0.9428898849674341, + "grad_norm": 0.11300352960824966, + "learning_rate": 3.5618483825210048e-06, + "loss": 2.7006, + "step": 16250 + }, + { + "epoch": 0.9434701248966447, + "grad_norm": 0.11278436332941055, + "learning_rate": 3.4901037197064834e-06, + "loss": 2.7102, + "step": 16260 + }, + { + "epoch": 0.9440503648258555, + "grad_norm": 0.10999085009098053, + "learning_rate": 3.419082626915926e-06, + "loss": 2.7085, + "step": 16270 + }, + { + "epoch": 0.9446306047550662, + "grad_norm": 0.11130973696708679, + "learning_rate": 3.3487853656563927e-06, + "loss": 2.709, + "step": 16280 + }, + { + "epoch": 0.945210844684277, + "grad_norm": 0.11214598268270493, + "learning_rate": 3.279212194769787e-06, + "loss": 2.7168, + "step": 16290 + }, + { + "epoch": 0.9457910846134877, + "grad_norm": 0.11364555358886719, + "learning_rate": 3.2103633704318124e-06, + "loss": 2.7116, + "step": 16300 + }, + { + "epoch": 0.9463713245426985, + "grad_norm": 0.11130227893590927, + "learning_rate": 3.142239146151016e-06, + "loss": 2.7174, + "step": 16310 + }, + { + "epoch": 0.9469515644719091, + "grad_norm": 0.1102483943104744, + "learning_rate": 3.07483977276799e-06, + "loss": 2.71, + "step": 16320 + }, + { + "epoch": 0.9475318044011198, + "grad_norm": 0.11199984699487686, + "learning_rate": 3.0081654984542628e-06, + "loss": 2.7217, + "step": 16330 + }, + { + "epoch": 0.9481120443303306, + "grad_norm": 0.11214699596166611, + "learning_rate": 2.9422165687114754e-06, + "loss": 2.7177, + "step": 16340 + }, + { + "epoch": 0.9486922842595413, + "grad_norm": 0.11154871433973312, + "learning_rate": 2.8769932263705167e-06, + "loss": 2.7183, + "step": 16350 + }, + { + "epoch": 0.9492725241887521, + "grad_norm": 0.11083105206489563, + "learning_rate": 2.8124957115905683e-06, + "loss": 2.7145, + "step": 16360 + }, + { + "epoch": 0.9498527641179628, + "grad_norm": 0.11013077944517136, + "learning_rate": 2.7487242618581933e-06, + "loss": 2.7047, + "step": 16370 + }, + { + "epoch": 0.9504330040471735, + "grad_norm": 0.11090140789747238, + "learning_rate": 2.6856791119866275e-06, + "loss": 2.7061, + "step": 16380 + }, + { + "epoch": 0.9510132439763842, + "grad_norm": 0.11299975216388702, + "learning_rate": 2.623360494114646e-06, + "loss": 2.7003, + "step": 16390 + }, + { + "epoch": 0.951593483905595, + "grad_norm": 0.1101190522313118, + "learning_rate": 2.5617686377059637e-06, + "loss": 2.706, + "step": 16400 + }, + { + "epoch": 0.9521737238348057, + "grad_norm": 0.11057887226343155, + "learning_rate": 2.5009037695482574e-06, + "loss": 2.7179, + "step": 16410 + }, + { + "epoch": 0.9527539637640164, + "grad_norm": 0.1128411814570427, + "learning_rate": 2.4407661137523243e-06, + "loss": 2.7082, + "step": 16420 + }, + { + "epoch": 0.9533342036932272, + "grad_norm": 0.11165603250265121, + "learning_rate": 2.3813558917513025e-06, + "loss": 2.7253, + "step": 16430 + }, + { + "epoch": 0.9539144436224378, + "grad_norm": 0.11112351715564728, + "learning_rate": 2.322673322299873e-06, + "loss": 2.7214, + "step": 16440 + }, + { + "epoch": 0.9544946835516486, + "grad_norm": 0.11173354089260101, + "learning_rate": 2.2647186214734162e-06, + "loss": 2.709, + "step": 16450 + }, + { + "epoch": 0.9550749234808593, + "grad_norm": 0.1092383936047554, + "learning_rate": 2.207492002667211e-06, + "loss": 2.7124, + "step": 16460 + }, + { + "epoch": 0.9556551634100701, + "grad_norm": 0.11275044083595276, + "learning_rate": 2.150993676595614e-06, + "loss": 2.7105, + "step": 16470 + }, + { + "epoch": 0.9562354033392808, + "grad_norm": 0.11197572201490402, + "learning_rate": 2.095223851291439e-06, + "loss": 2.7034, + "step": 16480 + }, + { + "epoch": 0.9568156432684916, + "grad_norm": 0.11042193323373795, + "learning_rate": 2.0401827321049783e-06, + "loss": 2.7091, + "step": 16490 + }, + { + "epoch": 0.9573958831977023, + "grad_norm": 0.11012552678585052, + "learning_rate": 1.9858705217034478e-06, + "loss": 2.7145, + "step": 16500 + }, + { + "epoch": 0.9579761231269129, + "grad_norm": 0.11172161996364594, + "learning_rate": 1.9322874200700558e-06, + "loss": 2.7217, + "step": 16510 + }, + { + "epoch": 0.9585563630561237, + "grad_norm": 0.11069660633802414, + "learning_rate": 1.8794336245034238e-06, + "loss": 2.7084, + "step": 16520 + }, + { + "epoch": 0.9591366029853344, + "grad_norm": 0.11089422553777695, + "learning_rate": 1.8273093296167443e-06, + "loss": 2.71, + "step": 16530 + }, + { + "epoch": 0.9597168429145452, + "grad_norm": 0.11164766550064087, + "learning_rate": 1.7759147273371136e-06, + "loss": 2.7163, + "step": 16540 + }, + { + "epoch": 0.9602970828437559, + "grad_norm": 0.1093573048710823, + "learning_rate": 1.7252500069048882e-06, + "loss": 2.7069, + "step": 16550 + }, + { + "epoch": 0.9608773227729667, + "grad_norm": 0.11053937673568726, + "learning_rate": 1.6753153548728417e-06, + "loss": 2.7067, + "step": 16560 + }, + { + "epoch": 0.9614575627021773, + "grad_norm": 0.11065319180488586, + "learning_rate": 1.6261109551056307e-06, + "loss": 2.7054, + "step": 16570 + }, + { + "epoch": 0.962037802631388, + "grad_norm": 0.11450429260730743, + "learning_rate": 1.5776369887789521e-06, + "loss": 2.7046, + "step": 16580 + }, + { + "epoch": 0.9626180425605988, + "grad_norm": 0.1109624058008194, + "learning_rate": 1.529893634379076e-06, + "loss": 2.7088, + "step": 16590 + }, + { + "epoch": 0.9631982824898095, + "grad_norm": 0.1117752194404602, + "learning_rate": 1.4828810677020244e-06, + "loss": 2.7108, + "step": 16600 + }, + { + "epoch": 0.9637785224190203, + "grad_norm": 0.11164771765470505, + "learning_rate": 1.4365994618529499e-06, + "loss": 2.7099, + "step": 16610 + }, + { + "epoch": 0.964358762348231, + "grad_norm": 0.11132363975048065, + "learning_rate": 1.3910489872456468e-06, + "loss": 2.7074, + "step": 16620 + }, + { + "epoch": 0.9649390022774417, + "grad_norm": 0.11013256758451462, + "learning_rate": 1.3462298116016847e-06, + "loss": 2.7112, + "step": 16630 + }, + { + "epoch": 0.9655192422066524, + "grad_norm": 0.11211226135492325, + "learning_rate": 1.3021420999499656e-06, + "loss": 2.7194, + "step": 16640 + }, + { + "epoch": 0.9660994821358632, + "grad_norm": 0.11033762246370316, + "learning_rate": 1.258786014626101e-06, + "loss": 2.7058, + "step": 16650 + }, + { + "epoch": 0.9666797220650739, + "grad_norm": 0.11029258370399475, + "learning_rate": 1.216161715271702e-06, + "loss": 2.698, + "step": 16660 + }, + { + "epoch": 0.9672599619942847, + "grad_norm": 0.1115414947271347, + "learning_rate": 1.1742693588339126e-06, + "loss": 2.7096, + "step": 16670 + }, + { + "epoch": 0.9678402019234954, + "grad_norm": 0.11232610791921616, + "learning_rate": 1.1331090995647665e-06, + "loss": 2.7081, + "step": 16680 + }, + { + "epoch": 0.9684204418527061, + "grad_norm": 0.11218692362308502, + "learning_rate": 1.0926810890206528e-06, + "loss": 2.7094, + "step": 16690 + }, + { + "epoch": 0.9690006817819168, + "grad_norm": 0.1100686714053154, + "learning_rate": 1.0529854760617853e-06, + "loss": 2.7133, + "step": 16700 + }, + { + "epoch": 0.9695809217111275, + "grad_norm": 0.11020591855049133, + "learning_rate": 1.0140224068515113e-06, + "loss": 2.7086, + "step": 16710 + }, + { + "epoch": 0.9701611616403383, + "grad_norm": 0.1117192879319191, + "learning_rate": 9.757920248559815e-07, + "loss": 2.7066, + "step": 16720 + }, + { + "epoch": 0.970741401569549, + "grad_norm": 0.10976511240005493, + "learning_rate": 9.382944708434149e-07, + "loss": 2.7279, + "step": 16730 + }, + { + "epoch": 0.9713216414987598, + "grad_norm": 0.1115812286734581, + "learning_rate": 9.01529882883767e-07, + "loss": 2.7104, + "step": 16740 + }, + { + "epoch": 0.9719018814279705, + "grad_norm": 0.11191580444574356, + "learning_rate": 8.654983963481078e-07, + "loss": 2.7109, + "step": 16750 + }, + { + "epoch": 0.9724821213571812, + "grad_norm": 0.11055436730384827, + "learning_rate": 8.302001439081108e-07, + "loss": 2.7166, + "step": 16760 + }, + { + "epoch": 0.9730623612863919, + "grad_norm": 0.1103782057762146, + "learning_rate": 7.956352555356761e-07, + "loss": 2.7106, + "step": 16770 + }, + { + "epoch": 0.9736426012156026, + "grad_norm": 0.11125820130109787, + "learning_rate": 7.618038585023301e-07, + "loss": 2.7046, + "step": 16780 + }, + { + "epoch": 0.9742228411448134, + "grad_norm": 0.10956889390945435, + "learning_rate": 7.287060773788268e-07, + "loss": 2.7138, + "step": 16790 + }, + { + "epoch": 0.9748030810740241, + "grad_norm": 0.10879559069871902, + "learning_rate": 6.96342034034636e-07, + "loss": 2.7084, + "step": 16800 + }, + { + "epoch": 0.9753833210032349, + "grad_norm": 0.11071236431598663, + "learning_rate": 6.647118476375891e-07, + "loss": 2.708, + "step": 16810 + }, + { + "epoch": 0.9759635609324455, + "grad_norm": 0.1103687658905983, + "learning_rate": 6.338156346533452e-07, + "loss": 2.7076, + "step": 16820 + }, + { + "epoch": 0.9765438008616563, + "grad_norm": 0.11137609928846359, + "learning_rate": 6.036535088449702e-07, + "loss": 2.7063, + "step": 16830 + }, + { + "epoch": 0.977124040790867, + "grad_norm": 0.10921121388673782, + "learning_rate": 5.742255812726027e-07, + "loss": 2.7155, + "step": 16840 + }, + { + "epoch": 0.9777042807200778, + "grad_norm": 0.10973076522350311, + "learning_rate": 5.455319602929221e-07, + "loss": 2.7143, + "step": 16850 + }, + { + "epoch": 0.9782845206492885, + "grad_norm": 0.11031708121299744, + "learning_rate": 5.175727515588591e-07, + "loss": 2.7045, + "step": 16860 + }, + { + "epoch": 0.9788647605784992, + "grad_norm": 0.10978976637125015, + "learning_rate": 4.903480580191744e-07, + "loss": 2.7021, + "step": 16870 + }, + { + "epoch": 0.9794450005077099, + "grad_norm": 0.11084671318531036, + "learning_rate": 4.638579799179921e-07, + "loss": 2.7039, + "step": 16880 + }, + { + "epoch": 0.9800252404369206, + "grad_norm": 0.11046291887760162, + "learning_rate": 4.381026147945999e-07, + "loss": 2.7127, + "step": 16890 + }, + { + "epoch": 0.9806054803661314, + "grad_norm": 0.10937215387821198, + "learning_rate": 4.130820574829386e-07, + "loss": 2.7184, + "step": 16900 + }, + { + "epoch": 0.9811857202953421, + "grad_norm": 0.11020946502685547, + "learning_rate": 3.887964001113131e-07, + "loss": 2.717, + "step": 16910 + }, + { + "epoch": 0.9817659602245529, + "grad_norm": 0.11252225935459137, + "learning_rate": 3.652457321020597e-07, + "loss": 2.7121, + "step": 16920 + }, + { + "epoch": 0.9823462001537636, + "grad_norm": 0.11090285331010818, + "learning_rate": 3.4243014017119045e-07, + "loss": 2.7113, + "step": 16930 + }, + { + "epoch": 0.9829264400829744, + "grad_norm": 0.11007850617170334, + "learning_rate": 3.203497083281493e-07, + "loss": 2.7147, + "step": 16940 + }, + { + "epoch": 0.983506680012185, + "grad_norm": 0.11209884285926819, + "learning_rate": 2.9900451787534533e-07, + "loss": 2.7086, + "step": 16950 + }, + { + "epoch": 0.9840869199413957, + "grad_norm": 0.1093859076499939, + "learning_rate": 2.783946474080423e-07, + "loss": 2.708, + "step": 16960 + }, + { + "epoch": 0.9846671598706065, + "grad_norm": 0.11188683658838272, + "learning_rate": 2.5852017281393636e-07, + "loss": 2.7107, + "step": 16970 + }, + { + "epoch": 0.9852473997998172, + "grad_norm": 0.10943835973739624, + "learning_rate": 2.393811672729118e-07, + "loss": 2.7148, + "step": 16980 + }, + { + "epoch": 0.985827639729028, + "grad_norm": 0.10962080210447311, + "learning_rate": 2.2097770125679705e-07, + "loss": 2.7067, + "step": 16990 + }, + { + "epoch": 0.9864078796582387, + "grad_norm": 0.11074183881282806, + "learning_rate": 2.0330984252909801e-07, + "loss": 2.7068, + "step": 17000 + }, + { + "epoch": 0.9864078796582387, + "eval_loss": 2.681455612182617, + "eval_runtime": 5.3878, + "eval_samples_per_second": 803.669, + "eval_steps_per_second": 1.67, + "step": 17000 + }, + { + "epoch": 0.9869881195874494, + "grad_norm": 0.11070505529642105, + "learning_rate": 1.8637765614468728e-07, + "loss": 2.7131, + "step": 17010 + }, + { + "epoch": 0.9875683595166601, + "grad_norm": 0.10992126911878586, + "learning_rate": 1.701812044496931e-07, + "loss": 2.7069, + "step": 17020 + }, + { + "epoch": 0.9881485994458709, + "grad_norm": 0.10988050699234009, + "learning_rate": 1.5472054708112193e-07, + "loss": 2.7057, + "step": 17030 + }, + { + "epoch": 0.9887288393750816, + "grad_norm": 0.10954176634550095, + "learning_rate": 1.3999574096672518e-07, + "loss": 2.7031, + "step": 17040 + }, + { + "epoch": 0.9893090793042923, + "grad_norm": 0.10873284935951233, + "learning_rate": 1.2600684032479936e-07, + "loss": 2.7103, + "step": 17050 + }, + { + "epoch": 0.9898893192335031, + "grad_norm": 0.10949879884719849, + "learning_rate": 1.1275389666391967e-07, + "loss": 2.716, + "step": 17060 + }, + { + "epoch": 0.9904695591627137, + "grad_norm": 0.11077286303043365, + "learning_rate": 1.0023695878285111e-07, + "loss": 2.6973, + "step": 17070 + }, + { + "epoch": 0.9910497990919245, + "grad_norm": 0.10955335944890976, + "learning_rate": 8.845607277021551e-08, + "loss": 2.7106, + "step": 17080 + }, + { + "epoch": 0.9916300390211352, + "grad_norm": 0.11161547154188156, + "learning_rate": 7.741128200453584e-08, + "loss": 2.7117, + "step": 17090 + }, + { + "epoch": 0.992210278950346, + "grad_norm": 0.11127256602048874, + "learning_rate": 6.710262715383664e-08, + "loss": 2.721, + "step": 17100 + }, + { + "epoch": 0.9927905188795567, + "grad_norm": 0.10978476703166962, + "learning_rate": 5.7530146175688305e-08, + "loss": 2.7057, + "step": 17110 + }, + { + "epoch": 0.9933707588087675, + "grad_norm": 0.10954172164201736, + "learning_rate": 4.869387431696293e-08, + "loss": 2.7204, + "step": 17120 + }, + { + "epoch": 0.9939509987379782, + "grad_norm": 0.10985864698886871, + "learning_rate": 4.059384411372325e-08, + "loss": 2.7082, + "step": 17130 + }, + { + "epoch": 0.9945312386671888, + "grad_norm": 0.11031010746955872, + "learning_rate": 3.323008539115602e-08, + "loss": 2.7154, + "step": 17140 + }, + { + "epoch": 0.9951114785963996, + "grad_norm": 0.10947979241609573, + "learning_rate": 2.660262526339441e-08, + "loss": 2.7108, + "step": 17150 + }, + { + "epoch": 0.9956917185256103, + "grad_norm": 0.11240995675325394, + "learning_rate": 2.0711488133406954e-08, + "loss": 2.7082, + "step": 17160 + }, + { + "epoch": 0.9962719584548211, + "grad_norm": 0.1106034591794014, + "learning_rate": 1.5556695693019763e-08, + "loss": 2.7135, + "step": 17170 + }, + { + "epoch": 0.9968521983840318, + "grad_norm": 0.11040083318948746, + "learning_rate": 1.113826692267228e-08, + "loss": 2.706, + "step": 17180 + }, + { + "epoch": 0.9974324383132426, + "grad_norm": 0.1106574684381485, + "learning_rate": 7.4562180915283e-09, + "loss": 2.7083, + "step": 17190 + }, + { + "epoch": 0.9980126782424532, + "grad_norm": 0.10932475328445435, + "learning_rate": 4.510562757231718e-09, + "loss": 2.7048, + "step": 17200 + }, + { + "epoch": 0.998592918171664, + "grad_norm": 0.11014498770236969, + "learning_rate": 2.30131176603976e-09, + "loss": 2.7159, + "step": 17210 + }, + { + "epoch": 0.9991731581008747, + "grad_norm": 0.10927695780992508, + "learning_rate": 8.284732526231409e-10, + "loss": 2.709, + "step": 17220 + }, + { + "epoch": 0.9997533980300854, + "grad_norm": 0.11252807825803757, + "learning_rate": 9.205264011047376e-11, + "loss": 2.7063, + "step": 17230 + }, + { + "epoch": 0.9999854940017697, + "step": 17234, + "total_flos": 8.768782702139251e+19, + "train_loss": 3.0425821965374635, + "train_runtime": 37582.8145, + "train_samples_per_second": 234.783, + "train_steps_per_second": 0.459 + } + ], + "logging_steps": 10, + "max_steps": 17234, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.768782702139251e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}