{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.904895484571532, "eval_steps": 5000, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003619581938286128, "grad_norm": 1.063517451286316, "learning_rate": 1.0858418290402365e-06, "loss": 10.3292, "step": 20 }, { "epoch": 0.0007239163876572256, "grad_norm": 0.7639716267585754, "learning_rate": 2.171683658080473e-06, "loss": 10.3095, "step": 40 }, { "epoch": 0.0010858745814858383, "grad_norm": 0.5479791760444641, "learning_rate": 3.2575254871207094e-06, "loss": 10.2858, "step": 60 }, { "epoch": 0.0014478327753144511, "grad_norm": 0.5541791319847107, "learning_rate": 4.343367316160946e-06, "loss": 10.2636, "step": 80 }, { "epoch": 0.001809790969143064, "grad_norm": 0.5530744791030884, "learning_rate": 5.429209145201182e-06, "loss": 10.2491, "step": 100 }, { "epoch": 0.0021717491629716767, "grad_norm": 0.5137726068496704, "learning_rate": 6.515050974241419e-06, "loss": 10.2343, "step": 120 }, { "epoch": 0.0025337073568002895, "grad_norm": 0.512411892414093, "learning_rate": 7.600892803281655e-06, "loss": 10.2178, "step": 140 }, { "epoch": 0.0028956655506289022, "grad_norm": 0.5807462334632874, "learning_rate": 8.686734632321892e-06, "loss": 10.1966, "step": 160 }, { "epoch": 0.003257623744457515, "grad_norm": 0.7059533596038818, "learning_rate": 9.772576461362129e-06, "loss": 10.1748, "step": 180 }, { "epoch": 0.003619581938286128, "grad_norm": 1.3133445978164673, "learning_rate": 1.0858418290402365e-05, "loss": 10.1451, "step": 200 }, { "epoch": 0.003981540132114741, "grad_norm": 1.6613627672195435, "learning_rate": 1.1944260119442602e-05, "loss": 10.1148, "step": 220 }, { "epoch": 0.004343498325943353, "grad_norm": 1.7141027450561523, "learning_rate": 1.3030101948482837e-05, "loss": 10.0838, "step": 240 }, { "epoch": 0.004705456519771966, "grad_norm": 2.0595993995666504, "learning_rate": 1.4115943777523073e-05, "loss": 10.0519, "step": 260 }, { "epoch": 0.005067414713600579, "grad_norm": 2.2947471141815186, "learning_rate": 1.520178560656331e-05, "loss": 10.0166, "step": 280 }, { "epoch": 0.005429372907429192, "grad_norm": 2.2053394317626953, "learning_rate": 1.628762743560355e-05, "loss": 9.9809, "step": 300 }, { "epoch": 0.0057913311012578045, "grad_norm": 2.2479448318481445, "learning_rate": 1.7373469264643783e-05, "loss": 9.9443, "step": 320 }, { "epoch": 0.006153289295086417, "grad_norm": 1.9384366273880005, "learning_rate": 1.845931109368402e-05, "loss": 9.9023, "step": 340 }, { "epoch": 0.00651524748891503, "grad_norm": 1.7514017820358276, "learning_rate": 1.9545152922724258e-05, "loss": 9.8601, "step": 360 }, { "epoch": 0.006877205682743643, "grad_norm": 1.5420582294464111, "learning_rate": 2.0630994751764492e-05, "loss": 9.8214, "step": 380 }, { "epoch": 0.007239163876572256, "grad_norm": 1.223652958869934, "learning_rate": 2.171683658080473e-05, "loss": 9.7768, "step": 400 }, { "epoch": 0.007601122070400868, "grad_norm": 1.058700442314148, "learning_rate": 2.2802678409844966e-05, "loss": 9.7302, "step": 420 }, { "epoch": 0.007963080264229481, "grad_norm": 1.0271687507629395, "learning_rate": 2.3888520238885204e-05, "loss": 9.6882, "step": 440 }, { "epoch": 0.008325038458058095, "grad_norm": 0.9583302140235901, "learning_rate": 2.4974362067925438e-05, "loss": 9.6446, "step": 460 }, { "epoch": 0.008686996651886707, "grad_norm": 0.8648065328598022, "learning_rate": 2.6060203896965675e-05, "loss": 9.6003, "step": 480 }, { "epoch": 0.00904895484571532, "grad_norm": 0.7204363346099854, "learning_rate": 2.7146045726005912e-05, "loss": 9.5586, "step": 500 }, { "epoch": 0.009410913039543932, "grad_norm": 0.8680516481399536, "learning_rate": 2.8231887555046146e-05, "loss": 9.5111, "step": 520 }, { "epoch": 0.009772871233372546, "grad_norm": 0.6860862374305725, "learning_rate": 2.9317729384086387e-05, "loss": 9.4771, "step": 540 }, { "epoch": 0.010134829427201158, "grad_norm": 0.711931049823761, "learning_rate": 3.040357121312662e-05, "loss": 9.4369, "step": 560 }, { "epoch": 0.010496787621029771, "grad_norm": 0.7360084652900696, "learning_rate": 3.148941304216686e-05, "loss": 9.3904, "step": 580 }, { "epoch": 0.010858745814858383, "grad_norm": 0.7077947854995728, "learning_rate": 3.25752548712071e-05, "loss": 9.3552, "step": 600 }, { "epoch": 0.011220704008686997, "grad_norm": 0.562315821647644, "learning_rate": 3.366109670024733e-05, "loss": 9.3164, "step": 620 }, { "epoch": 0.011582662202515609, "grad_norm": 0.6069725751876831, "learning_rate": 3.4746938529287566e-05, "loss": 9.2838, "step": 640 }, { "epoch": 0.011944620396344223, "grad_norm": 0.6498883962631226, "learning_rate": 3.583278035832781e-05, "loss": 9.2487, "step": 660 }, { "epoch": 0.012306578590172835, "grad_norm": 0.615738034248352, "learning_rate": 3.691862218736804e-05, "loss": 9.2222, "step": 680 }, { "epoch": 0.012668536784001448, "grad_norm": 0.5743957757949829, "learning_rate": 3.8004464016408275e-05, "loss": 9.1898, "step": 700 }, { "epoch": 0.01303049497783006, "grad_norm": 0.5580448508262634, "learning_rate": 3.9090305845448516e-05, "loss": 9.1659, "step": 720 }, { "epoch": 0.013392453171658674, "grad_norm": 0.5862032771110535, "learning_rate": 4.017614767448875e-05, "loss": 9.1376, "step": 740 }, { "epoch": 0.013754411365487286, "grad_norm": 0.650966465473175, "learning_rate": 4.1261989503528983e-05, "loss": 9.1042, "step": 760 }, { "epoch": 0.0141163695593159, "grad_norm": 0.6448431015014648, "learning_rate": 4.2347831332569224e-05, "loss": 9.0836, "step": 780 }, { "epoch": 0.014478327753144511, "grad_norm": 0.5597690939903259, "learning_rate": 4.343367316160946e-05, "loss": 9.0728, "step": 800 }, { "epoch": 0.014840285946973125, "grad_norm": 0.5503562092781067, "learning_rate": 4.451951499064969e-05, "loss": 9.0544, "step": 820 }, { "epoch": 0.015202244140801737, "grad_norm": 0.5494912266731262, "learning_rate": 4.560535681968993e-05, "loss": 9.0229, "step": 840 }, { "epoch": 0.01556420233463035, "grad_norm": 0.5567879676818848, "learning_rate": 4.6691198648730167e-05, "loss": 9.016, "step": 860 }, { "epoch": 0.015926160528458962, "grad_norm": 0.5893707275390625, "learning_rate": 4.777704047777041e-05, "loss": 8.9989, "step": 880 }, { "epoch": 0.016288118722287574, "grad_norm": 0.5258359909057617, "learning_rate": 4.886288230681064e-05, "loss": 8.9894, "step": 900 }, { "epoch": 0.01665007691611619, "grad_norm": 0.5892924070358276, "learning_rate": 4.9948724135850875e-05, "loss": 8.9762, "step": 920 }, { "epoch": 0.0170120351099448, "grad_norm": 0.6365529894828796, "learning_rate": 5.1034565964891116e-05, "loss": 8.9545, "step": 940 }, { "epoch": 0.017373993303773413, "grad_norm": 0.5396739840507507, "learning_rate": 5.212040779393135e-05, "loss": 8.9387, "step": 960 }, { "epoch": 0.017735951497602025, "grad_norm": 0.5578063130378723, "learning_rate": 5.3206249622971584e-05, "loss": 8.9452, "step": 980 }, { "epoch": 0.01809790969143064, "grad_norm": 0.49258968234062195, "learning_rate": 5.4292091452011824e-05, "loss": 8.9329, "step": 1000 }, { "epoch": 0.018459867885259253, "grad_norm": 0.5692590475082397, "learning_rate": 5.537793328105206e-05, "loss": 8.9197, "step": 1020 }, { "epoch": 0.018821826079087865, "grad_norm": 0.5728791356086731, "learning_rate": 5.646377511009229e-05, "loss": 8.915, "step": 1040 }, { "epoch": 0.019183784272916476, "grad_norm": 0.4755489230155945, "learning_rate": 5.754961693913253e-05, "loss": 8.8982, "step": 1060 }, { "epoch": 0.019545742466745092, "grad_norm": 0.5339570641517639, "learning_rate": 5.8635458768172773e-05, "loss": 8.896, "step": 1080 }, { "epoch": 0.019907700660573704, "grad_norm": 0.58181232213974, "learning_rate": 5.9721300597213e-05, "loss": 8.8863, "step": 1100 }, { "epoch": 0.020269658854402316, "grad_norm": 0.5068308115005493, "learning_rate": 6.080714242625324e-05, "loss": 8.8815, "step": 1120 }, { "epoch": 0.020631617048230928, "grad_norm": 0.5282208919525146, "learning_rate": 6.189298425529349e-05, "loss": 8.8762, "step": 1140 }, { "epoch": 0.020993575242059543, "grad_norm": 0.6251116991043091, "learning_rate": 6.297882608433372e-05, "loss": 8.875, "step": 1160 }, { "epoch": 0.021355533435888155, "grad_norm": 0.7021293044090271, "learning_rate": 6.406466791337396e-05, "loss": 8.8549, "step": 1180 }, { "epoch": 0.021717491629716767, "grad_norm": 0.5882667303085327, "learning_rate": 6.51505097424142e-05, "loss": 8.8602, "step": 1200 }, { "epoch": 0.022079449823545382, "grad_norm": 0.552535891532898, "learning_rate": 6.623635157145442e-05, "loss": 8.8478, "step": 1220 }, { "epoch": 0.022441408017373994, "grad_norm": 0.5536801218986511, "learning_rate": 6.732219340049467e-05, "loss": 8.8465, "step": 1240 }, { "epoch": 0.022803366211202606, "grad_norm": 0.7319750189781189, "learning_rate": 6.84080352295349e-05, "loss": 8.8456, "step": 1260 }, { "epoch": 0.023165324405031218, "grad_norm": 0.6104538440704346, "learning_rate": 6.949387705857513e-05, "loss": 8.8237, "step": 1280 }, { "epoch": 0.023527282598859833, "grad_norm": 0.596066415309906, "learning_rate": 7.057971888761537e-05, "loss": 8.8247, "step": 1300 }, { "epoch": 0.023889240792688445, "grad_norm": 0.516325056552887, "learning_rate": 7.166556071665561e-05, "loss": 8.8194, "step": 1320 }, { "epoch": 0.024251198986517057, "grad_norm": 0.5828744173049927, "learning_rate": 7.275140254569584e-05, "loss": 8.8158, "step": 1340 }, { "epoch": 0.02461315718034567, "grad_norm": 0.6400988101959229, "learning_rate": 7.383724437473608e-05, "loss": 8.797, "step": 1360 }, { "epoch": 0.024975115374174284, "grad_norm": 0.6185368895530701, "learning_rate": 7.492308620377632e-05, "loss": 8.798, "step": 1380 }, { "epoch": 0.025337073568002896, "grad_norm": 0.5269763469696045, "learning_rate": 7.600892803281655e-05, "loss": 8.7835, "step": 1400 }, { "epoch": 0.025699031761831508, "grad_norm": 0.5572532415390015, "learning_rate": 7.709476986185679e-05, "loss": 8.7743, "step": 1420 }, { "epoch": 0.02606098995566012, "grad_norm": 0.5819870233535767, "learning_rate": 7.818061169089703e-05, "loss": 8.7728, "step": 1440 }, { "epoch": 0.026422948149488736, "grad_norm": 0.724423348903656, "learning_rate": 7.926645351993726e-05, "loss": 8.7537, "step": 1460 }, { "epoch": 0.026784906343317347, "grad_norm": 0.729290246963501, "learning_rate": 8.03522953489775e-05, "loss": 8.7566, "step": 1480 }, { "epoch": 0.02714686453714596, "grad_norm": 0.6978908777236938, "learning_rate": 8.143813717801774e-05, "loss": 8.7412, "step": 1500 }, { "epoch": 0.02750882273097457, "grad_norm": 0.9260867238044739, "learning_rate": 8.252397900705797e-05, "loss": 8.7469, "step": 1520 }, { "epoch": 0.027870780924803187, "grad_norm": 0.8353539109230042, "learning_rate": 8.360982083609821e-05, "loss": 8.726, "step": 1540 }, { "epoch": 0.0282327391186318, "grad_norm": 0.6905779242515564, "learning_rate": 8.469566266513845e-05, "loss": 8.7128, "step": 1560 }, { "epoch": 0.02859469731246041, "grad_norm": 0.6228475570678711, "learning_rate": 8.578150449417868e-05, "loss": 8.7487, "step": 1580 }, { "epoch": 0.028956655506289022, "grad_norm": 0.9750573635101318, "learning_rate": 8.686734632321892e-05, "loss": 8.7054, "step": 1600 }, { "epoch": 0.029318613700117638, "grad_norm": 0.874622106552124, "learning_rate": 8.795318815225916e-05, "loss": 8.715, "step": 1620 }, { "epoch": 0.02968057189394625, "grad_norm": 0.7310810685157776, "learning_rate": 8.903902998129938e-05, "loss": 8.6835, "step": 1640 }, { "epoch": 0.03004253008777486, "grad_norm": 0.7120797038078308, "learning_rate": 9.012487181033962e-05, "loss": 8.6734, "step": 1660 }, { "epoch": 0.030404488281603474, "grad_norm": 1.0088528394699097, "learning_rate": 9.121071363937987e-05, "loss": 8.6657, "step": 1680 }, { "epoch": 0.03076644647543209, "grad_norm": 1.4212431907653809, "learning_rate": 9.229655546842009e-05, "loss": 8.6429, "step": 1700 }, { "epoch": 0.0311284046692607, "grad_norm": 1.2623440027236938, "learning_rate": 9.338239729746033e-05, "loss": 8.6424, "step": 1720 }, { "epoch": 0.031490362863089316, "grad_norm": 1.190894365310669, "learning_rate": 9.446823912650057e-05, "loss": 8.616, "step": 1740 }, { "epoch": 0.031852321056917925, "grad_norm": 0.8714050054550171, "learning_rate": 9.555408095554081e-05, "loss": 8.6261, "step": 1760 }, { "epoch": 0.03221427925074654, "grad_norm": 0.7428932189941406, "learning_rate": 9.663992278458104e-05, "loss": 8.6054, "step": 1780 }, { "epoch": 0.03257623744457515, "grad_norm": 0.8327052593231201, "learning_rate": 9.772576461362128e-05, "loss": 8.6072, "step": 1800 }, { "epoch": 0.032938195638403764, "grad_norm": 0.7944102883338928, "learning_rate": 9.881160644266152e-05, "loss": 8.5757, "step": 1820 }, { "epoch": 0.03330015383223238, "grad_norm": 0.929286003112793, "learning_rate": 9.989744827170175e-05, "loss": 8.5651, "step": 1840 }, { "epoch": 0.03366211202606099, "grad_norm": 0.9866963028907776, "learning_rate": 0.00010098329010074199, "loss": 8.55, "step": 1860 }, { "epoch": 0.0340240702198896, "grad_norm": 1.5684864521026611, "learning_rate": 0.00010206913192978223, "loss": 8.5508, "step": 1880 }, { "epoch": 0.03438602841371822, "grad_norm": 1.268312692642212, "learning_rate": 0.00010315497375882246, "loss": 8.5303, "step": 1900 }, { "epoch": 0.03474798660754683, "grad_norm": 1.384734034538269, "learning_rate": 0.0001042408155878627, "loss": 8.526, "step": 1920 }, { "epoch": 0.03510994480137544, "grad_norm": 1.1619597673416138, "learning_rate": 0.00010532665741690294, "loss": 8.5083, "step": 1940 }, { "epoch": 0.03547190299520405, "grad_norm": 1.4747376441955566, "learning_rate": 0.00010641249924594317, "loss": 8.492, "step": 1960 }, { "epoch": 0.035833861189032666, "grad_norm": 1.1740059852600098, "learning_rate": 0.00010749834107498341, "loss": 8.4921, "step": 1980 }, { "epoch": 0.03619581938286128, "grad_norm": 1.6106988191604614, "learning_rate": 0.00010858418290402365, "loss": 8.4829, "step": 2000 }, { "epoch": 0.03655777757668989, "grad_norm": 1.2826563119888306, "learning_rate": 0.00010967002473306388, "loss": 8.4542, "step": 2020 }, { "epoch": 0.036919735770518505, "grad_norm": 1.4442939758300781, "learning_rate": 0.00011075586656210412, "loss": 8.4472, "step": 2040 }, { "epoch": 0.03728169396434712, "grad_norm": 1.3481926918029785, "learning_rate": 0.00011184170839114436, "loss": 8.4245, "step": 2060 }, { "epoch": 0.03764365215817573, "grad_norm": 1.377825379371643, "learning_rate": 0.00011292755022018458, "loss": 8.4233, "step": 2080 }, { "epoch": 0.038005610352004345, "grad_norm": 1.656119465827942, "learning_rate": 0.00011401339204922482, "loss": 8.4225, "step": 2100 }, { "epoch": 0.03836756854583295, "grad_norm": 1.5955251455307007, "learning_rate": 0.00011509923387826507, "loss": 8.4216, "step": 2120 }, { "epoch": 0.03872952673966157, "grad_norm": 1.6190309524536133, "learning_rate": 0.0001161850757073053, "loss": 8.4182, "step": 2140 }, { "epoch": 0.039091484933490184, "grad_norm": 1.4823400974273682, "learning_rate": 0.00011727091753634555, "loss": 8.3738, "step": 2160 }, { "epoch": 0.03945344312731879, "grad_norm": 4.104274749755859, "learning_rate": 0.00011835675936538576, "loss": 8.3636, "step": 2180 }, { "epoch": 0.03981540132114741, "grad_norm": 1.6707680225372314, "learning_rate": 0.000119442601194426, "loss": 8.3745, "step": 2200 }, { "epoch": 0.04017735951497602, "grad_norm": 1.1206501722335815, "learning_rate": 0.00012052844302346624, "loss": 8.353, "step": 2220 }, { "epoch": 0.04053931770880463, "grad_norm": 2.2229607105255127, "learning_rate": 0.00012161428485250648, "loss": 8.3539, "step": 2240 }, { "epoch": 0.04090127590263325, "grad_norm": 1.9011199474334717, "learning_rate": 0.00012270012668154674, "loss": 8.3293, "step": 2260 }, { "epoch": 0.041263234096461855, "grad_norm": 2.2467918395996094, "learning_rate": 0.00012378596851058698, "loss": 8.326, "step": 2280 }, { "epoch": 0.04162519229029047, "grad_norm": 1.530720829963684, "learning_rate": 0.0001248718103396272, "loss": 8.3197, "step": 2300 }, { "epoch": 0.041987150484119086, "grad_norm": 2.6763076782226562, "learning_rate": 0.00012595765216866743, "loss": 8.3004, "step": 2320 }, { "epoch": 0.042349108677947694, "grad_norm": 2.142010450363159, "learning_rate": 0.00012704349399770767, "loss": 8.3065, "step": 2340 }, { "epoch": 0.04271106687177631, "grad_norm": 2.1896350383758545, "learning_rate": 0.0001281293358267479, "loss": 8.2688, "step": 2360 }, { "epoch": 0.043073025065604925, "grad_norm": 2.1078433990478516, "learning_rate": 0.00012921517765578815, "loss": 8.2639, "step": 2380 }, { "epoch": 0.043434983259433534, "grad_norm": 1.8464548587799072, "learning_rate": 0.0001303010194848284, "loss": 8.2638, "step": 2400 }, { "epoch": 0.04379694145326215, "grad_norm": 2.71945858001709, "learning_rate": 0.0001313868613138686, "loss": 8.2516, "step": 2420 }, { "epoch": 0.044158899647090764, "grad_norm": 1.1496859788894653, "learning_rate": 0.00013247270314290885, "loss": 8.2561, "step": 2440 }, { "epoch": 0.04452085784091937, "grad_norm": 2.2801716327667236, "learning_rate": 0.0001335585449719491, "loss": 8.2525, "step": 2460 }, { "epoch": 0.04488281603474799, "grad_norm": 2.1865906715393066, "learning_rate": 0.00013464438680098933, "loss": 8.2548, "step": 2480 }, { "epoch": 0.0452447742285766, "grad_norm": 1.8173776865005493, "learning_rate": 0.00013573022863002957, "loss": 8.2447, "step": 2500 }, { "epoch": 0.04560673242240521, "grad_norm": 2.018167018890381, "learning_rate": 0.0001368160704590698, "loss": 8.2116, "step": 2520 }, { "epoch": 0.04596869061623383, "grad_norm": 2.387749433517456, "learning_rate": 0.00013790191228811003, "loss": 8.2071, "step": 2540 }, { "epoch": 0.046330648810062436, "grad_norm": 2.1164238452911377, "learning_rate": 0.00013898775411715027, "loss": 8.2173, "step": 2560 }, { "epoch": 0.04669260700389105, "grad_norm": 2.6271204948425293, "learning_rate": 0.0001400735959461905, "loss": 8.1928, "step": 2580 }, { "epoch": 0.04705456519771967, "grad_norm": 2.146430730819702, "learning_rate": 0.00014115943777523075, "loss": 8.2093, "step": 2600 }, { "epoch": 0.047416523391548275, "grad_norm": 1.758144736289978, "learning_rate": 0.000142245279604271, "loss": 8.1709, "step": 2620 }, { "epoch": 0.04777848158537689, "grad_norm": 1.3466659784317017, "learning_rate": 0.00014333112143331123, "loss": 8.1766, "step": 2640 }, { "epoch": 0.0481404397792055, "grad_norm": 1.9450665712356567, "learning_rate": 0.00014441696326235144, "loss": 8.1901, "step": 2660 }, { "epoch": 0.048502397973034114, "grad_norm": 1.6330885887145996, "learning_rate": 0.00014550280509139168, "loss": 8.1911, "step": 2680 }, { "epoch": 0.04886435616686273, "grad_norm": 1.8187795877456665, "learning_rate": 0.00014658864692043192, "loss": 8.1737, "step": 2700 }, { "epoch": 0.04922631436069134, "grad_norm": 2.8557980060577393, "learning_rate": 0.00014767448874947216, "loss": 8.1732, "step": 2720 }, { "epoch": 0.049588272554519953, "grad_norm": 2.0480148792266846, "learning_rate": 0.0001487603305785124, "loss": 8.1636, "step": 2740 }, { "epoch": 0.04995023074834857, "grad_norm": 1.8413054943084717, "learning_rate": 0.00014984617240755265, "loss": 8.1644, "step": 2760 }, { "epoch": 0.05031218894217718, "grad_norm": 1.5977145433425903, "learning_rate": 0.00015093201423659289, "loss": 8.1652, "step": 2780 }, { "epoch": 0.05067414713600579, "grad_norm": 2.060908317565918, "learning_rate": 0.0001520178560656331, "loss": 8.1747, "step": 2800 }, { "epoch": 0.0510361053298344, "grad_norm": 2.097968339920044, "learning_rate": 0.00015310369789467334, "loss": 8.1595, "step": 2820 }, { "epoch": 0.051398063523663017, "grad_norm": 2.275170087814331, "learning_rate": 0.00015418953972371358, "loss": 8.1677, "step": 2840 }, { "epoch": 0.05176002171749163, "grad_norm": 1.372065544128418, "learning_rate": 0.00015527538155275382, "loss": 8.14, "step": 2860 }, { "epoch": 0.05212197991132024, "grad_norm": 1.472987174987793, "learning_rate": 0.00015636122338179406, "loss": 8.1664, "step": 2880 }, { "epoch": 0.052483938105148856, "grad_norm": 1.398430347442627, "learning_rate": 0.0001574470652108343, "loss": 8.1435, "step": 2900 }, { "epoch": 0.05284589629897747, "grad_norm": 2.2276878356933594, "learning_rate": 0.00015853290703987452, "loss": 8.1588, "step": 2920 }, { "epoch": 0.05320785449280608, "grad_norm": 2.8768556118011475, "learning_rate": 0.00015961874886891476, "loss": 8.15, "step": 2940 }, { "epoch": 0.053569812686634695, "grad_norm": 2.1943325996398926, "learning_rate": 0.000160704590697955, "loss": 8.1515, "step": 2960 }, { "epoch": 0.0539317708804633, "grad_norm": 2.0583720207214355, "learning_rate": 0.00016179043252699524, "loss": 8.1343, "step": 2980 }, { "epoch": 0.05429372907429192, "grad_norm": 2.024512767791748, "learning_rate": 0.00016282198226458345, "loss": 8.1324, "step": 3000 }, { "epoch": 0.054655687268120534, "grad_norm": 2.0019822120666504, "learning_rate": 0.0001639078240936237, "loss": 8.1268, "step": 3020 }, { "epoch": 0.05501764546194914, "grad_norm": 2.0442886352539062, "learning_rate": 0.00016499366592266393, "loss": 8.1173, "step": 3040 }, { "epoch": 0.05537960365577776, "grad_norm": 2.780470371246338, "learning_rate": 0.00016607950775170417, "loss": 8.1345, "step": 3060 }, { "epoch": 0.05574156184960637, "grad_norm": 2.3489673137664795, "learning_rate": 0.00016716534958074441, "loss": 8.1356, "step": 3080 }, { "epoch": 0.05610352004343498, "grad_norm": 1.5495760440826416, "learning_rate": 0.00016825119140978465, "loss": 8.1344, "step": 3100 }, { "epoch": 0.0564654782372636, "grad_norm": 1.9256787300109863, "learning_rate": 0.00016933703323882487, "loss": 8.1298, "step": 3120 }, { "epoch": 0.056827436431092206, "grad_norm": 2.0177950859069824, "learning_rate": 0.0001704228750678651, "loss": 8.1168, "step": 3140 }, { "epoch": 0.05718939462492082, "grad_norm": 2.141857147216797, "learning_rate": 0.00017150871689690535, "loss": 8.1286, "step": 3160 }, { "epoch": 0.057551352818749436, "grad_norm": 2.1056764125823975, "learning_rate": 0.0001725945587259456, "loss": 8.1398, "step": 3180 }, { "epoch": 0.057913311012578045, "grad_norm": 1.9108024835586548, "learning_rate": 0.00017368040055498583, "loss": 8.1244, "step": 3200 }, { "epoch": 0.05827526920640666, "grad_norm": 1.7270424365997314, "learning_rate": 0.00017476624238402607, "loss": 8.1102, "step": 3220 }, { "epoch": 0.058637227400235276, "grad_norm": 1.684164047241211, "learning_rate": 0.00017585208421306629, "loss": 8.1079, "step": 3240 }, { "epoch": 0.058999185594063884, "grad_norm": 2.553480625152588, "learning_rate": 0.00017688363395065452, "loss": 8.1094, "step": 3260 }, { "epoch": 0.0593611437878925, "grad_norm": 1.9391483068466187, "learning_rate": 0.00017796947577969477, "loss": 8.1078, "step": 3280 }, { "epoch": 0.05972310198172111, "grad_norm": 2.539398193359375, "learning_rate": 0.000179055317608735, "loss": 8.1159, "step": 3300 }, { "epoch": 0.06008506017554972, "grad_norm": 2.3800413608551025, "learning_rate": 0.00018014115943777525, "loss": 8.1043, "step": 3320 }, { "epoch": 0.06044701836937834, "grad_norm": 2.1216628551483154, "learning_rate": 0.00018122700126681546, "loss": 8.0919, "step": 3340 }, { "epoch": 0.06080897656320695, "grad_norm": 3.430650472640991, "learning_rate": 0.0001823128430958557, "loss": 8.1078, "step": 3360 }, { "epoch": 0.06117093475703556, "grad_norm": 2.1858770847320557, "learning_rate": 0.00018339868492489594, "loss": 8.1184, "step": 3380 }, { "epoch": 0.06153289295086418, "grad_norm": 2.896089553833008, "learning_rate": 0.00018448452675393618, "loss": 8.106, "step": 3400 }, { "epoch": 0.061894851144692786, "grad_norm": 2.7624003887176514, "learning_rate": 0.00018557036858297642, "loss": 8.1072, "step": 3420 }, { "epoch": 0.0622568093385214, "grad_norm": 2.464115619659424, "learning_rate": 0.00018665621041201666, "loss": 8.0962, "step": 3440 }, { "epoch": 0.06261876753235002, "grad_norm": 2.5943491458892822, "learning_rate": 0.00018774205224105688, "loss": 8.0914, "step": 3460 }, { "epoch": 0.06298072572617863, "grad_norm": 2.0824356079101562, "learning_rate": 0.00018882789407009712, "loss": 8.1222, "step": 3480 }, { "epoch": 0.06334268392000723, "grad_norm": 2.8781402111053467, "learning_rate": 0.00018991373589913736, "loss": 8.088, "step": 3500 }, { "epoch": 0.06370464211383585, "grad_norm": 2.0000219345092773, "learning_rate": 0.0001909995777281776, "loss": 8.103, "step": 3520 }, { "epoch": 0.06406660030766446, "grad_norm": 2.4691524505615234, "learning_rate": 0.00019208541955721784, "loss": 8.1045, "step": 3540 }, { "epoch": 0.06442855850149308, "grad_norm": 2.583723545074463, "learning_rate": 0.00019317126138625808, "loss": 8.0879, "step": 3560 }, { "epoch": 0.0647905166953217, "grad_norm": 2.7288269996643066, "learning_rate": 0.0001942571032152983, "loss": 8.091, "step": 3580 }, { "epoch": 0.0651524748891503, "grad_norm": 2.360276699066162, "learning_rate": 0.00019534294504433854, "loss": 8.0894, "step": 3600 }, { "epoch": 0.06551443308297891, "grad_norm": 2.6591217517852783, "learning_rate": 0.00019642878687337878, "loss": 8.082, "step": 3620 }, { "epoch": 0.06587639127680753, "grad_norm": 2.5572097301483154, "learning_rate": 0.00019751462870241902, "loss": 8.1033, "step": 3640 }, { "epoch": 0.06623834947063614, "grad_norm": 2.643139600753784, "learning_rate": 0.00019860047053145926, "loss": 8.1124, "step": 3660 }, { "epoch": 0.06660030766446476, "grad_norm": 3.0322484970092773, "learning_rate": 0.0001996863123604995, "loss": 8.1049, "step": 3680 }, { "epoch": 0.06696226585829337, "grad_norm": 2.7740566730499268, "learning_rate": 0.0002007721541895397, "loss": 8.0876, "step": 3700 }, { "epoch": 0.06732422405212198, "grad_norm": 2.7816436290740967, "learning_rate": 0.00020185799601857995, "loss": 8.1121, "step": 3720 }, { "epoch": 0.06768618224595059, "grad_norm": 2.4198217391967773, "learning_rate": 0.0002029438378476202, "loss": 8.0958, "step": 3740 }, { "epoch": 0.0680481404397792, "grad_norm": 2.8162808418273926, "learning_rate": 0.00020402967967666043, "loss": 8.0897, "step": 3760 }, { "epoch": 0.06841009863360782, "grad_norm": 2.883631706237793, "learning_rate": 0.00020511552150570067, "loss": 8.085, "step": 3780 }, { "epoch": 0.06877205682743644, "grad_norm": 2.4725236892700195, "learning_rate": 0.00020620136333474092, "loss": 8.0807, "step": 3800 }, { "epoch": 0.06913401502126504, "grad_norm": 2.3156867027282715, "learning_rate": 0.00020728720516378116, "loss": 8.0747, "step": 3820 }, { "epoch": 0.06949597321509365, "grad_norm": 3.308699607849121, "learning_rate": 0.00020837304699282137, "loss": 8.0767, "step": 3840 }, { "epoch": 0.06985793140892227, "grad_norm": 3.143287181854248, "learning_rate": 0.0002094588888218616, "loss": 8.0831, "step": 3860 }, { "epoch": 0.07021988960275088, "grad_norm": 3.100562810897827, "learning_rate": 0.00021054473065090185, "loss": 8.0862, "step": 3880 }, { "epoch": 0.0705818477965795, "grad_norm": 2.48494029045105, "learning_rate": 0.0002116305724799421, "loss": 8.0952, "step": 3900 }, { "epoch": 0.0709438059904081, "grad_norm": 3.1432759761810303, "learning_rate": 0.00021271641430898233, "loss": 8.0906, "step": 3920 }, { "epoch": 0.07130576418423672, "grad_norm": 3.330761194229126, "learning_rate": 0.00021380225613802257, "loss": 8.0856, "step": 3940 }, { "epoch": 0.07166772237806533, "grad_norm": 3.6338093280792236, "learning_rate": 0.00021488809796706279, "loss": 8.0733, "step": 3960 }, { "epoch": 0.07202968057189395, "grad_norm": 3.014366388320923, "learning_rate": 0.00021597393979610303, "loss": 8.086, "step": 3980 }, { "epoch": 0.07239163876572256, "grad_norm": 2.774247169494629, "learning_rate": 0.00021705978162514327, "loss": 8.093, "step": 4000 }, { "epoch": 0.07275359695955118, "grad_norm": 4.0621867179870605, "learning_rate": 0.0002181456234541835, "loss": 8.0801, "step": 4020 }, { "epoch": 0.07311555515337978, "grad_norm": 3.9556710720062256, "learning_rate": 0.00021923146528322375, "loss": 8.0632, "step": 4040 }, { "epoch": 0.0734775133472084, "grad_norm": 3.0062179565429688, "learning_rate": 0.000220317307112264, "loss": 8.0713, "step": 4060 }, { "epoch": 0.07383947154103701, "grad_norm": 3.4333982467651367, "learning_rate": 0.0002214031489413042, "loss": 8.0656, "step": 4080 }, { "epoch": 0.07420142973486563, "grad_norm": 3.107091188430786, "learning_rate": 0.00022248899077034444, "loss": 8.0832, "step": 4100 }, { "epoch": 0.07456338792869424, "grad_norm": 3.5279381275177, "learning_rate": 0.00022357483259938468, "loss": 8.0703, "step": 4120 }, { "epoch": 0.07492534612252284, "grad_norm": 2.9503841400146484, "learning_rate": 0.00022466067442842493, "loss": 8.0796, "step": 4140 }, { "epoch": 0.07528730431635146, "grad_norm": 3.018066644668579, "learning_rate": 0.00022574651625746517, "loss": 8.07, "step": 4160 }, { "epoch": 0.07564926251018007, "grad_norm": 3.552546501159668, "learning_rate": 0.0002268323580865054, "loss": 8.1046, "step": 4180 }, { "epoch": 0.07601122070400869, "grad_norm": 3.881967306137085, "learning_rate": 0.00022791819991554565, "loss": 8.0622, "step": 4200 }, { "epoch": 0.0763731788978373, "grad_norm": 4.438878536224365, "learning_rate": 0.0002290040417445859, "loss": 8.0783, "step": 4220 }, { "epoch": 0.0767351370916659, "grad_norm": 3.928950071334839, "learning_rate": 0.00023008988357362613, "loss": 8.0887, "step": 4240 }, { "epoch": 0.07709709528549452, "grad_norm": 4.781463623046875, "learning_rate": 0.00023117572540266632, "loss": 8.0672, "step": 4260 }, { "epoch": 0.07745905347932314, "grad_norm": 3.8346338272094727, "learning_rate": 0.00023226156723170656, "loss": 8.0822, "step": 4280 }, { "epoch": 0.07782101167315175, "grad_norm": 3.835999011993408, "learning_rate": 0.0002333474090607468, "loss": 8.0662, "step": 4300 }, { "epoch": 0.07818296986698037, "grad_norm": 4.432645320892334, "learning_rate": 0.00023443325088978704, "loss": 8.0816, "step": 4320 }, { "epoch": 0.07854492806080898, "grad_norm": 3.856933116912842, "learning_rate": 0.00023551909271882728, "loss": 8.055, "step": 4340 }, { "epoch": 0.07890688625463758, "grad_norm": 4.055251598358154, "learning_rate": 0.00023660493454786752, "loss": 8.0833, "step": 4360 }, { "epoch": 0.0792688444484662, "grad_norm": 4.129009246826172, "learning_rate": 0.00023769077637690776, "loss": 8.0715, "step": 4380 }, { "epoch": 0.07963080264229482, "grad_norm": 3.944307565689087, "learning_rate": 0.000238776618205948, "loss": 8.0721, "step": 4400 }, { "epoch": 0.07999276083612343, "grad_norm": 4.226454257965088, "learning_rate": 0.00023986246003498824, "loss": 8.081, "step": 4420 }, { "epoch": 0.08035471902995205, "grad_norm": 4.180859088897705, "learning_rate": 0.00024094830186402848, "loss": 8.0674, "step": 4440 }, { "epoch": 0.08071667722378065, "grad_norm": 4.678595066070557, "learning_rate": 0.0002418712674187127, "loss": 8.6491, "step": 4460 }, { "epoch": 0.08107863541760926, "grad_norm": 3.916884422302246, "learning_rate": 0.00024295710924775293, "loss": 8.0667, "step": 4480 }, { "epoch": 0.08144059361143788, "grad_norm": 5.014923572540283, "learning_rate": 0.00024404295107679317, "loss": 8.0709, "step": 4500 }, { "epoch": 0.0818025518052665, "grad_norm": 5.424384593963623, "learning_rate": 0.00024512879290583333, "loss": 8.0714, "step": 4520 }, { "epoch": 0.08216450999909511, "grad_norm": 4.7506890296936035, "learning_rate": 0.0002462146347348736, "loss": 8.1164, "step": 4540 }, { "epoch": 0.08252646819292371, "grad_norm": 4.271077632904053, "learning_rate": 0.0002473004765639138, "loss": 8.0456, "step": 4560 }, { "epoch": 0.08288842638675233, "grad_norm": 5.108422756195068, "learning_rate": 0.00024838631839295405, "loss": 8.0683, "step": 4580 }, { "epoch": 0.08325038458058094, "grad_norm": 4.00137186050415, "learning_rate": 0.0002494721602219943, "loss": 8.0877, "step": 4600 }, { "epoch": 0.08361234277440956, "grad_norm": 5.020883560180664, "learning_rate": 0.00025055800205103454, "loss": 8.0669, "step": 4620 }, { "epoch": 0.08397430096823817, "grad_norm": 4.774672985076904, "learning_rate": 0.0002516438438800748, "loss": 8.0978, "step": 4640 }, { "epoch": 0.08433625916206679, "grad_norm": 4.414867877960205, "learning_rate": 0.000252729685709115, "loss": 8.0909, "step": 4660 }, { "epoch": 0.08469821735589539, "grad_norm": 4.128323554992676, "learning_rate": 0.00025381552753815526, "loss": 8.09, "step": 4680 }, { "epoch": 0.085060175549724, "grad_norm": 4.486546039581299, "learning_rate": 0.0002549013693671955, "loss": 8.0744, "step": 4700 }, { "epoch": 0.08542213374355262, "grad_norm": 4.218783855438232, "learning_rate": 0.00025598721119623574, "loss": 8.0902, "step": 4720 }, { "epoch": 0.08578409193738123, "grad_norm": 4.9139084815979, "learning_rate": 0.000257073053025276, "loss": 8.0726, "step": 4740 }, { "epoch": 0.08614605013120985, "grad_norm": 4.544093132019043, "learning_rate": 0.0002581588948543162, "loss": 8.0735, "step": 4760 }, { "epoch": 0.08650800832503845, "grad_norm": 5.45837926864624, "learning_rate": 0.0002592447366833564, "loss": 8.0781, "step": 4780 }, { "epoch": 0.08686996651886707, "grad_norm": 5.2784423828125, "learning_rate": 0.00026033057851239665, "loss": 8.088, "step": 4800 }, { "epoch": 0.08723192471269568, "grad_norm": 4.507415294647217, "learning_rate": 0.0002614164203414369, "loss": 8.0707, "step": 4820 }, { "epoch": 0.0875938829065243, "grad_norm": 4.857511520385742, "learning_rate": 0.00026250226217047713, "loss": 8.0994, "step": 4840 }, { "epoch": 0.08795584110035291, "grad_norm": 4.420199871063232, "learning_rate": 0.00026358810399951737, "loss": 8.0605, "step": 4860 }, { "epoch": 0.08831779929418153, "grad_norm": 3.7216994762420654, "learning_rate": 0.0002646739458285576, "loss": 8.1182, "step": 4880 }, { "epoch": 0.08867975748801013, "grad_norm": 4.462796688079834, "learning_rate": 0.00026575978765759785, "loss": 8.0666, "step": 4900 }, { "epoch": 0.08904171568183875, "grad_norm": 4.29760217666626, "learning_rate": 0.0002668456294866381, "loss": 8.0548, "step": 4920 }, { "epoch": 0.08940367387566736, "grad_norm": 5.2155046463012695, "learning_rate": 0.00026793147131567833, "loss": 8.1614, "step": 4940 }, { "epoch": 0.08976563206949598, "grad_norm": 4.687706470489502, "learning_rate": 0.0002690173131447186, "loss": 8.051, "step": 4960 }, { "epoch": 0.09012759026332459, "grad_norm": 3.9040534496307373, "learning_rate": 0.0002701031549737588, "loss": 8.0645, "step": 4980 }, { "epoch": 0.0904895484571532, "grad_norm": 4.2080159187316895, "learning_rate": 0.00027118899680279905, "loss": 8.059, "step": 5000 }, { "epoch": 0.0904895484571532, "eval_accuracy": 0.10885986383166214, "eval_loss": 8.154509544372559, "eval_runtime": 172.8056, "eval_samples_per_second": 3517.479, "eval_steps_per_second": 3.437, "step": 5000 }, { "epoch": 0.09085150665098181, "grad_norm": 4.571134567260742, "learning_rate": 0.00027227483863183924, "loss": 8.0411, "step": 5020 }, { "epoch": 0.09121346484481042, "grad_norm": 5.403674602508545, "learning_rate": 0.0002733606804608795, "loss": 8.0345, "step": 5040 }, { "epoch": 0.09157542303863904, "grad_norm": 4.911813259124756, "learning_rate": 0.0002744465222899197, "loss": 8.0946, "step": 5060 }, { "epoch": 0.09193738123246765, "grad_norm": 5.051859378814697, "learning_rate": 0.00027553236411895996, "loss": 8.0794, "step": 5080 }, { "epoch": 0.09229933942629626, "grad_norm": 5.029412746429443, "learning_rate": 0.0002766182059480002, "loss": 8.0873, "step": 5100 }, { "epoch": 0.09266129762012487, "grad_norm": 4.249423503875732, "learning_rate": 0.00027770404777704044, "loss": 8.098, "step": 5120 }, { "epoch": 0.09302325581395349, "grad_norm": 4.305202960968018, "learning_rate": 0.0002787898896060807, "loss": 8.0734, "step": 5140 }, { "epoch": 0.0933852140077821, "grad_norm": 5.8397345542907715, "learning_rate": 0.0002798757314351209, "loss": 8.12, "step": 5160 }, { "epoch": 0.09374717220161072, "grad_norm": 4.833122253417969, "learning_rate": 0.00028096157326416117, "loss": 8.0778, "step": 5180 }, { "epoch": 0.09410913039543933, "grad_norm": 3.883317232131958, "learning_rate": 0.0002820474150932014, "loss": 8.0997, "step": 5200 }, { "epoch": 0.09447108858926793, "grad_norm": 4.111331939697266, "learning_rate": 0.00028313325692224165, "loss": 8.1039, "step": 5220 }, { "epoch": 0.09483304678309655, "grad_norm": 3.988339424133301, "learning_rate": 0.0002842190987512819, "loss": 8.0941, "step": 5240 }, { "epoch": 0.09519500497692517, "grad_norm": 3.9682798385620117, "learning_rate": 0.0002853049405803221, "loss": 8.0477, "step": 5260 }, { "epoch": 0.09555696317075378, "grad_norm": 5.8604655265808105, "learning_rate": 0.0002863907824093623, "loss": 8.077, "step": 5280 }, { "epoch": 0.0959189213645824, "grad_norm": 4.823431015014648, "learning_rate": 0.00028747662423840256, "loss": 8.0723, "step": 5300 }, { "epoch": 0.096280879558411, "grad_norm": 8.868093490600586, "learning_rate": 0.0002885624660674428, "loss": 8.312, "step": 5320 }, { "epoch": 0.09664283775223961, "grad_norm": 3.8635661602020264, "learning_rate": 0.00028964830789648304, "loss": 8.1751, "step": 5340 }, { "epoch": 0.09700479594606823, "grad_norm": 4.368812084197998, "learning_rate": 0.0002907341497255233, "loss": 8.0755, "step": 5360 }, { "epoch": 0.09736675413989684, "grad_norm": 4.956539154052734, "learning_rate": 0.0002918199915545635, "loss": 8.0912, "step": 5380 }, { "epoch": 0.09772871233372546, "grad_norm": 4.059233665466309, "learning_rate": 0.00029290583338360376, "loss": 8.0815, "step": 5400 }, { "epoch": 0.09809067052755406, "grad_norm": 4.747849464416504, "learning_rate": 0.000293991675212644, "loss": 8.0874, "step": 5420 }, { "epoch": 0.09845262872138268, "grad_norm": 4.266099452972412, "learning_rate": 0.00029507751704168424, "loss": 8.0662, "step": 5440 }, { "epoch": 0.09881458691521129, "grad_norm": 4.489388465881348, "learning_rate": 0.0002961633588707245, "loss": 8.0817, "step": 5460 }, { "epoch": 0.09917654510903991, "grad_norm": 4.930331230163574, "learning_rate": 0.0002972492006997647, "loss": 8.0799, "step": 5480 }, { "epoch": 0.09953850330286852, "grad_norm": 5.424180030822754, "learning_rate": 0.00029833504252880496, "loss": 8.065, "step": 5500 }, { "epoch": 0.09990046149669714, "grad_norm": 4.461760520935059, "learning_rate": 0.0002994208843578452, "loss": 8.0738, "step": 5520 }, { "epoch": 0.10026241969052574, "grad_norm": 3.7566046714782715, "learning_rate": 0.0003005067261868854, "loss": 8.0848, "step": 5540 }, { "epoch": 0.10062437788435435, "grad_norm": 6.27786111831665, "learning_rate": 0.00030159256801592563, "loss": 8.088, "step": 5560 }, { "epoch": 0.10098633607818297, "grad_norm": 4.437178134918213, "learning_rate": 0.00030267840984496587, "loss": 8.068, "step": 5580 }, { "epoch": 0.10134829427201159, "grad_norm": 4.757803440093994, "learning_rate": 0.0003037642516740061, "loss": 8.0712, "step": 5600 }, { "epoch": 0.1017102524658402, "grad_norm": 4.162949085235596, "learning_rate": 0.00030485009350304635, "loss": 8.0941, "step": 5620 }, { "epoch": 0.1020722106596688, "grad_norm": 5.036799430847168, "learning_rate": 0.0003059359353320866, "loss": 8.1277, "step": 5640 }, { "epoch": 0.10243416885349742, "grad_norm": 6.508123397827148, "learning_rate": 0.00030702177716112683, "loss": 8.1014, "step": 5660 }, { "epoch": 0.10279612704732603, "grad_norm": 4.099546432495117, "learning_rate": 0.0003081076189901671, "loss": 8.1013, "step": 5680 }, { "epoch": 0.10315808524115465, "grad_norm": 5.094971656799316, "learning_rate": 0.0003091934608192073, "loss": 8.0943, "step": 5700 }, { "epoch": 0.10352004343498326, "grad_norm": 3.8019566535949707, "learning_rate": 0.00031027930264824756, "loss": 8.0894, "step": 5720 }, { "epoch": 0.10388200162881188, "grad_norm": 5.287724494934082, "learning_rate": 0.0003113651444772878, "loss": 8.0937, "step": 5740 }, { "epoch": 0.10424395982264048, "grad_norm": 4.729523658752441, "learning_rate": 0.00031245098630632804, "loss": 8.1002, "step": 5760 }, { "epoch": 0.1046059180164691, "grad_norm": 4.7321319580078125, "learning_rate": 0.0003135368281353683, "loss": 8.0607, "step": 5780 }, { "epoch": 0.10496787621029771, "grad_norm": 5.564339637756348, "learning_rate": 0.0003146226699644085, "loss": 8.1031, "step": 5800 }, { "epoch": 0.10532983440412633, "grad_norm": 3.883352041244507, "learning_rate": 0.00031570851179344876, "loss": 8.0864, "step": 5820 }, { "epoch": 0.10569179259795494, "grad_norm": 6.192608833312988, "learning_rate": 0.000316794353622489, "loss": 8.0805, "step": 5840 }, { "epoch": 0.10605375079178354, "grad_norm": 3.4210565090179443, "learning_rate": 0.00031788019545152924, "loss": 8.104, "step": 5860 }, { "epoch": 0.10641570898561216, "grad_norm": 5.195882797241211, "learning_rate": 0.0003189660372805695, "loss": 8.0621, "step": 5880 }, { "epoch": 0.10677766717944077, "grad_norm": 4.863924980163574, "learning_rate": 0.0003200518791096097, "loss": 8.2223, "step": 5900 }, { "epoch": 0.10713962537326939, "grad_norm": 6.122769355773926, "learning_rate": 0.00032113772093864996, "loss": 8.1023, "step": 5920 }, { "epoch": 0.107501583567098, "grad_norm": 10.902450561523438, "learning_rate": 0.00032222356276769015, "loss": 8.1196, "step": 5940 }, { "epoch": 0.1078635417609266, "grad_norm": 35.497840881347656, "learning_rate": 0.0003233094045967304, "loss": 8.1074, "step": 5960 }, { "epoch": 0.10822549995475522, "grad_norm": 4.4585862159729, "learning_rate": 0.00032417807805996257, "loss": 8.9866, "step": 5980 }, { "epoch": 0.10858745814858384, "grad_norm": 7.580636024475098, "learning_rate": 0.0003252639198890028, "loss": 9.1972, "step": 6000 }, { "epoch": 0.10894941634241245, "grad_norm": 4.390789031982422, "learning_rate": 0.00032634976171804305, "loss": 10.38, "step": 6020 }, { "epoch": 0.10931137453624107, "grad_norm": 3.9264190196990967, "learning_rate": 0.0003274356035470833, "loss": 9.9821, "step": 6040 }, { "epoch": 0.10967333273006968, "grad_norm": 4.207955837249756, "learning_rate": 0.00032852144537612354, "loss": 9.7519, "step": 6060 }, { "epoch": 0.11003529092389829, "grad_norm": 3.7314298152923584, "learning_rate": 0.0003292815346564517, "loss": 9.4857, "step": 6080 }, { "epoch": 0.1103972491177269, "grad_norm": NaN, "learning_rate": 0.0003296072872051638, "loss": 8.6937, "step": 6100 }, { "epoch": 0.11075920731155552, "grad_norm": NaN, "learning_rate": 0.0003296072872051638, "loss": 4.8707, "step": 6120 }, { "epoch": 0.11112116550538413, "grad_norm": NaN, "learning_rate": 0.0003296072872051638, "loss": 4.4629, "step": 6140 }, { "epoch": 0.11148312369921275, "grad_norm": NaN, "learning_rate": 0.0003297158713880678, "loss": 4.7899, "step": 6160 }, { "epoch": 0.11184508189304135, "grad_norm": NaN, "learning_rate": 0.0003297158713880678, "loss": 4.5487, "step": 6180 }, { "epoch": 0.11220704008686996, "grad_norm": NaN, "learning_rate": 0.0003297158713880678, "loss": 0.3068, "step": 6200 }, { "epoch": 0.11256899828069858, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 3.6855, "step": 6220 }, { "epoch": 0.1129309564745272, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.1705, "step": 6240 }, { "epoch": 0.11329291466835581, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 4.775, "step": 6260 }, { "epoch": 0.11365487286218441, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.8774, "step": 6280 }, { "epoch": 0.11401683105601303, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.6138, "step": 6300 }, { "epoch": 0.11437878924984164, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 4.092, "step": 6320 }, { "epoch": 0.11474074744367026, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.877, "step": 6340 }, { "epoch": 0.11510270563749887, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 0.0, "step": 6360 }, { "epoch": 0.11546466383132749, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.4496, "step": 6380 }, { "epoch": 0.11582662202515609, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 0.8283, "step": 6400 }, { "epoch": 0.1161885802189847, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.0648, "step": 6420 }, { "epoch": 0.11655053841281332, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.5593, "step": 6440 }, { "epoch": 0.11691249660664194, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 0.3318, "step": 6460 }, { "epoch": 0.11727445480047055, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 0.2933, "step": 6480 }, { "epoch": 0.11763641299429915, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 0.8112, "step": 6500 }, { "epoch": 0.11799837118812777, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 0.9949, "step": 6520 }, { "epoch": 0.11836032938195638, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.0499, "step": 6540 }, { "epoch": 0.118722287575785, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 1.1902, "step": 6560 }, { "epoch": 0.11908424576961361, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 4.6519, "step": 6580 }, { "epoch": 0.11944620396344222, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 3.6229, "step": 6600 }, { "epoch": 0.11980816215727083, "grad_norm": NaN, "learning_rate": 0.0003297701634795198, "loss": 0.0, "step": 6620 }, { "epoch": 0.12017012035109945, "grad_norm": NaN, "learning_rate": 0.00032982445557097183, "loss": 1.116, "step": 6640 }, { "epoch": 0.12053207854492806, "grad_norm": NaN, "learning_rate": 0.00032987874766242386, "loss": 2.6337, "step": 6660 }, { "epoch": 0.12089403673875668, "grad_norm": NaN, "learning_rate": 0.00032987874766242386, "loss": 2.4189, "step": 6680 }, { "epoch": 0.12125599493258529, "grad_norm": NaN, "learning_rate": 0.00032987874766242386, "loss": 0.0, "step": 6700 }, { "epoch": 0.1216179531264139, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 5.334, "step": 6720 }, { "epoch": 0.12197991132024251, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 1.2443, "step": 6740 }, { "epoch": 0.12234186951407112, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 4.497, "step": 6760 }, { "epoch": 0.12270382770789974, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 1.03, "step": 6780 }, { "epoch": 0.12306578590172836, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 0.5876, "step": 6800 }, { "epoch": 0.12342774409555696, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 3.1744, "step": 6820 }, { "epoch": 0.12378970228938557, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 1.7883, "step": 6840 }, { "epoch": 0.12415166048321419, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 0.6097, "step": 6860 }, { "epoch": 0.1245136186770428, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 2.116, "step": 6880 }, { "epoch": 0.12487557687087142, "grad_norm": NaN, "learning_rate": 0.00032993303975387584, "loss": 1.1183, "step": 6900 }, { "epoch": 0.12523753506470003, "grad_norm": NaN, "learning_rate": 0.00032998733184532787, "loss": 2.0884, "step": 6920 }, { "epoch": 0.12559949325852865, "grad_norm": NaN, "learning_rate": 0.00032998733184532787, "loss": 0.0, "step": 6940 }, { "epoch": 0.12596145145235726, "grad_norm": NaN, "learning_rate": 0.00032998733184532787, "loss": 3.2805, "step": 6960 }, { "epoch": 0.12632340964618585, "grad_norm": NaN, "learning_rate": 0.0003300416239367799, "loss": 2.9308, "step": 6980 }, { "epoch": 0.12668536784001447, "grad_norm": NaN, "learning_rate": 0.0003300416239367799, "loss": 0.2905, "step": 7000 }, { "epoch": 0.12704732603384308, "grad_norm": NaN, "learning_rate": 0.0003300416239367799, "loss": 1.5342, "step": 7020 }, { "epoch": 0.1274092842276717, "grad_norm": NaN, "learning_rate": 0.0003300416239367799, "loss": 2.1439, "step": 7040 }, { "epoch": 0.12777124242150031, "grad_norm": NaN, "learning_rate": 0.0003300416239367799, "loss": 1.8486, "step": 7060 }, { "epoch": 0.12813320061532893, "grad_norm": NaN, "learning_rate": 0.0003300416239367799, "loss": 2.2983, "step": 7080 }, { "epoch": 0.12849515880915754, "grad_norm": NaN, "learning_rate": 0.00033009591602823187, "loss": 5.8454, "step": 7100 }, { "epoch": 0.12885711700298616, "grad_norm": NaN, "learning_rate": 0.00033009591602823187, "loss": 0.0, "step": 7120 }, { "epoch": 0.12921907519681478, "grad_norm": NaN, "learning_rate": 0.00033009591602823187, "loss": 3.4919, "step": 7140 }, { "epoch": 0.1295810333906434, "grad_norm": NaN, "learning_rate": 0.00033009591602823187, "loss": 0.0, "step": 7160 }, { "epoch": 0.129942991584472, "grad_norm": NaN, "learning_rate": 0.0003301502081196839, "loss": 1.6828, "step": 7180 }, { "epoch": 0.1303049497783006, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 2.8762, "step": 7200 }, { "epoch": 0.1306669079721292, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 0.995, "step": 7220 }, { "epoch": 0.13102886616595782, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 3.0309, "step": 7240 }, { "epoch": 0.13139082435978644, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 2.1536, "step": 7260 }, { "epoch": 0.13175278255361506, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 1.7328, "step": 7280 }, { "epoch": 0.13211474074744367, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 1.4516, "step": 7300 }, { "epoch": 0.1324766989412723, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 2.3328, "step": 7320 }, { "epoch": 0.1328386571351009, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 5.0458, "step": 7340 }, { "epoch": 0.13320061532892952, "grad_norm": NaN, "learning_rate": 0.0003302045002111359, "loss": 0.6658, "step": 7360 }, { "epoch": 0.13356257352275813, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 3.0189, "step": 7380 }, { "epoch": 0.13392453171658675, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 1.0563, "step": 7400 }, { "epoch": 0.13428648991041534, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 0.2771, "step": 7420 }, { "epoch": 0.13464844810424395, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 2.2785, "step": 7440 }, { "epoch": 0.13501040629807257, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 3.2136, "step": 7460 }, { "epoch": 0.13537236449190118, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 0.4388, "step": 7480 }, { "epoch": 0.1357343226857298, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 1.5316, "step": 7500 }, { "epoch": 0.1360962808795584, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 4.91, "step": 7520 }, { "epoch": 0.13645823907338703, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 0.6, "step": 7540 }, { "epoch": 0.13682019726721564, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 1.3436, "step": 7560 }, { "epoch": 0.13718215546104426, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 0.971, "step": 7580 }, { "epoch": 0.13754411365487287, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 0.9318, "step": 7600 }, { "epoch": 0.13790607184870146, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 1.9039, "step": 7620 }, { "epoch": 0.13826803004253008, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 1.7444, "step": 7640 }, { "epoch": 0.1386299882363587, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 1.1124, "step": 7660 }, { "epoch": 0.1389919464301873, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 2.8302, "step": 7680 }, { "epoch": 0.13935390462401592, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 4.0674, "step": 7700 }, { "epoch": 0.13971586281784454, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 0.275, "step": 7720 }, { "epoch": 0.14007782101167315, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 2.7992, "step": 7740 }, { "epoch": 0.14043977920550177, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 3.474, "step": 7760 }, { "epoch": 0.14080173739933038, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 2.9148, "step": 7780 }, { "epoch": 0.141163695593159, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 0.2909, "step": 7800 }, { "epoch": 0.14152565378698762, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 2.1936, "step": 7820 }, { "epoch": 0.1418876119808162, "grad_norm": NaN, "learning_rate": 0.0003302587923025879, "loss": 4.1645, "step": 7840 }, { "epoch": 0.14224957017464482, "grad_norm": NaN, "learning_rate": 0.0003303130843940399, "loss": 4.7352, "step": 7860 }, { "epoch": 0.14261152836847343, "grad_norm": NaN, "learning_rate": 0.0003303130843940399, "loss": 0.9052, "step": 7880 }, { "epoch": 0.14297348656230205, "grad_norm": NaN, "learning_rate": 0.0003303130843940399, "loss": 3.5735, "step": 7900 }, { "epoch": 0.14333544475613066, "grad_norm": NaN, "learning_rate": 0.0003303130843940399, "loss": 0.4788, "step": 7920 }, { "epoch": 0.14369740294995928, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 2.5942, "step": 7940 }, { "epoch": 0.1440593611437879, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.5562, "step": 7960 }, { "epoch": 0.1444213193376165, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.268, "step": 7980 }, { "epoch": 0.14478327753144513, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 2.5866, "step": 8000 }, { "epoch": 0.14514523572527374, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.0, "step": 8020 }, { "epoch": 0.14550719391910236, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.2675, "step": 8040 }, { "epoch": 0.14586915211293094, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.407, "step": 8060 }, { "epoch": 0.14623111030675956, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.0, "step": 8080 }, { "epoch": 0.14659306850058818, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.6142, "step": 8100 }, { "epoch": 0.1469550266944168, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.6237, "step": 8120 }, { "epoch": 0.1473169848882454, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.5398, "step": 8140 }, { "epoch": 0.14767894308207402, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.4752, "step": 8160 }, { "epoch": 0.14804090127590264, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 2.9843, "step": 8180 }, { "epoch": 0.14840285946973125, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.2889, "step": 8200 }, { "epoch": 0.14876481766355987, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.0, "step": 8220 }, { "epoch": 0.14912677585738848, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.5344, "step": 8240 }, { "epoch": 0.1494887340512171, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 4.309, "step": 8260 }, { "epoch": 0.14985069224504569, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 4.7959, "step": 8280 }, { "epoch": 0.1502126504388743, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.2643, "step": 8300 }, { "epoch": 0.15057460863270292, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.5931, "step": 8320 }, { "epoch": 0.15093656682653153, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 3.252, "step": 8340 }, { "epoch": 0.15129852502036015, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.4066, "step": 8360 }, { "epoch": 0.15166048321418876, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.5006, "step": 8380 }, { "epoch": 0.15202244140801738, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.7442, "step": 8400 }, { "epoch": 0.152384399601846, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 0.4045, "step": 8420 }, { "epoch": 0.1527463577956746, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.252, "step": 8440 }, { "epoch": 0.15310831598950322, "grad_norm": NaN, "learning_rate": 0.00033036737648549196, "loss": 1.0349, "step": 8460 }, { "epoch": 0.1534702741833318, "grad_norm": NaN, "learning_rate": 0.00033042166857694393, "loss": 0.5888, "step": 8480 }, { "epoch": 0.15383223237716043, "grad_norm": NaN, "learning_rate": 0.00033047596066839596, "loss": 1.9747, "step": 8500 }, { "epoch": 0.15419419057098904, "grad_norm": NaN, "learning_rate": 0.00033047596066839596, "loss": 1.1524, "step": 8520 }, { "epoch": 0.15455614876481766, "grad_norm": NaN, "learning_rate": 0.00033047596066839596, "loss": 3.9692, "step": 8540 }, { "epoch": 0.15491810695864627, "grad_norm": NaN, "learning_rate": 0.00033047596066839596, "loss": 2.6825, "step": 8560 }, { "epoch": 0.1552800651524749, "grad_norm": NaN, "learning_rate": 0.00033047596066839596, "loss": 2.5432, "step": 8580 }, { "epoch": 0.1556420233463035, "grad_norm": NaN, "learning_rate": 0.000330530252759848, "loss": 3.0545, "step": 8600 }, { "epoch": 0.15600398154013212, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 1.709, "step": 8620 }, { "epoch": 0.15636593973396073, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 1.1557, "step": 8640 }, { "epoch": 0.15672789792778935, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 1.9568, "step": 8660 }, { "epoch": 0.15708985612161797, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 0.4101, "step": 8680 }, { "epoch": 0.15745181431544655, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 1.0798, "step": 8700 }, { "epoch": 0.15781377250927517, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 0.9417, "step": 8720 }, { "epoch": 0.15817573070310378, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 0.3268, "step": 8740 }, { "epoch": 0.1585376888969324, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 2.7045, "step": 8760 }, { "epoch": 0.15889964709076101, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 3.1654, "step": 8780 }, { "epoch": 0.15926160528458963, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 3.7107, "step": 8800 }, { "epoch": 0.15962356347841825, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 1.1776, "step": 8820 }, { "epoch": 0.15998552167224686, "grad_norm": NaN, "learning_rate": 0.00033058454485129996, "loss": 0.7538, "step": 8840 }, { "epoch": 0.16034747986607548, "grad_norm": NaN, "learning_rate": 0.000330638836942752, "loss": 5.9555, "step": 8860 }, { "epoch": 0.1607094380599041, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 2.4652, "step": 8880 }, { "epoch": 0.1610713962537327, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 0.6042, "step": 8900 }, { "epoch": 0.1614333544475613, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 2.164, "step": 8920 }, { "epoch": 0.1617953126413899, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 1.5732, "step": 8940 }, { "epoch": 0.16215727083521853, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 0.7035, "step": 8960 }, { "epoch": 0.16251922902904714, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 1.3199, "step": 8980 }, { "epoch": 0.16288118722287576, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 2.9008, "step": 9000 }, { "epoch": 0.16324314541670437, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 2.8915, "step": 9020 }, { "epoch": 0.163605103610533, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 0.6013, "step": 9040 }, { "epoch": 0.1639670618043616, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 0.7603, "step": 9060 }, { "epoch": 0.16432901999819022, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 2.1826, "step": 9080 }, { "epoch": 0.16469097819201883, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 3.9643, "step": 9100 }, { "epoch": 0.16505293638584742, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 1.6171, "step": 9120 }, { "epoch": 0.16541489457967604, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 0.5245, "step": 9140 }, { "epoch": 0.16577685277350465, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 2.243, "step": 9160 }, { "epoch": 0.16613881096733327, "grad_norm": NaN, "learning_rate": 0.000330693129034204, "loss": 2.6442, "step": 9180 }, { "epoch": 0.16650076916116188, "grad_norm": NaN, "learning_rate": 0.000330747421125656, "loss": 1.4556, "step": 9200 }, { "epoch": 0.1668627273549905, "grad_norm": NaN, "learning_rate": 0.000330747421125656, "loss": 2.888, "step": 9220 }, { "epoch": 0.1672246855488191, "grad_norm": NaN, "learning_rate": 0.000330747421125656, "loss": 0.8837, "step": 9240 }, { "epoch": 0.16758664374264773, "grad_norm": NaN, "learning_rate": 0.000330747421125656, "loss": 3.352, "step": 9260 }, { "epoch": 0.16794860193647634, "grad_norm": NaN, "learning_rate": 0.000330747421125656, "loss": 1.1593, "step": 9280 }, { "epoch": 0.16831056013030496, "grad_norm": NaN, "learning_rate": 0.000330801713217108, "loss": 1.2725, "step": 9300 }, { "epoch": 0.16867251832413357, "grad_norm": NaN, "learning_rate": 0.000330801713217108, "loss": 2.86, "step": 9320 }, { "epoch": 0.16903447651796216, "grad_norm": NaN, "learning_rate": 0.000330801713217108, "loss": 1.3259, "step": 9340 }, { "epoch": 0.16939643471179078, "grad_norm": NaN, "learning_rate": 0.000330801713217108, "loss": 1.9639, "step": 9360 }, { "epoch": 0.1697583929056194, "grad_norm": NaN, "learning_rate": 0.000330801713217108, "loss": 1.0361, "step": 9380 }, { "epoch": 0.170120351099448, "grad_norm": NaN, "learning_rate": 0.000330801713217108, "loss": 0.8044, "step": 9400 }, { "epoch": 0.17048230929327662, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 2.3303, "step": 9420 }, { "epoch": 0.17084426748710524, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 2.9078, "step": 9440 }, { "epoch": 0.17120622568093385, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 0.4421, "step": 9460 }, { "epoch": 0.17156818387476247, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 5.0288, "step": 9480 }, { "epoch": 0.17193014206859109, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 0.8455, "step": 9500 }, { "epoch": 0.1722921002624197, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 3.2879, "step": 9520 }, { "epoch": 0.17265405845624832, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 3.2948, "step": 9540 }, { "epoch": 0.1730160166500769, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 1.6391, "step": 9560 }, { "epoch": 0.17337797484390552, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 0.8699, "step": 9580 }, { "epoch": 0.17373993303773413, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 0.3308, "step": 9600 }, { "epoch": 0.17410189123156275, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 0.5138, "step": 9620 }, { "epoch": 0.17446384942539137, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 2.5083, "step": 9640 }, { "epoch": 0.17482580761921998, "grad_norm": NaN, "learning_rate": 0.00033085600530856005, "loss": 2.8029, "step": 9660 }, { "epoch": 0.1751877658130486, "grad_norm": NaN, "learning_rate": 0.0003309102974000121, "loss": 2.2681, "step": 9680 }, { "epoch": 0.1755497240068772, "grad_norm": NaN, "learning_rate": 0.0003309102974000121, "loss": 0.0, "step": 9700 }, { "epoch": 0.17591168220070583, "grad_norm": NaN, "learning_rate": 0.0003309102974000121, "loss": 0.7887, "step": 9720 }, { "epoch": 0.17627364039453444, "grad_norm": NaN, "learning_rate": 0.0003309102974000121, "loss": 3.1913, "step": 9740 }, { "epoch": 0.17663559858836306, "grad_norm": NaN, "learning_rate": 0.0003309102974000121, "loss": 0.0, "step": 9760 }, { "epoch": 0.17699755678219165, "grad_norm": NaN, "learning_rate": 0.0003309102974000121, "loss": 0.4424, "step": 9780 }, { "epoch": 0.17735951497602026, "grad_norm": NaN, "learning_rate": 0.0003309102974000121, "loss": 2.4798, "step": 9800 }, { "epoch": 0.17772147316984888, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 2.4174, "step": 9820 }, { "epoch": 0.1780834313636775, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 1.3177, "step": 9840 }, { "epoch": 0.1784453895575061, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 3.8794, "step": 9860 }, { "epoch": 0.17880734775133472, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 1.0543, "step": 9880 }, { "epoch": 0.17916930594516334, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 1.4625, "step": 9900 }, { "epoch": 0.17953126413899195, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 3.5846, "step": 9920 }, { "epoch": 0.17989322233282057, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 1.6589, "step": 9940 }, { "epoch": 0.18025518052664918, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 3.6713, "step": 9960 }, { "epoch": 0.18061713872047777, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 0.0, "step": 9980 }, { "epoch": 0.1809790969143064, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 0.404, "step": 10000 }, { "epoch": 0.1809790969143064, "eval_accuracy": 4.7270488930998444e-05, "eval_loss": NaN, "eval_runtime": 168.7884, "eval_samples_per_second": 3601.195, "eval_steps_per_second": 3.519, "step": 10000 }, { "epoch": 0.181341055108135, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 3.2789, "step": 10020 }, { "epoch": 0.18170301330196362, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 3.5452, "step": 10040 }, { "epoch": 0.18206497149579223, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 1.9675, "step": 10060 }, { "epoch": 0.18242692968962085, "grad_norm": NaN, "learning_rate": 0.0003309645894914641, "loss": 2.5327, "step": 10080 }, { "epoch": 0.18278888788344946, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 2.1478, "step": 10100 }, { "epoch": 0.18315084607727808, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.2229, "step": 10120 }, { "epoch": 0.1835128042711067, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.5796, "step": 10140 }, { "epoch": 0.1838747624649353, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.5845, "step": 10160 }, { "epoch": 0.18423672065876393, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.4512, "step": 10180 }, { "epoch": 0.1845986788525925, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.1098, "step": 10200 }, { "epoch": 0.18496063704642113, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.8807, "step": 10220 }, { "epoch": 0.18532259524024974, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.7685, "step": 10240 }, { "epoch": 0.18568455343407836, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.319, "step": 10260 }, { "epoch": 0.18604651162790697, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.8892, "step": 10280 }, { "epoch": 0.1864084698217356, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.5865, "step": 10300 }, { "epoch": 0.1867704280155642, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.2616, "step": 10320 }, { "epoch": 0.18713238620939282, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 2.8315, "step": 10340 }, { "epoch": 0.18749434440322144, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.0, "step": 10360 }, { "epoch": 0.18785630259705005, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.6023, "step": 10380 }, { "epoch": 0.18821826079087867, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.307, "step": 10400 }, { "epoch": 0.18858021898470725, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.6094, "step": 10420 }, { "epoch": 0.18894217717853587, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 1.7019, "step": 10440 }, { "epoch": 0.18930413537236448, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.7946, "step": 10460 }, { "epoch": 0.1896660935661931, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 2.6347, "step": 10480 }, { "epoch": 0.19002805176002172, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.5617, "step": 10500 }, { "epoch": 0.19039000995385033, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 0.7904, "step": 10520 }, { "epoch": 0.19075196814767895, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 3.4674, "step": 10540 }, { "epoch": 0.19111392634150756, "grad_norm": NaN, "learning_rate": 0.0003310188815829161, "loss": 3.9894, "step": 10560 }, { "epoch": 0.19147588453533618, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 2.3749, "step": 10580 }, { "epoch": 0.1918378427291648, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 3.2962, "step": 10600 }, { "epoch": 0.1921998009229934, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 0.3296, "step": 10620 }, { "epoch": 0.192561759116822, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 0.8688, "step": 10640 }, { "epoch": 0.1929237173106506, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 2.4233, "step": 10660 }, { "epoch": 0.19328567550447923, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 2.3217, "step": 10680 }, { "epoch": 0.19364763369830784, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 0.3312, "step": 10700 }, { "epoch": 0.19400959189213646, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 1.5754, "step": 10720 }, { "epoch": 0.19437155008596507, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 1.2652, "step": 10740 }, { "epoch": 0.1947335082797937, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 0.0, "step": 10760 }, { "epoch": 0.1950954664736223, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 3.0766, "step": 10780 }, { "epoch": 0.19545742466745092, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 0.8772, "step": 10800 }, { "epoch": 0.19581938286127953, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 3.1372, "step": 10820 }, { "epoch": 0.19618134105510812, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 1.7832, "step": 10840 }, { "epoch": 0.19654329924893674, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 1.7805, "step": 10860 }, { "epoch": 0.19690525744276535, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 0.7744, "step": 10880 }, { "epoch": 0.19726721563659397, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 0.7481, "step": 10900 }, { "epoch": 0.19762917383042258, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 1.2984, "step": 10920 }, { "epoch": 0.1979911320242512, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 3.8704, "step": 10940 }, { "epoch": 0.19835309021807981, "grad_norm": NaN, "learning_rate": 0.0003310731736743681, "loss": 1.5112, "step": 10960 }, { "epoch": 0.19871504841190843, "grad_norm": NaN, "learning_rate": 0.00033112746576582013, "loss": 2.7037, "step": 10980 }, { "epoch": 0.19907700660573704, "grad_norm": NaN, "learning_rate": 0.00033112746576582013, "loss": 0.8565, "step": 11000 }, { "epoch": 0.19943896479956566, "grad_norm": NaN, "learning_rate": 0.00033112746576582013, "loss": 1.2738, "step": 11020 }, { "epoch": 0.19980092299339428, "grad_norm": NaN, "learning_rate": 0.00033112746576582013, "loss": 0.7074, "step": 11040 }, { "epoch": 0.20016288118722286, "grad_norm": NaN, "learning_rate": 0.00033112746576582013, "loss": 2.5413, "step": 11060 }, { "epoch": 0.20052483938105148, "grad_norm": NaN, "learning_rate": 0.00033112746576582013, "loss": 0.0, "step": 11080 }, { "epoch": 0.2008867975748801, "grad_norm": NaN, "learning_rate": 0.00033112746576582013, "loss": 3.7606, "step": 11100 }, { "epoch": 0.2012487557687087, "grad_norm": NaN, "learning_rate": 0.0003311817578572721, "loss": 2.7023, "step": 11120 }, { "epoch": 0.20161071396253732, "grad_norm": NaN, "learning_rate": 0.0003311817578572721, "loss": 1.2409, "step": 11140 }, { "epoch": 0.20197267215636594, "grad_norm": NaN, "learning_rate": 0.0003311817578572721, "loss": 2.6445, "step": 11160 }, { "epoch": 0.20233463035019456, "grad_norm": NaN, "learning_rate": 0.00033123604994872414, "loss": 3.1012, "step": 11180 }, { "epoch": 0.20269658854402317, "grad_norm": NaN, "learning_rate": 0.00033123604994872414, "loss": 2.177, "step": 11200 }, { "epoch": 0.20305854673785179, "grad_norm": NaN, "learning_rate": 0.00033123604994872414, "loss": 1.7604, "step": 11220 }, { "epoch": 0.2034205049316804, "grad_norm": NaN, "learning_rate": 0.00033123604994872414, "loss": 2.6092, "step": 11240 }, { "epoch": 0.20378246312550902, "grad_norm": NaN, "learning_rate": 0.00033123604994872414, "loss": 0.0, "step": 11260 }, { "epoch": 0.2041444213193376, "grad_norm": NaN, "learning_rate": 0.00033123604994872414, "loss": 1.3965, "step": 11280 }, { "epoch": 0.20450637951316622, "grad_norm": NaN, "learning_rate": 0.00033129034204017617, "loss": 2.6337, "step": 11300 }, { "epoch": 0.20486833770699484, "grad_norm": NaN, "learning_rate": 0.00033129034204017617, "loss": 0.2649, "step": 11320 }, { "epoch": 0.20523029590082345, "grad_norm": NaN, "learning_rate": 0.00033129034204017617, "loss": 2.3625, "step": 11340 }, { "epoch": 0.20559225409465207, "grad_norm": NaN, "learning_rate": 0.00033129034204017617, "loss": 1.1838, "step": 11360 }, { "epoch": 0.20595421228848068, "grad_norm": NaN, "learning_rate": 0.00033129034204017617, "loss": 0.4392, "step": 11380 }, { "epoch": 0.2063161704823093, "grad_norm": NaN, "learning_rate": 0.00033129034204017617, "loss": 4.5749, "step": 11400 }, { "epoch": 0.2066781286761379, "grad_norm": NaN, "learning_rate": 0.00033129034204017617, "loss": 1.9525, "step": 11420 }, { "epoch": 0.20704008686996653, "grad_norm": NaN, "learning_rate": 0.00033129034204017617, "loss": 1.268, "step": 11440 }, { "epoch": 0.20740204506379514, "grad_norm": NaN, "learning_rate": 0.00033134463413162814, "loss": 5.2503, "step": 11460 }, { "epoch": 0.20776400325762376, "grad_norm": NaN, "learning_rate": 0.00033134463413162814, "loss": 2.5942, "step": 11480 }, { "epoch": 0.20812596145145235, "grad_norm": NaN, "learning_rate": 0.00033134463413162814, "loss": 0.9446, "step": 11500 }, { "epoch": 0.20848791964528096, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 2.5663, "step": 11520 }, { "epoch": 0.20884987783910958, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.8402, "step": 11540 }, { "epoch": 0.2092118360329382, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.3537, "step": 11560 }, { "epoch": 0.2095737942267668, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.8712, "step": 11580 }, { "epoch": 0.20993575242059542, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.3149, "step": 11600 }, { "epoch": 0.21029771061442404, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.7537, "step": 11620 }, { "epoch": 0.21065966880825265, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.6643, "step": 11640 }, { "epoch": 0.21102162700208127, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.9121, "step": 11660 }, { "epoch": 0.21138358519590988, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.2808, "step": 11680 }, { "epoch": 0.21174554338973847, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 2.5578, "step": 11700 }, { "epoch": 0.2121075015835671, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.8118, "step": 11720 }, { "epoch": 0.2124694597773957, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.6818, "step": 11740 }, { "epoch": 0.21283141797122432, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.0, "step": 11760 }, { "epoch": 0.21319337616505293, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.3759, "step": 11780 }, { "epoch": 0.21355533435888155, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.0226, "step": 11800 }, { "epoch": 0.21391729255271016, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 3.8345, "step": 11820 }, { "epoch": 0.21427925074653878, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.3489, "step": 11840 }, { "epoch": 0.2146412089403674, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.5576, "step": 11860 }, { "epoch": 0.215003167134196, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.2773, "step": 11880 }, { "epoch": 0.21536512532802463, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.1728, "step": 11900 }, { "epoch": 0.2157270835218532, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.6381, "step": 11920 }, { "epoch": 0.21608904171568183, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 2.5759, "step": 11940 }, { "epoch": 0.21645099990951044, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.3899, "step": 11960 }, { "epoch": 0.21681295810333906, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 3.7077, "step": 11980 }, { "epoch": 0.21717491629716767, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.7368, "step": 12000 }, { "epoch": 0.2175368744909963, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.2641, "step": 12020 }, { "epoch": 0.2178988326848249, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.3131, "step": 12040 }, { "epoch": 0.21826079087865352, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.3886, "step": 12060 }, { "epoch": 0.21862274907248214, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.5487, "step": 12080 }, { "epoch": 0.21898470726631075, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.0, "step": 12100 }, { "epoch": 0.21934666546013937, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 2.7379, "step": 12120 }, { "epoch": 0.21970862365396795, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.8479, "step": 12140 }, { "epoch": 0.22007058184779657, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 3.5232, "step": 12160 }, { "epoch": 0.22043254004162519, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 1.9266, "step": 12180 }, { "epoch": 0.2207944982354538, "grad_norm": NaN, "learning_rate": 0.00033139892622308017, "loss": 0.7972, "step": 12200 }, { "epoch": 0.22115645642928242, "grad_norm": NaN, "learning_rate": 0.0003314532183145322, "loss": 3.8739, "step": 12220 }, { "epoch": 0.22151841462311103, "grad_norm": NaN, "learning_rate": 0.0003314532183145322, "loss": 2.3801, "step": 12240 }, { "epoch": 0.22188037281693965, "grad_norm": NaN, "learning_rate": 0.0003314532183145322, "loss": 0.0, "step": 12260 }, { "epoch": 0.22224233101076826, "grad_norm": NaN, "learning_rate": 0.0003314532183145322, "loss": 1.317, "step": 12280 }, { "epoch": 0.22260428920459688, "grad_norm": NaN, "learning_rate": 0.0003314532183145322, "loss": 3.5084, "step": 12300 }, { "epoch": 0.2229662473984255, "grad_norm": NaN, "learning_rate": 0.0003314532183145322, "loss": 2.331, "step": 12320 }, { "epoch": 0.22332820559225408, "grad_norm": NaN, "learning_rate": 0.00033150751040598417, "loss": 3.5959, "step": 12340 }, { "epoch": 0.2236901637860827, "grad_norm": NaN, "learning_rate": 0.00033150751040598417, "loss": 3.1326, "step": 12360 }, { "epoch": 0.2240521219799113, "grad_norm": NaN, "learning_rate": 0.00033150751040598417, "loss": 1.1822, "step": 12380 }, { "epoch": 0.22441408017373993, "grad_norm": NaN, "learning_rate": 0.0003315618024974362, "loss": 2.9039, "step": 12400 }, { "epoch": 0.22477603836756854, "grad_norm": NaN, "learning_rate": 0.0003315618024974362, "loss": 2.8964, "step": 12420 }, { "epoch": 0.22513799656139716, "grad_norm": NaN, "learning_rate": 0.0003315618024974362, "loss": 1.1023, "step": 12440 }, { "epoch": 0.22549995475522577, "grad_norm": NaN, "learning_rate": 0.0003315618024974362, "loss": 1.3736, "step": 12460 }, { "epoch": 0.2258619129490544, "grad_norm": NaN, "learning_rate": 0.0003315618024974362, "loss": 1.3162, "step": 12480 }, { "epoch": 0.226223871142883, "grad_norm": NaN, "learning_rate": 0.0003315618024974362, "loss": 0.375, "step": 12500 }, { "epoch": 0.22658582933671162, "grad_norm": NaN, "learning_rate": 0.00033161609458888817, "loss": 1.5488, "step": 12520 }, { "epoch": 0.22694778753054023, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 3.4129, "step": 12540 }, { "epoch": 0.22730974572436882, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 1.3923, "step": 12560 }, { "epoch": 0.22767170391819744, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 0.4053, "step": 12580 }, { "epoch": 0.22803366211202605, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 1.6378, "step": 12600 }, { "epoch": 0.22839562030585467, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 0.0, "step": 12620 }, { "epoch": 0.22875757849968328, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 1.8906, "step": 12640 }, { "epoch": 0.2291195366935119, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 0.9872, "step": 12660 }, { "epoch": 0.22948149488734051, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 2.6298, "step": 12680 }, { "epoch": 0.22984345308116913, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 1.3228, "step": 12700 }, { "epoch": 0.23020541127499775, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 1.5571, "step": 12720 }, { "epoch": 0.23056736946882636, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 1.2987, "step": 12740 }, { "epoch": 0.23092932766265498, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 0.9939, "step": 12760 }, { "epoch": 0.23129128585648356, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 1.3592, "step": 12780 }, { "epoch": 0.23165324405031218, "grad_norm": NaN, "learning_rate": 0.0003316703866803402, "loss": 1.5959, "step": 12800 }, { "epoch": 0.2320152022441408, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 6.2469, "step": 12820 }, { "epoch": 0.2323771604379694, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 1.7704, "step": 12840 }, { "epoch": 0.23273911863179803, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 2.1183, "step": 12860 }, { "epoch": 0.23310107682562664, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 0.0, "step": 12880 }, { "epoch": 0.23346303501945526, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 1.8694, "step": 12900 }, { "epoch": 0.23382499321328387, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 1.0327, "step": 12920 }, { "epoch": 0.2341869514071125, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 3.3219, "step": 12940 }, { "epoch": 0.2345489096009411, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 0.7038, "step": 12960 }, { "epoch": 0.23491086779476972, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 0.8102, "step": 12980 }, { "epoch": 0.2352728259885983, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 2.6303, "step": 13000 }, { "epoch": 0.23563478418242692, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 2.6235, "step": 13020 }, { "epoch": 0.23599674237625554, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 3.3297, "step": 13040 }, { "epoch": 0.23635870057008415, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 0.6631, "step": 13060 }, { "epoch": 0.23672065876391277, "grad_norm": NaN, "learning_rate": 0.00033172467877179223, "loss": 0.8192, "step": 13080 }, { "epoch": 0.23708261695774138, "grad_norm": NaN, "learning_rate": 0.0003317789708632442, "loss": 1.9897, "step": 13100 }, { "epoch": 0.23744457515157, "grad_norm": NaN, "learning_rate": 0.0003317789708632442, "loss": 2.9406, "step": 13120 }, { "epoch": 0.2378065333453986, "grad_norm": NaN, "learning_rate": 0.0003317789708632442, "loss": 2.8512, "step": 13140 }, { "epoch": 0.23816849153922723, "grad_norm": NaN, "learning_rate": 0.0003317789708632442, "loss": 2.354, "step": 13160 }, { "epoch": 0.23853044973305584, "grad_norm": NaN, "learning_rate": 0.0003317789708632442, "loss": 2.9572, "step": 13180 }, { "epoch": 0.23889240792688443, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 1.8117, "step": 13200 }, { "epoch": 0.23925436612071305, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 4.224, "step": 13220 }, { "epoch": 0.23961632431454166, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 4.0798, "step": 13240 }, { "epoch": 0.23997828250837028, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 1.3255, "step": 13260 }, { "epoch": 0.2403402407021989, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 0.2523, "step": 13280 }, { "epoch": 0.2407021988960275, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 0.2934, "step": 13300 }, { "epoch": 0.24106415708985612, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 2.6591, "step": 13320 }, { "epoch": 0.24142611528368474, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 1.6821, "step": 13340 }, { "epoch": 0.24178807347751335, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 1.0646, "step": 13360 }, { "epoch": 0.24215003167134197, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 2.028, "step": 13380 }, { "epoch": 0.24251198986517059, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 2.2866, "step": 13400 }, { "epoch": 0.24287394805899917, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 4.1451, "step": 13420 }, { "epoch": 0.2432359062528278, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 0.7085, "step": 13440 }, { "epoch": 0.2435978644466564, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 1.7351, "step": 13460 }, { "epoch": 0.24395982264048502, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 0.8787, "step": 13480 }, { "epoch": 0.24432178083431363, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 1.1582, "step": 13500 }, { "epoch": 0.24468373902814225, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 1.0302, "step": 13520 }, { "epoch": 0.24504569722197087, "grad_norm": NaN, "learning_rate": 0.00033183326295469623, "loss": 1.3266, "step": 13540 }, { "epoch": 0.24540765541579948, "grad_norm": NaN, "learning_rate": 0.00033188755504614826, "loss": 1.748, "step": 13560 }, { "epoch": 0.2457696136096281, "grad_norm": NaN, "learning_rate": 0.00033188755504614826, "loss": 1.9466, "step": 13580 }, { "epoch": 0.2461315718034567, "grad_norm": NaN, "learning_rate": 0.00033188755504614826, "loss": 1.1822, "step": 13600 }, { "epoch": 0.24649352999728533, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.3307, "step": 13620 }, { "epoch": 0.24685548819111391, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 0.9708, "step": 13640 }, { "epoch": 0.24721744638494253, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.1441, "step": 13660 }, { "epoch": 0.24757940457877115, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.3153, "step": 13680 }, { "epoch": 0.24794136277259976, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.1421, "step": 13700 }, { "epoch": 0.24830332096642838, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.6188, "step": 13720 }, { "epoch": 0.248665279160257, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 0.4062, "step": 13740 }, { "epoch": 0.2490272373540856, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.3454, "step": 13760 }, { "epoch": 0.24938919554791422, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 0.2786, "step": 13780 }, { "epoch": 0.24975115374174284, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 0.5933, "step": 13800 }, { "epoch": 0.25011311193557145, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.0515, "step": 13820 }, { "epoch": 0.25047507012940007, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.6173, "step": 13840 }, { "epoch": 0.2508370283232287, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.317, "step": 13860 }, { "epoch": 0.2511989865170573, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.1028, "step": 13880 }, { "epoch": 0.2515609447108859, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.4195, "step": 13900 }, { "epoch": 0.25192290290471453, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.739, "step": 13920 }, { "epoch": 0.25228486109854315, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 0.4393, "step": 13940 }, { "epoch": 0.2526468192923717, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.8693, "step": 13960 }, { "epoch": 0.2530087774862003, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.4178, "step": 13980 }, { "epoch": 0.25337073568002894, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 1.0575, "step": 14000 }, { "epoch": 0.25373269387385755, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.0835, "step": 14020 }, { "epoch": 0.25409465206768617, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.8238, "step": 14040 }, { "epoch": 0.2544566102615148, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 0.6661, "step": 14060 }, { "epoch": 0.2548185684553434, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 0.4801, "step": 14080 }, { "epoch": 0.255180526649172, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 0.0, "step": 14100 }, { "epoch": 0.25554248484300063, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.0042, "step": 14120 }, { "epoch": 0.25590444303682924, "grad_norm": NaN, "learning_rate": 0.0003319418471376003, "loss": 2.0685, "step": 14140 }, { "epoch": 0.25626640123065786, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 5.5943, "step": 14160 }, { "epoch": 0.2566283594244865, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 2.1728, "step": 14180 }, { "epoch": 0.2569903176183151, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 1.4864, "step": 14200 }, { "epoch": 0.2573522758121437, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 1.4339, "step": 14220 }, { "epoch": 0.2577142340059723, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 2.5961, "step": 14240 }, { "epoch": 0.25807619219980094, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 3.9077, "step": 14260 }, { "epoch": 0.25843815039362955, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 2.087, "step": 14280 }, { "epoch": 0.25880010858745817, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 1.0479, "step": 14300 }, { "epoch": 0.2591620667812868, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 0.2763, "step": 14320 }, { "epoch": 0.2595240249751154, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 1.8039, "step": 14340 }, { "epoch": 0.259885983168944, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 0.7725, "step": 14360 }, { "epoch": 0.2602479413627726, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 0.7769, "step": 14380 }, { "epoch": 0.2606098995566012, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 1.6173, "step": 14400 }, { "epoch": 0.2609718577504298, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 3.1579, "step": 14420 }, { "epoch": 0.2613338159442584, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 2.5452, "step": 14440 }, { "epoch": 0.26169577413808703, "grad_norm": NaN, "learning_rate": 0.0003319961392290523, "loss": 2.3302, "step": 14460 }, { "epoch": 0.26205773233191565, "grad_norm": NaN, "learning_rate": 0.00033205043132050434, "loss": 1.2823, "step": 14480 }, { "epoch": 0.26241969052574426, "grad_norm": NaN, "learning_rate": 0.00033205043132050434, "loss": 0.0, "step": 14500 }, { "epoch": 0.2627816487195729, "grad_norm": NaN, "learning_rate": 0.00033205043132050434, "loss": 3.6047, "step": 14520 }, { "epoch": 0.2631436069134015, "grad_norm": NaN, "learning_rate": 0.00033205043132050434, "loss": 0.3507, "step": 14540 }, { "epoch": 0.2635055651072301, "grad_norm": NaN, "learning_rate": 0.00033205043132050434, "loss": 2.2048, "step": 14560 }, { "epoch": 0.2638675233010587, "grad_norm": NaN, "learning_rate": 0.00033205043132050434, "loss": 0.0, "step": 14580 }, { "epoch": 0.26422948149488734, "grad_norm": NaN, "learning_rate": 0.00033205043132050434, "loss": 3.138, "step": 14600 }, { "epoch": 0.26459143968871596, "grad_norm": NaN, "learning_rate": 0.00033205043132050434, "loss": 2.2094, "step": 14620 }, { "epoch": 0.2649533978825446, "grad_norm": NaN, "learning_rate": 0.0003321047234119563, "loss": 1.2991, "step": 14640 }, { "epoch": 0.2653153560763732, "grad_norm": NaN, "learning_rate": 0.0003321047234119563, "loss": 1.9475, "step": 14660 }, { "epoch": 0.2656773142702018, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 1.4229, "step": 14680 }, { "epoch": 0.2660392724640304, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 1.2296, "step": 14700 }, { "epoch": 0.26640123065785903, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 0.9691, "step": 14720 }, { "epoch": 0.26676318885168765, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 3.0861, "step": 14740 }, { "epoch": 0.26712514704551626, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 3.157, "step": 14760 }, { "epoch": 0.2674871052393449, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 3.5982, "step": 14780 }, { "epoch": 0.2678490634331735, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 2.4855, "step": 14800 }, { "epoch": 0.26821102162700206, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 1.3317, "step": 14820 }, { "epoch": 0.26857297982083067, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 1.0104, "step": 14840 }, { "epoch": 0.2689349380146593, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 2.0084, "step": 14860 }, { "epoch": 0.2692968962084879, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 1.3151, "step": 14880 }, { "epoch": 0.2696588544023165, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 0.8811, "step": 14900 }, { "epoch": 0.27002081259614513, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 0.262, "step": 14920 }, { "epoch": 0.27038277078997375, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 1.6678, "step": 14940 }, { "epoch": 0.27074472898380236, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 3.2316, "step": 14960 }, { "epoch": 0.271106687177631, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 2.78, "step": 14980 }, { "epoch": 0.2714686453714596, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 3.7747, "step": 15000 }, { "epoch": 0.2714686453714596, "eval_accuracy": 4.456280630502007e-05, "eval_loss": NaN, "eval_runtime": 169.7419, "eval_samples_per_second": 3580.967, "eval_steps_per_second": 3.499, "step": 15000 }, { "epoch": 0.2718306035652882, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 2.1612, "step": 15020 }, { "epoch": 0.2721925617591168, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 2.3457, "step": 15040 }, { "epoch": 0.27255451995294544, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 2.66, "step": 15060 }, { "epoch": 0.27291647814677406, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 0.7467, "step": 15080 }, { "epoch": 0.27327843634060267, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 1.0473, "step": 15100 }, { "epoch": 0.2736403945344313, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 4.2817, "step": 15120 }, { "epoch": 0.2740023527282599, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 1.9683, "step": 15140 }, { "epoch": 0.2743643109220885, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 2.2001, "step": 15160 }, { "epoch": 0.27472626911591713, "grad_norm": NaN, "learning_rate": 0.00033215901550340835, "loss": 2.1365, "step": 15180 }, { "epoch": 0.27508822730974575, "grad_norm": NaN, "learning_rate": 0.0003322133075948604, "loss": 1.5323, "step": 15200 }, { "epoch": 0.27545018550357436, "grad_norm": NaN, "learning_rate": 0.0003322133075948604, "loss": 0.2926, "step": 15220 }, { "epoch": 0.2758121436974029, "grad_norm": NaN, "learning_rate": 0.0003322133075948604, "loss": 1.203, "step": 15240 }, { "epoch": 0.27617410189123154, "grad_norm": NaN, "learning_rate": 0.00033226759968631235, "loss": 6.2523, "step": 15260 }, { "epoch": 0.27653606008506015, "grad_norm": NaN, "learning_rate": 0.00033226759968631235, "loss": 1.8917, "step": 15280 }, { "epoch": 0.27689801827888877, "grad_norm": NaN, "learning_rate": 0.00033226759968631235, "loss": 0.787, "step": 15300 }, { "epoch": 0.2772599764727174, "grad_norm": NaN, "learning_rate": 0.00033226759968631235, "loss": 0.7079, "step": 15320 }, { "epoch": 0.277621934666546, "grad_norm": NaN, "learning_rate": 0.00033226759968631235, "loss": 4.1041, "step": 15340 }, { "epoch": 0.2779838928603746, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 6.8995, "step": 15360 }, { "epoch": 0.27834585105420323, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 0.0, "step": 15380 }, { "epoch": 0.27870780924803185, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 4.1352, "step": 15400 }, { "epoch": 0.27906976744186046, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 2.4992, "step": 15420 }, { "epoch": 0.2794317256356891, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 2.4702, "step": 15440 }, { "epoch": 0.2797936838295177, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 0.307, "step": 15460 }, { "epoch": 0.2801556420233463, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 2.3855, "step": 15480 }, { "epoch": 0.2805176002171749, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 0.0, "step": 15500 }, { "epoch": 0.28087955841100354, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 0.0, "step": 15520 }, { "epoch": 0.28124151660483215, "grad_norm": NaN, "learning_rate": 0.0003323761838692164, "loss": 0.528, "step": 15540 }, { "epoch": 0.28160347479866077, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 1.8322, "step": 15560 }, { "epoch": 0.2819654329924894, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 4.091, "step": 15580 }, { "epoch": 0.282327391186318, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 1.9259, "step": 15600 }, { "epoch": 0.2826893493801466, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 0.9673, "step": 15620 }, { "epoch": 0.28305130757397523, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 0.6619, "step": 15640 }, { "epoch": 0.28341326576780385, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 2.4704, "step": 15660 }, { "epoch": 0.2837752239616324, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 0.7316, "step": 15680 }, { "epoch": 0.284137182155461, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 3.7923, "step": 15700 }, { "epoch": 0.28449914034928964, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 6.1166, "step": 15720 }, { "epoch": 0.28486109854311825, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 0.0, "step": 15740 }, { "epoch": 0.28522305673694687, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 0.0, "step": 15760 }, { "epoch": 0.2855850149307755, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 2.8713, "step": 15780 }, { "epoch": 0.2859469731246041, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 3.2676, "step": 15800 }, { "epoch": 0.2863089313184327, "grad_norm": NaN, "learning_rate": 0.0003324304759606684, "loss": 1.0438, "step": 15820 }, { "epoch": 0.28667088951226133, "grad_norm": NaN, "learning_rate": 0.0003324847680521204, "loss": 2.7696, "step": 15840 }, { "epoch": 0.28703284770608994, "grad_norm": NaN, "learning_rate": 0.0003324847680521204, "loss": 1.5355, "step": 15860 }, { "epoch": 0.28739480589991856, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 6.3647, "step": 15880 }, { "epoch": 0.2877567640937472, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 0.8543, "step": 15900 }, { "epoch": 0.2881187222875758, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 2.5328, "step": 15920 }, { "epoch": 0.2884806804814044, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 1.297, "step": 15940 }, { "epoch": 0.288842638675233, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 1.5096, "step": 15960 }, { "epoch": 0.28920459686906164, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 1.5733, "step": 15980 }, { "epoch": 0.28956655506289025, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 1.1257, "step": 16000 }, { "epoch": 0.28992851325671887, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 3.0097, "step": 16020 }, { "epoch": 0.2902904714505475, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 0.5271, "step": 16040 }, { "epoch": 0.2906524296443761, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 0.5818, "step": 16060 }, { "epoch": 0.2910143878382047, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 3.8861, "step": 16080 }, { "epoch": 0.2913763460320333, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 3.7348, "step": 16100 }, { "epoch": 0.2917383042258619, "grad_norm": NaN, "learning_rate": 0.0003325390601435724, "loss": 1.6733, "step": 16120 }, { "epoch": 0.2921002624196905, "grad_norm": NaN, "learning_rate": 0.0003325933522350244, "loss": 1.5756, "step": 16140 }, { "epoch": 0.2924622206135191, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 2.7821, "step": 16160 }, { "epoch": 0.29282417880734773, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 1.8281, "step": 16180 }, { "epoch": 0.29318613700117635, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 3.9718, "step": 16200 }, { "epoch": 0.29354809519500497, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 3.195, "step": 16220 }, { "epoch": 0.2939100533888336, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 3.588, "step": 16240 }, { "epoch": 0.2942720115826622, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 0.404, "step": 16260 }, { "epoch": 0.2946339697764908, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 1.2487, "step": 16280 }, { "epoch": 0.2949959279703194, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 3.2057, "step": 16300 }, { "epoch": 0.29535788616414804, "grad_norm": NaN, "learning_rate": 0.00033264764432647644, "loss": 3.3175, "step": 16320 }, { "epoch": 0.29571984435797666, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 3.0467, "step": 16340 }, { "epoch": 0.2960818025518053, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 4.2457, "step": 16360 }, { "epoch": 0.2964437607456339, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 2.6049, "step": 16380 }, { "epoch": 0.2968057189394625, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 0.704, "step": 16400 }, { "epoch": 0.2971676771332911, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 0.0, "step": 16420 }, { "epoch": 0.29752963532711973, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 1.0099, "step": 16440 }, { "epoch": 0.29789159352094835, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 1.926, "step": 16460 }, { "epoch": 0.29825355171477697, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 1.7635, "step": 16480 }, { "epoch": 0.2986155099086056, "grad_norm": NaN, "learning_rate": 0.0003327019364179284, "loss": 2.0268, "step": 16500 }, { "epoch": 0.2989774681024342, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 3.157, "step": 16520 }, { "epoch": 0.29933942629626276, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 1.325, "step": 16540 }, { "epoch": 0.29970138449009137, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 0.8732, "step": 16560 }, { "epoch": 0.30006334268392, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 1.2071, "step": 16580 }, { "epoch": 0.3004253008777486, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 0.31, "step": 16600 }, { "epoch": 0.3007872590715772, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 0.3304, "step": 16620 }, { "epoch": 0.30114921726540583, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 1.5856, "step": 16640 }, { "epoch": 0.30151117545923445, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 2.3223, "step": 16660 }, { "epoch": 0.30187313365306306, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 1.6233, "step": 16680 }, { "epoch": 0.3022350918468917, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 0.4039, "step": 16700 }, { "epoch": 0.3025970500407203, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 0.5894, "step": 16720 }, { "epoch": 0.3029590082345489, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 1.5192, "step": 16740 }, { "epoch": 0.3033209664283775, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 2.3448, "step": 16760 }, { "epoch": 0.30368292462220614, "grad_norm": NaN, "learning_rate": 0.00033275622850938044, "loss": 1.5671, "step": 16780 }, { "epoch": 0.30404488281603476, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 1.6537, "step": 16800 }, { "epoch": 0.30440684100986337, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 0.4753, "step": 16820 }, { "epoch": 0.304768799203692, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 0.9992, "step": 16840 }, { "epoch": 0.3051307573975206, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 0.7515, "step": 16860 }, { "epoch": 0.3054927155913492, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 4.0394, "step": 16880 }, { "epoch": 0.30585467378517783, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 3.1043, "step": 16900 }, { "epoch": 0.30621663197900645, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 1.1429, "step": 16920 }, { "epoch": 0.30657859017283506, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 3.6798, "step": 16940 }, { "epoch": 0.3069405483666636, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 1.7657, "step": 16960 }, { "epoch": 0.30730250656049224, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 3.08, "step": 16980 }, { "epoch": 0.30766446475432085, "grad_norm": NaN, "learning_rate": 0.00033281052060083247, "loss": 1.1346, "step": 17000 }, { "epoch": 0.30802642294814947, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 5.4598, "step": 17020 }, { "epoch": 0.3083883811419781, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.9448, "step": 17040 }, { "epoch": 0.3087503393358067, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 3.3066, "step": 17060 }, { "epoch": 0.3091122975296353, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.7473, "step": 17080 }, { "epoch": 0.30947425572346393, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.3353, "step": 17100 }, { "epoch": 0.30983621391729255, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 2.3668, "step": 17120 }, { "epoch": 0.31019817211112116, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.6624, "step": 17140 }, { "epoch": 0.3105601303049498, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.165, "step": 17160 }, { "epoch": 0.3109220884987784, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.9519, "step": 17180 }, { "epoch": 0.311284046692607, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.0541, "step": 17200 }, { "epoch": 0.3116460048864356, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.4037, "step": 17220 }, { "epoch": 0.31200796308026424, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.0, "step": 17240 }, { "epoch": 0.31236992127409285, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.5949, "step": 17260 }, { "epoch": 0.31273187946792147, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.0, "step": 17280 }, { "epoch": 0.3130938376617501, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.7537, "step": 17300 }, { "epoch": 0.3134557958555787, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.0618, "step": 17320 }, { "epoch": 0.3138177540494073, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.9915, "step": 17340 }, { "epoch": 0.31417971224323593, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.7284, "step": 17360 }, { "epoch": 0.31454167043706455, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 3.413, "step": 17380 }, { "epoch": 0.3149036286308931, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 4.1376, "step": 17400 }, { "epoch": 0.3152655868247217, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 2.1988, "step": 17420 }, { "epoch": 0.31562754501855034, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.32, "step": 17440 }, { "epoch": 0.31598950321237895, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.0, "step": 17460 }, { "epoch": 0.31635146140620757, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.0, "step": 17480 }, { "epoch": 0.3167134196000362, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.9768, "step": 17500 }, { "epoch": 0.3170753777938648, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.4571, "step": 17520 }, { "epoch": 0.3174373359876934, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 3.323, "step": 17540 }, { "epoch": 0.31779929418152203, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.9752, "step": 17560 }, { "epoch": 0.31816125237535064, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.1101, "step": 17580 }, { "epoch": 0.31852321056917926, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.6424, "step": 17600 }, { "epoch": 0.3188851687630079, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.8108, "step": 17620 }, { "epoch": 0.3192471269568365, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.6593, "step": 17640 }, { "epoch": 0.3196090851506651, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.3098, "step": 17660 }, { "epoch": 0.3199710433444937, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.3124, "step": 17680 }, { "epoch": 0.32033300153832234, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.0883, "step": 17700 }, { "epoch": 0.32069495973215095, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 3.4976, "step": 17720 }, { "epoch": 0.32105691792597957, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 0.0, "step": 17740 }, { "epoch": 0.3214188761198082, "grad_norm": NaN, "learning_rate": 0.00033286481269228444, "loss": 1.238, "step": 17760 }, { "epoch": 0.3217808343136368, "grad_norm": NaN, "learning_rate": 0.00033291910478373647, "loss": 1.529, "step": 17780 }, { "epoch": 0.3221427925074654, "grad_norm": NaN, "learning_rate": 0.0003329733968751885, "loss": 2.5668, "step": 17800 }, { "epoch": 0.322504750701294, "grad_norm": NaN, "learning_rate": 0.0003329733968751885, "loss": 3.124, "step": 17820 }, { "epoch": 0.3228667088951226, "grad_norm": NaN, "learning_rate": 0.00033302768896664053, "loss": 6.5073, "step": 17840 }, { "epoch": 0.3232286670889512, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 3.5299, "step": 17860 }, { "epoch": 0.3235906252827798, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.8753, "step": 17880 }, { "epoch": 0.32395258347660844, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 1.3042, "step": 17900 }, { "epoch": 0.32431454167043705, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.0, "step": 17920 }, { "epoch": 0.32467649986426567, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 4.5838, "step": 17940 }, { "epoch": 0.3250384580580943, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 2.2672, "step": 17960 }, { "epoch": 0.3254004162519229, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 1.1385, "step": 17980 }, { "epoch": 0.3257623744457515, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.7396, "step": 18000 }, { "epoch": 0.32612433263958013, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 1.1962, "step": 18020 }, { "epoch": 0.32648629083340874, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 2.5472, "step": 18040 }, { "epoch": 0.32684824902723736, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 1.0541, "step": 18060 }, { "epoch": 0.327210207221066, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.941, "step": 18080 }, { "epoch": 0.3275721654148946, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.7545, "step": 18100 }, { "epoch": 0.3279341236087232, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 2.3557, "step": 18120 }, { "epoch": 0.3282960818025518, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 2.2081, "step": 18140 }, { "epoch": 0.32865803999638044, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.251, "step": 18160 }, { "epoch": 0.32901999819020905, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 3.59, "step": 18180 }, { "epoch": 0.32938195638403767, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 2.0915, "step": 18200 }, { "epoch": 0.3297439145778663, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 2.4202, "step": 18220 }, { "epoch": 0.33010587277169484, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 1.7603, "step": 18240 }, { "epoch": 0.33046783096552346, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 3.1097, "step": 18260 }, { "epoch": 0.3308297891593521, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.7033, "step": 18280 }, { "epoch": 0.3311917473531807, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 1.1082, "step": 18300 }, { "epoch": 0.3315537055470093, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.3281, "step": 18320 }, { "epoch": 0.3319156637408379, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 1.6996, "step": 18340 }, { "epoch": 0.33227762193466653, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.6227, "step": 18360 }, { "epoch": 0.33263958012849515, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 4.9563, "step": 18380 }, { "epoch": 0.33300153832232376, "grad_norm": NaN, "learning_rate": 0.00033308198105809256, "loss": 0.0, "step": 18400 }, { "epoch": 0.3333634965161524, "grad_norm": NaN, "learning_rate": 0.0003331362731495446, "loss": 1.7304, "step": 18420 }, { "epoch": 0.333725454709981, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 3.5502, "step": 18440 }, { "epoch": 0.3340874129038096, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 2.4866, "step": 18460 }, { "epoch": 0.3344493710976382, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 0.8953, "step": 18480 }, { "epoch": 0.33481132929146684, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 2.035, "step": 18500 }, { "epoch": 0.33517328748529546, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 1.7541, "step": 18520 }, { "epoch": 0.3355352456791241, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 0.0, "step": 18540 }, { "epoch": 0.3358972038729527, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 1.6424, "step": 18560 }, { "epoch": 0.3362591620667813, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 2.378, "step": 18580 }, { "epoch": 0.3366211202606099, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 2.9569, "step": 18600 }, { "epoch": 0.33698307845443853, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 1.7266, "step": 18620 }, { "epoch": 0.33734503664826715, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 1.0321, "step": 18640 }, { "epoch": 0.33770699484209576, "grad_norm": NaN, "learning_rate": 0.00033319056524099656, "loss": 0.2778, "step": 18660 }, { "epoch": 0.3380689530359243, "grad_norm": NaN, "learning_rate": 0.0003332448573324486, "loss": 3.4297, "step": 18680 }, { "epoch": 0.33843091122975294, "grad_norm": NaN, "learning_rate": 0.0003332448573324486, "loss": 0.7859, "step": 18700 }, { "epoch": 0.33879286942358156, "grad_norm": NaN, "learning_rate": 0.0003332991494239006, "loss": 3.2499, "step": 18720 }, { "epoch": 0.33915482761741017, "grad_norm": NaN, "learning_rate": 0.0003333534415153526, "loss": 2.2594, "step": 18740 }, { "epoch": 0.3395167858112388, "grad_norm": NaN, "learning_rate": 0.0003333534415153526, "loss": 0.4796, "step": 18760 }, { "epoch": 0.3398787440050674, "grad_norm": NaN, "learning_rate": 0.0003333534415153526, "loss": 0.7544, "step": 18780 }, { "epoch": 0.340240702198896, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 2.3338, "step": 18800 }, { "epoch": 0.34060266039272463, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 4.6119, "step": 18820 }, { "epoch": 0.34096461858655325, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.8012, "step": 18840 }, { "epoch": 0.34132657678038186, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 4.9295, "step": 18860 }, { "epoch": 0.3416885349742105, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.6249, "step": 18880 }, { "epoch": 0.3420504931680391, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.017, "step": 18900 }, { "epoch": 0.3424124513618677, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 0.4396, "step": 18920 }, { "epoch": 0.3427744095556963, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.3932, "step": 18940 }, { "epoch": 0.34313636774952494, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 0.9902, "step": 18960 }, { "epoch": 0.34349832594335356, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 4.0204, "step": 18980 }, { "epoch": 0.34386028413718217, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.935, "step": 19000 }, { "epoch": 0.3442222423310108, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.1265, "step": 19020 }, { "epoch": 0.3445842005248394, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 2.0312, "step": 19040 }, { "epoch": 0.344946158718668, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.2756, "step": 19060 }, { "epoch": 0.34530811691249663, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 0.0, "step": 19080 }, { "epoch": 0.3456700751063252, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 0.0, "step": 19100 }, { "epoch": 0.3460320333001538, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.3191, "step": 19120 }, { "epoch": 0.3463939914939824, "grad_norm": NaN, "learning_rate": 0.0003334077336068046, "loss": 1.6867, "step": 19140 }, { "epoch": 0.34675594968781104, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 1.151, "step": 19160 }, { "epoch": 0.34711790788163965, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 0.8367, "step": 19180 }, { "epoch": 0.34747986607546827, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 0.6611, "step": 19200 }, { "epoch": 0.3478418242692969, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 1.8341, "step": 19220 }, { "epoch": 0.3482037824631255, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 2.6597, "step": 19240 }, { "epoch": 0.3485657406569541, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 0.5833, "step": 19260 }, { "epoch": 0.34892769885078273, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 3.1321, "step": 19280 }, { "epoch": 0.34928965704461135, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 3.9864, "step": 19300 }, { "epoch": 0.34965161523843996, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 1.4254, "step": 19320 }, { "epoch": 0.3500135734322686, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 1.0119, "step": 19340 }, { "epoch": 0.3503755316260972, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 3.3433, "step": 19360 }, { "epoch": 0.3507374898199258, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 1.7585, "step": 19380 }, { "epoch": 0.3510994480137544, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 2.9094, "step": 19400 }, { "epoch": 0.35146140620758304, "grad_norm": NaN, "learning_rate": 0.0003334620256982566, "loss": 1.3262, "step": 19420 }, { "epoch": 0.35182336440141165, "grad_norm": NaN, "learning_rate": 0.0003335163177897086, "loss": 3.6679, "step": 19440 }, { "epoch": 0.35218532259524027, "grad_norm": NaN, "learning_rate": 0.0003335163177897086, "loss": 2.6711, "step": 19460 }, { "epoch": 0.3525472807890689, "grad_norm": NaN, "learning_rate": 0.0003335163177897086, "loss": 0.8779, "step": 19480 }, { "epoch": 0.3529092389828975, "grad_norm": NaN, "learning_rate": 0.0003335163177897086, "loss": 3.4498, "step": 19500 }, { "epoch": 0.3532711971767261, "grad_norm": NaN, "learning_rate": 0.0003335163177897086, "loss": 2.5726, "step": 19520 }, { "epoch": 0.3536331553705547, "grad_norm": NaN, "learning_rate": 0.0003335163177897086, "loss": 0.8757, "step": 19540 }, { "epoch": 0.3539951135643833, "grad_norm": NaN, "learning_rate": 0.0003335163177897086, "loss": 3.5817, "step": 19560 }, { "epoch": 0.3543570717582119, "grad_norm": NaN, "learning_rate": 0.00033357060988116065, "loss": 3.7041, "step": 19580 }, { "epoch": 0.3547190299520405, "grad_norm": NaN, "learning_rate": 0.00033357060988116065, "loss": 1.4522, "step": 19600 }, { "epoch": 0.35508098814586914, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 3.5386, "step": 19620 }, { "epoch": 0.35544294633969775, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 0.8943, "step": 19640 }, { "epoch": 0.35580490453352637, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 1.3094, "step": 19660 }, { "epoch": 0.356166862727355, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 1.0738, "step": 19680 }, { "epoch": 0.3565288209211836, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 1.1954, "step": 19700 }, { "epoch": 0.3568907791150122, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 1.0454, "step": 19720 }, { "epoch": 0.35725273730884083, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 0.6583, "step": 19740 }, { "epoch": 0.35761469550266944, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 3.3489, "step": 19760 }, { "epoch": 0.35797665369649806, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 1.2979, "step": 19780 }, { "epoch": 0.3583386118903267, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 0.349, "step": 19800 }, { "epoch": 0.3587005700841553, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 1.1312, "step": 19820 }, { "epoch": 0.3590625282779839, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 0.2498, "step": 19840 }, { "epoch": 0.3594244864718125, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 4.468, "step": 19860 }, { "epoch": 0.35978644466564114, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 1.32, "step": 19880 }, { "epoch": 0.36014840285946975, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 0.7239, "step": 19900 }, { "epoch": 0.36051036105329837, "grad_norm": NaN, "learning_rate": 0.0003336249019726126, "loss": 1.1135, "step": 19920 }, { "epoch": 0.360872319247127, "grad_norm": NaN, "learning_rate": 0.00033367919406406465, "loss": 1.9236, "step": 19940 }, { "epoch": 0.36123427744095554, "grad_norm": NaN, "learning_rate": 0.00033367919406406465, "loss": 4.2393, "step": 19960 }, { "epoch": 0.36159623563478416, "grad_norm": NaN, "learning_rate": 0.00033367919406406465, "loss": 0.4781, "step": 19980 }, { "epoch": 0.3619581938286128, "grad_norm": NaN, "learning_rate": 0.00033367919406406465, "loss": 0.3781, "step": 20000 }, { "epoch": 0.3619581938286128, "eval_accuracy": 4.5924917557918516e-05, "eval_loss": NaN, "eval_runtime": 169.5822, "eval_samples_per_second": 3584.338, "eval_steps_per_second": 3.503, "step": 20000 }, { "epoch": 0.3623201520224414, "grad_norm": NaN, "learning_rate": 0.00033367919406406465, "loss": 0.5977, "step": 20020 }, { "epoch": 0.36268211021627, "grad_norm": NaN, "learning_rate": 0.00033367919406406465, "loss": 1.1533, "step": 20040 }, { "epoch": 0.3630440684100986, "grad_norm": NaN, "learning_rate": 0.00033367919406406465, "loss": 0.3765, "step": 20060 }, { "epoch": 0.36340602660392723, "grad_norm": NaN, "learning_rate": 0.00033367919406406465, "loss": 2.33, "step": 20080 }, { "epoch": 0.36376798479775585, "grad_norm": NaN, "learning_rate": 0.0003337334861555167, "loss": 1.2777, "step": 20100 }, { "epoch": 0.36412994299158447, "grad_norm": NaN, "learning_rate": 0.0003337334861555167, "loss": 1.422, "step": 20120 }, { "epoch": 0.3644919011854131, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 2.2333, "step": 20140 }, { "epoch": 0.3648538593792417, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 3.8144, "step": 20160 }, { "epoch": 0.3652158175730703, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 2.2294, "step": 20180 }, { "epoch": 0.3655777757668989, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 5.1316, "step": 20200 }, { "epoch": 0.36593973396072754, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 1.9572, "step": 20220 }, { "epoch": 0.36630169215455616, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 1.0426, "step": 20240 }, { "epoch": 0.3666636503483848, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 2.1384, "step": 20260 }, { "epoch": 0.3670256085422134, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 2.2363, "step": 20280 }, { "epoch": 0.367387566736042, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 1.0018, "step": 20300 }, { "epoch": 0.3677495249298706, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 1.5234, "step": 20320 }, { "epoch": 0.36811148312369923, "grad_norm": NaN, "learning_rate": 0.00033378777824696865, "loss": 1.6194, "step": 20340 }, { "epoch": 0.36847344131752785, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 1.3923, "step": 20360 }, { "epoch": 0.36883539951135647, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 3.7858, "step": 20380 }, { "epoch": 0.369197357705185, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 0.0, "step": 20400 }, { "epoch": 0.36955931589901364, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 1.6126, "step": 20420 }, { "epoch": 0.36992127409284226, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 1.3086, "step": 20440 }, { "epoch": 0.37028323228667087, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 1.6198, "step": 20460 }, { "epoch": 0.3706451904804995, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 2.5732, "step": 20480 }, { "epoch": 0.3710071486743281, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 0.8089, "step": 20500 }, { "epoch": 0.3713691068681567, "grad_norm": NaN, "learning_rate": 0.0003338420703384207, "loss": 2.9731, "step": 20520 }, { "epoch": 0.37173106506198533, "grad_norm": NaN, "learning_rate": 0.0003338963624298727, "loss": 4.8895, "step": 20540 }, { "epoch": 0.37209302325581395, "grad_norm": NaN, "learning_rate": 0.0003338963624298727, "loss": 0.3752, "step": 20560 }, { "epoch": 0.37245498144964256, "grad_norm": NaN, "learning_rate": 0.0003338963624298727, "loss": 0.4367, "step": 20580 }, { "epoch": 0.3728169396434712, "grad_norm": NaN, "learning_rate": 0.0003338963624298727, "loss": 2.6872, "step": 20600 }, { "epoch": 0.3731788978372998, "grad_norm": NaN, "learning_rate": 0.0003338963624298727, "loss": 0.0, "step": 20620 }, { "epoch": 0.3735408560311284, "grad_norm": NaN, "learning_rate": 0.0003339506545213247, "loss": 3.9871, "step": 20640 }, { "epoch": 0.373902814224957, "grad_norm": NaN, "learning_rate": 0.0003339506545213247, "loss": 1.3163, "step": 20660 }, { "epoch": 0.37426477241878564, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 3.2523, "step": 20680 }, { "epoch": 0.37462673061261426, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 0.5847, "step": 20700 }, { "epoch": 0.37498868880644287, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 1.2382, "step": 20720 }, { "epoch": 0.3753506470002715, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 2.0603, "step": 20740 }, { "epoch": 0.3757126051941001, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 3.7686, "step": 20760 }, { "epoch": 0.3760745633879287, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 3.0872, "step": 20780 }, { "epoch": 0.37643652158175733, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 1.7594, "step": 20800 }, { "epoch": 0.3767984797755859, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 2.162, "step": 20820 }, { "epoch": 0.3771604379694145, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 3.6616, "step": 20840 }, { "epoch": 0.3775223961632431, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 2.6705, "step": 20860 }, { "epoch": 0.37788435435707174, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 0.4361, "step": 20880 }, { "epoch": 0.37824631255090035, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 0.2951, "step": 20900 }, { "epoch": 0.37860827074472897, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 0.7553, "step": 20920 }, { "epoch": 0.3789702289385576, "grad_norm": NaN, "learning_rate": 0.0003340049466127767, "loss": 2.8237, "step": 20940 }, { "epoch": 0.3793321871323862, "grad_norm": NaN, "learning_rate": 0.00033405923870422874, "loss": 3.6588, "step": 20960 }, { "epoch": 0.3796941453262148, "grad_norm": NaN, "learning_rate": 0.00033405923870422874, "loss": 0.5848, "step": 20980 }, { "epoch": 0.38005610352004343, "grad_norm": NaN, "learning_rate": 0.00033405923870422874, "loss": 3.7641, "step": 21000 }, { "epoch": 0.38041806171387205, "grad_norm": NaN, "learning_rate": 0.00033405923870422874, "loss": 1.2978, "step": 21020 }, { "epoch": 0.38078001990770066, "grad_norm": NaN, "learning_rate": 0.00033405923870422874, "loss": 1.9695, "step": 21040 }, { "epoch": 0.3811419781015293, "grad_norm": NaN, "learning_rate": 0.00033411353079568077, "loss": 3.3305, "step": 21060 }, { "epoch": 0.3815039362953579, "grad_norm": NaN, "learning_rate": 0.0003341678228871328, "loss": 3.0131, "step": 21080 }, { "epoch": 0.3818658944891865, "grad_norm": NaN, "learning_rate": 0.0003341678228871328, "loss": 1.7585, "step": 21100 }, { "epoch": 0.3822278526830151, "grad_norm": NaN, "learning_rate": 0.0003341678228871328, "loss": 0.6571, "step": 21120 }, { "epoch": 0.38258981087684374, "grad_norm": NaN, "learning_rate": 0.0003341678228871328, "loss": 0.7245, "step": 21140 }, { "epoch": 0.38295176907067235, "grad_norm": NaN, "learning_rate": 0.0003341678228871328, "loss": 0.526, "step": 21160 }, { "epoch": 0.38331372726450097, "grad_norm": NaN, "learning_rate": 0.0003341678228871328, "loss": 3.3944, "step": 21180 }, { "epoch": 0.3836756854583296, "grad_norm": NaN, "learning_rate": 0.0003341678228871328, "loss": 3.025, "step": 21200 }, { "epoch": 0.3840376436521582, "grad_norm": NaN, "learning_rate": 0.0003342221149785848, "loss": 1.5639, "step": 21220 }, { "epoch": 0.3843996018459868, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 0.8148, "step": 21240 }, { "epoch": 0.3847615600398154, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 2.7396, "step": 21260 }, { "epoch": 0.385123518233644, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 2.7556, "step": 21280 }, { "epoch": 0.3854854764274726, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 0.0, "step": 21300 }, { "epoch": 0.3858474346213012, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 4.2935, "step": 21320 }, { "epoch": 0.38620939281512984, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 3.2179, "step": 21340 }, { "epoch": 0.38657135100895845, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 1.0393, "step": 21360 }, { "epoch": 0.38693330920278707, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 1.0742, "step": 21380 }, { "epoch": 0.3872952673966157, "grad_norm": NaN, "learning_rate": 0.0003342764070700368, "loss": 0.9591, "step": 21400 }, { "epoch": 0.3876572255904443, "grad_norm": NaN, "learning_rate": 0.00033433069916148883, "loss": 2.9979, "step": 21420 }, { "epoch": 0.3880191837842729, "grad_norm": NaN, "learning_rate": 0.00033433069916148883, "loss": 4.7197, "step": 21440 }, { "epoch": 0.38838114197810153, "grad_norm": NaN, "learning_rate": 0.0003343849912529408, "loss": 1.4314, "step": 21460 }, { "epoch": 0.38874310017193014, "grad_norm": NaN, "learning_rate": 0.0003343849912529408, "loss": 0.4766, "step": 21480 }, { "epoch": 0.38910505836575876, "grad_norm": NaN, "learning_rate": 0.00033443928334439283, "loss": 3.4382, "step": 21500 }, { "epoch": 0.3894670165595874, "grad_norm": NaN, "learning_rate": 0.00033443928334439283, "loss": 4.6451, "step": 21520 }, { "epoch": 0.389828974753416, "grad_norm": NaN, "learning_rate": 0.00033443928334439283, "loss": 2.5881, "step": 21540 }, { "epoch": 0.3901909329472446, "grad_norm": NaN, "learning_rate": 0.00033443928334439283, "loss": 1.1637, "step": 21560 }, { "epoch": 0.3905528911410732, "grad_norm": NaN, "learning_rate": 0.00033443928334439283, "loss": 2.6228, "step": 21580 }, { "epoch": 0.39091484933490184, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 1.6142, "step": 21600 }, { "epoch": 0.39127680752873045, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 4.5033, "step": 21620 }, { "epoch": 0.39163876572255907, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 2.9984, "step": 21640 }, { "epoch": 0.3920007239163877, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 1.8573, "step": 21660 }, { "epoch": 0.39236268211021624, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 2.6262, "step": 21680 }, { "epoch": 0.39272464030404486, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 1.3206, "step": 21700 }, { "epoch": 0.3930865984978735, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 1.9881, "step": 21720 }, { "epoch": 0.3934485566917021, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 0.2504, "step": 21740 }, { "epoch": 0.3938105148855307, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 1.9533, "step": 21760 }, { "epoch": 0.3941724730793593, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 0.7642, "step": 21780 }, { "epoch": 0.39453443127318794, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 3.9072, "step": 21800 }, { "epoch": 0.39489638946701655, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 0.0, "step": 21820 }, { "epoch": 0.39525834766084517, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 0.0, "step": 21840 }, { "epoch": 0.3956203058546738, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 1.3351, "step": 21860 }, { "epoch": 0.3959822640485024, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 0.0, "step": 21880 }, { "epoch": 0.396344222242331, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 0.3306, "step": 21900 }, { "epoch": 0.39670618043615963, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 1.5289, "step": 21920 }, { "epoch": 0.39706813862998824, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 3.501, "step": 21940 }, { "epoch": 0.39743009682381686, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 1.5889, "step": 21960 }, { "epoch": 0.3977920550176455, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 0.7611, "step": 21980 }, { "epoch": 0.3981540132114741, "grad_norm": NaN, "learning_rate": 0.00033449357543584486, "loss": 4.1066, "step": 22000 }, { "epoch": 0.3985159714053027, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 0.5246, "step": 22020 }, { "epoch": 0.3988779295991313, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 1.4201, "step": 22040 }, { "epoch": 0.39923988779295994, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 1.3488, "step": 22060 }, { "epoch": 0.39960184598678855, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 0.8468, "step": 22080 }, { "epoch": 0.39996380418061717, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 3.3495, "step": 22100 }, { "epoch": 0.4003257623744457, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 3.8645, "step": 22120 }, { "epoch": 0.40068772056827434, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 0.5808, "step": 22140 }, { "epoch": 0.40104967876210296, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 3.0974, "step": 22160 }, { "epoch": 0.40141163695593157, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 1.3186, "step": 22180 }, { "epoch": 0.4017735951497602, "grad_norm": NaN, "learning_rate": 0.00033454786752729683, "loss": 1.8253, "step": 22200 }, { "epoch": 0.4021355533435888, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 1.6486, "step": 22220 }, { "epoch": 0.4024975115374174, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 1.1772, "step": 22240 }, { "epoch": 0.40285946973124603, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 1.6682, "step": 22260 }, { "epoch": 0.40322142792507465, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 0.6574, "step": 22280 }, { "epoch": 0.40358338611890326, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 1.1711, "step": 22300 }, { "epoch": 0.4039453443127319, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 3.0751, "step": 22320 }, { "epoch": 0.4043073025065605, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 3.3668, "step": 22340 }, { "epoch": 0.4046692607003891, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 2.1696, "step": 22360 }, { "epoch": 0.4050312188942177, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 0.6277, "step": 22380 }, { "epoch": 0.40539317708804634, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 0.7426, "step": 22400 }, { "epoch": 0.40575513528187496, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 3.4131, "step": 22420 }, { "epoch": 0.40611709347570357, "grad_norm": NaN, "learning_rate": 0.00033460215961874886, "loss": 2.467, "step": 22440 }, { "epoch": 0.4064790516695322, "grad_norm": NaN, "learning_rate": 0.00033471074380165286, "loss": 4.0799, "step": 22460 }, { "epoch": 0.4068410098633608, "grad_norm": NaN, "learning_rate": 0.00033471074380165286, "loss": 1.5376, "step": 22480 }, { "epoch": 0.4072029680571894, "grad_norm": NaN, "learning_rate": 0.00033471074380165286, "loss": 0.2512, "step": 22500 }, { "epoch": 0.40756492625101803, "grad_norm": NaN, "learning_rate": 0.00033471074380165286, "loss": 0.2534, "step": 22520 }, { "epoch": 0.4079268844448466, "grad_norm": NaN, "learning_rate": 0.00033471074380165286, "loss": 2.1457, "step": 22540 }, { "epoch": 0.4082888426386752, "grad_norm": NaN, "learning_rate": 0.00033471074380165286, "loss": 1.1894, "step": 22560 }, { "epoch": 0.4086508008325038, "grad_norm": NaN, "learning_rate": 0.00033471074380165286, "loss": 1.185, "step": 22580 }, { "epoch": 0.40901275902633244, "grad_norm": NaN, "learning_rate": 0.0003347650358931049, "loss": 3.1199, "step": 22600 }, { "epoch": 0.40937471722016106, "grad_norm": NaN, "learning_rate": 0.0003347650358931049, "loss": 2.0831, "step": 22620 }, { "epoch": 0.40973667541398967, "grad_norm": NaN, "learning_rate": 0.0003347650358931049, "loss": 0.845, "step": 22640 }, { "epoch": 0.4100986336078183, "grad_norm": NaN, "learning_rate": 0.0003348193279845569, "loss": 3.2634, "step": 22660 }, { "epoch": 0.4104605918016469, "grad_norm": NaN, "learning_rate": 0.0003348736200760089, "loss": 3.371, "step": 22680 }, { "epoch": 0.4108225499954755, "grad_norm": NaN, "learning_rate": 0.0003349279121674609, "loss": 3.0996, "step": 22700 }, { "epoch": 0.41118450818930413, "grad_norm": NaN, "learning_rate": 0.0003349279121674609, "loss": 2.624, "step": 22720 }, { "epoch": 0.41154646638313275, "grad_norm": NaN, "learning_rate": 0.00033498220425891295, "loss": 3.3037, "step": 22740 }, { "epoch": 0.41190842457696136, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.4924, "step": 22760 }, { "epoch": 0.41227038277079, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 3.0942, "step": 22780 }, { "epoch": 0.4126323409646186, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.9515, "step": 22800 }, { "epoch": 0.4129942991584472, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.0037, "step": 22820 }, { "epoch": 0.4133562573522758, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.6821, "step": 22840 }, { "epoch": 0.41371821554610444, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.6087, "step": 22860 }, { "epoch": 0.41408017373993306, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 3.5579, "step": 22880 }, { "epoch": 0.41444213193376167, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.7588, "step": 22900 }, { "epoch": 0.4148040901275903, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.4643, "step": 22920 }, { "epoch": 0.4151660483214189, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 3.5402, "step": 22940 }, { "epoch": 0.4155280065152475, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.8158, "step": 22960 }, { "epoch": 0.4158899647090761, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.2537, "step": 22980 }, { "epoch": 0.4162519229029047, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.9486, "step": 23000 }, { "epoch": 0.4166138810967333, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.2822, "step": 23020 }, { "epoch": 0.4169758392905619, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.8744, "step": 23040 }, { "epoch": 0.41733779748439054, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.8093, "step": 23060 }, { "epoch": 0.41769975567821915, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.1765, "step": 23080 }, { "epoch": 0.41806171387204777, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 4.4135, "step": 23100 }, { "epoch": 0.4184236720658764, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.2565, "step": 23120 }, { "epoch": 0.418785630259705, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.2795, "step": 23140 }, { "epoch": 0.4191475884535336, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.7489, "step": 23160 }, { "epoch": 0.41950954664736223, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.4425, "step": 23180 }, { "epoch": 0.41987150484119085, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.6838, "step": 23200 }, { "epoch": 0.42023346303501946, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.038, "step": 23220 }, { "epoch": 0.4205954212288481, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.0615, "step": 23240 }, { "epoch": 0.4209573794226767, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.4218, "step": 23260 }, { "epoch": 0.4213193376165053, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.4517, "step": 23280 }, { "epoch": 0.4216812958103339, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.7847, "step": 23300 }, { "epoch": 0.42204325400416254, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 3.9707, "step": 23320 }, { "epoch": 0.42240521219799115, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.9281, "step": 23340 }, { "epoch": 0.42276717039181977, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.9033, "step": 23360 }, { "epoch": 0.4231291285856484, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.5237, "step": 23380 }, { "epoch": 0.42349108677947694, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.5621, "step": 23400 }, { "epoch": 0.42385304497330556, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.3501, "step": 23420 }, { "epoch": 0.4242150031671342, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 1.1926, "step": 23440 }, { "epoch": 0.4245769613609628, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 2.1763, "step": 23460 }, { "epoch": 0.4249389195547914, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.5022, "step": 23480 }, { "epoch": 0.42530087774862, "grad_norm": NaN, "learning_rate": 0.0003350364963503649, "loss": 0.7589, "step": 23500 }, { "epoch": 0.42566283594244864, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 1.9113, "step": 23520 }, { "epoch": 0.42602479413627725, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 3.2865, "step": 23540 }, { "epoch": 0.42638675233010587, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 1.466, "step": 23560 }, { "epoch": 0.4267487105239345, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 1.467, "step": 23580 }, { "epoch": 0.4271106687177631, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 5.8724, "step": 23600 }, { "epoch": 0.4274726269115917, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 0.8428, "step": 23620 }, { "epoch": 0.42783458510542033, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 1.5917, "step": 23640 }, { "epoch": 0.42819654329924894, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 0.3767, "step": 23660 }, { "epoch": 0.42855850149307756, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 0.9215, "step": 23680 }, { "epoch": 0.4289204596869062, "grad_norm": NaN, "learning_rate": 0.00033509078844181695, "loss": 1.494, "step": 23700 }, { "epoch": 0.4292824178807348, "grad_norm": NaN, "learning_rate": 0.00033514508053326893, "loss": 1.6466, "step": 23720 }, { "epoch": 0.4296443760745634, "grad_norm": NaN, "learning_rate": 0.00033514508053326893, "loss": 2.9783, "step": 23740 }, { "epoch": 0.430006334268392, "grad_norm": NaN, "learning_rate": 0.00033514508053326893, "loss": 2.1064, "step": 23760 }, { "epoch": 0.43036829246222064, "grad_norm": NaN, "learning_rate": 0.00033514508053326893, "loss": 1.8096, "step": 23780 }, { "epoch": 0.43073025065604925, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 3.5151, "step": 23800 }, { "epoch": 0.4310922088498778, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 0.3762, "step": 23820 }, { "epoch": 0.4314541670437064, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 4.9286, "step": 23840 }, { "epoch": 0.43181612523753504, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 0.9628, "step": 23860 }, { "epoch": 0.43217808343136366, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 1.0853, "step": 23880 }, { "epoch": 0.4325400416251923, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 0.5847, "step": 23900 }, { "epoch": 0.4329019998190209, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 1.722, "step": 23920 }, { "epoch": 0.4332639580128495, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 0.6478, "step": 23940 }, { "epoch": 0.4336259162066781, "grad_norm": NaN, "learning_rate": 0.000335199372624721, "loss": 2.2296, "step": 23960 }, { "epoch": 0.43398787440050673, "grad_norm": NaN, "learning_rate": 0.00033525366471617304, "loss": 1.7298, "step": 23980 }, { "epoch": 0.43434983259433535, "grad_norm": NaN, "learning_rate": 0.00033525366471617304, "loss": 2.3659, "step": 24000 }, { "epoch": 0.43471179078816397, "grad_norm": NaN, "learning_rate": 0.00033525366471617304, "loss": 1.3267, "step": 24020 }, { "epoch": 0.4350737489819926, "grad_norm": NaN, "learning_rate": 0.00033525366471617304, "loss": 0.4819, "step": 24040 }, { "epoch": 0.4354357071758212, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 5.7384, "step": 24060 }, { "epoch": 0.4357976653696498, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 0.7603, "step": 24080 }, { "epoch": 0.4361596235634784, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 1.6447, "step": 24100 }, { "epoch": 0.43652158175730704, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 1.6689, "step": 24120 }, { "epoch": 0.43688353995113566, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 0.6656, "step": 24140 }, { "epoch": 0.4372454981449643, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 0.0, "step": 24160 }, { "epoch": 0.4376074563387929, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 1.3209, "step": 24180 }, { "epoch": 0.4379694145326215, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 0.9676, "step": 24200 }, { "epoch": 0.4383313727264501, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 2.585, "step": 24220 }, { "epoch": 0.43869333092027873, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 0.293, "step": 24240 }, { "epoch": 0.4390552891141073, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 1.1345, "step": 24260 }, { "epoch": 0.4394172473079359, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 2.5312, "step": 24280 }, { "epoch": 0.4397792055017645, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 1.4933, "step": 24300 }, { "epoch": 0.44014116369559314, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 0.6842, "step": 24320 }, { "epoch": 0.44050312188942176, "grad_norm": NaN, "learning_rate": 0.000335307956807625, "loss": 2.5811, "step": 24340 }, { "epoch": 0.44086508008325037, "grad_norm": NaN, "learning_rate": 0.00033536224889907704, "loss": 3.1952, "step": 24360 }, { "epoch": 0.441227038277079, "grad_norm": NaN, "learning_rate": 0.00033536224889907704, "loss": 2.7716, "step": 24380 }, { "epoch": 0.4415889964709076, "grad_norm": NaN, "learning_rate": 0.00033536224889907704, "loss": 2.7827, "step": 24400 }, { "epoch": 0.4419509546647362, "grad_norm": NaN, "learning_rate": 0.00033536224889907704, "loss": 1.2902, "step": 24420 }, { "epoch": 0.44231291285856483, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 3.7062, "step": 24440 }, { "epoch": 0.44267487105239345, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 2.7749, "step": 24460 }, { "epoch": 0.44303682924622206, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 3.4393, "step": 24480 }, { "epoch": 0.4433987874400507, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 0.0, "step": 24500 }, { "epoch": 0.4437607456338793, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 0.7504, "step": 24520 }, { "epoch": 0.4441227038277079, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 0.0, "step": 24540 }, { "epoch": 0.4444846620215365, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 2.9392, "step": 24560 }, { "epoch": 0.44484662021536514, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 0.35, "step": 24580 }, { "epoch": 0.44520857840919376, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 1.172, "step": 24600 }, { "epoch": 0.44557053660302237, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 2.9951, "step": 24620 }, { "epoch": 0.445932494796851, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 1.7868, "step": 24640 }, { "epoch": 0.4462944529906796, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 3.5226, "step": 24660 }, { "epoch": 0.44665641118450816, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 2.6764, "step": 24680 }, { "epoch": 0.4470183693783368, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 0.5111, "step": 24700 }, { "epoch": 0.4473803275721654, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 1.3284, "step": 24720 }, { "epoch": 0.447742285765994, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 1.3755, "step": 24740 }, { "epoch": 0.4481042439598226, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 3.4041, "step": 24760 }, { "epoch": 0.44846620215365124, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 1.2108, "step": 24780 }, { "epoch": 0.44882816034747985, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 0.0, "step": 24800 }, { "epoch": 0.44919011854130847, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 0.9631, "step": 24820 }, { "epoch": 0.4495520767351371, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 1.3788, "step": 24840 }, { "epoch": 0.4499140349289657, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 2.2883, "step": 24860 }, { "epoch": 0.4502759931227943, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 3.9153, "step": 24880 }, { "epoch": 0.45063795131662293, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 1.9303, "step": 24900 }, { "epoch": 0.45099990951045155, "grad_norm": NaN, "learning_rate": 0.00033541654099052907, "loss": 2.4893, "step": 24920 }, { "epoch": 0.45136186770428016, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 6.7374, "step": 24940 }, { "epoch": 0.4517238258981088, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 0.8976, "step": 24960 }, { "epoch": 0.4520857840919374, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 0.407, "step": 24980 }, { "epoch": 0.452447742285766, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 1.5422, "step": 25000 }, { "epoch": 0.452447742285766, "eval_accuracy": 4.18397557930057e-05, "eval_loss": NaN, "eval_runtime": 170.2569, "eval_samples_per_second": 3570.134, "eval_steps_per_second": 3.489, "step": 25000 }, { "epoch": 0.4528097004795946, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 2.3439, "step": 25020 }, { "epoch": 0.45317165867342324, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 1.4188, "step": 25040 }, { "epoch": 0.45353361686725185, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 1.5832, "step": 25060 }, { "epoch": 0.45389557506108047, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 2.3008, "step": 25080 }, { "epoch": 0.4542575332549091, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 1.7512, "step": 25100 }, { "epoch": 0.45461949144873764, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 2.7761, "step": 25120 }, { "epoch": 0.45498144964256626, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 1.913, "step": 25140 }, { "epoch": 0.4553434078363949, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 1.0217, "step": 25160 }, { "epoch": 0.4557053660302235, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 1.5315, "step": 25180 }, { "epoch": 0.4560673242240521, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 2.271, "step": 25200 }, { "epoch": 0.4564292824178807, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 1.9121, "step": 25220 }, { "epoch": 0.45679124061170934, "grad_norm": NaN, "learning_rate": 0.00033552512517343307, "loss": 3.6262, "step": 25240 }, { "epoch": 0.45715319880553795, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 2.9812, "step": 25260 }, { "epoch": 0.45751515699936657, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 0.6566, "step": 25280 }, { "epoch": 0.4578771151931952, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 3.5261, "step": 25300 }, { "epoch": 0.4582390733870238, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 2.4145, "step": 25320 }, { "epoch": 0.4586010315808524, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 1.6088, "step": 25340 }, { "epoch": 0.45896298977468103, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 0.8194, "step": 25360 }, { "epoch": 0.45932494796850964, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 2.6297, "step": 25380 }, { "epoch": 0.45968690616233826, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 0.4046, "step": 25400 }, { "epoch": 0.4600488643561669, "grad_norm": NaN, "learning_rate": 0.0003355794172648851, "loss": 3.057, "step": 25420 }, { "epoch": 0.4604108225499955, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.7057, "step": 25440 }, { "epoch": 0.4607727807438241, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.311, "step": 25460 }, { "epoch": 0.4611347389376527, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.0188, "step": 25480 }, { "epoch": 0.46149669713148134, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.5968, "step": 25500 }, { "epoch": 0.46185865532530995, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.5252, "step": 25520 }, { "epoch": 0.4622206135191385, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.3101, "step": 25540 }, { "epoch": 0.4625825717129671, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.0, "step": 25560 }, { "epoch": 0.46294452990679574, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.9277, "step": 25580 }, { "epoch": 0.46330648810062436, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.8473, "step": 25600 }, { "epoch": 0.463668446294453, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 3.0616, "step": 25620 }, { "epoch": 0.4640304044882816, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 4.6299, "step": 25640 }, { "epoch": 0.4643923626821102, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.641, "step": 25660 }, { "epoch": 0.4647543208759388, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.8742, "step": 25680 }, { "epoch": 0.46511627906976744, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.2366, "step": 25700 }, { "epoch": 0.46547823726359605, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.6146, "step": 25720 }, { "epoch": 0.46584019545742467, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.3745, "step": 25740 }, { "epoch": 0.4662021536512533, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 2.6726, "step": 25760 }, { "epoch": 0.4665641118450819, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 2.3095, "step": 25780 }, { "epoch": 0.4669260700389105, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.5765, "step": 25800 }, { "epoch": 0.4672880282327391, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.5565, "step": 25820 }, { "epoch": 0.46764998642656774, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.3083, "step": 25840 }, { "epoch": 0.46801194462039636, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.6012, "step": 25860 }, { "epoch": 0.468373902814225, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.1553, "step": 25880 }, { "epoch": 0.4687358610080536, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.9913, "step": 25900 }, { "epoch": 0.4690978192018822, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.0, "step": 25920 }, { "epoch": 0.4694597773957108, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.625, "step": 25940 }, { "epoch": 0.46982173558953944, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 3.3216, "step": 25960 }, { "epoch": 0.470183693783368, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.6484, "step": 25980 }, { "epoch": 0.4705456519771966, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.7286, "step": 26000 }, { "epoch": 0.4709076101710252, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.7941, "step": 26020 }, { "epoch": 0.47126956836485384, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.1019, "step": 26040 }, { "epoch": 0.47163152655868246, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 4.0271, "step": 26060 }, { "epoch": 0.47199348475251107, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.9299, "step": 26080 }, { "epoch": 0.4723554429463397, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.8113, "step": 26100 }, { "epoch": 0.4727174011401683, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 0.3264, "step": 26120 }, { "epoch": 0.4730793593339969, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 3.6824, "step": 26140 }, { "epoch": 0.47344131752782553, "grad_norm": NaN, "learning_rate": 0.0003356337093563371, "loss": 1.2918, "step": 26160 }, { "epoch": 0.47380327572165415, "grad_norm": NaN, "learning_rate": 0.0003356880014477891, "loss": 6.2563, "step": 26180 }, { "epoch": 0.47416523391548276, "grad_norm": NaN, "learning_rate": 0.0003356880014477891, "loss": 2.1305, "step": 26200 }, { "epoch": 0.4745271921093114, "grad_norm": NaN, "learning_rate": 0.0003356880014477891, "loss": 2.3668, "step": 26220 }, { "epoch": 0.47488915030314, "grad_norm": NaN, "learning_rate": 0.0003356880014477891, "loss": 0.6577, "step": 26240 }, { "epoch": 0.4752511084969686, "grad_norm": NaN, "learning_rate": 0.0003356880014477891, "loss": 3.3453, "step": 26260 }, { "epoch": 0.4756130666907972, "grad_norm": NaN, "learning_rate": 0.0003356880014477891, "loss": 2.9417, "step": 26280 }, { "epoch": 0.47597502488462584, "grad_norm": NaN, "learning_rate": 0.0003356880014477891, "loss": 0.587, "step": 26300 }, { "epoch": 0.47633698307845446, "grad_norm": NaN, "learning_rate": 0.0003356880014477891, "loss": 2.0321, "step": 26320 }, { "epoch": 0.47669894127228307, "grad_norm": NaN, "learning_rate": 0.00033574229353924113, "loss": 2.4647, "step": 26340 }, { "epoch": 0.4770608994661117, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 4.3747, "step": 26360 }, { "epoch": 0.4774228576599403, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 3.2226, "step": 26380 }, { "epoch": 0.47778481585376886, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 1.9031, "step": 26400 }, { "epoch": 0.4781467740475975, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 3.3406, "step": 26420 }, { "epoch": 0.4785087322414261, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 2.2105, "step": 26440 }, { "epoch": 0.4788706904352547, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 4.5148, "step": 26460 }, { "epoch": 0.4792326486290833, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 2.1052, "step": 26480 }, { "epoch": 0.47959460682291194, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 1.2447, "step": 26500 }, { "epoch": 0.47995656501674056, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 1.9474, "step": 26520 }, { "epoch": 0.48031852321056917, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 3.2753, "step": 26540 }, { "epoch": 0.4806804814043978, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 4.0634, "step": 26560 }, { "epoch": 0.4810424395982264, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 0.9836, "step": 26580 }, { "epoch": 0.481404397792055, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 1.7762, "step": 26600 }, { "epoch": 0.48176635598588363, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 4.9824, "step": 26620 }, { "epoch": 0.48212831417971225, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 3.987, "step": 26640 }, { "epoch": 0.48249027237354086, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 1.2768, "step": 26660 }, { "epoch": 0.4828522305673695, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 2.6338, "step": 26680 }, { "epoch": 0.4832141887611981, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 1.1868, "step": 26700 }, { "epoch": 0.4835761469550267, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 1.8313, "step": 26720 }, { "epoch": 0.4839381051488553, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 2.3637, "step": 26740 }, { "epoch": 0.48430006334268394, "grad_norm": NaN, "learning_rate": 0.0003357965856306931, "loss": 2.7929, "step": 26760 }, { "epoch": 0.48466202153651256, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 6.2724, "step": 26780 }, { "epoch": 0.48502397973034117, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 2.1151, "step": 26800 }, { "epoch": 0.4853859379241698, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 4.0124, "step": 26820 }, { "epoch": 0.48574789611799835, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 3.4104, "step": 26840 }, { "epoch": 0.48610985431182696, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 1.4459, "step": 26860 }, { "epoch": 0.4864718125056556, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 4.8461, "step": 26880 }, { "epoch": 0.4868337706994842, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 0.2915, "step": 26900 }, { "epoch": 0.4871957288933128, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 2.9453, "step": 26920 }, { "epoch": 0.4875576870871414, "grad_norm": NaN, "learning_rate": 0.00033585087772214513, "loss": 1.6669, "step": 26940 }, { "epoch": 0.48791964528097004, "grad_norm": NaN, "learning_rate": 0.00033590516981359716, "loss": 4.153, "step": 26960 }, { "epoch": 0.48828160347479865, "grad_norm": NaN, "learning_rate": 0.00033590516981359716, "loss": 1.1367, "step": 26980 }, { "epoch": 0.48864356166862727, "grad_norm": NaN, "learning_rate": 0.00033590516981359716, "loss": 0.7335, "step": 27000 }, { "epoch": 0.4890055198624559, "grad_norm": NaN, "learning_rate": 0.00033590516981359716, "loss": 1.4231, "step": 27020 }, { "epoch": 0.4893674780562845, "grad_norm": NaN, "learning_rate": 0.00033595946190504914, "loss": 1.5677, "step": 27040 }, { "epoch": 0.4897294362501131, "grad_norm": NaN, "learning_rate": 0.00033595946190504914, "loss": 1.2877, "step": 27060 }, { "epoch": 0.49009139444394173, "grad_norm": NaN, "learning_rate": 0.00033595946190504914, "loss": 1.0986, "step": 27080 }, { "epoch": 0.49045335263777035, "grad_norm": NaN, "learning_rate": 0.00033595946190504914, "loss": 1.6446, "step": 27100 }, { "epoch": 0.49081531083159896, "grad_norm": NaN, "learning_rate": 0.00033595946190504914, "loss": 0.9012, "step": 27120 }, { "epoch": 0.4911772690254276, "grad_norm": NaN, "learning_rate": 0.00033595946190504914, "loss": 2.6305, "step": 27140 }, { "epoch": 0.4915392272192562, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 3.1293, "step": 27160 }, { "epoch": 0.4919011854130848, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 1.359, "step": 27180 }, { "epoch": 0.4922631436069134, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 0.3099, "step": 27200 }, { "epoch": 0.49262510180074204, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 1.5537, "step": 27220 }, { "epoch": 0.49298705999457065, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 0.4033, "step": 27240 }, { "epoch": 0.4933490181883992, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 2.7647, "step": 27260 }, { "epoch": 0.49371097638222783, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 2.8715, "step": 27280 }, { "epoch": 0.49407293457605644, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 2.3458, "step": 27300 }, { "epoch": 0.49443489276988506, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 0.2636, "step": 27320 }, { "epoch": 0.4947968509637137, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 1.911, "step": 27340 }, { "epoch": 0.4951588091575423, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 0.8095, "step": 27360 }, { "epoch": 0.4955207673513709, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 1.9547, "step": 27380 }, { "epoch": 0.4958827255451995, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 3.585, "step": 27400 }, { "epoch": 0.49624468373902814, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 2.4619, "step": 27420 }, { "epoch": 0.49660664193285675, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 1.1667, "step": 27440 }, { "epoch": 0.49696860012668537, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 2.5093, "step": 27460 }, { "epoch": 0.497330558320514, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 0.979, "step": 27480 }, { "epoch": 0.4976925165143426, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 2.4626, "step": 27500 }, { "epoch": 0.4980544747081712, "grad_norm": NaN, "learning_rate": 0.00033601375399650116, "loss": 1.3528, "step": 27520 }, { "epoch": 0.49841643290199983, "grad_norm": NaN, "learning_rate": 0.00033606804608795314, "loss": 2.9594, "step": 27540 }, { "epoch": 0.49877839109582844, "grad_norm": NaN, "learning_rate": 0.00033606804608795314, "loss": 2.9261, "step": 27560 }, { "epoch": 0.49914034928965706, "grad_norm": NaN, "learning_rate": 0.00033606804608795314, "loss": 4.6089, "step": 27580 }, { "epoch": 0.4995023074834857, "grad_norm": NaN, "learning_rate": 0.00033612233817940517, "loss": 0.7009, "step": 27600 }, { "epoch": 0.4998642656773143, "grad_norm": NaN, "learning_rate": 0.00033612233817940517, "loss": 1.3202, "step": 27620 }, { "epoch": 0.5002262238711429, "grad_norm": NaN, "learning_rate": 0.00033612233817940517, "loss": 3.7712, "step": 27640 }, { "epoch": 0.5005881820649715, "grad_norm": NaN, "learning_rate": 0.00033612233817940517, "loss": 0.3282, "step": 27660 }, { "epoch": 0.5009501402588001, "grad_norm": NaN, "learning_rate": 0.00033612233817940517, "loss": 2.5498, "step": 27680 }, { "epoch": 0.5013120984526287, "grad_norm": NaN, "learning_rate": 0.0003361766302708572, "loss": 0.755, "step": 27700 }, { "epoch": 0.5016740566464574, "grad_norm": NaN, "learning_rate": 0.0003361766302708572, "loss": 1.9296, "step": 27720 }, { "epoch": 0.5020360148402859, "grad_norm": NaN, "learning_rate": 0.0003361766302708572, "loss": 2.5995, "step": 27740 }, { "epoch": 0.5023979730341146, "grad_norm": NaN, "learning_rate": 0.0003361766302708572, "loss": 1.0732, "step": 27760 }, { "epoch": 0.5027599312279432, "grad_norm": NaN, "learning_rate": 0.00033623092236230917, "loss": 1.6838, "step": 27780 }, { "epoch": 0.5031218894217718, "grad_norm": NaN, "learning_rate": 0.00033623092236230917, "loss": 2.5511, "step": 27800 }, { "epoch": 0.5034838476156004, "grad_norm": NaN, "learning_rate": 0.00033623092236230917, "loss": 1.7666, "step": 27820 }, { "epoch": 0.5038458058094291, "grad_norm": NaN, "learning_rate": 0.00033623092236230917, "loss": 0.6833, "step": 27840 }, { "epoch": 0.5042077640032576, "grad_norm": NaN, "learning_rate": 0.00033623092236230917, "loss": 1.4577, "step": 27860 }, { "epoch": 0.5045697221970863, "grad_norm": NaN, "learning_rate": 0.00033623092236230917, "loss": 4.5719, "step": 27880 }, { "epoch": 0.5049316803909148, "grad_norm": NaN, "learning_rate": 0.00033628521445376125, "loss": 1.1376, "step": 27900 }, { "epoch": 0.5052936385847434, "grad_norm": NaN, "learning_rate": 0.0003363395065452133, "loss": 2.4106, "step": 27920 }, { "epoch": 0.5056555967785721, "grad_norm": NaN, "learning_rate": 0.0003363395065452133, "loss": 2.1211, "step": 27940 }, { "epoch": 0.5060175549724006, "grad_norm": NaN, "learning_rate": 0.0003363395065452133, "loss": 1.4196, "step": 27960 }, { "epoch": 0.5063795131662293, "grad_norm": NaN, "learning_rate": 0.00033639379863666525, "loss": 5.2558, "step": 27980 }, { "epoch": 0.5067414713600579, "grad_norm": NaN, "learning_rate": 0.00033639379863666525, "loss": 0.5132, "step": 28000 }, { "epoch": 0.5071034295538865, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 1.992, "step": 28020 }, { "epoch": 0.5074653877477151, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 2.3578, "step": 28040 }, { "epoch": 0.5078273459415438, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 1.4773, "step": 28060 }, { "epoch": 0.5081893041353723, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 0.8052, "step": 28080 }, { "epoch": 0.508551262329201, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 2.8013, "step": 28100 }, { "epoch": 0.5089132205230296, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 2.4446, "step": 28120 }, { "epoch": 0.5092751787168582, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 3.2994, "step": 28140 }, { "epoch": 0.5096371369106868, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 1.6093, "step": 28160 }, { "epoch": 0.5099990951045155, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 3.3921, "step": 28180 }, { "epoch": 0.510361053298344, "grad_norm": NaN, "learning_rate": 0.0003364480907281173, "loss": 1.3276, "step": 28200 }, { "epoch": 0.5107230114921727, "grad_norm": NaN, "learning_rate": 0.0003365566749110213, "loss": 3.9981, "step": 28220 }, { "epoch": 0.5110849696860013, "grad_norm": NaN, "learning_rate": 0.0003365566749110213, "loss": 0.4405, "step": 28240 }, { "epoch": 0.5114469278798299, "grad_norm": NaN, "learning_rate": 0.0003365566749110213, "loss": 2.4271, "step": 28260 }, { "epoch": 0.5118088860736585, "grad_norm": NaN, "learning_rate": 0.0003365566749110213, "loss": 0.8778, "step": 28280 }, { "epoch": 0.5121708442674872, "grad_norm": NaN, "learning_rate": 0.0003365566749110213, "loss": 0.0, "step": 28300 }, { "epoch": 0.5125328024613157, "grad_norm": NaN, "learning_rate": 0.0003366109670024733, "loss": 0.767, "step": 28320 }, { "epoch": 0.5128947606551443, "grad_norm": NaN, "learning_rate": 0.0003366109670024733, "loss": 1.9647, "step": 28340 }, { "epoch": 0.513256718848973, "grad_norm": NaN, "learning_rate": 0.0003366109670024733, "loss": 1.8607, "step": 28360 }, { "epoch": 0.5136186770428015, "grad_norm": NaN, "learning_rate": 0.0003366109670024733, "loss": 1.1568, "step": 28380 }, { "epoch": 0.5139806352366302, "grad_norm": NaN, "learning_rate": 0.0003366109670024733, "loss": 1.5674, "step": 28400 }, { "epoch": 0.5143425934304587, "grad_norm": NaN, "learning_rate": 0.0003366109670024733, "loss": 1.0066, "step": 28420 }, { "epoch": 0.5147045516242874, "grad_norm": NaN, "learning_rate": 0.0003366109670024733, "loss": 3.1565, "step": 28440 }, { "epoch": 0.515066509818116, "grad_norm": NaN, "learning_rate": 0.00033666525909392534, "loss": 4.106, "step": 28460 }, { "epoch": 0.5154284680119446, "grad_norm": NaN, "learning_rate": 0.00033666525909392534, "loss": 0.8154, "step": 28480 }, { "epoch": 0.5157904262057732, "grad_norm": NaN, "learning_rate": 0.00033666525909392534, "loss": 3.305, "step": 28500 }, { "epoch": 0.5161523843996019, "grad_norm": NaN, "learning_rate": 0.00033666525909392534, "loss": 0.8101, "step": 28520 }, { "epoch": 0.5165143425934304, "grad_norm": NaN, "learning_rate": 0.00033666525909392534, "loss": 1.3072, "step": 28540 }, { "epoch": 0.5168763007872591, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 2.4194, "step": 28560 }, { "epoch": 0.5172382589810877, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 0.4776, "step": 28580 }, { "epoch": 0.5176002171749163, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 2.4544, "step": 28600 }, { "epoch": 0.5179621753687449, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 1.3137, "step": 28620 }, { "epoch": 0.5183241335625736, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 1.0826, "step": 28640 }, { "epoch": 0.5186860917564021, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 2.348, "step": 28660 }, { "epoch": 0.5190480499502308, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 0.4398, "step": 28680 }, { "epoch": 0.5194100081440594, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 1.0112, "step": 28700 }, { "epoch": 0.519771966337888, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 1.0888, "step": 28720 }, { "epoch": 0.5201339245317166, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 0.6808, "step": 28740 }, { "epoch": 0.5204958827255451, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 1.5677, "step": 28760 }, { "epoch": 0.5208578409193738, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 1.126, "step": 28780 }, { "epoch": 0.5212197991132024, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 1.7623, "step": 28800 }, { "epoch": 0.521581757307031, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 2.4801, "step": 28820 }, { "epoch": 0.5219437155008596, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 2.3897, "step": 28840 }, { "epoch": 0.5223056736946883, "grad_norm": NaN, "learning_rate": 0.0003367195511853773, "loss": 1.8097, "step": 28860 }, { "epoch": 0.5226676318885168, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 1.4597, "step": 28880 }, { "epoch": 0.5230295900823455, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 4.035, "step": 28900 }, { "epoch": 0.5233915482761741, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 0.2504, "step": 28920 }, { "epoch": 0.5237535064700027, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 1.4283, "step": 28940 }, { "epoch": 0.5241154646638313, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 0.2523, "step": 28960 }, { "epoch": 0.52447742285766, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 1.5595, "step": 28980 }, { "epoch": 0.5248393810514885, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 1.5499, "step": 29000 }, { "epoch": 0.5252013392453172, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 0.6875, "step": 29020 }, { "epoch": 0.5255632974391458, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 2.8188, "step": 29040 }, { "epoch": 0.5259252556329744, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 2.2597, "step": 29060 }, { "epoch": 0.526287213826803, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 1.9333, "step": 29080 }, { "epoch": 0.5266491720206317, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 0.4807, "step": 29100 }, { "epoch": 0.5270111302144602, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 0.0, "step": 29120 }, { "epoch": 0.5273730884082889, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 2.2827, "step": 29140 }, { "epoch": 0.5277350466021175, "grad_norm": NaN, "learning_rate": 0.00033677384327682934, "loss": 3.1091, "step": 29160 }, { "epoch": 0.5280970047959461, "grad_norm": NaN, "learning_rate": 0.00033682813536828137, "loss": 4.6729, "step": 29180 }, { "epoch": 0.5284589629897747, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 1.2756, "step": 29200 }, { "epoch": 0.5288209211836032, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 1.4674, "step": 29220 }, { "epoch": 0.5291828793774319, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 3.2296, "step": 29240 }, { "epoch": 0.5295448375712605, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 1.6508, "step": 29260 }, { "epoch": 0.5299067957650891, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 1.2729, "step": 29280 }, { "epoch": 0.5302687539589177, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 2.1077, "step": 29300 }, { "epoch": 0.5306307121527464, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 3.0026, "step": 29320 }, { "epoch": 0.5309926703465749, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 1.8396, "step": 29340 }, { "epoch": 0.5313546285404036, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 0.9778, "step": 29360 }, { "epoch": 0.5317165867342322, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 1.8226, "step": 29380 }, { "epoch": 0.5320785449280608, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 0.3753, "step": 29400 }, { "epoch": 0.5324405031218894, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 0.5317, "step": 29420 }, { "epoch": 0.5328024613157181, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 2.9181, "step": 29440 }, { "epoch": 0.5331644195095466, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 0.5965, "step": 29460 }, { "epoch": 0.5335263777033753, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 0.9398, "step": 29480 }, { "epoch": 0.5338883358972039, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 0.0, "step": 29500 }, { "epoch": 0.5342502940910325, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 1.2958, "step": 29520 }, { "epoch": 0.5346122522848611, "grad_norm": NaN, "learning_rate": 0.00033688242745973335, "loss": 1.8982, "step": 29540 }, { "epoch": 0.5349742104786898, "grad_norm": NaN, "learning_rate": 0.0003369367195511854, "loss": 3.7872, "step": 29560 }, { "epoch": 0.5353361686725183, "grad_norm": NaN, "learning_rate": 0.0003369367195511854, "loss": 2.0331, "step": 29580 }, { "epoch": 0.535698126866347, "grad_norm": NaN, "learning_rate": 0.0003369367195511854, "loss": 4.3899, "step": 29600 }, { "epoch": 0.5360600850601756, "grad_norm": NaN, "learning_rate": 0.0003369367195511854, "loss": 1.4404, "step": 29620 }, { "epoch": 0.5364220432540041, "grad_norm": NaN, "learning_rate": 0.0003369367195511854, "loss": 0.0, "step": 29640 }, { "epoch": 0.5367840014478328, "grad_norm": NaN, "learning_rate": 0.0003369367195511854, "loss": 1.0522, "step": 29660 }, { "epoch": 0.5371459596416613, "grad_norm": NaN, "learning_rate": 0.0003369367195511854, "loss": 2.8168, "step": 29680 }, { "epoch": 0.53750791783549, "grad_norm": NaN, "learning_rate": 0.00033699101164263735, "loss": 3.0505, "step": 29700 }, { "epoch": 0.5378698760293186, "grad_norm": NaN, "learning_rate": 0.00033699101164263735, "loss": 0.2763, "step": 29720 }, { "epoch": 0.5382318342231472, "grad_norm": NaN, "learning_rate": 0.00033699101164263735, "loss": 1.9107, "step": 29740 }, { "epoch": 0.5385937924169758, "grad_norm": NaN, "learning_rate": 0.00033699101164263735, "loss": 0.8095, "step": 29760 }, { "epoch": 0.5389557506108045, "grad_norm": NaN, "learning_rate": 0.00033699101164263735, "loss": 2.2456, "step": 29780 }, { "epoch": 0.539317708804633, "grad_norm": NaN, "learning_rate": 0.0003370453037340894, "loss": 2.3767, "step": 29800 }, { "epoch": 0.5396796669984617, "grad_norm": NaN, "learning_rate": 0.0003370995958255414, "loss": 3.8996, "step": 29820 }, { "epoch": 0.5400416251922903, "grad_norm": NaN, "learning_rate": 0.0003370995958255414, "loss": 2.038, "step": 29840 }, { "epoch": 0.5404035833861189, "grad_norm": NaN, "learning_rate": 0.0003370995958255414, "loss": 1.871, "step": 29860 }, { "epoch": 0.5407655415799475, "grad_norm": NaN, "learning_rate": 0.0003370995958255414, "loss": 2.3282, "step": 29880 }, { "epoch": 0.5411274997737762, "grad_norm": NaN, "learning_rate": 0.0003370995958255414, "loss": 2.3466, "step": 29900 }, { "epoch": 0.5414894579676047, "grad_norm": NaN, "learning_rate": 0.0003370995958255414, "loss": 1.3206, "step": 29920 }, { "epoch": 0.5418514161614334, "grad_norm": NaN, "learning_rate": 0.0003370995958255414, "loss": 2.6498, "step": 29940 }, { "epoch": 0.542213374355262, "grad_norm": NaN, "learning_rate": 0.0003371538879169934, "loss": 2.9053, "step": 29960 }, { "epoch": 0.5425753325490906, "grad_norm": NaN, "learning_rate": 0.0003371538879169934, "loss": 2.4447, "step": 29980 }, { "epoch": 0.5429372907429192, "grad_norm": NaN, "learning_rate": 0.00033726247209989743, "loss": 2.5282, "step": 30000 }, { "epoch": 0.5429372907429192, "eval_accuracy": 4.2183558152264125e-05, "eval_loss": NaN, "eval_runtime": 170.3905, "eval_samples_per_second": 3567.336, "eval_steps_per_second": 3.486, "step": 30000 }, { "epoch": 0.5432992489367479, "grad_norm": NaN, "learning_rate": 0.0003373167641913494, "loss": 4.4527, "step": 30020 }, { "epoch": 0.5436612071305764, "grad_norm": NaN, "learning_rate": 0.0003373167641913494, "loss": 1.3497, "step": 30040 }, { "epoch": 0.544023165324405, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 3.7444, "step": 30060 }, { "epoch": 0.5443851235182336, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 0.0, "step": 30080 }, { "epoch": 0.5447470817120622, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 4.6331, "step": 30100 }, { "epoch": 0.5451090399058909, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 3.726, "step": 30120 }, { "epoch": 0.5454709980997194, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 2.8333, "step": 30140 }, { "epoch": 0.5458329562935481, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 1.0241, "step": 30160 }, { "epoch": 0.5461949144873767, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 2.0789, "step": 30180 }, { "epoch": 0.5465568726812053, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 1.4731, "step": 30200 }, { "epoch": 0.5469188308750339, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 1.4267, "step": 30220 }, { "epoch": 0.5472807890688626, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 1.8427, "step": 30240 }, { "epoch": 0.5476427472626911, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 0.8168, "step": 30260 }, { "epoch": 0.5480047054565198, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 1.763, "step": 30280 }, { "epoch": 0.5483666636503484, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 3.572, "step": 30300 }, { "epoch": 0.548728621844177, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 2.2074, "step": 30320 }, { "epoch": 0.5490905800380056, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 0.9044, "step": 30340 }, { "epoch": 0.5494525382318343, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 2.1892, "step": 30360 }, { "epoch": 0.5498144964256628, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 1.997, "step": 30380 }, { "epoch": 0.5501764546194915, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 1.605, "step": 30400 }, { "epoch": 0.55053841281332, "grad_norm": NaN, "learning_rate": 0.0003373710562828015, "loss": 1.4339, "step": 30420 }, { "epoch": 0.5509003710071487, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 3.1752, "step": 30440 }, { "epoch": 0.5512623292009773, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 1.001, "step": 30460 }, { "epoch": 0.5516242873948058, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 2.1747, "step": 30480 }, { "epoch": 0.5519862455886345, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 1.652, "step": 30500 }, { "epoch": 0.5523482037824631, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 2.9897, "step": 30520 }, { "epoch": 0.5527101619762917, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 0.9179, "step": 30540 }, { "epoch": 0.5530721201701203, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 0.2923, "step": 30560 }, { "epoch": 0.553434078363949, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 0.8205, "step": 30580 }, { "epoch": 0.5537960365577775, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 2.1928, "step": 30600 }, { "epoch": 0.5541579947516062, "grad_norm": NaN, "learning_rate": 0.0003374253483742535, "loss": 2.7092, "step": 30620 }, { "epoch": 0.5545199529454348, "grad_norm": NaN, "learning_rate": 0.0003374796404657055, "loss": 5.2901, "step": 30640 }, { "epoch": 0.5548819111392634, "grad_norm": NaN, "learning_rate": 0.0003374796404657055, "loss": 3.344, "step": 30660 }, { "epoch": 0.555243869333092, "grad_norm": NaN, "learning_rate": 0.0003374796404657055, "loss": 0.527, "step": 30680 }, { "epoch": 0.5556058275269207, "grad_norm": NaN, "learning_rate": 0.0003374796404657055, "loss": 1.5043, "step": 30700 }, { "epoch": 0.5559677857207492, "grad_norm": NaN, "learning_rate": 0.00033758822464860955, "loss": 1.2209, "step": 30720 }, { "epoch": 0.5563297439145779, "grad_norm": NaN, "learning_rate": 0.00033758822464860955, "loss": 0.7636, "step": 30740 }, { "epoch": 0.5566917021084065, "grad_norm": NaN, "learning_rate": 0.00033758822464860955, "loss": 1.0511, "step": 30760 }, { "epoch": 0.5570536603022351, "grad_norm": NaN, "learning_rate": 0.00033758822464860955, "loss": 0.7603, "step": 30780 }, { "epoch": 0.5574156184960637, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 3.298, "step": 30800 }, { "epoch": 0.5577775766898924, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 1.3454, "step": 30820 }, { "epoch": 0.5581395348837209, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 2.6237, "step": 30840 }, { "epoch": 0.5585014930775496, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 1.3254, "step": 30860 }, { "epoch": 0.5588634512713782, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 2.1697, "step": 30880 }, { "epoch": 0.5592254094652067, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 2.2754, "step": 30900 }, { "epoch": 0.5595873676590354, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 3.7609, "step": 30920 }, { "epoch": 0.5599493258528639, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 0.2628, "step": 30940 }, { "epoch": 0.5603112840466926, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 0.8309, "step": 30960 }, { "epoch": 0.5606732422405212, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 0.5879, "step": 30980 }, { "epoch": 0.5610352004343498, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 4.2088, "step": 31000 }, { "epoch": 0.5613971586281784, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 0.9703, "step": 31020 }, { "epoch": 0.5617591168220071, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 0.0, "step": 31040 }, { "epoch": 0.5621210750158356, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 1.3204, "step": 31060 }, { "epoch": 0.5624830332096643, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 2.5039, "step": 31080 }, { "epoch": 0.5628449914034929, "grad_norm": NaN, "learning_rate": 0.0003376425167400615, "loss": 2.8935, "step": 31100 }, { "epoch": 0.5632069495973215, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 3.7335, "step": 31120 }, { "epoch": 0.5635689077911501, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 0.0, "step": 31140 }, { "epoch": 0.5639308659849788, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 2.4728, "step": 31160 }, { "epoch": 0.5642928241788073, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 3.032, "step": 31180 }, { "epoch": 0.564654782372636, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 1.2609, "step": 31200 }, { "epoch": 0.5650167405664646, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 0.5703, "step": 31220 }, { "epoch": 0.5653786987602932, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 0.0, "step": 31240 }, { "epoch": 0.5657406569541218, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 1.2567, "step": 31260 }, { "epoch": 0.5661026151479505, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 0.2647, "step": 31280 }, { "epoch": 0.566464573341779, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 0.4066, "step": 31300 }, { "epoch": 0.5668265315356077, "grad_norm": NaN, "learning_rate": 0.00033769680883151355, "loss": 1.8915, "step": 31320 }, { "epoch": 0.5671884897294363, "grad_norm": NaN, "learning_rate": 0.0003377511009229655, "loss": 4.0601, "step": 31340 }, { "epoch": 0.5675504479232648, "grad_norm": NaN, "learning_rate": 0.0003377511009229655, "loss": 0.377, "step": 31360 }, { "epoch": 0.5679124061170935, "grad_norm": NaN, "learning_rate": 0.0003377511009229655, "loss": 3.3188, "step": 31380 }, { "epoch": 0.568274364310922, "grad_norm": NaN, "learning_rate": 0.0003377511009229655, "loss": 0.776, "step": 31400 }, { "epoch": 0.5686363225047507, "grad_norm": NaN, "learning_rate": 0.00033780539301441756, "loss": 4.5019, "step": 31420 }, { "epoch": 0.5689982806985793, "grad_norm": NaN, "learning_rate": 0.00033780539301441756, "loss": 1.3692, "step": 31440 }, { "epoch": 0.5693602388924079, "grad_norm": NaN, "learning_rate": 0.0003378596851058696, "loss": 4.8838, "step": 31460 }, { "epoch": 0.5697221970862365, "grad_norm": NaN, "learning_rate": 0.0003378596851058696, "loss": 0.5303, "step": 31480 }, { "epoch": 0.5700841552800652, "grad_norm": NaN, "learning_rate": 0.0003378596851058696, "loss": 0.3097, "step": 31500 }, { "epoch": 0.5704461134738937, "grad_norm": NaN, "learning_rate": 0.0003378596851058696, "loss": 0.5267, "step": 31520 }, { "epoch": 0.5708080716677224, "grad_norm": NaN, "learning_rate": 0.0003378596851058696, "loss": 2.0099, "step": 31540 }, { "epoch": 0.571170029861551, "grad_norm": NaN, "learning_rate": 0.0003378596851058696, "loss": 0.7905, "step": 31560 }, { "epoch": 0.5715319880553796, "grad_norm": NaN, "learning_rate": 0.00033791397719732156, "loss": 4.0043, "step": 31580 }, { "epoch": 0.5718939462492082, "grad_norm": NaN, "learning_rate": 0.00033791397719732156, "loss": 0.6581, "step": 31600 }, { "epoch": 0.5722559044430369, "grad_norm": NaN, "learning_rate": 0.00033791397719732156, "loss": 0.6558, "step": 31620 }, { "epoch": 0.5726178626368654, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 2.1176, "step": 31640 }, { "epoch": 0.5729798208306941, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 1.5842, "step": 31660 }, { "epoch": 0.5733417790245227, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 0.261, "step": 31680 }, { "epoch": 0.5737037372183513, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 0.661, "step": 31700 }, { "epoch": 0.5740656954121799, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 4.9456, "step": 31720 }, { "epoch": 0.5744276536060086, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 1.377, "step": 31740 }, { "epoch": 0.5747896117998371, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 1.4774, "step": 31760 }, { "epoch": 0.5751515699936657, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 2.2917, "step": 31780 }, { "epoch": 0.5755135281874943, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 1.3992, "step": 31800 }, { "epoch": 0.5758754863813229, "grad_norm": NaN, "learning_rate": 0.0003379682692887736, "loss": 0.0, "step": 31820 }, { "epoch": 0.5762374445751516, "grad_norm": NaN, "learning_rate": 0.0003380225613802256, "loss": 2.9791, "step": 31840 }, { "epoch": 0.5765994027689801, "grad_norm": NaN, "learning_rate": 0.0003380225613802256, "loss": 0.6377, "step": 31860 }, { "epoch": 0.5769613609628088, "grad_norm": NaN, "learning_rate": 0.0003380225613802256, "loss": 1.0643, "step": 31880 }, { "epoch": 0.5773233191566374, "grad_norm": NaN, "learning_rate": 0.0003380225613802256, "loss": 1.4489, "step": 31900 }, { "epoch": 0.577685277350466, "grad_norm": NaN, "learning_rate": 0.0003380225613802256, "loss": 2.8674, "step": 31920 }, { "epoch": 0.5780472355442946, "grad_norm": NaN, "learning_rate": 0.0003380768534716776, "loss": 1.5195, "step": 31940 }, { "epoch": 0.5784091937381233, "grad_norm": NaN, "learning_rate": 0.0003380768534716776, "loss": 2.4152, "step": 31960 }, { "epoch": 0.5787711519319518, "grad_norm": NaN, "learning_rate": 0.0003381311455631296, "loss": 0.876, "step": 31980 }, { "epoch": 0.5791331101257805, "grad_norm": NaN, "learning_rate": 0.0003381311455631296, "loss": 0.8222, "step": 32000 }, { "epoch": 0.5794950683196091, "grad_norm": NaN, "learning_rate": 0.0003381311455631296, "loss": 1.0204, "step": 32020 }, { "epoch": 0.5798570265134377, "grad_norm": NaN, "learning_rate": 0.0003381311455631296, "loss": 4.3559, "step": 32040 }, { "epoch": 0.5802189847072663, "grad_norm": NaN, "learning_rate": 0.0003381311455631296, "loss": 2.5171, "step": 32060 }, { "epoch": 0.580580942901095, "grad_norm": NaN, "learning_rate": 0.00033818543765458164, "loss": 1.8858, "step": 32080 }, { "epoch": 0.5809429010949235, "grad_norm": NaN, "learning_rate": 0.00033818543765458164, "loss": 0.5861, "step": 32100 }, { "epoch": 0.5813048592887522, "grad_norm": NaN, "learning_rate": 0.00033818543765458164, "loss": 1.9742, "step": 32120 }, { "epoch": 0.5816668174825808, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 4.5212, "step": 32140 }, { "epoch": 0.5820287756764094, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 3.7017, "step": 32160 }, { "epoch": 0.582390733870238, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 1.5592, "step": 32180 }, { "epoch": 0.5827526920640665, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 3.7027, "step": 32200 }, { "epoch": 0.5831146502578952, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 0.4786, "step": 32220 }, { "epoch": 0.5834766084517238, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 1.0346, "step": 32240 }, { "epoch": 0.5838385666455524, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 2.9245, "step": 32260 }, { "epoch": 0.584200524839381, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 1.9817, "step": 32280 }, { "epoch": 0.5845624830332097, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 1.9598, "step": 32300 }, { "epoch": 0.5849244412270382, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 0.8555, "step": 32320 }, { "epoch": 0.5852863994208669, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 3.6154, "step": 32340 }, { "epoch": 0.5856483576146955, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 2.4032, "step": 32360 }, { "epoch": 0.5860103158085241, "grad_norm": NaN, "learning_rate": 0.0003382397297460336, "loss": 1.2044, "step": 32380 }, { "epoch": 0.5863722740023527, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 3.1887, "step": 32400 }, { "epoch": 0.5867342321961814, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 0.3301, "step": 32420 }, { "epoch": 0.5870961903900099, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 1.8997, "step": 32440 }, { "epoch": 0.5874581485838386, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 1.3953, "step": 32460 }, { "epoch": 0.5878201067776672, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 0.6638, "step": 32480 }, { "epoch": 0.5881820649714958, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 4.518, "step": 32500 }, { "epoch": 0.5885440231653244, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 1.2981, "step": 32520 }, { "epoch": 0.5889059813591531, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 2.1839, "step": 32540 }, { "epoch": 0.5892679395529816, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 2.8422, "step": 32560 }, { "epoch": 0.5896298977468103, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 0.8342, "step": 32580 }, { "epoch": 0.5899918559406389, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 0.8171, "step": 32600 }, { "epoch": 0.5903538141344674, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 2.7784, "step": 32620 }, { "epoch": 0.5907157723282961, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 2.5405, "step": 32640 }, { "epoch": 0.5910777305221246, "grad_norm": NaN, "learning_rate": 0.00033829402183748565, "loss": 2.8424, "step": 32660 }, { "epoch": 0.5914396887159533, "grad_norm": NaN, "learning_rate": 0.0003383483139289377, "loss": 2.3829, "step": 32680 }, { "epoch": 0.5918016469097819, "grad_norm": NaN, "learning_rate": 0.0003383483139289377, "loss": 1.9787, "step": 32700 }, { "epoch": 0.5921636051036105, "grad_norm": NaN, "learning_rate": 0.0003383483139289377, "loss": 2.3822, "step": 32720 }, { "epoch": 0.5925255632974391, "grad_norm": NaN, "learning_rate": 0.0003383483139289377, "loss": 3.9111, "step": 32740 }, { "epoch": 0.5928875214912678, "grad_norm": NaN, "learning_rate": 0.0003383483139289377, "loss": 1.3643, "step": 32760 }, { "epoch": 0.5932494796850963, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.047, "step": 32780 }, { "epoch": 0.593611437878925, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.9585, "step": 32800 }, { "epoch": 0.5939733960727536, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 4.1351, "step": 32820 }, { "epoch": 0.5943353542665822, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.6157, "step": 32840 }, { "epoch": 0.5946973124604108, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.7448, "step": 32860 }, { "epoch": 0.5950592706542395, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.7489, "step": 32880 }, { "epoch": 0.595421228848068, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.8747, "step": 32900 }, { "epoch": 0.5957831870418967, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.8152, "step": 32920 }, { "epoch": 0.5961451452357253, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 1.1254, "step": 32940 }, { "epoch": 0.5965071034295539, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 1.5716, "step": 32960 }, { "epoch": 0.5968690616233825, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 4.1376, "step": 32980 }, { "epoch": 0.5972310198172112, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.5842, "step": 33000 }, { "epoch": 0.5975929780110397, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.875, "step": 33020 }, { "epoch": 0.5979549362048684, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.7708, "step": 33040 }, { "epoch": 0.598316894398697, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.3296, "step": 33060 }, { "epoch": 0.5986788525925255, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.8913, "step": 33080 }, { "epoch": 0.5990408107863542, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.8881, "step": 33100 }, { "epoch": 0.5994027689801827, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.5676, "step": 33120 }, { "epoch": 0.5997647271740114, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 3.7095, "step": 33140 }, { "epoch": 0.60012668536784, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 1.7405, "step": 33160 }, { "epoch": 0.6004886435616686, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.5894, "step": 33180 }, { "epoch": 0.6008506017554972, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.6553, "step": 33200 }, { "epoch": 0.6012125599493259, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.6764, "step": 33220 }, { "epoch": 0.6015745181431544, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 1.4665, "step": 33240 }, { "epoch": 0.6019364763369831, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.5828, "step": 33260 }, { "epoch": 0.6022984345308117, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 1.3079, "step": 33280 }, { "epoch": 0.6026603927246403, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 1.2401, "step": 33300 }, { "epoch": 0.6030223509184689, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.473, "step": 33320 }, { "epoch": 0.6033843091122976, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.942, "step": 33340 }, { "epoch": 0.6037462673061261, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 3.2602, "step": 33360 }, { "epoch": 0.6041082254999548, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.9383, "step": 33380 }, { "epoch": 0.6044701836937834, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 1.8251, "step": 33400 }, { "epoch": 0.604832141887612, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.8731, "step": 33420 }, { "epoch": 0.6051941000814406, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 3.7498, "step": 33440 }, { "epoch": 0.6055560582752693, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 4.6315, "step": 33460 }, { "epoch": 0.6059180164690978, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 2.1757, "step": 33480 }, { "epoch": 0.6062799746629264, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 3.2739, "step": 33500 }, { "epoch": 0.606641932856755, "grad_norm": NaN, "learning_rate": 0.00033840260602038965, "loss": 0.3309, "step": 33520 }, { "epoch": 0.6070038910505836, "grad_norm": NaN, "learning_rate": 0.0003384568981118417, "loss": 4.0132, "step": 33540 }, { "epoch": 0.6073658492444123, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 1.2405, "step": 33560 }, { "epoch": 0.6077278074382408, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 0.753, "step": 33580 }, { "epoch": 0.6080897656320695, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 1.6577, "step": 33600 }, { "epoch": 0.6084517238258981, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 1.7591, "step": 33620 }, { "epoch": 0.6088136820197267, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 2.2422, "step": 33640 }, { "epoch": 0.6091756402135553, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 1.0707, "step": 33660 }, { "epoch": 0.609537598407384, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 1.5656, "step": 33680 }, { "epoch": 0.6098995566012125, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 2.6784, "step": 33700 }, { "epoch": 0.6102615147950412, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 2.3588, "step": 33720 }, { "epoch": 0.6106234729888698, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 4.6354, "step": 33740 }, { "epoch": 0.6109854311826984, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 1.5306, "step": 33760 }, { "epoch": 0.611347389376527, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 1.2227, "step": 33780 }, { "epoch": 0.6117093475703557, "grad_norm": NaN, "learning_rate": 0.00033851119020329376, "loss": 0.0, "step": 33800 }, { "epoch": 0.6120713057641842, "grad_norm": NaN, "learning_rate": 0.00033856548229474573, "loss": 2.1893, "step": 33820 }, { "epoch": 0.6124332639580129, "grad_norm": NaN, "learning_rate": 0.00033856548229474573, "loss": 1.0192, "step": 33840 }, { "epoch": 0.6127952221518415, "grad_norm": NaN, "learning_rate": 0.00033856548229474573, "loss": 0.0, "step": 33860 }, { "epoch": 0.6131571803456701, "grad_norm": NaN, "learning_rate": 0.00033856548229474573, "loss": 2.1149, "step": 33880 }, { "epoch": 0.6135191385394987, "grad_norm": NaN, "learning_rate": 0.00033856548229474573, "loss": 1.0614, "step": 33900 }, { "epoch": 0.6138810967333272, "grad_norm": NaN, "learning_rate": 0.00033856548229474573, "loss": 0.5862, "step": 33920 }, { "epoch": 0.6142430549271559, "grad_norm": NaN, "learning_rate": 0.00033856548229474573, "loss": 3.149, "step": 33940 }, { "epoch": 0.6146050131209845, "grad_norm": NaN, "learning_rate": 0.00033861977438619776, "loss": 2.9405, "step": 33960 }, { "epoch": 0.6149669713148131, "grad_norm": NaN, "learning_rate": 0.00033861977438619776, "loss": 4.0777, "step": 33980 }, { "epoch": 0.6153289295086417, "grad_norm": NaN, "learning_rate": 0.00033861977438619776, "loss": 3.36, "step": 34000 }, { "epoch": 0.6156908877024704, "grad_norm": NaN, "learning_rate": 0.00033861977438619776, "loss": 1.6447, "step": 34020 }, { "epoch": 0.6160528458962989, "grad_norm": NaN, "learning_rate": 0.00033867406647764974, "loss": 2.5351, "step": 34040 }, { "epoch": 0.6164148040901276, "grad_norm": NaN, "learning_rate": 0.00033867406647764974, "loss": 2.008, "step": 34060 }, { "epoch": 0.6167767622839562, "grad_norm": NaN, "learning_rate": 0.00033867406647764974, "loss": 1.1559, "step": 34080 }, { "epoch": 0.6171387204777848, "grad_norm": NaN, "learning_rate": 0.00033867406647764974, "loss": 1.84, "step": 34100 }, { "epoch": 0.6175006786716134, "grad_norm": NaN, "learning_rate": 0.00033867406647764974, "loss": 1.3541, "step": 34120 }, { "epoch": 0.6178626368654421, "grad_norm": NaN, "learning_rate": 0.00033867406647764974, "loss": 1.1305, "step": 34140 }, { "epoch": 0.6182245950592706, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 3.6217, "step": 34160 }, { "epoch": 0.6185865532530993, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.908, "step": 34180 }, { "epoch": 0.6189485114469279, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.6959, "step": 34200 }, { "epoch": 0.6193104696407565, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.1185, "step": 34220 }, { "epoch": 0.6196724278345851, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.6876, "step": 34240 }, { "epoch": 0.6200343860284138, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.7483, "step": 34260 }, { "epoch": 0.6203963442222423, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 3.7268, "step": 34280 }, { "epoch": 0.620758302416071, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.0185, "step": 34300 }, { "epoch": 0.6211202606098996, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.6343, "step": 34320 }, { "epoch": 0.6214822188037281, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.0173, "step": 34340 }, { "epoch": 0.6218441769975568, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.6121, "step": 34360 }, { "epoch": 0.6222061351913853, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.4123, "step": 34380 }, { "epoch": 0.622568093385214, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.8988, "step": 34400 }, { "epoch": 0.6229300515790426, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.4129, "step": 34420 }, { "epoch": 0.6232920097728712, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.6322, "step": 34440 }, { "epoch": 0.6236539679666998, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.3118, "step": 34460 }, { "epoch": 0.6240159261605285, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.7433, "step": 34480 }, { "epoch": 0.624377884354357, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 3.4847, "step": 34500 }, { "epoch": 0.6247398425481857, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.0673, "step": 34520 }, { "epoch": 0.6251018007420143, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 3.4612, "step": 34540 }, { "epoch": 0.6254637589358429, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.6037, "step": 34560 }, { "epoch": 0.6258257171296715, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.4632, "step": 34580 }, { "epoch": 0.6261876753235002, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.3263, "step": 34600 }, { "epoch": 0.6265496335173287, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.8046, "step": 34620 }, { "epoch": 0.6269115917111574, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.4509, "step": 34640 }, { "epoch": 0.627273549904986, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 3.4612, "step": 34660 }, { "epoch": 0.6276355080988146, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.6659, "step": 34680 }, { "epoch": 0.6279974662926432, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.2545, "step": 34700 }, { "epoch": 0.6283594244864719, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.1847, "step": 34720 }, { "epoch": 0.6287213826803004, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.9613, "step": 34740 }, { "epoch": 0.6290833408741291, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 3.0888, "step": 34760 }, { "epoch": 0.6294452990679577, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.4538, "step": 34780 }, { "epoch": 0.6298072572617862, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.8354, "step": 34800 }, { "epoch": 0.6301692154556149, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.0278, "step": 34820 }, { "epoch": 0.6305311736494434, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.2199, "step": 34840 }, { "epoch": 0.6308931318432721, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.7863, "step": 34860 }, { "epoch": 0.6312550900371007, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 2.1344, "step": 34880 }, { "epoch": 0.6316170482309293, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.3531, "step": 34900 }, { "epoch": 0.6319790064247579, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 3.5952, "step": 34920 }, { "epoch": 0.6323409646185866, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.2902, "step": 34940 }, { "epoch": 0.6327029228124151, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 4.1284, "step": 34960 }, { "epoch": 0.6330648810062438, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 6.2895, "step": 34980 }, { "epoch": 0.6334268392000724, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.6642, "step": 35000 }, { "epoch": 0.6334268392000724, "eval_accuracy": 4.044985725999523e-05, "eval_loss": NaN, "eval_runtime": 170.124, "eval_samples_per_second": 3572.923, "eval_steps_per_second": 3.492, "step": 35000 }, { "epoch": 0.633788797393901, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 0.8256, "step": 35020 }, { "epoch": 0.6341507555877296, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.7584, "step": 35040 }, { "epoch": 0.6345127137815583, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.4393, "step": 35060 }, { "epoch": 0.6348746719753868, "grad_norm": NaN, "learning_rate": 0.00033872835856910177, "loss": 1.9828, "step": 35080 }, { "epoch": 0.6352366301692155, "grad_norm": NaN, "learning_rate": 0.0003387826506605538, "loss": 3.1671, "step": 35100 }, { "epoch": 0.6355985883630441, "grad_norm": NaN, "learning_rate": 0.00033883694275200577, "loss": 2.0757, "step": 35120 }, { "epoch": 0.6359605465568727, "grad_norm": NaN, "learning_rate": 0.00033883694275200577, "loss": 0.0, "step": 35140 }, { "epoch": 0.6363225047507013, "grad_norm": NaN, "learning_rate": 0.0003388912348434578, "loss": 1.7126, "step": 35160 }, { "epoch": 0.63668446294453, "grad_norm": NaN, "learning_rate": 0.0003388912348434578, "loss": 0.6618, "step": 35180 }, { "epoch": 0.6370464211383585, "grad_norm": NaN, "learning_rate": 0.0003388912348434578, "loss": 0.757, "step": 35200 }, { "epoch": 0.6374083793321871, "grad_norm": NaN, "learning_rate": 0.0003388912348434578, "loss": 0.0, "step": 35220 }, { "epoch": 0.6377703375260158, "grad_norm": NaN, "learning_rate": 0.0003388912348434578, "loss": 1.5613, "step": 35240 }, { "epoch": 0.6381322957198443, "grad_norm": NaN, "learning_rate": 0.0003389455269349098, "loss": 1.3998, "step": 35260 }, { "epoch": 0.638494253913673, "grad_norm": NaN, "learning_rate": 0.0003389455269349098, "loss": 0.7427, "step": 35280 }, { "epoch": 0.6388562121075015, "grad_norm": NaN, "learning_rate": 0.0003389998190263618, "loss": 5.5387, "step": 35300 }, { "epoch": 0.6392181703013302, "grad_norm": NaN, "learning_rate": 0.0003389998190263618, "loss": 1.9642, "step": 35320 }, { "epoch": 0.6395801284951588, "grad_norm": NaN, "learning_rate": 0.0003389998190263618, "loss": 1.5359, "step": 35340 }, { "epoch": 0.6399420866889874, "grad_norm": NaN, "learning_rate": 0.0003389998190263618, "loss": 0.4792, "step": 35360 }, { "epoch": 0.640304044882816, "grad_norm": NaN, "learning_rate": 0.0003389998190263618, "loss": 1.3603, "step": 35380 }, { "epoch": 0.6406660030766447, "grad_norm": NaN, "learning_rate": 0.0003389998190263618, "loss": 0.7629, "step": 35400 }, { "epoch": 0.6410279612704732, "grad_norm": NaN, "learning_rate": 0.0003389998190263618, "loss": 1.4802, "step": 35420 }, { "epoch": 0.6413899194643019, "grad_norm": NaN, "learning_rate": 0.0003390541111178138, "loss": 4.3307, "step": 35440 }, { "epoch": 0.6417518776581305, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 2.5926, "step": 35460 }, { "epoch": 0.6421138358519591, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 3.2833, "step": 35480 }, { "epoch": 0.6424757940457877, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 4.9173, "step": 35500 }, { "epoch": 0.6428377522396164, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 0.5275, "step": 35520 }, { "epoch": 0.6431997104334449, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 1.4497, "step": 35540 }, { "epoch": 0.6435616686272736, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 0.4034, "step": 35560 }, { "epoch": 0.6439236268211022, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 2.4885, "step": 35580 }, { "epoch": 0.6442855850149308, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 1.5954, "step": 35600 }, { "epoch": 0.6446475432087594, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 2.7346, "step": 35620 }, { "epoch": 0.645009501402588, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 4.7249, "step": 35640 }, { "epoch": 0.6453714595964166, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 0.2657, "step": 35660 }, { "epoch": 0.6457334177902452, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 0.48, "step": 35680 }, { "epoch": 0.6460953759840738, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 2.6495, "step": 35700 }, { "epoch": 0.6464573341779024, "grad_norm": NaN, "learning_rate": 0.00033910840320926585, "loss": 3.6956, "step": 35720 }, { "epoch": 0.6468192923717311, "grad_norm": NaN, "learning_rate": 0.00033916269530071783, "loss": 4.862, "step": 35740 }, { "epoch": 0.6471812505655596, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 5.4816, "step": 35760 }, { "epoch": 0.6475432087593883, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 0.3298, "step": 35780 }, { "epoch": 0.6479051669532169, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 3.0427, "step": 35800 }, { "epoch": 0.6482671251470455, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 1.5398, "step": 35820 }, { "epoch": 0.6486290833408741, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 2.1597, "step": 35840 }, { "epoch": 0.6489910415347028, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 2.6124, "step": 35860 }, { "epoch": 0.6493529997285313, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 2.3736, "step": 35880 }, { "epoch": 0.64971495792236, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 0.5241, "step": 35900 }, { "epoch": 0.6500769161161886, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 0.329, "step": 35920 }, { "epoch": 0.6504388743100172, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 0.7409, "step": 35940 }, { "epoch": 0.6508008325038458, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 0.5299, "step": 35960 }, { "epoch": 0.6511627906976745, "grad_norm": NaN, "learning_rate": 0.00033921698739216986, "loss": 0.0, "step": 35980 }, { "epoch": 0.651524748891503, "grad_norm": NaN, "learning_rate": 0.0003392712794836219, "loss": 0.8578, "step": 36000 }, { "epoch": 0.6518867070853317, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 2.9669, "step": 36020 }, { "epoch": 0.6522486652791603, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 1.7578, "step": 36040 }, { "epoch": 0.6526106234729888, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 3.1266, "step": 36060 }, { "epoch": 0.6529725816668175, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 1.5293, "step": 36080 }, { "epoch": 0.653334539860646, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 1.134, "step": 36100 }, { "epoch": 0.6536964980544747, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 3.078, "step": 36120 }, { "epoch": 0.6540584562483033, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 1.4875, "step": 36140 }, { "epoch": 0.654420414442132, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 0.772, "step": 36160 }, { "epoch": 0.6547823726359605, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 0.0, "step": 36180 }, { "epoch": 0.6551443308297892, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 0.5703, "step": 36200 }, { "epoch": 0.6555062890236177, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 0.2789, "step": 36220 }, { "epoch": 0.6558682472174464, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 2.9863, "step": 36240 }, { "epoch": 0.656230205411275, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 1.1667, "step": 36260 }, { "epoch": 0.6565921636051036, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 1.313, "step": 36280 }, { "epoch": 0.6569541217989322, "grad_norm": NaN, "learning_rate": 0.00033932557157507386, "loss": 1.2836, "step": 36300 }, { "epoch": 0.6573160799927609, "grad_norm": NaN, "learning_rate": 0.0003393798636665259, "loss": 2.2436, "step": 36320 }, { "epoch": 0.6576780381865894, "grad_norm": NaN, "learning_rate": 0.0003393798636665259, "loss": 1.0165, "step": 36340 }, { "epoch": 0.6580399963804181, "grad_norm": NaN, "learning_rate": 0.0003393798636665259, "loss": 1.149, "step": 36360 }, { "epoch": 0.6584019545742467, "grad_norm": NaN, "learning_rate": 0.0003393798636665259, "loss": 2.7208, "step": 36380 }, { "epoch": 0.6587639127680753, "grad_norm": NaN, "learning_rate": 0.0003393798636665259, "loss": 0.8133, "step": 36400 }, { "epoch": 0.6591258709619039, "grad_norm": NaN, "learning_rate": 0.0003393798636665259, "loss": 0.8109, "step": 36420 }, { "epoch": 0.6594878291557326, "grad_norm": NaN, "learning_rate": 0.0003393798636665259, "loss": 0.997, "step": 36440 }, { "epoch": 0.6598497873495611, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 0.8476, "step": 36460 }, { "epoch": 0.6602117455433897, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 2.5703, "step": 36480 }, { "epoch": 0.6605737037372184, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 0.6823, "step": 36500 }, { "epoch": 0.6609356619310469, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 0.2757, "step": 36520 }, { "epoch": 0.6612976201248756, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 2.4816, "step": 36540 }, { "epoch": 0.6616595783187041, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 0.0, "step": 36560 }, { "epoch": 0.6620215365125328, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 2.6417, "step": 36580 }, { "epoch": 0.6623834947063614, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 2.0684, "step": 36600 }, { "epoch": 0.66274545290019, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 2.3207, "step": 36620 }, { "epoch": 0.6631074110940186, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 2.33, "step": 36640 }, { "epoch": 0.6634693692878473, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 4.4484, "step": 36660 }, { "epoch": 0.6638313274816758, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 0.2786, "step": 36680 }, { "epoch": 0.6641932856755045, "grad_norm": NaN, "learning_rate": 0.0003394341557579779, "loss": 5.3884, "step": 36700 }, { "epoch": 0.6645552438693331, "grad_norm": NaN, "learning_rate": 0.0003394884478494299, "loss": 1.0773, "step": 36720 }, { "epoch": 0.6649172020631617, "grad_norm": NaN, "learning_rate": 0.0003394884478494299, "loss": 0.2935, "step": 36740 }, { "epoch": 0.6652791602569903, "grad_norm": NaN, "learning_rate": 0.0003394884478494299, "loss": 2.4258, "step": 36760 }, { "epoch": 0.665641118450819, "grad_norm": NaN, "learning_rate": 0.0003395427399408819, "loss": 0.941, "step": 36780 }, { "epoch": 0.6660030766446475, "grad_norm": NaN, "learning_rate": 0.0003395427399408819, "loss": 0.8534, "step": 36800 }, { "epoch": 0.6663650348384762, "grad_norm": NaN, "learning_rate": 0.0003395427399408819, "loss": 1.1632, "step": 36820 }, { "epoch": 0.6667269930323048, "grad_norm": NaN, "learning_rate": 0.0003395427399408819, "loss": 1.0667, "step": 36840 }, { "epoch": 0.6670889512261334, "grad_norm": NaN, "learning_rate": 0.00033959703203233395, "loss": 2.3333, "step": 36860 }, { "epoch": 0.667450909419962, "grad_norm": NaN, "learning_rate": 0.00033959703203233395, "loss": 1.9855, "step": 36880 }, { "epoch": 0.6678128676137907, "grad_norm": NaN, "learning_rate": 0.00033959703203233395, "loss": 0.6104, "step": 36900 }, { "epoch": 0.6681748258076192, "grad_norm": NaN, "learning_rate": 0.00033959703203233395, "loss": 3.0791, "step": 36920 }, { "epoch": 0.6685367840014478, "grad_norm": NaN, "learning_rate": 0.00033959703203233395, "loss": 1.97, "step": 36940 }, { "epoch": 0.6688987421952765, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 1.2541, "step": 36960 }, { "epoch": 0.669260700389105, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 0.5397, "step": 36980 }, { "epoch": 0.6696226585829337, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 4.1352, "step": 37000 }, { "epoch": 0.6699846167767622, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 0.6441, "step": 37020 }, { "epoch": 0.6703465749705909, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 1.3072, "step": 37040 }, { "epoch": 0.6707085331644195, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 0.0, "step": 37060 }, { "epoch": 0.6710704913582481, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 2.8441, "step": 37080 }, { "epoch": 0.6714324495520767, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 2.4202, "step": 37100 }, { "epoch": 0.6717944077459054, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 3.0204, "step": 37120 }, { "epoch": 0.6721563659397339, "grad_norm": NaN, "learning_rate": 0.000339651324123786, "loss": 0.2653, "step": 37140 }, { "epoch": 0.6725183241335626, "grad_norm": NaN, "learning_rate": 0.000339705616215238, "loss": 1.6176, "step": 37160 }, { "epoch": 0.6728802823273912, "grad_norm": NaN, "learning_rate": 0.000339705616215238, "loss": 2.0377, "step": 37180 }, { "epoch": 0.6732422405212198, "grad_norm": NaN, "learning_rate": 0.000339705616215238, "loss": 2.6625, "step": 37200 }, { "epoch": 0.6736041987150484, "grad_norm": NaN, "learning_rate": 0.00033975990830669, "loss": 4.3986, "step": 37220 }, { "epoch": 0.6739661569088771, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 2.5446, "step": 37240 }, { "epoch": 0.6743281151027056, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 1.3551, "step": 37260 }, { "epoch": 0.6746900732965343, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 0.2778, "step": 37280 }, { "epoch": 0.6750520314903629, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 1.4963, "step": 37300 }, { "epoch": 0.6754139896841915, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 0.5676, "step": 37320 }, { "epoch": 0.6757759478780201, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 1.1903, "step": 37340 }, { "epoch": 0.6761379060718486, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 1.4512, "step": 37360 }, { "epoch": 0.6764998642656773, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 3.489, "step": 37380 }, { "epoch": 0.6768618224595059, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 1.9502, "step": 37400 }, { "epoch": 0.6772237806533346, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 2.8639, "step": 37420 }, { "epoch": 0.6775857388471631, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 4.7671, "step": 37440 }, { "epoch": 0.6779476970409918, "grad_norm": NaN, "learning_rate": 0.00033986849248959403, "loss": 0.4403, "step": 37460 }, { "epoch": 0.6783096552348203, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 3.0565, "step": 37480 }, { "epoch": 0.678671613428649, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 0.5899, "step": 37500 }, { "epoch": 0.6790335716224776, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 1.9067, "step": 37520 }, { "epoch": 0.6793955298163062, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 2.0158, "step": 37540 }, { "epoch": 0.6797574880101348, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 0.2924, "step": 37560 }, { "epoch": 0.6801194462039635, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 0.0, "step": 37580 }, { "epoch": 0.680481404397792, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 1.6403, "step": 37600 }, { "epoch": 0.6808433625916207, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 0.5313, "step": 37620 }, { "epoch": 0.6812053207854493, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 3.8097, "step": 37640 }, { "epoch": 0.6815672789792779, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 1.0063, "step": 37660 }, { "epoch": 0.6819292371731065, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 0.0, "step": 37680 }, { "epoch": 0.6822911953669352, "grad_norm": NaN, "learning_rate": 0.000339922784581046, "loss": 1.6253, "step": 37700 }, { "epoch": 0.6826531535607637, "grad_norm": NaN, "learning_rate": 0.00033997707667249804, "loss": 4.8992, "step": 37720 }, { "epoch": 0.6830151117545924, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 3.6954, "step": 37740 }, { "epoch": 0.683377069948421, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 1.4844, "step": 37760 }, { "epoch": 0.6837390281422495, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 3.8384, "step": 37780 }, { "epoch": 0.6841009863360782, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 0.2784, "step": 37800 }, { "epoch": 0.6844629445299067, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 2.1919, "step": 37820 }, { "epoch": 0.6848249027237354, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 2.4023, "step": 37840 }, { "epoch": 0.685186860917564, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 0.2495, "step": 37860 }, { "epoch": 0.6855488191113926, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 1.3315, "step": 37880 }, { "epoch": 0.6859107773052212, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 0.6564, "step": 37900 }, { "epoch": 0.6862727354990499, "grad_norm": NaN, "learning_rate": 0.00034008566085540204, "loss": 3.516, "step": 37920 }, { "epoch": 0.6866346936928784, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 0.8847, "step": 37940 }, { "epoch": 0.6869966518867071, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 1.0364, "step": 37960 }, { "epoch": 0.6873586100805357, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 1.3164, "step": 37980 }, { "epoch": 0.6877205682743643, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 1.8299, "step": 38000 }, { "epoch": 0.6880825264681929, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 1.9752, "step": 38020 }, { "epoch": 0.6884444846620216, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 2.2107, "step": 38040 }, { "epoch": 0.6888064428558501, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 1.367, "step": 38060 }, { "epoch": 0.6891684010496788, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 0.0, "step": 38080 }, { "epoch": 0.6895303592435074, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 2.9568, "step": 38100 }, { "epoch": 0.689892317437336, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 2.4739, "step": 38120 }, { "epoch": 0.6902542756311646, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 2.0114, "step": 38140 }, { "epoch": 0.6906162338249933, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 1.122, "step": 38160 }, { "epoch": 0.6909781920188218, "grad_norm": NaN, "learning_rate": 0.00034013995294685407, "loss": 1.742, "step": 38180 }, { "epoch": 0.6913401502126504, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 1.8506, "step": 38200 }, { "epoch": 0.691702108406479, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 0.0, "step": 38220 }, { "epoch": 0.6920640666003076, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 1.2562, "step": 38240 }, { "epoch": 0.6924260247941363, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 3.8821, "step": 38260 }, { "epoch": 0.6927879829879648, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 1.3118, "step": 38280 }, { "epoch": 0.6931499411817935, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 2.8177, "step": 38300 }, { "epoch": 0.6935118993756221, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 3.6119, "step": 38320 }, { "epoch": 0.6938738575694507, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 0.3784, "step": 38340 }, { "epoch": 0.6942358157632793, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 3.3262, "step": 38360 }, { "epoch": 0.694597773957108, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 0.9229, "step": 38380 }, { "epoch": 0.6949597321509365, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 4.1083, "step": 38400 }, { "epoch": 0.6953216903447652, "grad_norm": NaN, "learning_rate": 0.0003401942450383061, "loss": 1.1548, "step": 38420 }, { "epoch": 0.6956836485385938, "grad_norm": NaN, "learning_rate": 0.00034024853712975807, "loss": 3.4682, "step": 38440 }, { "epoch": 0.6960456067324224, "grad_norm": NaN, "learning_rate": 0.00034024853712975807, "loss": 1.3823, "step": 38460 }, { "epoch": 0.696407564926251, "grad_norm": NaN, "learning_rate": 0.00034024853712975807, "loss": 2.0866, "step": 38480 }, { "epoch": 0.6967695231200797, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 5.5967, "step": 38500 }, { "epoch": 0.6971314813139082, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 1.4755, "step": 38520 }, { "epoch": 0.6974934395077369, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 1.2612, "step": 38540 }, { "epoch": 0.6978553977015655, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 2.2109, "step": 38560 }, { "epoch": 0.6982173558953941, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 0.0, "step": 38580 }, { "epoch": 0.6985793140892227, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 0.0, "step": 38600 }, { "epoch": 0.6989412722830514, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 0.744, "step": 38620 }, { "epoch": 0.6993032304768799, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 3.4216, "step": 38640 }, { "epoch": 0.6996651886707085, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 2.0636, "step": 38660 }, { "epoch": 0.7000271468645372, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 0.5826, "step": 38680 }, { "epoch": 0.7003891050583657, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 2.348, "step": 38700 }, { "epoch": 0.7007510632521944, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 0.5428, "step": 38720 }, { "epoch": 0.7011130214460229, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 1.5572, "step": 38740 }, { "epoch": 0.7014749796398516, "grad_norm": NaN, "learning_rate": 0.0003403028292212101, "loss": 2.5182, "step": 38760 }, { "epoch": 0.7018369378336802, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 2.0251, "step": 38780 }, { "epoch": 0.7021988960275088, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 1.1695, "step": 38800 }, { "epoch": 0.7025608542213374, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 0.4377, "step": 38820 }, { "epoch": 0.7029228124151661, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 1.8601, "step": 38840 }, { "epoch": 0.7032847706089946, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 0.7387, "step": 38860 }, { "epoch": 0.7036467288028233, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 2.4922, "step": 38880 }, { "epoch": 0.7040086869966519, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 0.5694, "step": 38900 }, { "epoch": 0.7043706451904805, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 3.0826, "step": 38920 }, { "epoch": 0.7047326033843091, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 0.3303, "step": 38940 }, { "epoch": 0.7050945615781378, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 1.3634, "step": 38960 }, { "epoch": 0.7054565197719663, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 1.0544, "step": 38980 }, { "epoch": 0.705818477965795, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 3.5782, "step": 39000 }, { "epoch": 0.7061804361596236, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 1.515, "step": 39020 }, { "epoch": 0.7065423943534522, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 1.1727, "step": 39040 }, { "epoch": 0.7069043525472808, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 1.2853, "step": 39060 }, { "epoch": 0.7072663107411093, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 0.728, "step": 39080 }, { "epoch": 0.707628268934938, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 0.8595, "step": 39100 }, { "epoch": 0.7079902271287666, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 0.9104, "step": 39120 }, { "epoch": 0.7083521853225953, "grad_norm": NaN, "learning_rate": 0.00034035712131266207, "loss": 2.1795, "step": 39140 }, { "epoch": 0.7087141435164238, "grad_norm": NaN, "learning_rate": 0.0003404114134041141, "loss": 0.956, "step": 39160 }, { "epoch": 0.7090761017102525, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 3.1399, "step": 39180 }, { "epoch": 0.709438059904081, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 0.0, "step": 39200 }, { "epoch": 0.7098000180979097, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 2.0207, "step": 39220 }, { "epoch": 0.7101619762917383, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 3.9813, "step": 39240 }, { "epoch": 0.7105239344855669, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.3318, "step": 39260 }, { "epoch": 0.7108858926793955, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 2.6069, "step": 39280 }, { "epoch": 0.7112478508732242, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 0.8781, "step": 39300 }, { "epoch": 0.7116098090670527, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 2.3427, "step": 39320 }, { "epoch": 0.7119717672608814, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.2404, "step": 39340 }, { "epoch": 0.71233372545471, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.7288, "step": 39360 }, { "epoch": 0.7126956836485386, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.6429, "step": 39380 }, { "epoch": 0.7130576418423672, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.9149, "step": 39400 }, { "epoch": 0.7134196000361959, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.5917, "step": 39420 }, { "epoch": 0.7137815582300244, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.1368, "step": 39440 }, { "epoch": 0.7141435164238531, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.1706, "step": 39460 }, { "epoch": 0.7145054746176817, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 1.4094, "step": 39480 }, { "epoch": 0.7148674328115102, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 0.6547, "step": 39500 }, { "epoch": 0.7152293910053389, "grad_norm": NaN, "learning_rate": 0.00034046570549556613, "loss": 0.0, "step": 39520 }, { "epoch": 0.7155913491991674, "grad_norm": NaN, "learning_rate": 0.0003405199975870181, "loss": 0.8785, "step": 39540 }, { "epoch": 0.7159533073929961, "grad_norm": NaN, "learning_rate": 0.0003405199975870181, "loss": 2.2526, "step": 39560 }, { "epoch": 0.7163152655868247, "grad_norm": NaN, "learning_rate": 0.0003405199975870181, "loss": 1.8788, "step": 39580 }, { "epoch": 0.7166772237806533, "grad_norm": NaN, "learning_rate": 0.0003405199975870181, "loss": 2.678, "step": 39600 }, { "epoch": 0.7170391819744819, "grad_norm": NaN, "learning_rate": 0.0003405199975870181, "loss": 3.1551, "step": 39620 }, { "epoch": 0.7174011401683106, "grad_norm": NaN, "learning_rate": 0.0003405199975870181, "loss": 0.4397, "step": 39640 }, { "epoch": 0.7177630983621391, "grad_norm": NaN, "learning_rate": 0.0003405199975870181, "loss": 1.7975, "step": 39660 }, { "epoch": 0.7181250565559678, "grad_norm": NaN, "learning_rate": 0.0003405199975870181, "loss": 2.6416, "step": 39680 }, { "epoch": 0.7184870147497964, "grad_norm": NaN, "learning_rate": 0.00034057428967847013, "loss": 3.2459, "step": 39700 }, { "epoch": 0.718848972943625, "grad_norm": NaN, "learning_rate": 0.00034057428967847013, "loss": 0.5649, "step": 39720 }, { "epoch": 0.7192109311374536, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 0.5232, "step": 39740 }, { "epoch": 0.7195728893312823, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 1.3086, "step": 39760 }, { "epoch": 0.7199348475251108, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 1.3057, "step": 39780 }, { "epoch": 0.7202968057189395, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 1.8062, "step": 39800 }, { "epoch": 0.7206587639127681, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 2.1885, "step": 39820 }, { "epoch": 0.7210207221065967, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 1.0445, "step": 39840 }, { "epoch": 0.7213826803004253, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 3.7641, "step": 39860 }, { "epoch": 0.721744638494254, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 1.4201, "step": 39880 }, { "epoch": 0.7221065966880825, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 4.2815, "step": 39900 }, { "epoch": 0.7224685548819111, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 2.343, "step": 39920 }, { "epoch": 0.7228305130757398, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 0.6963, "step": 39940 }, { "epoch": 0.7231924712695683, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 0.9873, "step": 39960 }, { "epoch": 0.723554429463397, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 1.5226, "step": 39980 }, { "epoch": 0.7239163876572255, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 3.9576, "step": 40000 }, { "epoch": 0.7239163876572255, "eval_accuracy": 4.48844260292667e-05, "eval_loss": NaN, "eval_runtime": 170.7615, "eval_samples_per_second": 3559.585, "eval_steps_per_second": 3.479, "step": 40000 }, { "epoch": 0.7242783458510542, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 0.48, "step": 40020 }, { "epoch": 0.7246403040448828, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 3.9882, "step": 40040 }, { "epoch": 0.7250022622387114, "grad_norm": NaN, "learning_rate": 0.00034062858176992216, "loss": 2.1734, "step": 40060 }, { "epoch": 0.72536422043254, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 4.2086, "step": 40080 }, { "epoch": 0.7257261786263687, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 1.3529, "step": 40100 }, { "epoch": 0.7260881368201972, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 3.0816, "step": 40120 }, { "epoch": 0.7264500950140259, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 4.4137, "step": 40140 }, { "epoch": 0.7268120532078545, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 1.5198, "step": 40160 }, { "epoch": 0.7271740114016831, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 0.9062, "step": 40180 }, { "epoch": 0.7275359695955117, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 2.6378, "step": 40200 }, { "epoch": 0.7278979277893404, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 0.2773, "step": 40220 }, { "epoch": 0.7282598859831689, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 0.3519, "step": 40240 }, { "epoch": 0.7286218441769976, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 0.9196, "step": 40260 }, { "epoch": 0.7289838023708262, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 0.0, "step": 40280 }, { "epoch": 0.7293457605646548, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 4.1216, "step": 40300 }, { "epoch": 0.7297077187584834, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 2.9557, "step": 40320 }, { "epoch": 0.7300696769523121, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 1.4142, "step": 40340 }, { "epoch": 0.7304316351461406, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 2.5863, "step": 40360 }, { "epoch": 0.7307935933399692, "grad_norm": NaN, "learning_rate": 0.00034079145804427824, "loss": 3.506, "step": 40380 }, { "epoch": 0.7311555515337979, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 1.8693, "step": 40400 }, { "epoch": 0.7315175097276264, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 0.8687, "step": 40420 }, { "epoch": 0.7318794679214551, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 2.1844, "step": 40440 }, { "epoch": 0.7322414261152836, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 0.0, "step": 40460 }, { "epoch": 0.7326033843091123, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 2.2826, "step": 40480 }, { "epoch": 0.7329653425029409, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 1.1744, "step": 40500 }, { "epoch": 0.7333273006967695, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 1.6571, "step": 40520 }, { "epoch": 0.7336892588905981, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 0.3281, "step": 40540 }, { "epoch": 0.7340512170844268, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 2.3448, "step": 40560 }, { "epoch": 0.7344131752782553, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 0.6611, "step": 40580 }, { "epoch": 0.734775133472084, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 0.3116, "step": 40600 }, { "epoch": 0.7351370916659126, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 1.9978, "step": 40620 }, { "epoch": 0.7354990498597412, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 0.5876, "step": 40640 }, { "epoch": 0.7358610080535698, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 0.347, "step": 40660 }, { "epoch": 0.7362229662473985, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 4.2818, "step": 40680 }, { "epoch": 0.736584924441227, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 3.0174, "step": 40700 }, { "epoch": 0.7369468826350557, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 3.3626, "step": 40720 }, { "epoch": 0.7373088408288843, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 3.003, "step": 40740 }, { "epoch": 0.7376707990227129, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 1.7469, "step": 40760 }, { "epoch": 0.7380327572165415, "grad_norm": NaN, "learning_rate": 0.0003408457501357302, "loss": 0.3286, "step": 40780 }, { "epoch": 0.73839471541037, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 6.0467, "step": 40800 }, { "epoch": 0.7387566736041987, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.3473, "step": 40820 }, { "epoch": 0.7391186317980273, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 1.4486, "step": 40840 }, { "epoch": 0.739480589991856, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.0, "step": 40860 }, { "epoch": 0.7398425481856845, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 3.4974, "step": 40880 }, { "epoch": 0.7402045063795132, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 1.8629, "step": 40900 }, { "epoch": 0.7405664645733417, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 1.5151, "step": 40920 }, { "epoch": 0.7409284227671704, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.88, "step": 40940 }, { "epoch": 0.741290380960999, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 2.1396, "step": 40960 }, { "epoch": 0.7416523391548276, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 2.1135, "step": 40980 }, { "epoch": 0.7420142973486562, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 2.521, "step": 41000 }, { "epoch": 0.7423762555424849, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 1.7925, "step": 41020 }, { "epoch": 0.7427382137363134, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.2648, "step": 41040 }, { "epoch": 0.7431001719301421, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.9229, "step": 41060 }, { "epoch": 0.7434621301239707, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.0, "step": 41080 }, { "epoch": 0.7438240883177993, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 2.057, "step": 41100 }, { "epoch": 0.7441860465116279, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.0, "step": 41120 }, { "epoch": 0.7445480047054566, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.6402, "step": 41140 }, { "epoch": 0.7449099628992851, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 1.3159, "step": 41160 }, { "epoch": 0.7452719210931138, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 3.5791, "step": 41180 }, { "epoch": 0.7456338792869424, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 0.7649, "step": 41200 }, { "epoch": 0.7459958374807709, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 1.4292, "step": 41220 }, { "epoch": 0.7463577956745996, "grad_norm": NaN, "learning_rate": 0.00034090004222718225, "loss": 2.4568, "step": 41240 }, { "epoch": 0.7467197538684281, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 3.525, "step": 41260 }, { "epoch": 0.7470817120622568, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 3.5533, "step": 41280 }, { "epoch": 0.7474436702560854, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 0.2635, "step": 41300 }, { "epoch": 0.747805628449914, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 0.8783, "step": 41320 }, { "epoch": 0.7481675866437426, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 3.0662, "step": 41340 }, { "epoch": 0.7485295448375713, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 0.0, "step": 41360 }, { "epoch": 0.7488915030313998, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 0.7552, "step": 41380 }, { "epoch": 0.7492534612252285, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 1.8623, "step": 41400 }, { "epoch": 0.7496154194190571, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 0.6694, "step": 41420 }, { "epoch": 0.7499773776128857, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 0.685, "step": 41440 }, { "epoch": 0.7503393358067143, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 3.3854, "step": 41460 }, { "epoch": 0.750701294000543, "grad_norm": NaN, "learning_rate": 0.0003409543343186343, "loss": 0.4405, "step": 41480 }, { "epoch": 0.7510632521943715, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 0.9356, "step": 41500 }, { "epoch": 0.7514252103882002, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 3.8862, "step": 41520 }, { "epoch": 0.7517871685820288, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 2.4345, "step": 41540 }, { "epoch": 0.7521491267758574, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 0.4036, "step": 41560 }, { "epoch": 0.752511084969686, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 1.6544, "step": 41580 }, { "epoch": 0.7528730431635147, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 4.0443, "step": 41600 }, { "epoch": 0.7532350013573432, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 1.5719, "step": 41620 }, { "epoch": 0.7535969595511718, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 0.0, "step": 41640 }, { "epoch": 0.7539589177450005, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 3.5074, "step": 41660 }, { "epoch": 0.754320875938829, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 3.298, "step": 41680 }, { "epoch": 0.7546828341326577, "grad_norm": NaN, "learning_rate": 0.00034100862641008625, "loss": 1.9071, "step": 41700 }, { "epoch": 0.7550447923264862, "grad_norm": NaN, "learning_rate": 0.0003410629185015383, "loss": 2.8022, "step": 41720 }, { "epoch": 0.7554067505203149, "grad_norm": NaN, "learning_rate": 0.0003410629185015383, "loss": 0.5853, "step": 41740 }, { "epoch": 0.7557687087141435, "grad_norm": NaN, "learning_rate": 0.0003411172105929903, "loss": 5.6129, "step": 41760 }, { "epoch": 0.7561306669079721, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.6186, "step": 41780 }, { "epoch": 0.7564926251018007, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 1.6322, "step": 41800 }, { "epoch": 0.7568545832956294, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.2516, "step": 41820 }, { "epoch": 0.7572165414894579, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 1.3816, "step": 41840 }, { "epoch": 0.7575784996832866, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.0, "step": 41860 }, { "epoch": 0.7579404578771152, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.3332, "step": 41880 }, { "epoch": 0.7583024160709438, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.8718, "step": 41900 }, { "epoch": 0.7586643742647724, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 3.3619, "step": 41920 }, { "epoch": 0.7590263324586011, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.6052, "step": 41940 }, { "epoch": 0.7593882906524296, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.3095, "step": 41960 }, { "epoch": 0.7597502488462583, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.515, "step": 41980 }, { "epoch": 0.7601122070400869, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 1.5269, "step": 42000 }, { "epoch": 0.7604741652339155, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.9995, "step": 42020 }, { "epoch": 0.7608361234277441, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 0.6057, "step": 42040 }, { "epoch": 0.7611980816215727, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 2.5703, "step": 42060 }, { "epoch": 0.7615600398154013, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 1.4425, "step": 42080 }, { "epoch": 0.7619219980092299, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 3.2961, "step": 42100 }, { "epoch": 0.7622839562030586, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 1.9865, "step": 42120 }, { "epoch": 0.7626459143968871, "grad_norm": NaN, "learning_rate": 0.0003411715026844423, "loss": 2.7619, "step": 42140 }, { "epoch": 0.7630078725907158, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 0.9523, "step": 42160 }, { "epoch": 0.7633698307845443, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 2.6472, "step": 42180 }, { "epoch": 0.763731788978373, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 3.5945, "step": 42200 }, { "epoch": 0.7640937471722016, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 1.0469, "step": 42220 }, { "epoch": 0.7644557053660302, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 2.881, "step": 42240 }, { "epoch": 0.7648176635598588, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 2.1307, "step": 42260 }, { "epoch": 0.7651796217536875, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 1.8804, "step": 42280 }, { "epoch": 0.765541579947516, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 0.7057, "step": 42300 }, { "epoch": 0.7659035381413447, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 0.2772, "step": 42320 }, { "epoch": 0.7662654963351733, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 0.0, "step": 42340 }, { "epoch": 0.7666274545290019, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 4.0575, "step": 42360 }, { "epoch": 0.7669894127228305, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 0.3747, "step": 42380 }, { "epoch": 0.7673513709166592, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 2.8997, "step": 42400 }, { "epoch": 0.7677133291104877, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 1.9952, "step": 42420 }, { "epoch": 0.7680752873043164, "grad_norm": NaN, "learning_rate": 0.0003412257947758943, "loss": 2.3111, "step": 42440 }, { "epoch": 0.768437245498145, "grad_norm": NaN, "learning_rate": 0.0003412800868673463, "loss": 3.4829, "step": 42460 }, { "epoch": 0.7687992036919736, "grad_norm": NaN, "learning_rate": 0.0003412800868673463, "loss": 1.5128, "step": 42480 }, { "epoch": 0.7691611618858022, "grad_norm": NaN, "learning_rate": 0.0003412800868673463, "loss": 1.5463, "step": 42500 }, { "epoch": 0.7695231200796308, "grad_norm": NaN, "learning_rate": 0.0003412800868673463, "loss": 0.9137, "step": 42520 }, { "epoch": 0.7698850782734594, "grad_norm": NaN, "learning_rate": 0.0003412800868673463, "loss": 2.4126, "step": 42540 }, { "epoch": 0.770247036467288, "grad_norm": NaN, "learning_rate": 0.0003412800868673463, "loss": 2.1134, "step": 42560 }, { "epoch": 0.7706089946611167, "grad_norm": NaN, "learning_rate": 0.0003412800868673463, "loss": 0.6579, "step": 42580 }, { "epoch": 0.7709709528549452, "grad_norm": NaN, "learning_rate": 0.0003412800868673463, "loss": 1.6776, "step": 42600 }, { "epoch": 0.7713329110487739, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 2.9237, "step": 42620 }, { "epoch": 0.7716948692426024, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 1.5943, "step": 42640 }, { "epoch": 0.7720568274364311, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 2.7924, "step": 42660 }, { "epoch": 0.7724187856302597, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 3.5732, "step": 42680 }, { "epoch": 0.7727807438240883, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 3.7438, "step": 42700 }, { "epoch": 0.7731427020179169, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 0.9085, "step": 42720 }, { "epoch": 0.7735046602117456, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 0.875, "step": 42740 }, { "epoch": 0.7738666184055741, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 2.9115, "step": 42760 }, { "epoch": 0.7742285765994028, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 0.0, "step": 42780 }, { "epoch": 0.7745905347932314, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 4.3598, "step": 42800 }, { "epoch": 0.77495249298706, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 2.9054, "step": 42820 }, { "epoch": 0.7753144511808886, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 2.8356, "step": 42840 }, { "epoch": 0.7756764093747173, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 2.6337, "step": 42860 }, { "epoch": 0.7760383675685458, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 1.99, "step": 42880 }, { "epoch": 0.7764003257623745, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 0.0, "step": 42900 }, { "epoch": 0.7767622839562031, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 1.1923, "step": 42920 }, { "epoch": 0.7771242421500316, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 0.8791, "step": 42940 }, { "epoch": 0.7774862003438603, "grad_norm": NaN, "learning_rate": 0.00034138867105025034, "loss": 1.8362, "step": 42960 }, { "epoch": 0.7778481585376888, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 2.6254, "step": 42980 }, { "epoch": 0.7782101167315175, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 0.3522, "step": 43000 }, { "epoch": 0.7785720749253461, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 0.7873, "step": 43020 }, { "epoch": 0.7789340331191748, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.3321, "step": 43040 }, { "epoch": 0.7792959913130033, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 0.5861, "step": 43060 }, { "epoch": 0.779657949506832, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 0.8736, "step": 43080 }, { "epoch": 0.7800199077006605, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.5841, "step": 43100 }, { "epoch": 0.7803818658944892, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.5798, "step": 43120 }, { "epoch": 0.7807438240883178, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.6668, "step": 43140 }, { "epoch": 0.7811057822821464, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 0.3763, "step": 43160 }, { "epoch": 0.781467740475975, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 3.5091, "step": 43180 }, { "epoch": 0.7818296986698037, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.1577, "step": 43200 }, { "epoch": 0.7821916568636322, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 5.4725, "step": 43220 }, { "epoch": 0.7825536150574609, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 4.2603, "step": 43240 }, { "epoch": 0.7829155732512895, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.6528, "step": 43260 }, { "epoch": 0.7832775314451181, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 2.3669, "step": 43280 }, { "epoch": 0.7836394896389467, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 2.0703, "step": 43300 }, { "epoch": 0.7840014478327754, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.7422, "step": 43320 }, { "epoch": 0.7843634060266039, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 0.6751, "step": 43340 }, { "epoch": 0.7847253642204325, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 0.4809, "step": 43360 }, { "epoch": 0.7850873224142612, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.3203, "step": 43380 }, { "epoch": 0.7854492806080897, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 1.4595, "step": 43400 }, { "epoch": 0.7858112388019184, "grad_norm": NaN, "learning_rate": 0.0003414429631417023, "loss": 0.8783, "step": 43420 }, { "epoch": 0.786173196995747, "grad_norm": NaN, "learning_rate": 0.00034149725523315434, "loss": 0.8333, "step": 43440 }, { "epoch": 0.7865351551895756, "grad_norm": NaN, "learning_rate": 0.00034149725523315434, "loss": 1.0495, "step": 43460 }, { "epoch": 0.7868971133834042, "grad_norm": NaN, "learning_rate": 0.00034149725523315434, "loss": 0.4389, "step": 43480 }, { "epoch": 0.7872590715772328, "grad_norm": NaN, "learning_rate": 0.00034149725523315434, "loss": 1.0187, "step": 43500 }, { "epoch": 0.7876210297710614, "grad_norm": NaN, "learning_rate": 0.00034149725523315434, "loss": 4.4166, "step": 43520 }, { "epoch": 0.7879829879648901, "grad_norm": NaN, "learning_rate": 0.00034149725523315434, "loss": 2.5267, "step": 43540 }, { "epoch": 0.7883449461587186, "grad_norm": NaN, "learning_rate": 0.00034155154732460637, "loss": 1.7665, "step": 43560 }, { "epoch": 0.7887069043525473, "grad_norm": NaN, "learning_rate": 0.00034155154732460637, "loss": 1.9786, "step": 43580 }, { "epoch": 0.7890688625463759, "grad_norm": NaN, "learning_rate": 0.00034155154732460637, "loss": 1.7914, "step": 43600 }, { "epoch": 0.7894308207402045, "grad_norm": NaN, "learning_rate": 0.00034155154732460637, "loss": 0.5808, "step": 43620 }, { "epoch": 0.7897927789340331, "grad_norm": NaN, "learning_rate": 0.00034155154732460637, "loss": 0.6569, "step": 43640 }, { "epoch": 0.7901547371278618, "grad_norm": NaN, "learning_rate": 0.00034160583941605834, "loss": 3.5249, "step": 43660 }, { "epoch": 0.7905166953216903, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 3.9399, "step": 43680 }, { "epoch": 0.790878653515519, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 1.0985, "step": 43700 }, { "epoch": 0.7912406117093476, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 0.3511, "step": 43720 }, { "epoch": 0.7916025699031762, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 0.263, "step": 43740 }, { "epoch": 0.7919645280970048, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 2.8273, "step": 43760 }, { "epoch": 0.7923264862908334, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 0.6398, "step": 43780 }, { "epoch": 0.792688444484662, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 2.284, "step": 43800 }, { "epoch": 0.7930504026784906, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 3.439, "step": 43820 }, { "epoch": 0.7934123608723193, "grad_norm": NaN, "learning_rate": 0.00034166013150751037, "loss": 2.7479, "step": 43840 }, { "epoch": 0.7937743190661478, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 3.3077, "step": 43860 }, { "epoch": 0.7941362772599765, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 1.112, "step": 43880 }, { "epoch": 0.794498235453805, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 2.1702, "step": 43900 }, { "epoch": 0.7948601936476337, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 3.4029, "step": 43920 }, { "epoch": 0.7952221518414623, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 0.0, "step": 43940 }, { "epoch": 0.795584110035291, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 2.2386, "step": 43960 }, { "epoch": 0.7959460682291195, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 1.4024, "step": 43980 }, { "epoch": 0.7963080264229482, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 0.5845, "step": 44000 }, { "epoch": 0.7966699846167767, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 1.2316, "step": 44020 }, { "epoch": 0.7970319428106054, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 2.7125, "step": 44040 }, { "epoch": 0.797393901004434, "grad_norm": NaN, "learning_rate": 0.0003417144235989624, "loss": 0.6009, "step": 44060 }, { "epoch": 0.7977558591982626, "grad_norm": NaN, "learning_rate": 0.00034176871569041443, "loss": 1.1332, "step": 44080 }, { "epoch": 0.7981178173920912, "grad_norm": NaN, "learning_rate": 0.00034176871569041443, "loss": 4.5306, "step": 44100 }, { "epoch": 0.7984797755859199, "grad_norm": NaN, "learning_rate": 0.00034176871569041443, "loss": 2.142, "step": 44120 }, { "epoch": 0.7988417337797484, "grad_norm": NaN, "learning_rate": 0.00034176871569041443, "loss": 4.7463, "step": 44140 }, { "epoch": 0.7992036919735771, "grad_norm": NaN, "learning_rate": 0.00034176871569041443, "loss": 1.2576, "step": 44160 }, { "epoch": 0.7995656501674057, "grad_norm": NaN, "learning_rate": 0.00034182300778186646, "loss": 1.9843, "step": 44180 }, { "epoch": 0.7999276083612343, "grad_norm": NaN, "learning_rate": 0.00034182300778186646, "loss": 0.6952, "step": 44200 }, { "epoch": 0.8002895665550629, "grad_norm": NaN, "learning_rate": 0.0003418772998733185, "loss": 6.0177, "step": 44220 }, { "epoch": 0.8006515247488915, "grad_norm": NaN, "learning_rate": 0.0003418772998733185, "loss": 0.3539, "step": 44240 }, { "epoch": 0.8010134829427201, "grad_norm": NaN, "learning_rate": 0.00034193159196477046, "loss": 5.6859, "step": 44260 }, { "epoch": 0.8013754411365487, "grad_norm": NaN, "learning_rate": 0.00034193159196477046, "loss": 3.7923, "step": 44280 }, { "epoch": 0.8017373993303774, "grad_norm": NaN, "learning_rate": 0.00034193159196477046, "loss": 0.8769, "step": 44300 }, { "epoch": 0.8020993575242059, "grad_norm": NaN, "learning_rate": 0.00034193159196477046, "loss": 0.7451, "step": 44320 }, { "epoch": 0.8024613157180346, "grad_norm": NaN, "learning_rate": 0.00034193159196477046, "loss": 0.3733, "step": 44340 }, { "epoch": 0.8028232739118631, "grad_norm": NaN, "learning_rate": 0.00034193159196477046, "loss": 0.3106, "step": 44360 }, { "epoch": 0.8031852321056918, "grad_norm": NaN, "learning_rate": 0.0003419858840562225, "loss": 4.184, "step": 44380 }, { "epoch": 0.8035471902995204, "grad_norm": NaN, "learning_rate": 0.0003419858840562225, "loss": 0.6967, "step": 44400 }, { "epoch": 0.803909148493349, "grad_norm": NaN, "learning_rate": 0.0003419858840562225, "loss": 1.204, "step": 44420 }, { "epoch": 0.8042711066871776, "grad_norm": NaN, "learning_rate": 0.0003420944682391265, "loss": 1.7957, "step": 44440 }, { "epoch": 0.8046330648810063, "grad_norm": NaN, "learning_rate": 0.0003420944682391265, "loss": 0.8114, "step": 44460 }, { "epoch": 0.8049950230748348, "grad_norm": NaN, "learning_rate": 0.0003420944682391265, "loss": 0.6631, "step": 44480 }, { "epoch": 0.8053569812686635, "grad_norm": NaN, "learning_rate": 0.0003420944682391265, "loss": 2.457, "step": 44500 }, { "epoch": 0.8057189394624921, "grad_norm": NaN, "learning_rate": 0.0003420944682391265, "loss": 4.1129, "step": 44520 }, { "epoch": 0.8060808976563207, "grad_norm": NaN, "learning_rate": 0.0003420944682391265, "loss": 1.4922, "step": 44540 }, { "epoch": 0.8064428558501493, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 5.882, "step": 44560 }, { "epoch": 0.806804814043978, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 0.7792, "step": 44580 }, { "epoch": 0.8071667722378065, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 0.0, "step": 44600 }, { "epoch": 0.8075287304316352, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 2.2024, "step": 44620 }, { "epoch": 0.8078906886254638, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 3.3518, "step": 44640 }, { "epoch": 0.8082526468192923, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 2.6258, "step": 44660 }, { "epoch": 0.808614605013121, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 2.9838, "step": 44680 }, { "epoch": 0.8089765632069496, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 0.7323, "step": 44700 }, { "epoch": 0.8093385214007782, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 1.0926, "step": 44720 }, { "epoch": 0.8097004795946068, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 0.0, "step": 44740 }, { "epoch": 0.8100624377884355, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 0.7118, "step": 44760 }, { "epoch": 0.810424395982264, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 1.8119, "step": 44780 }, { "epoch": 0.8107863541760927, "grad_norm": NaN, "learning_rate": 0.0003421487603305785, "loss": 1.4065, "step": 44800 }, { "epoch": 0.8111483123699212, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 5.8694, "step": 44820 }, { "epoch": 0.8115102705637499, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 1.7547, "step": 44840 }, { "epoch": 0.8118722287575785, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 0.9287, "step": 44860 }, { "epoch": 0.8122341869514071, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 1.7602, "step": 44880 }, { "epoch": 0.8125961451452357, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 1.6351, "step": 44900 }, { "epoch": 0.8129581033390644, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 0.4784, "step": 44920 }, { "epoch": 0.8133200615328929, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 3.0653, "step": 44940 }, { "epoch": 0.8136820197267216, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 0.3747, "step": 44960 }, { "epoch": 0.8140439779205502, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 2.1391, "step": 44980 }, { "epoch": 0.8144059361143788, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 3.4971, "step": 45000 }, { "epoch": 0.8144059361143788, "eval_accuracy": 4.6577639445232914e-05, "eval_loss": NaN, "eval_runtime": 170.0213, "eval_samples_per_second": 3575.081, "eval_steps_per_second": 3.494, "step": 45000 }, { "epoch": 0.8147678943082074, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 1.1013, "step": 45020 }, { "epoch": 0.8151298525020361, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 0.0, "step": 45040 }, { "epoch": 0.8154918106958646, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 1.0811, "step": 45060 }, { "epoch": 0.8158537688896932, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 0.5771, "step": 45080 }, { "epoch": 0.8162157270835219, "grad_norm": NaN, "learning_rate": 0.0003422030524220305, "loss": 1.2551, "step": 45100 }, { "epoch": 0.8165776852773504, "grad_norm": NaN, "learning_rate": 0.0003422573445134825, "loss": 1.5968, "step": 45120 }, { "epoch": 0.8169396434711791, "grad_norm": NaN, "learning_rate": 0.0003422573445134825, "loss": 0.6273, "step": 45140 }, { "epoch": 0.8173016016650076, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 1.762, "step": 45160 }, { "epoch": 0.8176635598588363, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 0.7482, "step": 45180 }, { "epoch": 0.8180255180526649, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 0.6348, "step": 45200 }, { "epoch": 0.8183874762464936, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 2.9022, "step": 45220 }, { "epoch": 0.8187494344403221, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 3.9902, "step": 45240 }, { "epoch": 0.8191113926341508, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 1.3003, "step": 45260 }, { "epoch": 0.8194733508279793, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 0.7519, "step": 45280 }, { "epoch": 0.819835309021808, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 3.7594, "step": 45300 }, { "epoch": 0.8201972672156366, "grad_norm": NaN, "learning_rate": 0.00034231163660493455, "loss": 0.5259, "step": 45320 }, { "epoch": 0.8205592254094652, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 1.3492, "step": 45340 }, { "epoch": 0.8209211836032938, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 1.8379, "step": 45360 }, { "epoch": 0.8212831417971225, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 0.9331, "step": 45380 }, { "epoch": 0.821645099990951, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 1.242, "step": 45400 }, { "epoch": 0.8220070581847797, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 1.2382, "step": 45420 }, { "epoch": 0.8223690163786083, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 3.0546, "step": 45440 }, { "epoch": 0.8227309745724369, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 1.1098, "step": 45460 }, { "epoch": 0.8230929327662655, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 4.5014, "step": 45480 }, { "epoch": 0.823454890960094, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 0.8949, "step": 45500 }, { "epoch": 0.8238168491539227, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 3.0434, "step": 45520 }, { "epoch": 0.8241788073477513, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 0.8785, "step": 45540 }, { "epoch": 0.82454076554158, "grad_norm": NaN, "learning_rate": 0.0003423659286963865, "loss": 3.6592, "step": 45560 }, { "epoch": 0.8249027237354085, "grad_norm": NaN, "learning_rate": 0.00034242022078783855, "loss": 3.3117, "step": 45580 }, { "epoch": 0.8252646819292372, "grad_norm": NaN, "learning_rate": 0.00034242022078783855, "loss": 0.6415, "step": 45600 }, { "epoch": 0.8256266401230657, "grad_norm": NaN, "learning_rate": 0.00034242022078783855, "loss": 0.855, "step": 45620 }, { "epoch": 0.8259885983168944, "grad_norm": NaN, "learning_rate": 0.00034242022078783855, "loss": 5.2258, "step": 45640 }, { "epoch": 0.826350556510723, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 1.9846, "step": 45660 }, { "epoch": 0.8267125147045516, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 0.6199, "step": 45680 }, { "epoch": 0.8270744728983802, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 1.4499, "step": 45700 }, { "epoch": 0.8274364310922089, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 0.9893, "step": 45720 }, { "epoch": 0.8277983892860374, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 1.379, "step": 45740 }, { "epoch": 0.8281603474798661, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 2.8108, "step": 45760 }, { "epoch": 0.8285223056736947, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 3.075, "step": 45780 }, { "epoch": 0.8288842638675233, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 3.9405, "step": 45800 }, { "epoch": 0.8292462220613519, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 3.0244, "step": 45820 }, { "epoch": 0.8296081802551806, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 0.9193, "step": 45840 }, { "epoch": 0.8299701384490091, "grad_norm": NaN, "learning_rate": 0.0003424745128792906, "loss": 2.2784, "step": 45860 }, { "epoch": 0.8303320966428378, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 5.6385, "step": 45880 }, { "epoch": 0.8306940548366664, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 0.6865, "step": 45900 }, { "epoch": 0.831056013030495, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 0.6382, "step": 45920 }, { "epoch": 0.8314179712243236, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 2.9461, "step": 45940 }, { "epoch": 0.8317799294181522, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 0.5432, "step": 45960 }, { "epoch": 0.8321418876119808, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 0.3098, "step": 45980 }, { "epoch": 0.8325038458058094, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 0.7931, "step": 46000 }, { "epoch": 0.832865803999638, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 1.3182, "step": 46020 }, { "epoch": 0.8332277621934666, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 0.883, "step": 46040 }, { "epoch": 0.8335897203872953, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 0.8214, "step": 46060 }, { "epoch": 0.8339516785811238, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 1.4961, "step": 46080 }, { "epoch": 0.8343136367749525, "grad_norm": NaN, "learning_rate": 0.0003425830970621946, "loss": 1.9674, "step": 46100 }, { "epoch": 0.8346755949687811, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 2.7082, "step": 46120 }, { "epoch": 0.8350375531626097, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 4.7325, "step": 46140 }, { "epoch": 0.8353995113564383, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 3.6999, "step": 46160 }, { "epoch": 0.835761469550267, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 1.4293, "step": 46180 }, { "epoch": 0.8361234277440955, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 2.882, "step": 46200 }, { "epoch": 0.8364853859379242, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 1.5639, "step": 46220 }, { "epoch": 0.8368473441317528, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 3.3409, "step": 46240 }, { "epoch": 0.8372093023255814, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 2.6536, "step": 46260 }, { "epoch": 0.83757126051941, "grad_norm": NaN, "learning_rate": 0.0003426373891536466, "loss": 1.5417, "step": 46280 }, { "epoch": 0.8379332187132387, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 1.6685, "step": 46300 }, { "epoch": 0.8382951769070672, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 2.3909, "step": 46320 }, { "epoch": 0.8386571351008959, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 0.3746, "step": 46340 }, { "epoch": 0.8390190932947245, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 2.5086, "step": 46360 }, { "epoch": 0.839381051488553, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 0.5264, "step": 46380 }, { "epoch": 0.8397430096823817, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 1.46, "step": 46400 }, { "epoch": 0.8401049678762103, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 3.4459, "step": 46420 }, { "epoch": 0.8404669260700389, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 3.2337, "step": 46440 }, { "epoch": 0.8408288842638675, "grad_norm": NaN, "learning_rate": 0.0003426916812450986, "loss": 2.0381, "step": 46460 }, { "epoch": 0.8411908424576962, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 1.5477, "step": 46480 }, { "epoch": 0.8415528006515247, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 2.866, "step": 46500 }, { "epoch": 0.8419147588453534, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 0.0, "step": 46520 }, { "epoch": 0.8422767170391819, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 0.6529, "step": 46540 }, { "epoch": 0.8426386752330106, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 1.8621, "step": 46560 }, { "epoch": 0.8430006334268392, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 1.5342, "step": 46580 }, { "epoch": 0.8433625916206678, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 0.3288, "step": 46600 }, { "epoch": 0.8437245498144964, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 2.7671, "step": 46620 }, { "epoch": 0.8440865080083251, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 4.344, "step": 46640 }, { "epoch": 0.8444484662021536, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 1.3153, "step": 46660 }, { "epoch": 0.8448104243959823, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 0.0, "step": 46680 }, { "epoch": 0.8451723825898109, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 3.0078, "step": 46700 }, { "epoch": 0.8455343407836395, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 2.3489, "step": 46720 }, { "epoch": 0.8458962989774681, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 1.9476, "step": 46740 }, { "epoch": 0.8462582571712968, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 0.0, "step": 46760 }, { "epoch": 0.8466202153651253, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 1.507, "step": 46780 }, { "epoch": 0.8469821735589539, "grad_norm": NaN, "learning_rate": 0.0003427459733365506, "loss": 4.0155, "step": 46800 }, { "epoch": 0.8473441317527826, "grad_norm": NaN, "learning_rate": 0.00034280026542800264, "loss": 6.1755, "step": 46820 }, { "epoch": 0.8477060899466111, "grad_norm": NaN, "learning_rate": 0.00034280026542800264, "loss": 2.4112, "step": 46840 }, { "epoch": 0.8480680481404398, "grad_norm": NaN, "learning_rate": 0.00034280026542800264, "loss": 0.2907, "step": 46860 }, { "epoch": 0.8484300063342683, "grad_norm": NaN, "learning_rate": 0.00034280026542800264, "loss": 3.5964, "step": 46880 }, { "epoch": 0.848791964528097, "grad_norm": NaN, "learning_rate": 0.00034280026542800264, "loss": 0.3755, "step": 46900 }, { "epoch": 0.8491539227219256, "grad_norm": NaN, "learning_rate": 0.00034280026542800264, "loss": 0.8192, "step": 46920 }, { "epoch": 0.8495158809157543, "grad_norm": NaN, "learning_rate": 0.00034280026542800264, "loss": 2.625, "step": 46940 }, { "epoch": 0.8498778391095828, "grad_norm": NaN, "learning_rate": 0.00034285455751945467, "loss": 3.4662, "step": 46960 }, { "epoch": 0.8502397973034115, "grad_norm": NaN, "learning_rate": 0.00034285455751945467, "loss": 0.3487, "step": 46980 }, { "epoch": 0.85060175549724, "grad_norm": NaN, "learning_rate": 0.00034285455751945467, "loss": 1.3247, "step": 47000 }, { "epoch": 0.8509637136910687, "grad_norm": NaN, "learning_rate": 0.00034285455751945467, "loss": 2.4207, "step": 47020 }, { "epoch": 0.8513256718848973, "grad_norm": NaN, "learning_rate": 0.00034285455751945467, "loss": 3.5487, "step": 47040 }, { "epoch": 0.8516876300787259, "grad_norm": NaN, "learning_rate": 0.00034285455751945467, "loss": 2.8014, "step": 47060 }, { "epoch": 0.8520495882725545, "grad_norm": NaN, "learning_rate": 0.00034285455751945467, "loss": 0.8827, "step": 47080 }, { "epoch": 0.8524115464663832, "grad_norm": NaN, "learning_rate": 0.00034285455751945467, "loss": 1.686, "step": 47100 }, { "epoch": 0.8527735046602117, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 3.4005, "step": 47120 }, { "epoch": 0.8531354628540404, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 4.9585, "step": 47140 }, { "epoch": 0.853497421047869, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 0.7554, "step": 47160 }, { "epoch": 0.8538593792416976, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 1.6588, "step": 47180 }, { "epoch": 0.8542213374355262, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 0.8945, "step": 47200 }, { "epoch": 0.8545832956293548, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 1.9265, "step": 47220 }, { "epoch": 0.8549452538231834, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 1.0519, "step": 47240 }, { "epoch": 0.855307212017012, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 0.7737, "step": 47260 }, { "epoch": 0.8556691702108407, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 1.9628, "step": 47280 }, { "epoch": 0.8560311284046692, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 2.8813, "step": 47300 }, { "epoch": 0.8563930865984979, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 0.3755, "step": 47320 }, { "epoch": 0.8567550447923264, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 0.3112, "step": 47340 }, { "epoch": 0.8571170029861551, "grad_norm": NaN, "learning_rate": 0.0003429088496109067, "loss": 0.2919, "step": 47360 }, { "epoch": 0.8574789611799837, "grad_norm": NaN, "learning_rate": 0.0003430174337938107, "loss": 3.1259, "step": 47380 }, { "epoch": 0.8578409193738123, "grad_norm": NaN, "learning_rate": 0.00034307172588526273, "loss": 2.6819, "step": 47400 }, { "epoch": 0.8582028775676409, "grad_norm": NaN, "learning_rate": 0.00034307172588526273, "loss": 0.0, "step": 47420 }, { "epoch": 0.8585648357614696, "grad_norm": NaN, "learning_rate": 0.00034307172588526273, "loss": 3.8217, "step": 47440 }, { "epoch": 0.8589267939552981, "grad_norm": NaN, "learning_rate": 0.0003431260179767147, "loss": 4.0489, "step": 47460 }, { "epoch": 0.8592887521491268, "grad_norm": NaN, "learning_rate": 0.0003431260179767147, "loss": 5.597, "step": 47480 }, { "epoch": 0.8596507103429554, "grad_norm": NaN, "learning_rate": 0.0003431260179767147, "loss": 0.8297, "step": 47500 }, { "epoch": 0.860012668536784, "grad_norm": NaN, "learning_rate": 0.0003431260179767147, "loss": 0.9444, "step": 47520 }, { "epoch": 0.8603746267306126, "grad_norm": NaN, "learning_rate": 0.0003431260179767147, "loss": 0.0, "step": 47540 }, { "epoch": 0.8607365849244413, "grad_norm": NaN, "learning_rate": 0.0003431260179767147, "loss": 0.911, "step": 47560 }, { "epoch": 0.8610985431182698, "grad_norm": NaN, "learning_rate": 0.0003431260179767147, "loss": 2.627, "step": 47580 }, { "epoch": 0.8614605013120985, "grad_norm": NaN, "learning_rate": 0.0003431260179767147, "loss": 2.7942, "step": 47600 }, { "epoch": 0.8618224595059271, "grad_norm": NaN, "learning_rate": 0.00034318031006816673, "loss": 1.1399, "step": 47620 }, { "epoch": 0.8621844176997556, "grad_norm": NaN, "learning_rate": 0.00034318031006816673, "loss": 0.0, "step": 47640 }, { "epoch": 0.8625463758935843, "grad_norm": NaN, "learning_rate": 0.00034318031006816673, "loss": 1.3527, "step": 47660 }, { "epoch": 0.8629083340874129, "grad_norm": NaN, "learning_rate": 0.00034318031006816673, "loss": 0.9181, "step": 47680 }, { "epoch": 0.8632702922812415, "grad_norm": NaN, "learning_rate": 0.00034323460215961876, "loss": 3.826, "step": 47700 }, { "epoch": 0.8636322504750701, "grad_norm": NaN, "learning_rate": 0.00034323460215961876, "loss": 1.3246, "step": 47720 }, { "epoch": 0.8639942086688988, "grad_norm": NaN, "learning_rate": 0.00034323460215961876, "loss": 0.9301, "step": 47740 }, { "epoch": 0.8643561668627273, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 2.455, "step": 47760 }, { "epoch": 0.864718125056556, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 0.3296, "step": 47780 }, { "epoch": 0.8650800832503845, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 3.0621, "step": 47800 }, { "epoch": 0.8654420414442132, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 1.9652, "step": 47820 }, { "epoch": 0.8658039996380418, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 2.605, "step": 47840 }, { "epoch": 0.8661659578318704, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 2.629, "step": 47860 }, { "epoch": 0.866527916025699, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 0.5865, "step": 47880 }, { "epoch": 0.8668898742195277, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 2.0123, "step": 47900 }, { "epoch": 0.8672518324133562, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 1.215, "step": 47920 }, { "epoch": 0.8676137906071849, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 2.0918, "step": 47940 }, { "epoch": 0.8679757488010135, "grad_norm": NaN, "learning_rate": 0.00034328889425107073, "loss": 0.4018, "step": 47960 }, { "epoch": 0.8683377069948421, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 6.8876, "step": 47980 }, { "epoch": 0.8686996651886707, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 3.6365, "step": 48000 }, { "epoch": 0.8690616233824994, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 1.1087, "step": 48020 }, { "epoch": 0.8694235815763279, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 0.7051, "step": 48040 }, { "epoch": 0.8697855397701566, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 0.0, "step": 48060 }, { "epoch": 0.8701474979639852, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 0.0, "step": 48080 }, { "epoch": 0.8705094561578137, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 2.3614, "step": 48100 }, { "epoch": 0.8708714143516424, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 2.1832, "step": 48120 }, { "epoch": 0.871233372545471, "grad_norm": NaN, "learning_rate": 0.00034334318634252276, "loss": 1.172, "step": 48140 }, { "epoch": 0.8715953307392996, "grad_norm": NaN, "learning_rate": 0.0003433974784339748, "loss": 2.1687, "step": 48160 }, { "epoch": 0.8719572889331282, "grad_norm": NaN, "learning_rate": 0.0003433974784339748, "loss": 0.786, "step": 48180 }, { "epoch": 0.8723192471269569, "grad_norm": NaN, "learning_rate": 0.0003433974784339748, "loss": 0.4405, "step": 48200 }, { "epoch": 0.8726812053207854, "grad_norm": NaN, "learning_rate": 0.0003433974784339748, "loss": 0.5855, "step": 48220 }, { "epoch": 0.8730431635146141, "grad_norm": NaN, "learning_rate": 0.0003433974784339748, "loss": 0.8813, "step": 48240 }, { "epoch": 0.8734051217084426, "grad_norm": NaN, "learning_rate": 0.0003435060626168788, "loss": 2.8654, "step": 48260 }, { "epoch": 0.8737670799022713, "grad_norm": NaN, "learning_rate": 0.0003435060626168788, "loss": 0.7883, "step": 48280 }, { "epoch": 0.8741290380960999, "grad_norm": NaN, "learning_rate": 0.0003435060626168788, "loss": 1.6977, "step": 48300 }, { "epoch": 0.8744909962899285, "grad_norm": NaN, "learning_rate": 0.0003435060626168788, "loss": 0.6638, "step": 48320 }, { "epoch": 0.8748529544837571, "grad_norm": NaN, "learning_rate": 0.0003435060626168788, "loss": 0.5823, "step": 48340 }, { "epoch": 0.8752149126775858, "grad_norm": NaN, "learning_rate": 0.0003435060626168788, "loss": 0.8745, "step": 48360 }, { "epoch": 0.8755768708714143, "grad_norm": NaN, "learning_rate": 0.0003435060626168788, "loss": 2.6587, "step": 48380 }, { "epoch": 0.875938829065243, "grad_norm": NaN, "learning_rate": 0.0003435603547083308, "loss": 1.6498, "step": 48400 }, { "epoch": 0.8763007872590716, "grad_norm": NaN, "learning_rate": 0.0003435603547083308, "loss": 1.0264, "step": 48420 }, { "epoch": 0.8766627454529002, "grad_norm": NaN, "learning_rate": 0.0003435603547083308, "loss": 4.9079, "step": 48440 }, { "epoch": 0.8770247036467288, "grad_norm": NaN, "learning_rate": 0.0003435603547083308, "loss": 0.5742, "step": 48460 }, { "epoch": 0.8773866618405575, "grad_norm": NaN, "learning_rate": 0.0003436146467997828, "loss": 3.0971, "step": 48480 }, { "epoch": 0.877748620034386, "grad_norm": NaN, "learning_rate": 0.0003436146467997828, "loss": 1.5435, "step": 48500 }, { "epoch": 0.8781105782282146, "grad_norm": NaN, "learning_rate": 0.0003436146467997828, "loss": 0.8805, "step": 48520 }, { "epoch": 0.8784725364220433, "grad_norm": NaN, "learning_rate": 0.0003436146467997828, "loss": 1.0995, "step": 48540 }, { "epoch": 0.8788344946158718, "grad_norm": NaN, "learning_rate": 0.0003436689388912348, "loss": 2.2358, "step": 48560 }, { "epoch": 0.8791964528097005, "grad_norm": NaN, "learning_rate": 0.0003436689388912348, "loss": 1.6608, "step": 48580 }, { "epoch": 0.879558411003529, "grad_norm": NaN, "learning_rate": 0.0003436689388912348, "loss": 2.8619, "step": 48600 }, { "epoch": 0.8799203691973577, "grad_norm": NaN, "learning_rate": 0.0003436689388912348, "loss": 0.9254, "step": 48620 }, { "epoch": 0.8802823273911863, "grad_norm": NaN, "learning_rate": 0.00034372323098268685, "loss": 4.944, "step": 48640 }, { "epoch": 0.880644285585015, "grad_norm": NaN, "learning_rate": 0.00034372323098268685, "loss": 1.3753, "step": 48660 }, { "epoch": 0.8810062437788435, "grad_norm": NaN, "learning_rate": 0.00034372323098268685, "loss": 1.2057, "step": 48680 }, { "epoch": 0.8813682019726722, "grad_norm": NaN, "learning_rate": 0.00034372323098268685, "loss": 0.8484, "step": 48700 }, { "epoch": 0.8817301601665007, "grad_norm": NaN, "learning_rate": 0.00034372323098268685, "loss": 1.9288, "step": 48720 }, { "epoch": 0.8820921183603294, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 2.7616, "step": 48740 }, { "epoch": 0.882454076554158, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 1.1404, "step": 48760 }, { "epoch": 0.8828160347479866, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 0.9172, "step": 48780 }, { "epoch": 0.8831779929418152, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 0.4789, "step": 48800 }, { "epoch": 0.8835399511356439, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 2.6007, "step": 48820 }, { "epoch": 0.8839019093294724, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 2.2257, "step": 48840 }, { "epoch": 0.8842638675233011, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 1.0041, "step": 48860 }, { "epoch": 0.8846258257171297, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 0.4423, "step": 48880 }, { "epoch": 0.8849877839109583, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 0.2653, "step": 48900 }, { "epoch": 0.8853497421047869, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 2.8073, "step": 48920 }, { "epoch": 0.8857117002986155, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 1.0093, "step": 48940 }, { "epoch": 0.8860736584924441, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 0.0, "step": 48960 }, { "epoch": 0.8864356166862727, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 0.2918, "step": 48980 }, { "epoch": 0.8867975748801014, "grad_norm": NaN, "learning_rate": 0.0003437775230741388, "loss": 0.4383, "step": 49000 }, { "epoch": 0.8871595330739299, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 2.0041, "step": 49020 }, { "epoch": 0.8875214912677586, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.1767, "step": 49040 }, { "epoch": 0.8878834494615871, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.8047, "step": 49060 }, { "epoch": 0.8882454076554158, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 0.7776, "step": 49080 }, { "epoch": 0.8886073658492444, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 2.9128, "step": 49100 }, { "epoch": 0.888969324043073, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 3.8472, "step": 49120 }, { "epoch": 0.8893312822369016, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.9238, "step": 49140 }, { "epoch": 0.8896932404307303, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 0.8734, "step": 49160 }, { "epoch": 0.8900551986245588, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.8095, "step": 49180 }, { "epoch": 0.8904171568183875, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 3.3127, "step": 49200 }, { "epoch": 0.8907791150122161, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 0.3799, "step": 49220 }, { "epoch": 0.8911410732060447, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.4065, "step": 49240 }, { "epoch": 0.8915030313998733, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.4969, "step": 49260 }, { "epoch": 0.891864989593702, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.4145, "step": 49280 }, { "epoch": 0.8922269477875305, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.1898, "step": 49300 }, { "epoch": 0.8925889059813592, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 0.3764, "step": 49320 }, { "epoch": 0.8929508641751878, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.0612, "step": 49340 }, { "epoch": 0.8933128223690163, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 0.6792, "step": 49360 }, { "epoch": 0.893674780562845, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.563, "step": 49380 }, { "epoch": 0.8940367387566736, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.8383, "step": 49400 }, { "epoch": 0.8943986969505022, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 2.0042, "step": 49420 }, { "epoch": 0.8947606551443308, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.3138, "step": 49440 }, { "epoch": 0.8951226133381595, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 2.119, "step": 49460 }, { "epoch": 0.895484571531988, "grad_norm": NaN, "learning_rate": 0.00034383181516559085, "loss": 1.8186, "step": 49480 }, { "epoch": 0.8958465297258167, "grad_norm": NaN, "learning_rate": 0.0003438861072570429, "loss": 0.7511, "step": 49500 }, { "epoch": 0.8962084879196452, "grad_norm": NaN, "learning_rate": 0.0003438861072570429, "loss": 1.9937, "step": 49520 }, { "epoch": 0.8965704461134739, "grad_norm": NaN, "learning_rate": 0.0003438861072570429, "loss": 1.9893, "step": 49540 }, { "epoch": 0.8969324043073025, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 0.5879, "step": 49560 }, { "epoch": 0.8972943625011311, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 0.0, "step": 49580 }, { "epoch": 0.8976563206949597, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 1.9287, "step": 49600 }, { "epoch": 0.8980182788887884, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 2.0609, "step": 49620 }, { "epoch": 0.8983802370826169, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 2.6145, "step": 49640 }, { "epoch": 0.8987421952764456, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 3.2522, "step": 49660 }, { "epoch": 0.8991041534702742, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 2.7444, "step": 49680 }, { "epoch": 0.8994661116641028, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 4.5388, "step": 49700 }, { "epoch": 0.8998280698579314, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 1.8448, "step": 49720 }, { "epoch": 0.9001900280517601, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 1.514, "step": 49740 }, { "epoch": 0.9005519862455886, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 2.3255, "step": 49760 }, { "epoch": 0.9009139444394173, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 0.3116, "step": 49780 }, { "epoch": 0.9012759026332459, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 2.52, "step": 49800 }, { "epoch": 0.9016378608270744, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 3.8961, "step": 49820 }, { "epoch": 0.9019998190209031, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 2.786, "step": 49840 }, { "epoch": 0.9023617772147317, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 0.263, "step": 49860 }, { "epoch": 0.9027237354085603, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 1.4603, "step": 49880 }, { "epoch": 0.9030856936023889, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 2.8763, "step": 49900 }, { "epoch": 0.9034476517962176, "grad_norm": NaN, "learning_rate": 0.0003439403993484949, "loss": 2.9407, "step": 49920 }, { "epoch": 0.9038096099900461, "grad_norm": NaN, "learning_rate": 0.00034399469143994694, "loss": 3.1258, "step": 49940 }, { "epoch": 0.9041715681838748, "grad_norm": NaN, "learning_rate": 0.00034399469143994694, "loss": 0.8813, "step": 49960 }, { "epoch": 0.9045335263777033, "grad_norm": NaN, "learning_rate": 0.00034399469143994694, "loss": 1.0321, "step": 49980 }, { "epoch": 0.904895484571532, "grad_norm": NaN, "learning_rate": 0.00034399469143994694, "loss": 3.1083, "step": 50000 }, { "epoch": 0.904895484571532, "eval_accuracy": 4.551287535289588e-05, "eval_loss": NaN, "eval_runtime": 170.9007, "eval_samples_per_second": 3556.686, "eval_steps_per_second": 3.476, "step": 50000 } ], "logging_steps": 20, "max_steps": 165765, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.19181053952e+16, "train_batch_size": 512, "trial_name": null, "trial_params": null }