{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.074558032282859, "eval_steps": 500, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ar_loss": 0.7918, "epoch": 0.015039855617386072, "fm_loss": 2.2779, "grad_norm": 4.37157678604126, "learning_rate": 4.9999971366617235e-05, "loss": 3.0697, "step": 100 }, { "ar_loss": 0.6852, "epoch": 0.030079711234772145, "fm_loss": 0.5703, "grad_norm": 4.328033447265625, "learning_rate": 4.999951278658594e-05, "loss": 1.2555, "step": 200 }, { "ar_loss": 0.669, "epoch": 0.04511956685215822, "fm_loss": 0.3866, "grad_norm": 3.0744214057922363, "learning_rate": 4.9998494972632174e-05, "loss": 1.0556, "step": 300 }, { "ar_loss": 0.6594, "epoch": 0.06015942246954429, "fm_loss": 0.319, "grad_norm": 2.9869937896728516, "learning_rate": 4.9996917947524234e-05, "loss": 0.9783, "step": 400 }, { "ar_loss": 0.6539, "epoch": 0.07519927808693036, "fm_loss": 0.2901, "grad_norm": 2.676532745361328, "learning_rate": 4.999478174653984e-05, "loss": 0.9441, "step": 500 }, { "ar_loss": 0.6451, "epoch": 0.09023913370431644, "fm_loss": 0.271, "grad_norm": 2.8496103286743164, "learning_rate": 4.999208641746537e-05, "loss": 0.9161, "step": 600 }, { "ar_loss": 0.639, "epoch": 0.10527898932170252, "fm_loss": 0.262, "grad_norm": 3.5984983444213867, "learning_rate": 4.998883202059478e-05, "loss": 0.901, "step": 700 }, { "ar_loss": 0.6368, "epoch": 0.12031884493908858, "fm_loss": 0.251, "grad_norm": 3.3547682762145996, "learning_rate": 4.998501862872824e-05, "loss": 0.8878, "step": 800 }, { "ar_loss": 0.6325, "epoch": 0.13535870055647467, "fm_loss": 0.2422, "grad_norm": 3.7177250385284424, "learning_rate": 4.998064632717054e-05, "loss": 0.8747, "step": 900 }, { "ar_loss": 0.6295, "epoch": 0.15039855617386072, "fm_loss": 0.2364, "grad_norm": 2.8715012073516846, "learning_rate": 4.997571521372918e-05, "loss": 0.8659, "step": 1000 }, { "ar_loss": 0.6261, "epoch": 0.1654384117912468, "fm_loss": 0.233, "grad_norm": 2.686033248901367, "learning_rate": 4.9970225398712116e-05, "loss": 0.8591, "step": 1100 }, { "ar_loss": 0.6227, "epoch": 0.18047826740863288, "fm_loss": 0.2337, "grad_norm": 2.735515594482422, "learning_rate": 4.99641770049254e-05, "loss": 0.8564, "step": 1200 }, { "ar_loss": 0.6288, "epoch": 0.19551812302601895, "fm_loss": 0.2343, "grad_norm": 3.192143440246582, "learning_rate": 4.995757016767032e-05, "loss": 0.8631, "step": 1300 }, { "ar_loss": 0.6179, "epoch": 0.21055797864340503, "fm_loss": 0.2254, "grad_norm": 4.529001235961914, "learning_rate": 4.995040503474049e-05, "loss": 0.8433, "step": 1400 }, { "ar_loss": 0.6117, "epoch": 0.2255978342607911, "fm_loss": 0.2265, "grad_norm": 2.0048255920410156, "learning_rate": 4.994268176641842e-05, "loss": 0.8382, "step": 1500 }, { "ar_loss": 0.6197, "epoch": 0.24063768987817716, "fm_loss": 0.2174, "grad_norm": 2.0079429149627686, "learning_rate": 4.993440053547204e-05, "loss": 0.8372, "step": 1600 }, { "ar_loss": 0.6073, "epoch": 0.25567754549556326, "fm_loss": 0.2158, "grad_norm": 2.0703108310699463, "learning_rate": 4.992556152715076e-05, "loss": 0.8231, "step": 1700 }, { "ar_loss": 0.6091, "epoch": 0.27071740111294934, "fm_loss": 0.2121, "grad_norm": 2.3248846530914307, "learning_rate": 4.991616493918137e-05, "loss": 0.8213, "step": 1800 }, { "ar_loss": 0.608, "epoch": 0.28575725673033536, "fm_loss": 0.2088, "grad_norm": 2.5417592525482178, "learning_rate": 4.99062109817636e-05, "loss": 0.8167, "step": 1900 }, { "ar_loss": 0.6097, "epoch": 0.30079711234772144, "fm_loss": 0.21, "grad_norm": 2.219682216644287, "learning_rate": 4.989569987756542e-05, "loss": 0.8196, "step": 2000 }, { "ar_loss": 0.6056, "epoch": 0.3158369679651075, "fm_loss": 0.2079, "grad_norm": 1.9187816381454468, "learning_rate": 4.988463186171804e-05, "loss": 0.8136, "step": 2100 }, { "ar_loss": 0.6064, "epoch": 0.3308768235824936, "fm_loss": 0.2072, "grad_norm": 2.246711492538452, "learning_rate": 4.987300718181068e-05, "loss": 0.8136, "step": 2200 }, { "ar_loss": 0.6015, "epoch": 0.3459166791998797, "fm_loss": 0.2044, "grad_norm": 1.855542540550232, "learning_rate": 4.986082609788504e-05, "loss": 0.8059, "step": 2300 }, { "ar_loss": 0.6, "epoch": 0.36095653481726575, "fm_loss": 0.2081, "grad_norm": 1.883419156074524, "learning_rate": 4.9848088882429426e-05, "loss": 0.8082, "step": 2400 }, { "ar_loss": 0.6048, "epoch": 0.37599639043465183, "fm_loss": 0.201, "grad_norm": 4.141597747802734, "learning_rate": 4.983479582037272e-05, "loss": 0.8058, "step": 2500 }, { "ar_loss": 0.6025, "epoch": 0.3910362460520379, "fm_loss": 0.2012, "grad_norm": 1.3507440090179443, "learning_rate": 4.9820947209077965e-05, "loss": 0.8037, "step": 2600 }, { "ar_loss": 0.5966, "epoch": 0.406076101669424, "fm_loss": 0.2103, "grad_norm": 1.5875388383865356, "learning_rate": 4.980654335833572e-05, "loss": 0.8069, "step": 2700 }, { "ar_loss": 0.5995, "epoch": 0.42111595728681006, "fm_loss": 0.1975, "grad_norm": 2.4850363731384277, "learning_rate": 4.979158459035715e-05, "loss": 0.7971, "step": 2800 }, { "ar_loss": 0.595, "epoch": 0.43615581290419614, "fm_loss": 0.1981, "grad_norm": 2.0931544303894043, "learning_rate": 4.97760712397668e-05, "loss": 0.7931, "step": 2900 }, { "ar_loss": 0.5959, "epoch": 0.4511956685215822, "fm_loss": 0.1984, "grad_norm": 2.2044456005096436, "learning_rate": 4.97600036535951e-05, "loss": 0.7943, "step": 3000 }, { "ar_loss": 0.5894, "epoch": 0.4662355241389683, "fm_loss": 0.197, "grad_norm": 2.905266046524048, "learning_rate": 4.974338219127062e-05, "loss": 0.7863, "step": 3100 }, { "ar_loss": 0.5851, "epoch": 0.4812753797563543, "fm_loss": 0.1953, "grad_norm": 2.5252127647399902, "learning_rate": 4.9726207224612034e-05, "loss": 0.7805, "step": 3200 }, { "ar_loss": 0.5936, "epoch": 0.4963152353737404, "fm_loss": 0.1957, "grad_norm": 2.7935545444488525, "learning_rate": 4.97084791378198e-05, "loss": 0.7893, "step": 3300 }, { "ar_loss": 0.5869, "epoch": 0.5113550909911265, "fm_loss": 0.1933, "grad_norm": 1.2401388883590698, "learning_rate": 4.9690198327467534e-05, "loss": 0.7802, "step": 3400 }, { "ar_loss": 0.5888, "epoch": 0.5263949466085126, "fm_loss": 0.1944, "grad_norm": 3.7216362953186035, "learning_rate": 4.967136520249318e-05, "loss": 0.7832, "step": 3500 }, { "ar_loss": 0.587, "epoch": 0.5414348022258987, "fm_loss": 0.1916, "grad_norm": 1.946953296661377, "learning_rate": 4.965198018418985e-05, "loss": 0.7786, "step": 3600 }, { "ar_loss": 0.5828, "epoch": 0.5564746578432848, "fm_loss": 0.1921, "grad_norm": 1.695999264717102, "learning_rate": 4.963204370619637e-05, "loss": 0.7749, "step": 3700 }, { "ar_loss": 0.5832, "epoch": 0.5715145134606707, "fm_loss": 0.1973, "grad_norm": 2.257406234741211, "learning_rate": 4.9611556214487645e-05, "loss": 0.7805, "step": 3800 }, { "ar_loss": 0.5825, "epoch": 0.5865543690780568, "fm_loss": 0.1915, "grad_norm": 2.4373984336853027, "learning_rate": 4.95905181673646e-05, "loss": 0.7739, "step": 3900 }, { "ar_loss": 0.5809, "epoch": 0.6015942246954429, "fm_loss": 0.1924, "grad_norm": 1.4836270809173584, "learning_rate": 4.956893003544401e-05, "loss": 0.7733, "step": 4000 }, { "ar_loss": 0.5801, "epoch": 0.616634080312829, "fm_loss": 0.1956, "grad_norm": 1.864709734916687, "learning_rate": 4.954679230164789e-05, "loss": 0.7757, "step": 4100 }, { "ar_loss": 0.5824, "epoch": 0.631673935930215, "fm_loss": 0.1895, "grad_norm": 1.5477041006088257, "learning_rate": 4.952410546119278e-05, "loss": 0.7719, "step": 4200 }, { "ar_loss": 0.5819, "epoch": 0.6467137915476011, "fm_loss": 0.1925, "grad_norm": 1.6031211614608765, "learning_rate": 4.95008700215786e-05, "loss": 0.7744, "step": 4300 }, { "ar_loss": 0.5778, "epoch": 0.6617536471649872, "fm_loss": 0.2067, "grad_norm": 5.476424694061279, "learning_rate": 4.947708650257732e-05, "loss": 0.7845, "step": 4400 }, { "ar_loss": 0.5767, "epoch": 0.6767935027823733, "fm_loss": 0.1875, "grad_norm": 1.901498794555664, "learning_rate": 4.945275543622133e-05, "loss": 0.7642, "step": 4500 }, { "ar_loss": 0.5766, "epoch": 0.6918333583997593, "fm_loss": 0.1912, "grad_norm": 1.4781795740127563, "learning_rate": 4.942787736679153e-05, "loss": 0.7678, "step": 4600 }, { "ar_loss": 0.5735, "epoch": 0.7068732140171454, "fm_loss": 0.1893, "grad_norm": 2.639249801635742, "learning_rate": 4.940245285080521e-05, "loss": 0.7628, "step": 4700 }, { "ar_loss": 0.5736, "epoch": 0.7219130696345315, "fm_loss": 0.1854, "grad_norm": 1.5374678373336792, "learning_rate": 4.93764824570035e-05, "loss": 0.759, "step": 4800 }, { "ar_loss": 0.5755, "epoch": 0.7369529252519176, "fm_loss": 0.187, "grad_norm": 1.971647024154663, "learning_rate": 4.934996676633874e-05, "loss": 0.7625, "step": 4900 }, { "ar_loss": 0.5725, "epoch": 0.7519927808693037, "fm_loss": 0.1875, "grad_norm": 5.200182914733887, "learning_rate": 4.932290637196144e-05, "loss": 0.76, "step": 5000 }, { "ar_loss": 0.5504, "epoch": 0.015037593984962405, "fm_loss": 0.1893, "grad_norm": 2.8439042568206787, "learning_rate": 4.929551301429187e-05, "loss": 0.7396, "step": 5100 }, { "ar_loss": 0.5503, "epoch": 0.03007518796992481, "fm_loss": 0.1831, "grad_norm": 1.733323335647583, "learning_rate": 4.926737343223295e-05, "loss": 0.7334, "step": 5200 }, { "ar_loss": 0.5514, "epoch": 0.045112781954887216, "fm_loss": 0.185, "grad_norm": 2.580357074737549, "learning_rate": 4.9238691157474316e-05, "loss": 0.7364, "step": 5300 }, { "ar_loss": 0.5493, "epoch": 0.06015037593984962, "fm_loss": 0.1835, "grad_norm": 1.9058268070220947, "learning_rate": 4.920946683143935e-05, "loss": 0.7327, "step": 5400 }, { "ar_loss": 0.5497, "epoch": 0.07518796992481203, "fm_loss": 0.184, "grad_norm": 1.3233968019485474, "learning_rate": 4.91797011076734e-05, "loss": 0.7337, "step": 5500 }, { "ar_loss": 0.5466, "epoch": 0.09022556390977443, "fm_loss": 0.1829, "grad_norm": 1.8950995206832886, "learning_rate": 4.9149394651829086e-05, "loss": 0.7295, "step": 5600 }, { "ar_loss": 0.5495, "epoch": 0.10526315789473684, "fm_loss": 0.1829, "grad_norm": 1.7234010696411133, "learning_rate": 4.911854814165145e-05, "loss": 0.7324, "step": 5700 }, { "ar_loss": 0.5475, "epoch": 0.12030075187969924, "fm_loss": 0.1821, "grad_norm": 2.068098783493042, "learning_rate": 4.908716226696284e-05, "loss": 0.7295, "step": 5800 }, { "ar_loss": 0.5457, "epoch": 0.13533834586466165, "fm_loss": 0.1837, "grad_norm": 1.4124906063079834, "learning_rate": 4.905523772964739e-05, "loss": 0.7294, "step": 5900 }, { "ar_loss": 0.5504, "epoch": 0.15037593984962405, "fm_loss": 0.1828, "grad_norm": 1.7072114944458008, "learning_rate": 4.902277524363543e-05, "loss": 0.7332, "step": 6000 }, { "ar_loss": 0.5454, "epoch": 0.16541353383458646, "fm_loss": 0.181, "grad_norm": 1.737274169921875, "learning_rate": 4.898977553488743e-05, "loss": 0.7264, "step": 6100 }, { "ar_loss": 0.5501, "epoch": 0.18045112781954886, "fm_loss": 0.1808, "grad_norm": 2.1727452278137207, "learning_rate": 4.895623934137783e-05, "loss": 0.7309, "step": 6200 }, { "ar_loss": 0.5439, "epoch": 0.19548872180451127, "fm_loss": 0.1798, "grad_norm": 2.8658437728881836, "learning_rate": 4.892216741307848e-05, "loss": 0.7237, "step": 6300 }, { "ar_loss": 0.5438, "epoch": 0.21052631578947367, "fm_loss": 0.1812, "grad_norm": 1.8495811223983765, "learning_rate": 4.888756051194193e-05, "loss": 0.725, "step": 6400 }, { "ar_loss": 0.5391, "epoch": 0.22556390977443608, "fm_loss": 0.1894, "grad_norm": 2.1281821727752686, "learning_rate": 4.885241941188435e-05, "loss": 0.7286, "step": 6500 }, { "ar_loss": 0.5437, "epoch": 0.24060150375939848, "fm_loss": 0.1787, "grad_norm": 1.7707561254501343, "learning_rate": 4.881674489876822e-05, "loss": 0.7224, "step": 6600 }, { "ar_loss": 0.5413, "epoch": 0.2556390977443609, "fm_loss": 0.1795, "grad_norm": 1.4524445533752441, "learning_rate": 4.878053777038478e-05, "loss": 0.7208, "step": 6700 }, { "ar_loss": 0.5411, "epoch": 0.2706766917293233, "fm_loss": 0.18, "grad_norm": 2.2196991443634033, "learning_rate": 4.8743798836436166e-05, "loss": 0.7212, "step": 6800 }, { "ar_loss": 0.5395, "epoch": 0.2857142857142857, "fm_loss": 0.1778, "grad_norm": 3.7135586738586426, "learning_rate": 4.8706528918517326e-05, "loss": 0.7173, "step": 6900 }, { "ar_loss": 0.5397, "epoch": 0.3007518796992481, "fm_loss": 0.179, "grad_norm": 2.551405191421509, "learning_rate": 4.866872885009762e-05, "loss": 0.7188, "step": 7000 }, { "ar_loss": 0.5433, "epoch": 0.3157894736842105, "fm_loss": 0.1769, "grad_norm": 1.5937398672103882, "learning_rate": 4.863039947650221e-05, "loss": 0.7202, "step": 7100 }, { "ar_loss": 0.537, "epoch": 0.3308270676691729, "fm_loss": 0.1778, "grad_norm": 2.1410443782806396, "learning_rate": 4.859154165489313e-05, "loss": 0.7148, "step": 7200 }, { "ar_loss": 0.5345, "epoch": 0.3458646616541353, "fm_loss": 0.1786, "grad_norm": 1.9549566507339478, "learning_rate": 4.855215625425012e-05, "loss": 0.7131, "step": 7300 }, { "ar_loss": 0.5385, "epoch": 0.3609022556390977, "fm_loss": 0.1786, "grad_norm": 1.6920804977416992, "learning_rate": 4.851224415535123e-05, "loss": 0.7171, "step": 7400 }, { "ar_loss": 0.5403, "epoch": 0.37593984962406013, "fm_loss": 0.1778, "grad_norm": 1.460734248161316, "learning_rate": 4.847180625075306e-05, "loss": 0.7182, "step": 7500 }, { "ar_loss": 0.5333, "epoch": 0.39097744360902253, "fm_loss": 0.1802, "grad_norm": 2.369119644165039, "learning_rate": 4.8430843444770856e-05, "loss": 0.7135, "step": 7600 }, { "ar_loss": 0.5343, "epoch": 0.40601503759398494, "fm_loss": 0.1777, "grad_norm": 2.0863733291625977, "learning_rate": 4.838935665345826e-05, "loss": 0.712, "step": 7700 }, { "ar_loss": 0.5361, "epoch": 0.42105263157894735, "fm_loss": 0.1826, "grad_norm": 2.1648268699645996, "learning_rate": 4.834734680458682e-05, "loss": 0.7187, "step": 7800 }, { "ar_loss": 0.5362, "epoch": 0.43609022556390975, "fm_loss": 0.1759, "grad_norm": 3.731491804122925, "learning_rate": 4.8304814837625275e-05, "loss": 0.7121, "step": 7900 }, { "ar_loss": 0.5361, "epoch": 0.45112781954887216, "fm_loss": 0.1763, "grad_norm": 3.129291296005249, "learning_rate": 4.826176170371848e-05, "loss": 0.7124, "step": 8000 }, { "ar_loss": 0.5308, "epoch": 0.46616541353383456, "fm_loss": 0.1803, "grad_norm": 1.837388038635254, "learning_rate": 4.8218188365666216e-05, "loss": 0.7111, "step": 8100 }, { "ar_loss": 0.5384, "epoch": 0.48120300751879697, "fm_loss": 0.1765, "grad_norm": 1.696321964263916, "learning_rate": 4.817409579790161e-05, "loss": 0.7148, "step": 8200 }, { "ar_loss": 0.5351, "epoch": 0.49624060150375937, "fm_loss": 0.177, "grad_norm": 2.0662057399749756, "learning_rate": 4.812948498646933e-05, "loss": 0.7121, "step": 8300 }, { "ar_loss": 0.5358, "epoch": 0.5112781954887218, "fm_loss": 0.1757, "grad_norm": 1.4204461574554443, "learning_rate": 4.80843569290036e-05, "loss": 0.7114, "step": 8400 }, { "ar_loss": 0.5316, "epoch": 0.5263157894736842, "fm_loss": 0.176, "grad_norm": 3.6314117908477783, "learning_rate": 4.80387126347058e-05, "loss": 0.7075, "step": 8500 }, { "ar_loss": 0.5307, "epoch": 0.5413533834586466, "fm_loss": 0.1754, "grad_norm": 2.164659023284912, "learning_rate": 4.799255312432199e-05, "loss": 0.706, "step": 8600 }, { "ar_loss": 0.5331, "epoch": 0.556390977443609, "fm_loss": 0.184, "grad_norm": 1.9370899200439453, "learning_rate": 4.794587943012e-05, "loss": 0.7171, "step": 8700 }, { "ar_loss": 0.5278, "epoch": 0.5714285714285714, "fm_loss": 0.1738, "grad_norm": 2.0422327518463135, "learning_rate": 4.7898692595866415e-05, "loss": 0.7016, "step": 8800 }, { "ar_loss": 0.5299, "epoch": 0.5864661654135338, "fm_loss": 0.1729, "grad_norm": 1.643233060836792, "learning_rate": 4.785099367680317e-05, "loss": 0.7028, "step": 8900 }, { "ar_loss": 0.5279, "epoch": 0.6015037593984962, "fm_loss": 0.173, "grad_norm": 1.7165354490280151, "learning_rate": 4.7802783739624e-05, "loss": 0.7009, "step": 9000 }, { "ar_loss": 0.5297, "epoch": 0.6165413533834586, "fm_loss": 0.181, "grad_norm": 1.632948875427246, "learning_rate": 4.7754063862450576e-05, "loss": 0.7107, "step": 9100 }, { "ar_loss": 0.5266, "epoch": 0.631578947368421, "fm_loss": 0.2008, "grad_norm": 2.7003791332244873, "learning_rate": 4.770483513480837e-05, "loss": 0.7274, "step": 9200 }, { "ar_loss": 0.5304, "epoch": 0.6466165413533834, "fm_loss": 0.1754, "grad_norm": 2.2766501903533936, "learning_rate": 4.765509865760233e-05, "loss": 0.7057, "step": 9300 }, { "ar_loss": 0.5262, "epoch": 0.6616541353383458, "fm_loss": 0.18, "grad_norm": 10.034100532531738, "learning_rate": 4.760485554309219e-05, "loss": 0.7061, "step": 9400 }, { "ar_loss": 0.52, "epoch": 0.6766917293233082, "fm_loss": 0.1753, "grad_norm": 1.70254385471344, "learning_rate": 4.7554106914867705e-05, "loss": 0.6953, "step": 9500 }, { "ar_loss": 0.5249, "epoch": 0.6917293233082706, "fm_loss": 0.1744, "grad_norm": 1.7009040117263794, "learning_rate": 4.750285390782342e-05, "loss": 0.6992, "step": 9600 }, { "ar_loss": 0.523, "epoch": 0.706766917293233, "fm_loss": 0.1739, "grad_norm": 1.4413669109344482, "learning_rate": 4.745109766813334e-05, "loss": 0.6968, "step": 9700 }, { "ar_loss": 0.5242, "epoch": 0.7218045112781954, "fm_loss": 0.1727, "grad_norm": 1.4672104120254517, "learning_rate": 4.739883935322532e-05, "loss": 0.6969, "step": 9800 }, { "ar_loss": 0.5244, "epoch": 0.7368421052631579, "fm_loss": 0.1728, "grad_norm": 1.458977222442627, "learning_rate": 4.734608013175512e-05, "loss": 0.6972, "step": 9900 }, { "ar_loss": 0.5247, "epoch": 0.7518796992481203, "fm_loss": 0.1721, "grad_norm": 2.412842273712158, "learning_rate": 4.72928211835803e-05, "loss": 0.6968, "step": 10000 }, { "ar_loss": 0.5206, "epoch": 0.7669172932330827, "fm_loss": 0.172, "grad_norm": 2.1207447052001953, "learning_rate": 4.723906369973386e-05, "loss": 0.6926, "step": 10100 }, { "ar_loss": 0.5214, "epoch": 0.7819548872180451, "fm_loss": 0.1735, "grad_norm": 2.6220345497131348, "learning_rate": 4.7184808882397594e-05, "loss": 0.695, "step": 10200 }, { "ar_loss": 0.5236, "epoch": 0.7969924812030075, "fm_loss": 0.1819, "grad_norm": 1.9105595350265503, "learning_rate": 4.713005794487515e-05, "loss": 0.7055, "step": 10300 }, { "ar_loss": 0.5202, "epoch": 0.8120300751879699, "fm_loss": 0.1718, "grad_norm": 2.312239646911621, "learning_rate": 4.707481211156497e-05, "loss": 0.692, "step": 10400 }, { "ar_loss": 0.5207, "epoch": 0.8270676691729323, "fm_loss": 0.1788, "grad_norm": 1.709987759590149, "learning_rate": 4.701907261793287e-05, "loss": 0.6995, "step": 10500 }, { "ar_loss": 0.5178, "epoch": 0.8421052631578947, "fm_loss": 0.172, "grad_norm": 2.644814968109131, "learning_rate": 4.696284071048444e-05, "loss": 0.6898, "step": 10600 }, { "ar_loss": 0.5175, "epoch": 0.8571428571428571, "fm_loss": 0.1737, "grad_norm": 1.4387990236282349, "learning_rate": 4.690611764673713e-05, "loss": 0.6912, "step": 10700 }, { "ar_loss": 0.5154, "epoch": 0.8721804511278195, "fm_loss": 0.1727, "grad_norm": 1.308214545249939, "learning_rate": 4.684890469519213e-05, "loss": 0.6881, "step": 10800 }, { "ar_loss": 0.5158, "epoch": 0.8872180451127819, "fm_loss": 0.1707, "grad_norm": 3.754106283187866, "learning_rate": 4.6791203135306075e-05, "loss": 0.6865, "step": 10900 }, { "ar_loss": 0.5213, "epoch": 0.9022556390977443, "fm_loss": 0.1709, "grad_norm": 1.5025776624679565, "learning_rate": 4.673301425746232e-05, "loss": 0.6922, "step": 11000 }, { "ar_loss": 0.5129, "epoch": 0.9172932330827067, "fm_loss": 0.1705, "grad_norm": 1.8450067043304443, "learning_rate": 4.667433936294217e-05, "loss": 0.6835, "step": 11100 }, { "ar_loss": 0.5176, "epoch": 0.9323308270676691, "fm_loss": 0.1768, "grad_norm": 1.772120475769043, "learning_rate": 4.661517976389574e-05, "loss": 0.6944, "step": 11200 }, { "ar_loss": 0.5117, "epoch": 0.9473684210526315, "fm_loss": 0.1722, "grad_norm": 1.8486195802688599, "learning_rate": 4.6555536783312634e-05, "loss": 0.6839, "step": 11300 }, { "ar_loss": 0.5139, "epoch": 0.9624060150375939, "fm_loss": 0.1705, "grad_norm": 3.964401960372925, "learning_rate": 4.649541175499232e-05, "loss": 0.6844, "step": 11400 }, { "ar_loss": 0.5154, "epoch": 0.9774436090225563, "fm_loss": 0.1923, "grad_norm": 2.2421326637268066, "learning_rate": 4.6434806023514354e-05, "loss": 0.7077, "step": 11500 }, { "ar_loss": 0.5147, "epoch": 0.9924812030075187, "fm_loss": 0.171, "grad_norm": 1.773090124130249, "learning_rate": 4.6373720944208275e-05, "loss": 0.6856, "step": 11600 }, { "ar_loss": 0.4967, "epoch": 1.0075187969924813, "fm_loss": 0.1708, "grad_norm": 2.0621070861816406, "learning_rate": 4.631215788312331e-05, "loss": 0.6675, "step": 11700 }, { "ar_loss": 0.4895, "epoch": 1.0225563909774436, "fm_loss": 0.1706, "grad_norm": 1.7303022146224976, "learning_rate": 4.6250118216997795e-05, "loss": 0.6601, "step": 11800 }, { "ar_loss": 0.4896, "epoch": 1.037593984962406, "fm_loss": 0.1688, "grad_norm": 2.481395959854126, "learning_rate": 4.618760333322846e-05, "loss": 0.6584, "step": 11900 }, { "ar_loss": 0.484, "epoch": 1.0526315789473684, "fm_loss": 0.1686, "grad_norm": 8.144444465637207, "learning_rate": 4.61246146298393e-05, "loss": 0.6527, "step": 12000 }, { "ar_loss": 0.4851, "epoch": 1.0676691729323309, "fm_loss": 0.1695, "grad_norm": 4.188894271850586, "learning_rate": 4.606115351545043e-05, "loss": 0.6546, "step": 12100 }, { "ar_loss": 0.4853, "epoch": 1.0827067669172932, "fm_loss": 0.1685, "grad_norm": 2.112070083618164, "learning_rate": 4.599722140924645e-05, "loss": 0.6538, "step": 12200 }, { "ar_loss": 0.4862, "epoch": 1.0977443609022557, "fm_loss": 0.1684, "grad_norm": 5.455174922943115, "learning_rate": 4.593281974094483e-05, "loss": 0.6547, "step": 12300 }, { "ar_loss": 0.4887, "epoch": 1.112781954887218, "fm_loss": 0.1682, "grad_norm": 2.762221574783325, "learning_rate": 4.5867949950763864e-05, "loss": 0.6569, "step": 12400 }, { "ar_loss": 0.4829, "epoch": 1.1278195488721805, "fm_loss": 0.1683, "grad_norm": 1.505560278892517, "learning_rate": 4.5802613489390487e-05, "loss": 0.6511, "step": 12500 }, { "ar_loss": 0.487, "epoch": 1.1428571428571428, "fm_loss": 0.1686, "grad_norm": 12.36954402923584, "learning_rate": 4.5736811817947824e-05, "loss": 0.6557, "step": 12600 }, { "ar_loss": 0.4796, "epoch": 1.1578947368421053, "fm_loss": 0.1675, "grad_norm": 1.7835297584533691, "learning_rate": 4.5670546407962525e-05, "loss": 0.6471, "step": 12700 }, { "ar_loss": 0.4841, "epoch": 1.1729323308270676, "fm_loss": 0.1685, "grad_norm": 1.500798225402832, "learning_rate": 4.560381874133186e-05, "loss": 0.6526, "step": 12800 }, { "ar_loss": 0.4818, "epoch": 1.1879699248120301, "fm_loss": 0.1682, "grad_norm": 3.4301960468292236, "learning_rate": 4.553663031029055e-05, "loss": 0.65, "step": 12900 }, { "ar_loss": 0.4821, "epoch": 1.2030075187969924, "fm_loss": 0.1699, "grad_norm": 1.9192440509796143, "learning_rate": 4.546898261737745e-05, "loss": 0.6519, "step": 13000 }, { "ar_loss": 0.4854, "epoch": 1.218045112781955, "fm_loss": 0.1688, "grad_norm": 2.5409817695617676, "learning_rate": 4.540087717540188e-05, "loss": 0.6542, "step": 13100 }, { "ar_loss": 0.4807, "epoch": 1.2330827067669172, "fm_loss": 0.1684, "grad_norm": 1.6167821884155273, "learning_rate": 4.533231550740985e-05, "loss": 0.6491, "step": 13200 }, { "ar_loss": 0.4793, "epoch": 1.2481203007518797, "fm_loss": 0.1673, "grad_norm": 2.658431053161621, "learning_rate": 4.526329914664999e-05, "loss": 0.6466, "step": 13300 }, { "ar_loss": 0.4802, "epoch": 1.263157894736842, "fm_loss": 0.1678, "grad_norm": 1.7002625465393066, "learning_rate": 4.51938296365392e-05, "loss": 0.6481, "step": 13400 }, { "ar_loss": 0.4799, "epoch": 1.2781954887218046, "fm_loss": 0.1694, "grad_norm": 2.5322110652923584, "learning_rate": 4.5123908530628254e-05, "loss": 0.6493, "step": 13500 }, { "ar_loss": 0.4807, "epoch": 1.2932330827067668, "fm_loss": 0.1674, "grad_norm": 1.8980706930160522, "learning_rate": 4.5053537392566946e-05, "loss": 0.6481, "step": 13600 }, { "ar_loss": 0.4818, "epoch": 1.3082706766917294, "fm_loss": 0.1688, "grad_norm": 1.619292140007019, "learning_rate": 4.4982717796069176e-05, "loss": 0.6507, "step": 13700 }, { "ar_loss": 0.4841, "epoch": 1.3233082706766917, "fm_loss": 0.1684, "grad_norm": 1.7827472686767578, "learning_rate": 4.491145132487775e-05, "loss": 0.6526, "step": 13800 }, { "ar_loss": 0.4794, "epoch": 1.3383458646616542, "fm_loss": 0.1779, "grad_norm": 2.303715944290161, "learning_rate": 4.483973957272895e-05, "loss": 0.6572, "step": 13900 }, { "ar_loss": 0.4729, "epoch": 1.3533834586466165, "fm_loss": 0.1767, "grad_norm": 2.095193386077881, "learning_rate": 4.476758414331691e-05, "loss": 0.6496, "step": 14000 }, { "ar_loss": 0.4765, "epoch": 1.368421052631579, "fm_loss": 0.1679, "grad_norm": 3.301593065261841, "learning_rate": 4.4694986650257754e-05, "loss": 0.6445, "step": 14100 }, { "ar_loss": 0.4787, "epoch": 1.3834586466165413, "fm_loss": 0.1675, "grad_norm": 4.087191581726074, "learning_rate": 4.462194871705347e-05, "loss": 0.6462, "step": 14200 }, { "ar_loss": 0.4788, "epoch": 1.3984962406015038, "fm_loss": 0.1666, "grad_norm": 1.5592633485794067, "learning_rate": 4.4548471977055665e-05, "loss": 0.6455, "step": 14300 }, { "ar_loss": 0.4747, "epoch": 1.413533834586466, "fm_loss": 0.1682, "grad_norm": 7.437148571014404, "learning_rate": 4.447455807342901e-05, "loss": 0.6429, "step": 14400 }, { "ar_loss": 0.4798, "epoch": 1.4285714285714286, "fm_loss": 0.1674, "grad_norm": 1.5794994831085205, "learning_rate": 4.440020865911446e-05, "loss": 0.6472, "step": 14500 }, { "ar_loss": 0.4729, "epoch": 1.443609022556391, "fm_loss": 0.1674, "grad_norm": 2.327096700668335, "learning_rate": 4.432542539679235e-05, "loss": 0.6404, "step": 14600 }, { "ar_loss": 0.4711, "epoch": 1.4586466165413534, "fm_loss": 0.1699, "grad_norm": 1.5044869184494019, "learning_rate": 4.425020995884517e-05, "loss": 0.641, "step": 14700 }, { "ar_loss": 0.4757, "epoch": 1.4736842105263157, "fm_loss": 0.1678, "grad_norm": 1.8027641773223877, "learning_rate": 4.41745640273202e-05, "loss": 0.6435, "step": 14800 }, { "ar_loss": 0.4727, "epoch": 1.4887218045112782, "fm_loss": 0.1716, "grad_norm": 1.5030885934829712, "learning_rate": 4.4098489293891845e-05, "loss": 0.6443, "step": 14900 }, { "ar_loss": 0.4741, "epoch": 1.5037593984962405, "fm_loss": 0.1678, "grad_norm": 2.254826068878174, "learning_rate": 4.4021987459823834e-05, "loss": 0.6419, "step": 15000 }, { "ar_loss": 0.4636, "epoch": 2.015368065160596, "fm_loss": 0.1667, "grad_norm": 5.636004447937012, "learning_rate": 4.3687046036407485e-05, "loss": 0.6303, "step": 15100 }, { "ar_loss": 0.4622, "epoch": 2.0307361303211926, "fm_loss": 0.1678, "grad_norm": 2.2144827842712402, "learning_rate": 4.360656750389484e-05, "loss": 0.63, "step": 15200 }, { "ar_loss": 0.4643, "epoch": 2.0461041954817887, "fm_loss": 0.1659, "grad_norm": 1.623849868774414, "learning_rate": 4.3525654376107785e-05, "loss": 0.6302, "step": 15300 }, { "ar_loss": 0.4617, "epoch": 2.061472260642385, "fm_loss": 0.1671, "grad_norm": 10.88976001739502, "learning_rate": 4.344430854294155e-05, "loss": 0.6288, "step": 15400 }, { "ar_loss": 0.4622, "epoch": 2.0768403258029813, "fm_loss": 0.1679, "grad_norm": 2.079127550125122, "learning_rate": 4.3362531904398086e-05, "loss": 0.6301, "step": 15500 }, { "ar_loss": 0.4593, "epoch": 2.092208390963578, "fm_loss": 0.1663, "grad_norm": 3.9787468910217285, "learning_rate": 4.3280326370541716e-05, "loss": 0.6256, "step": 15600 }, { "ar_loss": 0.4606, "epoch": 2.107576456124174, "fm_loss": 0.1686, "grad_norm": 2.992798089981079, "learning_rate": 4.3197693861454505e-05, "loss": 0.6292, "step": 15700 }, { "ar_loss": 0.4612, "epoch": 2.1229445212847704, "fm_loss": 0.1693, "grad_norm": 1.920535922050476, "learning_rate": 4.31146363071914e-05, "loss": 0.6305, "step": 15800 }, { "ar_loss": 0.4626, "epoch": 2.1383125864453665, "fm_loss": 0.1793, "grad_norm": 1.652984380722046, "learning_rate": 4.303115564773521e-05, "loss": 0.6419, "step": 15900 }, { "ar_loss": 0.4583, "epoch": 2.153680651605963, "fm_loss": 0.1665, "grad_norm": 1.9141411781311035, "learning_rate": 4.294725383295121e-05, "loss": 0.6248, "step": 16000 }, { "ar_loss": 0.4646, "epoch": 2.169048716766559, "fm_loss": 0.1762, "grad_norm": 1.7625855207443237, "learning_rate": 4.286293282254165e-05, "loss": 0.6407, "step": 16100 }, { "ar_loss": 0.4622, "epoch": 2.1844167819271556, "fm_loss": 0.1674, "grad_norm": 1.4725229740142822, "learning_rate": 4.2778194585999965e-05, "loss": 0.6295, "step": 16200 }, { "ar_loss": 0.4642, "epoch": 2.1997848470877517, "fm_loss": 0.1654, "grad_norm": 2.979617118835449, "learning_rate": 4.269304110256479e-05, "loss": 0.6296, "step": 16300 }, { "ar_loss": 0.4605, "epoch": 2.2151529122483478, "fm_loss": 0.1656, "grad_norm": 4.4237470626831055, "learning_rate": 4.2607474361173714e-05, "loss": 0.6262, "step": 16400 }, { "ar_loss": 0.4597, "epoch": 2.2305209774089443, "fm_loss": 0.1673, "grad_norm": 3.3088274002075195, "learning_rate": 4.2521496360416834e-05, "loss": 0.627, "step": 16500 }, { "ar_loss": 0.4588, "epoch": 2.2458890425695404, "fm_loss": 0.166, "grad_norm": 2.4934027194976807, "learning_rate": 4.243510910849006e-05, "loss": 0.6248, "step": 16600 }, { "ar_loss": 0.4577, "epoch": 2.261257107730137, "fm_loss": 0.1654, "grad_norm": 3.4698071479797363, "learning_rate": 4.234831462314822e-05, "loss": 0.6231, "step": 16700 }, { "ar_loss": 0.459, "epoch": 2.276625172890733, "fm_loss": 0.1663, "grad_norm": 1.6527596712112427, "learning_rate": 4.226111493165793e-05, "loss": 0.6253, "step": 16800 }, { "ar_loss": 0.4563, "epoch": 2.2919932380513295, "fm_loss": 0.1647, "grad_norm": 2.62147855758667, "learning_rate": 4.217351207075024e-05, "loss": 0.621, "step": 16900 }, { "ar_loss": 0.4583, "epoch": 2.3073613032119256, "fm_loss": 0.1651, "grad_norm": 2.403658151626587, "learning_rate": 4.208550808657309e-05, "loss": 0.6234, "step": 17000 }, { "ar_loss": 0.4565, "epoch": 2.322729368372522, "fm_loss": 0.1651, "grad_norm": 2.243856906890869, "learning_rate": 4.199710503464345e-05, "loss": 0.6216, "step": 17100 }, { "ar_loss": 0.456, "epoch": 2.338097433533118, "fm_loss": 0.1651, "grad_norm": 2.199291229248047, "learning_rate": 4.190830497979938e-05, "loss": 0.6211, "step": 17200 }, { "ar_loss": 0.4532, "epoch": 2.3534654986937147, "fm_loss": 0.1656, "grad_norm": 2.230714797973633, "learning_rate": 4.1819109996151775e-05, "loss": 0.6188, "step": 17300 }, { "ar_loss": 0.4526, "epoch": 2.3688335638543108, "fm_loss": 0.1651, "grad_norm": 2.124840259552002, "learning_rate": 4.172952216703588e-05, "loss": 0.6176, "step": 17400 }, { "ar_loss": 0.4502, "epoch": 2.384201629014907, "fm_loss": 0.165, "grad_norm": 2.1426265239715576, "learning_rate": 4.1639543584962726e-05, "loss": 0.6152, "step": 17500 }, { "ar_loss": 0.4542, "epoch": 2.3995696941755034, "fm_loss": 0.1651, "grad_norm": 4.364583492279053, "learning_rate": 4.154917635157015e-05, "loss": 0.6193, "step": 17600 }, { "ar_loss": 0.4558, "epoch": 2.4149377593360994, "fm_loss": 0.1651, "grad_norm": 1.8425495624542236, "learning_rate": 4.145842257757377e-05, "loss": 0.621, "step": 17700 }, { "ar_loss": 0.4533, "epoch": 2.430305824496696, "fm_loss": 0.1657, "grad_norm": 1.7966125011444092, "learning_rate": 4.136728438271768e-05, "loss": 0.619, "step": 17800 }, { "ar_loss": 0.4493, "epoch": 2.445673889657292, "fm_loss": 0.1651, "grad_norm": 2.5523791313171387, "learning_rate": 4.127576389572488e-05, "loss": 0.6144, "step": 17900 }, { "ar_loss": 0.4501, "epoch": 2.4610419548178886, "fm_loss": 0.1647, "grad_norm": 2.2898178100585938, "learning_rate": 4.1183863254247655e-05, "loss": 0.6148, "step": 18000 }, { "ar_loss": 0.4509, "epoch": 2.4764100199784846, "fm_loss": 0.1635, "grad_norm": 2.085273504257202, "learning_rate": 4.109158460481758e-05, "loss": 0.6144, "step": 18100 }, { "ar_loss": 0.4515, "epoch": 2.491778085139081, "fm_loss": 0.1651, "grad_norm": 2.7309823036193848, "learning_rate": 4.0998930102795377e-05, "loss": 0.6166, "step": 18200 }, { "ar_loss": 0.4491, "epoch": 2.5071461502996772, "fm_loss": 0.1645, "grad_norm": 1.9515680074691772, "learning_rate": 4.090590191232061e-05, "loss": 0.6137, "step": 18300 }, { "ar_loss": 0.4541, "epoch": 2.5225142154602738, "fm_loss": 0.1639, "grad_norm": 3.2586522102355957, "learning_rate": 4.0812502206261096e-05, "loss": 0.618, "step": 18400 }, { "ar_loss": 0.4463, "epoch": 2.53788228062087, "fm_loss": 0.1645, "grad_norm": 3.2903804779052734, "learning_rate": 4.071873316616219e-05, "loss": 0.6108, "step": 18500 }, { "ar_loss": 0.4465, "epoch": 2.553250345781466, "fm_loss": 0.1659, "grad_norm": 3.0805041790008545, "learning_rate": 4.062459698219583e-05, "loss": 0.6124, "step": 18600 }, { "ar_loss": 0.4476, "epoch": 2.5686184109420624, "fm_loss": 0.1643, "grad_norm": 10.558906555175781, "learning_rate": 4.053009585310933e-05, "loss": 0.6119, "step": 18700 }, { "ar_loss": 0.4461, "epoch": 2.5839864761026585, "fm_loss": 0.164, "grad_norm": 1.7276713848114014, "learning_rate": 4.04352319861741e-05, "loss": 0.6102, "step": 18800 }, { "ar_loss": 0.4486, "epoch": 2.599354541263255, "fm_loss": 0.1655, "grad_norm": 2.4759597778320312, "learning_rate": 4.034000759713401e-05, "loss": 0.6142, "step": 18900 }, { "ar_loss": 0.4461, "epoch": 2.614722606423851, "fm_loss": 0.164, "grad_norm": 1.5683785676956177, "learning_rate": 4.024442491015372e-05, "loss": 0.6101, "step": 19000 }, { "ar_loss": 0.4495, "epoch": 2.6300906715844476, "fm_loss": 0.1637, "grad_norm": 3.866807460784912, "learning_rate": 4.014848615776666e-05, "loss": 0.6132, "step": 19100 }, { "ar_loss": 0.447, "epoch": 2.6454587367450437, "fm_loss": 0.1635, "grad_norm": 1.7105318307876587, "learning_rate": 4.00521935808229e-05, "loss": 0.6105, "step": 19200 }, { "ar_loss": 0.4454, "epoch": 2.66082680190564, "fm_loss": 0.1634, "grad_norm": 2.6716930866241455, "learning_rate": 3.995554942843687e-05, "loss": 0.6088, "step": 19300 }, { "ar_loss": 0.4491, "epoch": 2.6761948670662363, "fm_loss": 0.1647, "grad_norm": 1.8674702644348145, "learning_rate": 3.9858555957934715e-05, "loss": 0.6137, "step": 19400 }, { "ar_loss": 0.445, "epoch": 2.691562932226833, "fm_loss": 0.1631, "grad_norm": 2.926995277404785, "learning_rate": 3.976121543480169e-05, "loss": 0.6081, "step": 19500 }, { "ar_loss": 0.4416, "epoch": 2.706930997387429, "fm_loss": 0.1641, "grad_norm": 2.341655731201172, "learning_rate": 3.966353013262917e-05, "loss": 0.6057, "step": 19600 }, { "ar_loss": 0.4406, "epoch": 2.722299062548025, "fm_loss": 0.1648, "grad_norm": 1.6259019374847412, "learning_rate": 3.956550233306155e-05, "loss": 0.6054, "step": 19700 }, { "ar_loss": 0.4439, "epoch": 2.7376671277086215, "fm_loss": 0.1626, "grad_norm": 3.2317330837249756, "learning_rate": 3.946713432574299e-05, "loss": 0.6065, "step": 19800 }, { "ar_loss": 0.4445, "epoch": 2.7530351928692176, "fm_loss": 0.1636, "grad_norm": 2.053748369216919, "learning_rate": 3.936842840826391e-05, "loss": 0.6081, "step": 19900 }, { "ar_loss": 0.444, "epoch": 2.768403258029814, "fm_loss": 0.1635, "grad_norm": 3.10992431640625, "learning_rate": 3.9269386886107304e-05, "loss": 0.6076, "step": 20000 }, { "ar_loss": 0.4431, "epoch": 2.78377132319041, "fm_loss": 0.1634, "grad_norm": 1.6531990766525269, "learning_rate": 3.9170012072594944e-05, "loss": 0.6065, "step": 20100 }, { "ar_loss": 0.4413, "epoch": 2.7991393883510067, "fm_loss": 0.1642, "grad_norm": 2.239360809326172, "learning_rate": 3.90703062888333e-05, "loss": 0.6055, "step": 20200 }, { "ar_loss": 0.4411, "epoch": 2.814507453511603, "fm_loss": 0.1635, "grad_norm": 2.6573100090026855, "learning_rate": 3.8970271863659366e-05, "loss": 0.6046, "step": 20300 }, { "ar_loss": 0.4388, "epoch": 2.8298755186721993, "fm_loss": 0.1639, "grad_norm": 2.9059882164001465, "learning_rate": 3.886991113358621e-05, "loss": 0.6027, "step": 20400 }, { "ar_loss": 0.4413, "epoch": 2.8452435838327954, "fm_loss": 0.1647, "grad_norm": 1.4836231470108032, "learning_rate": 3.876922644274847e-05, "loss": 0.6061, "step": 20500 }, { "ar_loss": 0.4394, "epoch": 2.860611648993392, "fm_loss": 0.1639, "grad_norm": 3.667249917984009, "learning_rate": 3.866822014284753e-05, "loss": 0.6033, "step": 20600 }, { "ar_loss": 0.437, "epoch": 2.875979714153988, "fm_loss": 0.1631, "grad_norm": 2.724766731262207, "learning_rate": 3.8566894593096646e-05, "loss": 0.6001, "step": 20700 }, { "ar_loss": 0.4408, "epoch": 2.891347779314584, "fm_loss": 0.1634, "grad_norm": 3.302048683166504, "learning_rate": 3.846525216016581e-05, "loss": 0.6043, "step": 20800 }, { "ar_loss": 0.4374, "epoch": 2.9067158444751806, "fm_loss": 0.1632, "grad_norm": 5.427467346191406, "learning_rate": 3.836329521812651e-05, "loss": 0.6006, "step": 20900 }, { "ar_loss": 0.4404, "epoch": 2.922083909635777, "fm_loss": 0.1632, "grad_norm": 2.2337872982025146, "learning_rate": 3.826102614839621e-05, "loss": 0.6037, "step": 21000 }, { "ar_loss": 0.4384, "epoch": 2.937451974796373, "fm_loss": 0.1629, "grad_norm": 13.053716659545898, "learning_rate": 3.815844733968281e-05, "loss": 0.6014, "step": 21100 }, { "ar_loss": 0.4374, "epoch": 2.9528200399569693, "fm_loss": 0.1623, "grad_norm": 2.655907392501831, "learning_rate": 3.8055561187928776e-05, "loss": 0.5997, "step": 21200 }, { "ar_loss": 0.4344, "epoch": 2.968188105117566, "fm_loss": 0.163, "grad_norm": 4.967069625854492, "learning_rate": 3.795237009625523e-05, "loss": 0.5974, "step": 21300 }, { "ar_loss": 0.4384, "epoch": 2.983556170278162, "fm_loss": 0.1639, "grad_norm": 1.5047295093536377, "learning_rate": 3.784887647490581e-05, "loss": 0.6023, "step": 21400 }, { "ar_loss": 0.435, "epoch": 2.9989242354387584, "fm_loss": 0.1626, "grad_norm": 2.5669796466827393, "learning_rate": 3.774508274119035e-05, "loss": 0.5976, "step": 21500 }, { "ar_loss": 0.4195, "epoch": 3.0142923005993545, "fm_loss": 0.1629, "grad_norm": 2.004462718963623, "learning_rate": 3.764099131942846e-05, "loss": 0.5825, "step": 21600 }, { "ar_loss": 0.4153, "epoch": 3.029660365759951, "fm_loss": 0.1619, "grad_norm": 4.897035121917725, "learning_rate": 3.753660464089285e-05, "loss": 0.5773, "step": 21700 }, { "ar_loss": 0.414, "epoch": 3.045028430920547, "fm_loss": 0.1628, "grad_norm": 5.358692169189453, "learning_rate": 3.743192514375257e-05, "loss": 0.5768, "step": 21800 }, { "ar_loss": 0.417, "epoch": 3.060396496081143, "fm_loss": 0.1624, "grad_norm": 2.0926132202148438, "learning_rate": 3.732695527301609e-05, "loss": 0.5793, "step": 21900 }, { "ar_loss": 0.4168, "epoch": 3.0757645612417397, "fm_loss": 0.1626, "grad_norm": 2.1865603923797607, "learning_rate": 3.722169748047413e-05, "loss": 0.5793, "step": 22000 }, { "ar_loss": 0.4142, "epoch": 3.0911326264023358, "fm_loss": 0.163, "grad_norm": 1.9334361553192139, "learning_rate": 3.711615422464244e-05, "loss": 0.5772, "step": 22100 }, { "ar_loss": 0.4156, "epoch": 3.1065006915629323, "fm_loss": 0.163, "grad_norm": 2.2630386352539062, "learning_rate": 3.701032797070436e-05, "loss": 0.5785, "step": 22200 }, { "ar_loss": 0.4122, "epoch": 3.1218687567235284, "fm_loss": 0.1706, "grad_norm": 1.959439754486084, "learning_rate": 3.690422119045325e-05, "loss": 0.5828, "step": 22300 }, { "ar_loss": 0.4142, "epoch": 3.137236821884125, "fm_loss": 0.1717, "grad_norm": 2.2783379554748535, "learning_rate": 3.6797836362234745e-05, "loss": 0.5859, "step": 22400 }, { "ar_loss": 0.4142, "epoch": 3.152604887044721, "fm_loss": 0.1634, "grad_norm": 2.1327381134033203, "learning_rate": 3.669117597088885e-05, "loss": 0.5776, "step": 22500 }, { "ar_loss": 0.4125, "epoch": 3.1679729522053175, "fm_loss": 0.1665, "grad_norm": 2.315673828125, "learning_rate": 3.658424250769195e-05, "loss": 0.579, "step": 22600 }, { "ar_loss": 0.4148, "epoch": 3.1833410173659136, "fm_loss": 0.1629, "grad_norm": 3.1342201232910156, "learning_rate": 3.647703847029858e-05, "loss": 0.5778, "step": 22700 }, { "ar_loss": 0.4164, "epoch": 3.19870908252651, "fm_loss": 0.1615, "grad_norm": 2.276103973388672, "learning_rate": 3.6369566362683115e-05, "loss": 0.5778, "step": 22800 }, { "ar_loss": 0.415, "epoch": 3.214077147687106, "fm_loss": 0.1634, "grad_norm": 3.8148179054260254, "learning_rate": 3.626182869508124e-05, "loss": 0.5784, "step": 22900 }, { "ar_loss": 0.4135, "epoch": 3.2294452128477023, "fm_loss": 0.163, "grad_norm": 2.029802083969116, "learning_rate": 3.6153827983931395e-05, "loss": 0.5765, "step": 23000 }, { "ar_loss": 0.4108, "epoch": 3.2448132780082988, "fm_loss": 0.1616, "grad_norm": 1.9012216329574585, "learning_rate": 3.6045566751815906e-05, "loss": 0.5724, "step": 23100 }, { "ar_loss": 0.4118, "epoch": 3.260181343168895, "fm_loss": 0.1631, "grad_norm": 1.9454518556594849, "learning_rate": 3.593704752740214e-05, "loss": 0.5749, "step": 23200 }, { "ar_loss": 0.4114, "epoch": 3.2755494083294914, "fm_loss": 0.1646, "grad_norm": 5.499399662017822, "learning_rate": 3.5828272845383395e-05, "loss": 0.576, "step": 23300 }, { "ar_loss": 0.4099, "epoch": 3.2909174734900875, "fm_loss": 0.1622, "grad_norm": 2.5215775966644287, "learning_rate": 3.571924524641973e-05, "loss": 0.5721, "step": 23400 }, { "ar_loss": 0.4132, "epoch": 3.306285538650684, "fm_loss": 0.1619, "grad_norm": 1.9290796518325806, "learning_rate": 3.56099672770786e-05, "loss": 0.575, "step": 23500 }, { "ar_loss": 0.4106, "epoch": 3.32165360381128, "fm_loss": 0.1653, "grad_norm": 3.0975518226623535, "learning_rate": 3.550044148977539e-05, "loss": 0.5759, "step": 23600 }, { "ar_loss": 0.4109, "epoch": 3.3370216689718766, "fm_loss": 0.1625, "grad_norm": 3.079251289367676, "learning_rate": 3.539067044271378e-05, "loss": 0.5734, "step": 23700 }, { "ar_loss": 0.4106, "epoch": 3.3523897341324727, "fm_loss": 0.162, "grad_norm": 3.6910111904144287, "learning_rate": 3.5280656699826016e-05, "loss": 0.5726, "step": 23800 }, { "ar_loss": 0.4081, "epoch": 3.367757799293069, "fm_loss": 0.1616, "grad_norm": 2.8438339233398438, "learning_rate": 3.5170402830713004e-05, "loss": 0.5698, "step": 23900 }, { "ar_loss": 0.4122, "epoch": 3.3831258644536653, "fm_loss": 0.1616, "grad_norm": 5.298975467681885, "learning_rate": 3.505991141058431e-05, "loss": 0.5738, "step": 24000 }, { "ar_loss": 0.4107, "epoch": 3.3984939296142613, "fm_loss": 0.1614, "grad_norm": 2.1549863815307617, "learning_rate": 3.494918502019798e-05, "loss": 0.5721, "step": 24100 }, { "ar_loss": 0.4095, "epoch": 3.413861994774858, "fm_loss": 0.1615, "grad_norm": 2.1645219326019287, "learning_rate": 3.483822624580031e-05, "loss": 0.571, "step": 24200 }, { "ar_loss": 0.4102, "epoch": 3.429230059935454, "fm_loss": 0.1623, "grad_norm": 3.294847249984741, "learning_rate": 3.472703767906539e-05, "loss": 0.5724, "step": 24300 }, { "ar_loss": 0.4116, "epoch": 3.4445981250960505, "fm_loss": 0.1628, "grad_norm": 2.6318018436431885, "learning_rate": 3.461562191703459e-05, "loss": 0.5744, "step": 24400 }, { "ar_loss": 0.4116, "epoch": 3.4599661902566465, "fm_loss": 0.161, "grad_norm": 4.470794677734375, "learning_rate": 3.450398156205592e-05, "loss": 0.5726, "step": 24500 }, { "ar_loss": 0.4101, "epoch": 3.475334255417243, "fm_loss": 0.1609, "grad_norm": 3.786229133605957, "learning_rate": 3.43921192217232e-05, "loss": 0.571, "step": 24600 }, { "ar_loss": 0.4085, "epoch": 3.490702320577839, "fm_loss": 0.1606, "grad_norm": 3.3834242820739746, "learning_rate": 3.42800375088152e-05, "loss": 0.5691, "step": 24700 }, { "ar_loss": 0.4106, "epoch": 3.5060703857384357, "fm_loss": 0.1612, "grad_norm": 2.6395816802978516, "learning_rate": 3.4167739041234595e-05, "loss": 0.5718, "step": 24800 }, { "ar_loss": 0.4095, "epoch": 3.5214384508990317, "fm_loss": 0.161, "grad_norm": 3.1765825748443604, "learning_rate": 3.405522644194682e-05, "loss": 0.5705, "step": 24900 }, { "ar_loss": 0.4098, "epoch": 3.5368065160596283, "fm_loss": 0.1619, "grad_norm": 2.2531938552856445, "learning_rate": 3.3942502338918795e-05, "loss": 0.5716, "step": 25000 }, { "ar_loss": 0.4074, "epoch": 3.5521745812202243, "fm_loss": 0.1624, "grad_norm": 2.6430165767669678, "learning_rate": 3.382956936505755e-05, "loss": 0.5698, "step": 25100 }, { "ar_loss": 0.4096, "epoch": 3.5675426463808204, "fm_loss": 0.1607, "grad_norm": 2.1325323581695557, "learning_rate": 3.371643015814874e-05, "loss": 0.5703, "step": 25200 }, { "ar_loss": 0.4111, "epoch": 3.582910711541417, "fm_loss": 0.1614, "grad_norm": 2.9714601039886475, "learning_rate": 3.360308736079502e-05, "loss": 0.5725, "step": 25300 }, { "ar_loss": 0.4086, "epoch": 3.5982787767020135, "fm_loss": 0.1612, "grad_norm": 2.1280746459960938, "learning_rate": 3.348954362035432e-05, "loss": 0.5698, "step": 25400 }, { "ar_loss": 0.4074, "epoch": 3.6136468418626095, "fm_loss": 0.1608, "grad_norm": 8.301246643066406, "learning_rate": 3.337580158887802e-05, "loss": 0.5681, "step": 25500 }, { "ar_loss": 0.4054, "epoch": 3.6290149070232056, "fm_loss": 0.164, "grad_norm": 2.632568120956421, "learning_rate": 3.326186392304901e-05, "loss": 0.5694, "step": 25600 }, { "ar_loss": 0.4076, "epoch": 3.644382972183802, "fm_loss": 0.1616, "grad_norm": 4.138895511627197, "learning_rate": 3.314773328411962e-05, "loss": 0.5692, "step": 25700 }, { "ar_loss": 0.4059, "epoch": 3.659751037344398, "fm_loss": 0.1616, "grad_norm": 2.5377886295318604, "learning_rate": 3.3033412337849466e-05, "loss": 0.5675, "step": 25800 }, { "ar_loss": 0.4085, "epoch": 3.6751191025049947, "fm_loss": 0.1617, "grad_norm": 2.334913730621338, "learning_rate": 3.2918903754443195e-05, "loss": 0.5702, "step": 25900 }, { "ar_loss": 0.4064, "epoch": 3.690487167665591, "fm_loss": 0.1625, "grad_norm": 1.9625128507614136, "learning_rate": 3.2804210208488114e-05, "loss": 0.5689, "step": 26000 }, { "ar_loss": 0.4057, "epoch": 3.7058552328261873, "fm_loss": 0.1617, "grad_norm": 2.2251503467559814, "learning_rate": 3.268933437889172e-05, "loss": 0.5674, "step": 26100 }, { "ar_loss": 0.4066, "epoch": 3.7212232979867834, "fm_loss": 0.1617, "grad_norm": 3.2878170013427734, "learning_rate": 3.2574278948819105e-05, "loss": 0.5683, "step": 26200 }, { "ar_loss": 0.4097, "epoch": 3.7365913631473795, "fm_loss": 0.1618, "grad_norm": 3.281667947769165, "learning_rate": 3.2459046605630334e-05, "loss": 0.5715, "step": 26300 }, { "ar_loss": 0.406, "epoch": 3.751959428307976, "fm_loss": 0.1607, "grad_norm": 2.7066891193389893, "learning_rate": 3.234364004081763e-05, "loss": 0.5667, "step": 26400 }, { "ar_loss": 0.4041, "epoch": 3.7673274934685725, "fm_loss": 0.1614, "grad_norm": 4.11818790435791, "learning_rate": 3.222806194994253e-05, "loss": 0.5655, "step": 26500 }, { "ar_loss": 0.4039, "epoch": 3.7826955586291686, "fm_loss": 0.1611, "grad_norm": 2.0963382720947266, "learning_rate": 3.211231503257292e-05, "loss": 0.565, "step": 26600 }, { "ar_loss": 0.4039, "epoch": 3.7980636237897647, "fm_loss": 0.1612, "grad_norm": 8.769935607910156, "learning_rate": 3.199640199221998e-05, "loss": 0.5651, "step": 26700 }, { "ar_loss": 0.407, "epoch": 3.813431688950361, "fm_loss": 0.1614, "grad_norm": 4.367818832397461, "learning_rate": 3.188032553627505e-05, "loss": 0.5684, "step": 26800 }, { "ar_loss": 0.4025, "epoch": 3.8287997541109573, "fm_loss": 0.1612, "grad_norm": 3.1772165298461914, "learning_rate": 3.1764088375946355e-05, "loss": 0.5637, "step": 26900 }, { "ar_loss": 0.4051, "epoch": 3.844167819271554, "fm_loss": 0.1606, "grad_norm": 2.3004674911499023, "learning_rate": 3.1647693226195764e-05, "loss": 0.5657, "step": 27000 }, { "ar_loss": 0.4023, "epoch": 3.85953588443215, "fm_loss": 0.1609, "grad_norm": 2.438415050506592, "learning_rate": 3.1531142805675244e-05, "loss": 0.5633, "step": 27100 }, { "ar_loss": 0.405, "epoch": 3.8749039495927464, "fm_loss": 0.1615, "grad_norm": 2.6293303966522217, "learning_rate": 3.141443983666349e-05, "loss": 0.5666, "step": 27200 }, { "ar_loss": 0.4022, "epoch": 3.8902720147533425, "fm_loss": 0.1609, "grad_norm": 3.9701802730560303, "learning_rate": 3.1297587045002265e-05, "loss": 0.5631, "step": 27300 }, { "ar_loss": 0.4008, "epoch": 3.9056400799139386, "fm_loss": 0.1615, "grad_norm": 1.8459994792938232, "learning_rate": 3.118058716003277e-05, "loss": 0.5623, "step": 27400 }, { "ar_loss": 0.4058, "epoch": 3.921008145074535, "fm_loss": 0.1612, "grad_norm": 10.706082344055176, "learning_rate": 3.106344291453185e-05, "loss": 0.567, "step": 27500 }, { "ar_loss": 0.4022, "epoch": 3.9363762102351316, "fm_loss": 0.1609, "grad_norm": 2.411083221435547, "learning_rate": 3.09461570446482e-05, "loss": 0.5632, "step": 27600 }, { "ar_loss": 0.4021, "epoch": 3.9517442753957277, "fm_loss": 0.1609, "grad_norm": 1.9293774366378784, "learning_rate": 3.082873228983847e-05, "loss": 0.563, "step": 27700 }, { "ar_loss": 0.4021, "epoch": 3.967112340556324, "fm_loss": 0.1607, "grad_norm": 2.12009334564209, "learning_rate": 3.071117139280325e-05, "loss": 0.5628, "step": 27800 }, { "ar_loss": 0.4052, "epoch": 3.9824804057169203, "fm_loss": 0.16, "grad_norm": 1.866605281829834, "learning_rate": 3.059347709942299e-05, "loss": 0.5652, "step": 27900 }, { "ar_loss": 0.404, "epoch": 3.9978484708775164, "fm_loss": 0.1614, "grad_norm": 3.483856439590454, "learning_rate": 3.0475652158693912e-05, "loss": 0.5653, "step": 28000 }, { "ar_loss": 0.3897, "epoch": 4.0132165360381125, "fm_loss": 0.1654, "grad_norm": 6.412163734436035, "learning_rate": 3.0357699322663784e-05, "loss": 0.5552, "step": 28100 }, { "ar_loss": 0.3912, "epoch": 4.028584601198709, "fm_loss": 0.1622, "grad_norm": 2.0246286392211914, "learning_rate": 3.023962134636763e-05, "loss": 0.5535, "step": 28200 }, { "ar_loss": 0.3892, "epoch": 4.0439526663593055, "fm_loss": 0.1619, "grad_norm": 1.9690488576889038, "learning_rate": 3.0121420987763393e-05, "loss": 0.5511, "step": 28300 }, { "ar_loss": 0.3891, "epoch": 4.059320731519902, "fm_loss": 0.161, "grad_norm": 2.742201805114746, "learning_rate": 3.0003101007667485e-05, "loss": 0.55, "step": 28400 }, { "ar_loss": 0.3889, "epoch": 4.074688796680498, "fm_loss": 0.1608, "grad_norm": 15.931248664855957, "learning_rate": 2.9884664169690356e-05, "loss": 0.5496, "step": 28500 }, { "ar_loss": 0.3909, "epoch": 4.090056861841094, "fm_loss": 0.16, "grad_norm": 2.156740665435791, "learning_rate": 2.976611324017191e-05, "loss": 0.5509, "step": 28600 }, { "ar_loss": 0.388, "epoch": 4.105424927001691, "fm_loss": 0.1659, "grad_norm": 5.583569049835205, "learning_rate": 2.9647450988116893e-05, "loss": 0.5539, "step": 28700 }, { "ar_loss": 0.3909, "epoch": 4.120792992162286, "fm_loss": 0.1597, "grad_norm": 2.5985569953918457, "learning_rate": 2.9528680185130214e-05, "loss": 0.5505, "step": 28800 }, { "ar_loss": 0.3892, "epoch": 4.136161057322883, "fm_loss": 0.1596, "grad_norm": 3.106172800064087, "learning_rate": 2.9409803605352237e-05, "loss": 0.5488, "step": 28900 }, { "ar_loss": 0.3863, "epoch": 4.151529122483479, "fm_loss": 0.1611, "grad_norm": 2.211129665374756, "learning_rate": 2.929082402539395e-05, "loss": 0.5473, "step": 29000 }, { "ar_loss": 0.39, "epoch": 4.166897187644076, "fm_loss": 0.1602, "grad_norm": 1.949877142906189, "learning_rate": 2.9171744224272113e-05, "loss": 0.5502, "step": 29100 }, { "ar_loss": 0.387, "epoch": 4.1822652528046715, "fm_loss": 0.1646, "grad_norm": 8.61090087890625, "learning_rate": 2.9052566983344388e-05, "loss": 0.5516, "step": 29200 }, { "ar_loss": 0.391, "epoch": 4.197633317965268, "fm_loss": 0.1596, "grad_norm": 2.0502259731292725, "learning_rate": 2.893329508624433e-05, "loss": 0.5506, "step": 29300 }, { "ar_loss": 0.3897, "epoch": 4.213001383125865, "fm_loss": 0.1603, "grad_norm": 2.44155216217041, "learning_rate": 2.8813931318816395e-05, "loss": 0.55, "step": 29400 }, { "ar_loss": 0.387, "epoch": 4.228369448286461, "fm_loss": 0.1616, "grad_norm": 2.004561185836792, "learning_rate": 2.869447846905085e-05, "loss": 0.5487, "step": 29500 }, { "ar_loss": 0.3875, "epoch": 4.243737513447057, "fm_loss": 0.1598, "grad_norm": 5.270979881286621, "learning_rate": 2.8574939327018685e-05, "loss": 0.5473, "step": 29600 }, { "ar_loss": 0.3873, "epoch": 4.259105578607653, "fm_loss": 0.1605, "grad_norm": 6.422144412994385, "learning_rate": 2.8455316684806404e-05, "loss": 0.5478, "step": 29700 }, { "ar_loss": 0.3856, "epoch": 4.27447364376825, "fm_loss": 0.1609, "grad_norm": 9.331077575683594, "learning_rate": 2.833561333645085e-05, "loss": 0.5465, "step": 29800 }, { "ar_loss": 0.3887, "epoch": 4.289841708928845, "fm_loss": 0.1603, "grad_norm": 3.2531440258026123, "learning_rate": 2.8215832077873928e-05, "loss": 0.549, "step": 29900 }, { "ar_loss": 0.3879, "epoch": 4.305209774089442, "fm_loss": 0.1616, "grad_norm": 4.434958457946777, "learning_rate": 2.8095975706817283e-05, "loss": 0.5495, "step": 30000 }, { "ar_loss": 0.3873, "epoch": 4.015372790161415, "fm_loss": 0.16, "grad_norm": 2.223679542541504, "learning_rate": 2.7964958224364322e-05, "loss": 0.5474, "step": 30100 }, { "ar_loss": 0.3861, "epoch": 4.0307455803228285, "fm_loss": 0.1608, "grad_norm": 2.2263758182525635, "learning_rate": 2.7844916800976393e-05, "loss": 0.5468, "step": 30200 }, { "ar_loss": 0.385, "epoch": 4.046118370484243, "fm_loss": 0.1591, "grad_norm": 3.2188937664031982, "learning_rate": 2.772480888770234e-05, "loss": 0.5442, "step": 30300 }, { "ar_loss": 0.3838, "epoch": 4.061491160645657, "fm_loss": 0.1605, "grad_norm": 2.1031410694122314, "learning_rate": 2.7604637291640594e-05, "loss": 0.5443, "step": 30400 }, { "ar_loss": 0.3874, "epoch": 4.076863950807072, "fm_loss": 0.1604, "grad_norm": 3.646202564239502, "learning_rate": 2.748440482137793e-05, "loss": 0.5478, "step": 30500 }, { "ar_loss": 0.3888, "epoch": 4.092236740968485, "fm_loss": 0.16, "grad_norm": 3.0315663814544678, "learning_rate": 2.7364114286923865e-05, "loss": 0.5488, "step": 30600 }, { "ar_loss": 0.386, "epoch": 4.1076095311299, "fm_loss": 0.1597, "grad_norm": 2.8235671520233154, "learning_rate": 2.7243768499644946e-05, "loss": 0.5457, "step": 30700 }, { "ar_loss": 0.3889, "epoch": 4.122982321291314, "fm_loss": 0.161, "grad_norm": 2.2409422397613525, "learning_rate": 2.7123370272199055e-05, "loss": 0.55, "step": 30800 }, { "ar_loss": 0.3876, "epoch": 4.138355111452729, "fm_loss": 0.1612, "grad_norm": 2.19675874710083, "learning_rate": 2.700292241846971e-05, "loss": 0.5488, "step": 30900 }, { "ar_loss": 0.3871, "epoch": 4.153727901614143, "fm_loss": 0.1601, "grad_norm": 2.340182304382324, "learning_rate": 2.6882427753500245e-05, "loss": 0.5471, "step": 31000 }, { "ar_loss": 0.3879, "epoch": 4.169100691775557, "fm_loss": 0.16, "grad_norm": 3.1240203380584717, "learning_rate": 2.676188909342801e-05, "loss": 0.5479, "step": 31100 }, { "ar_loss": 0.3832, "epoch": 4.184473481936972, "fm_loss": 0.1603, "grad_norm": 2.1297569274902344, "learning_rate": 2.664130925541865e-05, "loss": 0.5436, "step": 31200 }, { "ar_loss": 0.3889, "epoch": 4.199846272098386, "fm_loss": 0.1599, "grad_norm": 3.824352264404297, "learning_rate": 2.6520691057600155e-05, "loss": 0.5488, "step": 31300 }, { "ar_loss": 0.388, "epoch": 4.2152190622598, "fm_loss": 0.1605, "grad_norm": 1.8939049243927002, "learning_rate": 2.6400037318997046e-05, "loss": 0.5485, "step": 31400 }, { "ar_loss": 0.3886, "epoch": 4.230591852421214, "fm_loss": 0.1609, "grad_norm": 2.476659059524536, "learning_rate": 2.6279350859464502e-05, "loss": 0.5495, "step": 31500 }, { "ar_loss": 0.3888, "epoch": 4.245964642582629, "fm_loss": 0.1602, "grad_norm": 3.117105007171631, "learning_rate": 2.6158634499622425e-05, "loss": 0.549, "step": 31600 }, { "ar_loss": 0.386, "epoch": 4.261337432744043, "fm_loss": 0.1603, "grad_norm": 2.931809663772583, "learning_rate": 2.6037891060789514e-05, "loss": 0.5464, "step": 31700 }, { "ar_loss": 0.382, "epoch": 4.276710222905457, "fm_loss": 0.1591, "grad_norm": 2.1063570976257324, "learning_rate": 2.5917123364917378e-05, "loss": 0.5411, "step": 31800 }, { "ar_loss": 0.3863, "epoch": 4.292083013066872, "fm_loss": 0.1601, "grad_norm": 4.796857833862305, "learning_rate": 2.5796334234524533e-05, "loss": 0.5463, "step": 31900 }, { "ar_loss": 0.3849, "epoch": 4.307455803228286, "fm_loss": 0.1612, "grad_norm": 2.840190887451172, "learning_rate": 2.567552649263044e-05, "loss": 0.5461, "step": 32000 }, { "ar_loss": 0.3876, "epoch": 4.3228285933897, "fm_loss": 0.1686, "grad_norm": 2.64380145072937, "learning_rate": 2.5554702962689563e-05, "loss": 0.5562, "step": 32100 }, { "ar_loss": 0.3855, "epoch": 4.338201383551114, "fm_loss": 0.1599, "grad_norm": 4.3930535316467285, "learning_rate": 2.5433866468525342e-05, "loss": 0.5453, "step": 32200 }, { "ar_loss": 0.3825, "epoch": 4.353574173712529, "fm_loss": 0.1587, "grad_norm": 2.1165313720703125, "learning_rate": 2.531301983426419e-05, "loss": 0.5412, "step": 32300 }, { "ar_loss": 0.3849, "epoch": 4.3689469638739435, "fm_loss": 0.1601, "grad_norm": 3.832287073135376, "learning_rate": 2.519216588426955e-05, "loss": 0.5449, "step": 32400 }, { "ar_loss": 0.3872, "epoch": 4.384319754035357, "fm_loss": 0.16, "grad_norm": 2.8910598754882812, "learning_rate": 2.507130744307581e-05, "loss": 0.5472, "step": 32500 }, { "ar_loss": 0.3847, "epoch": 4.399692544196772, "fm_loss": 0.1603, "grad_norm": 2.692793607711792, "learning_rate": 2.4950447335322335e-05, "loss": 0.545, "step": 32600 }, { "ar_loss": 0.3831, "epoch": 4.415065334358186, "fm_loss": 0.1595, "grad_norm": 2.35683536529541, "learning_rate": 2.482958838568746e-05, "loss": 0.5426, "step": 32700 }, { "ar_loss": 0.3861, "epoch": 4.4304381245196005, "fm_loss": 0.159, "grad_norm": 1.9129565954208374, "learning_rate": 2.4708733418822427e-05, "loss": 0.5451, "step": 32800 }, { "ar_loss": 0.3868, "epoch": 4.445810914681014, "fm_loss": 0.1599, "grad_norm": 4.045548915863037, "learning_rate": 2.4587885259285396e-05, "loss": 0.5466, "step": 32900 }, { "ar_loss": 0.3834, "epoch": 4.461183704842429, "fm_loss": 0.1601, "grad_norm": 2.0863168239593506, "learning_rate": 2.446704673147544e-05, "loss": 0.5436, "step": 33000 }, { "ar_loss": 0.3837, "epoch": 4.476556495003843, "fm_loss": 0.1596, "grad_norm": 2.455124855041504, "learning_rate": 2.4346220659566513e-05, "loss": 0.5433, "step": 33100 }, { "ar_loss": 0.3841, "epoch": 4.4919292851652575, "fm_loss": 0.1593, "grad_norm": 2.560342311859131, "learning_rate": 2.4225409867441483e-05, "loss": 0.5434, "step": 33200 }, { "ar_loss": 0.3813, "epoch": 4.507302075326672, "fm_loss": 0.1589, "grad_norm": 13.45487117767334, "learning_rate": 2.4104617178626075e-05, "loss": 0.5402, "step": 33300 }, { "ar_loss": 0.3854, "epoch": 4.522674865488086, "fm_loss": 0.1606, "grad_norm": 1.9280191659927368, "learning_rate": 2.3983845416222943e-05, "loss": 0.546, "step": 33400 }, { "ar_loss": 0.3861, "epoch": 4.538047655649501, "fm_loss": 0.1599, "grad_norm": 4.483201503753662, "learning_rate": 2.386309740284562e-05, "loss": 0.5461, "step": 33500 }, { "ar_loss": 0.3837, "epoch": 4.553420445810914, "fm_loss": 0.1597, "grad_norm": 2.269436836242676, "learning_rate": 2.3742375960552628e-05, "loss": 0.5434, "step": 33600 }, { "ar_loss": 0.3845, "epoch": 4.568793235972329, "fm_loss": 0.1595, "grad_norm": 2.5172126293182373, "learning_rate": 2.3621683910781458e-05, "loss": 0.544, "step": 33700 }, { "ar_loss": 0.3859, "epoch": 4.584166026133743, "fm_loss": 0.1586, "grad_norm": 29.834903717041016, "learning_rate": 2.3501024074282665e-05, "loss": 0.5445, "step": 33800 }, { "ar_loss": 0.3864, "epoch": 4.599538816295158, "fm_loss": 0.16, "grad_norm": 2.567058563232422, "learning_rate": 2.3380399271053953e-05, "loss": 0.5464, "step": 33900 }, { "ar_loss": 0.3836, "epoch": 4.614911606456571, "fm_loss": 0.1598, "grad_norm": 2.2282016277313232, "learning_rate": 2.3259812320274206e-05, "loss": 0.5434, "step": 34000 }, { "ar_loss": 0.3875, "epoch": 4.630284396617986, "fm_loss": 0.1595, "grad_norm": 2.367128849029541, "learning_rate": 2.313926604023767e-05, "loss": 0.5469, "step": 34100 }, { "ar_loss": 0.3826, "epoch": 4.645657186779401, "fm_loss": 0.1587, "grad_norm": 2.4427452087402344, "learning_rate": 2.3018763248288043e-05, "loss": 0.5413, "step": 34200 }, { "ar_loss": 0.3856, "epoch": 4.661029976940815, "fm_loss": 0.1597, "grad_norm": 3.8236427307128906, "learning_rate": 2.289830676075265e-05, "loss": 0.5453, "step": 34300 }, { "ar_loss": 0.3857, "epoch": 4.676402767102229, "fm_loss": 0.16, "grad_norm": 3.789658546447754, "learning_rate": 2.2777899392876596e-05, "loss": 0.5457, "step": 34400 }, { "ar_loss": 0.3843, "epoch": 4.691775557263643, "fm_loss": 0.1598, "grad_norm": 7.402520656585693, "learning_rate": 2.265754395875703e-05, "loss": 0.5441, "step": 34500 }, { "ar_loss": 0.3828, "epoch": 4.707148347425058, "fm_loss": 0.1606, "grad_norm": 2.951808452606201, "learning_rate": 2.2537243271277286e-05, "loss": 0.5434, "step": 34600 }, { "ar_loss": 0.3828, "epoch": 4.722521137586472, "fm_loss": 0.1602, "grad_norm": 2.665632724761963, "learning_rate": 2.241700014204121e-05, "loss": 0.543, "step": 34700 }, { "ar_loss": 0.3818, "epoch": 4.737893927747886, "fm_loss": 0.1606, "grad_norm": 2.390002727508545, "learning_rate": 2.2296817381307425e-05, "loss": 0.5423, "step": 34800 }, { "ar_loss": 0.3864, "epoch": 4.753266717909301, "fm_loss": 0.1599, "grad_norm": 5.915409564971924, "learning_rate": 2.2176697797923653e-05, "loss": 0.5463, "step": 34900 }, { "ar_loss": 0.3823, "epoch": 4.768639508070715, "fm_loss": 0.165, "grad_norm": 3.208481550216675, "learning_rate": 2.205664419926106e-05, "loss": 0.5473, "step": 35000 }, { "ar_loss": 0.384, "epoch": 4.784012298232129, "fm_loss": 0.1604, "grad_norm": 2.6016440391540527, "learning_rate": 2.1936659391148682e-05, "loss": 0.5444, "step": 35100 }, { "ar_loss": 0.3839, "epoch": 4.799385088393543, "fm_loss": 0.16, "grad_norm": 7.024256229400635, "learning_rate": 2.1816746177807777e-05, "loss": 0.5438, "step": 35200 }, { "ar_loss": 0.3835, "epoch": 4.814757878554958, "fm_loss": 0.1604, "grad_norm": 2.983032464981079, "learning_rate": 2.169690736178636e-05, "loss": 0.5439, "step": 35300 }, { "ar_loss": 0.3857, "epoch": 4.830130668716372, "fm_loss": 0.1596, "grad_norm": 3.4045064449310303, "learning_rate": 2.1577145743893652e-05, "loss": 0.5453, "step": 35400 }, { "ar_loss": 0.3799, "epoch": 4.845503458877786, "fm_loss": 0.1606, "grad_norm": 3.3932929039001465, "learning_rate": 2.1457464123134654e-05, "loss": 0.5406, "step": 35500 }, { "ar_loss": 0.3815, "epoch": 4.860876249039201, "fm_loss": 0.1618, "grad_norm": 5.131358623504639, "learning_rate": 2.1337865296644693e-05, "loss": 0.5433, "step": 35600 }, { "ar_loss": 0.3839, "epoch": 4.876249039200615, "fm_loss": 0.1597, "grad_norm": 2.781935453414917, "learning_rate": 2.1218352059624125e-05, "loss": 0.5436, "step": 35700 }, { "ar_loss": 0.3836, "epoch": 4.8916218293620295, "fm_loss": 0.1593, "grad_norm": 4.221926689147949, "learning_rate": 2.1098927205272888e-05, "loss": 0.5429, "step": 35800 }, { "ar_loss": 0.3839, "epoch": 4.906994619523443, "fm_loss": 0.1595, "grad_norm": 3.9555509090423584, "learning_rate": 2.0979593524725326e-05, "loss": 0.5434, "step": 35900 }, { "ar_loss": 0.3815, "epoch": 4.922367409684858, "fm_loss": 0.1597, "grad_norm": 2.448594331741333, "learning_rate": 2.0860353806984917e-05, "loss": 0.5412, "step": 36000 }, { "ar_loss": 0.3813, "epoch": 4.937740199846272, "fm_loss": 0.1604, "grad_norm": 2.342075824737549, "learning_rate": 2.074121083885907e-05, "loss": 0.5417, "step": 36100 }, { "ar_loss": 0.3825, "epoch": 4.9531129900076865, "fm_loss": 0.1601, "grad_norm": 7.287113666534424, "learning_rate": 2.0622167404894034e-05, "loss": 0.5426, "step": 36200 }, { "ar_loss": 0.3835, "epoch": 4.9684857801691, "fm_loss": 0.1583, "grad_norm": 15.53018569946289, "learning_rate": 2.0503226287309786e-05, "loss": 0.5418, "step": 36300 }, { "ar_loss": 0.3832, "epoch": 4.983858570330515, "fm_loss": 0.1783, "grad_norm": 36.44465637207031, "learning_rate": 2.0384390265935027e-05, "loss": 0.5614, "step": 36400 }, { "ar_loss": 0.382, "epoch": 4.99923136049193, "fm_loss": 0.1595, "grad_norm": 4.1146745681762695, "learning_rate": 2.02656621181422e-05, "loss": 0.5415, "step": 36500 }, { "ar_loss": 0.3779, "epoch": 5.014604150653343, "fm_loss": 0.1594, "grad_norm": 2.2293853759765625, "learning_rate": 2.0147044618782585e-05, "loss": 0.5373, "step": 36600 }, { "ar_loss": 0.3786, "epoch": 5.029976940814758, "fm_loss": 0.1599, "grad_norm": 3.3400254249572754, "learning_rate": 2.0028540540121444e-05, "loss": 0.5386, "step": 36700 }, { "ar_loss": 0.3761, "epoch": 5.045349730976172, "fm_loss": 0.1594, "grad_norm": 2.30407452583313, "learning_rate": 1.9910152651773235e-05, "loss": 0.5355, "step": 36800 }, { "ar_loss": 0.3785, "epoch": 5.060722521137587, "fm_loss": 0.1592, "grad_norm": 2.3268890380859375, "learning_rate": 1.9791883720636864e-05, "loss": 0.5376, "step": 36900 }, { "ar_loss": 0.3771, "epoch": 5.076095311299, "fm_loss": 0.1583, "grad_norm": 4.636923789978027, "learning_rate": 1.967373651083106e-05, "loss": 0.5354, "step": 37000 }, { "ar_loss": 0.3741, "epoch": 5.091468101460415, "fm_loss": 0.1595, "grad_norm": 8.473295211791992, "learning_rate": 1.9555713783629714e-05, "loss": 0.5336, "step": 37100 }, { "ar_loss": 0.3763, "epoch": 5.10684089162183, "fm_loss": 0.1588, "grad_norm": 2.227837562561035, "learning_rate": 1.9437818297397403e-05, "loss": 0.5351, "step": 37200 }, { "ar_loss": 0.3778, "epoch": 5.1222136817832435, "fm_loss": 0.1597, "grad_norm": 2.5238685607910156, "learning_rate": 1.9320052807524874e-05, "loss": 0.5376, "step": 37300 }, { "ar_loss": 0.3799, "epoch": 5.137586471944658, "fm_loss": 0.16, "grad_norm": 3.3944637775421143, "learning_rate": 1.9202420066364678e-05, "loss": 0.5399, "step": 37400 }, { "ar_loss": 0.3767, "epoch": 5.152959262106072, "fm_loss": 0.1584, "grad_norm": 16.466928482055664, "learning_rate": 1.908492282316683e-05, "loss": 0.5351, "step": 37500 }, { "ar_loss": 0.3743, "epoch": 5.168332052267487, "fm_loss": 0.1591, "grad_norm": 2.3075056076049805, "learning_rate": 1.8967563824014563e-05, "loss": 0.5334, "step": 37600 }, { "ar_loss": 0.3821, "epoch": 5.1837048424289005, "fm_loss": 0.1595, "grad_norm": 2.2367005348205566, "learning_rate": 1.8850345811760152e-05, "loss": 0.5415, "step": 37700 }, { "ar_loss": 0.3746, "epoch": 5.199077632590315, "fm_loss": 0.1596, "grad_norm": 1.8881235122680664, "learning_rate": 1.873327152596077e-05, "loss": 0.5342, "step": 37800 }, { "ar_loss": 0.3776, "epoch": 5.21445042275173, "fm_loss": 0.1589, "grad_norm": 2.585137128829956, "learning_rate": 1.861634370281453e-05, "loss": 0.5365, "step": 37900 }, { "ar_loss": 0.3767, "epoch": 5.229823212913144, "fm_loss": 0.1594, "grad_norm": 3.3256664276123047, "learning_rate": 1.849956507509647e-05, "loss": 0.5361, "step": 38000 }, { "ar_loss": 0.3767, "epoch": 5.245196003074558, "fm_loss": 0.1601, "grad_norm": 2.189387559890747, "learning_rate": 1.838293837209472e-05, "loss": 0.5368, "step": 38100 }, { "ar_loss": 0.3765, "epoch": 5.260568793235972, "fm_loss": 0.1638, "grad_norm": 2.6017394065856934, "learning_rate": 1.8266466319546712e-05, "loss": 0.5403, "step": 38200 }, { "ar_loss": 0.3755, "epoch": 5.275941583397387, "fm_loss": 0.1598, "grad_norm": 3.207014322280884, "learning_rate": 1.8150151639575466e-05, "loss": 0.5354, "step": 38300 }, { "ar_loss": 0.3772, "epoch": 5.291314373558801, "fm_loss": 0.1589, "grad_norm": 3.0959692001342773, "learning_rate": 1.8033997050625966e-05, "loss": 0.536, "step": 38400 }, { "ar_loss": 0.3742, "epoch": 5.306687163720215, "fm_loss": 0.1587, "grad_norm": 2.948103666305542, "learning_rate": 1.791800526740165e-05, "loss": 0.5329, "step": 38500 }, { "ar_loss": 0.3768, "epoch": 5.322059953881629, "fm_loss": 0.1633, "grad_norm": 3.3506641387939453, "learning_rate": 1.7802179000800927e-05, "loss": 0.5401, "step": 38600 }, { "ar_loss": 0.3748, "epoch": 5.337432744043044, "fm_loss": 0.1593, "grad_norm": 1.7779144048690796, "learning_rate": 1.768652095785385e-05, "loss": 0.5342, "step": 38700 }, { "ar_loss": 0.3771, "epoch": 5.3528055342044585, "fm_loss": 0.1593, "grad_norm": 3.081190347671509, "learning_rate": 1.7571033841658844e-05, "loss": 0.5365, "step": 38800 }, { "ar_loss": 0.3769, "epoch": 5.368178324365872, "fm_loss": 0.159, "grad_norm": 2.642840623855591, "learning_rate": 1.7455720351319516e-05, "loss": 0.5359, "step": 38900 }, { "ar_loss": 0.3759, "epoch": 5.383551114527287, "fm_loss": 0.1596, "grad_norm": 3.235140323638916, "learning_rate": 1.734058318188158e-05, "loss": 0.5355, "step": 39000 }, { "ar_loss": 0.3728, "epoch": 5.398923904688701, "fm_loss": 0.1596, "grad_norm": 2.3143417835235596, "learning_rate": 1.7225625024269877e-05, "loss": 0.5324, "step": 39100 }, { "ar_loss": 0.3762, "epoch": 5.414296694850115, "fm_loss": 0.1592, "grad_norm": 2.0119452476501465, "learning_rate": 1.711084856522548e-05, "loss": 0.5354, "step": 39200 }, { "ar_loss": 0.3725, "epoch": 5.429669485011529, "fm_loss": 0.1596, "grad_norm": 2.9905428886413574, "learning_rate": 1.6996256487242894e-05, "loss": 0.5321, "step": 39300 }, { "ar_loss": 0.3746, "epoch": 5.445042275172944, "fm_loss": 0.1635, "grad_norm": 2.617199659347534, "learning_rate": 1.6881851468507358e-05, "loss": 0.5382, "step": 39400 }, { "ar_loss": 0.3746, "epoch": 5.460415065334358, "fm_loss": 0.1599, "grad_norm": 2.1979682445526123, "learning_rate": 1.6767636182832292e-05, "loss": 0.5345, "step": 39500 }, { "ar_loss": 0.3768, "epoch": 5.475787855495772, "fm_loss": 0.1586, "grad_norm": 3.3658335208892822, "learning_rate": 1.6653613299596748e-05, "loss": 0.5354, "step": 39600 }, { "ar_loss": 0.3753, "epoch": 5.491160645657187, "fm_loss": 0.1583, "grad_norm": 3.347707748413086, "learning_rate": 1.6539785483683063e-05, "loss": 0.5336, "step": 39700 }, { "ar_loss": 0.3786, "epoch": 5.506533435818601, "fm_loss": 0.1625, "grad_norm": 2.0319979190826416, "learning_rate": 1.6426155395414555e-05, "loss": 0.5411, "step": 39800 }, { "ar_loss": 0.3765, "epoch": 5.521906225980016, "fm_loss": 0.16, "grad_norm": 2.5716593265533447, "learning_rate": 1.631272569049336e-05, "loss": 0.5365, "step": 39900 }, { "ar_loss": 0.3767, "epoch": 5.537279016141429, "fm_loss": 0.1596, "grad_norm": 2.94095516204834, "learning_rate": 1.6199499019938363e-05, "loss": 0.5364, "step": 40000 }, { "ar_loss": 0.3755, "epoch": 6.015372790161415, "fm_loss": 0.1588, "grad_norm": 2.66264271736145, "learning_rate": 1.6086478030023248e-05, "loss": 0.5343, "step": 40100 }, { "ar_loss": 0.3771, "epoch": 6.0307455803228285, "fm_loss": 0.1577, "grad_norm": 3.340590476989746, "learning_rate": 1.597366536221462e-05, "loss": 0.5349, "step": 40200 }, { "ar_loss": 0.3748, "epoch": 6.046118370484243, "fm_loss": 0.1597, "grad_norm": 2.351405620574951, "learning_rate": 1.5861063653110292e-05, "loss": 0.5345, "step": 40300 }, { "ar_loss": 0.3747, "epoch": 6.061491160645657, "fm_loss": 0.1608, "grad_norm": 3.0236189365386963, "learning_rate": 1.5748675534377683e-05, "loss": 0.5355, "step": 40400 }, { "ar_loss": 0.3763, "epoch": 6.076863950807072, "fm_loss": 0.1588, "grad_norm": 2.360391139984131, "learning_rate": 1.563650363269227e-05, "loss": 0.5351, "step": 40500 }, { "ar_loss": 0.377, "epoch": 6.092236740968485, "fm_loss": 0.1611, "grad_norm": 2.1837239265441895, "learning_rate": 1.5524550569676224e-05, "loss": 0.5381, "step": 40600 }, { "ar_loss": 0.376, "epoch": 6.1076095311299, "fm_loss": 0.1585, "grad_norm": 3.4195451736450195, "learning_rate": 1.541281896183715e-05, "loss": 0.5346, "step": 40700 }, { "ar_loss": 0.3785, "epoch": 6.122982321291314, "fm_loss": 0.1592, "grad_norm": 3.0176382064819336, "learning_rate": 1.5301311420506897e-05, "loss": 0.5377, "step": 40800 }, { "ar_loss": 0.3735, "epoch": 6.138355111452729, "fm_loss": 0.1588, "grad_norm": 3.1062843799591064, "learning_rate": 1.5190030551780564e-05, "loss": 0.5323, "step": 40900 }, { "ar_loss": 0.374, "epoch": 6.153727901614143, "fm_loss": 0.1592, "grad_norm": 2.2580771446228027, "learning_rate": 1.5078978956455581e-05, "loss": 0.5332, "step": 41000 }, { "ar_loss": 0.3748, "epoch": 6.169100691775557, "fm_loss": 0.1584, "grad_norm": 3.3398499488830566, "learning_rate": 1.4968159229970914e-05, "loss": 0.5333, "step": 41100 }, { "ar_loss": 0.377, "epoch": 6.184473481936972, "fm_loss": 0.1581, "grad_norm": 4.929275989532471, "learning_rate": 1.4857573962346411e-05, "loss": 0.5351, "step": 41200 }, { "ar_loss": 0.3762, "epoch": 6.199846272098386, "fm_loss": 0.16, "grad_norm": 1.9189740419387817, "learning_rate": 1.4747225738122278e-05, "loss": 0.5362, "step": 41300 }, { "ar_loss": 0.3733, "epoch": 6.2152190622598, "fm_loss": 0.1599, "grad_norm": 2.381868600845337, "learning_rate": 1.4637117136298673e-05, "loss": 0.5333, "step": 41400 }, { "ar_loss": 0.3741, "epoch": 6.230591852421214, "fm_loss": 0.1591, "grad_norm": 4.60919713973999, "learning_rate": 1.452725073027541e-05, "loss": 0.5332, "step": 41500 }, { "ar_loss": 0.3768, "epoch": 6.245964642582629, "fm_loss": 0.1579, "grad_norm": 8.782063484191895, "learning_rate": 1.4417629087791868e-05, "loss": 0.5347, "step": 41600 }, { "ar_loss": 0.3755, "epoch": 6.261337432744043, "fm_loss": 0.1589, "grad_norm": 3.957836627960205, "learning_rate": 1.4308254770866886e-05, "loss": 0.5344, "step": 41700 }, { "ar_loss": 0.3779, "epoch": 6.276710222905457, "fm_loss": 0.1578, "grad_norm": 4.637214183807373, "learning_rate": 1.4199130335738981e-05, "loss": 0.5357, "step": 41800 }, { "ar_loss": 0.3745, "epoch": 6.292083013066872, "fm_loss": 0.1589, "grad_norm": 4.1944804191589355, "learning_rate": 1.409025833280655e-05, "loss": 0.5334, "step": 41900 }, { "ar_loss": 0.3738, "epoch": 6.307455803228286, "fm_loss": 0.1589, "grad_norm": 3.7318429946899414, "learning_rate": 1.3981641306568299e-05, "loss": 0.5328, "step": 42000 }, { "ar_loss": 0.3754, "epoch": 6.3228285933897, "fm_loss": 0.16, "grad_norm": 1.8556092977523804, "learning_rate": 1.3873281795563737e-05, "loss": 0.5355, "step": 42100 }, { "ar_loss": 0.3718, "epoch": 6.338201383551114, "fm_loss": 0.1584, "grad_norm": 3.1865854263305664, "learning_rate": 1.3765182332313859e-05, "loss": 0.5302, "step": 42200 }, { "ar_loss": 0.3778, "epoch": 6.353574173712529, "fm_loss": 0.1593, "grad_norm": 2.2085890769958496, "learning_rate": 1.3657345443261967e-05, "loss": 0.5371, "step": 42300 }, { "ar_loss": 0.377, "epoch": 6.3689469638739435, "fm_loss": 0.1592, "grad_norm": 4.625204563140869, "learning_rate": 1.3549773648714631e-05, "loss": 0.5363, "step": 42400 }, { "ar_loss": 0.3775, "epoch": 6.384319754035357, "fm_loss": 0.1581, "grad_norm": 2.3030028343200684, "learning_rate": 1.3442469462782741e-05, "loss": 0.5356, "step": 42500 }, { "ar_loss": 0.3768, "epoch": 6.399692544196772, "fm_loss": 0.1591, "grad_norm": 6.936190128326416, "learning_rate": 1.3335435393322826e-05, "loss": 0.5359, "step": 42600 }, { "ar_loss": 0.374, "epoch": 6.415065334358186, "fm_loss": 0.1591, "grad_norm": 3.345165252685547, "learning_rate": 1.322867394187836e-05, "loss": 0.5331, "step": 42700 }, { "ar_loss": 0.377, "epoch": 6.4304381245196005, "fm_loss": 0.1595, "grad_norm": 3.3900644779205322, "learning_rate": 1.3122187603621356e-05, "loss": 0.5366, "step": 42800 }, { "ar_loss": 0.3728, "epoch": 6.445810914681014, "fm_loss": 0.1582, "grad_norm": 2.5638771057128906, "learning_rate": 1.3015978867293996e-05, "loss": 0.531, "step": 42900 }, { "ar_loss": 0.377, "epoch": 6.461183704842429, "fm_loss": 0.1588, "grad_norm": 2.6694748401641846, "learning_rate": 1.2910050215150526e-05, "loss": 0.5358, "step": 43000 }, { "ar_loss": 0.3739, "epoch": 6.476556495003843, "fm_loss": 0.1593, "grad_norm": 4.689158916473389, "learning_rate": 1.2804404122899197e-05, "loss": 0.5332, "step": 43100 }, { "ar_loss": 0.3763, "epoch": 6.4919292851652575, "fm_loss": 0.1589, "grad_norm": 2.5251529216766357, "learning_rate": 1.2699043059644444e-05, "loss": 0.5352, "step": 43200 }, { "ar_loss": 0.3754, "epoch": 6.507302075326672, "fm_loss": 0.1582, "grad_norm": 2.896928548812866, "learning_rate": 1.2593969487829133e-05, "loss": 0.5337, "step": 43300 }, { "ar_loss": 0.3782, "epoch": 6.522674865488086, "fm_loss": 0.1589, "grad_norm": 8.51708698272705, "learning_rate": 1.2489185863177032e-05, "loss": 0.5371, "step": 43400 }, { "ar_loss": 0.3763, "epoch": 6.538047655649501, "fm_loss": 0.1584, "grad_norm": 2.936885356903076, "learning_rate": 1.2384694634635433e-05, "loss": 0.5347, "step": 43500 }, { "ar_loss": 0.3744, "epoch": 6.553420445810914, "fm_loss": 0.1592, "grad_norm": 2.76309871673584, "learning_rate": 1.2280498244317883e-05, "loss": 0.5337, "step": 43600 }, { "ar_loss": 0.3742, "epoch": 6.568793235972329, "fm_loss": 0.1586, "grad_norm": 2.100710868835449, "learning_rate": 1.2176599127447147e-05, "loss": 0.5328, "step": 43700 }, { "ar_loss": 0.3777, "epoch": 6.584166026133743, "fm_loss": 0.1588, "grad_norm": 2.9072883129119873, "learning_rate": 1.2072999712298242e-05, "loss": 0.5365, "step": 43800 }, { "ar_loss": 0.3757, "epoch": 6.599538816295158, "fm_loss": 0.1627, "grad_norm": 2.924342393875122, "learning_rate": 1.196970242014176e-05, "loss": 0.5384, "step": 43900 }, { "ar_loss": 0.3763, "epoch": 6.614911606456571, "fm_loss": 0.1586, "grad_norm": 2.559717893600464, "learning_rate": 1.1866709665187205e-05, "loss": 0.5349, "step": 44000 }, { "ar_loss": 0.3732, "epoch": 6.630284396617986, "fm_loss": 0.1588, "grad_norm": 6.084836006164551, "learning_rate": 1.1764023854526593e-05, "loss": 0.532, "step": 44100 }, { "ar_loss": 0.3753, "epoch": 6.645657186779401, "fm_loss": 0.1588, "grad_norm": 2.4638426303863525, "learning_rate": 1.1661647388078211e-05, "loss": 0.5341, "step": 44200 }, { "ar_loss": 0.3742, "epoch": 6.661029976940815, "fm_loss": 0.1594, "grad_norm": 2.2063241004943848, "learning_rate": 1.1559582658530526e-05, "loss": 0.5335, "step": 44300 }, { "ar_loss": 0.3743, "epoch": 6.676402767102229, "fm_loss": 0.1591, "grad_norm": 2.847790479660034, "learning_rate": 1.1457832051286235e-05, "loss": 0.5333, "step": 44400 }, { "ar_loss": 0.3724, "epoch": 6.691775557263643, "fm_loss": 0.1591, "grad_norm": 5.002224445343018, "learning_rate": 1.1356397944406566e-05, "loss": 0.5315, "step": 44500 }, { "ar_loss": 0.3739, "epoch": 6.707148347425058, "fm_loss": 0.1592, "grad_norm": 2.6647820472717285, "learning_rate": 1.125528270855564e-05, "loss": 0.5331, "step": 44600 }, { "ar_loss": 0.3758, "epoch": 6.722521137586472, "fm_loss": 0.1583, "grad_norm": 2.1632769107818604, "learning_rate": 1.1154488706945104e-05, "loss": 0.5341, "step": 44700 }, { "ar_loss": 0.3761, "epoch": 6.737893927747886, "fm_loss": 0.1589, "grad_norm": 2.196767568588257, "learning_rate": 1.105401829527889e-05, "loss": 0.535, "step": 44800 }, { "ar_loss": 0.3755, "epoch": 6.753266717909301, "fm_loss": 0.1585, "grad_norm": 3.3876090049743652, "learning_rate": 1.0953873821698153e-05, "loss": 0.534, "step": 44900 }, { "ar_loss": 0.375, "epoch": 6.768639508070715, "fm_loss": 0.1587, "grad_norm": 2.2706313133239746, "learning_rate": 1.085405762672639e-05, "loss": 0.5337, "step": 45000 }, { "ar_loss": 0.3739, "epoch": 6.784012298232129, "fm_loss": 0.159, "grad_norm": 2.5273022651672363, "learning_rate": 1.0754572043214773e-05, "loss": 0.5329, "step": 45100 }, { "ar_loss": 0.3734, "epoch": 6.799385088393543, "fm_loss": 0.1596, "grad_norm": 2.579967498779297, "learning_rate": 1.0655419396287578e-05, "loss": 0.533, "step": 45200 }, { "ar_loss": 0.3717, "epoch": 6.814757878554958, "fm_loss": 0.1583, "grad_norm": 36.556278228759766, "learning_rate": 1.0556602003287847e-05, "loss": 0.53, "step": 45300 }, { "ar_loss": 0.3769, "epoch": 6.830130668716372, "fm_loss": 0.1586, "grad_norm": 5.545002460479736, "learning_rate": 1.045812217372327e-05, "loss": 0.5355, "step": 45400 }, { "ar_loss": 0.3737, "epoch": 6.845503458877786, "fm_loss": 0.1587, "grad_norm": 2.081123113632202, "learning_rate": 1.0359982209212178e-05, "loss": 0.5323, "step": 45500 }, { "ar_loss": 0.3722, "epoch": 6.860876249039201, "fm_loss": 0.1591, "grad_norm": 3.176176071166992, "learning_rate": 1.0262184403429739e-05, "loss": 0.5313, "step": 45600 }, { "ar_loss": 0.3785, "epoch": 6.876249039200615, "fm_loss": 0.1584, "grad_norm": 7.945939540863037, "learning_rate": 1.016473104205441e-05, "loss": 0.5369, "step": 45700 }, { "ar_loss": 0.3752, "epoch": 6.8916218293620295, "fm_loss": 0.1589, "grad_norm": 3.0679447650909424, "learning_rate": 1.0067624402714438e-05, "loss": 0.5341, "step": 45800 }, { "ar_loss": 0.3731, "epoch": 6.906994619523443, "fm_loss": 0.159, "grad_norm": 4.075259685516357, "learning_rate": 9.970866754934677e-06, "loss": 0.532, "step": 45900 }, { "ar_loss": 0.3768, "epoch": 6.922367409684858, "fm_loss": 0.1581, "grad_norm": 2.1700098514556885, "learning_rate": 9.874460360083537e-06, "loss": 0.5349, "step": 46000 }, { "ar_loss": 0.3749, "epoch": 6.937740199846272, "fm_loss": 0.1598, "grad_norm": 2.1425936222076416, "learning_rate": 9.778407471320134e-06, "loss": 0.5347, "step": 46100 }, { "ar_loss": 0.3719, "epoch": 6.9531129900076865, "fm_loss": 0.1584, "grad_norm": 7.313533782958984, "learning_rate": 9.682710333541622e-06, "loss": 0.5303, "step": 46200 }, { "ar_loss": 0.3737, "epoch": 6.9684857801691, "fm_loss": 0.1586, "grad_norm": 1.9248658418655396, "learning_rate": 9.587371183330723e-06, "loss": 0.5323, "step": 46300 }, { "ar_loss": 0.3728, "epoch": 6.983858570330515, "fm_loss": 0.1596, "grad_norm": 2.7737770080566406, "learning_rate": 9.492392248903505e-06, "loss": 0.5324, "step": 46400 }, { "ar_loss": 0.3755, "epoch": 6.99923136049193, "fm_loss": 0.1585, "grad_norm": 2.6747500896453857, "learning_rate": 9.397775750057206e-06, "loss": 0.5341, "step": 46500 }, { "ar_loss": 0.3746, "epoch": 7.014604150653343, "fm_loss": 0.1594, "grad_norm": 6.737438678741455, "learning_rate": 9.303523898118444e-06, "loss": 0.534, "step": 46600 }, { "ar_loss": 0.3741, "epoch": 7.029976940814758, "fm_loss": 0.1589, "grad_norm": 3.4723405838012695, "learning_rate": 9.209638895891501e-06, "loss": 0.5331, "step": 46700 }, { "ar_loss": 0.3735, "epoch": 7.045349730976172, "fm_loss": 0.1594, "grad_norm": 6.770826816558838, "learning_rate": 9.116122937606835e-06, "loss": 0.5329, "step": 46800 }, { "ar_loss": 0.3735, "epoch": 7.060722521137587, "fm_loss": 0.1589, "grad_norm": 11.931926727294922, "learning_rate": 9.022978208869808e-06, "loss": 0.5323, "step": 46900 }, { "ar_loss": 0.3734, "epoch": 7.076095311299, "fm_loss": 0.1582, "grad_norm": 1.7415313720703125, "learning_rate": 8.930206886609616e-06, "loss": 0.5316, "step": 47000 }, { "ar_loss": 0.3734, "epoch": 7.091468101460415, "fm_loss": 0.1591, "grad_norm": 3.001337766647339, "learning_rate": 8.837811139028377e-06, "loss": 0.5325, "step": 47100 }, { "ar_loss": 0.3719, "epoch": 7.10684089162183, "fm_loss": 0.1588, "grad_norm": 2.8166637420654297, "learning_rate": 8.745793125550477e-06, "loss": 0.5307, "step": 47200 }, { "ar_loss": 0.3744, "epoch": 7.1222136817832435, "fm_loss": 0.1588, "grad_norm": 10.10988712310791, "learning_rate": 8.654154996772114e-06, "loss": 0.5332, "step": 47300 }, { "ar_loss": 0.3758, "epoch": 7.137586471944658, "fm_loss": 0.1593, "grad_norm": 2.6831390857696533, "learning_rate": 8.562898894411017e-06, "loss": 0.5351, "step": 47400 }, { "ar_loss": 0.371, "epoch": 7.152959262106072, "fm_loss": 0.1592, "grad_norm": 1.959746241569519, "learning_rate": 8.472026951256381e-06, "loss": 0.5302, "step": 47500 }, { "ar_loss": 0.3719, "epoch": 7.168332052267487, "fm_loss": 0.1582, "grad_norm": 2.5866353511810303, "learning_rate": 8.38154129111908e-06, "loss": 0.53, "step": 47600 }, { "ar_loss": 0.3731, "epoch": 7.1837048424289005, "fm_loss": 0.1584, "grad_norm": 2.6503565311431885, "learning_rate": 8.29144402878193e-06, "loss": 0.5314, "step": 47700 }, { "ar_loss": 0.372, "epoch": 7.199077632590315, "fm_loss": 0.1606, "grad_norm": 3.779261589050293, "learning_rate": 8.201737269950355e-06, "loss": 0.5325, "step": 47800 }, { "ar_loss": 0.371, "epoch": 7.21445042275173, "fm_loss": 0.1585, "grad_norm": 2.4449970722198486, "learning_rate": 8.112423111203124e-06, "loss": 0.5295, "step": 47900 }, { "ar_loss": 0.3738, "epoch": 7.229823212913144, "fm_loss": 0.1584, "grad_norm": 2.2168331146240234, "learning_rate": 8.023503639943378e-06, "loss": 0.5322, "step": 48000 }, { "ar_loss": 0.3758, "epoch": 7.245196003074558, "fm_loss": 0.158, "grad_norm": 2.078047275543213, "learning_rate": 7.934980934349811e-06, "loss": 0.5338, "step": 48100 }, { "ar_loss": 0.3757, "epoch": 7.260568793235972, "fm_loss": 0.1583, "grad_norm": 2.491028308868408, "learning_rate": 7.846857063328152e-06, "loss": 0.534, "step": 48200 }, { "ar_loss": 0.3748, "epoch": 7.275941583397387, "fm_loss": 0.1584, "grad_norm": 3.340336799621582, "learning_rate": 7.759134086462753e-06, "loss": 0.5332, "step": 48300 }, { "ar_loss": 0.3735, "epoch": 7.291314373558801, "fm_loss": 0.1584, "grad_norm": 3.3799524307250977, "learning_rate": 7.671814053968484e-06, "loss": 0.5319, "step": 48400 }, { "ar_loss": 0.3743, "epoch": 7.306687163720215, "fm_loss": 0.1587, "grad_norm": 3.1465113162994385, "learning_rate": 7.58489900664282e-06, "loss": 0.533, "step": 48500 }, { "ar_loss": 0.3723, "epoch": 7.322059953881629, "fm_loss": 0.1613, "grad_norm": 3.0010464191436768, "learning_rate": 7.49839097581814e-06, "loss": 0.5336, "step": 48600 }, { "ar_loss": 0.3727, "epoch": 7.337432744043044, "fm_loss": 0.1608, "grad_norm": 8.70785903930664, "learning_rate": 7.412291983314237e-06, "loss": 0.5335, "step": 48700 }, { "ar_loss": 0.3727, "epoch": 7.3528055342044585, "fm_loss": 0.158, "grad_norm": 3.002357244491577, "learning_rate": 7.326604041391089e-06, "loss": 0.5307, "step": 48800 }, { "ar_loss": 0.3715, "epoch": 7.368178324365872, "fm_loss": 0.1585, "grad_norm": 2.6171886920928955, "learning_rate": 7.241329152701812e-06, "loss": 0.53, "step": 48900 }, { "ar_loss": 0.3744, "epoch": 7.383551114527287, "fm_loss": 0.1583, "grad_norm": 2.283306360244751, "learning_rate": 7.156469310245864e-06, "loss": 0.5326, "step": 49000 }, { "ar_loss": 0.3744, "epoch": 7.398923904688701, "fm_loss": 0.1595, "grad_norm": 6.2937331199646, "learning_rate": 7.07202649732246e-06, "loss": 0.5338, "step": 49100 }, { "ar_loss": 0.3744, "epoch": 7.414296694850115, "fm_loss": 0.1601, "grad_norm": 4.580973148345947, "learning_rate": 6.988002687484222e-06, "loss": 0.5345, "step": 49200 }, { "ar_loss": 0.3728, "epoch": 7.429669485011529, "fm_loss": 0.158, "grad_norm": 2.2313263416290283, "learning_rate": 6.904399844491058e-06, "loss": 0.5308, "step": 49300 }, { "ar_loss": 0.3735, "epoch": 7.445042275172944, "fm_loss": 0.1595, "grad_norm": 3.6399624347686768, "learning_rate": 6.821219922264252e-06, "loss": 0.533, "step": 49400 }, { "ar_loss": 0.3752, "epoch": 7.460415065334358, "fm_loss": 0.1585, "grad_norm": 3.077457904815674, "learning_rate": 6.73846486484083e-06, "loss": 0.5337, "step": 49500 }, { "ar_loss": 0.3743, "epoch": 7.475787855495772, "fm_loss": 0.1581, "grad_norm": 2.1495542526245117, "learning_rate": 6.6561366063280784e-06, "loss": 0.5324, "step": 49600 }, { "ar_loss": 0.3727, "epoch": 7.491160645657187, "fm_loss": 0.1584, "grad_norm": 4.248561382293701, "learning_rate": 6.574237070858383e-06, "loss": 0.5311, "step": 49700 }, { "ar_loss": 0.3725, "epoch": 7.506533435818601, "fm_loss": 0.1588, "grad_norm": 1.8635345697402954, "learning_rate": 6.492768172544231e-06, "loss": 0.5314, "step": 49800 }, { "ar_loss": 0.3739, "epoch": 7.521906225980016, "fm_loss": 0.159, "grad_norm": 4.352962970733643, "learning_rate": 6.411731815433492e-06, "loss": 0.5329, "step": 49900 }, { "ar_loss": 0.3744, "epoch": 7.537279016141429, "fm_loss": 0.1597, "grad_norm": 1.9769291877746582, "learning_rate": 6.33112989346491e-06, "loss": 0.5341, "step": 50000 }, { "ar_loss": 0.3716, "epoch": 7.552651806302844, "fm_loss": 0.1728, "grad_norm": 2.1171858310699463, "learning_rate": 6.250964290423847e-06, "loss": 0.5445, "step": 50100 }, { "ar_loss": 0.3727, "epoch": 7.568024596464259, "fm_loss": 0.1592, "grad_norm": 2.2346949577331543, "learning_rate": 6.171236879898243e-06, "loss": 0.5319, "step": 50200 }, { "ar_loss": 0.3733, "epoch": 7.5833973866256725, "fm_loss": 0.1606, "grad_norm": 3.1455023288726807, "learning_rate": 6.091949525234838e-06, "loss": 0.5339, "step": 50300 }, { "ar_loss": 0.3767, "epoch": 7.598770176787087, "fm_loss": 0.1583, "grad_norm": 4.480979919433594, "learning_rate": 6.013104079495621e-06, "loss": 0.535, "step": 50400 }, { "ar_loss": 0.3706, "epoch": 7.614142966948501, "fm_loss": 0.1593, "grad_norm": 3.0612711906433105, "learning_rate": 5.934702385414517e-06, "loss": 0.5299, "step": 50500 }, { "ar_loss": 0.3729, "epoch": 7.629515757109916, "fm_loss": 0.1585, "grad_norm": 2.3492562770843506, "learning_rate": 5.856746275354322e-06, "loss": 0.5313, "step": 50600 }, { "ar_loss": 0.3735, "epoch": 7.6448885472713295, "fm_loss": 0.1585, "grad_norm": 4.781151294708252, "learning_rate": 5.77923757126389e-06, "loss": 0.532, "step": 50700 }, { "ar_loss": 0.3714, "epoch": 7.660261337432744, "fm_loss": 0.159, "grad_norm": 4.14220666885376, "learning_rate": 5.702178084635526e-06, "loss": 0.5304, "step": 50800 }, { "ar_loss": 0.3755, "epoch": 7.675634127594158, "fm_loss": 0.1586, "grad_norm": 2.7182235717773438, "learning_rate": 5.625569616462672e-06, "loss": 0.5341, "step": 50900 }, { "ar_loss": 0.375, "epoch": 7.691006917755573, "fm_loss": 0.1587, "grad_norm": 2.1390786170959473, "learning_rate": 5.549413957197797e-06, "loss": 0.5337, "step": 51000 }, { "ar_loss": 0.3757, "epoch": 7.706379707916987, "fm_loss": 0.1591, "grad_norm": 15.834261894226074, "learning_rate": 5.473712886710569e-06, "loss": 0.5348, "step": 51100 }, { "ar_loss": 0.3752, "epoch": 7.721752498078401, "fm_loss": 0.1587, "grad_norm": 3.1994242668151855, "learning_rate": 5.3984681742462435e-06, "loss": 0.5339, "step": 51200 }, { "ar_loss": 0.3728, "epoch": 7.737125288239816, "fm_loss": 0.1584, "grad_norm": 3.1270108222961426, "learning_rate": 5.323681578384318e-06, "loss": 0.5312, "step": 51300 }, { "ar_loss": 0.3753, "epoch": 7.75249807840123, "fm_loss": 0.1588, "grad_norm": 4.753870964050293, "learning_rate": 5.24935484699744e-06, "loss": 0.5341, "step": 51400 }, { "ar_loss": 0.3751, "epoch": 7.767870868562644, "fm_loss": 0.1591, "grad_norm": 2.4398574829101562, "learning_rate": 5.175489717210532e-06, "loss": 0.5343, "step": 51500 }, { "ar_loss": 0.3723, "epoch": 7.783243658724058, "fm_loss": 0.1603, "grad_norm": 1.8831534385681152, "learning_rate": 5.102087915360229e-06, "loss": 0.5326, "step": 51600 }, { "ar_loss": 0.3731, "epoch": 7.798616448885473, "fm_loss": 0.1579, "grad_norm": 3.3795456886291504, "learning_rate": 5.0291511569544955e-06, "loss": 0.5311, "step": 51700 }, { "ar_loss": 0.3724, "epoch": 7.813989239046887, "fm_loss": 0.159, "grad_norm": 6.143826007843018, "learning_rate": 4.956681146632553e-06, "loss": 0.5314, "step": 51800 }, { "ar_loss": 0.3737, "epoch": 7.829362029208301, "fm_loss": 0.1575, "grad_norm": 3.264896869659424, "learning_rate": 4.884679578125029e-06, "loss": 0.5312, "step": 51900 }, { "ar_loss": 0.3744, "epoch": 7.844734819369716, "fm_loss": 0.1608, "grad_norm": 2.4835152626037598, "learning_rate": 4.813148134214396e-06, "loss": 0.5352, "step": 52000 }, { "ar_loss": 0.3762, "epoch": 7.86010760953113, "fm_loss": 0.1581, "grad_norm": 2.550990581512451, "learning_rate": 4.742088486695604e-06, "loss": 0.5343, "step": 52100 }, { "ar_loss": 0.3731, "epoch": 7.875480399692544, "fm_loss": 0.1575, "grad_norm": 6.491451740264893, "learning_rate": 4.671502296337033e-06, "loss": 0.5307, "step": 52200 }, { "ar_loss": 0.3729, "epoch": 7.890853189853958, "fm_loss": 0.1588, "grad_norm": 3.544994354248047, "learning_rate": 4.60139121284168e-06, "loss": 0.5317, "step": 52300 }, { "ar_loss": 0.3753, "epoch": 7.906225980015373, "fm_loss": 0.1597, "grad_norm": 2.540804386138916, "learning_rate": 4.531756874808585e-06, "loss": 0.535, "step": 52400 }, { "ar_loss": 0.375, "epoch": 7.921598770176787, "fm_loss": 0.1583, "grad_norm": 3.565279722213745, "learning_rate": 4.462600909694559e-06, "loss": 0.5334, "step": 52500 }, { "ar_loss": 0.3729, "epoch": 7.936971560338201, "fm_loss": 0.1593, "grad_norm": 2.623326539993286, "learning_rate": 4.393924933776122e-06, "loss": 0.5321, "step": 52600 }, { "ar_loss": 0.3763, "epoch": 7.952344350499615, "fm_loss": 0.158, "grad_norm": 3.847337007522583, "learning_rate": 4.325730552111754e-06, "loss": 0.5343, "step": 52700 }, { "ar_loss": 0.3734, "epoch": 7.96771714066103, "fm_loss": 0.1577, "grad_norm": 2.1040573120117188, "learning_rate": 4.258019358504359e-06, "loss": 0.5312, "step": 52800 }, { "ar_loss": 0.3727, "epoch": 7.983089930822445, "fm_loss": 0.1585, "grad_norm": 2.5626471042633057, "learning_rate": 4.190792935464033e-06, "loss": 0.5311, "step": 52900 }, { "ar_loss": 0.3735, "epoch": 7.998462720983858, "fm_loss": 0.16, "grad_norm": 2.844808340072632, "learning_rate": 4.124052854171068e-06, "loss": 0.5335, "step": 53000 }, { "ar_loss": 0.3713, "epoch": 8.013835511145272, "fm_loss": 0.1589, "grad_norm": 3.6812970638275146, "learning_rate": 4.057800674439227e-06, "loss": 0.5302, "step": 53100 }, { "ar_loss": 0.3762, "epoch": 8.029208301306687, "fm_loss": 0.158, "grad_norm": 3.114596128463745, "learning_rate": 3.992037944679322e-06, "loss": 0.5342, "step": 53200 }, { "ar_loss": 0.3731, "epoch": 8.044581091468102, "fm_loss": 0.1582, "grad_norm": 7.761219501495361, "learning_rate": 3.926766201862972e-06, "loss": 0.5313, "step": 53300 }, { "ar_loss": 0.3736, "epoch": 8.059953881629516, "fm_loss": 0.1572, "grad_norm": 2.6152584552764893, "learning_rate": 3.861986971486725e-06, "loss": 0.5307, "step": 53400 }, { "ar_loss": 0.3762, "epoch": 8.07532667179093, "fm_loss": 0.1578, "grad_norm": 10.676106452941895, "learning_rate": 3.7977017675363826e-06, "loss": 0.5339, "step": 53500 }, { "ar_loss": 0.3778, "epoch": 8.090699461952344, "fm_loss": 0.1581, "grad_norm": 2.382251024246216, "learning_rate": 3.7339120924516276e-06, "loss": 0.5359, "step": 53600 }, { "ar_loss": 0.3755, "epoch": 8.106072252113758, "fm_loss": 0.1586, "grad_norm": 2.832390308380127, "learning_rate": 3.6706194370909025e-06, "loss": 0.5341, "step": 53700 }, { "ar_loss": 0.3725, "epoch": 8.121445042275173, "fm_loss": 0.1585, "grad_norm": 2.2325046062469482, "learning_rate": 3.6078252806965667e-06, "loss": 0.531, "step": 53800 }, { "ar_loss": 0.3749, "epoch": 8.136817832436588, "fm_loss": 0.1579, "grad_norm": 2.05092716217041, "learning_rate": 3.5455310908603293e-06, "loss": 0.5328, "step": 53900 }, { "ar_loss": 0.3705, "epoch": 8.152190622598, "fm_loss": 0.1623, "grad_norm": 2.234282970428467, "learning_rate": 3.4837383234889498e-06, "loss": 0.5328, "step": 54000 }, { "ar_loss": 0.3727, "epoch": 8.167563412759415, "fm_loss": 0.1582, "grad_norm": 2.087803363800049, "learning_rate": 3.422448422770197e-06, "loss": 0.5309, "step": 54100 }, { "ar_loss": 0.3737, "epoch": 8.18293620292083, "fm_loss": 0.1579, "grad_norm": 2.0325510501861572, "learning_rate": 3.3616628211391193e-06, "loss": 0.5316, "step": 54200 }, { "ar_loss": 0.3734, "epoch": 8.198308993082245, "fm_loss": 0.1578, "grad_norm": 4.768230438232422, "learning_rate": 3.3013829392445434e-06, "loss": 0.5312, "step": 54300 }, { "ar_loss": 0.371, "epoch": 8.21368178324366, "fm_loss": 0.1578, "grad_norm": 9.634552955627441, "learning_rate": 3.2416101859158887e-06, "loss": 0.5289, "step": 54400 }, { "ar_loss": 0.3733, "epoch": 8.229054573405072, "fm_loss": 0.16, "grad_norm": 2.277290105819702, "learning_rate": 3.1823459581302394e-06, "loss": 0.5333, "step": 54500 }, { "ar_loss": 0.3716, "epoch": 8.244427363566487, "fm_loss": 0.158, "grad_norm": 6.1840996742248535, "learning_rate": 3.123591640979681e-06, "loss": 0.5296, "step": 54600 }, { "ar_loss": 0.3735, "epoch": 8.259800153727902, "fm_loss": 0.1592, "grad_norm": 3.6673200130462646, "learning_rate": 3.065348607638946e-06, "loss": 0.5327, "step": 54700 }, { "ar_loss": 0.3722, "epoch": 8.275172943889316, "fm_loss": 0.1583, "grad_norm": 2.28403902053833, "learning_rate": 3.0076182193333053e-06, "loss": 0.5306, "step": 54800 }, { "ar_loss": 0.3706, "epoch": 8.29054573405073, "fm_loss": 0.1588, "grad_norm": 2.709955930709839, "learning_rate": 2.9504018253067673e-06, "loss": 0.5294, "step": 54900 }, { "ar_loss": 0.3732, "epoch": 8.305918524212144, "fm_loss": 0.1583, "grad_norm": 2.438748359680176, "learning_rate": 2.8937007627905354e-06, "loss": 0.5314, "step": 55000 }, { "ar_loss": 0.3735, "epoch": 8.321291314373559, "fm_loss": 0.1589, "grad_norm": 1.9475317001342773, "learning_rate": 2.8375163569717645e-06, "loss": 0.5324, "step": 55100 }, { "ar_loss": 0.375, "epoch": 8.336664104534973, "fm_loss": 0.1587, "grad_norm": 2.430262804031372, "learning_rate": 2.781849920962576e-06, "loss": 0.5337, "step": 55200 }, { "ar_loss": 0.3742, "epoch": 8.352036894696388, "fm_loss": 0.1588, "grad_norm": 3.527625799179077, "learning_rate": 2.726702755769381e-06, "loss": 0.5329, "step": 55300 }, { "ar_loss": 0.3744, "epoch": 8.367409684857801, "fm_loss": 0.1584, "grad_norm": 2.9451231956481934, "learning_rate": 2.6720761502624674e-06, "loss": 0.5328, "step": 55400 }, { "ar_loss": 0.3716, "epoch": 8.382782475019216, "fm_loss": 0.1599, "grad_norm": 4.992424488067627, "learning_rate": 2.6179713811458726e-06, "loss": 0.5315, "step": 55500 }, { "ar_loss": 0.3727, "epoch": 8.39815526518063, "fm_loss": 0.1585, "grad_norm": 4.794875144958496, "learning_rate": 2.564389712927556e-06, "loss": 0.5313, "step": 55600 }, { "ar_loss": 0.3728, "epoch": 8.413528055342045, "fm_loss": 0.1608, "grad_norm": 7.288453102111816, "learning_rate": 2.5113323978898455e-06, "loss": 0.5336, "step": 55700 }, { "ar_loss": 0.3732, "epoch": 8.42890084550346, "fm_loss": 0.1583, "grad_norm": 2.7333781719207764, "learning_rate": 2.458800676060158e-06, "loss": 0.5316, "step": 55800 }, { "ar_loss": 0.3737, "epoch": 8.444273635664873, "fm_loss": 0.1584, "grad_norm": 2.3420047760009766, "learning_rate": 2.406795775182025e-06, "loss": 0.532, "step": 55900 }, { "ar_loss": 0.3742, "epoch": 8.459646425826287, "fm_loss": 0.1583, "grad_norm": 2.3550283908843994, "learning_rate": 2.355318910686394e-06, "loss": 0.5325, "step": 56000 }, { "ar_loss": 0.3755, "epoch": 8.475019215987702, "fm_loss": 0.1609, "grad_norm": 2.22489595413208, "learning_rate": 2.304371285663237e-06, "loss": 0.5364, "step": 56100 }, { "ar_loss": 0.3723, "epoch": 8.490392006149117, "fm_loss": 0.1578, "grad_norm": 12.48835563659668, "learning_rate": 2.2539540908334157e-06, "loss": 0.5301, "step": 56200 }, { "ar_loss": 0.3729, "epoch": 8.50576479631053, "fm_loss": 0.159, "grad_norm": 2.215409994125366, "learning_rate": 2.204068504520859e-06, "loss": 0.5319, "step": 56300 }, { "ar_loss": 0.3739, "epoch": 8.521137586471944, "fm_loss": 0.1578, "grad_norm": 3.4783434867858887, "learning_rate": 2.15471569262502e-06, "loss": 0.5317, "step": 56400 }, { "ar_loss": 0.3721, "epoch": 8.536510376633359, "fm_loss": 0.1576, "grad_norm": 2.0251822471618652, "learning_rate": 2.1058968085936383e-06, "loss": 0.5297, "step": 56500 }, { "ar_loss": 0.374, "epoch": 8.551883166794774, "fm_loss": 0.1583, "grad_norm": 11.012389183044434, "learning_rate": 2.0576129933957715e-06, "loss": 0.5323, "step": 56600 }, { "ar_loss": 0.373, "epoch": 8.567255956956188, "fm_loss": 0.1579, "grad_norm": 2.4049017429351807, "learning_rate": 2.009865375495129e-06, "loss": 0.531, "step": 56700 }, { "ar_loss": 0.3748, "epoch": 8.582628747117601, "fm_loss": 0.1591, "grad_norm": 2.692699909210205, "learning_rate": 1.9626550708237075e-06, "loss": 0.5339, "step": 56800 }, { "ar_loss": 0.3729, "epoch": 8.598001537279016, "fm_loss": 0.1586, "grad_norm": 2.713548183441162, "learning_rate": 1.915983182755696e-06, "loss": 0.5315, "step": 56900 }, { "ar_loss": 0.3711, "epoch": 8.61337432744043, "fm_loss": 0.1575, "grad_norm": 6.727135181427002, "learning_rate": 1.8698508020817045e-06, "loss": 0.5286, "step": 57000 }, { "ar_loss": 0.3742, "epoch": 8.628747117601845, "fm_loss": 0.1583, "grad_norm": 3.071518659591675, "learning_rate": 1.8242590069832617e-06, "loss": 0.5325, "step": 57100 }, { "ar_loss": 0.3712, "epoch": 8.644119907763258, "fm_loss": 0.1584, "grad_norm": 2.538884401321411, "learning_rate": 1.7792088630076086e-06, "loss": 0.5296, "step": 57200 }, { "ar_loss": 0.374, "epoch": 8.659492697924673, "fm_loss": 0.1583, "grad_norm": 2.6885924339294434, "learning_rate": 1.7347014230428144e-06, "loss": 0.5323, "step": 57300 }, { "ar_loss": 0.3721, "epoch": 8.674865488086088, "fm_loss": 0.1579, "grad_norm": 2.660156726837158, "learning_rate": 1.6907377272931485e-06, "loss": 0.53, "step": 57400 }, { "ar_loss": 0.3725, "epoch": 8.690238278247502, "fm_loss": 0.1586, "grad_norm": 2.4190871715545654, "learning_rate": 1.6473188032547854e-06, "loss": 0.5311, "step": 57500 }, { "ar_loss": 0.3711, "epoch": 8.705611068408917, "fm_loss": 0.1581, "grad_norm": 3.2725918292999268, "learning_rate": 1.6044456656917839e-06, "loss": 0.5291, "step": 57600 }, { "ar_loss": 0.3725, "epoch": 8.72098385857033, "fm_loss": 0.1632, "grad_norm": 4.782108783721924, "learning_rate": 1.5621193166123648e-06, "loss": 0.5357, "step": 57700 }, { "ar_loss": 0.3747, "epoch": 8.736356648731745, "fm_loss": 0.158, "grad_norm": 7.434814929962158, "learning_rate": 1.5203407452455076e-06, "loss": 0.5327, "step": 57800 }, { "ar_loss": 0.3704, "epoch": 8.75172943889316, "fm_loss": 0.1581, "grad_norm": 2.1383039951324463, "learning_rate": 1.4791109280178129e-06, "loss": 0.5285, "step": 57900 }, { "ar_loss": 0.3772, "epoch": 8.767102229054574, "fm_loss": 0.158, "grad_norm": 26.415754318237305, "learning_rate": 1.4384308285306959e-06, "loss": 0.5352, "step": 58000 }, { "ar_loss": 0.3721, "epoch": 8.782475019215987, "fm_loss": 0.1575, "grad_norm": 8.601106643676758, "learning_rate": 1.3983013975378574e-06, "loss": 0.5296, "step": 58100 }, { "ar_loss": 0.372, "epoch": 8.797847809377402, "fm_loss": 0.1584, "grad_norm": 2.2945618629455566, "learning_rate": 1.3587235729230764e-06, "loss": 0.5303, "step": 58200 }, { "ar_loss": 0.3747, "epoch": 8.813220599538816, "fm_loss": 0.1584, "grad_norm": 4.096945762634277, "learning_rate": 1.3196982796782636e-06, "loss": 0.5331, "step": 58300 }, { "ar_loss": 0.3752, "epoch": 8.82859338970023, "fm_loss": 0.1599, "grad_norm": 3.189495801925659, "learning_rate": 1.2812264298818737e-06, "loss": 0.5351, "step": 58400 }, { "ar_loss": 0.3725, "epoch": 8.843966179861646, "fm_loss": 0.1587, "grad_norm": 3.499967098236084, "learning_rate": 1.2433089226775662e-06, "loss": 0.5312, "step": 58500 }, { "ar_loss": 0.3716, "epoch": 8.859338970023058, "fm_loss": 0.1591, "grad_norm": 2.2825992107391357, "learning_rate": 1.2059466442532004e-06, "loss": 0.5307, "step": 58600 }, { "ar_loss": 0.372, "epoch": 8.874711760184473, "fm_loss": 0.158, "grad_norm": 3.344238042831421, "learning_rate": 1.1691404678201317e-06, "loss": 0.53, "step": 58700 }, { "ar_loss": 0.372, "epoch": 8.890084550345888, "fm_loss": 0.1588, "grad_norm": 3.1672725677490234, "learning_rate": 1.1328912535927828e-06, "loss": 0.5308, "step": 58800 }, { "ar_loss": 0.3724, "epoch": 8.905457340507303, "fm_loss": 0.1581, "grad_norm": 19.84259605407715, "learning_rate": 1.0971998487685597e-06, "loss": 0.5304, "step": 58900 }, { "ar_loss": 0.3746, "epoch": 8.920830130668715, "fm_loss": 0.1598, "grad_norm": 3.0031962394714355, "learning_rate": 1.0620670875080397e-06, "loss": 0.5344, "step": 59000 }, { "ar_loss": 0.3723, "epoch": 8.93620292083013, "fm_loss": 0.159, "grad_norm": 6.419062614440918, "learning_rate": 1.0274937909154792e-06, "loss": 0.5314, "step": 59100 }, { "ar_loss": 0.3777, "epoch": 8.951575710991545, "fm_loss": 0.1597, "grad_norm": 2.2336511611938477, "learning_rate": 9.934807670196223e-07, "loss": 0.5373, "step": 59200 }, { "ar_loss": 0.3717, "epoch": 8.96694850115296, "fm_loss": 0.1585, "grad_norm": 1.7860270738601685, "learning_rate": 9.600288107548233e-07, "loss": 0.5302, "step": 59300 }, { "ar_loss": 0.3732, "epoch": 8.982321291314374, "fm_loss": 0.1591, "grad_norm": 2.5118651390075684, "learning_rate": 9.271387039424456e-07, "loss": 0.5322, "step": 59400 }, { "ar_loss": 0.3723, "epoch": 8.997694081475787, "fm_loss": 0.1586, "grad_norm": 2.007235288619995, "learning_rate": 8.948112152726285e-07, "loss": 0.5308, "step": 59500 }, { "ar_loss": 0.3726, "epoch": 9.013066871637202, "fm_loss": 0.1579, "grad_norm": 3.087045907974243, "learning_rate": 8.630471002862795e-07, "loss": 0.5305, "step": 59600 }, { "ar_loss": 0.373, "epoch": 9.028439661798616, "fm_loss": 0.1594, "grad_norm": 2.6808998584747314, "learning_rate": 8.318471013574442e-07, "loss": 0.5324, "step": 59700 }, { "ar_loss": 0.3725, "epoch": 9.043812451960031, "fm_loss": 0.1586, "grad_norm": 2.7164993286132812, "learning_rate": 8.01211947675945e-07, "loss": 0.5311, "step": 59800 }, { "ar_loss": 0.3736, "epoch": 9.059185242121446, "fm_loss": 0.1582, "grad_norm": 3.879828929901123, "learning_rate": 7.711423552303366e-07, "loss": 0.5318, "step": 59900 }, { "ar_loss": 0.3727, "epoch": 9.074558032282859, "fm_loss": 0.1574, "grad_norm": 2.1200876235961914, "learning_rate": 7.416390267911827e-07, "loss": 0.5301, "step": 60000 } ], "logging_steps": 100, "max_steps": 65050, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.656055098925268e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }