diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.074558032282859, + "eval_steps": 500, + "global_step": 60000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ar_loss": 0.7918, + "epoch": 0.015039855617386072, + "fm_loss": 2.2779, + "grad_norm": 4.37157678604126, + "learning_rate": 4.9999971366617235e-05, + "loss": 3.0697, + "step": 100 + }, + { + "ar_loss": 0.6852, + "epoch": 0.030079711234772145, + "fm_loss": 0.5703, + "grad_norm": 4.328033447265625, + "learning_rate": 4.999951278658594e-05, + "loss": 1.2555, + "step": 200 + }, + { + "ar_loss": 0.669, + "epoch": 0.04511956685215822, + "fm_loss": 0.3866, + "grad_norm": 3.0744214057922363, + "learning_rate": 4.9998494972632174e-05, + "loss": 1.0556, + "step": 300 + }, + { + "ar_loss": 0.6594, + "epoch": 0.06015942246954429, + "fm_loss": 0.319, + "grad_norm": 2.9869937896728516, + "learning_rate": 4.9996917947524234e-05, + "loss": 0.9783, + "step": 400 + }, + { + "ar_loss": 0.6539, + "epoch": 0.07519927808693036, + "fm_loss": 0.2901, + "grad_norm": 2.676532745361328, + "learning_rate": 4.999478174653984e-05, + "loss": 0.9441, + "step": 500 + }, + { + "ar_loss": 0.6451, + "epoch": 0.09023913370431644, + "fm_loss": 0.271, + "grad_norm": 2.8496103286743164, + "learning_rate": 4.999208641746537e-05, + "loss": 0.9161, + "step": 600 + }, + { + "ar_loss": 0.639, + "epoch": 0.10527898932170252, + "fm_loss": 0.262, + "grad_norm": 3.5984983444213867, + "learning_rate": 4.998883202059478e-05, + "loss": 0.901, + "step": 700 + }, + { + "ar_loss": 0.6368, + "epoch": 0.12031884493908858, + "fm_loss": 0.251, + "grad_norm": 3.3547682762145996, + "learning_rate": 4.998501862872824e-05, + "loss": 0.8878, + "step": 800 + }, + { + "ar_loss": 0.6325, + "epoch": 0.13535870055647467, + "fm_loss": 0.2422, + "grad_norm": 3.7177250385284424, + "learning_rate": 4.998064632717054e-05, + "loss": 0.8747, + "step": 900 + }, + { + "ar_loss": 0.6295, + "epoch": 0.15039855617386072, + "fm_loss": 0.2364, + "grad_norm": 2.8715012073516846, + "learning_rate": 4.997571521372918e-05, + "loss": 0.8659, + "step": 1000 + }, + { + "ar_loss": 0.6261, + "epoch": 0.1654384117912468, + "fm_loss": 0.233, + "grad_norm": 2.686033248901367, + "learning_rate": 4.9970225398712116e-05, + "loss": 0.8591, + "step": 1100 + }, + { + "ar_loss": 0.6227, + "epoch": 0.18047826740863288, + "fm_loss": 0.2337, + "grad_norm": 2.735515594482422, + "learning_rate": 4.99641770049254e-05, + "loss": 0.8564, + "step": 1200 + }, + { + "ar_loss": 0.6288, + "epoch": 0.19551812302601895, + "fm_loss": 0.2343, + "grad_norm": 3.192143440246582, + "learning_rate": 4.995757016767032e-05, + "loss": 0.8631, + "step": 1300 + }, + { + "ar_loss": 0.6179, + "epoch": 0.21055797864340503, + "fm_loss": 0.2254, + "grad_norm": 4.529001235961914, + "learning_rate": 4.995040503474049e-05, + "loss": 0.8433, + "step": 1400 + }, + { + "ar_loss": 0.6117, + "epoch": 0.2255978342607911, + "fm_loss": 0.2265, + "grad_norm": 2.0048255920410156, + "learning_rate": 4.994268176641842e-05, + "loss": 0.8382, + "step": 1500 + }, + { + "ar_loss": 0.6197, + "epoch": 0.24063768987817716, + "fm_loss": 0.2174, + "grad_norm": 2.0079429149627686, + "learning_rate": 4.993440053547204e-05, + "loss": 0.8372, + "step": 1600 + }, + { + "ar_loss": 0.6073, + "epoch": 0.25567754549556326, + "fm_loss": 0.2158, + "grad_norm": 2.0703108310699463, + "learning_rate": 4.992556152715076e-05, + "loss": 0.8231, + "step": 1700 + }, + { + "ar_loss": 0.6091, + "epoch": 0.27071740111294934, + "fm_loss": 0.2121, + "grad_norm": 2.3248846530914307, + "learning_rate": 4.991616493918137e-05, + "loss": 0.8213, + "step": 1800 + }, + { + "ar_loss": 0.608, + "epoch": 0.28575725673033536, + "fm_loss": 0.2088, + "grad_norm": 2.5417592525482178, + "learning_rate": 4.99062109817636e-05, + "loss": 0.8167, + "step": 1900 + }, + { + "ar_loss": 0.6097, + "epoch": 0.30079711234772144, + "fm_loss": 0.21, + "grad_norm": 2.219682216644287, + "learning_rate": 4.989569987756542e-05, + "loss": 0.8196, + "step": 2000 + }, + { + "ar_loss": 0.6056, + "epoch": 0.3158369679651075, + "fm_loss": 0.2079, + "grad_norm": 1.9187816381454468, + "learning_rate": 4.988463186171804e-05, + "loss": 0.8136, + "step": 2100 + }, + { + "ar_loss": 0.6064, + "epoch": 0.3308768235824936, + "fm_loss": 0.2072, + "grad_norm": 2.246711492538452, + "learning_rate": 4.987300718181068e-05, + "loss": 0.8136, + "step": 2200 + }, + { + "ar_loss": 0.6015, + "epoch": 0.3459166791998797, + "fm_loss": 0.2044, + "grad_norm": 1.855542540550232, + "learning_rate": 4.986082609788504e-05, + "loss": 0.8059, + "step": 2300 + }, + { + "ar_loss": 0.6, + "epoch": 0.36095653481726575, + "fm_loss": 0.2081, + "grad_norm": 1.883419156074524, + "learning_rate": 4.9848088882429426e-05, + "loss": 0.8082, + "step": 2400 + }, + { + "ar_loss": 0.6048, + "epoch": 0.37599639043465183, + "fm_loss": 0.201, + "grad_norm": 4.141597747802734, + "learning_rate": 4.983479582037272e-05, + "loss": 0.8058, + "step": 2500 + }, + { + "ar_loss": 0.6025, + "epoch": 0.3910362460520379, + "fm_loss": 0.2012, + "grad_norm": 1.3507440090179443, + "learning_rate": 4.9820947209077965e-05, + "loss": 0.8037, + "step": 2600 + }, + { + "ar_loss": 0.5966, + "epoch": 0.406076101669424, + "fm_loss": 0.2103, + "grad_norm": 1.5875388383865356, + "learning_rate": 4.980654335833572e-05, + "loss": 0.8069, + "step": 2700 + }, + { + "ar_loss": 0.5995, + "epoch": 0.42111595728681006, + "fm_loss": 0.1975, + "grad_norm": 2.4850363731384277, + "learning_rate": 4.979158459035715e-05, + "loss": 0.7971, + "step": 2800 + }, + { + "ar_loss": 0.595, + "epoch": 0.43615581290419614, + "fm_loss": 0.1981, + "grad_norm": 2.0931544303894043, + "learning_rate": 4.97760712397668e-05, + "loss": 0.7931, + "step": 2900 + }, + { + "ar_loss": 0.5959, + "epoch": 0.4511956685215822, + "fm_loss": 0.1984, + "grad_norm": 2.2044456005096436, + "learning_rate": 4.97600036535951e-05, + "loss": 0.7943, + "step": 3000 + }, + { + "ar_loss": 0.5894, + "epoch": 0.4662355241389683, + "fm_loss": 0.197, + "grad_norm": 2.905266046524048, + "learning_rate": 4.974338219127062e-05, + "loss": 0.7863, + "step": 3100 + }, + { + "ar_loss": 0.5851, + "epoch": 0.4812753797563543, + "fm_loss": 0.1953, + "grad_norm": 2.5252127647399902, + "learning_rate": 4.9726207224612034e-05, + "loss": 0.7805, + "step": 3200 + }, + { + "ar_loss": 0.5936, + "epoch": 0.4963152353737404, + "fm_loss": 0.1957, + "grad_norm": 2.7935545444488525, + "learning_rate": 4.97084791378198e-05, + "loss": 0.7893, + "step": 3300 + }, + { + "ar_loss": 0.5869, + "epoch": 0.5113550909911265, + "fm_loss": 0.1933, + "grad_norm": 1.2401388883590698, + "learning_rate": 4.9690198327467534e-05, + "loss": 0.7802, + "step": 3400 + }, + { + "ar_loss": 0.5888, + "epoch": 0.5263949466085126, + "fm_loss": 0.1944, + "grad_norm": 3.7216362953186035, + "learning_rate": 4.967136520249318e-05, + "loss": 0.7832, + "step": 3500 + }, + { + "ar_loss": 0.587, + "epoch": 0.5414348022258987, + "fm_loss": 0.1916, + "grad_norm": 1.946953296661377, + "learning_rate": 4.965198018418985e-05, + "loss": 0.7786, + "step": 3600 + }, + { + "ar_loss": 0.5828, + "epoch": 0.5564746578432848, + "fm_loss": 0.1921, + "grad_norm": 1.695999264717102, + "learning_rate": 4.963204370619637e-05, + "loss": 0.7749, + "step": 3700 + }, + { + "ar_loss": 0.5832, + "epoch": 0.5715145134606707, + "fm_loss": 0.1973, + "grad_norm": 2.257406234741211, + "learning_rate": 4.9611556214487645e-05, + "loss": 0.7805, + "step": 3800 + }, + { + "ar_loss": 0.5825, + "epoch": 0.5865543690780568, + "fm_loss": 0.1915, + "grad_norm": 2.4373984336853027, + "learning_rate": 4.95905181673646e-05, + "loss": 0.7739, + "step": 3900 + }, + { + "ar_loss": 0.5809, + "epoch": 0.6015942246954429, + "fm_loss": 0.1924, + "grad_norm": 1.4836270809173584, + "learning_rate": 4.956893003544401e-05, + "loss": 0.7733, + "step": 4000 + }, + { + "ar_loss": 0.5801, + "epoch": 0.616634080312829, + "fm_loss": 0.1956, + "grad_norm": 1.864709734916687, + "learning_rate": 4.954679230164789e-05, + "loss": 0.7757, + "step": 4100 + }, + { + "ar_loss": 0.5824, + "epoch": 0.631673935930215, + "fm_loss": 0.1895, + "grad_norm": 1.5477041006088257, + "learning_rate": 4.952410546119278e-05, + "loss": 0.7719, + "step": 4200 + }, + { + "ar_loss": 0.5819, + "epoch": 0.6467137915476011, + "fm_loss": 0.1925, + "grad_norm": 1.6031211614608765, + "learning_rate": 4.95008700215786e-05, + "loss": 0.7744, + "step": 4300 + }, + { + "ar_loss": 0.5778, + "epoch": 0.6617536471649872, + "fm_loss": 0.2067, + "grad_norm": 5.476424694061279, + "learning_rate": 4.947708650257732e-05, + "loss": 0.7845, + "step": 4400 + }, + { + "ar_loss": 0.5767, + "epoch": 0.6767935027823733, + "fm_loss": 0.1875, + "grad_norm": 1.901498794555664, + "learning_rate": 4.945275543622133e-05, + "loss": 0.7642, + "step": 4500 + }, + { + "ar_loss": 0.5766, + "epoch": 0.6918333583997593, + "fm_loss": 0.1912, + "grad_norm": 1.4781795740127563, + "learning_rate": 4.942787736679153e-05, + "loss": 0.7678, + "step": 4600 + }, + { + "ar_loss": 0.5735, + "epoch": 0.7068732140171454, + "fm_loss": 0.1893, + "grad_norm": 2.639249801635742, + "learning_rate": 4.940245285080521e-05, + "loss": 0.7628, + "step": 4700 + }, + { + "ar_loss": 0.5736, + "epoch": 0.7219130696345315, + "fm_loss": 0.1854, + "grad_norm": 1.5374678373336792, + "learning_rate": 4.93764824570035e-05, + "loss": 0.759, + "step": 4800 + }, + { + "ar_loss": 0.5755, + "epoch": 0.7369529252519176, + "fm_loss": 0.187, + "grad_norm": 1.971647024154663, + "learning_rate": 4.934996676633874e-05, + "loss": 0.7625, + "step": 4900 + }, + { + "ar_loss": 0.5725, + "epoch": 0.7519927808693037, + "fm_loss": 0.1875, + "grad_norm": 5.200182914733887, + "learning_rate": 4.932290637196144e-05, + "loss": 0.76, + "step": 5000 + }, + { + "ar_loss": 0.5504, + "epoch": 0.015037593984962405, + "fm_loss": 0.1893, + "grad_norm": 2.8439042568206787, + "learning_rate": 4.929551301429187e-05, + "loss": 0.7396, + "step": 5100 + }, + { + "ar_loss": 0.5503, + "epoch": 0.03007518796992481, + "fm_loss": 0.1831, + "grad_norm": 1.733323335647583, + "learning_rate": 4.926737343223295e-05, + "loss": 0.7334, + "step": 5200 + }, + { + "ar_loss": 0.5514, + "epoch": 0.045112781954887216, + "fm_loss": 0.185, + "grad_norm": 2.580357074737549, + "learning_rate": 4.9238691157474316e-05, + "loss": 0.7364, + "step": 5300 + }, + { + "ar_loss": 0.5493, + "epoch": 0.06015037593984962, + "fm_loss": 0.1835, + "grad_norm": 1.9058268070220947, + "learning_rate": 4.920946683143935e-05, + "loss": 0.7327, + "step": 5400 + }, + { + "ar_loss": 0.5497, + "epoch": 0.07518796992481203, + "fm_loss": 0.184, + "grad_norm": 1.3233968019485474, + "learning_rate": 4.91797011076734e-05, + "loss": 0.7337, + "step": 5500 + }, + { + "ar_loss": 0.5466, + "epoch": 0.09022556390977443, + "fm_loss": 0.1829, + "grad_norm": 1.8950995206832886, + "learning_rate": 4.9149394651829086e-05, + "loss": 0.7295, + "step": 5600 + }, + { + "ar_loss": 0.5495, + "epoch": 0.10526315789473684, + "fm_loss": 0.1829, + "grad_norm": 1.7234010696411133, + "learning_rate": 4.911854814165145e-05, + "loss": 0.7324, + "step": 5700 + }, + { + "ar_loss": 0.5475, + "epoch": 0.12030075187969924, + "fm_loss": 0.1821, + "grad_norm": 2.068098783493042, + "learning_rate": 4.908716226696284e-05, + "loss": 0.7295, + "step": 5800 + }, + { + "ar_loss": 0.5457, + "epoch": 0.13533834586466165, + "fm_loss": 0.1837, + "grad_norm": 1.4124906063079834, + "learning_rate": 4.905523772964739e-05, + "loss": 0.7294, + "step": 5900 + }, + { + "ar_loss": 0.5504, + "epoch": 0.15037593984962405, + "fm_loss": 0.1828, + "grad_norm": 1.7072114944458008, + "learning_rate": 4.902277524363543e-05, + "loss": 0.7332, + "step": 6000 + }, + { + "ar_loss": 0.5454, + "epoch": 0.16541353383458646, + "fm_loss": 0.181, + "grad_norm": 1.737274169921875, + "learning_rate": 4.898977553488743e-05, + "loss": 0.7264, + "step": 6100 + }, + { + "ar_loss": 0.5501, + "epoch": 0.18045112781954886, + "fm_loss": 0.1808, + "grad_norm": 2.1727452278137207, + "learning_rate": 4.895623934137783e-05, + "loss": 0.7309, + "step": 6200 + }, + { + "ar_loss": 0.5439, + "epoch": 0.19548872180451127, + "fm_loss": 0.1798, + "grad_norm": 2.8658437728881836, + "learning_rate": 4.892216741307848e-05, + "loss": 0.7237, + "step": 6300 + }, + { + "ar_loss": 0.5438, + "epoch": 0.21052631578947367, + "fm_loss": 0.1812, + "grad_norm": 1.8495811223983765, + "learning_rate": 4.888756051194193e-05, + "loss": 0.725, + "step": 6400 + }, + { + "ar_loss": 0.5391, + "epoch": 0.22556390977443608, + "fm_loss": 0.1894, + "grad_norm": 2.1281821727752686, + "learning_rate": 4.885241941188435e-05, + "loss": 0.7286, + "step": 6500 + }, + { + "ar_loss": 0.5437, + "epoch": 0.24060150375939848, + "fm_loss": 0.1787, + "grad_norm": 1.7707561254501343, + "learning_rate": 4.881674489876822e-05, + "loss": 0.7224, + "step": 6600 + }, + { + "ar_loss": 0.5413, + "epoch": 0.2556390977443609, + "fm_loss": 0.1795, + "grad_norm": 1.4524445533752441, + "learning_rate": 4.878053777038478e-05, + "loss": 0.7208, + "step": 6700 + }, + { + "ar_loss": 0.5411, + "epoch": 0.2706766917293233, + "fm_loss": 0.18, + "grad_norm": 2.2196991443634033, + "learning_rate": 4.8743798836436166e-05, + "loss": 0.7212, + "step": 6800 + }, + { + "ar_loss": 0.5395, + "epoch": 0.2857142857142857, + "fm_loss": 0.1778, + "grad_norm": 3.7135586738586426, + "learning_rate": 4.8706528918517326e-05, + "loss": 0.7173, + "step": 6900 + }, + { + "ar_loss": 0.5397, + "epoch": 0.3007518796992481, + "fm_loss": 0.179, + "grad_norm": 2.551405191421509, + "learning_rate": 4.866872885009762e-05, + "loss": 0.7188, + "step": 7000 + }, + { + "ar_loss": 0.5433, + "epoch": 0.3157894736842105, + "fm_loss": 0.1769, + "grad_norm": 1.5937398672103882, + "learning_rate": 4.863039947650221e-05, + "loss": 0.7202, + "step": 7100 + }, + { + "ar_loss": 0.537, + "epoch": 0.3308270676691729, + "fm_loss": 0.1778, + "grad_norm": 2.1410443782806396, + "learning_rate": 4.859154165489313e-05, + "loss": 0.7148, + "step": 7200 + }, + { + "ar_loss": 0.5345, + "epoch": 0.3458646616541353, + "fm_loss": 0.1786, + "grad_norm": 1.9549566507339478, + "learning_rate": 4.855215625425012e-05, + "loss": 0.7131, + "step": 7300 + }, + { + "ar_loss": 0.5385, + "epoch": 0.3609022556390977, + "fm_loss": 0.1786, + "grad_norm": 1.6920804977416992, + "learning_rate": 4.851224415535123e-05, + "loss": 0.7171, + "step": 7400 + }, + { + "ar_loss": 0.5403, + "epoch": 0.37593984962406013, + "fm_loss": 0.1778, + "grad_norm": 1.460734248161316, + "learning_rate": 4.847180625075306e-05, + "loss": 0.7182, + "step": 7500 + }, + { + "ar_loss": 0.5333, + "epoch": 0.39097744360902253, + "fm_loss": 0.1802, + "grad_norm": 2.369119644165039, + "learning_rate": 4.8430843444770856e-05, + "loss": 0.7135, + "step": 7600 + }, + { + "ar_loss": 0.5343, + "epoch": 0.40601503759398494, + "fm_loss": 0.1777, + "grad_norm": 2.0863733291625977, + "learning_rate": 4.838935665345826e-05, + "loss": 0.712, + "step": 7700 + }, + { + "ar_loss": 0.5361, + "epoch": 0.42105263157894735, + "fm_loss": 0.1826, + "grad_norm": 2.1648268699645996, + "learning_rate": 4.834734680458682e-05, + "loss": 0.7187, + "step": 7800 + }, + { + "ar_loss": 0.5362, + "epoch": 0.43609022556390975, + "fm_loss": 0.1759, + "grad_norm": 3.731491804122925, + "learning_rate": 4.8304814837625275e-05, + "loss": 0.7121, + "step": 7900 + }, + { + "ar_loss": 0.5361, + "epoch": 0.45112781954887216, + "fm_loss": 0.1763, + "grad_norm": 3.129291296005249, + "learning_rate": 4.826176170371848e-05, + "loss": 0.7124, + "step": 8000 + }, + { + "ar_loss": 0.5308, + "epoch": 0.46616541353383456, + "fm_loss": 0.1803, + "grad_norm": 1.837388038635254, + "learning_rate": 4.8218188365666216e-05, + "loss": 0.7111, + "step": 8100 + }, + { + "ar_loss": 0.5384, + "epoch": 0.48120300751879697, + "fm_loss": 0.1765, + "grad_norm": 1.696321964263916, + "learning_rate": 4.817409579790161e-05, + "loss": 0.7148, + "step": 8200 + }, + { + "ar_loss": 0.5351, + "epoch": 0.49624060150375937, + "fm_loss": 0.177, + "grad_norm": 2.0662057399749756, + "learning_rate": 4.812948498646933e-05, + "loss": 0.7121, + "step": 8300 + }, + { + "ar_loss": 0.5358, + "epoch": 0.5112781954887218, + "fm_loss": 0.1757, + "grad_norm": 1.4204461574554443, + "learning_rate": 4.80843569290036e-05, + "loss": 0.7114, + "step": 8400 + }, + { + "ar_loss": 0.5316, + "epoch": 0.5263157894736842, + "fm_loss": 0.176, + "grad_norm": 3.6314117908477783, + "learning_rate": 4.80387126347058e-05, + "loss": 0.7075, + "step": 8500 + }, + { + "ar_loss": 0.5307, + "epoch": 0.5413533834586466, + "fm_loss": 0.1754, + "grad_norm": 2.164659023284912, + "learning_rate": 4.799255312432199e-05, + "loss": 0.706, + "step": 8600 + }, + { + "ar_loss": 0.5331, + "epoch": 0.556390977443609, + "fm_loss": 0.184, + "grad_norm": 1.9370899200439453, + "learning_rate": 4.794587943012e-05, + "loss": 0.7171, + "step": 8700 + }, + { + "ar_loss": 0.5278, + "epoch": 0.5714285714285714, + "fm_loss": 0.1738, + "grad_norm": 2.0422327518463135, + "learning_rate": 4.7898692595866415e-05, + "loss": 0.7016, + "step": 8800 + }, + { + "ar_loss": 0.5299, + "epoch": 0.5864661654135338, + "fm_loss": 0.1729, + "grad_norm": 1.643233060836792, + "learning_rate": 4.785099367680317e-05, + "loss": 0.7028, + "step": 8900 + }, + { + "ar_loss": 0.5279, + "epoch": 0.6015037593984962, + "fm_loss": 0.173, + "grad_norm": 1.7165354490280151, + "learning_rate": 4.7802783739624e-05, + "loss": 0.7009, + "step": 9000 + }, + { + "ar_loss": 0.5297, + "epoch": 0.6165413533834586, + "fm_loss": 0.181, + "grad_norm": 1.632948875427246, + "learning_rate": 4.7754063862450576e-05, + "loss": 0.7107, + "step": 9100 + }, + { + "ar_loss": 0.5266, + "epoch": 0.631578947368421, + "fm_loss": 0.2008, + "grad_norm": 2.7003791332244873, + "learning_rate": 4.770483513480837e-05, + "loss": 0.7274, + "step": 9200 + }, + { + "ar_loss": 0.5304, + "epoch": 0.6466165413533834, + "fm_loss": 0.1754, + "grad_norm": 2.2766501903533936, + "learning_rate": 4.765509865760233e-05, + "loss": 0.7057, + "step": 9300 + }, + { + "ar_loss": 0.5262, + "epoch": 0.6616541353383458, + "fm_loss": 0.18, + "grad_norm": 10.034100532531738, + "learning_rate": 4.760485554309219e-05, + "loss": 0.7061, + "step": 9400 + }, + { + "ar_loss": 0.52, + "epoch": 0.6766917293233082, + "fm_loss": 0.1753, + "grad_norm": 1.70254385471344, + "learning_rate": 4.7554106914867705e-05, + "loss": 0.6953, + "step": 9500 + }, + { + "ar_loss": 0.5249, + "epoch": 0.6917293233082706, + "fm_loss": 0.1744, + "grad_norm": 1.7009040117263794, + "learning_rate": 4.750285390782342e-05, + "loss": 0.6992, + "step": 9600 + }, + { + "ar_loss": 0.523, + "epoch": 0.706766917293233, + "fm_loss": 0.1739, + "grad_norm": 1.4413669109344482, + "learning_rate": 4.745109766813334e-05, + "loss": 0.6968, + "step": 9700 + }, + { + "ar_loss": 0.5242, + "epoch": 0.7218045112781954, + "fm_loss": 0.1727, + "grad_norm": 1.4672104120254517, + "learning_rate": 4.739883935322532e-05, + "loss": 0.6969, + "step": 9800 + }, + { + "ar_loss": 0.5244, + "epoch": 0.7368421052631579, + "fm_loss": 0.1728, + "grad_norm": 1.458977222442627, + "learning_rate": 4.734608013175512e-05, + "loss": 0.6972, + "step": 9900 + }, + { + "ar_loss": 0.5247, + "epoch": 0.7518796992481203, + "fm_loss": 0.1721, + "grad_norm": 2.412842273712158, + "learning_rate": 4.72928211835803e-05, + "loss": 0.6968, + "step": 10000 + }, + { + "ar_loss": 0.5206, + "epoch": 0.7669172932330827, + "fm_loss": 0.172, + "grad_norm": 2.1207447052001953, + "learning_rate": 4.723906369973386e-05, + "loss": 0.6926, + "step": 10100 + }, + { + "ar_loss": 0.5214, + "epoch": 0.7819548872180451, + "fm_loss": 0.1735, + "grad_norm": 2.6220345497131348, + "learning_rate": 4.7184808882397594e-05, + "loss": 0.695, + "step": 10200 + }, + { + "ar_loss": 0.5236, + "epoch": 0.7969924812030075, + "fm_loss": 0.1819, + "grad_norm": 1.9105595350265503, + "learning_rate": 4.713005794487515e-05, + "loss": 0.7055, + "step": 10300 + }, + { + "ar_loss": 0.5202, + "epoch": 0.8120300751879699, + "fm_loss": 0.1718, + "grad_norm": 2.312239646911621, + "learning_rate": 4.707481211156497e-05, + "loss": 0.692, + "step": 10400 + }, + { + "ar_loss": 0.5207, + "epoch": 0.8270676691729323, + "fm_loss": 0.1788, + "grad_norm": 1.709987759590149, + "learning_rate": 4.701907261793287e-05, + "loss": 0.6995, + "step": 10500 + }, + { + "ar_loss": 0.5178, + "epoch": 0.8421052631578947, + "fm_loss": 0.172, + "grad_norm": 2.644814968109131, + "learning_rate": 4.696284071048444e-05, + "loss": 0.6898, + "step": 10600 + }, + { + "ar_loss": 0.5175, + "epoch": 0.8571428571428571, + "fm_loss": 0.1737, + "grad_norm": 1.4387990236282349, + "learning_rate": 4.690611764673713e-05, + "loss": 0.6912, + "step": 10700 + }, + { + "ar_loss": 0.5154, + "epoch": 0.8721804511278195, + "fm_loss": 0.1727, + "grad_norm": 1.308214545249939, + "learning_rate": 4.684890469519213e-05, + "loss": 0.6881, + "step": 10800 + }, + { + "ar_loss": 0.5158, + "epoch": 0.8872180451127819, + "fm_loss": 0.1707, + "grad_norm": 3.754106283187866, + "learning_rate": 4.6791203135306075e-05, + "loss": 0.6865, + "step": 10900 + }, + { + "ar_loss": 0.5213, + "epoch": 0.9022556390977443, + "fm_loss": 0.1709, + "grad_norm": 1.5025776624679565, + "learning_rate": 4.673301425746232e-05, + "loss": 0.6922, + "step": 11000 + }, + { + "ar_loss": 0.5129, + "epoch": 0.9172932330827067, + "fm_loss": 0.1705, + "grad_norm": 1.8450067043304443, + "learning_rate": 4.667433936294217e-05, + "loss": 0.6835, + "step": 11100 + }, + { + "ar_loss": 0.5176, + "epoch": 0.9323308270676691, + "fm_loss": 0.1768, + "grad_norm": 1.772120475769043, + "learning_rate": 4.661517976389574e-05, + "loss": 0.6944, + "step": 11200 + }, + { + "ar_loss": 0.5117, + "epoch": 0.9473684210526315, + "fm_loss": 0.1722, + "grad_norm": 1.8486195802688599, + "learning_rate": 4.6555536783312634e-05, + "loss": 0.6839, + "step": 11300 + }, + { + "ar_loss": 0.5139, + "epoch": 0.9624060150375939, + "fm_loss": 0.1705, + "grad_norm": 3.964401960372925, + "learning_rate": 4.649541175499232e-05, + "loss": 0.6844, + "step": 11400 + }, + { + "ar_loss": 0.5154, + "epoch": 0.9774436090225563, + "fm_loss": 0.1923, + "grad_norm": 2.2421326637268066, + "learning_rate": 4.6434806023514354e-05, + "loss": 0.7077, + "step": 11500 + }, + { + "ar_loss": 0.5147, + "epoch": 0.9924812030075187, + "fm_loss": 0.171, + "grad_norm": 1.773090124130249, + "learning_rate": 4.6373720944208275e-05, + "loss": 0.6856, + "step": 11600 + }, + { + "ar_loss": 0.4967, + "epoch": 1.0075187969924813, + "fm_loss": 0.1708, + "grad_norm": 2.0621070861816406, + "learning_rate": 4.631215788312331e-05, + "loss": 0.6675, + "step": 11700 + }, + { + "ar_loss": 0.4895, + "epoch": 1.0225563909774436, + "fm_loss": 0.1706, + "grad_norm": 1.7303022146224976, + "learning_rate": 4.6250118216997795e-05, + "loss": 0.6601, + "step": 11800 + }, + { + "ar_loss": 0.4896, + "epoch": 1.037593984962406, + "fm_loss": 0.1688, + "grad_norm": 2.481395959854126, + "learning_rate": 4.618760333322846e-05, + "loss": 0.6584, + "step": 11900 + }, + { + "ar_loss": 0.484, + "epoch": 1.0526315789473684, + "fm_loss": 0.1686, + "grad_norm": 8.144444465637207, + "learning_rate": 4.61246146298393e-05, + "loss": 0.6527, + "step": 12000 + }, + { + "ar_loss": 0.4851, + "epoch": 1.0676691729323309, + "fm_loss": 0.1695, + "grad_norm": 4.188894271850586, + "learning_rate": 4.606115351545043e-05, + "loss": 0.6546, + "step": 12100 + }, + { + "ar_loss": 0.4853, + "epoch": 1.0827067669172932, + "fm_loss": 0.1685, + "grad_norm": 2.112070083618164, + "learning_rate": 4.599722140924645e-05, + "loss": 0.6538, + "step": 12200 + }, + { + "ar_loss": 0.4862, + "epoch": 1.0977443609022557, + "fm_loss": 0.1684, + "grad_norm": 5.455174922943115, + "learning_rate": 4.593281974094483e-05, + "loss": 0.6547, + "step": 12300 + }, + { + "ar_loss": 0.4887, + "epoch": 1.112781954887218, + "fm_loss": 0.1682, + "grad_norm": 2.762221574783325, + "learning_rate": 4.5867949950763864e-05, + "loss": 0.6569, + "step": 12400 + }, + { + "ar_loss": 0.4829, + "epoch": 1.1278195488721805, + "fm_loss": 0.1683, + "grad_norm": 1.505560278892517, + "learning_rate": 4.5802613489390487e-05, + "loss": 0.6511, + "step": 12500 + }, + { + "ar_loss": 0.487, + "epoch": 1.1428571428571428, + "fm_loss": 0.1686, + "grad_norm": 12.36954402923584, + "learning_rate": 4.5736811817947824e-05, + "loss": 0.6557, + "step": 12600 + }, + { + "ar_loss": 0.4796, + "epoch": 1.1578947368421053, + "fm_loss": 0.1675, + "grad_norm": 1.7835297584533691, + "learning_rate": 4.5670546407962525e-05, + "loss": 0.6471, + "step": 12700 + }, + { + "ar_loss": 0.4841, + "epoch": 1.1729323308270676, + "fm_loss": 0.1685, + "grad_norm": 1.500798225402832, + "learning_rate": 4.560381874133186e-05, + "loss": 0.6526, + "step": 12800 + }, + { + "ar_loss": 0.4818, + "epoch": 1.1879699248120301, + "fm_loss": 0.1682, + "grad_norm": 3.4301960468292236, + "learning_rate": 4.553663031029055e-05, + "loss": 0.65, + "step": 12900 + }, + { + "ar_loss": 0.4821, + "epoch": 1.2030075187969924, + "fm_loss": 0.1699, + "grad_norm": 1.9192440509796143, + "learning_rate": 4.546898261737745e-05, + "loss": 0.6519, + "step": 13000 + }, + { + "ar_loss": 0.4854, + "epoch": 1.218045112781955, + "fm_loss": 0.1688, + "grad_norm": 2.5409817695617676, + "learning_rate": 4.540087717540188e-05, + "loss": 0.6542, + "step": 13100 + }, + { + "ar_loss": 0.4807, + "epoch": 1.2330827067669172, + "fm_loss": 0.1684, + "grad_norm": 1.6167821884155273, + "learning_rate": 4.533231550740985e-05, + "loss": 0.6491, + "step": 13200 + }, + { + "ar_loss": 0.4793, + "epoch": 1.2481203007518797, + "fm_loss": 0.1673, + "grad_norm": 2.658431053161621, + "learning_rate": 4.526329914664999e-05, + "loss": 0.6466, + "step": 13300 + }, + { + "ar_loss": 0.4802, + "epoch": 1.263157894736842, + "fm_loss": 0.1678, + "grad_norm": 1.7002625465393066, + "learning_rate": 4.51938296365392e-05, + "loss": 0.6481, + "step": 13400 + }, + { + "ar_loss": 0.4799, + "epoch": 1.2781954887218046, + "fm_loss": 0.1694, + "grad_norm": 2.5322110652923584, + "learning_rate": 4.5123908530628254e-05, + "loss": 0.6493, + "step": 13500 + }, + { + "ar_loss": 0.4807, + "epoch": 1.2932330827067668, + "fm_loss": 0.1674, + "grad_norm": 1.8980706930160522, + "learning_rate": 4.5053537392566946e-05, + "loss": 0.6481, + "step": 13600 + }, + { + "ar_loss": 0.4818, + "epoch": 1.3082706766917294, + "fm_loss": 0.1688, + "grad_norm": 1.619292140007019, + "learning_rate": 4.4982717796069176e-05, + "loss": 0.6507, + "step": 13700 + }, + { + "ar_loss": 0.4841, + "epoch": 1.3233082706766917, + "fm_loss": 0.1684, + "grad_norm": 1.7827472686767578, + "learning_rate": 4.491145132487775e-05, + "loss": 0.6526, + "step": 13800 + }, + { + "ar_loss": 0.4794, + "epoch": 1.3383458646616542, + "fm_loss": 0.1779, + "grad_norm": 2.303715944290161, + "learning_rate": 4.483973957272895e-05, + "loss": 0.6572, + "step": 13900 + }, + { + "ar_loss": 0.4729, + "epoch": 1.3533834586466165, + "fm_loss": 0.1767, + "grad_norm": 2.095193386077881, + "learning_rate": 4.476758414331691e-05, + "loss": 0.6496, + "step": 14000 + }, + { + "ar_loss": 0.4765, + "epoch": 1.368421052631579, + "fm_loss": 0.1679, + "grad_norm": 3.301593065261841, + "learning_rate": 4.4694986650257754e-05, + "loss": 0.6445, + "step": 14100 + }, + { + "ar_loss": 0.4787, + "epoch": 1.3834586466165413, + "fm_loss": 0.1675, + "grad_norm": 4.087191581726074, + "learning_rate": 4.462194871705347e-05, + "loss": 0.6462, + "step": 14200 + }, + { + "ar_loss": 0.4788, + "epoch": 1.3984962406015038, + "fm_loss": 0.1666, + "grad_norm": 1.5592633485794067, + "learning_rate": 4.4548471977055665e-05, + "loss": 0.6455, + "step": 14300 + }, + { + "ar_loss": 0.4747, + "epoch": 1.413533834586466, + "fm_loss": 0.1682, + "grad_norm": 7.437148571014404, + "learning_rate": 4.447455807342901e-05, + "loss": 0.6429, + "step": 14400 + }, + { + "ar_loss": 0.4798, + "epoch": 1.4285714285714286, + "fm_loss": 0.1674, + "grad_norm": 1.5794994831085205, + "learning_rate": 4.440020865911446e-05, + "loss": 0.6472, + "step": 14500 + }, + { + "ar_loss": 0.4729, + "epoch": 1.443609022556391, + "fm_loss": 0.1674, + "grad_norm": 2.327096700668335, + "learning_rate": 4.432542539679235e-05, + "loss": 0.6404, + "step": 14600 + }, + { + "ar_loss": 0.4711, + "epoch": 1.4586466165413534, + "fm_loss": 0.1699, + "grad_norm": 1.5044869184494019, + "learning_rate": 4.425020995884517e-05, + "loss": 0.641, + "step": 14700 + }, + { + "ar_loss": 0.4757, + "epoch": 1.4736842105263157, + "fm_loss": 0.1678, + "grad_norm": 1.8027641773223877, + "learning_rate": 4.41745640273202e-05, + "loss": 0.6435, + "step": 14800 + }, + { + "ar_loss": 0.4727, + "epoch": 1.4887218045112782, + "fm_loss": 0.1716, + "grad_norm": 1.5030885934829712, + "learning_rate": 4.4098489293891845e-05, + "loss": 0.6443, + "step": 14900 + }, + { + "ar_loss": 0.4741, + "epoch": 1.5037593984962405, + "fm_loss": 0.1678, + "grad_norm": 2.254826068878174, + "learning_rate": 4.4021987459823834e-05, + "loss": 0.6419, + "step": 15000 + }, + { + "ar_loss": 0.4636, + "epoch": 2.015368065160596, + "fm_loss": 0.1667, + "grad_norm": 5.636004447937012, + "learning_rate": 4.3687046036407485e-05, + "loss": 0.6303, + "step": 15100 + }, + { + "ar_loss": 0.4622, + "epoch": 2.0307361303211926, + "fm_loss": 0.1678, + "grad_norm": 2.2144827842712402, + "learning_rate": 4.360656750389484e-05, + "loss": 0.63, + "step": 15200 + }, + { + "ar_loss": 0.4643, + "epoch": 2.0461041954817887, + "fm_loss": 0.1659, + "grad_norm": 1.623849868774414, + "learning_rate": 4.3525654376107785e-05, + "loss": 0.6302, + "step": 15300 + }, + { + "ar_loss": 0.4617, + "epoch": 2.061472260642385, + "fm_loss": 0.1671, + "grad_norm": 10.88976001739502, + "learning_rate": 4.344430854294155e-05, + "loss": 0.6288, + "step": 15400 + }, + { + "ar_loss": 0.4622, + "epoch": 2.0768403258029813, + "fm_loss": 0.1679, + "grad_norm": 2.079127550125122, + "learning_rate": 4.3362531904398086e-05, + "loss": 0.6301, + "step": 15500 + }, + { + "ar_loss": 0.4593, + "epoch": 2.092208390963578, + "fm_loss": 0.1663, + "grad_norm": 3.9787468910217285, + "learning_rate": 4.3280326370541716e-05, + "loss": 0.6256, + "step": 15600 + }, + { + "ar_loss": 0.4606, + "epoch": 2.107576456124174, + "fm_loss": 0.1686, + "grad_norm": 2.992798089981079, + "learning_rate": 4.3197693861454505e-05, + "loss": 0.6292, + "step": 15700 + }, + { + "ar_loss": 0.4612, + "epoch": 2.1229445212847704, + "fm_loss": 0.1693, + "grad_norm": 1.920535922050476, + "learning_rate": 4.31146363071914e-05, + "loss": 0.6305, + "step": 15800 + }, + { + "ar_loss": 0.4626, + "epoch": 2.1383125864453665, + "fm_loss": 0.1793, + "grad_norm": 1.652984380722046, + "learning_rate": 4.303115564773521e-05, + "loss": 0.6419, + "step": 15900 + }, + { + "ar_loss": 0.4583, + "epoch": 2.153680651605963, + "fm_loss": 0.1665, + "grad_norm": 1.9141411781311035, + "learning_rate": 4.294725383295121e-05, + "loss": 0.6248, + "step": 16000 + }, + { + "ar_loss": 0.4646, + "epoch": 2.169048716766559, + "fm_loss": 0.1762, + "grad_norm": 1.7625855207443237, + "learning_rate": 4.286293282254165e-05, + "loss": 0.6407, + "step": 16100 + }, + { + "ar_loss": 0.4622, + "epoch": 2.1844167819271556, + "fm_loss": 0.1674, + "grad_norm": 1.4725229740142822, + "learning_rate": 4.2778194585999965e-05, + "loss": 0.6295, + "step": 16200 + }, + { + "ar_loss": 0.4642, + "epoch": 2.1997848470877517, + "fm_loss": 0.1654, + "grad_norm": 2.979617118835449, + "learning_rate": 4.269304110256479e-05, + "loss": 0.6296, + "step": 16300 + }, + { + "ar_loss": 0.4605, + "epoch": 2.2151529122483478, + "fm_loss": 0.1656, + "grad_norm": 4.4237470626831055, + "learning_rate": 4.2607474361173714e-05, + "loss": 0.6262, + "step": 16400 + }, + { + "ar_loss": 0.4597, + "epoch": 2.2305209774089443, + "fm_loss": 0.1673, + "grad_norm": 3.3088274002075195, + "learning_rate": 4.2521496360416834e-05, + "loss": 0.627, + "step": 16500 + }, + { + "ar_loss": 0.4588, + "epoch": 2.2458890425695404, + "fm_loss": 0.166, + "grad_norm": 2.4934027194976807, + "learning_rate": 4.243510910849006e-05, + "loss": 0.6248, + "step": 16600 + }, + { + "ar_loss": 0.4577, + "epoch": 2.261257107730137, + "fm_loss": 0.1654, + "grad_norm": 3.4698071479797363, + "learning_rate": 4.234831462314822e-05, + "loss": 0.6231, + "step": 16700 + }, + { + "ar_loss": 0.459, + "epoch": 2.276625172890733, + "fm_loss": 0.1663, + "grad_norm": 1.6527596712112427, + "learning_rate": 4.226111493165793e-05, + "loss": 0.6253, + "step": 16800 + }, + { + "ar_loss": 0.4563, + "epoch": 2.2919932380513295, + "fm_loss": 0.1647, + "grad_norm": 2.62147855758667, + "learning_rate": 4.217351207075024e-05, + "loss": 0.621, + "step": 16900 + }, + { + "ar_loss": 0.4583, + "epoch": 2.3073613032119256, + "fm_loss": 0.1651, + "grad_norm": 2.403658151626587, + "learning_rate": 4.208550808657309e-05, + "loss": 0.6234, + "step": 17000 + }, + { + "ar_loss": 0.4565, + "epoch": 2.322729368372522, + "fm_loss": 0.1651, + "grad_norm": 2.243856906890869, + "learning_rate": 4.199710503464345e-05, + "loss": 0.6216, + "step": 17100 + }, + { + "ar_loss": 0.456, + "epoch": 2.338097433533118, + "fm_loss": 0.1651, + "grad_norm": 2.199291229248047, + "learning_rate": 4.190830497979938e-05, + "loss": 0.6211, + "step": 17200 + }, + { + "ar_loss": 0.4532, + "epoch": 2.3534654986937147, + "fm_loss": 0.1656, + "grad_norm": 2.230714797973633, + "learning_rate": 4.1819109996151775e-05, + "loss": 0.6188, + "step": 17300 + }, + { + "ar_loss": 0.4526, + "epoch": 2.3688335638543108, + "fm_loss": 0.1651, + "grad_norm": 2.124840259552002, + "learning_rate": 4.172952216703588e-05, + "loss": 0.6176, + "step": 17400 + }, + { + "ar_loss": 0.4502, + "epoch": 2.384201629014907, + "fm_loss": 0.165, + "grad_norm": 2.1426265239715576, + "learning_rate": 4.1639543584962726e-05, + "loss": 0.6152, + "step": 17500 + }, + { + "ar_loss": 0.4542, + "epoch": 2.3995696941755034, + "fm_loss": 0.1651, + "grad_norm": 4.364583492279053, + "learning_rate": 4.154917635157015e-05, + "loss": 0.6193, + "step": 17600 + }, + { + "ar_loss": 0.4558, + "epoch": 2.4149377593360994, + "fm_loss": 0.1651, + "grad_norm": 1.8425495624542236, + "learning_rate": 4.145842257757377e-05, + "loss": 0.621, + "step": 17700 + }, + { + "ar_loss": 0.4533, + "epoch": 2.430305824496696, + "fm_loss": 0.1657, + "grad_norm": 1.7966125011444092, + "learning_rate": 4.136728438271768e-05, + "loss": 0.619, + "step": 17800 + }, + { + "ar_loss": 0.4493, + "epoch": 2.445673889657292, + "fm_loss": 0.1651, + "grad_norm": 2.5523791313171387, + "learning_rate": 4.127576389572488e-05, + "loss": 0.6144, + "step": 17900 + }, + { + "ar_loss": 0.4501, + "epoch": 2.4610419548178886, + "fm_loss": 0.1647, + "grad_norm": 2.2898178100585938, + "learning_rate": 4.1183863254247655e-05, + "loss": 0.6148, + "step": 18000 + }, + { + "ar_loss": 0.4509, + "epoch": 2.4764100199784846, + "fm_loss": 0.1635, + "grad_norm": 2.085273504257202, + "learning_rate": 4.109158460481758e-05, + "loss": 0.6144, + "step": 18100 + }, + { + "ar_loss": 0.4515, + "epoch": 2.491778085139081, + "fm_loss": 0.1651, + "grad_norm": 2.7309823036193848, + "learning_rate": 4.0998930102795377e-05, + "loss": 0.6166, + "step": 18200 + }, + { + "ar_loss": 0.4491, + "epoch": 2.5071461502996772, + "fm_loss": 0.1645, + "grad_norm": 1.9515680074691772, + "learning_rate": 4.090590191232061e-05, + "loss": 0.6137, + "step": 18300 + }, + { + "ar_loss": 0.4541, + "epoch": 2.5225142154602738, + "fm_loss": 0.1639, + "grad_norm": 3.2586522102355957, + "learning_rate": 4.0812502206261096e-05, + "loss": 0.618, + "step": 18400 + }, + { + "ar_loss": 0.4463, + "epoch": 2.53788228062087, + "fm_loss": 0.1645, + "grad_norm": 3.2903804779052734, + "learning_rate": 4.071873316616219e-05, + "loss": 0.6108, + "step": 18500 + }, + { + "ar_loss": 0.4465, + "epoch": 2.553250345781466, + "fm_loss": 0.1659, + "grad_norm": 3.0805041790008545, + "learning_rate": 4.062459698219583e-05, + "loss": 0.6124, + "step": 18600 + }, + { + "ar_loss": 0.4476, + "epoch": 2.5686184109420624, + "fm_loss": 0.1643, + "grad_norm": 10.558906555175781, + "learning_rate": 4.053009585310933e-05, + "loss": 0.6119, + "step": 18700 + }, + { + "ar_loss": 0.4461, + "epoch": 2.5839864761026585, + "fm_loss": 0.164, + "grad_norm": 1.7276713848114014, + "learning_rate": 4.04352319861741e-05, + "loss": 0.6102, + "step": 18800 + }, + { + "ar_loss": 0.4486, + "epoch": 2.599354541263255, + "fm_loss": 0.1655, + "grad_norm": 2.4759597778320312, + "learning_rate": 4.034000759713401e-05, + "loss": 0.6142, + "step": 18900 + }, + { + "ar_loss": 0.4461, + "epoch": 2.614722606423851, + "fm_loss": 0.164, + "grad_norm": 1.5683785676956177, + "learning_rate": 4.024442491015372e-05, + "loss": 0.6101, + "step": 19000 + }, + { + "ar_loss": 0.4495, + "epoch": 2.6300906715844476, + "fm_loss": 0.1637, + "grad_norm": 3.866807460784912, + "learning_rate": 4.014848615776666e-05, + "loss": 0.6132, + "step": 19100 + }, + { + "ar_loss": 0.447, + "epoch": 2.6454587367450437, + "fm_loss": 0.1635, + "grad_norm": 1.7105318307876587, + "learning_rate": 4.00521935808229e-05, + "loss": 0.6105, + "step": 19200 + }, + { + "ar_loss": 0.4454, + "epoch": 2.66082680190564, + "fm_loss": 0.1634, + "grad_norm": 2.6716930866241455, + "learning_rate": 3.995554942843687e-05, + "loss": 0.6088, + "step": 19300 + }, + { + "ar_loss": 0.4491, + "epoch": 2.6761948670662363, + "fm_loss": 0.1647, + "grad_norm": 1.8674702644348145, + "learning_rate": 3.9858555957934715e-05, + "loss": 0.6137, + "step": 19400 + }, + { + "ar_loss": 0.445, + "epoch": 2.691562932226833, + "fm_loss": 0.1631, + "grad_norm": 2.926995277404785, + "learning_rate": 3.976121543480169e-05, + "loss": 0.6081, + "step": 19500 + }, + { + "ar_loss": 0.4416, + "epoch": 2.706930997387429, + "fm_loss": 0.1641, + "grad_norm": 2.341655731201172, + "learning_rate": 3.966353013262917e-05, + "loss": 0.6057, + "step": 19600 + }, + { + "ar_loss": 0.4406, + "epoch": 2.722299062548025, + "fm_loss": 0.1648, + "grad_norm": 1.6259019374847412, + "learning_rate": 3.956550233306155e-05, + "loss": 0.6054, + "step": 19700 + }, + { + "ar_loss": 0.4439, + "epoch": 2.7376671277086215, + "fm_loss": 0.1626, + "grad_norm": 3.2317330837249756, + "learning_rate": 3.946713432574299e-05, + "loss": 0.6065, + "step": 19800 + }, + { + "ar_loss": 0.4445, + "epoch": 2.7530351928692176, + "fm_loss": 0.1636, + "grad_norm": 2.053748369216919, + "learning_rate": 3.936842840826391e-05, + "loss": 0.6081, + "step": 19900 + }, + { + "ar_loss": 0.444, + "epoch": 2.768403258029814, + "fm_loss": 0.1635, + "grad_norm": 3.10992431640625, + "learning_rate": 3.9269386886107304e-05, + "loss": 0.6076, + "step": 20000 + }, + { + "ar_loss": 0.4431, + "epoch": 2.78377132319041, + "fm_loss": 0.1634, + "grad_norm": 1.6531990766525269, + "learning_rate": 3.9170012072594944e-05, + "loss": 0.6065, + "step": 20100 + }, + { + "ar_loss": 0.4413, + "epoch": 2.7991393883510067, + "fm_loss": 0.1642, + "grad_norm": 2.239360809326172, + "learning_rate": 3.90703062888333e-05, + "loss": 0.6055, + "step": 20200 + }, + { + "ar_loss": 0.4411, + "epoch": 2.814507453511603, + "fm_loss": 0.1635, + "grad_norm": 2.6573100090026855, + "learning_rate": 3.8970271863659366e-05, + "loss": 0.6046, + "step": 20300 + }, + { + "ar_loss": 0.4388, + "epoch": 2.8298755186721993, + "fm_loss": 0.1639, + "grad_norm": 2.9059882164001465, + "learning_rate": 3.886991113358621e-05, + "loss": 0.6027, + "step": 20400 + }, + { + "ar_loss": 0.4413, + "epoch": 2.8452435838327954, + "fm_loss": 0.1647, + "grad_norm": 1.4836231470108032, + "learning_rate": 3.876922644274847e-05, + "loss": 0.6061, + "step": 20500 + }, + { + "ar_loss": 0.4394, + "epoch": 2.860611648993392, + "fm_loss": 0.1639, + "grad_norm": 3.667249917984009, + "learning_rate": 3.866822014284753e-05, + "loss": 0.6033, + "step": 20600 + }, + { + "ar_loss": 0.437, + "epoch": 2.875979714153988, + "fm_loss": 0.1631, + "grad_norm": 2.724766731262207, + "learning_rate": 3.8566894593096646e-05, + "loss": 0.6001, + "step": 20700 + }, + { + "ar_loss": 0.4408, + "epoch": 2.891347779314584, + "fm_loss": 0.1634, + "grad_norm": 3.302048683166504, + "learning_rate": 3.846525216016581e-05, + "loss": 0.6043, + "step": 20800 + }, + { + "ar_loss": 0.4374, + "epoch": 2.9067158444751806, + "fm_loss": 0.1632, + "grad_norm": 5.427467346191406, + "learning_rate": 3.836329521812651e-05, + "loss": 0.6006, + "step": 20900 + }, + { + "ar_loss": 0.4404, + "epoch": 2.922083909635777, + "fm_loss": 0.1632, + "grad_norm": 2.2337872982025146, + "learning_rate": 3.826102614839621e-05, + "loss": 0.6037, + "step": 21000 + }, + { + "ar_loss": 0.4384, + "epoch": 2.937451974796373, + "fm_loss": 0.1629, + "grad_norm": 13.053716659545898, + "learning_rate": 3.815844733968281e-05, + "loss": 0.6014, + "step": 21100 + }, + { + "ar_loss": 0.4374, + "epoch": 2.9528200399569693, + "fm_loss": 0.1623, + "grad_norm": 2.655907392501831, + "learning_rate": 3.8055561187928776e-05, + "loss": 0.5997, + "step": 21200 + }, + { + "ar_loss": 0.4344, + "epoch": 2.968188105117566, + "fm_loss": 0.163, + "grad_norm": 4.967069625854492, + "learning_rate": 3.795237009625523e-05, + "loss": 0.5974, + "step": 21300 + }, + { + "ar_loss": 0.4384, + "epoch": 2.983556170278162, + "fm_loss": 0.1639, + "grad_norm": 1.5047295093536377, + "learning_rate": 3.784887647490581e-05, + "loss": 0.6023, + "step": 21400 + }, + { + "ar_loss": 0.435, + "epoch": 2.9989242354387584, + "fm_loss": 0.1626, + "grad_norm": 2.5669796466827393, + "learning_rate": 3.774508274119035e-05, + "loss": 0.5976, + "step": 21500 + }, + { + "ar_loss": 0.4195, + "epoch": 3.0142923005993545, + "fm_loss": 0.1629, + "grad_norm": 2.004462718963623, + "learning_rate": 3.764099131942846e-05, + "loss": 0.5825, + "step": 21600 + }, + { + "ar_loss": 0.4153, + "epoch": 3.029660365759951, + "fm_loss": 0.1619, + "grad_norm": 4.897035121917725, + "learning_rate": 3.753660464089285e-05, + "loss": 0.5773, + "step": 21700 + }, + { + "ar_loss": 0.414, + "epoch": 3.045028430920547, + "fm_loss": 0.1628, + "grad_norm": 5.358692169189453, + "learning_rate": 3.743192514375257e-05, + "loss": 0.5768, + "step": 21800 + }, + { + "ar_loss": 0.417, + "epoch": 3.060396496081143, + "fm_loss": 0.1624, + "grad_norm": 2.0926132202148438, + "learning_rate": 3.732695527301609e-05, + "loss": 0.5793, + "step": 21900 + }, + { + "ar_loss": 0.4168, + "epoch": 3.0757645612417397, + "fm_loss": 0.1626, + "grad_norm": 2.1865603923797607, + "learning_rate": 3.722169748047413e-05, + "loss": 0.5793, + "step": 22000 + }, + { + "ar_loss": 0.4142, + "epoch": 3.0911326264023358, + "fm_loss": 0.163, + "grad_norm": 1.9334361553192139, + "learning_rate": 3.711615422464244e-05, + "loss": 0.5772, + "step": 22100 + }, + { + "ar_loss": 0.4156, + "epoch": 3.1065006915629323, + "fm_loss": 0.163, + "grad_norm": 2.2630386352539062, + "learning_rate": 3.701032797070436e-05, + "loss": 0.5785, + "step": 22200 + }, + { + "ar_loss": 0.4122, + "epoch": 3.1218687567235284, + "fm_loss": 0.1706, + "grad_norm": 1.959439754486084, + "learning_rate": 3.690422119045325e-05, + "loss": 0.5828, + "step": 22300 + }, + { + "ar_loss": 0.4142, + "epoch": 3.137236821884125, + "fm_loss": 0.1717, + "grad_norm": 2.2783379554748535, + "learning_rate": 3.6797836362234745e-05, + "loss": 0.5859, + "step": 22400 + }, + { + "ar_loss": 0.4142, + "epoch": 3.152604887044721, + "fm_loss": 0.1634, + "grad_norm": 2.1327381134033203, + "learning_rate": 3.669117597088885e-05, + "loss": 0.5776, + "step": 22500 + }, + { + "ar_loss": 0.4125, + "epoch": 3.1679729522053175, + "fm_loss": 0.1665, + "grad_norm": 2.315673828125, + "learning_rate": 3.658424250769195e-05, + "loss": 0.579, + "step": 22600 + }, + { + "ar_loss": 0.4148, + "epoch": 3.1833410173659136, + "fm_loss": 0.1629, + "grad_norm": 3.1342201232910156, + "learning_rate": 3.647703847029858e-05, + "loss": 0.5778, + "step": 22700 + }, + { + "ar_loss": 0.4164, + "epoch": 3.19870908252651, + "fm_loss": 0.1615, + "grad_norm": 2.276103973388672, + "learning_rate": 3.6369566362683115e-05, + "loss": 0.5778, + "step": 22800 + }, + { + "ar_loss": 0.415, + "epoch": 3.214077147687106, + "fm_loss": 0.1634, + "grad_norm": 3.8148179054260254, + "learning_rate": 3.626182869508124e-05, + "loss": 0.5784, + "step": 22900 + }, + { + "ar_loss": 0.4135, + "epoch": 3.2294452128477023, + "fm_loss": 0.163, + "grad_norm": 2.029802083969116, + "learning_rate": 3.6153827983931395e-05, + "loss": 0.5765, + "step": 23000 + }, + { + "ar_loss": 0.4108, + "epoch": 3.2448132780082988, + "fm_loss": 0.1616, + "grad_norm": 1.9012216329574585, + "learning_rate": 3.6045566751815906e-05, + "loss": 0.5724, + "step": 23100 + }, + { + "ar_loss": 0.4118, + "epoch": 3.260181343168895, + "fm_loss": 0.1631, + "grad_norm": 1.9454518556594849, + "learning_rate": 3.593704752740214e-05, + "loss": 0.5749, + "step": 23200 + }, + { + "ar_loss": 0.4114, + "epoch": 3.2755494083294914, + "fm_loss": 0.1646, + "grad_norm": 5.499399662017822, + "learning_rate": 3.5828272845383395e-05, + "loss": 0.576, + "step": 23300 + }, + { + "ar_loss": 0.4099, + "epoch": 3.2909174734900875, + "fm_loss": 0.1622, + "grad_norm": 2.5215775966644287, + "learning_rate": 3.571924524641973e-05, + "loss": 0.5721, + "step": 23400 + }, + { + "ar_loss": 0.4132, + "epoch": 3.306285538650684, + "fm_loss": 0.1619, + "grad_norm": 1.9290796518325806, + "learning_rate": 3.56099672770786e-05, + "loss": 0.575, + "step": 23500 + }, + { + "ar_loss": 0.4106, + "epoch": 3.32165360381128, + "fm_loss": 0.1653, + "grad_norm": 3.0975518226623535, + "learning_rate": 3.550044148977539e-05, + "loss": 0.5759, + "step": 23600 + }, + { + "ar_loss": 0.4109, + "epoch": 3.3370216689718766, + "fm_loss": 0.1625, + "grad_norm": 3.079251289367676, + "learning_rate": 3.539067044271378e-05, + "loss": 0.5734, + "step": 23700 + }, + { + "ar_loss": 0.4106, + "epoch": 3.3523897341324727, + "fm_loss": 0.162, + "grad_norm": 3.6910111904144287, + "learning_rate": 3.5280656699826016e-05, + "loss": 0.5726, + "step": 23800 + }, + { + "ar_loss": 0.4081, + "epoch": 3.367757799293069, + "fm_loss": 0.1616, + "grad_norm": 2.8438339233398438, + "learning_rate": 3.5170402830713004e-05, + "loss": 0.5698, + "step": 23900 + }, + { + "ar_loss": 0.4122, + "epoch": 3.3831258644536653, + "fm_loss": 0.1616, + "grad_norm": 5.298975467681885, + "learning_rate": 3.505991141058431e-05, + "loss": 0.5738, + "step": 24000 + }, + { + "ar_loss": 0.4107, + "epoch": 3.3984939296142613, + "fm_loss": 0.1614, + "grad_norm": 2.1549863815307617, + "learning_rate": 3.494918502019798e-05, + "loss": 0.5721, + "step": 24100 + }, + { + "ar_loss": 0.4095, + "epoch": 3.413861994774858, + "fm_loss": 0.1615, + "grad_norm": 2.1645219326019287, + "learning_rate": 3.483822624580031e-05, + "loss": 0.571, + "step": 24200 + }, + { + "ar_loss": 0.4102, + "epoch": 3.429230059935454, + "fm_loss": 0.1623, + "grad_norm": 3.294847249984741, + "learning_rate": 3.472703767906539e-05, + "loss": 0.5724, + "step": 24300 + }, + { + "ar_loss": 0.4116, + "epoch": 3.4445981250960505, + "fm_loss": 0.1628, + "grad_norm": 2.6318018436431885, + "learning_rate": 3.461562191703459e-05, + "loss": 0.5744, + "step": 24400 + }, + { + "ar_loss": 0.4116, + "epoch": 3.4599661902566465, + "fm_loss": 0.161, + "grad_norm": 4.470794677734375, + "learning_rate": 3.450398156205592e-05, + "loss": 0.5726, + "step": 24500 + }, + { + "ar_loss": 0.4101, + "epoch": 3.475334255417243, + "fm_loss": 0.1609, + "grad_norm": 3.786229133605957, + "learning_rate": 3.43921192217232e-05, + "loss": 0.571, + "step": 24600 + }, + { + "ar_loss": 0.4085, + "epoch": 3.490702320577839, + "fm_loss": 0.1606, + "grad_norm": 3.3834242820739746, + "learning_rate": 3.42800375088152e-05, + "loss": 0.5691, + "step": 24700 + }, + { + "ar_loss": 0.4106, + "epoch": 3.5060703857384357, + "fm_loss": 0.1612, + "grad_norm": 2.6395816802978516, + "learning_rate": 3.4167739041234595e-05, + "loss": 0.5718, + "step": 24800 + }, + { + "ar_loss": 0.4095, + "epoch": 3.5214384508990317, + "fm_loss": 0.161, + "grad_norm": 3.1765825748443604, + "learning_rate": 3.405522644194682e-05, + "loss": 0.5705, + "step": 24900 + }, + { + "ar_loss": 0.4098, + "epoch": 3.5368065160596283, + "fm_loss": 0.1619, + "grad_norm": 2.2531938552856445, + "learning_rate": 3.3942502338918795e-05, + "loss": 0.5716, + "step": 25000 + }, + { + "ar_loss": 0.4074, + "epoch": 3.5521745812202243, + "fm_loss": 0.1624, + "grad_norm": 2.6430165767669678, + "learning_rate": 3.382956936505755e-05, + "loss": 0.5698, + "step": 25100 + }, + { + "ar_loss": 0.4096, + "epoch": 3.5675426463808204, + "fm_loss": 0.1607, + "grad_norm": 2.1325323581695557, + "learning_rate": 3.371643015814874e-05, + "loss": 0.5703, + "step": 25200 + }, + { + "ar_loss": 0.4111, + "epoch": 3.582910711541417, + "fm_loss": 0.1614, + "grad_norm": 2.9714601039886475, + "learning_rate": 3.360308736079502e-05, + "loss": 0.5725, + "step": 25300 + }, + { + "ar_loss": 0.4086, + "epoch": 3.5982787767020135, + "fm_loss": 0.1612, + "grad_norm": 2.1280746459960938, + "learning_rate": 3.348954362035432e-05, + "loss": 0.5698, + "step": 25400 + }, + { + "ar_loss": 0.4074, + "epoch": 3.6136468418626095, + "fm_loss": 0.1608, + "grad_norm": 8.301246643066406, + "learning_rate": 3.337580158887802e-05, + "loss": 0.5681, + "step": 25500 + }, + { + "ar_loss": 0.4054, + "epoch": 3.6290149070232056, + "fm_loss": 0.164, + "grad_norm": 2.632568120956421, + "learning_rate": 3.326186392304901e-05, + "loss": 0.5694, + "step": 25600 + }, + { + "ar_loss": 0.4076, + "epoch": 3.644382972183802, + "fm_loss": 0.1616, + "grad_norm": 4.138895511627197, + "learning_rate": 3.314773328411962e-05, + "loss": 0.5692, + "step": 25700 + }, + { + "ar_loss": 0.4059, + "epoch": 3.659751037344398, + "fm_loss": 0.1616, + "grad_norm": 2.5377886295318604, + "learning_rate": 3.3033412337849466e-05, + "loss": 0.5675, + "step": 25800 + }, + { + "ar_loss": 0.4085, + "epoch": 3.6751191025049947, + "fm_loss": 0.1617, + "grad_norm": 2.334913730621338, + "learning_rate": 3.2918903754443195e-05, + "loss": 0.5702, + "step": 25900 + }, + { + "ar_loss": 0.4064, + "epoch": 3.690487167665591, + "fm_loss": 0.1625, + "grad_norm": 1.9625128507614136, + "learning_rate": 3.2804210208488114e-05, + "loss": 0.5689, + "step": 26000 + }, + { + "ar_loss": 0.4057, + "epoch": 3.7058552328261873, + "fm_loss": 0.1617, + "grad_norm": 2.2251503467559814, + "learning_rate": 3.268933437889172e-05, + "loss": 0.5674, + "step": 26100 + }, + { + "ar_loss": 0.4066, + "epoch": 3.7212232979867834, + "fm_loss": 0.1617, + "grad_norm": 3.2878170013427734, + "learning_rate": 3.2574278948819105e-05, + "loss": 0.5683, + "step": 26200 + }, + { + "ar_loss": 0.4097, + "epoch": 3.7365913631473795, + "fm_loss": 0.1618, + "grad_norm": 3.281667947769165, + "learning_rate": 3.2459046605630334e-05, + "loss": 0.5715, + "step": 26300 + }, + { + "ar_loss": 0.406, + "epoch": 3.751959428307976, + "fm_loss": 0.1607, + "grad_norm": 2.7066891193389893, + "learning_rate": 3.234364004081763e-05, + "loss": 0.5667, + "step": 26400 + }, + { + "ar_loss": 0.4041, + "epoch": 3.7673274934685725, + "fm_loss": 0.1614, + "grad_norm": 4.11818790435791, + "learning_rate": 3.222806194994253e-05, + "loss": 0.5655, + "step": 26500 + }, + { + "ar_loss": 0.4039, + "epoch": 3.7826955586291686, + "fm_loss": 0.1611, + "grad_norm": 2.0963382720947266, + "learning_rate": 3.211231503257292e-05, + "loss": 0.565, + "step": 26600 + }, + { + "ar_loss": 0.4039, + "epoch": 3.7980636237897647, + "fm_loss": 0.1612, + "grad_norm": 8.769935607910156, + "learning_rate": 3.199640199221998e-05, + "loss": 0.5651, + "step": 26700 + }, + { + "ar_loss": 0.407, + "epoch": 3.813431688950361, + "fm_loss": 0.1614, + "grad_norm": 4.367818832397461, + "learning_rate": 3.188032553627505e-05, + "loss": 0.5684, + "step": 26800 + }, + { + "ar_loss": 0.4025, + "epoch": 3.8287997541109573, + "fm_loss": 0.1612, + "grad_norm": 3.1772165298461914, + "learning_rate": 3.1764088375946355e-05, + "loss": 0.5637, + "step": 26900 + }, + { + "ar_loss": 0.4051, + "epoch": 3.844167819271554, + "fm_loss": 0.1606, + "grad_norm": 2.3004674911499023, + "learning_rate": 3.1647693226195764e-05, + "loss": 0.5657, + "step": 27000 + }, + { + "ar_loss": 0.4023, + "epoch": 3.85953588443215, + "fm_loss": 0.1609, + "grad_norm": 2.438415050506592, + "learning_rate": 3.1531142805675244e-05, + "loss": 0.5633, + "step": 27100 + }, + { + "ar_loss": 0.405, + "epoch": 3.8749039495927464, + "fm_loss": 0.1615, + "grad_norm": 2.6293303966522217, + "learning_rate": 3.141443983666349e-05, + "loss": 0.5666, + "step": 27200 + }, + { + "ar_loss": 0.4022, + "epoch": 3.8902720147533425, + "fm_loss": 0.1609, + "grad_norm": 3.9701802730560303, + "learning_rate": 3.1297587045002265e-05, + "loss": 0.5631, + "step": 27300 + }, + { + "ar_loss": 0.4008, + "epoch": 3.9056400799139386, + "fm_loss": 0.1615, + "grad_norm": 1.8459994792938232, + "learning_rate": 3.118058716003277e-05, + "loss": 0.5623, + "step": 27400 + }, + { + "ar_loss": 0.4058, + "epoch": 3.921008145074535, + "fm_loss": 0.1612, + "grad_norm": 10.706082344055176, + "learning_rate": 3.106344291453185e-05, + "loss": 0.567, + "step": 27500 + }, + { + "ar_loss": 0.4022, + "epoch": 3.9363762102351316, + "fm_loss": 0.1609, + "grad_norm": 2.411083221435547, + "learning_rate": 3.09461570446482e-05, + "loss": 0.5632, + "step": 27600 + }, + { + "ar_loss": 0.4021, + "epoch": 3.9517442753957277, + "fm_loss": 0.1609, + "grad_norm": 1.9293774366378784, + "learning_rate": 3.082873228983847e-05, + "loss": 0.563, + "step": 27700 + }, + { + "ar_loss": 0.4021, + "epoch": 3.967112340556324, + "fm_loss": 0.1607, + "grad_norm": 2.12009334564209, + "learning_rate": 3.071117139280325e-05, + "loss": 0.5628, + "step": 27800 + }, + { + "ar_loss": 0.4052, + "epoch": 3.9824804057169203, + "fm_loss": 0.16, + "grad_norm": 1.866605281829834, + "learning_rate": 3.059347709942299e-05, + "loss": 0.5652, + "step": 27900 + }, + { + "ar_loss": 0.404, + "epoch": 3.9978484708775164, + "fm_loss": 0.1614, + "grad_norm": 3.483856439590454, + "learning_rate": 3.0475652158693912e-05, + "loss": 0.5653, + "step": 28000 + }, + { + "ar_loss": 0.3897, + "epoch": 4.0132165360381125, + "fm_loss": 0.1654, + "grad_norm": 6.412163734436035, + "learning_rate": 3.0357699322663784e-05, + "loss": 0.5552, + "step": 28100 + }, + { + "ar_loss": 0.3912, + "epoch": 4.028584601198709, + "fm_loss": 0.1622, + "grad_norm": 2.0246286392211914, + "learning_rate": 3.023962134636763e-05, + "loss": 0.5535, + "step": 28200 + }, + { + "ar_loss": 0.3892, + "epoch": 4.0439526663593055, + "fm_loss": 0.1619, + "grad_norm": 1.9690488576889038, + "learning_rate": 3.0121420987763393e-05, + "loss": 0.5511, + "step": 28300 + }, + { + "ar_loss": 0.3891, + "epoch": 4.059320731519902, + "fm_loss": 0.161, + "grad_norm": 2.742201805114746, + "learning_rate": 3.0003101007667485e-05, + "loss": 0.55, + "step": 28400 + }, + { + "ar_loss": 0.3889, + "epoch": 4.074688796680498, + "fm_loss": 0.1608, + "grad_norm": 15.931248664855957, + "learning_rate": 2.9884664169690356e-05, + "loss": 0.5496, + "step": 28500 + }, + { + "ar_loss": 0.3909, + "epoch": 4.090056861841094, + "fm_loss": 0.16, + "grad_norm": 2.156740665435791, + "learning_rate": 2.976611324017191e-05, + "loss": 0.5509, + "step": 28600 + }, + { + "ar_loss": 0.388, + "epoch": 4.105424927001691, + "fm_loss": 0.1659, + "grad_norm": 5.583569049835205, + "learning_rate": 2.9647450988116893e-05, + "loss": 0.5539, + "step": 28700 + }, + { + "ar_loss": 0.3909, + "epoch": 4.120792992162286, + "fm_loss": 0.1597, + "grad_norm": 2.5985569953918457, + "learning_rate": 2.9528680185130214e-05, + "loss": 0.5505, + "step": 28800 + }, + { + "ar_loss": 0.3892, + "epoch": 4.136161057322883, + "fm_loss": 0.1596, + "grad_norm": 3.106172800064087, + "learning_rate": 2.9409803605352237e-05, + "loss": 0.5488, + "step": 28900 + }, + { + "ar_loss": 0.3863, + "epoch": 4.151529122483479, + "fm_loss": 0.1611, + "grad_norm": 2.211129665374756, + "learning_rate": 2.929082402539395e-05, + "loss": 0.5473, + "step": 29000 + }, + { + "ar_loss": 0.39, + "epoch": 4.166897187644076, + "fm_loss": 0.1602, + "grad_norm": 1.949877142906189, + "learning_rate": 2.9171744224272113e-05, + "loss": 0.5502, + "step": 29100 + }, + { + "ar_loss": 0.387, + "epoch": 4.1822652528046715, + "fm_loss": 0.1646, + "grad_norm": 8.61090087890625, + "learning_rate": 2.9052566983344388e-05, + "loss": 0.5516, + "step": 29200 + }, + { + "ar_loss": 0.391, + "epoch": 4.197633317965268, + "fm_loss": 0.1596, + "grad_norm": 2.0502259731292725, + "learning_rate": 2.893329508624433e-05, + "loss": 0.5506, + "step": 29300 + }, + { + "ar_loss": 0.3897, + "epoch": 4.213001383125865, + "fm_loss": 0.1603, + "grad_norm": 2.44155216217041, + "learning_rate": 2.8813931318816395e-05, + "loss": 0.55, + "step": 29400 + }, + { + "ar_loss": 0.387, + "epoch": 4.228369448286461, + "fm_loss": 0.1616, + "grad_norm": 2.004561185836792, + "learning_rate": 2.869447846905085e-05, + "loss": 0.5487, + "step": 29500 + }, + { + "ar_loss": 0.3875, + "epoch": 4.243737513447057, + "fm_loss": 0.1598, + "grad_norm": 5.270979881286621, + "learning_rate": 2.8574939327018685e-05, + "loss": 0.5473, + "step": 29600 + }, + { + "ar_loss": 0.3873, + "epoch": 4.259105578607653, + "fm_loss": 0.1605, + "grad_norm": 6.422144412994385, + "learning_rate": 2.8455316684806404e-05, + "loss": 0.5478, + "step": 29700 + }, + { + "ar_loss": 0.3856, + "epoch": 4.27447364376825, + "fm_loss": 0.1609, + "grad_norm": 9.331077575683594, + "learning_rate": 2.833561333645085e-05, + "loss": 0.5465, + "step": 29800 + }, + { + "ar_loss": 0.3887, + "epoch": 4.289841708928845, + "fm_loss": 0.1603, + "grad_norm": 3.2531440258026123, + "learning_rate": 2.8215832077873928e-05, + "loss": 0.549, + "step": 29900 + }, + { + "ar_loss": 0.3879, + "epoch": 4.305209774089442, + "fm_loss": 0.1616, + "grad_norm": 4.434958457946777, + "learning_rate": 2.8095975706817283e-05, + "loss": 0.5495, + "step": 30000 + }, + { + "ar_loss": 0.3873, + "epoch": 4.015372790161415, + "fm_loss": 0.16, + "grad_norm": 2.223679542541504, + "learning_rate": 2.7964958224364322e-05, + "loss": 0.5474, + "step": 30100 + }, + { + "ar_loss": 0.3861, + "epoch": 4.0307455803228285, + "fm_loss": 0.1608, + "grad_norm": 2.2263758182525635, + "learning_rate": 2.7844916800976393e-05, + "loss": 0.5468, + "step": 30200 + }, + { + "ar_loss": 0.385, + "epoch": 4.046118370484243, + "fm_loss": 0.1591, + "grad_norm": 3.2188937664031982, + "learning_rate": 2.772480888770234e-05, + "loss": 0.5442, + "step": 30300 + }, + { + "ar_loss": 0.3838, + "epoch": 4.061491160645657, + "fm_loss": 0.1605, + "grad_norm": 2.1031410694122314, + "learning_rate": 2.7604637291640594e-05, + "loss": 0.5443, + "step": 30400 + }, + { + "ar_loss": 0.3874, + "epoch": 4.076863950807072, + "fm_loss": 0.1604, + "grad_norm": 3.646202564239502, + "learning_rate": 2.748440482137793e-05, + "loss": 0.5478, + "step": 30500 + }, + { + "ar_loss": 0.3888, + "epoch": 4.092236740968485, + "fm_loss": 0.16, + "grad_norm": 3.0315663814544678, + "learning_rate": 2.7364114286923865e-05, + "loss": 0.5488, + "step": 30600 + }, + { + "ar_loss": 0.386, + "epoch": 4.1076095311299, + "fm_loss": 0.1597, + "grad_norm": 2.8235671520233154, + "learning_rate": 2.7243768499644946e-05, + "loss": 0.5457, + "step": 30700 + }, + { + "ar_loss": 0.3889, + "epoch": 4.122982321291314, + "fm_loss": 0.161, + "grad_norm": 2.2409422397613525, + "learning_rate": 2.7123370272199055e-05, + "loss": 0.55, + "step": 30800 + }, + { + "ar_loss": 0.3876, + "epoch": 4.138355111452729, + "fm_loss": 0.1612, + "grad_norm": 2.19675874710083, + "learning_rate": 2.700292241846971e-05, + "loss": 0.5488, + "step": 30900 + }, + { + "ar_loss": 0.3871, + "epoch": 4.153727901614143, + "fm_loss": 0.1601, + "grad_norm": 2.340182304382324, + "learning_rate": 2.6882427753500245e-05, + "loss": 0.5471, + "step": 31000 + }, + { + "ar_loss": 0.3879, + "epoch": 4.169100691775557, + "fm_loss": 0.16, + "grad_norm": 3.1240203380584717, + "learning_rate": 2.676188909342801e-05, + "loss": 0.5479, + "step": 31100 + }, + { + "ar_loss": 0.3832, + "epoch": 4.184473481936972, + "fm_loss": 0.1603, + "grad_norm": 2.1297569274902344, + "learning_rate": 2.664130925541865e-05, + "loss": 0.5436, + "step": 31200 + }, + { + "ar_loss": 0.3889, + "epoch": 4.199846272098386, + "fm_loss": 0.1599, + "grad_norm": 3.824352264404297, + "learning_rate": 2.6520691057600155e-05, + "loss": 0.5488, + "step": 31300 + }, + { + "ar_loss": 0.388, + "epoch": 4.2152190622598, + "fm_loss": 0.1605, + "grad_norm": 1.8939049243927002, + "learning_rate": 2.6400037318997046e-05, + "loss": 0.5485, + "step": 31400 + }, + { + "ar_loss": 0.3886, + "epoch": 4.230591852421214, + "fm_loss": 0.1609, + "grad_norm": 2.476659059524536, + "learning_rate": 2.6279350859464502e-05, + "loss": 0.5495, + "step": 31500 + }, + { + "ar_loss": 0.3888, + "epoch": 4.245964642582629, + "fm_loss": 0.1602, + "grad_norm": 3.117105007171631, + "learning_rate": 2.6158634499622425e-05, + "loss": 0.549, + "step": 31600 + }, + { + "ar_loss": 0.386, + "epoch": 4.261337432744043, + "fm_loss": 0.1603, + "grad_norm": 2.931809663772583, + "learning_rate": 2.6037891060789514e-05, + "loss": 0.5464, + "step": 31700 + }, + { + "ar_loss": 0.382, + "epoch": 4.276710222905457, + "fm_loss": 0.1591, + "grad_norm": 2.1063570976257324, + "learning_rate": 2.5917123364917378e-05, + "loss": 0.5411, + "step": 31800 + }, + { + "ar_loss": 0.3863, + "epoch": 4.292083013066872, + "fm_loss": 0.1601, + "grad_norm": 4.796857833862305, + "learning_rate": 2.5796334234524533e-05, + "loss": 0.5463, + "step": 31900 + }, + { + "ar_loss": 0.3849, + "epoch": 4.307455803228286, + "fm_loss": 0.1612, + "grad_norm": 2.840190887451172, + "learning_rate": 2.567552649263044e-05, + "loss": 0.5461, + "step": 32000 + }, + { + "ar_loss": 0.3876, + "epoch": 4.3228285933897, + "fm_loss": 0.1686, + "grad_norm": 2.64380145072937, + "learning_rate": 2.5554702962689563e-05, + "loss": 0.5562, + "step": 32100 + }, + { + "ar_loss": 0.3855, + "epoch": 4.338201383551114, + "fm_loss": 0.1599, + "grad_norm": 4.3930535316467285, + "learning_rate": 2.5433866468525342e-05, + "loss": 0.5453, + "step": 32200 + }, + { + "ar_loss": 0.3825, + "epoch": 4.353574173712529, + "fm_loss": 0.1587, + "grad_norm": 2.1165313720703125, + "learning_rate": 2.531301983426419e-05, + "loss": 0.5412, + "step": 32300 + }, + { + "ar_loss": 0.3849, + "epoch": 4.3689469638739435, + "fm_loss": 0.1601, + "grad_norm": 3.832287073135376, + "learning_rate": 2.519216588426955e-05, + "loss": 0.5449, + "step": 32400 + }, + { + "ar_loss": 0.3872, + "epoch": 4.384319754035357, + "fm_loss": 0.16, + "grad_norm": 2.8910598754882812, + "learning_rate": 2.507130744307581e-05, + "loss": 0.5472, + "step": 32500 + }, + { + "ar_loss": 0.3847, + "epoch": 4.399692544196772, + "fm_loss": 0.1603, + "grad_norm": 2.692793607711792, + "learning_rate": 2.4950447335322335e-05, + "loss": 0.545, + "step": 32600 + }, + { + "ar_loss": 0.3831, + "epoch": 4.415065334358186, + "fm_loss": 0.1595, + "grad_norm": 2.35683536529541, + "learning_rate": 2.482958838568746e-05, + "loss": 0.5426, + "step": 32700 + }, + { + "ar_loss": 0.3861, + "epoch": 4.4304381245196005, + "fm_loss": 0.159, + "grad_norm": 1.9129565954208374, + "learning_rate": 2.4708733418822427e-05, + "loss": 0.5451, + "step": 32800 + }, + { + "ar_loss": 0.3868, + "epoch": 4.445810914681014, + "fm_loss": 0.1599, + "grad_norm": 4.045548915863037, + "learning_rate": 2.4587885259285396e-05, + "loss": 0.5466, + "step": 32900 + }, + { + "ar_loss": 0.3834, + "epoch": 4.461183704842429, + "fm_loss": 0.1601, + "grad_norm": 2.0863168239593506, + "learning_rate": 2.446704673147544e-05, + "loss": 0.5436, + "step": 33000 + }, + { + "ar_loss": 0.3837, + "epoch": 4.476556495003843, + "fm_loss": 0.1596, + "grad_norm": 2.455124855041504, + "learning_rate": 2.4346220659566513e-05, + "loss": 0.5433, + "step": 33100 + }, + { + "ar_loss": 0.3841, + "epoch": 4.4919292851652575, + "fm_loss": 0.1593, + "grad_norm": 2.560342311859131, + "learning_rate": 2.4225409867441483e-05, + "loss": 0.5434, + "step": 33200 + }, + { + "ar_loss": 0.3813, + "epoch": 4.507302075326672, + "fm_loss": 0.1589, + "grad_norm": 13.45487117767334, + "learning_rate": 2.4104617178626075e-05, + "loss": 0.5402, + "step": 33300 + }, + { + "ar_loss": 0.3854, + "epoch": 4.522674865488086, + "fm_loss": 0.1606, + "grad_norm": 1.9280191659927368, + "learning_rate": 2.3983845416222943e-05, + "loss": 0.546, + "step": 33400 + }, + { + "ar_loss": 0.3861, + "epoch": 4.538047655649501, + "fm_loss": 0.1599, + "grad_norm": 4.483201503753662, + "learning_rate": 2.386309740284562e-05, + "loss": 0.5461, + "step": 33500 + }, + { + "ar_loss": 0.3837, + "epoch": 4.553420445810914, + "fm_loss": 0.1597, + "grad_norm": 2.269436836242676, + "learning_rate": 2.3742375960552628e-05, + "loss": 0.5434, + "step": 33600 + }, + { + "ar_loss": 0.3845, + "epoch": 4.568793235972329, + "fm_loss": 0.1595, + "grad_norm": 2.5172126293182373, + "learning_rate": 2.3621683910781458e-05, + "loss": 0.544, + "step": 33700 + }, + { + "ar_loss": 0.3859, + "epoch": 4.584166026133743, + "fm_loss": 0.1586, + "grad_norm": 29.834903717041016, + "learning_rate": 2.3501024074282665e-05, + "loss": 0.5445, + "step": 33800 + }, + { + "ar_loss": 0.3864, + "epoch": 4.599538816295158, + "fm_loss": 0.16, + "grad_norm": 2.567058563232422, + "learning_rate": 2.3380399271053953e-05, + "loss": 0.5464, + "step": 33900 + }, + { + "ar_loss": 0.3836, + "epoch": 4.614911606456571, + "fm_loss": 0.1598, + "grad_norm": 2.2282016277313232, + "learning_rate": 2.3259812320274206e-05, + "loss": 0.5434, + "step": 34000 + }, + { + "ar_loss": 0.3875, + "epoch": 4.630284396617986, + "fm_loss": 0.1595, + "grad_norm": 2.367128849029541, + "learning_rate": 2.313926604023767e-05, + "loss": 0.5469, + "step": 34100 + }, + { + "ar_loss": 0.3826, + "epoch": 4.645657186779401, + "fm_loss": 0.1587, + "grad_norm": 2.4427452087402344, + "learning_rate": 2.3018763248288043e-05, + "loss": 0.5413, + "step": 34200 + }, + { + "ar_loss": 0.3856, + "epoch": 4.661029976940815, + "fm_loss": 0.1597, + "grad_norm": 3.8236427307128906, + "learning_rate": 2.289830676075265e-05, + "loss": 0.5453, + "step": 34300 + }, + { + "ar_loss": 0.3857, + "epoch": 4.676402767102229, + "fm_loss": 0.16, + "grad_norm": 3.789658546447754, + "learning_rate": 2.2777899392876596e-05, + "loss": 0.5457, + "step": 34400 + }, + { + "ar_loss": 0.3843, + "epoch": 4.691775557263643, + "fm_loss": 0.1598, + "grad_norm": 7.402520656585693, + "learning_rate": 2.265754395875703e-05, + "loss": 0.5441, + "step": 34500 + }, + { + "ar_loss": 0.3828, + "epoch": 4.707148347425058, + "fm_loss": 0.1606, + "grad_norm": 2.951808452606201, + "learning_rate": 2.2537243271277286e-05, + "loss": 0.5434, + "step": 34600 + }, + { + "ar_loss": 0.3828, + "epoch": 4.722521137586472, + "fm_loss": 0.1602, + "grad_norm": 2.665632724761963, + "learning_rate": 2.241700014204121e-05, + "loss": 0.543, + "step": 34700 + }, + { + "ar_loss": 0.3818, + "epoch": 4.737893927747886, + "fm_loss": 0.1606, + "grad_norm": 2.390002727508545, + "learning_rate": 2.2296817381307425e-05, + "loss": 0.5423, + "step": 34800 + }, + { + "ar_loss": 0.3864, + "epoch": 4.753266717909301, + "fm_loss": 0.1599, + "grad_norm": 5.915409564971924, + "learning_rate": 2.2176697797923653e-05, + "loss": 0.5463, + "step": 34900 + }, + { + "ar_loss": 0.3823, + "epoch": 4.768639508070715, + "fm_loss": 0.165, + "grad_norm": 3.208481550216675, + "learning_rate": 2.205664419926106e-05, + "loss": 0.5473, + "step": 35000 + }, + { + "ar_loss": 0.384, + "epoch": 4.784012298232129, + "fm_loss": 0.1604, + "grad_norm": 2.6016440391540527, + "learning_rate": 2.1936659391148682e-05, + "loss": 0.5444, + "step": 35100 + }, + { + "ar_loss": 0.3839, + "epoch": 4.799385088393543, + "fm_loss": 0.16, + "grad_norm": 7.024256229400635, + "learning_rate": 2.1816746177807777e-05, + "loss": 0.5438, + "step": 35200 + }, + { + "ar_loss": 0.3835, + "epoch": 4.814757878554958, + "fm_loss": 0.1604, + "grad_norm": 2.983032464981079, + "learning_rate": 2.169690736178636e-05, + "loss": 0.5439, + "step": 35300 + }, + { + "ar_loss": 0.3857, + "epoch": 4.830130668716372, + "fm_loss": 0.1596, + "grad_norm": 3.4045064449310303, + "learning_rate": 2.1577145743893652e-05, + "loss": 0.5453, + "step": 35400 + }, + { + "ar_loss": 0.3799, + "epoch": 4.845503458877786, + "fm_loss": 0.1606, + "grad_norm": 3.3932929039001465, + "learning_rate": 2.1457464123134654e-05, + "loss": 0.5406, + "step": 35500 + }, + { + "ar_loss": 0.3815, + "epoch": 4.860876249039201, + "fm_loss": 0.1618, + "grad_norm": 5.131358623504639, + "learning_rate": 2.1337865296644693e-05, + "loss": 0.5433, + "step": 35600 + }, + { + "ar_loss": 0.3839, + "epoch": 4.876249039200615, + "fm_loss": 0.1597, + "grad_norm": 2.781935453414917, + "learning_rate": 2.1218352059624125e-05, + "loss": 0.5436, + "step": 35700 + }, + { + "ar_loss": 0.3836, + "epoch": 4.8916218293620295, + "fm_loss": 0.1593, + "grad_norm": 4.221926689147949, + "learning_rate": 2.1098927205272888e-05, + "loss": 0.5429, + "step": 35800 + }, + { + "ar_loss": 0.3839, + "epoch": 4.906994619523443, + "fm_loss": 0.1595, + "grad_norm": 3.9555509090423584, + "learning_rate": 2.0979593524725326e-05, + "loss": 0.5434, + "step": 35900 + }, + { + "ar_loss": 0.3815, + "epoch": 4.922367409684858, + "fm_loss": 0.1597, + "grad_norm": 2.448594331741333, + "learning_rate": 2.0860353806984917e-05, + "loss": 0.5412, + "step": 36000 + }, + { + "ar_loss": 0.3813, + "epoch": 4.937740199846272, + "fm_loss": 0.1604, + "grad_norm": 2.342075824737549, + "learning_rate": 2.074121083885907e-05, + "loss": 0.5417, + "step": 36100 + }, + { + "ar_loss": 0.3825, + "epoch": 4.9531129900076865, + "fm_loss": 0.1601, + "grad_norm": 7.287113666534424, + "learning_rate": 2.0622167404894034e-05, + "loss": 0.5426, + "step": 36200 + }, + { + "ar_loss": 0.3835, + "epoch": 4.9684857801691, + "fm_loss": 0.1583, + "grad_norm": 15.53018569946289, + "learning_rate": 2.0503226287309786e-05, + "loss": 0.5418, + "step": 36300 + }, + { + "ar_loss": 0.3832, + "epoch": 4.983858570330515, + "fm_loss": 0.1783, + "grad_norm": 36.44465637207031, + "learning_rate": 2.0384390265935027e-05, + "loss": 0.5614, + "step": 36400 + }, + { + "ar_loss": 0.382, + "epoch": 4.99923136049193, + "fm_loss": 0.1595, + "grad_norm": 4.1146745681762695, + "learning_rate": 2.02656621181422e-05, + "loss": 0.5415, + "step": 36500 + }, + { + "ar_loss": 0.3779, + "epoch": 5.014604150653343, + "fm_loss": 0.1594, + "grad_norm": 2.2293853759765625, + "learning_rate": 2.0147044618782585e-05, + "loss": 0.5373, + "step": 36600 + }, + { + "ar_loss": 0.3786, + "epoch": 5.029976940814758, + "fm_loss": 0.1599, + "grad_norm": 3.3400254249572754, + "learning_rate": 2.0028540540121444e-05, + "loss": 0.5386, + "step": 36700 + }, + { + "ar_loss": 0.3761, + "epoch": 5.045349730976172, + "fm_loss": 0.1594, + "grad_norm": 2.30407452583313, + "learning_rate": 1.9910152651773235e-05, + "loss": 0.5355, + "step": 36800 + }, + { + "ar_loss": 0.3785, + "epoch": 5.060722521137587, + "fm_loss": 0.1592, + "grad_norm": 2.3268890380859375, + "learning_rate": 1.9791883720636864e-05, + "loss": 0.5376, + "step": 36900 + }, + { + "ar_loss": 0.3771, + "epoch": 5.076095311299, + "fm_loss": 0.1583, + "grad_norm": 4.636923789978027, + "learning_rate": 1.967373651083106e-05, + "loss": 0.5354, + "step": 37000 + }, + { + "ar_loss": 0.3741, + "epoch": 5.091468101460415, + "fm_loss": 0.1595, + "grad_norm": 8.473295211791992, + "learning_rate": 1.9555713783629714e-05, + "loss": 0.5336, + "step": 37100 + }, + { + "ar_loss": 0.3763, + "epoch": 5.10684089162183, + "fm_loss": 0.1588, + "grad_norm": 2.227837562561035, + "learning_rate": 1.9437818297397403e-05, + "loss": 0.5351, + "step": 37200 + }, + { + "ar_loss": 0.3778, + "epoch": 5.1222136817832435, + "fm_loss": 0.1597, + "grad_norm": 2.5238685607910156, + "learning_rate": 1.9320052807524874e-05, + "loss": 0.5376, + "step": 37300 + }, + { + "ar_loss": 0.3799, + "epoch": 5.137586471944658, + "fm_loss": 0.16, + "grad_norm": 3.3944637775421143, + "learning_rate": 1.9202420066364678e-05, + "loss": 0.5399, + "step": 37400 + }, + { + "ar_loss": 0.3767, + "epoch": 5.152959262106072, + "fm_loss": 0.1584, + "grad_norm": 16.466928482055664, + "learning_rate": 1.908492282316683e-05, + "loss": 0.5351, + "step": 37500 + }, + { + "ar_loss": 0.3743, + "epoch": 5.168332052267487, + "fm_loss": 0.1591, + "grad_norm": 2.3075056076049805, + "learning_rate": 1.8967563824014563e-05, + "loss": 0.5334, + "step": 37600 + }, + { + "ar_loss": 0.3821, + "epoch": 5.1837048424289005, + "fm_loss": 0.1595, + "grad_norm": 2.2367005348205566, + "learning_rate": 1.8850345811760152e-05, + "loss": 0.5415, + "step": 37700 + }, + { + "ar_loss": 0.3746, + "epoch": 5.199077632590315, + "fm_loss": 0.1596, + "grad_norm": 1.8881235122680664, + "learning_rate": 1.873327152596077e-05, + "loss": 0.5342, + "step": 37800 + }, + { + "ar_loss": 0.3776, + "epoch": 5.21445042275173, + "fm_loss": 0.1589, + "grad_norm": 2.585137128829956, + "learning_rate": 1.861634370281453e-05, + "loss": 0.5365, + "step": 37900 + }, + { + "ar_loss": 0.3767, + "epoch": 5.229823212913144, + "fm_loss": 0.1594, + "grad_norm": 3.3256664276123047, + "learning_rate": 1.849956507509647e-05, + "loss": 0.5361, + "step": 38000 + }, + { + "ar_loss": 0.3767, + "epoch": 5.245196003074558, + "fm_loss": 0.1601, + "grad_norm": 2.189387559890747, + "learning_rate": 1.838293837209472e-05, + "loss": 0.5368, + "step": 38100 + }, + { + "ar_loss": 0.3765, + "epoch": 5.260568793235972, + "fm_loss": 0.1638, + "grad_norm": 2.6017394065856934, + "learning_rate": 1.8266466319546712e-05, + "loss": 0.5403, + "step": 38200 + }, + { + "ar_loss": 0.3755, + "epoch": 5.275941583397387, + "fm_loss": 0.1598, + "grad_norm": 3.207014322280884, + "learning_rate": 1.8150151639575466e-05, + "loss": 0.5354, + "step": 38300 + }, + { + "ar_loss": 0.3772, + "epoch": 5.291314373558801, + "fm_loss": 0.1589, + "grad_norm": 3.0959692001342773, + "learning_rate": 1.8033997050625966e-05, + "loss": 0.536, + "step": 38400 + }, + { + "ar_loss": 0.3742, + "epoch": 5.306687163720215, + "fm_loss": 0.1587, + "grad_norm": 2.948103666305542, + "learning_rate": 1.791800526740165e-05, + "loss": 0.5329, + "step": 38500 + }, + { + "ar_loss": 0.3768, + "epoch": 5.322059953881629, + "fm_loss": 0.1633, + "grad_norm": 3.3506641387939453, + "learning_rate": 1.7802179000800927e-05, + "loss": 0.5401, + "step": 38600 + }, + { + "ar_loss": 0.3748, + "epoch": 5.337432744043044, + "fm_loss": 0.1593, + "grad_norm": 1.7779144048690796, + "learning_rate": 1.768652095785385e-05, + "loss": 0.5342, + "step": 38700 + }, + { + "ar_loss": 0.3771, + "epoch": 5.3528055342044585, + "fm_loss": 0.1593, + "grad_norm": 3.081190347671509, + "learning_rate": 1.7571033841658844e-05, + "loss": 0.5365, + "step": 38800 + }, + { + "ar_loss": 0.3769, + "epoch": 5.368178324365872, + "fm_loss": 0.159, + "grad_norm": 2.642840623855591, + "learning_rate": 1.7455720351319516e-05, + "loss": 0.5359, + "step": 38900 + }, + { + "ar_loss": 0.3759, + "epoch": 5.383551114527287, + "fm_loss": 0.1596, + "grad_norm": 3.235140323638916, + "learning_rate": 1.734058318188158e-05, + "loss": 0.5355, + "step": 39000 + }, + { + "ar_loss": 0.3728, + "epoch": 5.398923904688701, + "fm_loss": 0.1596, + "grad_norm": 2.3143417835235596, + "learning_rate": 1.7225625024269877e-05, + "loss": 0.5324, + "step": 39100 + }, + { + "ar_loss": 0.3762, + "epoch": 5.414296694850115, + "fm_loss": 0.1592, + "grad_norm": 2.0119452476501465, + "learning_rate": 1.711084856522548e-05, + "loss": 0.5354, + "step": 39200 + }, + { + "ar_loss": 0.3725, + "epoch": 5.429669485011529, + "fm_loss": 0.1596, + "grad_norm": 2.9905428886413574, + "learning_rate": 1.6996256487242894e-05, + "loss": 0.5321, + "step": 39300 + }, + { + "ar_loss": 0.3746, + "epoch": 5.445042275172944, + "fm_loss": 0.1635, + "grad_norm": 2.617199659347534, + "learning_rate": 1.6881851468507358e-05, + "loss": 0.5382, + "step": 39400 + }, + { + "ar_loss": 0.3746, + "epoch": 5.460415065334358, + "fm_loss": 0.1599, + "grad_norm": 2.1979682445526123, + "learning_rate": 1.6767636182832292e-05, + "loss": 0.5345, + "step": 39500 + }, + { + "ar_loss": 0.3768, + "epoch": 5.475787855495772, + "fm_loss": 0.1586, + "grad_norm": 3.3658335208892822, + "learning_rate": 1.6653613299596748e-05, + "loss": 0.5354, + "step": 39600 + }, + { + "ar_loss": 0.3753, + "epoch": 5.491160645657187, + "fm_loss": 0.1583, + "grad_norm": 3.347707748413086, + "learning_rate": 1.6539785483683063e-05, + "loss": 0.5336, + "step": 39700 + }, + { + "ar_loss": 0.3786, + "epoch": 5.506533435818601, + "fm_loss": 0.1625, + "grad_norm": 2.0319979190826416, + "learning_rate": 1.6426155395414555e-05, + "loss": 0.5411, + "step": 39800 + }, + { + "ar_loss": 0.3765, + "epoch": 5.521906225980016, + "fm_loss": 0.16, + "grad_norm": 2.5716593265533447, + "learning_rate": 1.631272569049336e-05, + "loss": 0.5365, + "step": 39900 + }, + { + "ar_loss": 0.3767, + "epoch": 5.537279016141429, + "fm_loss": 0.1596, + "grad_norm": 2.94095516204834, + "learning_rate": 1.6199499019938363e-05, + "loss": 0.5364, + "step": 40000 + }, + { + "ar_loss": 0.3755, + "epoch": 6.015372790161415, + "fm_loss": 0.1588, + "grad_norm": 2.66264271736145, + "learning_rate": 1.6086478030023248e-05, + "loss": 0.5343, + "step": 40100 + }, + { + "ar_loss": 0.3771, + "epoch": 6.0307455803228285, + "fm_loss": 0.1577, + "grad_norm": 3.340590476989746, + "learning_rate": 1.597366536221462e-05, + "loss": 0.5349, + "step": 40200 + }, + { + "ar_loss": 0.3748, + "epoch": 6.046118370484243, + "fm_loss": 0.1597, + "grad_norm": 2.351405620574951, + "learning_rate": 1.5861063653110292e-05, + "loss": 0.5345, + "step": 40300 + }, + { + "ar_loss": 0.3747, + "epoch": 6.061491160645657, + "fm_loss": 0.1608, + "grad_norm": 3.0236189365386963, + "learning_rate": 1.5748675534377683e-05, + "loss": 0.5355, + "step": 40400 + }, + { + "ar_loss": 0.3763, + "epoch": 6.076863950807072, + "fm_loss": 0.1588, + "grad_norm": 2.360391139984131, + "learning_rate": 1.563650363269227e-05, + "loss": 0.5351, + "step": 40500 + }, + { + "ar_loss": 0.377, + "epoch": 6.092236740968485, + "fm_loss": 0.1611, + "grad_norm": 2.1837239265441895, + "learning_rate": 1.5524550569676224e-05, + "loss": 0.5381, + "step": 40600 + }, + { + "ar_loss": 0.376, + "epoch": 6.1076095311299, + "fm_loss": 0.1585, + "grad_norm": 3.4195451736450195, + "learning_rate": 1.541281896183715e-05, + "loss": 0.5346, + "step": 40700 + }, + { + "ar_loss": 0.3785, + "epoch": 6.122982321291314, + "fm_loss": 0.1592, + "grad_norm": 3.0176382064819336, + "learning_rate": 1.5301311420506897e-05, + "loss": 0.5377, + "step": 40800 + }, + { + "ar_loss": 0.3735, + "epoch": 6.138355111452729, + "fm_loss": 0.1588, + "grad_norm": 3.1062843799591064, + "learning_rate": 1.5190030551780564e-05, + "loss": 0.5323, + "step": 40900 + }, + { + "ar_loss": 0.374, + "epoch": 6.153727901614143, + "fm_loss": 0.1592, + "grad_norm": 2.2580771446228027, + "learning_rate": 1.5078978956455581e-05, + "loss": 0.5332, + "step": 41000 + }, + { + "ar_loss": 0.3748, + "epoch": 6.169100691775557, + "fm_loss": 0.1584, + "grad_norm": 3.3398499488830566, + "learning_rate": 1.4968159229970914e-05, + "loss": 0.5333, + "step": 41100 + }, + { + "ar_loss": 0.377, + "epoch": 6.184473481936972, + "fm_loss": 0.1581, + "grad_norm": 4.929275989532471, + "learning_rate": 1.4857573962346411e-05, + "loss": 0.5351, + "step": 41200 + }, + { + "ar_loss": 0.3762, + "epoch": 6.199846272098386, + "fm_loss": 0.16, + "grad_norm": 1.9189740419387817, + "learning_rate": 1.4747225738122278e-05, + "loss": 0.5362, + "step": 41300 + }, + { + "ar_loss": 0.3733, + "epoch": 6.2152190622598, + "fm_loss": 0.1599, + "grad_norm": 2.381868600845337, + "learning_rate": 1.4637117136298673e-05, + "loss": 0.5333, + "step": 41400 + }, + { + "ar_loss": 0.3741, + "epoch": 6.230591852421214, + "fm_loss": 0.1591, + "grad_norm": 4.60919713973999, + "learning_rate": 1.452725073027541e-05, + "loss": 0.5332, + "step": 41500 + }, + { + "ar_loss": 0.3768, + "epoch": 6.245964642582629, + "fm_loss": 0.1579, + "grad_norm": 8.782063484191895, + "learning_rate": 1.4417629087791868e-05, + "loss": 0.5347, + "step": 41600 + }, + { + "ar_loss": 0.3755, + "epoch": 6.261337432744043, + "fm_loss": 0.1589, + "grad_norm": 3.957836627960205, + "learning_rate": 1.4308254770866886e-05, + "loss": 0.5344, + "step": 41700 + }, + { + "ar_loss": 0.3779, + "epoch": 6.276710222905457, + "fm_loss": 0.1578, + "grad_norm": 4.637214183807373, + "learning_rate": 1.4199130335738981e-05, + "loss": 0.5357, + "step": 41800 + }, + { + "ar_loss": 0.3745, + "epoch": 6.292083013066872, + "fm_loss": 0.1589, + "grad_norm": 4.1944804191589355, + "learning_rate": 1.409025833280655e-05, + "loss": 0.5334, + "step": 41900 + }, + { + "ar_loss": 0.3738, + "epoch": 6.307455803228286, + "fm_loss": 0.1589, + "grad_norm": 3.7318429946899414, + "learning_rate": 1.3981641306568299e-05, + "loss": 0.5328, + "step": 42000 + }, + { + "ar_loss": 0.3754, + "epoch": 6.3228285933897, + "fm_loss": 0.16, + "grad_norm": 1.8556092977523804, + "learning_rate": 1.3873281795563737e-05, + "loss": 0.5355, + "step": 42100 + }, + { + "ar_loss": 0.3718, + "epoch": 6.338201383551114, + "fm_loss": 0.1584, + "grad_norm": 3.1865854263305664, + "learning_rate": 1.3765182332313859e-05, + "loss": 0.5302, + "step": 42200 + }, + { + "ar_loss": 0.3778, + "epoch": 6.353574173712529, + "fm_loss": 0.1593, + "grad_norm": 2.2085890769958496, + "learning_rate": 1.3657345443261967e-05, + "loss": 0.5371, + "step": 42300 + }, + { + "ar_loss": 0.377, + "epoch": 6.3689469638739435, + "fm_loss": 0.1592, + "grad_norm": 4.625204563140869, + "learning_rate": 1.3549773648714631e-05, + "loss": 0.5363, + "step": 42400 + }, + { + "ar_loss": 0.3775, + "epoch": 6.384319754035357, + "fm_loss": 0.1581, + "grad_norm": 2.3030028343200684, + "learning_rate": 1.3442469462782741e-05, + "loss": 0.5356, + "step": 42500 + }, + { + "ar_loss": 0.3768, + "epoch": 6.399692544196772, + "fm_loss": 0.1591, + "grad_norm": 6.936190128326416, + "learning_rate": 1.3335435393322826e-05, + "loss": 0.5359, + "step": 42600 + }, + { + "ar_loss": 0.374, + "epoch": 6.415065334358186, + "fm_loss": 0.1591, + "grad_norm": 3.345165252685547, + "learning_rate": 1.322867394187836e-05, + "loss": 0.5331, + "step": 42700 + }, + { + "ar_loss": 0.377, + "epoch": 6.4304381245196005, + "fm_loss": 0.1595, + "grad_norm": 3.3900644779205322, + "learning_rate": 1.3122187603621356e-05, + "loss": 0.5366, + "step": 42800 + }, + { + "ar_loss": 0.3728, + "epoch": 6.445810914681014, + "fm_loss": 0.1582, + "grad_norm": 2.5638771057128906, + "learning_rate": 1.3015978867293996e-05, + "loss": 0.531, + "step": 42900 + }, + { + "ar_loss": 0.377, + "epoch": 6.461183704842429, + "fm_loss": 0.1588, + "grad_norm": 2.6694748401641846, + "learning_rate": 1.2910050215150526e-05, + "loss": 0.5358, + "step": 43000 + }, + { + "ar_loss": 0.3739, + "epoch": 6.476556495003843, + "fm_loss": 0.1593, + "grad_norm": 4.689158916473389, + "learning_rate": 1.2804404122899197e-05, + "loss": 0.5332, + "step": 43100 + }, + { + "ar_loss": 0.3763, + "epoch": 6.4919292851652575, + "fm_loss": 0.1589, + "grad_norm": 2.5251529216766357, + "learning_rate": 1.2699043059644444e-05, + "loss": 0.5352, + "step": 43200 + }, + { + "ar_loss": 0.3754, + "epoch": 6.507302075326672, + "fm_loss": 0.1582, + "grad_norm": 2.896928548812866, + "learning_rate": 1.2593969487829133e-05, + "loss": 0.5337, + "step": 43300 + }, + { + "ar_loss": 0.3782, + "epoch": 6.522674865488086, + "fm_loss": 0.1589, + "grad_norm": 8.51708698272705, + "learning_rate": 1.2489185863177032e-05, + "loss": 0.5371, + "step": 43400 + }, + { + "ar_loss": 0.3763, + "epoch": 6.538047655649501, + "fm_loss": 0.1584, + "grad_norm": 2.936885356903076, + "learning_rate": 1.2384694634635433e-05, + "loss": 0.5347, + "step": 43500 + }, + { + "ar_loss": 0.3744, + "epoch": 6.553420445810914, + "fm_loss": 0.1592, + "grad_norm": 2.76309871673584, + "learning_rate": 1.2280498244317883e-05, + "loss": 0.5337, + "step": 43600 + }, + { + "ar_loss": 0.3742, + "epoch": 6.568793235972329, + "fm_loss": 0.1586, + "grad_norm": 2.100710868835449, + "learning_rate": 1.2176599127447147e-05, + "loss": 0.5328, + "step": 43700 + }, + { + "ar_loss": 0.3777, + "epoch": 6.584166026133743, + "fm_loss": 0.1588, + "grad_norm": 2.9072883129119873, + "learning_rate": 1.2072999712298242e-05, + "loss": 0.5365, + "step": 43800 + }, + { + "ar_loss": 0.3757, + "epoch": 6.599538816295158, + "fm_loss": 0.1627, + "grad_norm": 2.924342393875122, + "learning_rate": 1.196970242014176e-05, + "loss": 0.5384, + "step": 43900 + }, + { + "ar_loss": 0.3763, + "epoch": 6.614911606456571, + "fm_loss": 0.1586, + "grad_norm": 2.559717893600464, + "learning_rate": 1.1866709665187205e-05, + "loss": 0.5349, + "step": 44000 + }, + { + "ar_loss": 0.3732, + "epoch": 6.630284396617986, + "fm_loss": 0.1588, + "grad_norm": 6.084836006164551, + "learning_rate": 1.1764023854526593e-05, + "loss": 0.532, + "step": 44100 + }, + { + "ar_loss": 0.3753, + "epoch": 6.645657186779401, + "fm_loss": 0.1588, + "grad_norm": 2.4638426303863525, + "learning_rate": 1.1661647388078211e-05, + "loss": 0.5341, + "step": 44200 + }, + { + "ar_loss": 0.3742, + "epoch": 6.661029976940815, + "fm_loss": 0.1594, + "grad_norm": 2.2063241004943848, + "learning_rate": 1.1559582658530526e-05, + "loss": 0.5335, + "step": 44300 + }, + { + "ar_loss": 0.3743, + "epoch": 6.676402767102229, + "fm_loss": 0.1591, + "grad_norm": 2.847790479660034, + "learning_rate": 1.1457832051286235e-05, + "loss": 0.5333, + "step": 44400 + }, + { + "ar_loss": 0.3724, + "epoch": 6.691775557263643, + "fm_loss": 0.1591, + "grad_norm": 5.002224445343018, + "learning_rate": 1.1356397944406566e-05, + "loss": 0.5315, + "step": 44500 + }, + { + "ar_loss": 0.3739, + "epoch": 6.707148347425058, + "fm_loss": 0.1592, + "grad_norm": 2.6647820472717285, + "learning_rate": 1.125528270855564e-05, + "loss": 0.5331, + "step": 44600 + }, + { + "ar_loss": 0.3758, + "epoch": 6.722521137586472, + "fm_loss": 0.1583, + "grad_norm": 2.1632769107818604, + "learning_rate": 1.1154488706945104e-05, + "loss": 0.5341, + "step": 44700 + }, + { + "ar_loss": 0.3761, + "epoch": 6.737893927747886, + "fm_loss": 0.1589, + "grad_norm": 2.196767568588257, + "learning_rate": 1.105401829527889e-05, + "loss": 0.535, + "step": 44800 + }, + { + "ar_loss": 0.3755, + "epoch": 6.753266717909301, + "fm_loss": 0.1585, + "grad_norm": 3.3876090049743652, + "learning_rate": 1.0953873821698153e-05, + "loss": 0.534, + "step": 44900 + }, + { + "ar_loss": 0.375, + "epoch": 6.768639508070715, + "fm_loss": 0.1587, + "grad_norm": 2.2706313133239746, + "learning_rate": 1.085405762672639e-05, + "loss": 0.5337, + "step": 45000 + }, + { + "ar_loss": 0.3739, + "epoch": 6.784012298232129, + "fm_loss": 0.159, + "grad_norm": 2.5273022651672363, + "learning_rate": 1.0754572043214773e-05, + "loss": 0.5329, + "step": 45100 + }, + { + "ar_loss": 0.3734, + "epoch": 6.799385088393543, + "fm_loss": 0.1596, + "grad_norm": 2.579967498779297, + "learning_rate": 1.0655419396287578e-05, + "loss": 0.533, + "step": 45200 + }, + { + "ar_loss": 0.3717, + "epoch": 6.814757878554958, + "fm_loss": 0.1583, + "grad_norm": 36.556278228759766, + "learning_rate": 1.0556602003287847e-05, + "loss": 0.53, + "step": 45300 + }, + { + "ar_loss": 0.3769, + "epoch": 6.830130668716372, + "fm_loss": 0.1586, + "grad_norm": 5.545002460479736, + "learning_rate": 1.045812217372327e-05, + "loss": 0.5355, + "step": 45400 + }, + { + "ar_loss": 0.3737, + "epoch": 6.845503458877786, + "fm_loss": 0.1587, + "grad_norm": 2.081123113632202, + "learning_rate": 1.0359982209212178e-05, + "loss": 0.5323, + "step": 45500 + }, + { + "ar_loss": 0.3722, + "epoch": 6.860876249039201, + "fm_loss": 0.1591, + "grad_norm": 3.176176071166992, + "learning_rate": 1.0262184403429739e-05, + "loss": 0.5313, + "step": 45600 + }, + { + "ar_loss": 0.3785, + "epoch": 6.876249039200615, + "fm_loss": 0.1584, + "grad_norm": 7.945939540863037, + "learning_rate": 1.016473104205441e-05, + "loss": 0.5369, + "step": 45700 + }, + { + "ar_loss": 0.3752, + "epoch": 6.8916218293620295, + "fm_loss": 0.1589, + "grad_norm": 3.0679447650909424, + "learning_rate": 1.0067624402714438e-05, + "loss": 0.5341, + "step": 45800 + }, + { + "ar_loss": 0.3731, + "epoch": 6.906994619523443, + "fm_loss": 0.159, + "grad_norm": 4.075259685516357, + "learning_rate": 9.970866754934677e-06, + "loss": 0.532, + "step": 45900 + }, + { + "ar_loss": 0.3768, + "epoch": 6.922367409684858, + "fm_loss": 0.1581, + "grad_norm": 2.1700098514556885, + "learning_rate": 9.874460360083537e-06, + "loss": 0.5349, + "step": 46000 + }, + { + "ar_loss": 0.3749, + "epoch": 6.937740199846272, + "fm_loss": 0.1598, + "grad_norm": 2.1425936222076416, + "learning_rate": 9.778407471320134e-06, + "loss": 0.5347, + "step": 46100 + }, + { + "ar_loss": 0.3719, + "epoch": 6.9531129900076865, + "fm_loss": 0.1584, + "grad_norm": 7.313533782958984, + "learning_rate": 9.682710333541622e-06, + "loss": 0.5303, + "step": 46200 + }, + { + "ar_loss": 0.3737, + "epoch": 6.9684857801691, + "fm_loss": 0.1586, + "grad_norm": 1.9248658418655396, + "learning_rate": 9.587371183330723e-06, + "loss": 0.5323, + "step": 46300 + }, + { + "ar_loss": 0.3728, + "epoch": 6.983858570330515, + "fm_loss": 0.1596, + "grad_norm": 2.7737770080566406, + "learning_rate": 9.492392248903505e-06, + "loss": 0.5324, + "step": 46400 + }, + { + "ar_loss": 0.3755, + "epoch": 6.99923136049193, + "fm_loss": 0.1585, + "grad_norm": 2.6747500896453857, + "learning_rate": 9.397775750057206e-06, + "loss": 0.5341, + "step": 46500 + }, + { + "ar_loss": 0.3746, + "epoch": 7.014604150653343, + "fm_loss": 0.1594, + "grad_norm": 6.737438678741455, + "learning_rate": 9.303523898118444e-06, + "loss": 0.534, + "step": 46600 + }, + { + "ar_loss": 0.3741, + "epoch": 7.029976940814758, + "fm_loss": 0.1589, + "grad_norm": 3.4723405838012695, + "learning_rate": 9.209638895891501e-06, + "loss": 0.5331, + "step": 46700 + }, + { + "ar_loss": 0.3735, + "epoch": 7.045349730976172, + "fm_loss": 0.1594, + "grad_norm": 6.770826816558838, + "learning_rate": 9.116122937606835e-06, + "loss": 0.5329, + "step": 46800 + }, + { + "ar_loss": 0.3735, + "epoch": 7.060722521137587, + "fm_loss": 0.1589, + "grad_norm": 11.931926727294922, + "learning_rate": 9.022978208869808e-06, + "loss": 0.5323, + "step": 46900 + }, + { + "ar_loss": 0.3734, + "epoch": 7.076095311299, + "fm_loss": 0.1582, + "grad_norm": 1.7415313720703125, + "learning_rate": 8.930206886609616e-06, + "loss": 0.5316, + "step": 47000 + }, + { + "ar_loss": 0.3734, + "epoch": 7.091468101460415, + "fm_loss": 0.1591, + "grad_norm": 3.001337766647339, + "learning_rate": 8.837811139028377e-06, + "loss": 0.5325, + "step": 47100 + }, + { + "ar_loss": 0.3719, + "epoch": 7.10684089162183, + "fm_loss": 0.1588, + "grad_norm": 2.8166637420654297, + "learning_rate": 8.745793125550477e-06, + "loss": 0.5307, + "step": 47200 + }, + { + "ar_loss": 0.3744, + "epoch": 7.1222136817832435, + "fm_loss": 0.1588, + "grad_norm": 10.10988712310791, + "learning_rate": 8.654154996772114e-06, + "loss": 0.5332, + "step": 47300 + }, + { + "ar_loss": 0.3758, + "epoch": 7.137586471944658, + "fm_loss": 0.1593, + "grad_norm": 2.6831390857696533, + "learning_rate": 8.562898894411017e-06, + "loss": 0.5351, + "step": 47400 + }, + { + "ar_loss": 0.371, + "epoch": 7.152959262106072, + "fm_loss": 0.1592, + "grad_norm": 1.959746241569519, + "learning_rate": 8.472026951256381e-06, + "loss": 0.5302, + "step": 47500 + }, + { + "ar_loss": 0.3719, + "epoch": 7.168332052267487, + "fm_loss": 0.1582, + "grad_norm": 2.5866353511810303, + "learning_rate": 8.38154129111908e-06, + "loss": 0.53, + "step": 47600 + }, + { + "ar_loss": 0.3731, + "epoch": 7.1837048424289005, + "fm_loss": 0.1584, + "grad_norm": 2.6503565311431885, + "learning_rate": 8.29144402878193e-06, + "loss": 0.5314, + "step": 47700 + }, + { + "ar_loss": 0.372, + "epoch": 7.199077632590315, + "fm_loss": 0.1606, + "grad_norm": 3.779261589050293, + "learning_rate": 8.201737269950355e-06, + "loss": 0.5325, + "step": 47800 + }, + { + "ar_loss": 0.371, + "epoch": 7.21445042275173, + "fm_loss": 0.1585, + "grad_norm": 2.4449970722198486, + "learning_rate": 8.112423111203124e-06, + "loss": 0.5295, + "step": 47900 + }, + { + "ar_loss": 0.3738, + "epoch": 7.229823212913144, + "fm_loss": 0.1584, + "grad_norm": 2.2168331146240234, + "learning_rate": 8.023503639943378e-06, + "loss": 0.5322, + "step": 48000 + }, + { + "ar_loss": 0.3758, + "epoch": 7.245196003074558, + "fm_loss": 0.158, + "grad_norm": 2.078047275543213, + "learning_rate": 7.934980934349811e-06, + "loss": 0.5338, + "step": 48100 + }, + { + "ar_loss": 0.3757, + "epoch": 7.260568793235972, + "fm_loss": 0.1583, + "grad_norm": 2.491028308868408, + "learning_rate": 7.846857063328152e-06, + "loss": 0.534, + "step": 48200 + }, + { + "ar_loss": 0.3748, + "epoch": 7.275941583397387, + "fm_loss": 0.1584, + "grad_norm": 3.340336799621582, + "learning_rate": 7.759134086462753e-06, + "loss": 0.5332, + "step": 48300 + }, + { + "ar_loss": 0.3735, + "epoch": 7.291314373558801, + "fm_loss": 0.1584, + "grad_norm": 3.3799524307250977, + "learning_rate": 7.671814053968484e-06, + "loss": 0.5319, + "step": 48400 + }, + { + "ar_loss": 0.3743, + "epoch": 7.306687163720215, + "fm_loss": 0.1587, + "grad_norm": 3.1465113162994385, + "learning_rate": 7.58489900664282e-06, + "loss": 0.533, + "step": 48500 + }, + { + "ar_loss": 0.3723, + "epoch": 7.322059953881629, + "fm_loss": 0.1613, + "grad_norm": 3.0010464191436768, + "learning_rate": 7.49839097581814e-06, + "loss": 0.5336, + "step": 48600 + }, + { + "ar_loss": 0.3727, + "epoch": 7.337432744043044, + "fm_loss": 0.1608, + "grad_norm": 8.70785903930664, + "learning_rate": 7.412291983314237e-06, + "loss": 0.5335, + "step": 48700 + }, + { + "ar_loss": 0.3727, + "epoch": 7.3528055342044585, + "fm_loss": 0.158, + "grad_norm": 3.002357244491577, + "learning_rate": 7.326604041391089e-06, + "loss": 0.5307, + "step": 48800 + }, + { + "ar_loss": 0.3715, + "epoch": 7.368178324365872, + "fm_loss": 0.1585, + "grad_norm": 2.6171886920928955, + "learning_rate": 7.241329152701812e-06, + "loss": 0.53, + "step": 48900 + }, + { + "ar_loss": 0.3744, + "epoch": 7.383551114527287, + "fm_loss": 0.1583, + "grad_norm": 2.283306360244751, + "learning_rate": 7.156469310245864e-06, + "loss": 0.5326, + "step": 49000 + }, + { + "ar_loss": 0.3744, + "epoch": 7.398923904688701, + "fm_loss": 0.1595, + "grad_norm": 6.2937331199646, + "learning_rate": 7.07202649732246e-06, + "loss": 0.5338, + "step": 49100 + }, + { + "ar_loss": 0.3744, + "epoch": 7.414296694850115, + "fm_loss": 0.1601, + "grad_norm": 4.580973148345947, + "learning_rate": 6.988002687484222e-06, + "loss": 0.5345, + "step": 49200 + }, + { + "ar_loss": 0.3728, + "epoch": 7.429669485011529, + "fm_loss": 0.158, + "grad_norm": 2.2313263416290283, + "learning_rate": 6.904399844491058e-06, + "loss": 0.5308, + "step": 49300 + }, + { + "ar_loss": 0.3735, + "epoch": 7.445042275172944, + "fm_loss": 0.1595, + "grad_norm": 3.6399624347686768, + "learning_rate": 6.821219922264252e-06, + "loss": 0.533, + "step": 49400 + }, + { + "ar_loss": 0.3752, + "epoch": 7.460415065334358, + "fm_loss": 0.1585, + "grad_norm": 3.077457904815674, + "learning_rate": 6.73846486484083e-06, + "loss": 0.5337, + "step": 49500 + }, + { + "ar_loss": 0.3743, + "epoch": 7.475787855495772, + "fm_loss": 0.1581, + "grad_norm": 2.1495542526245117, + "learning_rate": 6.6561366063280784e-06, + "loss": 0.5324, + "step": 49600 + }, + { + "ar_loss": 0.3727, + "epoch": 7.491160645657187, + "fm_loss": 0.1584, + "grad_norm": 4.248561382293701, + "learning_rate": 6.574237070858383e-06, + "loss": 0.5311, + "step": 49700 + }, + { + "ar_loss": 0.3725, + "epoch": 7.506533435818601, + "fm_loss": 0.1588, + "grad_norm": 1.8635345697402954, + "learning_rate": 6.492768172544231e-06, + "loss": 0.5314, + "step": 49800 + }, + { + "ar_loss": 0.3739, + "epoch": 7.521906225980016, + "fm_loss": 0.159, + "grad_norm": 4.352962970733643, + "learning_rate": 6.411731815433492e-06, + "loss": 0.5329, + "step": 49900 + }, + { + "ar_loss": 0.3744, + "epoch": 7.537279016141429, + "fm_loss": 0.1597, + "grad_norm": 1.9769291877746582, + "learning_rate": 6.33112989346491e-06, + "loss": 0.5341, + "step": 50000 + }, + { + "ar_loss": 0.3716, + "epoch": 7.552651806302844, + "fm_loss": 0.1728, + "grad_norm": 2.1171858310699463, + "learning_rate": 6.250964290423847e-06, + "loss": 0.5445, + "step": 50100 + }, + { + "ar_loss": 0.3727, + "epoch": 7.568024596464259, + "fm_loss": 0.1592, + "grad_norm": 2.2346949577331543, + "learning_rate": 6.171236879898243e-06, + "loss": 0.5319, + "step": 50200 + }, + { + "ar_loss": 0.3733, + "epoch": 7.5833973866256725, + "fm_loss": 0.1606, + "grad_norm": 3.1455023288726807, + "learning_rate": 6.091949525234838e-06, + "loss": 0.5339, + "step": 50300 + }, + { + "ar_loss": 0.3767, + "epoch": 7.598770176787087, + "fm_loss": 0.1583, + "grad_norm": 4.480979919433594, + "learning_rate": 6.013104079495621e-06, + "loss": 0.535, + "step": 50400 + }, + { + "ar_loss": 0.3706, + "epoch": 7.614142966948501, + "fm_loss": 0.1593, + "grad_norm": 3.0612711906433105, + "learning_rate": 5.934702385414517e-06, + "loss": 0.5299, + "step": 50500 + }, + { + "ar_loss": 0.3729, + "epoch": 7.629515757109916, + "fm_loss": 0.1585, + "grad_norm": 2.3492562770843506, + "learning_rate": 5.856746275354322e-06, + "loss": 0.5313, + "step": 50600 + }, + { + "ar_loss": 0.3735, + "epoch": 7.6448885472713295, + "fm_loss": 0.1585, + "grad_norm": 4.781151294708252, + "learning_rate": 5.77923757126389e-06, + "loss": 0.532, + "step": 50700 + }, + { + "ar_loss": 0.3714, + "epoch": 7.660261337432744, + "fm_loss": 0.159, + "grad_norm": 4.14220666885376, + "learning_rate": 5.702178084635526e-06, + "loss": 0.5304, + "step": 50800 + }, + { + "ar_loss": 0.3755, + "epoch": 7.675634127594158, + "fm_loss": 0.1586, + "grad_norm": 2.7182235717773438, + "learning_rate": 5.625569616462672e-06, + "loss": 0.5341, + "step": 50900 + }, + { + "ar_loss": 0.375, + "epoch": 7.691006917755573, + "fm_loss": 0.1587, + "grad_norm": 2.1390786170959473, + "learning_rate": 5.549413957197797e-06, + "loss": 0.5337, + "step": 51000 + }, + { + "ar_loss": 0.3757, + "epoch": 7.706379707916987, + "fm_loss": 0.1591, + "grad_norm": 15.834261894226074, + "learning_rate": 5.473712886710569e-06, + "loss": 0.5348, + "step": 51100 + }, + { + "ar_loss": 0.3752, + "epoch": 7.721752498078401, + "fm_loss": 0.1587, + "grad_norm": 3.1994242668151855, + "learning_rate": 5.3984681742462435e-06, + "loss": 0.5339, + "step": 51200 + }, + { + "ar_loss": 0.3728, + "epoch": 7.737125288239816, + "fm_loss": 0.1584, + "grad_norm": 3.1270108222961426, + "learning_rate": 5.323681578384318e-06, + "loss": 0.5312, + "step": 51300 + }, + { + "ar_loss": 0.3753, + "epoch": 7.75249807840123, + "fm_loss": 0.1588, + "grad_norm": 4.753870964050293, + "learning_rate": 5.24935484699744e-06, + "loss": 0.5341, + "step": 51400 + }, + { + "ar_loss": 0.3751, + "epoch": 7.767870868562644, + "fm_loss": 0.1591, + "grad_norm": 2.4398574829101562, + "learning_rate": 5.175489717210532e-06, + "loss": 0.5343, + "step": 51500 + }, + { + "ar_loss": 0.3723, + "epoch": 7.783243658724058, + "fm_loss": 0.1603, + "grad_norm": 1.8831534385681152, + "learning_rate": 5.102087915360229e-06, + "loss": 0.5326, + "step": 51600 + }, + { + "ar_loss": 0.3731, + "epoch": 7.798616448885473, + "fm_loss": 0.1579, + "grad_norm": 3.3795456886291504, + "learning_rate": 5.0291511569544955e-06, + "loss": 0.5311, + "step": 51700 + }, + { + "ar_loss": 0.3724, + "epoch": 7.813989239046887, + "fm_loss": 0.159, + "grad_norm": 6.143826007843018, + "learning_rate": 4.956681146632553e-06, + "loss": 0.5314, + "step": 51800 + }, + { + "ar_loss": 0.3737, + "epoch": 7.829362029208301, + "fm_loss": 0.1575, + "grad_norm": 3.264896869659424, + "learning_rate": 4.884679578125029e-06, + "loss": 0.5312, + "step": 51900 + }, + { + "ar_loss": 0.3744, + "epoch": 7.844734819369716, + "fm_loss": 0.1608, + "grad_norm": 2.4835152626037598, + "learning_rate": 4.813148134214396e-06, + "loss": 0.5352, + "step": 52000 + }, + { + "ar_loss": 0.3762, + "epoch": 7.86010760953113, + "fm_loss": 0.1581, + "grad_norm": 2.550990581512451, + "learning_rate": 4.742088486695604e-06, + "loss": 0.5343, + "step": 52100 + }, + { + "ar_loss": 0.3731, + "epoch": 7.875480399692544, + "fm_loss": 0.1575, + "grad_norm": 6.491451740264893, + "learning_rate": 4.671502296337033e-06, + "loss": 0.5307, + "step": 52200 + }, + { + "ar_loss": 0.3729, + "epoch": 7.890853189853958, + "fm_loss": 0.1588, + "grad_norm": 3.544994354248047, + "learning_rate": 4.60139121284168e-06, + "loss": 0.5317, + "step": 52300 + }, + { + "ar_loss": 0.3753, + "epoch": 7.906225980015373, + "fm_loss": 0.1597, + "grad_norm": 2.540804386138916, + "learning_rate": 4.531756874808585e-06, + "loss": 0.535, + "step": 52400 + }, + { + "ar_loss": 0.375, + "epoch": 7.921598770176787, + "fm_loss": 0.1583, + "grad_norm": 3.565279722213745, + "learning_rate": 4.462600909694559e-06, + "loss": 0.5334, + "step": 52500 + }, + { + "ar_loss": 0.3729, + "epoch": 7.936971560338201, + "fm_loss": 0.1593, + "grad_norm": 2.623326539993286, + "learning_rate": 4.393924933776122e-06, + "loss": 0.5321, + "step": 52600 + }, + { + "ar_loss": 0.3763, + "epoch": 7.952344350499615, + "fm_loss": 0.158, + "grad_norm": 3.847337007522583, + "learning_rate": 4.325730552111754e-06, + "loss": 0.5343, + "step": 52700 + }, + { + "ar_loss": 0.3734, + "epoch": 7.96771714066103, + "fm_loss": 0.1577, + "grad_norm": 2.1040573120117188, + "learning_rate": 4.258019358504359e-06, + "loss": 0.5312, + "step": 52800 + }, + { + "ar_loss": 0.3727, + "epoch": 7.983089930822445, + "fm_loss": 0.1585, + "grad_norm": 2.5626471042633057, + "learning_rate": 4.190792935464033e-06, + "loss": 0.5311, + "step": 52900 + }, + { + "ar_loss": 0.3735, + "epoch": 7.998462720983858, + "fm_loss": 0.16, + "grad_norm": 2.844808340072632, + "learning_rate": 4.124052854171068e-06, + "loss": 0.5335, + "step": 53000 + }, + { + "ar_loss": 0.3713, + "epoch": 8.013835511145272, + "fm_loss": 0.1589, + "grad_norm": 3.6812970638275146, + "learning_rate": 4.057800674439227e-06, + "loss": 0.5302, + "step": 53100 + }, + { + "ar_loss": 0.3762, + "epoch": 8.029208301306687, + "fm_loss": 0.158, + "grad_norm": 3.114596128463745, + "learning_rate": 3.992037944679322e-06, + "loss": 0.5342, + "step": 53200 + }, + { + "ar_loss": 0.3731, + "epoch": 8.044581091468102, + "fm_loss": 0.1582, + "grad_norm": 7.761219501495361, + "learning_rate": 3.926766201862972e-06, + "loss": 0.5313, + "step": 53300 + }, + { + "ar_loss": 0.3736, + "epoch": 8.059953881629516, + "fm_loss": 0.1572, + "grad_norm": 2.6152584552764893, + "learning_rate": 3.861986971486725e-06, + "loss": 0.5307, + "step": 53400 + }, + { + "ar_loss": 0.3762, + "epoch": 8.07532667179093, + "fm_loss": 0.1578, + "grad_norm": 10.676106452941895, + "learning_rate": 3.7977017675363826e-06, + "loss": 0.5339, + "step": 53500 + }, + { + "ar_loss": 0.3778, + "epoch": 8.090699461952344, + "fm_loss": 0.1581, + "grad_norm": 2.382251024246216, + "learning_rate": 3.7339120924516276e-06, + "loss": 0.5359, + "step": 53600 + }, + { + "ar_loss": 0.3755, + "epoch": 8.106072252113758, + "fm_loss": 0.1586, + "grad_norm": 2.832390308380127, + "learning_rate": 3.6706194370909025e-06, + "loss": 0.5341, + "step": 53700 + }, + { + "ar_loss": 0.3725, + "epoch": 8.121445042275173, + "fm_loss": 0.1585, + "grad_norm": 2.2325046062469482, + "learning_rate": 3.6078252806965667e-06, + "loss": 0.531, + "step": 53800 + }, + { + "ar_loss": 0.3749, + "epoch": 8.136817832436588, + "fm_loss": 0.1579, + "grad_norm": 2.05092716217041, + "learning_rate": 3.5455310908603293e-06, + "loss": 0.5328, + "step": 53900 + }, + { + "ar_loss": 0.3705, + "epoch": 8.152190622598, + "fm_loss": 0.1623, + "grad_norm": 2.234282970428467, + "learning_rate": 3.4837383234889498e-06, + "loss": 0.5328, + "step": 54000 + }, + { + "ar_loss": 0.3727, + "epoch": 8.167563412759415, + "fm_loss": 0.1582, + "grad_norm": 2.087803363800049, + "learning_rate": 3.422448422770197e-06, + "loss": 0.5309, + "step": 54100 + }, + { + "ar_loss": 0.3737, + "epoch": 8.18293620292083, + "fm_loss": 0.1579, + "grad_norm": 2.0325510501861572, + "learning_rate": 3.3616628211391193e-06, + "loss": 0.5316, + "step": 54200 + }, + { + "ar_loss": 0.3734, + "epoch": 8.198308993082245, + "fm_loss": 0.1578, + "grad_norm": 4.768230438232422, + "learning_rate": 3.3013829392445434e-06, + "loss": 0.5312, + "step": 54300 + }, + { + "ar_loss": 0.371, + "epoch": 8.21368178324366, + "fm_loss": 0.1578, + "grad_norm": 9.634552955627441, + "learning_rate": 3.2416101859158887e-06, + "loss": 0.5289, + "step": 54400 + }, + { + "ar_loss": 0.3733, + "epoch": 8.229054573405072, + "fm_loss": 0.16, + "grad_norm": 2.277290105819702, + "learning_rate": 3.1823459581302394e-06, + "loss": 0.5333, + "step": 54500 + }, + { + "ar_loss": 0.3716, + "epoch": 8.244427363566487, + "fm_loss": 0.158, + "grad_norm": 6.1840996742248535, + "learning_rate": 3.123591640979681e-06, + "loss": 0.5296, + "step": 54600 + }, + { + "ar_loss": 0.3735, + "epoch": 8.259800153727902, + "fm_loss": 0.1592, + "grad_norm": 3.6673200130462646, + "learning_rate": 3.065348607638946e-06, + "loss": 0.5327, + "step": 54700 + }, + { + "ar_loss": 0.3722, + "epoch": 8.275172943889316, + "fm_loss": 0.1583, + "grad_norm": 2.28403902053833, + "learning_rate": 3.0076182193333053e-06, + "loss": 0.5306, + "step": 54800 + }, + { + "ar_loss": 0.3706, + "epoch": 8.29054573405073, + "fm_loss": 0.1588, + "grad_norm": 2.709955930709839, + "learning_rate": 2.9504018253067673e-06, + "loss": 0.5294, + "step": 54900 + }, + { + "ar_loss": 0.3732, + "epoch": 8.305918524212144, + "fm_loss": 0.1583, + "grad_norm": 2.438748359680176, + "learning_rate": 2.8937007627905354e-06, + "loss": 0.5314, + "step": 55000 + }, + { + "ar_loss": 0.3735, + "epoch": 8.321291314373559, + "fm_loss": 0.1589, + "grad_norm": 1.9475317001342773, + "learning_rate": 2.8375163569717645e-06, + "loss": 0.5324, + "step": 55100 + }, + { + "ar_loss": 0.375, + "epoch": 8.336664104534973, + "fm_loss": 0.1587, + "grad_norm": 2.430262804031372, + "learning_rate": 2.781849920962576e-06, + "loss": 0.5337, + "step": 55200 + }, + { + "ar_loss": 0.3742, + "epoch": 8.352036894696388, + "fm_loss": 0.1588, + "grad_norm": 3.527625799179077, + "learning_rate": 2.726702755769381e-06, + "loss": 0.5329, + "step": 55300 + }, + { + "ar_loss": 0.3744, + "epoch": 8.367409684857801, + "fm_loss": 0.1584, + "grad_norm": 2.9451231956481934, + "learning_rate": 2.6720761502624674e-06, + "loss": 0.5328, + "step": 55400 + }, + { + "ar_loss": 0.3716, + "epoch": 8.382782475019216, + "fm_loss": 0.1599, + "grad_norm": 4.992424488067627, + "learning_rate": 2.6179713811458726e-06, + "loss": 0.5315, + "step": 55500 + }, + { + "ar_loss": 0.3727, + "epoch": 8.39815526518063, + "fm_loss": 0.1585, + "grad_norm": 4.794875144958496, + "learning_rate": 2.564389712927556e-06, + "loss": 0.5313, + "step": 55600 + }, + { + "ar_loss": 0.3728, + "epoch": 8.413528055342045, + "fm_loss": 0.1608, + "grad_norm": 7.288453102111816, + "learning_rate": 2.5113323978898455e-06, + "loss": 0.5336, + "step": 55700 + }, + { + "ar_loss": 0.3732, + "epoch": 8.42890084550346, + "fm_loss": 0.1583, + "grad_norm": 2.7333781719207764, + "learning_rate": 2.458800676060158e-06, + "loss": 0.5316, + "step": 55800 + }, + { + "ar_loss": 0.3737, + "epoch": 8.444273635664873, + "fm_loss": 0.1584, + "grad_norm": 2.3420047760009766, + "learning_rate": 2.406795775182025e-06, + "loss": 0.532, + "step": 55900 + }, + { + "ar_loss": 0.3742, + "epoch": 8.459646425826287, + "fm_loss": 0.1583, + "grad_norm": 2.3550283908843994, + "learning_rate": 2.355318910686394e-06, + "loss": 0.5325, + "step": 56000 + }, + { + "ar_loss": 0.3755, + "epoch": 8.475019215987702, + "fm_loss": 0.1609, + "grad_norm": 2.22489595413208, + "learning_rate": 2.304371285663237e-06, + "loss": 0.5364, + "step": 56100 + }, + { + "ar_loss": 0.3723, + "epoch": 8.490392006149117, + "fm_loss": 0.1578, + "grad_norm": 12.48835563659668, + "learning_rate": 2.2539540908334157e-06, + "loss": 0.5301, + "step": 56200 + }, + { + "ar_loss": 0.3729, + "epoch": 8.50576479631053, + "fm_loss": 0.159, + "grad_norm": 2.215409994125366, + "learning_rate": 2.204068504520859e-06, + "loss": 0.5319, + "step": 56300 + }, + { + "ar_loss": 0.3739, + "epoch": 8.521137586471944, + "fm_loss": 0.1578, + "grad_norm": 3.4783434867858887, + "learning_rate": 2.15471569262502e-06, + "loss": 0.5317, + "step": 56400 + }, + { + "ar_loss": 0.3721, + "epoch": 8.536510376633359, + "fm_loss": 0.1576, + "grad_norm": 2.0251822471618652, + "learning_rate": 2.1058968085936383e-06, + "loss": 0.5297, + "step": 56500 + }, + { + "ar_loss": 0.374, + "epoch": 8.551883166794774, + "fm_loss": 0.1583, + "grad_norm": 11.012389183044434, + "learning_rate": 2.0576129933957715e-06, + "loss": 0.5323, + "step": 56600 + }, + { + "ar_loss": 0.373, + "epoch": 8.567255956956188, + "fm_loss": 0.1579, + "grad_norm": 2.4049017429351807, + "learning_rate": 2.009865375495129e-06, + "loss": 0.531, + "step": 56700 + }, + { + "ar_loss": 0.3748, + "epoch": 8.582628747117601, + "fm_loss": 0.1591, + "grad_norm": 2.692699909210205, + "learning_rate": 1.9626550708237075e-06, + "loss": 0.5339, + "step": 56800 + }, + { + "ar_loss": 0.3729, + "epoch": 8.598001537279016, + "fm_loss": 0.1586, + "grad_norm": 2.713548183441162, + "learning_rate": 1.915983182755696e-06, + "loss": 0.5315, + "step": 56900 + }, + { + "ar_loss": 0.3711, + "epoch": 8.61337432744043, + "fm_loss": 0.1575, + "grad_norm": 6.727135181427002, + "learning_rate": 1.8698508020817045e-06, + "loss": 0.5286, + "step": 57000 + }, + { + "ar_loss": 0.3742, + "epoch": 8.628747117601845, + "fm_loss": 0.1583, + "grad_norm": 3.071518659591675, + "learning_rate": 1.8242590069832617e-06, + "loss": 0.5325, + "step": 57100 + }, + { + "ar_loss": 0.3712, + "epoch": 8.644119907763258, + "fm_loss": 0.1584, + "grad_norm": 2.538884401321411, + "learning_rate": 1.7792088630076086e-06, + "loss": 0.5296, + "step": 57200 + }, + { + "ar_loss": 0.374, + "epoch": 8.659492697924673, + "fm_loss": 0.1583, + "grad_norm": 2.6885924339294434, + "learning_rate": 1.7347014230428144e-06, + "loss": 0.5323, + "step": 57300 + }, + { + "ar_loss": 0.3721, + "epoch": 8.674865488086088, + "fm_loss": 0.1579, + "grad_norm": 2.660156726837158, + "learning_rate": 1.6907377272931485e-06, + "loss": 0.53, + "step": 57400 + }, + { + "ar_loss": 0.3725, + "epoch": 8.690238278247502, + "fm_loss": 0.1586, + "grad_norm": 2.4190871715545654, + "learning_rate": 1.6473188032547854e-06, + "loss": 0.5311, + "step": 57500 + }, + { + "ar_loss": 0.3711, + "epoch": 8.705611068408917, + "fm_loss": 0.1581, + "grad_norm": 3.2725918292999268, + "learning_rate": 1.6044456656917839e-06, + "loss": 0.5291, + "step": 57600 + }, + { + "ar_loss": 0.3725, + "epoch": 8.72098385857033, + "fm_loss": 0.1632, + "grad_norm": 4.782108783721924, + "learning_rate": 1.5621193166123648e-06, + "loss": 0.5357, + "step": 57700 + }, + { + "ar_loss": 0.3747, + "epoch": 8.736356648731745, + "fm_loss": 0.158, + "grad_norm": 7.434814929962158, + "learning_rate": 1.5203407452455076e-06, + "loss": 0.5327, + "step": 57800 + }, + { + "ar_loss": 0.3704, + "epoch": 8.75172943889316, + "fm_loss": 0.1581, + "grad_norm": 2.1383039951324463, + "learning_rate": 1.4791109280178129e-06, + "loss": 0.5285, + "step": 57900 + }, + { + "ar_loss": 0.3772, + "epoch": 8.767102229054574, + "fm_loss": 0.158, + "grad_norm": 26.415754318237305, + "learning_rate": 1.4384308285306959e-06, + "loss": 0.5352, + "step": 58000 + }, + { + "ar_loss": 0.3721, + "epoch": 8.782475019215987, + "fm_loss": 0.1575, + "grad_norm": 8.601106643676758, + "learning_rate": 1.3983013975378574e-06, + "loss": 0.5296, + "step": 58100 + }, + { + "ar_loss": 0.372, + "epoch": 8.797847809377402, + "fm_loss": 0.1584, + "grad_norm": 2.2945618629455566, + "learning_rate": 1.3587235729230764e-06, + "loss": 0.5303, + "step": 58200 + }, + { + "ar_loss": 0.3747, + "epoch": 8.813220599538816, + "fm_loss": 0.1584, + "grad_norm": 4.096945762634277, + "learning_rate": 1.3196982796782636e-06, + "loss": 0.5331, + "step": 58300 + }, + { + "ar_loss": 0.3752, + "epoch": 8.82859338970023, + "fm_loss": 0.1599, + "grad_norm": 3.189495801925659, + "learning_rate": 1.2812264298818737e-06, + "loss": 0.5351, + "step": 58400 + }, + { + "ar_loss": 0.3725, + "epoch": 8.843966179861646, + "fm_loss": 0.1587, + "grad_norm": 3.499967098236084, + "learning_rate": 1.2433089226775662e-06, + "loss": 0.5312, + "step": 58500 + }, + { + "ar_loss": 0.3716, + "epoch": 8.859338970023058, + "fm_loss": 0.1591, + "grad_norm": 2.2825992107391357, + "learning_rate": 1.2059466442532004e-06, + "loss": 0.5307, + "step": 58600 + }, + { + "ar_loss": 0.372, + "epoch": 8.874711760184473, + "fm_loss": 0.158, + "grad_norm": 3.344238042831421, + "learning_rate": 1.1691404678201317e-06, + "loss": 0.53, + "step": 58700 + }, + { + "ar_loss": 0.372, + "epoch": 8.890084550345888, + "fm_loss": 0.1588, + "grad_norm": 3.1672725677490234, + "learning_rate": 1.1328912535927828e-06, + "loss": 0.5308, + "step": 58800 + }, + { + "ar_loss": 0.3724, + "epoch": 8.905457340507303, + "fm_loss": 0.1581, + "grad_norm": 19.84259605407715, + "learning_rate": 1.0971998487685597e-06, + "loss": 0.5304, + "step": 58900 + }, + { + "ar_loss": 0.3746, + "epoch": 8.920830130668715, + "fm_loss": 0.1598, + "grad_norm": 3.0031962394714355, + "learning_rate": 1.0620670875080397e-06, + "loss": 0.5344, + "step": 59000 + }, + { + "ar_loss": 0.3723, + "epoch": 8.93620292083013, + "fm_loss": 0.159, + "grad_norm": 6.419062614440918, + "learning_rate": 1.0274937909154792e-06, + "loss": 0.5314, + "step": 59100 + }, + { + "ar_loss": 0.3777, + "epoch": 8.951575710991545, + "fm_loss": 0.1597, + "grad_norm": 2.2336511611938477, + "learning_rate": 9.934807670196223e-07, + "loss": 0.5373, + "step": 59200 + }, + { + "ar_loss": 0.3717, + "epoch": 8.96694850115296, + "fm_loss": 0.1585, + "grad_norm": 1.7860270738601685, + "learning_rate": 9.600288107548233e-07, + "loss": 0.5302, + "step": 59300 + }, + { + "ar_loss": 0.3732, + "epoch": 8.982321291314374, + "fm_loss": 0.1591, + "grad_norm": 2.5118651390075684, + "learning_rate": 9.271387039424456e-07, + "loss": 0.5322, + "step": 59400 + }, + { + "ar_loss": 0.3723, + "epoch": 8.997694081475787, + "fm_loss": 0.1586, + "grad_norm": 2.007235288619995, + "learning_rate": 8.948112152726285e-07, + "loss": 0.5308, + "step": 59500 + }, + { + "ar_loss": 0.3726, + "epoch": 9.013066871637202, + "fm_loss": 0.1579, + "grad_norm": 3.087045907974243, + "learning_rate": 8.630471002862795e-07, + "loss": 0.5305, + "step": 59600 + }, + { + "ar_loss": 0.373, + "epoch": 9.028439661798616, + "fm_loss": 0.1594, + "grad_norm": 2.6808998584747314, + "learning_rate": 8.318471013574442e-07, + "loss": 0.5324, + "step": 59700 + }, + { + "ar_loss": 0.3725, + "epoch": 9.043812451960031, + "fm_loss": 0.1586, + "grad_norm": 2.7164993286132812, + "learning_rate": 8.01211947675945e-07, + "loss": 0.5311, + "step": 59800 + }, + { + "ar_loss": 0.3736, + "epoch": 9.059185242121446, + "fm_loss": 0.1582, + "grad_norm": 3.879828929901123, + "learning_rate": 7.711423552303366e-07, + "loss": 0.5318, + "step": 59900 + }, + { + "ar_loss": 0.3727, + "epoch": 9.074558032282859, + "fm_loss": 0.1574, + "grad_norm": 2.1200876235961914, + "learning_rate": 7.416390267911827e-07, + "loss": 0.5301, + "step": 60000 + } + ], + "logging_steps": 100, + "max_steps": 65050, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.656055098925268e+20, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}