{ "best_global_step": 9408, "best_metric": 0.1410149782896042, "best_model_checkpoint": "tmp/out/1536-96-r2_mix_channel_fcmCtx3_fcmLayers3_fcmChMixingTrue_stride24_bs512_lrf_deb3/checkpoint-9408", "epoch": 168.0, "eval_steps": 500, "global_step": 9408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.376089870929718, "learning_rate": 0.00029836401390103334, "loss": 0.3643, "step": 56 }, { "epoch": 1.0, "eval_loss": 0.25079935789108276, "eval_runtime": 12.3705, "eval_samples_per_second": 877.898, "eval_steps_per_second": 1.778, "step": 56 }, { "epoch": 2.0, "grad_norm": 0.25105392932891846, "learning_rate": 0.00029836183164580883, "loss": 0.3058, "step": 112 }, { "epoch": 2.0, "eval_loss": 0.23216894268989563, "eval_runtime": 12.2194, "eval_samples_per_second": 888.753, "eval_steps_per_second": 1.8, "step": 112 }, { "epoch": 3.0, "grad_norm": 0.17020165920257568, "learning_rate": 0.00029835817704944523, "loss": 0.2683, "step": 168 }, { "epoch": 3.0, "eval_loss": 0.20991244912147522, "eval_runtime": 10.9934, "eval_samples_per_second": 987.863, "eval_steps_per_second": 2.001, "step": 168 }, { "epoch": 4.0, "grad_norm": 0.13130681216716766, "learning_rate": 0.00029835305014801184, "loss": 0.2395, "step": 224 }, { "epoch": 4.0, "eval_loss": 0.19736029207706451, "eval_runtime": 11.7226, "eval_samples_per_second": 926.414, "eval_steps_per_second": 1.877, "step": 224 }, { "epoch": 5.0, "grad_norm": 0.12686163187026978, "learning_rate": 0.0002983464509921093, "loss": 0.2241, "step": 280 }, { "epoch": 5.0, "eval_loss": 0.18977424502372742, "eval_runtime": 11.8479, "eval_samples_per_second": 916.618, "eval_steps_per_second": 1.857, "step": 280 }, { "epoch": 6.0, "grad_norm": 0.11746390908956528, "learning_rate": 0.00029833837964686835, "loss": 0.2148, "step": 336 }, { "epoch": 6.0, "eval_loss": 0.1851092129945755, "eval_runtime": 11.7556, "eval_samples_per_second": 923.812, "eval_steps_per_second": 1.871, "step": 336 }, { "epoch": 7.0, "grad_norm": 0.13627897202968597, "learning_rate": 0.0002983288361919503, "loss": 0.2078, "step": 392 }, { "epoch": 7.0, "eval_loss": 0.18129761517047882, "eval_runtime": 11.7487, "eval_samples_per_second": 924.357, "eval_steps_per_second": 1.873, "step": 392 }, { "epoch": 8.0, "grad_norm": 0.1497841328382492, "learning_rate": 0.00029831782072154485, "loss": 0.2025, "step": 448 }, { "epoch": 8.0, "eval_loss": 0.17769944667816162, "eval_runtime": 12.1141, "eval_samples_per_second": 896.477, "eval_steps_per_second": 1.816, "step": 448 }, { "epoch": 9.0, "grad_norm": 0.19643521308898926, "learning_rate": 0.0002983053333443701, "loss": 0.1976, "step": 504 }, { "epoch": 9.0, "eval_loss": 0.17583897709846497, "eval_runtime": 12.5558, "eval_samples_per_second": 864.936, "eval_steps_per_second": 1.752, "step": 504 }, { "epoch": 10.0, "grad_norm": 0.1033664122223854, "learning_rate": 0.0002982913741836719, "loss": 0.1936, "step": 560 }, { "epoch": 10.0, "eval_loss": 0.1739388257265091, "eval_runtime": 12.449, "eval_samples_per_second": 872.358, "eval_steps_per_second": 1.767, "step": 560 }, { "epoch": 11.0, "grad_norm": 0.1361815184354782, "learning_rate": 0.00029827594337722164, "loss": 0.1902, "step": 616 }, { "epoch": 11.0, "eval_loss": 0.17110829055309296, "eval_runtime": 12.7701, "eval_samples_per_second": 850.423, "eval_steps_per_second": 1.723, "step": 616 }, { "epoch": 12.0, "grad_norm": 0.12385320663452148, "learning_rate": 0.0002982590410773146, "loss": 0.1867, "step": 672 }, { "epoch": 12.0, "eval_loss": 0.16852673888206482, "eval_runtime": 11.8972, "eval_samples_per_second": 912.817, "eval_steps_per_second": 1.849, "step": 672 }, { "epoch": 13.0, "grad_norm": 0.13126742839813232, "learning_rate": 0.0002982406674507699, "loss": 0.1837, "step": 728 }, { "epoch": 13.0, "eval_loss": 0.1675039380788803, "eval_runtime": 11.8951, "eval_samples_per_second": 912.98, "eval_steps_per_second": 1.85, "step": 728 }, { "epoch": 14.0, "grad_norm": 0.14581529796123505, "learning_rate": 0.00029822082267892794, "loss": 0.1818, "step": 784 }, { "epoch": 14.0, "eval_loss": 0.16522179543972015, "eval_runtime": 12.951, "eval_samples_per_second": 838.545, "eval_steps_per_second": 1.699, "step": 784 }, { "epoch": 15.0, "grad_norm": 0.12710689008235931, "learning_rate": 0.0002981995069576483, "loss": 0.1787, "step": 840 }, { "epoch": 15.0, "eval_loss": 0.1651495099067688, "eval_runtime": 12.4369, "eval_samples_per_second": 873.211, "eval_steps_per_second": 1.769, "step": 840 }, { "epoch": 16.0, "grad_norm": 0.1914917379617691, "learning_rate": 0.0002981767204973089, "loss": 0.177, "step": 896 }, { "epoch": 16.0, "eval_loss": 0.1639031320810318, "eval_runtime": 12.7112, "eval_samples_per_second": 854.365, "eval_steps_per_second": 1.731, "step": 896 }, { "epoch": 17.0, "grad_norm": 0.15502069890499115, "learning_rate": 0.00029815246352280276, "loss": 0.1751, "step": 952 }, { "epoch": 17.0, "eval_loss": 0.16176268458366394, "eval_runtime": 12.1031, "eval_samples_per_second": 897.291, "eval_steps_per_second": 1.818, "step": 952 }, { "epoch": 18.0, "grad_norm": 0.11603855341672897, "learning_rate": 0.0002981267362735362, "loss": 0.1734, "step": 1008 }, { "epoch": 18.0, "eval_loss": 0.1614038050174713, "eval_runtime": 11.893, "eval_samples_per_second": 913.139, "eval_steps_per_second": 1.85, "step": 1008 }, { "epoch": 19.0, "grad_norm": 0.11780980974435806, "learning_rate": 0.0002980995390034271, "loss": 0.172, "step": 1064 }, { "epoch": 19.0, "eval_loss": 0.16114258766174316, "eval_runtime": 12.6404, "eval_samples_per_second": 859.152, "eval_steps_per_second": 1.74, "step": 1064 }, { "epoch": 20.0, "grad_norm": 0.14823858439922333, "learning_rate": 0.00029807087198090116, "loss": 0.1702, "step": 1120 }, { "epoch": 20.0, "eval_loss": 0.15980996191501617, "eval_runtime": 12.5631, "eval_samples_per_second": 864.434, "eval_steps_per_second": 1.751, "step": 1120 }, { "epoch": 21.0, "grad_norm": 0.1246936172246933, "learning_rate": 0.0002980407354888907, "loss": 0.1688, "step": 1176 }, { "epoch": 21.0, "eval_loss": 0.15955598652362823, "eval_runtime": 12.315, "eval_samples_per_second": 881.853, "eval_steps_per_second": 1.786, "step": 1176 }, { "epoch": 22.0, "grad_norm": 0.11726798117160797, "learning_rate": 0.0002980091298248309, "loss": 0.1675, "step": 1232 }, { "epoch": 22.0, "eval_loss": 0.15864743292331696, "eval_runtime": 12.3526, "eval_samples_per_second": 879.166, "eval_steps_per_second": 1.781, "step": 1232 }, { "epoch": 23.0, "grad_norm": 0.13960805535316467, "learning_rate": 0.0002979760553006564, "loss": 0.1666, "step": 1288 }, { "epoch": 23.0, "eval_loss": 0.15781378746032715, "eval_runtime": 12.187, "eval_samples_per_second": 891.116, "eval_steps_per_second": 1.805, "step": 1288 }, { "epoch": 24.0, "grad_norm": 0.11856065690517426, "learning_rate": 0.00029794151224279964, "loss": 0.1652, "step": 1344 }, { "epoch": 24.0, "eval_loss": 0.15776978433132172, "eval_runtime": 12.435, "eval_samples_per_second": 873.344, "eval_steps_per_second": 1.769, "step": 1344 }, { "epoch": 25.0, "grad_norm": 0.12466388940811157, "learning_rate": 0.00029790550099218654, "loss": 0.1643, "step": 1400 }, { "epoch": 25.0, "eval_loss": 0.15815725922584534, "eval_runtime": 13.1792, "eval_samples_per_second": 824.023, "eval_steps_per_second": 1.669, "step": 1400 }, { "epoch": 26.0, "grad_norm": 0.12369589507579803, "learning_rate": 0.0002978680219042336, "loss": 0.1633, "step": 1456 }, { "epoch": 26.0, "eval_loss": 0.1567024141550064, "eval_runtime": 12.484, "eval_samples_per_second": 869.916, "eval_steps_per_second": 1.762, "step": 1456 }, { "epoch": 27.0, "grad_norm": 0.14197547733783722, "learning_rate": 0.0002978290753488448, "loss": 0.1624, "step": 1512 }, { "epoch": 27.0, "eval_loss": 0.15676391124725342, "eval_runtime": 12.738, "eval_samples_per_second": 852.567, "eval_steps_per_second": 1.727, "step": 1512 }, { "epoch": 28.0, "grad_norm": 0.13262535631656647, "learning_rate": 0.0002977886617104062, "loss": 0.1613, "step": 1568 }, { "epoch": 28.0, "eval_loss": 0.1567520797252655, "eval_runtime": 12.6529, "eval_samples_per_second": 858.304, "eval_steps_per_second": 1.739, "step": 1568 }, { "epoch": 29.0, "grad_norm": 0.15622882544994354, "learning_rate": 0.0002977467813877842, "loss": 0.1604, "step": 1624 }, { "epoch": 29.0, "eval_loss": 0.15647795796394348, "eval_runtime": 12.6006, "eval_samples_per_second": 861.863, "eval_steps_per_second": 1.746, "step": 1624 }, { "epoch": 30.0, "grad_norm": 0.15161629021167755, "learning_rate": 0.00029770343479432095, "loss": 0.1598, "step": 1680 }, { "epoch": 30.0, "eval_loss": 0.15717600286006927, "eval_runtime": 12.8165, "eval_samples_per_second": 847.348, "eval_steps_per_second": 1.717, "step": 1680 }, { "epoch": 31.0, "grad_norm": 0.12715986371040344, "learning_rate": 0.0002976586223578297, "loss": 0.1591, "step": 1736 }, { "epoch": 31.0, "eval_loss": 0.1557074338197708, "eval_runtime": 12.6403, "eval_samples_per_second": 859.156, "eval_steps_per_second": 1.74, "step": 1736 }, { "epoch": 32.0, "grad_norm": 0.1595166027545929, "learning_rate": 0.00029761234452059136, "loss": 0.1584, "step": 1792 }, { "epoch": 32.0, "eval_loss": 0.15540747344493866, "eval_runtime": 13.3084, "eval_samples_per_second": 816.027, "eval_steps_per_second": 1.653, "step": 1792 }, { "epoch": 33.0, "grad_norm": 0.16593649983406067, "learning_rate": 0.0002975646017393494, "loss": 0.1576, "step": 1848 }, { "epoch": 33.0, "eval_loss": 0.15468333661556244, "eval_runtime": 13.1483, "eval_samples_per_second": 825.961, "eval_steps_per_second": 1.673, "step": 1848 }, { "epoch": 34.0, "grad_norm": 0.14555956423282623, "learning_rate": 0.0002975153944853054, "loss": 0.1567, "step": 1904 }, { "epoch": 34.0, "eval_loss": 0.1553257554769516, "eval_runtime": 12.853, "eval_samples_per_second": 844.936, "eval_steps_per_second": 1.712, "step": 1904 }, { "epoch": 35.0, "grad_norm": 0.23194457590579987, "learning_rate": 0.00029746472324411547, "loss": 0.156, "step": 1960 }, { "epoch": 35.0, "eval_loss": 0.1549767106771469, "eval_runtime": 11.49, "eval_samples_per_second": 945.169, "eval_steps_per_second": 1.915, "step": 1960 }, { "epoch": 36.0, "grad_norm": 0.17572428286075592, "learning_rate": 0.0002974125885158844, "loss": 0.1559, "step": 2016 }, { "epoch": 36.0, "eval_loss": 0.15631072223186493, "eval_runtime": 12.6465, "eval_samples_per_second": 858.739, "eval_steps_per_second": 1.74, "step": 2016 }, { "epoch": 37.0, "grad_norm": 0.1315496563911438, "learning_rate": 0.0002973589908151604, "loss": 0.1547, "step": 2072 }, { "epoch": 37.0, "eval_loss": 0.1540231704711914, "eval_runtime": 13.3162, "eval_samples_per_second": 815.548, "eval_steps_per_second": 1.652, "step": 2072 }, { "epoch": 38.0, "grad_norm": 0.17212693393230438, "learning_rate": 0.0002973039306709319, "loss": 0.1539, "step": 2128 }, { "epoch": 38.0, "eval_loss": 0.15414279699325562, "eval_runtime": 13.2364, "eval_samples_per_second": 820.466, "eval_steps_per_second": 1.662, "step": 2128 }, { "epoch": 39.0, "grad_norm": 0.12589286267757416, "learning_rate": 0.0002972474086266193, "loss": 0.1538, "step": 2184 }, { "epoch": 39.0, "eval_loss": 0.15399765968322754, "eval_runtime": 12.5952, "eval_samples_per_second": 862.236, "eval_steps_per_second": 1.747, "step": 2184 }, { "epoch": 40.0, "grad_norm": 0.1479528248310089, "learning_rate": 0.0002971894252400732, "loss": 0.1529, "step": 2240 }, { "epoch": 40.0, "eval_loss": 0.1546306610107422, "eval_runtime": 12.4569, "eval_samples_per_second": 871.809, "eval_steps_per_second": 1.766, "step": 2240 }, { "epoch": 41.0, "grad_norm": 0.140830859541893, "learning_rate": 0.00029712998108356566, "loss": 0.1521, "step": 2296 }, { "epoch": 41.0, "eval_loss": 0.15411749482154846, "eval_runtime": 12.8911, "eval_samples_per_second": 842.441, "eval_steps_per_second": 1.707, "step": 2296 }, { "epoch": 42.0, "grad_norm": 0.14429251849651337, "learning_rate": 0.0002970690767437871, "loss": 0.1521, "step": 2352 }, { "epoch": 42.0, "eval_loss": 0.1535186916589737, "eval_runtime": 12.7037, "eval_samples_per_second": 854.87, "eval_steps_per_second": 1.732, "step": 2352 }, { "epoch": 43.0, "grad_norm": 0.1678067147731781, "learning_rate": 0.00029700671282183844, "loss": 0.1516, "step": 2408 }, { "epoch": 43.0, "eval_loss": 0.15345174074172974, "eval_runtime": 12.8622, "eval_samples_per_second": 844.337, "eval_steps_per_second": 1.71, "step": 2408 }, { "epoch": 44.0, "grad_norm": 0.16715741157531738, "learning_rate": 0.00029694288993322636, "loss": 0.1506, "step": 2464 }, { "epoch": 44.0, "eval_loss": 0.1528453379869461, "eval_runtime": 12.394, "eval_samples_per_second": 876.23, "eval_steps_per_second": 1.775, "step": 2464 }, { "epoch": 45.0, "grad_norm": 0.1476888358592987, "learning_rate": 0.00029687760870785704, "loss": 0.1502, "step": 2520 }, { "epoch": 45.0, "eval_loss": 0.15371684730052948, "eval_runtime": 12.8504, "eval_samples_per_second": 845.113, "eval_steps_per_second": 1.712, "step": 2520 }, { "epoch": 46.0, "grad_norm": 0.16268473863601685, "learning_rate": 0.00029681086979003, "loss": 0.1497, "step": 2576 }, { "epoch": 46.0, "eval_loss": 0.15216761827468872, "eval_runtime": 12.9049, "eval_samples_per_second": 841.539, "eval_steps_per_second": 1.705, "step": 2576 }, { "epoch": 47.0, "grad_norm": 0.17756158113479614, "learning_rate": 0.0002967426738384313, "loss": 0.1493, "step": 2632 }, { "epoch": 47.0, "eval_loss": 0.15324676036834717, "eval_runtime": 13.0526, "eval_samples_per_second": 832.021, "eval_steps_per_second": 1.685, "step": 2632 }, { "epoch": 48.0, "grad_norm": 0.13994063436985016, "learning_rate": 0.0002966730215261271, "loss": 0.1487, "step": 2688 }, { "epoch": 48.0, "eval_loss": 0.15221010148525238, "eval_runtime": 12.6334, "eval_samples_per_second": 859.628, "eval_steps_per_second": 1.741, "step": 2688 }, { "epoch": 49.0, "grad_norm": 0.18394885957241058, "learning_rate": 0.0002966019135405581, "loss": 0.1483, "step": 2744 }, { "epoch": 49.0, "eval_loss": 0.15254603326320648, "eval_runtime": 12.296, "eval_samples_per_second": 883.214, "eval_steps_per_second": 1.789, "step": 2744 }, { "epoch": 50.0, "grad_norm": 0.14756232500076294, "learning_rate": 0.000296529350583531, "loss": 0.1479, "step": 2800 }, { "epoch": 50.0, "eval_loss": 0.15157358348369598, "eval_runtime": 12.7067, "eval_samples_per_second": 854.666, "eval_steps_per_second": 1.731, "step": 2800 }, { "epoch": 51.0, "grad_norm": 0.18675681948661804, "learning_rate": 0.00029645533337121344, "loss": 0.1476, "step": 2856 }, { "epoch": 51.0, "eval_loss": 0.15315961837768555, "eval_runtime": 12.914, "eval_samples_per_second": 840.949, "eval_steps_per_second": 1.704, "step": 2856 }, { "epoch": 52.0, "grad_norm": 0.21148425340652466, "learning_rate": 0.0002963798626341248, "loss": 0.1467, "step": 2912 }, { "epoch": 52.0, "eval_loss": 0.151397705078125, "eval_runtime": 12.6083, "eval_samples_per_second": 861.336, "eval_steps_per_second": 1.745, "step": 2912 }, { "epoch": 53.0, "grad_norm": 0.14957012236118317, "learning_rate": 0.00029630293911713125, "loss": 0.1463, "step": 2968 }, { "epoch": 53.0, "eval_loss": 0.152817040681839, "eval_runtime": 12.3988, "eval_samples_per_second": 875.89, "eval_steps_per_second": 1.774, "step": 2968 }, { "epoch": 54.0, "grad_norm": 0.18841682374477386, "learning_rate": 0.0002962245635794367, "loss": 0.1457, "step": 3024 }, { "epoch": 54.0, "eval_loss": 0.1509653627872467, "eval_runtime": 12.9201, "eval_samples_per_second": 840.553, "eval_steps_per_second": 1.703, "step": 3024 }, { "epoch": 55.0, "grad_norm": 0.19782641530036926, "learning_rate": 0.00029614473679457606, "loss": 0.1457, "step": 3080 }, { "epoch": 55.0, "eval_loss": 0.15204061567783356, "eval_runtime": 13.0172, "eval_samples_per_second": 834.282, "eval_steps_per_second": 1.69, "step": 3080 }, { "epoch": 56.0, "grad_norm": 0.15806534886360168, "learning_rate": 0.0002960634595504073, "loss": 0.145, "step": 3136 }, { "epoch": 56.0, "eval_loss": 0.15144167840480804, "eval_runtime": 12.3723, "eval_samples_per_second": 877.767, "eval_steps_per_second": 1.778, "step": 3136 }, { "epoch": 57.0, "grad_norm": 0.1470707207918167, "learning_rate": 0.00029598073264910414, "loss": 0.1446, "step": 3192 }, { "epoch": 57.0, "eval_loss": 0.15259326994419098, "eval_runtime": 11.8486, "eval_samples_per_second": 916.567, "eval_steps_per_second": 1.857, "step": 3192 }, { "epoch": 58.0, "grad_norm": 0.12880393862724304, "learning_rate": 0.00029589655690714776, "loss": 0.1444, "step": 3248 }, { "epoch": 58.0, "eval_loss": 0.1521604359149933, "eval_runtime": 12.3711, "eval_samples_per_second": 877.851, "eval_steps_per_second": 1.778, "step": 3248 }, { "epoch": 59.0, "grad_norm": 0.20687344670295715, "learning_rate": 0.00029581093315531867, "loss": 0.1439, "step": 3304 }, { "epoch": 59.0, "eval_loss": 0.1506902128458023, "eval_runtime": 12.2839, "eval_samples_per_second": 884.082, "eval_steps_per_second": 1.791, "step": 3304 }, { "epoch": 60.0, "grad_norm": 0.31674283742904663, "learning_rate": 0.00029572386223868856, "loss": 0.1434, "step": 3360 }, { "epoch": 60.0, "eval_loss": 0.1497628092765808, "eval_runtime": 12.2602, "eval_samples_per_second": 885.791, "eval_steps_per_second": 1.794, "step": 3360 }, { "epoch": 61.0, "grad_norm": 0.1524023711681366, "learning_rate": 0.0002956353450166127, "loss": 0.1428, "step": 3416 }, { "epoch": 61.0, "eval_loss": 0.15104272961616516, "eval_runtime": 11.4854, "eval_samples_per_second": 945.545, "eval_steps_per_second": 1.915, "step": 3416 }, { "epoch": 62.0, "grad_norm": 0.1333588808774948, "learning_rate": 0.00029554538236271986, "loss": 0.1427, "step": 3472 }, { "epoch": 62.0, "eval_loss": 0.15125687420368195, "eval_runtime": 11.619, "eval_samples_per_second": 934.673, "eval_steps_per_second": 1.893, "step": 3472 }, { "epoch": 63.0, "grad_norm": 0.14987458288669586, "learning_rate": 0.0002954539751649054, "loss": 0.1427, "step": 3528 }, { "epoch": 63.0, "eval_loss": 0.15022161602973938, "eval_runtime": 11.7178, "eval_samples_per_second": 926.795, "eval_steps_per_second": 1.877, "step": 3528 }, { "epoch": 64.0, "grad_norm": 0.19036932289600372, "learning_rate": 0.00029536112432532164, "loss": 0.1418, "step": 3584 }, { "epoch": 64.0, "eval_loss": 0.15002530813217163, "eval_runtime": 12.0423, "eval_samples_per_second": 901.82, "eval_steps_per_second": 1.827, "step": 3584 }, { "epoch": 65.0, "grad_norm": 0.15858310461044312, "learning_rate": 0.00029526683076036824, "loss": 0.1416, "step": 3640 }, { "epoch": 65.0, "eval_loss": 0.15072880685329437, "eval_runtime": 11.4427, "eval_samples_per_second": 949.077, "eval_steps_per_second": 1.923, "step": 3640 }, { "epoch": 66.0, "grad_norm": 0.1411045342683792, "learning_rate": 0.0002951710954006851, "loss": 0.1415, "step": 3696 }, { "epoch": 66.0, "eval_loss": 0.150208979845047, "eval_runtime": 11.7843, "eval_samples_per_second": 921.567, "eval_steps_per_second": 1.867, "step": 3696 }, { "epoch": 67.0, "grad_norm": 0.18127693235874176, "learning_rate": 0.00029507391919114174, "loss": 0.1407, "step": 3752 }, { "epoch": 67.0, "eval_loss": 0.15111134946346283, "eval_runtime": 11.7998, "eval_samples_per_second": 920.352, "eval_steps_per_second": 1.864, "step": 3752 }, { "epoch": 68.0, "grad_norm": 0.20954985916614532, "learning_rate": 0.0002949753030908276, "loss": 0.1404, "step": 3808 }, { "epoch": 68.0, "eval_loss": 0.15048466622829437, "eval_runtime": 11.8536, "eval_samples_per_second": 916.178, "eval_steps_per_second": 1.856, "step": 3808 }, { "epoch": 69.0, "grad_norm": 0.1799214780330658, "learning_rate": 0.0002948752480730442, "loss": 0.1401, "step": 3864 }, { "epoch": 69.0, "eval_loss": 0.14996136724948883, "eval_runtime": 11.8425, "eval_samples_per_second": 917.04, "eval_steps_per_second": 1.858, "step": 3864 }, { "epoch": 70.0, "grad_norm": 0.14687888324260712, "learning_rate": 0.0002947737551252938, "loss": 0.1399, "step": 3920 }, { "epoch": 70.0, "eval_loss": 0.1494998186826706, "eval_runtime": 11.8446, "eval_samples_per_second": 916.877, "eval_steps_per_second": 1.857, "step": 3920 }, { "epoch": 71.0, "grad_norm": 0.2250983864068985, "learning_rate": 0.000294670825249271, "loss": 0.1397, "step": 3976 }, { "epoch": 71.0, "eval_loss": 0.14974181354045868, "eval_runtime": 10.3667, "eval_samples_per_second": 1047.585, "eval_steps_per_second": 2.122, "step": 3976 }, { "epoch": 72.0, "grad_norm": 0.14977572858333588, "learning_rate": 0.00029456645946085235, "loss": 0.1393, "step": 4032 }, { "epoch": 72.0, "eval_loss": 0.1504337042570114, "eval_runtime": 11.0031, "eval_samples_per_second": 986.994, "eval_steps_per_second": 1.999, "step": 4032 }, { "epoch": 73.0, "grad_norm": 0.2215435802936554, "learning_rate": 0.00029446065879008577, "loss": 0.1389, "step": 4088 }, { "epoch": 73.0, "eval_loss": 0.14960449934005737, "eval_runtime": 10.5211, "eval_samples_per_second": 1032.216, "eval_steps_per_second": 2.091, "step": 4088 }, { "epoch": 74.0, "grad_norm": 0.14885684847831726, "learning_rate": 0.00029435342428118117, "loss": 0.1384, "step": 4144 }, { "epoch": 74.0, "eval_loss": 0.14882370829582214, "eval_runtime": 11.6942, "eval_samples_per_second": 928.669, "eval_steps_per_second": 1.881, "step": 4144 }, { "epoch": 75.0, "grad_norm": 0.20596224069595337, "learning_rate": 0.0002942447569924998, "loss": 0.1384, "step": 4200 }, { "epoch": 75.0, "eval_loss": 0.14847591519355774, "eval_runtime": 11.911, "eval_samples_per_second": 911.765, "eval_steps_per_second": 1.847, "step": 4200 }, { "epoch": 76.0, "grad_norm": 0.1551866978406906, "learning_rate": 0.0002941346579965444, "loss": 0.1379, "step": 4256 }, { "epoch": 76.0, "eval_loss": 0.1497822105884552, "eval_runtime": 11.0615, "eval_samples_per_second": 981.782, "eval_steps_per_second": 1.989, "step": 4256 }, { "epoch": 77.0, "grad_norm": 0.19567330181598663, "learning_rate": 0.00029402312837994727, "loss": 0.138, "step": 4312 }, { "epoch": 77.0, "eval_loss": 0.14890199899673462, "eval_runtime": 11.5065, "eval_samples_per_second": 943.812, "eval_steps_per_second": 1.912, "step": 4312 }, { "epoch": 78.0, "grad_norm": 0.1951490044593811, "learning_rate": 0.0002939101692434606, "loss": 0.1372, "step": 4368 }, { "epoch": 78.0, "eval_loss": 0.14929604530334473, "eval_runtime": 11.7303, "eval_samples_per_second": 925.806, "eval_steps_per_second": 1.875, "step": 4368 }, { "epoch": 79.0, "grad_norm": 0.15116438269615173, "learning_rate": 0.00029379578170194554, "loss": 0.1371, "step": 4424 }, { "epoch": 79.0, "eval_loss": 0.14909496903419495, "eval_runtime": 11.5142, "eval_samples_per_second": 943.184, "eval_steps_per_second": 1.911, "step": 4424 }, { "epoch": 80.0, "grad_norm": 0.24799354374408722, "learning_rate": 0.00029367996688436096, "loss": 0.1369, "step": 4480 }, { "epoch": 80.0, "eval_loss": 0.14952804148197174, "eval_runtime": 10.7014, "eval_samples_per_second": 1014.824, "eval_steps_per_second": 2.056, "step": 4480 }, { "epoch": 81.0, "grad_norm": 0.16792896389961243, "learning_rate": 0.00029356272593375216, "loss": 0.1368, "step": 4536 }, { "epoch": 81.0, "eval_loss": 0.1491686999797821, "eval_runtime": 11.5601, "eval_samples_per_second": 939.442, "eval_steps_per_second": 1.903, "step": 4536 }, { "epoch": 82.0, "grad_norm": 0.21115855872631073, "learning_rate": 0.00029344406000724046, "loss": 0.1363, "step": 4592 }, { "epoch": 82.0, "eval_loss": 0.14837497472763062, "eval_runtime": 11.7754, "eval_samples_per_second": 922.263, "eval_steps_per_second": 1.868, "step": 4592 }, { "epoch": 83.0, "grad_norm": 0.15595555305480957, "learning_rate": 0.0002933239702760101, "loss": 0.1361, "step": 4648 }, { "epoch": 83.0, "eval_loss": 0.14758282899856567, "eval_runtime": 11.5424, "eval_samples_per_second": 940.879, "eval_steps_per_second": 1.906, "step": 4648 }, { "epoch": 84.0, "grad_norm": 0.14343903958797455, "learning_rate": 0.00029320245792529843, "loss": 0.1355, "step": 4704 }, { "epoch": 84.0, "eval_loss": 0.1478155553340912, "eval_runtime": 11.4968, "eval_samples_per_second": 944.61, "eval_steps_per_second": 1.914, "step": 4704 }, { "epoch": 85.0, "grad_norm": 0.2670864462852478, "learning_rate": 0.00029307952415438376, "loss": 0.1353, "step": 4760 }, { "epoch": 85.0, "eval_loss": 0.14811985194683075, "eval_runtime": 11.0295, "eval_samples_per_second": 984.636, "eval_steps_per_second": 1.995, "step": 4760 }, { "epoch": 86.0, "grad_norm": 0.19388346374034882, "learning_rate": 0.00029295517017657207, "loss": 0.1353, "step": 4816 }, { "epoch": 86.0, "eval_loss": 0.14837351441383362, "eval_runtime": 11.4695, "eval_samples_per_second": 946.859, "eval_steps_per_second": 1.918, "step": 4816 }, { "epoch": 87.0, "grad_norm": 0.15899422764778137, "learning_rate": 0.00029282939721918743, "loss": 0.1351, "step": 4872 }, { "epoch": 87.0, "eval_loss": 0.14791646599769592, "eval_runtime": 11.4789, "eval_samples_per_second": 946.087, "eval_steps_per_second": 1.917, "step": 4872 }, { "epoch": 88.0, "grad_norm": 0.25924888253211975, "learning_rate": 0.00029270220652355785, "loss": 0.1345, "step": 4928 }, { "epoch": 88.0, "eval_loss": 0.1483958214521408, "eval_runtime": 11.0986, "eval_samples_per_second": 978.501, "eval_steps_per_second": 1.982, "step": 4928 }, { "epoch": 89.0, "grad_norm": 0.197585791349411, "learning_rate": 0.0002925735993450043, "loss": 0.1342, "step": 4984 }, { "epoch": 89.0, "eval_loss": 0.14841538667678833, "eval_runtime": 11.2913, "eval_samples_per_second": 961.799, "eval_steps_per_second": 1.948, "step": 4984 }, { "epoch": 90.0, "grad_norm": 0.18903715908527374, "learning_rate": 0.0002924435769528278, "loss": 0.1343, "step": 5040 }, { "epoch": 90.0, "eval_loss": 0.14745239913463593, "eval_runtime": 12.07, "eval_samples_per_second": 899.752, "eval_steps_per_second": 1.823, "step": 5040 }, { "epoch": 91.0, "grad_norm": 0.1610485017299652, "learning_rate": 0.00029231214063029666, "loss": 0.1336, "step": 5096 }, { "epoch": 91.0, "eval_loss": 0.1469384878873825, "eval_runtime": 12.1199, "eval_samples_per_second": 896.05, "eval_steps_per_second": 1.815, "step": 5096 }, { "epoch": 92.0, "grad_norm": 0.20112423598766327, "learning_rate": 0.00029217929167463404, "loss": 0.1337, "step": 5152 }, { "epoch": 92.0, "eval_loss": 0.14764182269573212, "eval_runtime": 10.2692, "eval_samples_per_second": 1057.536, "eval_steps_per_second": 2.142, "step": 5152 }, { "epoch": 93.0, "grad_norm": 0.28488588333129883, "learning_rate": 0.00029204503139700625, "loss": 0.1335, "step": 5208 }, { "epoch": 93.0, "eval_loss": 0.1479685753583908, "eval_runtime": 11.6849, "eval_samples_per_second": 929.407, "eval_steps_per_second": 1.883, "step": 5208 }, { "epoch": 94.0, "grad_norm": 0.2028261125087738, "learning_rate": 0.0002919093611225077, "loss": 0.1333, "step": 5264 }, { "epoch": 94.0, "eval_loss": 0.14725789427757263, "eval_runtime": 11.2025, "eval_samples_per_second": 969.429, "eval_steps_per_second": 1.964, "step": 5264 }, { "epoch": 95.0, "grad_norm": 0.20275919139385223, "learning_rate": 0.0002917722821901492, "loss": 0.1334, "step": 5320 }, { "epoch": 95.0, "eval_loss": 0.14767614006996155, "eval_runtime": 10.8005, "eval_samples_per_second": 1005.513, "eval_steps_per_second": 2.037, "step": 5320 }, { "epoch": 96.0, "grad_norm": 0.2053348869085312, "learning_rate": 0.0002916337959528444, "loss": 0.1325, "step": 5376 }, { "epoch": 96.0, "eval_loss": 0.14707864820957184, "eval_runtime": 11.1238, "eval_samples_per_second": 976.287, "eval_steps_per_second": 1.978, "step": 5376 }, { "epoch": 97.0, "grad_norm": 0.23510950803756714, "learning_rate": 0.0002914939037773966, "loss": 0.1321, "step": 5432 }, { "epoch": 97.0, "eval_loss": 0.1476944088935852, "eval_runtime": 10.9362, "eval_samples_per_second": 993.028, "eval_steps_per_second": 2.012, "step": 5432 }, { "epoch": 98.0, "grad_norm": 0.2703108787536621, "learning_rate": 0.000291352607044485, "loss": 0.1327, "step": 5488 }, { "epoch": 98.0, "eval_loss": 0.1466565579175949, "eval_runtime": 10.8189, "eval_samples_per_second": 1003.802, "eval_steps_per_second": 2.033, "step": 5488 }, { "epoch": 99.0, "grad_norm": 0.22386641800403595, "learning_rate": 0.0002912099071486513, "loss": 0.1318, "step": 5544 }, { "epoch": 99.0, "eval_loss": 0.1469065397977829, "eval_runtime": 10.9677, "eval_samples_per_second": 990.181, "eval_steps_per_second": 2.006, "step": 5544 }, { "epoch": 100.0, "grad_norm": 0.18684013187885284, "learning_rate": 0.0002910658054982861, "loss": 0.1319, "step": 5600 }, { "epoch": 100.0, "eval_loss": 0.1462097316980362, "eval_runtime": 11.5801, "eval_samples_per_second": 937.82, "eval_steps_per_second": 1.9, "step": 5600 }, { "epoch": 101.0, "grad_norm": 0.1831580400466919, "learning_rate": 0.00029092030351561435, "loss": 0.1318, "step": 5656 }, { "epoch": 101.0, "eval_loss": 0.1467864215373993, "eval_runtime": 11.2551, "eval_samples_per_second": 964.899, "eval_steps_per_second": 1.955, "step": 5656 }, { "epoch": 102.0, "grad_norm": 0.20423631370067596, "learning_rate": 0.00029077340263668184, "loss": 0.1315, "step": 5712 }, { "epoch": 102.0, "eval_loss": 0.1470629870891571, "eval_runtime": 10.0185, "eval_samples_per_second": 1083.994, "eval_steps_per_second": 2.196, "step": 5712 }, { "epoch": 103.0, "grad_norm": 0.20669810473918915, "learning_rate": 0.0002906251043113414, "loss": 0.1312, "step": 5768 }, { "epoch": 103.0, "eval_loss": 0.14603030681610107, "eval_runtime": 11.5962, "eval_samples_per_second": 936.51, "eval_steps_per_second": 1.897, "step": 5768 }, { "epoch": 104.0, "grad_norm": 0.18566496670246124, "learning_rate": 0.0002904754100032369, "loss": 0.1308, "step": 5824 }, { "epoch": 104.0, "eval_loss": 0.146591916680336, "eval_runtime": 11.8139, "eval_samples_per_second": 919.255, "eval_steps_per_second": 1.862, "step": 5824 }, { "epoch": 105.0, "grad_norm": 0.32265496253967285, "learning_rate": 0.000290324321189791, "loss": 0.1311, "step": 5880 }, { "epoch": 105.0, "eval_loss": 0.1458718478679657, "eval_runtime": 11.9546, "eval_samples_per_second": 908.438, "eval_steps_per_second": 1.84, "step": 5880 }, { "epoch": 106.0, "grad_norm": 0.17987699806690216, "learning_rate": 0.00029017183936218906, "loss": 0.1302, "step": 5936 }, { "epoch": 106.0, "eval_loss": 0.1459737867116928, "eval_runtime": 12.1694, "eval_samples_per_second": 892.4, "eval_steps_per_second": 1.808, "step": 5936 }, { "epoch": 107.0, "grad_norm": 0.18314820528030396, "learning_rate": 0.0002900179660253659, "loss": 0.1303, "step": 5992 }, { "epoch": 107.0, "eval_loss": 0.14506617188453674, "eval_runtime": 11.0204, "eval_samples_per_second": 985.446, "eval_steps_per_second": 1.996, "step": 5992 }, { "epoch": 108.0, "grad_norm": 0.1967027485370636, "learning_rate": 0.00028986270269798893, "loss": 0.13, "step": 6048 }, { "epoch": 108.0, "eval_loss": 0.1448826640844345, "eval_runtime": 11.2115, "eval_samples_per_second": 968.651, "eval_steps_per_second": 1.962, "step": 6048 }, { "epoch": 109.0, "grad_norm": 0.17848514020442963, "learning_rate": 0.00028970605091244395, "loss": 0.13, "step": 6104 }, { "epoch": 109.0, "eval_loss": 0.14577716588974, "eval_runtime": 12.0159, "eval_samples_per_second": 903.806, "eval_steps_per_second": 1.831, "step": 6104 }, { "epoch": 110.0, "grad_norm": 0.1681281179189682, "learning_rate": 0.00028954801221482137, "loss": 0.13, "step": 6160 }, { "epoch": 110.0, "eval_loss": 0.1459922343492508, "eval_runtime": 11.657, "eval_samples_per_second": 931.628, "eval_steps_per_second": 1.887, "step": 6160 }, { "epoch": 111.0, "grad_norm": 0.19543369114398956, "learning_rate": 0.00028938858816489945, "loss": 0.1294, "step": 6216 }, { "epoch": 111.0, "eval_loss": 0.14557458460330963, "eval_runtime": 11.502, "eval_samples_per_second": 944.183, "eval_steps_per_second": 1.913, "step": 6216 }, { "epoch": 112.0, "grad_norm": 0.19514279067516327, "learning_rate": 0.0002892277803361288, "loss": 0.1294, "step": 6272 }, { "epoch": 112.0, "eval_loss": 0.14542081952095032, "eval_runtime": 11.3675, "eval_samples_per_second": 955.353, "eval_steps_per_second": 1.935, "step": 6272 }, { "epoch": 113.0, "grad_norm": 0.19245897233486176, "learning_rate": 0.00028906559031561803, "loss": 0.1294, "step": 6328 }, { "epoch": 113.0, "eval_loss": 0.14575673639774323, "eval_runtime": 12.0854, "eval_samples_per_second": 898.603, "eval_steps_per_second": 1.82, "step": 6328 }, { "epoch": 114.0, "grad_norm": 0.2559398412704468, "learning_rate": 0.0002889020197041172, "loss": 0.129, "step": 6384 }, { "epoch": 114.0, "eval_loss": 0.14476452767848969, "eval_runtime": 11.4747, "eval_samples_per_second": 946.432, "eval_steps_per_second": 1.917, "step": 6384 }, { "epoch": 115.0, "grad_norm": 0.1581374853849411, "learning_rate": 0.0002887370701160019, "loss": 0.129, "step": 6440 }, { "epoch": 115.0, "eval_loss": 0.14649543166160583, "eval_runtime": 11.7792, "eval_samples_per_second": 921.961, "eval_steps_per_second": 1.868, "step": 6440 }, { "epoch": 116.0, "grad_norm": 0.17189738154411316, "learning_rate": 0.0002885707431792581, "loss": 0.1282, "step": 6496 }, { "epoch": 116.0, "eval_loss": 0.14660660922527313, "eval_runtime": 11.9186, "eval_samples_per_second": 911.183, "eval_steps_per_second": 1.846, "step": 6496 }, { "epoch": 117.0, "grad_norm": 0.2357121855020523, "learning_rate": 0.0002884030405354656, "loss": 0.129, "step": 6552 }, { "epoch": 117.0, "eval_loss": 0.146439790725708, "eval_runtime": 11.5156, "eval_samples_per_second": 943.071, "eval_steps_per_second": 1.91, "step": 6552 }, { "epoch": 118.0, "grad_norm": 0.1968863159418106, "learning_rate": 0.00028823396383978163, "loss": 0.1279, "step": 6608 }, { "epoch": 118.0, "eval_loss": 0.1450948715209961, "eval_runtime": 11.6204, "eval_samples_per_second": 934.567, "eval_steps_per_second": 1.893, "step": 6608 }, { "epoch": 119.0, "grad_norm": 0.16850939393043518, "learning_rate": 0.0002880635147609254, "loss": 0.1279, "step": 6664 }, { "epoch": 119.0, "eval_loss": 0.1456771343946457, "eval_runtime": 11.4295, "eval_samples_per_second": 950.17, "eval_steps_per_second": 1.925, "step": 6664 }, { "epoch": 120.0, "grad_norm": 0.20816339552402496, "learning_rate": 0.0002878916949811601, "loss": 0.1277, "step": 6720 }, { "epoch": 120.0, "eval_loss": 0.1461264193058014, "eval_runtime": 11.9161, "eval_samples_per_second": 911.372, "eval_steps_per_second": 1.846, "step": 6720 }, { "epoch": 121.0, "grad_norm": 0.19195137917995453, "learning_rate": 0.0002877185061962775, "loss": 0.1279, "step": 6776 }, { "epoch": 121.0, "eval_loss": 0.14506319165229797, "eval_runtime": 10.7769, "eval_samples_per_second": 1007.715, "eval_steps_per_second": 2.041, "step": 6776 }, { "epoch": 122.0, "grad_norm": 0.1636265516281128, "learning_rate": 0.0002875439501155812, "loss": 0.1277, "step": 6832 }, { "epoch": 122.0, "eval_loss": 0.1454634666442871, "eval_runtime": 11.7121, "eval_samples_per_second": 927.245, "eval_steps_per_second": 1.878, "step": 6832 }, { "epoch": 123.0, "grad_norm": 0.17660963535308838, "learning_rate": 0.00028736802846186907, "loss": 0.1273, "step": 6888 }, { "epoch": 123.0, "eval_loss": 0.1449379324913025, "eval_runtime": 12.0977, "eval_samples_per_second": 897.695, "eval_steps_per_second": 1.819, "step": 6888 }, { "epoch": 124.0, "grad_norm": 0.20895443856716156, "learning_rate": 0.00028719074297141686, "loss": 0.127, "step": 6944 }, { "epoch": 124.0, "eval_loss": 0.14427852630615234, "eval_runtime": 11.8774, "eval_samples_per_second": 914.341, "eval_steps_per_second": 1.852, "step": 6944 }, { "epoch": 125.0, "grad_norm": 0.1895224153995514, "learning_rate": 0.0002870120953939609, "loss": 0.1269, "step": 7000 }, { "epoch": 125.0, "eval_loss": 0.1446518748998642, "eval_runtime": 11.7658, "eval_samples_per_second": 923.015, "eval_steps_per_second": 1.87, "step": 7000 }, { "epoch": 126.0, "grad_norm": 0.191587895154953, "learning_rate": 0.0002868320874926807, "loss": 0.1269, "step": 7056 }, { "epoch": 126.0, "eval_loss": 0.14533261954784393, "eval_runtime": 11.2533, "eval_samples_per_second": 965.053, "eval_steps_per_second": 1.955, "step": 7056 }, { "epoch": 127.0, "grad_norm": 0.20511987805366516, "learning_rate": 0.00028665072104418107, "loss": 0.1263, "step": 7112 }, { "epoch": 127.0, "eval_loss": 0.1444355994462967, "eval_runtime": 11.3297, "eval_samples_per_second": 958.545, "eval_steps_per_second": 1.942, "step": 7112 }, { "epoch": 128.0, "grad_norm": 0.19347704946994781, "learning_rate": 0.0002864679978384761, "loss": 0.1266, "step": 7168 }, { "epoch": 128.0, "eval_loss": 0.14528335630893707, "eval_runtime": 11.7467, "eval_samples_per_second": 924.517, "eval_steps_per_second": 1.873, "step": 7168 }, { "epoch": 129.0, "grad_norm": 0.1948786824941635, "learning_rate": 0.00028628391967896994, "loss": 0.1267, "step": 7224 }, { "epoch": 129.0, "eval_loss": 0.1452852487564087, "eval_runtime": 10.7249, "eval_samples_per_second": 1012.6, "eval_steps_per_second": 2.051, "step": 7224 }, { "epoch": 130.0, "grad_norm": 0.2143562138080597, "learning_rate": 0.00028609848838243983, "loss": 0.1263, "step": 7280 }, { "epoch": 130.0, "eval_loss": 0.14422422647476196, "eval_runtime": 12.1111, "eval_samples_per_second": 896.699, "eval_steps_per_second": 1.817, "step": 7280 }, { "epoch": 131.0, "grad_norm": 0.17198456823825836, "learning_rate": 0.0002859117057790177, "loss": 0.1258, "step": 7336 }, { "epoch": 131.0, "eval_loss": 0.14419187605381012, "eval_runtime": 11.2161, "eval_samples_per_second": 968.25, "eval_steps_per_second": 1.961, "step": 7336 }, { "epoch": 132.0, "grad_norm": 0.2027718871831894, "learning_rate": 0.0002857235737121728, "loss": 0.1257, "step": 7392 }, { "epoch": 132.0, "eval_loss": 0.14398382604122162, "eval_runtime": 11.7549, "eval_samples_per_second": 923.871, "eval_steps_per_second": 1.872, "step": 7392 }, { "epoch": 133.0, "grad_norm": 0.18598471581935883, "learning_rate": 0.00028553409403869214, "loss": 0.1256, "step": 7448 }, { "epoch": 133.0, "eval_loss": 0.144750714302063, "eval_runtime": 10.9992, "eval_samples_per_second": 987.344, "eval_steps_per_second": 2.0, "step": 7448 }, { "epoch": 134.0, "grad_norm": 0.18290792405605316, "learning_rate": 0.0002853432686286638, "loss": 0.1255, "step": 7504 }, { "epoch": 134.0, "eval_loss": 0.14384572207927704, "eval_runtime": 11.23, "eval_samples_per_second": 967.05, "eval_steps_per_second": 1.959, "step": 7504 }, { "epoch": 135.0, "grad_norm": 0.22160011529922485, "learning_rate": 0.0002851510993654578, "loss": 0.1254, "step": 7560 }, { "epoch": 135.0, "eval_loss": 0.1437937319278717, "eval_runtime": 11.9673, "eval_samples_per_second": 907.472, "eval_steps_per_second": 1.838, "step": 7560 }, { "epoch": 136.0, "grad_norm": 0.18182989954948425, "learning_rate": 0.0002849575881457068, "loss": 0.1252, "step": 7616 }, { "epoch": 136.0, "eval_loss": 0.14378975331783295, "eval_runtime": 11.8117, "eval_samples_per_second": 919.426, "eval_steps_per_second": 1.863, "step": 7616 }, { "epoch": 137.0, "grad_norm": 0.16500607132911682, "learning_rate": 0.0002847627368792885, "loss": 0.125, "step": 7672 }, { "epoch": 137.0, "eval_loss": 0.1436585932970047, "eval_runtime": 12.4256, "eval_samples_per_second": 874.0, "eval_steps_per_second": 1.771, "step": 7672 }, { "epoch": 138.0, "grad_norm": 0.22664882242679596, "learning_rate": 0.0002845665474893062, "loss": 0.125, "step": 7728 }, { "epoch": 138.0, "eval_loss": 0.14313535392284393, "eval_runtime": 12.1895, "eval_samples_per_second": 890.932, "eval_steps_per_second": 1.805, "step": 7728 }, { "epoch": 139.0, "grad_norm": 0.1606769859790802, "learning_rate": 0.0002843690219120703, "loss": 0.1242, "step": 7784 }, { "epoch": 139.0, "eval_loss": 0.14361213147640228, "eval_runtime": 12.1036, "eval_samples_per_second": 897.251, "eval_steps_per_second": 1.818, "step": 7784 }, { "epoch": 140.0, "grad_norm": 0.20197436213493347, "learning_rate": 0.0002841701620970783, "loss": 0.1244, "step": 7840 }, { "epoch": 140.0, "eval_loss": 0.142960324883461, "eval_runtime": 11.6316, "eval_samples_per_second": 933.665, "eval_steps_per_second": 1.891, "step": 7840 }, { "epoch": 141.0, "grad_norm": 0.18616272509098053, "learning_rate": 0.000283969970006996, "loss": 0.1243, "step": 7896 }, { "epoch": 141.0, "eval_loss": 0.1441134661436081, "eval_runtime": 11.589, "eval_samples_per_second": 937.094, "eval_steps_per_second": 1.898, "step": 7896 }, { "epoch": 142.0, "grad_norm": 0.20340923964977264, "learning_rate": 0.0002837684476176391, "loss": 0.1239, "step": 7952 }, { "epoch": 142.0, "eval_loss": 0.1434699296951294, "eval_runtime": 12.3235, "eval_samples_per_second": 881.241, "eval_steps_per_second": 1.785, "step": 7952 }, { "epoch": 143.0, "grad_norm": 0.18145394325256348, "learning_rate": 0.0002835655969179518, "loss": 0.1241, "step": 8008 }, { "epoch": 143.0, "eval_loss": 0.14338643848896027, "eval_runtime": 12.3449, "eval_samples_per_second": 879.717, "eval_steps_per_second": 1.782, "step": 8008 }, { "epoch": 144.0, "grad_norm": 0.1755165159702301, "learning_rate": 0.0002833614199099885, "loss": 0.1241, "step": 8064 }, { "epoch": 144.0, "eval_loss": 0.14308682084083557, "eval_runtime": 12.0765, "eval_samples_per_second": 899.268, "eval_steps_per_second": 1.822, "step": 8064 }, { "epoch": 145.0, "grad_norm": 0.18520286679267883, "learning_rate": 0.00028315591860889397, "loss": 0.1238, "step": 8120 }, { "epoch": 145.0, "eval_loss": 0.14301612973213196, "eval_runtime": 11.4026, "eval_samples_per_second": 952.414, "eval_steps_per_second": 1.929, "step": 8120 }, { "epoch": 146.0, "grad_norm": 0.2836858630180359, "learning_rate": 0.0002829490950428833, "loss": 0.1237, "step": 8176 }, { "epoch": 146.0, "eval_loss": 0.1432274430990219, "eval_runtime": 10.5295, "eval_samples_per_second": 1031.389, "eval_steps_per_second": 2.089, "step": 8176 }, { "epoch": 147.0, "grad_norm": 0.18382933735847473, "learning_rate": 0.0002827409512532215, "loss": 0.1233, "step": 8232 }, { "epoch": 147.0, "eval_loss": 0.14315703511238098, "eval_runtime": 11.7841, "eval_samples_per_second": 921.584, "eval_steps_per_second": 1.867, "step": 8232 }, { "epoch": 148.0, "grad_norm": 0.16152502596378326, "learning_rate": 0.00028253148929420393, "loss": 0.1236, "step": 8288 }, { "epoch": 148.0, "eval_loss": 0.14190851151943207, "eval_runtime": 12.2311, "eval_samples_per_second": 887.903, "eval_steps_per_second": 1.799, "step": 8288 }, { "epoch": 149.0, "grad_norm": 0.23382407426834106, "learning_rate": 0.0002823207112331354, "loss": 0.1232, "step": 8344 }, { "epoch": 149.0, "eval_loss": 0.14270788431167603, "eval_runtime": 12.109, "eval_samples_per_second": 896.855, "eval_steps_per_second": 1.817, "step": 8344 }, { "epoch": 150.0, "grad_norm": 0.1615588366985321, "learning_rate": 0.00028210861915030973, "loss": 0.1232, "step": 8400 }, { "epoch": 150.0, "eval_loss": 0.14285807311534882, "eval_runtime": 12.5884, "eval_samples_per_second": 862.702, "eval_steps_per_second": 1.748, "step": 8400 }, { "epoch": 151.0, "grad_norm": 0.2795417308807373, "learning_rate": 0.0002818952151389907, "loss": 0.1227, "step": 8456 }, { "epoch": 151.0, "eval_loss": 0.14255040884017944, "eval_runtime": 12.5025, "eval_samples_per_second": 868.624, "eval_steps_per_second": 1.76, "step": 8456 }, { "epoch": 152.0, "grad_norm": 0.2292180061340332, "learning_rate": 0.00028168050130538953, "loss": 0.1231, "step": 8512 }, { "epoch": 152.0, "eval_loss": 0.14337477087974548, "eval_runtime": 12.1529, "eval_samples_per_second": 893.611, "eval_steps_per_second": 1.81, "step": 8512 }, { "epoch": 153.0, "grad_norm": 0.17736776173114777, "learning_rate": 0.00028146447976864553, "loss": 0.1224, "step": 8568 }, { "epoch": 153.0, "eval_loss": 0.14352336525917053, "eval_runtime": 12.3539, "eval_samples_per_second": 879.073, "eval_steps_per_second": 1.781, "step": 8568 }, { "epoch": 154.0, "grad_norm": 0.36273321509361267, "learning_rate": 0.0002812471526608039, "loss": 0.1227, "step": 8624 }, { "epoch": 154.0, "eval_loss": 0.142772376537323, "eval_runtime": 12.0892, "eval_samples_per_second": 898.323, "eval_steps_per_second": 1.82, "step": 8624 }, { "epoch": 155.0, "grad_norm": 0.19883078336715698, "learning_rate": 0.00028102852212679526, "loss": 0.1228, "step": 8680 }, { "epoch": 155.0, "eval_loss": 0.14210332930088043, "eval_runtime": 12.2389, "eval_samples_per_second": 887.336, "eval_steps_per_second": 1.798, "step": 8680 }, { "epoch": 156.0, "grad_norm": 0.2114337682723999, "learning_rate": 0.00028080859032441463, "loss": 0.1223, "step": 8736 }, { "epoch": 156.0, "eval_loss": 0.14258325099945068, "eval_runtime": 12.5038, "eval_samples_per_second": 868.534, "eval_steps_per_second": 1.759, "step": 8736 }, { "epoch": 157.0, "grad_norm": 0.193147674202919, "learning_rate": 0.0002805873594243001, "loss": 0.1223, "step": 8792 }, { "epoch": 157.0, "eval_loss": 0.1423390656709671, "eval_runtime": 11.2533, "eval_samples_per_second": 965.047, "eval_steps_per_second": 1.955, "step": 8792 }, { "epoch": 158.0, "grad_norm": 0.15751470625400543, "learning_rate": 0.0002803648316099116, "loss": 0.1222, "step": 8848 }, { "epoch": 158.0, "eval_loss": 0.1417943835258484, "eval_runtime": 11.5797, "eval_samples_per_second": 937.847, "eval_steps_per_second": 1.9, "step": 8848 }, { "epoch": 159.0, "grad_norm": 0.27395108342170715, "learning_rate": 0.00028014100907750874, "loss": 0.1219, "step": 8904 }, { "epoch": 159.0, "eval_loss": 0.14257293939590454, "eval_runtime": 12.328, "eval_samples_per_second": 880.923, "eval_steps_per_second": 1.785, "step": 8904 }, { "epoch": 160.0, "grad_norm": 0.22418324649333954, "learning_rate": 0.0002799158940361295, "loss": 0.1217, "step": 8960 }, { "epoch": 160.0, "eval_loss": 0.1431107521057129, "eval_runtime": 12.2423, "eval_samples_per_second": 887.09, "eval_steps_per_second": 1.797, "step": 8960 }, { "epoch": 161.0, "grad_norm": 0.2003849744796753, "learning_rate": 0.0002796894887075685, "loss": 0.1218, "step": 9016 }, { "epoch": 161.0, "eval_loss": 0.14198802411556244, "eval_runtime": 11.4923, "eval_samples_per_second": 944.981, "eval_steps_per_second": 1.914, "step": 9016 }, { "epoch": 162.0, "grad_norm": 0.21222490072250366, "learning_rate": 0.00027946179532635447, "loss": 0.1215, "step": 9072 }, { "epoch": 162.0, "eval_loss": 0.14226287603378296, "eval_runtime": 12.6489, "eval_samples_per_second": 858.572, "eval_steps_per_second": 1.739, "step": 9072 }, { "epoch": 163.0, "grad_norm": 0.3284847140312195, "learning_rate": 0.0002792328161397301, "loss": 0.1214, "step": 9128 }, { "epoch": 163.0, "eval_loss": 0.14255832135677338, "eval_runtime": 11.8749, "eval_samples_per_second": 914.536, "eval_steps_per_second": 1.853, "step": 9128 }, { "epoch": 164.0, "grad_norm": 0.17873606085777283, "learning_rate": 0.0002790025534076267, "loss": 0.1209, "step": 9184 }, { "epoch": 164.0, "eval_loss": 0.14214134216308594, "eval_runtime": 11.7349, "eval_samples_per_second": 925.446, "eval_steps_per_second": 1.875, "step": 9184 }, { "epoch": 165.0, "grad_norm": 0.29637348651885986, "learning_rate": 0.00027877100940264476, "loss": 0.1214, "step": 9240 }, { "epoch": 165.0, "eval_loss": 0.14148862659931183, "eval_runtime": 11.2369, "eval_samples_per_second": 966.457, "eval_steps_per_second": 1.958, "step": 9240 }, { "epoch": 166.0, "grad_norm": 0.19445298612117767, "learning_rate": 0.0002785381864100304, "loss": 0.1211, "step": 9296 }, { "epoch": 166.0, "eval_loss": 0.14366163313388824, "eval_runtime": 11.7897, "eval_samples_per_second": 921.146, "eval_steps_per_second": 1.866, "step": 9296 }, { "epoch": 167.0, "grad_norm": 0.2037288248538971, "learning_rate": 0.0002783040867276523, "loss": 0.1209, "step": 9352 }, { "epoch": 167.0, "eval_loss": 0.14206562936306, "eval_runtime": 11.4292, "eval_samples_per_second": 950.199, "eval_steps_per_second": 1.925, "step": 9352 }, { "epoch": 168.0, "grad_norm": 0.21530179679393768, "learning_rate": 0.0002780687126659796, "loss": 0.1208, "step": 9408 }, { "epoch": 168.0, "eval_loss": 0.1410149782896042, "eval_runtime": 11.7288, "eval_samples_per_second": 925.923, "eval_steps_per_second": 1.876, "step": 9408 } ], "logging_steps": 500, "max_steps": 56000, "num_input_tokens_seen": 0, "num_train_epochs": 1000, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 1e-05 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1001513367240704e+18, "train_batch_size": 512, "trial_name": null, "trial_params": null }