| { | |
| "best_global_step": 9408, | |
| "best_metric": 0.1410149782896042, | |
| "best_model_checkpoint": "tmp/out/1536-96-r2_mix_channel_fcmCtx3_fcmLayers3_fcmChMixingTrue_stride24_bs512_lrf_deb3/checkpoint-9408", | |
| "epoch": 168.0, | |
| "eval_steps": 500, | |
| "global_step": 9408, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.376089870929718, | |
| "learning_rate": 0.00029836401390103334, | |
| "loss": 0.3643, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.25079935789108276, | |
| "eval_runtime": 12.3705, | |
| "eval_samples_per_second": 877.898, | |
| "eval_steps_per_second": 1.778, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.25105392932891846, | |
| "learning_rate": 0.00029836183164580883, | |
| "loss": 0.3058, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.23216894268989563, | |
| "eval_runtime": 12.2194, | |
| "eval_samples_per_second": 888.753, | |
| "eval_steps_per_second": 1.8, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.17020165920257568, | |
| "learning_rate": 0.00029835817704944523, | |
| "loss": 0.2683, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.20991244912147522, | |
| "eval_runtime": 10.9934, | |
| "eval_samples_per_second": 987.863, | |
| "eval_steps_per_second": 2.001, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.13130681216716766, | |
| "learning_rate": 0.00029835305014801184, | |
| "loss": 0.2395, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.19736029207706451, | |
| "eval_runtime": 11.7226, | |
| "eval_samples_per_second": 926.414, | |
| "eval_steps_per_second": 1.877, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.12686163187026978, | |
| "learning_rate": 0.0002983464509921093, | |
| "loss": 0.2241, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.18977424502372742, | |
| "eval_runtime": 11.8479, | |
| "eval_samples_per_second": 916.618, | |
| "eval_steps_per_second": 1.857, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.11746390908956528, | |
| "learning_rate": 0.00029833837964686835, | |
| "loss": 0.2148, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.1851092129945755, | |
| "eval_runtime": 11.7556, | |
| "eval_samples_per_second": 923.812, | |
| "eval_steps_per_second": 1.871, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.13627897202968597, | |
| "learning_rate": 0.0002983288361919503, | |
| "loss": 0.2078, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.18129761517047882, | |
| "eval_runtime": 11.7487, | |
| "eval_samples_per_second": 924.357, | |
| "eval_steps_per_second": 1.873, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.1497841328382492, | |
| "learning_rate": 0.00029831782072154485, | |
| "loss": 0.2025, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.17769944667816162, | |
| "eval_runtime": 12.1141, | |
| "eval_samples_per_second": 896.477, | |
| "eval_steps_per_second": 1.816, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.19643521308898926, | |
| "learning_rate": 0.0002983053333443701, | |
| "loss": 0.1976, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.17583897709846497, | |
| "eval_runtime": 12.5558, | |
| "eval_samples_per_second": 864.936, | |
| "eval_steps_per_second": 1.752, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.1033664122223854, | |
| "learning_rate": 0.0002982913741836719, | |
| "loss": 0.1936, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.1739388257265091, | |
| "eval_runtime": 12.449, | |
| "eval_samples_per_second": 872.358, | |
| "eval_steps_per_second": 1.767, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 0.1361815184354782, | |
| "learning_rate": 0.00029827594337722164, | |
| "loss": 0.1902, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.17110829055309296, | |
| "eval_runtime": 12.7701, | |
| "eval_samples_per_second": 850.423, | |
| "eval_steps_per_second": 1.723, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 0.12385320663452148, | |
| "learning_rate": 0.0002982590410773146, | |
| "loss": 0.1867, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.16852673888206482, | |
| "eval_runtime": 11.8972, | |
| "eval_samples_per_second": 912.817, | |
| "eval_steps_per_second": 1.849, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 0.13126742839813232, | |
| "learning_rate": 0.0002982406674507699, | |
| "loss": 0.1837, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.1675039380788803, | |
| "eval_runtime": 11.8951, | |
| "eval_samples_per_second": 912.98, | |
| "eval_steps_per_second": 1.85, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 0.14581529796123505, | |
| "learning_rate": 0.00029822082267892794, | |
| "loss": 0.1818, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.16522179543972015, | |
| "eval_runtime": 12.951, | |
| "eval_samples_per_second": 838.545, | |
| "eval_steps_per_second": 1.699, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.12710689008235931, | |
| "learning_rate": 0.0002981995069576483, | |
| "loss": 0.1787, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.1651495099067688, | |
| "eval_runtime": 12.4369, | |
| "eval_samples_per_second": 873.211, | |
| "eval_steps_per_second": 1.769, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.1914917379617691, | |
| "learning_rate": 0.0002981767204973089, | |
| "loss": 0.177, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.1639031320810318, | |
| "eval_runtime": 12.7112, | |
| "eval_samples_per_second": 854.365, | |
| "eval_steps_per_second": 1.731, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 0.15502069890499115, | |
| "learning_rate": 0.00029815246352280276, | |
| "loss": 0.1751, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.16176268458366394, | |
| "eval_runtime": 12.1031, | |
| "eval_samples_per_second": 897.291, | |
| "eval_steps_per_second": 1.818, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.11603855341672897, | |
| "learning_rate": 0.0002981267362735362, | |
| "loss": 0.1734, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.1614038050174713, | |
| "eval_runtime": 11.893, | |
| "eval_samples_per_second": 913.139, | |
| "eval_steps_per_second": 1.85, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 0.11780980974435806, | |
| "learning_rate": 0.0002980995390034271, | |
| "loss": 0.172, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.16114258766174316, | |
| "eval_runtime": 12.6404, | |
| "eval_samples_per_second": 859.152, | |
| "eval_steps_per_second": 1.74, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.14823858439922333, | |
| "learning_rate": 0.00029807087198090116, | |
| "loss": 0.1702, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.15980996191501617, | |
| "eval_runtime": 12.5631, | |
| "eval_samples_per_second": 864.434, | |
| "eval_steps_per_second": 1.751, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "grad_norm": 0.1246936172246933, | |
| "learning_rate": 0.0002980407354888907, | |
| "loss": 0.1688, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 0.15955598652362823, | |
| "eval_runtime": 12.315, | |
| "eval_samples_per_second": 881.853, | |
| "eval_steps_per_second": 1.786, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 0.11726798117160797, | |
| "learning_rate": 0.0002980091298248309, | |
| "loss": 0.1675, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 0.15864743292331696, | |
| "eval_runtime": 12.3526, | |
| "eval_samples_per_second": 879.166, | |
| "eval_steps_per_second": 1.781, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "grad_norm": 0.13960805535316467, | |
| "learning_rate": 0.0002979760553006564, | |
| "loss": 0.1666, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 0.15781378746032715, | |
| "eval_runtime": 12.187, | |
| "eval_samples_per_second": 891.116, | |
| "eval_steps_per_second": 1.805, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 0.11856065690517426, | |
| "learning_rate": 0.00029794151224279964, | |
| "loss": 0.1652, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.15776978433132172, | |
| "eval_runtime": 12.435, | |
| "eval_samples_per_second": 873.344, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.12466388940811157, | |
| "learning_rate": 0.00029790550099218654, | |
| "loss": 0.1643, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 0.15815725922584534, | |
| "eval_runtime": 13.1792, | |
| "eval_samples_per_second": 824.023, | |
| "eval_steps_per_second": 1.669, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 0.12369589507579803, | |
| "learning_rate": 0.0002978680219042336, | |
| "loss": 0.1633, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_loss": 0.1567024141550064, | |
| "eval_runtime": 12.484, | |
| "eval_samples_per_second": 869.916, | |
| "eval_steps_per_second": 1.762, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "grad_norm": 0.14197547733783722, | |
| "learning_rate": 0.0002978290753488448, | |
| "loss": 0.1624, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 0.15676391124725342, | |
| "eval_runtime": 12.738, | |
| "eval_samples_per_second": 852.567, | |
| "eval_steps_per_second": 1.727, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 0.13262535631656647, | |
| "learning_rate": 0.0002977886617104062, | |
| "loss": 0.1613, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 0.1567520797252655, | |
| "eval_runtime": 12.6529, | |
| "eval_samples_per_second": 858.304, | |
| "eval_steps_per_second": 1.739, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "grad_norm": 0.15622882544994354, | |
| "learning_rate": 0.0002977467813877842, | |
| "loss": 0.1604, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_loss": 0.15647795796394348, | |
| "eval_runtime": 12.6006, | |
| "eval_samples_per_second": 861.863, | |
| "eval_steps_per_second": 1.746, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 0.15161629021167755, | |
| "learning_rate": 0.00029770343479432095, | |
| "loss": 0.1598, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 0.15717600286006927, | |
| "eval_runtime": 12.8165, | |
| "eval_samples_per_second": 847.348, | |
| "eval_steps_per_second": 1.717, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "grad_norm": 0.12715986371040344, | |
| "learning_rate": 0.0002976586223578297, | |
| "loss": 0.1591, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_loss": 0.1557074338197708, | |
| "eval_runtime": 12.6403, | |
| "eval_samples_per_second": 859.156, | |
| "eval_steps_per_second": 1.74, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "grad_norm": 0.1595166027545929, | |
| "learning_rate": 0.00029761234452059136, | |
| "loss": 0.1584, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 0.15540747344493866, | |
| "eval_runtime": 13.3084, | |
| "eval_samples_per_second": 816.027, | |
| "eval_steps_per_second": 1.653, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "grad_norm": 0.16593649983406067, | |
| "learning_rate": 0.0002975646017393494, | |
| "loss": 0.1576, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_loss": 0.15468333661556244, | |
| "eval_runtime": 13.1483, | |
| "eval_samples_per_second": 825.961, | |
| "eval_steps_per_second": 1.673, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "grad_norm": 0.14555956423282623, | |
| "learning_rate": 0.0002975153944853054, | |
| "loss": 0.1567, | |
| "step": 1904 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_loss": 0.1553257554769516, | |
| "eval_runtime": 12.853, | |
| "eval_samples_per_second": 844.936, | |
| "eval_steps_per_second": 1.712, | |
| "step": 1904 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 0.23194457590579987, | |
| "learning_rate": 0.00029746472324411547, | |
| "loss": 0.156, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_loss": 0.1549767106771469, | |
| "eval_runtime": 11.49, | |
| "eval_samples_per_second": 945.169, | |
| "eval_steps_per_second": 1.915, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "grad_norm": 0.17572428286075592, | |
| "learning_rate": 0.0002974125885158844, | |
| "loss": 0.1559, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_loss": 0.15631072223186493, | |
| "eval_runtime": 12.6465, | |
| "eval_samples_per_second": 858.739, | |
| "eval_steps_per_second": 1.74, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "grad_norm": 0.1315496563911438, | |
| "learning_rate": 0.0002973589908151604, | |
| "loss": 0.1547, | |
| "step": 2072 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_loss": 0.1540231704711914, | |
| "eval_runtime": 13.3162, | |
| "eval_samples_per_second": 815.548, | |
| "eval_steps_per_second": 1.652, | |
| "step": 2072 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "grad_norm": 0.17212693393230438, | |
| "learning_rate": 0.0002973039306709319, | |
| "loss": 0.1539, | |
| "step": 2128 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_loss": 0.15414279699325562, | |
| "eval_runtime": 13.2364, | |
| "eval_samples_per_second": 820.466, | |
| "eval_steps_per_second": 1.662, | |
| "step": 2128 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "grad_norm": 0.12589286267757416, | |
| "learning_rate": 0.0002972474086266193, | |
| "loss": 0.1538, | |
| "step": 2184 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_loss": 0.15399765968322754, | |
| "eval_runtime": 12.5952, | |
| "eval_samples_per_second": 862.236, | |
| "eval_steps_per_second": 1.747, | |
| "step": 2184 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 0.1479528248310089, | |
| "learning_rate": 0.0002971894252400732, | |
| "loss": 0.1529, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_loss": 0.1546306610107422, | |
| "eval_runtime": 12.4569, | |
| "eval_samples_per_second": 871.809, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "grad_norm": 0.140830859541893, | |
| "learning_rate": 0.00029712998108356566, | |
| "loss": 0.1521, | |
| "step": 2296 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_loss": 0.15411749482154846, | |
| "eval_runtime": 12.8911, | |
| "eval_samples_per_second": 842.441, | |
| "eval_steps_per_second": 1.707, | |
| "step": 2296 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "grad_norm": 0.14429251849651337, | |
| "learning_rate": 0.0002970690767437871, | |
| "loss": 0.1521, | |
| "step": 2352 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_loss": 0.1535186916589737, | |
| "eval_runtime": 12.7037, | |
| "eval_samples_per_second": 854.87, | |
| "eval_steps_per_second": 1.732, | |
| "step": 2352 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "grad_norm": 0.1678067147731781, | |
| "learning_rate": 0.00029700671282183844, | |
| "loss": 0.1516, | |
| "step": 2408 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_loss": 0.15345174074172974, | |
| "eval_runtime": 12.8622, | |
| "eval_samples_per_second": 844.337, | |
| "eval_steps_per_second": 1.71, | |
| "step": 2408 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "grad_norm": 0.16715741157531738, | |
| "learning_rate": 0.00029694288993322636, | |
| "loss": 0.1506, | |
| "step": 2464 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_loss": 0.1528453379869461, | |
| "eval_runtime": 12.394, | |
| "eval_samples_per_second": 876.23, | |
| "eval_steps_per_second": 1.775, | |
| "step": 2464 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 0.1476888358592987, | |
| "learning_rate": 0.00029687760870785704, | |
| "loss": 0.1502, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_loss": 0.15371684730052948, | |
| "eval_runtime": 12.8504, | |
| "eval_samples_per_second": 845.113, | |
| "eval_steps_per_second": 1.712, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "grad_norm": 0.16268473863601685, | |
| "learning_rate": 0.00029681086979003, | |
| "loss": 0.1497, | |
| "step": 2576 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_loss": 0.15216761827468872, | |
| "eval_runtime": 12.9049, | |
| "eval_samples_per_second": 841.539, | |
| "eval_steps_per_second": 1.705, | |
| "step": 2576 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "grad_norm": 0.17756158113479614, | |
| "learning_rate": 0.0002967426738384313, | |
| "loss": 0.1493, | |
| "step": 2632 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_loss": 0.15324676036834717, | |
| "eval_runtime": 13.0526, | |
| "eval_samples_per_second": 832.021, | |
| "eval_steps_per_second": 1.685, | |
| "step": 2632 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "grad_norm": 0.13994063436985016, | |
| "learning_rate": 0.0002966730215261271, | |
| "loss": 0.1487, | |
| "step": 2688 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_loss": 0.15221010148525238, | |
| "eval_runtime": 12.6334, | |
| "eval_samples_per_second": 859.628, | |
| "eval_steps_per_second": 1.741, | |
| "step": 2688 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "grad_norm": 0.18394885957241058, | |
| "learning_rate": 0.0002966019135405581, | |
| "loss": 0.1483, | |
| "step": 2744 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_loss": 0.15254603326320648, | |
| "eval_runtime": 12.296, | |
| "eval_samples_per_second": 883.214, | |
| "eval_steps_per_second": 1.789, | |
| "step": 2744 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 0.14756232500076294, | |
| "learning_rate": 0.000296529350583531, | |
| "loss": 0.1479, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_loss": 0.15157358348369598, | |
| "eval_runtime": 12.7067, | |
| "eval_samples_per_second": 854.666, | |
| "eval_steps_per_second": 1.731, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "grad_norm": 0.18675681948661804, | |
| "learning_rate": 0.00029645533337121344, | |
| "loss": 0.1476, | |
| "step": 2856 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "eval_loss": 0.15315961837768555, | |
| "eval_runtime": 12.914, | |
| "eval_samples_per_second": 840.949, | |
| "eval_steps_per_second": 1.704, | |
| "step": 2856 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "grad_norm": 0.21148425340652466, | |
| "learning_rate": 0.0002963798626341248, | |
| "loss": 0.1467, | |
| "step": 2912 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "eval_loss": 0.151397705078125, | |
| "eval_runtime": 12.6083, | |
| "eval_samples_per_second": 861.336, | |
| "eval_steps_per_second": 1.745, | |
| "step": 2912 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "grad_norm": 0.14957012236118317, | |
| "learning_rate": 0.00029630293911713125, | |
| "loss": 0.1463, | |
| "step": 2968 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "eval_loss": 0.152817040681839, | |
| "eval_runtime": 12.3988, | |
| "eval_samples_per_second": 875.89, | |
| "eval_steps_per_second": 1.774, | |
| "step": 2968 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "grad_norm": 0.18841682374477386, | |
| "learning_rate": 0.0002962245635794367, | |
| "loss": 0.1457, | |
| "step": 3024 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "eval_loss": 0.1509653627872467, | |
| "eval_runtime": 12.9201, | |
| "eval_samples_per_second": 840.553, | |
| "eval_steps_per_second": 1.703, | |
| "step": 3024 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "grad_norm": 0.19782641530036926, | |
| "learning_rate": 0.00029614473679457606, | |
| "loss": 0.1457, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "eval_loss": 0.15204061567783356, | |
| "eval_runtime": 13.0172, | |
| "eval_samples_per_second": 834.282, | |
| "eval_steps_per_second": 1.69, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "grad_norm": 0.15806534886360168, | |
| "learning_rate": 0.0002960634595504073, | |
| "loss": 0.145, | |
| "step": 3136 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "eval_loss": 0.15144167840480804, | |
| "eval_runtime": 12.3723, | |
| "eval_samples_per_second": 877.767, | |
| "eval_steps_per_second": 1.778, | |
| "step": 3136 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "grad_norm": 0.1470707207918167, | |
| "learning_rate": 0.00029598073264910414, | |
| "loss": 0.1446, | |
| "step": 3192 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "eval_loss": 0.15259326994419098, | |
| "eval_runtime": 11.8486, | |
| "eval_samples_per_second": 916.567, | |
| "eval_steps_per_second": 1.857, | |
| "step": 3192 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "grad_norm": 0.12880393862724304, | |
| "learning_rate": 0.00029589655690714776, | |
| "loss": 0.1444, | |
| "step": 3248 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "eval_loss": 0.1521604359149933, | |
| "eval_runtime": 12.3711, | |
| "eval_samples_per_second": 877.851, | |
| "eval_steps_per_second": 1.778, | |
| "step": 3248 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "grad_norm": 0.20687344670295715, | |
| "learning_rate": 0.00029581093315531867, | |
| "loss": 0.1439, | |
| "step": 3304 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "eval_loss": 0.1506902128458023, | |
| "eval_runtime": 12.2839, | |
| "eval_samples_per_second": 884.082, | |
| "eval_steps_per_second": 1.791, | |
| "step": 3304 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 0.31674283742904663, | |
| "learning_rate": 0.00029572386223868856, | |
| "loss": 0.1434, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "eval_loss": 0.1497628092765808, | |
| "eval_runtime": 12.2602, | |
| "eval_samples_per_second": 885.791, | |
| "eval_steps_per_second": 1.794, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "grad_norm": 0.1524023711681366, | |
| "learning_rate": 0.0002956353450166127, | |
| "loss": 0.1428, | |
| "step": 3416 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "eval_loss": 0.15104272961616516, | |
| "eval_runtime": 11.4854, | |
| "eval_samples_per_second": 945.545, | |
| "eval_steps_per_second": 1.915, | |
| "step": 3416 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "grad_norm": 0.1333588808774948, | |
| "learning_rate": 0.00029554538236271986, | |
| "loss": 0.1427, | |
| "step": 3472 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "eval_loss": 0.15125687420368195, | |
| "eval_runtime": 11.619, | |
| "eval_samples_per_second": 934.673, | |
| "eval_steps_per_second": 1.893, | |
| "step": 3472 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "grad_norm": 0.14987458288669586, | |
| "learning_rate": 0.0002954539751649054, | |
| "loss": 0.1427, | |
| "step": 3528 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "eval_loss": 0.15022161602973938, | |
| "eval_runtime": 11.7178, | |
| "eval_samples_per_second": 926.795, | |
| "eval_steps_per_second": 1.877, | |
| "step": 3528 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "grad_norm": 0.19036932289600372, | |
| "learning_rate": 0.00029536112432532164, | |
| "loss": 0.1418, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "eval_loss": 0.15002530813217163, | |
| "eval_runtime": 12.0423, | |
| "eval_samples_per_second": 901.82, | |
| "eval_steps_per_second": 1.827, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "grad_norm": 0.15858310461044312, | |
| "learning_rate": 0.00029526683076036824, | |
| "loss": 0.1416, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "eval_loss": 0.15072880685329437, | |
| "eval_runtime": 11.4427, | |
| "eval_samples_per_second": 949.077, | |
| "eval_steps_per_second": 1.923, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "grad_norm": 0.1411045342683792, | |
| "learning_rate": 0.0002951710954006851, | |
| "loss": 0.1415, | |
| "step": 3696 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "eval_loss": 0.150208979845047, | |
| "eval_runtime": 11.7843, | |
| "eval_samples_per_second": 921.567, | |
| "eval_steps_per_second": 1.867, | |
| "step": 3696 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "grad_norm": 0.18127693235874176, | |
| "learning_rate": 0.00029507391919114174, | |
| "loss": 0.1407, | |
| "step": 3752 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "eval_loss": 0.15111134946346283, | |
| "eval_runtime": 11.7998, | |
| "eval_samples_per_second": 920.352, | |
| "eval_steps_per_second": 1.864, | |
| "step": 3752 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "grad_norm": 0.20954985916614532, | |
| "learning_rate": 0.0002949753030908276, | |
| "loss": 0.1404, | |
| "step": 3808 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "eval_loss": 0.15048466622829437, | |
| "eval_runtime": 11.8536, | |
| "eval_samples_per_second": 916.178, | |
| "eval_steps_per_second": 1.856, | |
| "step": 3808 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "grad_norm": 0.1799214780330658, | |
| "learning_rate": 0.0002948752480730442, | |
| "loss": 0.1401, | |
| "step": 3864 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "eval_loss": 0.14996136724948883, | |
| "eval_runtime": 11.8425, | |
| "eval_samples_per_second": 917.04, | |
| "eval_steps_per_second": 1.858, | |
| "step": 3864 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 0.14687888324260712, | |
| "learning_rate": 0.0002947737551252938, | |
| "loss": 0.1399, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "eval_loss": 0.1494998186826706, | |
| "eval_runtime": 11.8446, | |
| "eval_samples_per_second": 916.877, | |
| "eval_steps_per_second": 1.857, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "grad_norm": 0.2250983864068985, | |
| "learning_rate": 0.000294670825249271, | |
| "loss": 0.1397, | |
| "step": 3976 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "eval_loss": 0.14974181354045868, | |
| "eval_runtime": 10.3667, | |
| "eval_samples_per_second": 1047.585, | |
| "eval_steps_per_second": 2.122, | |
| "step": 3976 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "grad_norm": 0.14977572858333588, | |
| "learning_rate": 0.00029456645946085235, | |
| "loss": 0.1393, | |
| "step": 4032 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "eval_loss": 0.1504337042570114, | |
| "eval_runtime": 11.0031, | |
| "eval_samples_per_second": 986.994, | |
| "eval_steps_per_second": 1.999, | |
| "step": 4032 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "grad_norm": 0.2215435802936554, | |
| "learning_rate": 0.00029446065879008577, | |
| "loss": 0.1389, | |
| "step": 4088 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "eval_loss": 0.14960449934005737, | |
| "eval_runtime": 10.5211, | |
| "eval_samples_per_second": 1032.216, | |
| "eval_steps_per_second": 2.091, | |
| "step": 4088 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "grad_norm": 0.14885684847831726, | |
| "learning_rate": 0.00029435342428118117, | |
| "loss": 0.1384, | |
| "step": 4144 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "eval_loss": 0.14882370829582214, | |
| "eval_runtime": 11.6942, | |
| "eval_samples_per_second": 928.669, | |
| "eval_steps_per_second": 1.881, | |
| "step": 4144 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "grad_norm": 0.20596224069595337, | |
| "learning_rate": 0.0002942447569924998, | |
| "loss": 0.1384, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "eval_loss": 0.14847591519355774, | |
| "eval_runtime": 11.911, | |
| "eval_samples_per_second": 911.765, | |
| "eval_steps_per_second": 1.847, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "grad_norm": 0.1551866978406906, | |
| "learning_rate": 0.0002941346579965444, | |
| "loss": 0.1379, | |
| "step": 4256 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "eval_loss": 0.1497822105884552, | |
| "eval_runtime": 11.0615, | |
| "eval_samples_per_second": 981.782, | |
| "eval_steps_per_second": 1.989, | |
| "step": 4256 | |
| }, | |
| { | |
| "epoch": 77.0, | |
| "grad_norm": 0.19567330181598663, | |
| "learning_rate": 0.00029402312837994727, | |
| "loss": 0.138, | |
| "step": 4312 | |
| }, | |
| { | |
| "epoch": 77.0, | |
| "eval_loss": 0.14890199899673462, | |
| "eval_runtime": 11.5065, | |
| "eval_samples_per_second": 943.812, | |
| "eval_steps_per_second": 1.912, | |
| "step": 4312 | |
| }, | |
| { | |
| "epoch": 78.0, | |
| "grad_norm": 0.1951490044593811, | |
| "learning_rate": 0.0002939101692434606, | |
| "loss": 0.1372, | |
| "step": 4368 | |
| }, | |
| { | |
| "epoch": 78.0, | |
| "eval_loss": 0.14929604530334473, | |
| "eval_runtime": 11.7303, | |
| "eval_samples_per_second": 925.806, | |
| "eval_steps_per_second": 1.875, | |
| "step": 4368 | |
| }, | |
| { | |
| "epoch": 79.0, | |
| "grad_norm": 0.15116438269615173, | |
| "learning_rate": 0.00029379578170194554, | |
| "loss": 0.1371, | |
| "step": 4424 | |
| }, | |
| { | |
| "epoch": 79.0, | |
| "eval_loss": 0.14909496903419495, | |
| "eval_runtime": 11.5142, | |
| "eval_samples_per_second": 943.184, | |
| "eval_steps_per_second": 1.911, | |
| "step": 4424 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "grad_norm": 0.24799354374408722, | |
| "learning_rate": 0.00029367996688436096, | |
| "loss": 0.1369, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "eval_loss": 0.14952804148197174, | |
| "eval_runtime": 10.7014, | |
| "eval_samples_per_second": 1014.824, | |
| "eval_steps_per_second": 2.056, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 81.0, | |
| "grad_norm": 0.16792896389961243, | |
| "learning_rate": 0.00029356272593375216, | |
| "loss": 0.1368, | |
| "step": 4536 | |
| }, | |
| { | |
| "epoch": 81.0, | |
| "eval_loss": 0.1491686999797821, | |
| "eval_runtime": 11.5601, | |
| "eval_samples_per_second": 939.442, | |
| "eval_steps_per_second": 1.903, | |
| "step": 4536 | |
| }, | |
| { | |
| "epoch": 82.0, | |
| "grad_norm": 0.21115855872631073, | |
| "learning_rate": 0.00029344406000724046, | |
| "loss": 0.1363, | |
| "step": 4592 | |
| }, | |
| { | |
| "epoch": 82.0, | |
| "eval_loss": 0.14837497472763062, | |
| "eval_runtime": 11.7754, | |
| "eval_samples_per_second": 922.263, | |
| "eval_steps_per_second": 1.868, | |
| "step": 4592 | |
| }, | |
| { | |
| "epoch": 83.0, | |
| "grad_norm": 0.15595555305480957, | |
| "learning_rate": 0.0002933239702760101, | |
| "loss": 0.1361, | |
| "step": 4648 | |
| }, | |
| { | |
| "epoch": 83.0, | |
| "eval_loss": 0.14758282899856567, | |
| "eval_runtime": 11.5424, | |
| "eval_samples_per_second": 940.879, | |
| "eval_steps_per_second": 1.906, | |
| "step": 4648 | |
| }, | |
| { | |
| "epoch": 84.0, | |
| "grad_norm": 0.14343903958797455, | |
| "learning_rate": 0.00029320245792529843, | |
| "loss": 0.1355, | |
| "step": 4704 | |
| }, | |
| { | |
| "epoch": 84.0, | |
| "eval_loss": 0.1478155553340912, | |
| "eval_runtime": 11.4968, | |
| "eval_samples_per_second": 944.61, | |
| "eval_steps_per_second": 1.914, | |
| "step": 4704 | |
| }, | |
| { | |
| "epoch": 85.0, | |
| "grad_norm": 0.2670864462852478, | |
| "learning_rate": 0.00029307952415438376, | |
| "loss": 0.1353, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 85.0, | |
| "eval_loss": 0.14811985194683075, | |
| "eval_runtime": 11.0295, | |
| "eval_samples_per_second": 984.636, | |
| "eval_steps_per_second": 1.995, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 86.0, | |
| "grad_norm": 0.19388346374034882, | |
| "learning_rate": 0.00029295517017657207, | |
| "loss": 0.1353, | |
| "step": 4816 | |
| }, | |
| { | |
| "epoch": 86.0, | |
| "eval_loss": 0.14837351441383362, | |
| "eval_runtime": 11.4695, | |
| "eval_samples_per_second": 946.859, | |
| "eval_steps_per_second": 1.918, | |
| "step": 4816 | |
| }, | |
| { | |
| "epoch": 87.0, | |
| "grad_norm": 0.15899422764778137, | |
| "learning_rate": 0.00029282939721918743, | |
| "loss": 0.1351, | |
| "step": 4872 | |
| }, | |
| { | |
| "epoch": 87.0, | |
| "eval_loss": 0.14791646599769592, | |
| "eval_runtime": 11.4789, | |
| "eval_samples_per_second": 946.087, | |
| "eval_steps_per_second": 1.917, | |
| "step": 4872 | |
| }, | |
| { | |
| "epoch": 88.0, | |
| "grad_norm": 0.25924888253211975, | |
| "learning_rate": 0.00029270220652355785, | |
| "loss": 0.1345, | |
| "step": 4928 | |
| }, | |
| { | |
| "epoch": 88.0, | |
| "eval_loss": 0.1483958214521408, | |
| "eval_runtime": 11.0986, | |
| "eval_samples_per_second": 978.501, | |
| "eval_steps_per_second": 1.982, | |
| "step": 4928 | |
| }, | |
| { | |
| "epoch": 89.0, | |
| "grad_norm": 0.197585791349411, | |
| "learning_rate": 0.0002925735993450043, | |
| "loss": 0.1342, | |
| "step": 4984 | |
| }, | |
| { | |
| "epoch": 89.0, | |
| "eval_loss": 0.14841538667678833, | |
| "eval_runtime": 11.2913, | |
| "eval_samples_per_second": 961.799, | |
| "eval_steps_per_second": 1.948, | |
| "step": 4984 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "grad_norm": 0.18903715908527374, | |
| "learning_rate": 0.0002924435769528278, | |
| "loss": 0.1343, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "eval_loss": 0.14745239913463593, | |
| "eval_runtime": 12.07, | |
| "eval_samples_per_second": 899.752, | |
| "eval_steps_per_second": 1.823, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 91.0, | |
| "grad_norm": 0.1610485017299652, | |
| "learning_rate": 0.00029231214063029666, | |
| "loss": 0.1336, | |
| "step": 5096 | |
| }, | |
| { | |
| "epoch": 91.0, | |
| "eval_loss": 0.1469384878873825, | |
| "eval_runtime": 12.1199, | |
| "eval_samples_per_second": 896.05, | |
| "eval_steps_per_second": 1.815, | |
| "step": 5096 | |
| }, | |
| { | |
| "epoch": 92.0, | |
| "grad_norm": 0.20112423598766327, | |
| "learning_rate": 0.00029217929167463404, | |
| "loss": 0.1337, | |
| "step": 5152 | |
| }, | |
| { | |
| "epoch": 92.0, | |
| "eval_loss": 0.14764182269573212, | |
| "eval_runtime": 10.2692, | |
| "eval_samples_per_second": 1057.536, | |
| "eval_steps_per_second": 2.142, | |
| "step": 5152 | |
| }, | |
| { | |
| "epoch": 93.0, | |
| "grad_norm": 0.28488588333129883, | |
| "learning_rate": 0.00029204503139700625, | |
| "loss": 0.1335, | |
| "step": 5208 | |
| }, | |
| { | |
| "epoch": 93.0, | |
| "eval_loss": 0.1479685753583908, | |
| "eval_runtime": 11.6849, | |
| "eval_samples_per_second": 929.407, | |
| "eval_steps_per_second": 1.883, | |
| "step": 5208 | |
| }, | |
| { | |
| "epoch": 94.0, | |
| "grad_norm": 0.2028261125087738, | |
| "learning_rate": 0.0002919093611225077, | |
| "loss": 0.1333, | |
| "step": 5264 | |
| }, | |
| { | |
| "epoch": 94.0, | |
| "eval_loss": 0.14725789427757263, | |
| "eval_runtime": 11.2025, | |
| "eval_samples_per_second": 969.429, | |
| "eval_steps_per_second": 1.964, | |
| "step": 5264 | |
| }, | |
| { | |
| "epoch": 95.0, | |
| "grad_norm": 0.20275919139385223, | |
| "learning_rate": 0.0002917722821901492, | |
| "loss": 0.1334, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 95.0, | |
| "eval_loss": 0.14767614006996155, | |
| "eval_runtime": 10.8005, | |
| "eval_samples_per_second": 1005.513, | |
| "eval_steps_per_second": 2.037, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 96.0, | |
| "grad_norm": 0.2053348869085312, | |
| "learning_rate": 0.0002916337959528444, | |
| "loss": 0.1325, | |
| "step": 5376 | |
| }, | |
| { | |
| "epoch": 96.0, | |
| "eval_loss": 0.14707864820957184, | |
| "eval_runtime": 11.1238, | |
| "eval_samples_per_second": 976.287, | |
| "eval_steps_per_second": 1.978, | |
| "step": 5376 | |
| }, | |
| { | |
| "epoch": 97.0, | |
| "grad_norm": 0.23510950803756714, | |
| "learning_rate": 0.0002914939037773966, | |
| "loss": 0.1321, | |
| "step": 5432 | |
| }, | |
| { | |
| "epoch": 97.0, | |
| "eval_loss": 0.1476944088935852, | |
| "eval_runtime": 10.9362, | |
| "eval_samples_per_second": 993.028, | |
| "eval_steps_per_second": 2.012, | |
| "step": 5432 | |
| }, | |
| { | |
| "epoch": 98.0, | |
| "grad_norm": 0.2703108787536621, | |
| "learning_rate": 0.000291352607044485, | |
| "loss": 0.1327, | |
| "step": 5488 | |
| }, | |
| { | |
| "epoch": 98.0, | |
| "eval_loss": 0.1466565579175949, | |
| "eval_runtime": 10.8189, | |
| "eval_samples_per_second": 1003.802, | |
| "eval_steps_per_second": 2.033, | |
| "step": 5488 | |
| }, | |
| { | |
| "epoch": 99.0, | |
| "grad_norm": 0.22386641800403595, | |
| "learning_rate": 0.0002912099071486513, | |
| "loss": 0.1318, | |
| "step": 5544 | |
| }, | |
| { | |
| "epoch": 99.0, | |
| "eval_loss": 0.1469065397977829, | |
| "eval_runtime": 10.9677, | |
| "eval_samples_per_second": 990.181, | |
| "eval_steps_per_second": 2.006, | |
| "step": 5544 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "grad_norm": 0.18684013187885284, | |
| "learning_rate": 0.0002910658054982861, | |
| "loss": 0.1319, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "eval_loss": 0.1462097316980362, | |
| "eval_runtime": 11.5801, | |
| "eval_samples_per_second": 937.82, | |
| "eval_steps_per_second": 1.9, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 101.0, | |
| "grad_norm": 0.1831580400466919, | |
| "learning_rate": 0.00029092030351561435, | |
| "loss": 0.1318, | |
| "step": 5656 | |
| }, | |
| { | |
| "epoch": 101.0, | |
| "eval_loss": 0.1467864215373993, | |
| "eval_runtime": 11.2551, | |
| "eval_samples_per_second": 964.899, | |
| "eval_steps_per_second": 1.955, | |
| "step": 5656 | |
| }, | |
| { | |
| "epoch": 102.0, | |
| "grad_norm": 0.20423631370067596, | |
| "learning_rate": 0.00029077340263668184, | |
| "loss": 0.1315, | |
| "step": 5712 | |
| }, | |
| { | |
| "epoch": 102.0, | |
| "eval_loss": 0.1470629870891571, | |
| "eval_runtime": 10.0185, | |
| "eval_samples_per_second": 1083.994, | |
| "eval_steps_per_second": 2.196, | |
| "step": 5712 | |
| }, | |
| { | |
| "epoch": 103.0, | |
| "grad_norm": 0.20669810473918915, | |
| "learning_rate": 0.0002906251043113414, | |
| "loss": 0.1312, | |
| "step": 5768 | |
| }, | |
| { | |
| "epoch": 103.0, | |
| "eval_loss": 0.14603030681610107, | |
| "eval_runtime": 11.5962, | |
| "eval_samples_per_second": 936.51, | |
| "eval_steps_per_second": 1.897, | |
| "step": 5768 | |
| }, | |
| { | |
| "epoch": 104.0, | |
| "grad_norm": 0.18566496670246124, | |
| "learning_rate": 0.0002904754100032369, | |
| "loss": 0.1308, | |
| "step": 5824 | |
| }, | |
| { | |
| "epoch": 104.0, | |
| "eval_loss": 0.146591916680336, | |
| "eval_runtime": 11.8139, | |
| "eval_samples_per_second": 919.255, | |
| "eval_steps_per_second": 1.862, | |
| "step": 5824 | |
| }, | |
| { | |
| "epoch": 105.0, | |
| "grad_norm": 0.32265496253967285, | |
| "learning_rate": 0.000290324321189791, | |
| "loss": 0.1311, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 105.0, | |
| "eval_loss": 0.1458718478679657, | |
| "eval_runtime": 11.9546, | |
| "eval_samples_per_second": 908.438, | |
| "eval_steps_per_second": 1.84, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 106.0, | |
| "grad_norm": 0.17987699806690216, | |
| "learning_rate": 0.00029017183936218906, | |
| "loss": 0.1302, | |
| "step": 5936 | |
| }, | |
| { | |
| "epoch": 106.0, | |
| "eval_loss": 0.1459737867116928, | |
| "eval_runtime": 12.1694, | |
| "eval_samples_per_second": 892.4, | |
| "eval_steps_per_second": 1.808, | |
| "step": 5936 | |
| }, | |
| { | |
| "epoch": 107.0, | |
| "grad_norm": 0.18314820528030396, | |
| "learning_rate": 0.0002900179660253659, | |
| "loss": 0.1303, | |
| "step": 5992 | |
| }, | |
| { | |
| "epoch": 107.0, | |
| "eval_loss": 0.14506617188453674, | |
| "eval_runtime": 11.0204, | |
| "eval_samples_per_second": 985.446, | |
| "eval_steps_per_second": 1.996, | |
| "step": 5992 | |
| }, | |
| { | |
| "epoch": 108.0, | |
| "grad_norm": 0.1967027485370636, | |
| "learning_rate": 0.00028986270269798893, | |
| "loss": 0.13, | |
| "step": 6048 | |
| }, | |
| { | |
| "epoch": 108.0, | |
| "eval_loss": 0.1448826640844345, | |
| "eval_runtime": 11.2115, | |
| "eval_samples_per_second": 968.651, | |
| "eval_steps_per_second": 1.962, | |
| "step": 6048 | |
| }, | |
| { | |
| "epoch": 109.0, | |
| "grad_norm": 0.17848514020442963, | |
| "learning_rate": 0.00028970605091244395, | |
| "loss": 0.13, | |
| "step": 6104 | |
| }, | |
| { | |
| "epoch": 109.0, | |
| "eval_loss": 0.14577716588974, | |
| "eval_runtime": 12.0159, | |
| "eval_samples_per_second": 903.806, | |
| "eval_steps_per_second": 1.831, | |
| "step": 6104 | |
| }, | |
| { | |
| "epoch": 110.0, | |
| "grad_norm": 0.1681281179189682, | |
| "learning_rate": 0.00028954801221482137, | |
| "loss": 0.13, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 110.0, | |
| "eval_loss": 0.1459922343492508, | |
| "eval_runtime": 11.657, | |
| "eval_samples_per_second": 931.628, | |
| "eval_steps_per_second": 1.887, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 111.0, | |
| "grad_norm": 0.19543369114398956, | |
| "learning_rate": 0.00028938858816489945, | |
| "loss": 0.1294, | |
| "step": 6216 | |
| }, | |
| { | |
| "epoch": 111.0, | |
| "eval_loss": 0.14557458460330963, | |
| "eval_runtime": 11.502, | |
| "eval_samples_per_second": 944.183, | |
| "eval_steps_per_second": 1.913, | |
| "step": 6216 | |
| }, | |
| { | |
| "epoch": 112.0, | |
| "grad_norm": 0.19514279067516327, | |
| "learning_rate": 0.0002892277803361288, | |
| "loss": 0.1294, | |
| "step": 6272 | |
| }, | |
| { | |
| "epoch": 112.0, | |
| "eval_loss": 0.14542081952095032, | |
| "eval_runtime": 11.3675, | |
| "eval_samples_per_second": 955.353, | |
| "eval_steps_per_second": 1.935, | |
| "step": 6272 | |
| }, | |
| { | |
| "epoch": 113.0, | |
| "grad_norm": 0.19245897233486176, | |
| "learning_rate": 0.00028906559031561803, | |
| "loss": 0.1294, | |
| "step": 6328 | |
| }, | |
| { | |
| "epoch": 113.0, | |
| "eval_loss": 0.14575673639774323, | |
| "eval_runtime": 12.0854, | |
| "eval_samples_per_second": 898.603, | |
| "eval_steps_per_second": 1.82, | |
| "step": 6328 | |
| }, | |
| { | |
| "epoch": 114.0, | |
| "grad_norm": 0.2559398412704468, | |
| "learning_rate": 0.0002889020197041172, | |
| "loss": 0.129, | |
| "step": 6384 | |
| }, | |
| { | |
| "epoch": 114.0, | |
| "eval_loss": 0.14476452767848969, | |
| "eval_runtime": 11.4747, | |
| "eval_samples_per_second": 946.432, | |
| "eval_steps_per_second": 1.917, | |
| "step": 6384 | |
| }, | |
| { | |
| "epoch": 115.0, | |
| "grad_norm": 0.1581374853849411, | |
| "learning_rate": 0.0002887370701160019, | |
| "loss": 0.129, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 115.0, | |
| "eval_loss": 0.14649543166160583, | |
| "eval_runtime": 11.7792, | |
| "eval_samples_per_second": 921.961, | |
| "eval_steps_per_second": 1.868, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 116.0, | |
| "grad_norm": 0.17189738154411316, | |
| "learning_rate": 0.0002885707431792581, | |
| "loss": 0.1282, | |
| "step": 6496 | |
| }, | |
| { | |
| "epoch": 116.0, | |
| "eval_loss": 0.14660660922527313, | |
| "eval_runtime": 11.9186, | |
| "eval_samples_per_second": 911.183, | |
| "eval_steps_per_second": 1.846, | |
| "step": 6496 | |
| }, | |
| { | |
| "epoch": 117.0, | |
| "grad_norm": 0.2357121855020523, | |
| "learning_rate": 0.0002884030405354656, | |
| "loss": 0.129, | |
| "step": 6552 | |
| }, | |
| { | |
| "epoch": 117.0, | |
| "eval_loss": 0.146439790725708, | |
| "eval_runtime": 11.5156, | |
| "eval_samples_per_second": 943.071, | |
| "eval_steps_per_second": 1.91, | |
| "step": 6552 | |
| }, | |
| { | |
| "epoch": 118.0, | |
| "grad_norm": 0.1968863159418106, | |
| "learning_rate": 0.00028823396383978163, | |
| "loss": 0.1279, | |
| "step": 6608 | |
| }, | |
| { | |
| "epoch": 118.0, | |
| "eval_loss": 0.1450948715209961, | |
| "eval_runtime": 11.6204, | |
| "eval_samples_per_second": 934.567, | |
| "eval_steps_per_second": 1.893, | |
| "step": 6608 | |
| }, | |
| { | |
| "epoch": 119.0, | |
| "grad_norm": 0.16850939393043518, | |
| "learning_rate": 0.0002880635147609254, | |
| "loss": 0.1279, | |
| "step": 6664 | |
| }, | |
| { | |
| "epoch": 119.0, | |
| "eval_loss": 0.1456771343946457, | |
| "eval_runtime": 11.4295, | |
| "eval_samples_per_second": 950.17, | |
| "eval_steps_per_second": 1.925, | |
| "step": 6664 | |
| }, | |
| { | |
| "epoch": 120.0, | |
| "grad_norm": 0.20816339552402496, | |
| "learning_rate": 0.0002878916949811601, | |
| "loss": 0.1277, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 120.0, | |
| "eval_loss": 0.1461264193058014, | |
| "eval_runtime": 11.9161, | |
| "eval_samples_per_second": 911.372, | |
| "eval_steps_per_second": 1.846, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 121.0, | |
| "grad_norm": 0.19195137917995453, | |
| "learning_rate": 0.0002877185061962775, | |
| "loss": 0.1279, | |
| "step": 6776 | |
| }, | |
| { | |
| "epoch": 121.0, | |
| "eval_loss": 0.14506319165229797, | |
| "eval_runtime": 10.7769, | |
| "eval_samples_per_second": 1007.715, | |
| "eval_steps_per_second": 2.041, | |
| "step": 6776 | |
| }, | |
| { | |
| "epoch": 122.0, | |
| "grad_norm": 0.1636265516281128, | |
| "learning_rate": 0.0002875439501155812, | |
| "loss": 0.1277, | |
| "step": 6832 | |
| }, | |
| { | |
| "epoch": 122.0, | |
| "eval_loss": 0.1454634666442871, | |
| "eval_runtime": 11.7121, | |
| "eval_samples_per_second": 927.245, | |
| "eval_steps_per_second": 1.878, | |
| "step": 6832 | |
| }, | |
| { | |
| "epoch": 123.0, | |
| "grad_norm": 0.17660963535308838, | |
| "learning_rate": 0.00028736802846186907, | |
| "loss": 0.1273, | |
| "step": 6888 | |
| }, | |
| { | |
| "epoch": 123.0, | |
| "eval_loss": 0.1449379324913025, | |
| "eval_runtime": 12.0977, | |
| "eval_samples_per_second": 897.695, | |
| "eval_steps_per_second": 1.819, | |
| "step": 6888 | |
| }, | |
| { | |
| "epoch": 124.0, | |
| "grad_norm": 0.20895443856716156, | |
| "learning_rate": 0.00028719074297141686, | |
| "loss": 0.127, | |
| "step": 6944 | |
| }, | |
| { | |
| "epoch": 124.0, | |
| "eval_loss": 0.14427852630615234, | |
| "eval_runtime": 11.8774, | |
| "eval_samples_per_second": 914.341, | |
| "eval_steps_per_second": 1.852, | |
| "step": 6944 | |
| }, | |
| { | |
| "epoch": 125.0, | |
| "grad_norm": 0.1895224153995514, | |
| "learning_rate": 0.0002870120953939609, | |
| "loss": 0.1269, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 125.0, | |
| "eval_loss": 0.1446518748998642, | |
| "eval_runtime": 11.7658, | |
| "eval_samples_per_second": 923.015, | |
| "eval_steps_per_second": 1.87, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 126.0, | |
| "grad_norm": 0.191587895154953, | |
| "learning_rate": 0.0002868320874926807, | |
| "loss": 0.1269, | |
| "step": 7056 | |
| }, | |
| { | |
| "epoch": 126.0, | |
| "eval_loss": 0.14533261954784393, | |
| "eval_runtime": 11.2533, | |
| "eval_samples_per_second": 965.053, | |
| "eval_steps_per_second": 1.955, | |
| "step": 7056 | |
| }, | |
| { | |
| "epoch": 127.0, | |
| "grad_norm": 0.20511987805366516, | |
| "learning_rate": 0.00028665072104418107, | |
| "loss": 0.1263, | |
| "step": 7112 | |
| }, | |
| { | |
| "epoch": 127.0, | |
| "eval_loss": 0.1444355994462967, | |
| "eval_runtime": 11.3297, | |
| "eval_samples_per_second": 958.545, | |
| "eval_steps_per_second": 1.942, | |
| "step": 7112 | |
| }, | |
| { | |
| "epoch": 128.0, | |
| "grad_norm": 0.19347704946994781, | |
| "learning_rate": 0.0002864679978384761, | |
| "loss": 0.1266, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 128.0, | |
| "eval_loss": 0.14528335630893707, | |
| "eval_runtime": 11.7467, | |
| "eval_samples_per_second": 924.517, | |
| "eval_steps_per_second": 1.873, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 129.0, | |
| "grad_norm": 0.1948786824941635, | |
| "learning_rate": 0.00028628391967896994, | |
| "loss": 0.1267, | |
| "step": 7224 | |
| }, | |
| { | |
| "epoch": 129.0, | |
| "eval_loss": 0.1452852487564087, | |
| "eval_runtime": 10.7249, | |
| "eval_samples_per_second": 1012.6, | |
| "eval_steps_per_second": 2.051, | |
| "step": 7224 | |
| }, | |
| { | |
| "epoch": 130.0, | |
| "grad_norm": 0.2143562138080597, | |
| "learning_rate": 0.00028609848838243983, | |
| "loss": 0.1263, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 130.0, | |
| "eval_loss": 0.14422422647476196, | |
| "eval_runtime": 12.1111, | |
| "eval_samples_per_second": 896.699, | |
| "eval_steps_per_second": 1.817, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 131.0, | |
| "grad_norm": 0.17198456823825836, | |
| "learning_rate": 0.0002859117057790177, | |
| "loss": 0.1258, | |
| "step": 7336 | |
| }, | |
| { | |
| "epoch": 131.0, | |
| "eval_loss": 0.14419187605381012, | |
| "eval_runtime": 11.2161, | |
| "eval_samples_per_second": 968.25, | |
| "eval_steps_per_second": 1.961, | |
| "step": 7336 | |
| }, | |
| { | |
| "epoch": 132.0, | |
| "grad_norm": 0.2027718871831894, | |
| "learning_rate": 0.0002857235737121728, | |
| "loss": 0.1257, | |
| "step": 7392 | |
| }, | |
| { | |
| "epoch": 132.0, | |
| "eval_loss": 0.14398382604122162, | |
| "eval_runtime": 11.7549, | |
| "eval_samples_per_second": 923.871, | |
| "eval_steps_per_second": 1.872, | |
| "step": 7392 | |
| }, | |
| { | |
| "epoch": 133.0, | |
| "grad_norm": 0.18598471581935883, | |
| "learning_rate": 0.00028553409403869214, | |
| "loss": 0.1256, | |
| "step": 7448 | |
| }, | |
| { | |
| "epoch": 133.0, | |
| "eval_loss": 0.144750714302063, | |
| "eval_runtime": 10.9992, | |
| "eval_samples_per_second": 987.344, | |
| "eval_steps_per_second": 2.0, | |
| "step": 7448 | |
| }, | |
| { | |
| "epoch": 134.0, | |
| "grad_norm": 0.18290792405605316, | |
| "learning_rate": 0.0002853432686286638, | |
| "loss": 0.1255, | |
| "step": 7504 | |
| }, | |
| { | |
| "epoch": 134.0, | |
| "eval_loss": 0.14384572207927704, | |
| "eval_runtime": 11.23, | |
| "eval_samples_per_second": 967.05, | |
| "eval_steps_per_second": 1.959, | |
| "step": 7504 | |
| }, | |
| { | |
| "epoch": 135.0, | |
| "grad_norm": 0.22160011529922485, | |
| "learning_rate": 0.0002851510993654578, | |
| "loss": 0.1254, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 135.0, | |
| "eval_loss": 0.1437937319278717, | |
| "eval_runtime": 11.9673, | |
| "eval_samples_per_second": 907.472, | |
| "eval_steps_per_second": 1.838, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 136.0, | |
| "grad_norm": 0.18182989954948425, | |
| "learning_rate": 0.0002849575881457068, | |
| "loss": 0.1252, | |
| "step": 7616 | |
| }, | |
| { | |
| "epoch": 136.0, | |
| "eval_loss": 0.14378975331783295, | |
| "eval_runtime": 11.8117, | |
| "eval_samples_per_second": 919.426, | |
| "eval_steps_per_second": 1.863, | |
| "step": 7616 | |
| }, | |
| { | |
| "epoch": 137.0, | |
| "grad_norm": 0.16500607132911682, | |
| "learning_rate": 0.0002847627368792885, | |
| "loss": 0.125, | |
| "step": 7672 | |
| }, | |
| { | |
| "epoch": 137.0, | |
| "eval_loss": 0.1436585932970047, | |
| "eval_runtime": 12.4256, | |
| "eval_samples_per_second": 874.0, | |
| "eval_steps_per_second": 1.771, | |
| "step": 7672 | |
| }, | |
| { | |
| "epoch": 138.0, | |
| "grad_norm": 0.22664882242679596, | |
| "learning_rate": 0.0002845665474893062, | |
| "loss": 0.125, | |
| "step": 7728 | |
| }, | |
| { | |
| "epoch": 138.0, | |
| "eval_loss": 0.14313535392284393, | |
| "eval_runtime": 12.1895, | |
| "eval_samples_per_second": 890.932, | |
| "eval_steps_per_second": 1.805, | |
| "step": 7728 | |
| }, | |
| { | |
| "epoch": 139.0, | |
| "grad_norm": 0.1606769859790802, | |
| "learning_rate": 0.0002843690219120703, | |
| "loss": 0.1242, | |
| "step": 7784 | |
| }, | |
| { | |
| "epoch": 139.0, | |
| "eval_loss": 0.14361213147640228, | |
| "eval_runtime": 12.1036, | |
| "eval_samples_per_second": 897.251, | |
| "eval_steps_per_second": 1.818, | |
| "step": 7784 | |
| }, | |
| { | |
| "epoch": 140.0, | |
| "grad_norm": 0.20197436213493347, | |
| "learning_rate": 0.0002841701620970783, | |
| "loss": 0.1244, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 140.0, | |
| "eval_loss": 0.142960324883461, | |
| "eval_runtime": 11.6316, | |
| "eval_samples_per_second": 933.665, | |
| "eval_steps_per_second": 1.891, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 141.0, | |
| "grad_norm": 0.18616272509098053, | |
| "learning_rate": 0.000283969970006996, | |
| "loss": 0.1243, | |
| "step": 7896 | |
| }, | |
| { | |
| "epoch": 141.0, | |
| "eval_loss": 0.1441134661436081, | |
| "eval_runtime": 11.589, | |
| "eval_samples_per_second": 937.094, | |
| "eval_steps_per_second": 1.898, | |
| "step": 7896 | |
| }, | |
| { | |
| "epoch": 142.0, | |
| "grad_norm": 0.20340923964977264, | |
| "learning_rate": 0.0002837684476176391, | |
| "loss": 0.1239, | |
| "step": 7952 | |
| }, | |
| { | |
| "epoch": 142.0, | |
| "eval_loss": 0.1434699296951294, | |
| "eval_runtime": 12.3235, | |
| "eval_samples_per_second": 881.241, | |
| "eval_steps_per_second": 1.785, | |
| "step": 7952 | |
| }, | |
| { | |
| "epoch": 143.0, | |
| "grad_norm": 0.18145394325256348, | |
| "learning_rate": 0.0002835655969179518, | |
| "loss": 0.1241, | |
| "step": 8008 | |
| }, | |
| { | |
| "epoch": 143.0, | |
| "eval_loss": 0.14338643848896027, | |
| "eval_runtime": 12.3449, | |
| "eval_samples_per_second": 879.717, | |
| "eval_steps_per_second": 1.782, | |
| "step": 8008 | |
| }, | |
| { | |
| "epoch": 144.0, | |
| "grad_norm": 0.1755165159702301, | |
| "learning_rate": 0.0002833614199099885, | |
| "loss": 0.1241, | |
| "step": 8064 | |
| }, | |
| { | |
| "epoch": 144.0, | |
| "eval_loss": 0.14308682084083557, | |
| "eval_runtime": 12.0765, | |
| "eval_samples_per_second": 899.268, | |
| "eval_steps_per_second": 1.822, | |
| "step": 8064 | |
| }, | |
| { | |
| "epoch": 145.0, | |
| "grad_norm": 0.18520286679267883, | |
| "learning_rate": 0.00028315591860889397, | |
| "loss": 0.1238, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 145.0, | |
| "eval_loss": 0.14301612973213196, | |
| "eval_runtime": 11.4026, | |
| "eval_samples_per_second": 952.414, | |
| "eval_steps_per_second": 1.929, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 146.0, | |
| "grad_norm": 0.2836858630180359, | |
| "learning_rate": 0.0002829490950428833, | |
| "loss": 0.1237, | |
| "step": 8176 | |
| }, | |
| { | |
| "epoch": 146.0, | |
| "eval_loss": 0.1432274430990219, | |
| "eval_runtime": 10.5295, | |
| "eval_samples_per_second": 1031.389, | |
| "eval_steps_per_second": 2.089, | |
| "step": 8176 | |
| }, | |
| { | |
| "epoch": 147.0, | |
| "grad_norm": 0.18382933735847473, | |
| "learning_rate": 0.0002827409512532215, | |
| "loss": 0.1233, | |
| "step": 8232 | |
| }, | |
| { | |
| "epoch": 147.0, | |
| "eval_loss": 0.14315703511238098, | |
| "eval_runtime": 11.7841, | |
| "eval_samples_per_second": 921.584, | |
| "eval_steps_per_second": 1.867, | |
| "step": 8232 | |
| }, | |
| { | |
| "epoch": 148.0, | |
| "grad_norm": 0.16152502596378326, | |
| "learning_rate": 0.00028253148929420393, | |
| "loss": 0.1236, | |
| "step": 8288 | |
| }, | |
| { | |
| "epoch": 148.0, | |
| "eval_loss": 0.14190851151943207, | |
| "eval_runtime": 12.2311, | |
| "eval_samples_per_second": 887.903, | |
| "eval_steps_per_second": 1.799, | |
| "step": 8288 | |
| }, | |
| { | |
| "epoch": 149.0, | |
| "grad_norm": 0.23382407426834106, | |
| "learning_rate": 0.0002823207112331354, | |
| "loss": 0.1232, | |
| "step": 8344 | |
| }, | |
| { | |
| "epoch": 149.0, | |
| "eval_loss": 0.14270788431167603, | |
| "eval_runtime": 12.109, | |
| "eval_samples_per_second": 896.855, | |
| "eval_steps_per_second": 1.817, | |
| "step": 8344 | |
| }, | |
| { | |
| "epoch": 150.0, | |
| "grad_norm": 0.1615588366985321, | |
| "learning_rate": 0.00028210861915030973, | |
| "loss": 0.1232, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 150.0, | |
| "eval_loss": 0.14285807311534882, | |
| "eval_runtime": 12.5884, | |
| "eval_samples_per_second": 862.702, | |
| "eval_steps_per_second": 1.748, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 151.0, | |
| "grad_norm": 0.2795417308807373, | |
| "learning_rate": 0.0002818952151389907, | |
| "loss": 0.1227, | |
| "step": 8456 | |
| }, | |
| { | |
| "epoch": 151.0, | |
| "eval_loss": 0.14255040884017944, | |
| "eval_runtime": 12.5025, | |
| "eval_samples_per_second": 868.624, | |
| "eval_steps_per_second": 1.76, | |
| "step": 8456 | |
| }, | |
| { | |
| "epoch": 152.0, | |
| "grad_norm": 0.2292180061340332, | |
| "learning_rate": 0.00028168050130538953, | |
| "loss": 0.1231, | |
| "step": 8512 | |
| }, | |
| { | |
| "epoch": 152.0, | |
| "eval_loss": 0.14337477087974548, | |
| "eval_runtime": 12.1529, | |
| "eval_samples_per_second": 893.611, | |
| "eval_steps_per_second": 1.81, | |
| "step": 8512 | |
| }, | |
| { | |
| "epoch": 153.0, | |
| "grad_norm": 0.17736776173114777, | |
| "learning_rate": 0.00028146447976864553, | |
| "loss": 0.1224, | |
| "step": 8568 | |
| }, | |
| { | |
| "epoch": 153.0, | |
| "eval_loss": 0.14352336525917053, | |
| "eval_runtime": 12.3539, | |
| "eval_samples_per_second": 879.073, | |
| "eval_steps_per_second": 1.781, | |
| "step": 8568 | |
| }, | |
| { | |
| "epoch": 154.0, | |
| "grad_norm": 0.36273321509361267, | |
| "learning_rate": 0.0002812471526608039, | |
| "loss": 0.1227, | |
| "step": 8624 | |
| }, | |
| { | |
| "epoch": 154.0, | |
| "eval_loss": 0.142772376537323, | |
| "eval_runtime": 12.0892, | |
| "eval_samples_per_second": 898.323, | |
| "eval_steps_per_second": 1.82, | |
| "step": 8624 | |
| }, | |
| { | |
| "epoch": 155.0, | |
| "grad_norm": 0.19883078336715698, | |
| "learning_rate": 0.00028102852212679526, | |
| "loss": 0.1228, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 155.0, | |
| "eval_loss": 0.14210332930088043, | |
| "eval_runtime": 12.2389, | |
| "eval_samples_per_second": 887.336, | |
| "eval_steps_per_second": 1.798, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 156.0, | |
| "grad_norm": 0.2114337682723999, | |
| "learning_rate": 0.00028080859032441463, | |
| "loss": 0.1223, | |
| "step": 8736 | |
| }, | |
| { | |
| "epoch": 156.0, | |
| "eval_loss": 0.14258325099945068, | |
| "eval_runtime": 12.5038, | |
| "eval_samples_per_second": 868.534, | |
| "eval_steps_per_second": 1.759, | |
| "step": 8736 | |
| }, | |
| { | |
| "epoch": 157.0, | |
| "grad_norm": 0.193147674202919, | |
| "learning_rate": 0.0002805873594243001, | |
| "loss": 0.1223, | |
| "step": 8792 | |
| }, | |
| { | |
| "epoch": 157.0, | |
| "eval_loss": 0.1423390656709671, | |
| "eval_runtime": 11.2533, | |
| "eval_samples_per_second": 965.047, | |
| "eval_steps_per_second": 1.955, | |
| "step": 8792 | |
| }, | |
| { | |
| "epoch": 158.0, | |
| "grad_norm": 0.15751470625400543, | |
| "learning_rate": 0.0002803648316099116, | |
| "loss": 0.1222, | |
| "step": 8848 | |
| }, | |
| { | |
| "epoch": 158.0, | |
| "eval_loss": 0.1417943835258484, | |
| "eval_runtime": 11.5797, | |
| "eval_samples_per_second": 937.847, | |
| "eval_steps_per_second": 1.9, | |
| "step": 8848 | |
| }, | |
| { | |
| "epoch": 159.0, | |
| "grad_norm": 0.27395108342170715, | |
| "learning_rate": 0.00028014100907750874, | |
| "loss": 0.1219, | |
| "step": 8904 | |
| }, | |
| { | |
| "epoch": 159.0, | |
| "eval_loss": 0.14257293939590454, | |
| "eval_runtime": 12.328, | |
| "eval_samples_per_second": 880.923, | |
| "eval_steps_per_second": 1.785, | |
| "step": 8904 | |
| }, | |
| { | |
| "epoch": 160.0, | |
| "grad_norm": 0.22418324649333954, | |
| "learning_rate": 0.0002799158940361295, | |
| "loss": 0.1217, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 160.0, | |
| "eval_loss": 0.1431107521057129, | |
| "eval_runtime": 12.2423, | |
| "eval_samples_per_second": 887.09, | |
| "eval_steps_per_second": 1.797, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 161.0, | |
| "grad_norm": 0.2003849744796753, | |
| "learning_rate": 0.0002796894887075685, | |
| "loss": 0.1218, | |
| "step": 9016 | |
| }, | |
| { | |
| "epoch": 161.0, | |
| "eval_loss": 0.14198802411556244, | |
| "eval_runtime": 11.4923, | |
| "eval_samples_per_second": 944.981, | |
| "eval_steps_per_second": 1.914, | |
| "step": 9016 | |
| }, | |
| { | |
| "epoch": 162.0, | |
| "grad_norm": 0.21222490072250366, | |
| "learning_rate": 0.00027946179532635447, | |
| "loss": 0.1215, | |
| "step": 9072 | |
| }, | |
| { | |
| "epoch": 162.0, | |
| "eval_loss": 0.14226287603378296, | |
| "eval_runtime": 12.6489, | |
| "eval_samples_per_second": 858.572, | |
| "eval_steps_per_second": 1.739, | |
| "step": 9072 | |
| }, | |
| { | |
| "epoch": 163.0, | |
| "grad_norm": 0.3284847140312195, | |
| "learning_rate": 0.0002792328161397301, | |
| "loss": 0.1214, | |
| "step": 9128 | |
| }, | |
| { | |
| "epoch": 163.0, | |
| "eval_loss": 0.14255832135677338, | |
| "eval_runtime": 11.8749, | |
| "eval_samples_per_second": 914.536, | |
| "eval_steps_per_second": 1.853, | |
| "step": 9128 | |
| }, | |
| { | |
| "epoch": 164.0, | |
| "grad_norm": 0.17873606085777283, | |
| "learning_rate": 0.0002790025534076267, | |
| "loss": 0.1209, | |
| "step": 9184 | |
| }, | |
| { | |
| "epoch": 164.0, | |
| "eval_loss": 0.14214134216308594, | |
| "eval_runtime": 11.7349, | |
| "eval_samples_per_second": 925.446, | |
| "eval_steps_per_second": 1.875, | |
| "step": 9184 | |
| }, | |
| { | |
| "epoch": 165.0, | |
| "grad_norm": 0.29637348651885986, | |
| "learning_rate": 0.00027877100940264476, | |
| "loss": 0.1214, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 165.0, | |
| "eval_loss": 0.14148862659931183, | |
| "eval_runtime": 11.2369, | |
| "eval_samples_per_second": 966.457, | |
| "eval_steps_per_second": 1.958, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 166.0, | |
| "grad_norm": 0.19445298612117767, | |
| "learning_rate": 0.0002785381864100304, | |
| "loss": 0.1211, | |
| "step": 9296 | |
| }, | |
| { | |
| "epoch": 166.0, | |
| "eval_loss": 0.14366163313388824, | |
| "eval_runtime": 11.7897, | |
| "eval_samples_per_second": 921.146, | |
| "eval_steps_per_second": 1.866, | |
| "step": 9296 | |
| }, | |
| { | |
| "epoch": 167.0, | |
| "grad_norm": 0.2037288248538971, | |
| "learning_rate": 0.0002783040867276523, | |
| "loss": 0.1209, | |
| "step": 9352 | |
| }, | |
| { | |
| "epoch": 167.0, | |
| "eval_loss": 0.14206562936306, | |
| "eval_runtime": 11.4292, | |
| "eval_samples_per_second": 950.199, | |
| "eval_steps_per_second": 1.925, | |
| "step": 9352 | |
| }, | |
| { | |
| "epoch": 168.0, | |
| "grad_norm": 0.21530179679393768, | |
| "learning_rate": 0.0002780687126659796, | |
| "loss": 0.1208, | |
| "step": 9408 | |
| }, | |
| { | |
| "epoch": 168.0, | |
| "eval_loss": 0.1410149782896042, | |
| "eval_runtime": 11.7288, | |
| "eval_samples_per_second": 925.923, | |
| "eval_steps_per_second": 1.876, | |
| "step": 9408 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 56000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1000, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 10, | |
| "early_stopping_threshold": 1e-05 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1001513367240704e+18, | |
| "train_batch_size": 512, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |