| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 313, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0032, |
| "grad_norm": 13.684800148010254, |
| "learning_rate": 0.0, |
| "loss": 2.3276, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 13.660787582397461, |
| "learning_rate": 4e-05, |
| "loss": 2.2792, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 13.35280704498291, |
| "learning_rate": 8e-05, |
| "loss": 2.4151, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0128, |
| "grad_norm": 6.15027379989624, |
| "learning_rate": 0.00012, |
| "loss": 1.7812, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 1.3168226480484009, |
| "learning_rate": 0.00016, |
| "loss": 1.4536, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0192, |
| "grad_norm": 0.9872580170631409, |
| "learning_rate": 0.0002, |
| "loss": 1.4171, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0224, |
| "grad_norm": 0.7496100664138794, |
| "learning_rate": 0.00019935064935064936, |
| "loss": 1.4168, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0256, |
| "grad_norm": 0.7376005053520203, |
| "learning_rate": 0.00019870129870129872, |
| "loss": 1.3659, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.0288, |
| "grad_norm": 0.5281137824058533, |
| "learning_rate": 0.00019805194805194807, |
| "loss": 1.2566, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.5485746264457703, |
| "learning_rate": 0.00019740259740259742, |
| "loss": 1.3761, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0352, |
| "grad_norm": 0.5506592392921448, |
| "learning_rate": 0.00019675324675324675, |
| "loss": 1.3327, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0384, |
| "grad_norm": 0.49382686614990234, |
| "learning_rate": 0.00019610389610389613, |
| "loss": 1.3727, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0416, |
| "grad_norm": 0.36203011870384216, |
| "learning_rate": 0.00019545454545454548, |
| "loss": 1.1515, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.0448, |
| "grad_norm": 0.3528599739074707, |
| "learning_rate": 0.0001948051948051948, |
| "loss": 1.2636, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 0.31244418025016785, |
| "learning_rate": 0.00019415584415584416, |
| "loss": 1.1873, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0512, |
| "grad_norm": 0.3379523754119873, |
| "learning_rate": 0.00019350649350649354, |
| "loss": 1.2657, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.0544, |
| "grad_norm": 0.3025083839893341, |
| "learning_rate": 0.00019285714285714286, |
| "loss": 1.2846, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.0576, |
| "grad_norm": 0.2560190260410309, |
| "learning_rate": 0.00019220779220779222, |
| "loss": 1.1587, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.0608, |
| "grad_norm": 0.2554129958152771, |
| "learning_rate": 0.00019155844155844157, |
| "loss": 1.2812, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 0.22662702202796936, |
| "learning_rate": 0.00019090909090909092, |
| "loss": 1.1664, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0672, |
| "grad_norm": 0.2515714168548584, |
| "learning_rate": 0.00019025974025974027, |
| "loss": 1.2177, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.0704, |
| "grad_norm": 0.24396637082099915, |
| "learning_rate": 0.00018961038961038963, |
| "loss": 1.2053, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.0736, |
| "grad_norm": 0.24488303065299988, |
| "learning_rate": 0.00018896103896103895, |
| "loss": 1.2074, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0768, |
| "grad_norm": 0.2168620079755783, |
| "learning_rate": 0.00018831168831168833, |
| "loss": 1.1284, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.24021224677562714, |
| "learning_rate": 0.00018766233766233769, |
| "loss": 1.2169, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0832, |
| "grad_norm": 0.20057056844234467, |
| "learning_rate": 0.000187012987012987, |
| "loss": 1.1031, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.0864, |
| "grad_norm": 0.19900795817375183, |
| "learning_rate": 0.00018636363636363636, |
| "loss": 1.1004, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0896, |
| "grad_norm": 0.2019268423318863, |
| "learning_rate": 0.00018571428571428572, |
| "loss": 1.1476, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.0928, |
| "grad_norm": 0.1996479034423828, |
| "learning_rate": 0.00018506493506493507, |
| "loss": 1.1455, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 0.25262022018432617, |
| "learning_rate": 0.00018441558441558442, |
| "loss": 1.1025, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0992, |
| "grad_norm": 0.225438192486763, |
| "learning_rate": 0.00018376623376623378, |
| "loss": 1.1954, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.1024, |
| "grad_norm": 0.17834505438804626, |
| "learning_rate": 0.00018311688311688313, |
| "loss": 1.0934, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.1056, |
| "grad_norm": 0.20071206986904144, |
| "learning_rate": 0.00018246753246753248, |
| "loss": 1.0488, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1088, |
| "grad_norm": 0.1920139640569687, |
| "learning_rate": 0.00018181818181818183, |
| "loss": 1.123, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 0.18714852631092072, |
| "learning_rate": 0.0001811688311688312, |
| "loss": 1.0798, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.1152, |
| "grad_norm": 0.18315713107585907, |
| "learning_rate": 0.00018051948051948054, |
| "loss": 1.1107, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.1184, |
| "grad_norm": 0.19156870245933533, |
| "learning_rate": 0.00017987012987012987, |
| "loss": 1.1125, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.1216, |
| "grad_norm": 0.21527768671512604, |
| "learning_rate": 0.00017922077922077922, |
| "loss": 1.1346, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.1248, |
| "grad_norm": 0.1871163249015808, |
| "learning_rate": 0.0001785714285714286, |
| "loss": 1.0742, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 0.17750784754753113, |
| "learning_rate": 0.00017792207792207792, |
| "loss": 1.1323, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1312, |
| "grad_norm": 0.177419051527977, |
| "learning_rate": 0.00017727272727272728, |
| "loss": 1.1405, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.1344, |
| "grad_norm": 0.16714292764663696, |
| "learning_rate": 0.00017662337662337663, |
| "loss": 1.1084, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.1376, |
| "grad_norm": 0.1610356718301773, |
| "learning_rate": 0.00017597402597402598, |
| "loss": 1.1125, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.1408, |
| "grad_norm": 0.2548656761646271, |
| "learning_rate": 0.00017532467532467534, |
| "loss": 1.1114, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 0.1731044203042984, |
| "learning_rate": 0.0001746753246753247, |
| "loss": 1.1197, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.1472, |
| "grad_norm": 0.1739533394575119, |
| "learning_rate": 0.00017402597402597401, |
| "loss": 1.1777, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.1504, |
| "grad_norm": 0.2178352177143097, |
| "learning_rate": 0.0001733766233766234, |
| "loss": 1.1111, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.1536, |
| "grad_norm": 0.17247150838375092, |
| "learning_rate": 0.00017272727272727275, |
| "loss": 1.1253, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.1568, |
| "grad_norm": 0.18075324594974518, |
| "learning_rate": 0.00017207792207792207, |
| "loss": 1.1358, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.15898071229457855, |
| "learning_rate": 0.00017142857142857143, |
| "loss": 1.0606, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1632, |
| "grad_norm": 0.16518613696098328, |
| "learning_rate": 0.0001707792207792208, |
| "loss": 1.0944, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.1664, |
| "grad_norm": 0.16035063564777374, |
| "learning_rate": 0.00017012987012987013, |
| "loss": 1.0554, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.1696, |
| "grad_norm": 0.1686483472585678, |
| "learning_rate": 0.00016948051948051948, |
| "loss": 1.0384, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.1728, |
| "grad_norm": 0.16575631499290466, |
| "learning_rate": 0.00016883116883116884, |
| "loss": 1.0243, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 0.16840039193630219, |
| "learning_rate": 0.0001681818181818182, |
| "loss": 1.117, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.1792, |
| "grad_norm": 0.17616064846515656, |
| "learning_rate": 0.00016753246753246754, |
| "loss": 1.0743, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.1824, |
| "grad_norm": 0.168218195438385, |
| "learning_rate": 0.0001668831168831169, |
| "loss": 1.0627, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.1856, |
| "grad_norm": 0.17026656866073608, |
| "learning_rate": 0.00016623376623376625, |
| "loss": 1.0059, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.1888, |
| "grad_norm": 0.16454458236694336, |
| "learning_rate": 0.0001655844155844156, |
| "loss": 0.9943, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 0.17185136675834656, |
| "learning_rate": 0.00016493506493506495, |
| "loss": 1.1545, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1952, |
| "grad_norm": 0.17822986841201782, |
| "learning_rate": 0.00016428571428571428, |
| "loss": 1.073, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.1984, |
| "grad_norm": 0.1676608771085739, |
| "learning_rate": 0.00016363636363636366, |
| "loss": 1.0886, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.2016, |
| "grad_norm": 0.1727771908044815, |
| "learning_rate": 0.000162987012987013, |
| "loss": 1.0432, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.2048, |
| "grad_norm": 0.17827573418617249, |
| "learning_rate": 0.00016233766233766234, |
| "loss": 1.083, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 0.19807517528533936, |
| "learning_rate": 0.0001616883116883117, |
| "loss": 1.1208, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.2112, |
| "grad_norm": 0.17693684995174408, |
| "learning_rate": 0.00016103896103896104, |
| "loss": 1.089, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.2144, |
| "grad_norm": 0.15489234030246735, |
| "learning_rate": 0.0001603896103896104, |
| "loss": 0.9707, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.2176, |
| "grad_norm": 0.16443990170955658, |
| "learning_rate": 0.00015974025974025975, |
| "loss": 1.0643, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.2208, |
| "grad_norm": 0.2051103413105011, |
| "learning_rate": 0.0001590909090909091, |
| "loss": 1.1246, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 0.18824075162410736, |
| "learning_rate": 0.00015844155844155845, |
| "loss": 1.0855, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2272, |
| "grad_norm": 0.18659448623657227, |
| "learning_rate": 0.0001577922077922078, |
| "loss": 1.1412, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.2304, |
| "grad_norm": 0.1854114979505539, |
| "learning_rate": 0.00015714285714285716, |
| "loss": 1.0249, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.2336, |
| "grad_norm": 0.1876193732023239, |
| "learning_rate": 0.00015649350649350649, |
| "loss": 1.1029, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.2368, |
| "grad_norm": 0.1888684630393982, |
| "learning_rate": 0.00015584415584415587, |
| "loss": 1.0789, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.20240606367588043, |
| "learning_rate": 0.0001551948051948052, |
| "loss": 1.0495, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.2432, |
| "grad_norm": 0.232120081782341, |
| "learning_rate": 0.00015454545454545454, |
| "loss": 1.0735, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.2464, |
| "grad_norm": 0.16897843778133392, |
| "learning_rate": 0.0001538961038961039, |
| "loss": 1.0164, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.2496, |
| "grad_norm": 0.18796634674072266, |
| "learning_rate": 0.00015324675324675325, |
| "loss": 1.0676, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.2528, |
| "grad_norm": 0.19574032723903656, |
| "learning_rate": 0.0001525974025974026, |
| "loss": 1.0456, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 0.18007811903953552, |
| "learning_rate": 0.00015194805194805196, |
| "loss": 1.0894, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2592, |
| "grad_norm": 0.18932929635047913, |
| "learning_rate": 0.0001512987012987013, |
| "loss": 1.0729, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.2624, |
| "grad_norm": 0.20614288747310638, |
| "learning_rate": 0.00015064935064935066, |
| "loss": 1.0854, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.2656, |
| "grad_norm": 0.19291089475154877, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 1.1217, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.2688, |
| "grad_norm": 0.18916529417037964, |
| "learning_rate": 0.00014935064935064934, |
| "loss": 1.0963, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 0.20306220650672913, |
| "learning_rate": 0.00014870129870129872, |
| "loss": 1.0898, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.2752, |
| "grad_norm": 0.17870067059993744, |
| "learning_rate": 0.00014805194805194807, |
| "loss": 1.0213, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.2784, |
| "grad_norm": 0.18411923944950104, |
| "learning_rate": 0.0001474025974025974, |
| "loss": 1.0844, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.2816, |
| "grad_norm": 0.18788227438926697, |
| "learning_rate": 0.00014675324675324675, |
| "loss": 1.0338, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.2848, |
| "grad_norm": 0.23874884843826294, |
| "learning_rate": 0.00014610389610389613, |
| "loss": 1.1118, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 0.19380499422550201, |
| "learning_rate": 0.00014545454545454546, |
| "loss": 1.0464, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2912, |
| "grad_norm": 0.18968750536441803, |
| "learning_rate": 0.0001448051948051948, |
| "loss": 1.0569, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.2944, |
| "grad_norm": 0.19545753300189972, |
| "learning_rate": 0.00014415584415584416, |
| "loss": 1.1225, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.2976, |
| "grad_norm": 0.19170494377613068, |
| "learning_rate": 0.00014350649350649352, |
| "loss": 1.0602, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.3008, |
| "grad_norm": 0.17953918874263763, |
| "learning_rate": 0.00014285714285714287, |
| "loss": 1.032, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 0.1822536289691925, |
| "learning_rate": 0.00014220779220779222, |
| "loss": 1.0559, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.3072, |
| "grad_norm": 0.18591298162937164, |
| "learning_rate": 0.00014155844155844155, |
| "loss": 1.031, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.3104, |
| "grad_norm": 0.2129002958536148, |
| "learning_rate": 0.00014090909090909093, |
| "loss": 1.1391, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.3136, |
| "grad_norm": 0.18386681377887726, |
| "learning_rate": 0.00014025974025974028, |
| "loss": 0.9919, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.3168, |
| "grad_norm": 0.18314239382743835, |
| "learning_rate": 0.0001396103896103896, |
| "loss": 1.0445, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.1999066174030304, |
| "learning_rate": 0.00013896103896103896, |
| "loss": 1.0538, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3232, |
| "grad_norm": 0.18741188943386078, |
| "learning_rate": 0.00013831168831168834, |
| "loss": 1.0722, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.3264, |
| "grad_norm": 0.19351010024547577, |
| "learning_rate": 0.00013766233766233766, |
| "loss": 1.0491, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.3296, |
| "grad_norm": 0.18859203159809113, |
| "learning_rate": 0.00013701298701298702, |
| "loss": 1.0593, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.3328, |
| "grad_norm": 0.1962767392396927, |
| "learning_rate": 0.00013636363636363637, |
| "loss": 1.1344, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 0.20819440484046936, |
| "learning_rate": 0.00013571428571428572, |
| "loss": 1.1137, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.3392, |
| "grad_norm": 0.19590184092521667, |
| "learning_rate": 0.00013506493506493507, |
| "loss": 1.0624, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.3424, |
| "grad_norm": 0.18631424009799957, |
| "learning_rate": 0.00013441558441558443, |
| "loss": 1.0587, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.3456, |
| "grad_norm": 0.19572143256664276, |
| "learning_rate": 0.00013376623376623375, |
| "loss": 1.0494, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.3488, |
| "grad_norm": 0.1910988837480545, |
| "learning_rate": 0.00013311688311688313, |
| "loss": 1.0481, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 0.19455869495868683, |
| "learning_rate": 0.00013246753246753249, |
| "loss": 1.029, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3552, |
| "grad_norm": 0.18669827282428741, |
| "learning_rate": 0.0001318181818181818, |
| "loss": 1.0513, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.3584, |
| "grad_norm": 0.17523664236068726, |
| "learning_rate": 0.0001311688311688312, |
| "loss": 1.0126, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.3616, |
| "grad_norm": 0.17929129302501678, |
| "learning_rate": 0.00013051948051948052, |
| "loss": 1.0717, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.3648, |
| "grad_norm": 0.19380168616771698, |
| "learning_rate": 0.00012987012987012987, |
| "loss": 1.0324, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 0.18090228736400604, |
| "learning_rate": 0.00012922077922077922, |
| "loss": 1.0515, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.3712, |
| "grad_norm": 0.2067340910434723, |
| "learning_rate": 0.00012857142857142858, |
| "loss": 1.0939, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.3744, |
| "grad_norm": 0.1880485862493515, |
| "learning_rate": 0.00012792207792207793, |
| "loss": 1.0986, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.3776, |
| "grad_norm": 0.182168647646904, |
| "learning_rate": 0.00012727272727272728, |
| "loss": 1.0109, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.3808, |
| "grad_norm": 0.20187129080295563, |
| "learning_rate": 0.00012662337662337663, |
| "loss": 1.0668, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 0.2082669734954834, |
| "learning_rate": 0.000125974025974026, |
| "loss": 1.054, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3872, |
| "grad_norm": 0.18294434249401093, |
| "learning_rate": 0.00012532467532467534, |
| "loss": 1.0397, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.3904, |
| "grad_norm": 0.20515067875385284, |
| "learning_rate": 0.00012467532467532467, |
| "loss": 1.1092, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.3936, |
| "grad_norm": 0.1758790761232376, |
| "learning_rate": 0.00012402597402597402, |
| "loss": 0.9755, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.3968, |
| "grad_norm": 0.2170792669057846, |
| "learning_rate": 0.0001233766233766234, |
| "loss": 1.0434, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.202157124876976, |
| "learning_rate": 0.00012272727272727272, |
| "loss": 1.1129, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.4032, |
| "grad_norm": 0.18556398153305054, |
| "learning_rate": 0.00012207792207792208, |
| "loss": 1.0665, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.4064, |
| "grad_norm": 0.20196087658405304, |
| "learning_rate": 0.00012142857142857143, |
| "loss": 1.1, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.4096, |
| "grad_norm": 0.1921566128730774, |
| "learning_rate": 0.0001207792207792208, |
| "loss": 1.0918, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.4128, |
| "grad_norm": 0.18866224586963654, |
| "learning_rate": 0.00012012987012987014, |
| "loss": 1.0014, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 0.207601398229599, |
| "learning_rate": 0.00011948051948051949, |
| "loss": 1.0726, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4192, |
| "grad_norm": 0.21592366695404053, |
| "learning_rate": 0.00011883116883116883, |
| "loss": 1.1379, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.4224, |
| "grad_norm": 0.2016124576330185, |
| "learning_rate": 0.0001181818181818182, |
| "loss": 1.1428, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.4256, |
| "grad_norm": 0.20478437840938568, |
| "learning_rate": 0.00011753246753246753, |
| "loss": 1.121, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.4288, |
| "grad_norm": 0.22730594873428345, |
| "learning_rate": 0.00011688311688311689, |
| "loss": 1.0319, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 0.22592711448669434, |
| "learning_rate": 0.00011623376623376625, |
| "loss": 1.1264, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.4352, |
| "grad_norm": 0.20035041868686676, |
| "learning_rate": 0.00011558441558441559, |
| "loss": 1.0686, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.4384, |
| "grad_norm": 0.20648567378520966, |
| "learning_rate": 0.00011493506493506494, |
| "loss": 1.0817, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.4416, |
| "grad_norm": 0.21222743391990662, |
| "learning_rate": 0.00011428571428571428, |
| "loss": 1.0678, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.4448, |
| "grad_norm": 0.2075391560792923, |
| "learning_rate": 0.00011363636363636365, |
| "loss": 1.0897, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 0.1964101791381836, |
| "learning_rate": 0.000112987012987013, |
| "loss": 1.0906, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4512, |
| "grad_norm": 0.22406511008739471, |
| "learning_rate": 0.00011233766233766234, |
| "loss": 1.0594, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.4544, |
| "grad_norm": 0.23787978291511536, |
| "learning_rate": 0.00011168831168831168, |
| "loss": 1.1053, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.4576, |
| "grad_norm": 0.21196185052394867, |
| "learning_rate": 0.00011103896103896105, |
| "loss": 1.0923, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.4608, |
| "grad_norm": 0.21042804419994354, |
| "learning_rate": 0.0001103896103896104, |
| "loss": 1.0381, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 0.2267436534166336, |
| "learning_rate": 0.00010974025974025974, |
| "loss": 1.0818, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.4672, |
| "grad_norm": 0.23742735385894775, |
| "learning_rate": 0.00010909090909090909, |
| "loss": 1.0872, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.4704, |
| "grad_norm": 0.17787213623523712, |
| "learning_rate": 0.00010844155844155846, |
| "loss": 1.03, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.4736, |
| "grad_norm": 0.22422832250595093, |
| "learning_rate": 0.0001077922077922078, |
| "loss": 1.0738, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.4768, |
| "grad_norm": 0.22946301102638245, |
| "learning_rate": 0.00010714285714285715, |
| "loss": 1.0274, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.2137996405363083, |
| "learning_rate": 0.00010649350649350649, |
| "loss": 1.0539, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4832, |
| "grad_norm": 0.1748756766319275, |
| "learning_rate": 0.00010584415584415586, |
| "loss": 1.0355, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.4864, |
| "grad_norm": 0.22275175154209137, |
| "learning_rate": 0.0001051948051948052, |
| "loss": 1.1696, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.4896, |
| "grad_norm": 0.20996077358722687, |
| "learning_rate": 0.00010454545454545455, |
| "loss": 1.0303, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.4928, |
| "grad_norm": 0.1945938766002655, |
| "learning_rate": 0.00010389610389610389, |
| "loss": 0.9747, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 0.1970377266407013, |
| "learning_rate": 0.00010324675324675325, |
| "loss": 1.0358, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.4992, |
| "grad_norm": 0.18814732134342194, |
| "learning_rate": 0.00010259740259740261, |
| "loss": 0.9612, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.5024, |
| "grad_norm": 0.2153233289718628, |
| "learning_rate": 0.00010194805194805195, |
| "loss": 1.0749, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.5056, |
| "grad_norm": 0.21788008511066437, |
| "learning_rate": 0.0001012987012987013, |
| "loss": 1.0883, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.5088, |
| "grad_norm": 0.214650496840477, |
| "learning_rate": 0.00010064935064935067, |
| "loss": 1.0539, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.512, |
| "grad_norm": 0.19312834739685059, |
| "learning_rate": 0.0001, |
| "loss": 1.0657, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5152, |
| "grad_norm": 0.19916598498821259, |
| "learning_rate": 9.935064935064936e-05, |
| "loss": 1.0478, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.5184, |
| "grad_norm": 0.2057606726884842, |
| "learning_rate": 9.870129870129871e-05, |
| "loss": 1.0094, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.5216, |
| "grad_norm": 0.22159607708454132, |
| "learning_rate": 9.805194805194806e-05, |
| "loss": 1.0952, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.5248, |
| "grad_norm": 0.18274275958538055, |
| "learning_rate": 9.74025974025974e-05, |
| "loss": 1.0065, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.528, |
| "grad_norm": 0.19835162162780762, |
| "learning_rate": 9.675324675324677e-05, |
| "loss": 1.0742, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5312, |
| "grad_norm": 0.2114904820919037, |
| "learning_rate": 9.610389610389611e-05, |
| "loss": 1.1109, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.5344, |
| "grad_norm": 0.21488523483276367, |
| "learning_rate": 9.545454545454546e-05, |
| "loss": 1.0465, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.5376, |
| "grad_norm": 0.19870303571224213, |
| "learning_rate": 9.480519480519481e-05, |
| "loss": 1.0318, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.5408, |
| "grad_norm": 0.20413029193878174, |
| "learning_rate": 9.415584415584417e-05, |
| "loss": 1.0817, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.544, |
| "grad_norm": 0.1847231239080429, |
| "learning_rate": 9.35064935064935e-05, |
| "loss": 1.0144, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5472, |
| "grad_norm": 0.2715964913368225, |
| "learning_rate": 9.285714285714286e-05, |
| "loss": 0.9832, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.5504, |
| "grad_norm": 0.2225002497434616, |
| "learning_rate": 9.220779220779221e-05, |
| "loss": 1.1051, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.5536, |
| "grad_norm": 0.22931510210037231, |
| "learning_rate": 9.155844155844156e-05, |
| "loss": 1.1042, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.5568, |
| "grad_norm": 0.21848627924919128, |
| "learning_rate": 9.090909090909092e-05, |
| "loss": 1.1151, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.19852259755134583, |
| "learning_rate": 9.025974025974027e-05, |
| "loss": 1.0889, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.5632, |
| "grad_norm": 0.2080363780260086, |
| "learning_rate": 8.961038961038961e-05, |
| "loss": 1.0777, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.5664, |
| "grad_norm": 0.22391024231910706, |
| "learning_rate": 8.896103896103896e-05, |
| "loss": 1.1092, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.5696, |
| "grad_norm": 0.21793846786022186, |
| "learning_rate": 8.831168831168831e-05, |
| "loss": 1.044, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.5728, |
| "grad_norm": 0.2009749859571457, |
| "learning_rate": 8.766233766233767e-05, |
| "loss": 1.0198, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.576, |
| "grad_norm": 0.19432318210601807, |
| "learning_rate": 8.701298701298701e-05, |
| "loss": 1.075, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5792, |
| "grad_norm": 0.18634547293186188, |
| "learning_rate": 8.636363636363637e-05, |
| "loss": 0.9964, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.5824, |
| "grad_norm": 0.1947103589773178, |
| "learning_rate": 8.571428571428571e-05, |
| "loss": 1.0025, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.5856, |
| "grad_norm": 0.23098671436309814, |
| "learning_rate": 8.506493506493507e-05, |
| "loss": 1.0562, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.5888, |
| "grad_norm": 0.19686414301395416, |
| "learning_rate": 8.441558441558442e-05, |
| "loss": 1.0285, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.592, |
| "grad_norm": 0.19852428138256073, |
| "learning_rate": 8.376623376623377e-05, |
| "loss": 1.0054, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.5952, |
| "grad_norm": 0.21483510732650757, |
| "learning_rate": 8.311688311688312e-05, |
| "loss": 1.108, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.5984, |
| "grad_norm": 0.23313644528388977, |
| "learning_rate": 8.246753246753248e-05, |
| "loss": 1.1383, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.6016, |
| "grad_norm": 0.21453145146369934, |
| "learning_rate": 8.181818181818183e-05, |
| "loss": 1.0911, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.6048, |
| "grad_norm": 0.20268195867538452, |
| "learning_rate": 8.116883116883117e-05, |
| "loss": 1.0145, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.608, |
| "grad_norm": 0.20576398074626923, |
| "learning_rate": 8.051948051948052e-05, |
| "loss": 1.0829, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6112, |
| "grad_norm": 0.21732626855373383, |
| "learning_rate": 7.987012987012987e-05, |
| "loss": 1.0152, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.6144, |
| "grad_norm": 0.22046895325183868, |
| "learning_rate": 7.922077922077923e-05, |
| "loss": 1.1311, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.6176, |
| "grad_norm": 0.19727715849876404, |
| "learning_rate": 7.857142857142858e-05, |
| "loss": 1.0364, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.6208, |
| "grad_norm": 0.20861488580703735, |
| "learning_rate": 7.792207792207793e-05, |
| "loss": 1.0435, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.624, |
| "grad_norm": 0.18545083701610565, |
| "learning_rate": 7.727272727272727e-05, |
| "loss": 1.0299, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6272, |
| "grad_norm": 0.19965052604675293, |
| "learning_rate": 7.662337662337662e-05, |
| "loss": 1.0511, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.6304, |
| "grad_norm": 0.23673909902572632, |
| "learning_rate": 7.597402597402598e-05, |
| "loss": 1.081, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.6336, |
| "grad_norm": 0.17583179473876953, |
| "learning_rate": 7.532467532467533e-05, |
| "loss": 0.9808, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.6368, |
| "grad_norm": 0.2129366099834442, |
| "learning_rate": 7.467532467532467e-05, |
| "loss": 1.0522, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.21679140627384186, |
| "learning_rate": 7.402597402597404e-05, |
| "loss": 1.0567, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6432, |
| "grad_norm": 0.2032000720500946, |
| "learning_rate": 7.337662337662338e-05, |
| "loss": 1.0466, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.6464, |
| "grad_norm": 0.1887970268726349, |
| "learning_rate": 7.272727272727273e-05, |
| "loss": 1.0329, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.6496, |
| "grad_norm": 0.21060192584991455, |
| "learning_rate": 7.207792207792208e-05, |
| "loss": 1.1021, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.6528, |
| "grad_norm": 0.21191425621509552, |
| "learning_rate": 7.142857142857143e-05, |
| "loss": 0.99, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.656, |
| "grad_norm": 0.1995989829301834, |
| "learning_rate": 7.077922077922077e-05, |
| "loss": 1.0526, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.6592, |
| "grad_norm": 0.1849513053894043, |
| "learning_rate": 7.012987012987014e-05, |
| "loss": 0.9998, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.6624, |
| "grad_norm": 0.1948779672384262, |
| "learning_rate": 6.948051948051948e-05, |
| "loss": 1.075, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.6656, |
| "grad_norm": 0.20374052226543427, |
| "learning_rate": 6.883116883116883e-05, |
| "loss": 1.0933, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.6688, |
| "grad_norm": 0.2102465033531189, |
| "learning_rate": 6.818181818181818e-05, |
| "loss": 1.1123, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.672, |
| "grad_norm": 0.21376173198223114, |
| "learning_rate": 6.753246753246754e-05, |
| "loss": 1.1233, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6752, |
| "grad_norm": 0.20934203267097473, |
| "learning_rate": 6.688311688311688e-05, |
| "loss": 1.1374, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.6784, |
| "grad_norm": 0.18604128062725067, |
| "learning_rate": 6.623376623376624e-05, |
| "loss": 1.0213, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.6816, |
| "grad_norm": 0.19644233584403992, |
| "learning_rate": 6.55844155844156e-05, |
| "loss": 1.0046, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.6848, |
| "grad_norm": 0.18479463458061218, |
| "learning_rate": 6.493506493506494e-05, |
| "loss": 0.9792, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.688, |
| "grad_norm": 0.1945149153470993, |
| "learning_rate": 6.428571428571429e-05, |
| "loss": 1.0584, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.6912, |
| "grad_norm": 0.2070147544145584, |
| "learning_rate": 6.363636363636364e-05, |
| "loss": 1.071, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.6944, |
| "grad_norm": 0.19645985960960388, |
| "learning_rate": 6.2987012987013e-05, |
| "loss": 1.0721, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.6976, |
| "grad_norm": 0.1960117667913437, |
| "learning_rate": 6.233766233766233e-05, |
| "loss": 1.071, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.7008, |
| "grad_norm": 0.20168261229991913, |
| "learning_rate": 6.16883116883117e-05, |
| "loss": 1.0808, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.704, |
| "grad_norm": 0.21254412829875946, |
| "learning_rate": 6.103896103896104e-05, |
| "loss": 1.0287, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7072, |
| "grad_norm": 0.21271063387393951, |
| "learning_rate": 6.03896103896104e-05, |
| "loss": 1.0605, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.7104, |
| "grad_norm": 0.2081408053636551, |
| "learning_rate": 5.9740259740259744e-05, |
| "loss": 1.091, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.7136, |
| "grad_norm": 0.21113798022270203, |
| "learning_rate": 5.90909090909091e-05, |
| "loss": 1.1323, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.7168, |
| "grad_norm": 0.20670844614505768, |
| "learning_rate": 5.844155844155844e-05, |
| "loss": 1.0955, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.2010120451450348, |
| "learning_rate": 5.7792207792207796e-05, |
| "loss": 1.1068, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7232, |
| "grad_norm": 0.20379121601581573, |
| "learning_rate": 5.714285714285714e-05, |
| "loss": 1.0419, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.7264, |
| "grad_norm": 0.22799807786941528, |
| "learning_rate": 5.64935064935065e-05, |
| "loss": 1.0904, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.7296, |
| "grad_norm": 0.2005995213985443, |
| "learning_rate": 5.584415584415584e-05, |
| "loss": 1.078, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.7328, |
| "grad_norm": 0.20329605042934418, |
| "learning_rate": 5.51948051948052e-05, |
| "loss": 1.0245, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.736, |
| "grad_norm": 0.19283504784107208, |
| "learning_rate": 5.4545454545454546e-05, |
| "loss": 1.0367, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7392, |
| "grad_norm": 0.20624355971813202, |
| "learning_rate": 5.38961038961039e-05, |
| "loss": 1.1046, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.7424, |
| "grad_norm": 0.21362991631031036, |
| "learning_rate": 5.3246753246753245e-05, |
| "loss": 1.1104, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.7456, |
| "grad_norm": 0.20447863638401031, |
| "learning_rate": 5.25974025974026e-05, |
| "loss": 1.0514, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.7488, |
| "grad_norm": 0.1974381059408188, |
| "learning_rate": 5.1948051948051944e-05, |
| "loss": 1.0048, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.752, |
| "grad_norm": 0.21237170696258545, |
| "learning_rate": 5.1298701298701304e-05, |
| "loss": 1.1299, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.7552, |
| "grad_norm": 0.21224971115589142, |
| "learning_rate": 5.064935064935065e-05, |
| "loss": 1.05, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.7584, |
| "grad_norm": 0.19865018129348755, |
| "learning_rate": 5e-05, |
| "loss": 1.0665, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.7616, |
| "grad_norm": 0.19199275970458984, |
| "learning_rate": 4.9350649350649355e-05, |
| "loss": 0.9531, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.7648, |
| "grad_norm": 0.19573214650154114, |
| "learning_rate": 4.87012987012987e-05, |
| "loss": 1.0318, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.768, |
| "grad_norm": 0.21338805556297302, |
| "learning_rate": 4.8051948051948054e-05, |
| "loss": 1.0343, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7712, |
| "grad_norm": 0.2254691869020462, |
| "learning_rate": 4.740259740259741e-05, |
| "loss": 1.0472, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.7744, |
| "grad_norm": 0.18101665377616882, |
| "learning_rate": 4.675324675324675e-05, |
| "loss": 1.017, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.7776, |
| "grad_norm": 0.22090592980384827, |
| "learning_rate": 4.6103896103896106e-05, |
| "loss": 1.0389, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.7808, |
| "grad_norm": 0.20865507423877716, |
| "learning_rate": 4.545454545454546e-05, |
| "loss": 1.0369, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.784, |
| "grad_norm": 0.21619610488414764, |
| "learning_rate": 4.4805194805194805e-05, |
| "loss": 1.109, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.7872, |
| "grad_norm": 0.21694771945476532, |
| "learning_rate": 4.415584415584416e-05, |
| "loss": 1.0525, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.7904, |
| "grad_norm": 0.2182662934064865, |
| "learning_rate": 4.3506493506493503e-05, |
| "loss": 1.0331, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.7936, |
| "grad_norm": 0.2026486098766327, |
| "learning_rate": 4.2857142857142856e-05, |
| "loss": 1.027, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.7968, |
| "grad_norm": 0.19606547057628632, |
| "learning_rate": 4.220779220779221e-05, |
| "loss": 1.0242, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.22107470035552979, |
| "learning_rate": 4.155844155844156e-05, |
| "loss": 1.0924, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8032, |
| "grad_norm": 0.19960008561611176, |
| "learning_rate": 4.0909090909090915e-05, |
| "loss": 1.0384, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.8064, |
| "grad_norm": 0.1945488154888153, |
| "learning_rate": 4.025974025974026e-05, |
| "loss": 1.0673, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.8096, |
| "grad_norm": 0.22067414224147797, |
| "learning_rate": 3.9610389610389614e-05, |
| "loss": 1.0426, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.8128, |
| "grad_norm": 0.19010980427265167, |
| "learning_rate": 3.8961038961038966e-05, |
| "loss": 1.0617, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.816, |
| "grad_norm": 0.18781176209449768, |
| "learning_rate": 3.831168831168831e-05, |
| "loss": 1.0243, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.8192, |
| "grad_norm": 0.20388829708099365, |
| "learning_rate": 3.7662337662337665e-05, |
| "loss": 1.0476, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.8224, |
| "grad_norm": 0.19911155104637146, |
| "learning_rate": 3.701298701298702e-05, |
| "loss": 1.0324, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.8256, |
| "grad_norm": 0.19884039461612701, |
| "learning_rate": 3.6363636363636364e-05, |
| "loss": 1.0242, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.8288, |
| "grad_norm": 0.19036105275154114, |
| "learning_rate": 3.571428571428572e-05, |
| "loss": 1.0323, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.832, |
| "grad_norm": 0.20039844512939453, |
| "learning_rate": 3.506493506493507e-05, |
| "loss": 1.0749, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8352, |
| "grad_norm": 0.1899934560060501, |
| "learning_rate": 3.4415584415584416e-05, |
| "loss": 1.0115, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.8384, |
| "grad_norm": 0.20019090175628662, |
| "learning_rate": 3.376623376623377e-05, |
| "loss": 1.0782, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.8416, |
| "grad_norm": 0.2020583152770996, |
| "learning_rate": 3.311688311688312e-05, |
| "loss": 1.0687, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.8448, |
| "grad_norm": 0.21407337486743927, |
| "learning_rate": 3.246753246753247e-05, |
| "loss": 1.1015, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.848, |
| "grad_norm": 0.1871640682220459, |
| "learning_rate": 3.181818181818182e-05, |
| "loss": 0.9637, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.8512, |
| "grad_norm": 0.21622811257839203, |
| "learning_rate": 3.1168831168831166e-05, |
| "loss": 1.1222, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.8544, |
| "grad_norm": 0.22504661977291107, |
| "learning_rate": 3.051948051948052e-05, |
| "loss": 1.132, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.8576, |
| "grad_norm": 0.19177629053592682, |
| "learning_rate": 2.9870129870129872e-05, |
| "loss": 1.0281, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.8608, |
| "grad_norm": 0.1970544159412384, |
| "learning_rate": 2.922077922077922e-05, |
| "loss": 1.0393, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.864, |
| "grad_norm": 0.21554522216320038, |
| "learning_rate": 2.857142857142857e-05, |
| "loss": 1.074, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8672, |
| "grad_norm": 0.21131229400634766, |
| "learning_rate": 2.792207792207792e-05, |
| "loss": 1.054, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.8704, |
| "grad_norm": 0.19816523790359497, |
| "learning_rate": 2.7272727272727273e-05, |
| "loss": 1.0456, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.8736, |
| "grad_norm": 0.21075209975242615, |
| "learning_rate": 2.6623376623376623e-05, |
| "loss": 1.0758, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.8768, |
| "grad_norm": 0.2296527624130249, |
| "learning_rate": 2.5974025974025972e-05, |
| "loss": 1.0917, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.19722610712051392, |
| "learning_rate": 2.5324675324675325e-05, |
| "loss": 1.0704, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.8832, |
| "grad_norm": 0.18721099197864532, |
| "learning_rate": 2.4675324675324678e-05, |
| "loss": 0.9919, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.8864, |
| "grad_norm": 0.20244193077087402, |
| "learning_rate": 2.4025974025974027e-05, |
| "loss": 1.0368, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.8896, |
| "grad_norm": 0.19518914818763733, |
| "learning_rate": 2.3376623376623376e-05, |
| "loss": 1.0436, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.8928, |
| "grad_norm": 0.19650357961654663, |
| "learning_rate": 2.272727272727273e-05, |
| "loss": 1.0306, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.896, |
| "grad_norm": 0.20320096611976624, |
| "learning_rate": 2.207792207792208e-05, |
| "loss": 1.0941, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8992, |
| "grad_norm": 0.18296951055526733, |
| "learning_rate": 2.1428571428571428e-05, |
| "loss": 0.9802, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.9024, |
| "grad_norm": 0.21357610821723938, |
| "learning_rate": 2.077922077922078e-05, |
| "loss": 1.0449, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.9056, |
| "grad_norm": 0.193921759724617, |
| "learning_rate": 2.012987012987013e-05, |
| "loss": 1.0116, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.9088, |
| "grad_norm": 0.1953902244567871, |
| "learning_rate": 1.9480519480519483e-05, |
| "loss": 1.0105, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.912, |
| "grad_norm": 0.19440975785255432, |
| "learning_rate": 1.8831168831168833e-05, |
| "loss": 0.9952, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.9152, |
| "grad_norm": 0.21054105460643768, |
| "learning_rate": 1.8181818181818182e-05, |
| "loss": 1.0701, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.9184, |
| "grad_norm": 0.18844804167747498, |
| "learning_rate": 1.7532467532467535e-05, |
| "loss": 1.0146, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.9216, |
| "grad_norm": 0.2067311704158783, |
| "learning_rate": 1.6883116883116884e-05, |
| "loss": 1.0781, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.9248, |
| "grad_norm": 0.1941213756799698, |
| "learning_rate": 1.6233766233766234e-05, |
| "loss": 0.9814, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.928, |
| "grad_norm": 0.22726193070411682, |
| "learning_rate": 1.5584415584415583e-05, |
| "loss": 1.1431, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9312, |
| "grad_norm": 0.18025581538677216, |
| "learning_rate": 1.4935064935064936e-05, |
| "loss": 0.9649, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.9344, |
| "grad_norm": 0.21535000205039978, |
| "learning_rate": 1.4285714285714285e-05, |
| "loss": 1.0441, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.9376, |
| "grad_norm": 0.20014546811580658, |
| "learning_rate": 1.3636363636363637e-05, |
| "loss": 1.0166, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.9408, |
| "grad_norm": 0.22738787531852722, |
| "learning_rate": 1.2987012987012986e-05, |
| "loss": 1.0564, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.944, |
| "grad_norm": 0.2020861804485321, |
| "learning_rate": 1.2337662337662339e-05, |
| "loss": 1.1241, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.9472, |
| "grad_norm": 0.19888809323310852, |
| "learning_rate": 1.1688311688311688e-05, |
| "loss": 1.1114, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.9504, |
| "grad_norm": 0.20912377536296844, |
| "learning_rate": 1.103896103896104e-05, |
| "loss": 1.0971, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.9536, |
| "grad_norm": 0.21206621825695038, |
| "learning_rate": 1.038961038961039e-05, |
| "loss": 1.0601, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.9568, |
| "grad_norm": 0.18667680025100708, |
| "learning_rate": 9.740259740259742e-06, |
| "loss": 1.0291, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.21125559508800507, |
| "learning_rate": 9.090909090909091e-06, |
| "loss": 1.0483, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9632, |
| "grad_norm": 0.21776145696640015, |
| "learning_rate": 8.441558441558442e-06, |
| "loss": 0.9912, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.9664, |
| "grad_norm": 0.20144303143024445, |
| "learning_rate": 7.792207792207792e-06, |
| "loss": 1.0357, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.9696, |
| "grad_norm": 0.1984029859304428, |
| "learning_rate": 7.142857142857143e-06, |
| "loss": 1.0648, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.9728, |
| "grad_norm": 0.17972829937934875, |
| "learning_rate": 6.493506493506493e-06, |
| "loss": 1.0033, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.976, |
| "grad_norm": 0.1818286031484604, |
| "learning_rate": 5.844155844155844e-06, |
| "loss": 0.997, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.9792, |
| "grad_norm": 0.19670912623405457, |
| "learning_rate": 5.194805194805195e-06, |
| "loss": 1.0256, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.9824, |
| "grad_norm": 0.20527283847332, |
| "learning_rate": 4.5454545454545455e-06, |
| "loss": 1.0348, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.9856, |
| "grad_norm": 0.19025909900665283, |
| "learning_rate": 3.896103896103896e-06, |
| "loss": 1.0682, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.9888, |
| "grad_norm": 0.19544818997383118, |
| "learning_rate": 3.2467532467532465e-06, |
| "loss": 0.9872, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.992, |
| "grad_norm": 0.22112183272838593, |
| "learning_rate": 2.5974025974025976e-06, |
| "loss": 1.0661, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9952, |
| "grad_norm": 0.23328153789043427, |
| "learning_rate": 1.948051948051948e-06, |
| "loss": 1.0691, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.9984, |
| "grad_norm": 0.20181375741958618, |
| "learning_rate": 1.2987012987012988e-06, |
| "loss": 0.9416, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.29312625527381897, |
| "learning_rate": 6.493506493506494e-07, |
| "loss": 1.1216, |
| "step": 313 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 313, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.768425540391928e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|