| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9975990396158463, | |
| "eval_steps": 500, | |
| "global_step": 312, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006402561024409764, | |
| "grad_norm": 2.679521906145934, | |
| "learning_rate": 6.25e-08, | |
| "loss": 0.1981, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.012805122048819529, | |
| "grad_norm": 2.5600704949176025, | |
| "learning_rate": 1.25e-07, | |
| "loss": 0.1782, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01920768307322929, | |
| "grad_norm": 2.8865232354148174, | |
| "learning_rate": 1.875e-07, | |
| "loss": 0.2347, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.025610244097639057, | |
| "grad_norm": 2.76706815796981, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.2138, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.03201280512204882, | |
| "grad_norm": 2.6835981054304963, | |
| "learning_rate": 3.1249999999999997e-07, | |
| "loss": 0.2395, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.03841536614645858, | |
| "grad_norm": 2.6104328133293877, | |
| "learning_rate": 3.75e-07, | |
| "loss": 0.2417, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.04481792717086835, | |
| "grad_norm": 2.551735870154201, | |
| "learning_rate": 4.375e-07, | |
| "loss": 0.175, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.051220488195278115, | |
| "grad_norm": 3.4086729277405725, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1706, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.057623049219687875, | |
| "grad_norm": 2.7571882436426587, | |
| "learning_rate": 5.625e-07, | |
| "loss": 0.2184, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.06402561024409764, | |
| "grad_norm": 2.7694984961484717, | |
| "learning_rate": 6.249999999999999e-07, | |
| "loss": 0.2017, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.07042817126850741, | |
| "grad_norm": 2.547784892526759, | |
| "learning_rate": 6.875e-07, | |
| "loss": 0.1911, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.07683073229291716, | |
| "grad_norm": 2.666671570148531, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.2143, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.08323329331732693, | |
| "grad_norm": 2.7605278136164912, | |
| "learning_rate": 8.125e-07, | |
| "loss": 0.2027, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0896358543417367, | |
| "grad_norm": 2.4621112010955866, | |
| "learning_rate": 8.75e-07, | |
| "loss": 0.2068, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.09603841536614646, | |
| "grad_norm": 2.803025975251869, | |
| "learning_rate": 9.374999999999999e-07, | |
| "loss": 0.213, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.10244097639055623, | |
| "grad_norm": 2.901342575515257, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2106, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.10884353741496598, | |
| "grad_norm": 2.7337203386461644, | |
| "learning_rate": 1.0625e-06, | |
| "loss": 0.2281, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.11524609843937575, | |
| "grad_norm": 2.6481197250458193, | |
| "learning_rate": 1.125e-06, | |
| "loss": 0.1827, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.12164865946378552, | |
| "grad_norm": 2.7639376607457162, | |
| "learning_rate": 1.1874999999999999e-06, | |
| "loss": 0.2159, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.12805122048819528, | |
| "grad_norm": 2.7175927412952467, | |
| "learning_rate": 1.2499999999999999e-06, | |
| "loss": 0.1745, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.13445378151260504, | |
| "grad_norm": 3.1021924663248384, | |
| "learning_rate": 1.3125e-06, | |
| "loss": 0.1935, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.14085634253701482, | |
| "grad_norm": 3.1941312523935563, | |
| "learning_rate": 1.375e-06, | |
| "loss": 0.2053, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.14725890356142457, | |
| "grad_norm": 2.500688058457321, | |
| "learning_rate": 1.4375e-06, | |
| "loss": 0.1699, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.15366146458583432, | |
| "grad_norm": 3.417365189000316, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.2222, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.1600640256102441, | |
| "grad_norm": 2.7988997309084125, | |
| "learning_rate": 1.5624999999999999e-06, | |
| "loss": 0.1855, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.16646658663465386, | |
| "grad_norm": 3.057142160060086, | |
| "learning_rate": 1.625e-06, | |
| "loss": 0.1698, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.17286914765906364, | |
| "grad_norm": 2.8282556781813772, | |
| "learning_rate": 1.6875e-06, | |
| "loss": 0.2047, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.1792717086834734, | |
| "grad_norm": 2.7193234045232413, | |
| "learning_rate": 1.75e-06, | |
| "loss": 0.1979, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.18567426970788314, | |
| "grad_norm": 3.520125861739441, | |
| "learning_rate": 1.8125e-06, | |
| "loss": 0.2214, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.19207683073229292, | |
| "grad_norm": 3.017660845656798, | |
| "learning_rate": 1.8749999999999998e-06, | |
| "loss": 0.2119, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.19847939175670268, | |
| "grad_norm": 3.4420544526065515, | |
| "learning_rate": 1.9375e-06, | |
| "loss": 0.1649, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.20488195278111246, | |
| "grad_norm": 2.858393701736414, | |
| "learning_rate": 2e-06, | |
| "loss": 0.2038, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.2112845138055222, | |
| "grad_norm": 3.6069506971510723, | |
| "learning_rate": 1.9999370567547003e-06, | |
| "loss": 0.2194, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.21768707482993196, | |
| "grad_norm": 3.1830415852739415, | |
| "learning_rate": 1.9997482349425066e-06, | |
| "loss": 0.143, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.22408963585434175, | |
| "grad_norm": 3.4429216757147545, | |
| "learning_rate": 1.9994335583335335e-06, | |
| "loss": 0.2026, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.2304921968787515, | |
| "grad_norm": 3.5752393511399463, | |
| "learning_rate": 1.9989930665413145e-06, | |
| "loss": 0.2087, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.23689475790316125, | |
| "grad_norm": 4.08665235099645, | |
| "learning_rate": 1.9984268150178167e-06, | |
| "loss": 0.196, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.24329731892757103, | |
| "grad_norm": 3.979136486495609, | |
| "learning_rate": 1.997734875046456e-06, | |
| "loss": 0.1824, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.24969987995198079, | |
| "grad_norm": 3.866519745374768, | |
| "learning_rate": 1.996917333733128e-06, | |
| "loss": 0.1896, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.25610244097639057, | |
| "grad_norm": 3.2575226153396115, | |
| "learning_rate": 1.995974293995239e-06, | |
| "loss": 0.194, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.26250500200080035, | |
| "grad_norm": 3.7743686803486605, | |
| "learning_rate": 1.994905874548752e-06, | |
| "loss": 0.1882, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.2689075630252101, | |
| "grad_norm": 4.287147445421594, | |
| "learning_rate": 1.9937122098932426e-06, | |
| "loss": 0.1772, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.27531012404961985, | |
| "grad_norm": 3.832538953651167, | |
| "learning_rate": 1.9923934502949643e-06, | |
| "loss": 0.1904, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.28171268507402963, | |
| "grad_norm": 5.1771162120228365, | |
| "learning_rate": 1.9909497617679347e-06, | |
| "loss": 0.1892, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.28811524609843936, | |
| "grad_norm": 4.303644513637155, | |
| "learning_rate": 1.9893813260530367e-06, | |
| "loss": 0.2056, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.29451780712284914, | |
| "grad_norm": 4.2352469784485445, | |
| "learning_rate": 1.9876883405951377e-06, | |
| "loss": 0.2316, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.3009203681472589, | |
| "grad_norm": 4.5051673675815955, | |
| "learning_rate": 1.9858710185182355e-06, | |
| "loss": 0.1792, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.30732292917166865, | |
| "grad_norm": 4.68950995491574, | |
| "learning_rate": 1.9839295885986295e-06, | |
| "loss": 0.1841, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.3137254901960784, | |
| "grad_norm": 4.053751205533602, | |
| "learning_rate": 1.9818642952361183e-06, | |
| "loss": 0.1909, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.3201280512204882, | |
| "grad_norm": 3.537770773869747, | |
| "learning_rate": 1.9796753984232355e-06, | |
| "loss": 0.1642, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.32653061224489793, | |
| "grad_norm": 4.937399937551455, | |
| "learning_rate": 1.977363173712519e-06, | |
| "loss": 0.1628, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.3329331732693077, | |
| "grad_norm": 4.458098457737047, | |
| "learning_rate": 1.9749279121818236e-06, | |
| "loss": 0.2108, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.3393357342937175, | |
| "grad_norm": 4.692309558133837, | |
| "learning_rate": 1.9723699203976766e-06, | |
| "loss": 0.2023, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.3457382953181273, | |
| "grad_norm": 4.277910169048945, | |
| "learning_rate": 1.9696895203766866e-06, | |
| "loss": 0.1885, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.352140856342537, | |
| "grad_norm": 4.837495606765268, | |
| "learning_rate": 1.966887049545006e-06, | |
| "loss": 0.1899, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.3585434173669468, | |
| "grad_norm": 4.723060685847474, | |
| "learning_rate": 1.9639628606958534e-06, | |
| "loss": 0.2454, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.36494597839135656, | |
| "grad_norm": 4.472186733872948, | |
| "learning_rate": 1.9609173219450997e-06, | |
| "loss": 0.1902, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.3713485394157663, | |
| "grad_norm": 4.27556686419538, | |
| "learning_rate": 1.9577508166849303e-06, | |
| "loss": 0.1962, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.37775110044017607, | |
| "grad_norm": 3.8215311273874506, | |
| "learning_rate": 1.9544637435355806e-06, | |
| "loss": 0.1658, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.38415366146458585, | |
| "grad_norm": 4.623141330416456, | |
| "learning_rate": 1.9510565162951534e-06, | |
| "loss": 0.2274, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.3905562224889956, | |
| "grad_norm": 4.245526551915113, | |
| "learning_rate": 1.947529563887529e-06, | |
| "loss": 0.2097, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.39695878351340536, | |
| "grad_norm": 3.8360308169316064, | |
| "learning_rate": 1.9438833303083674e-06, | |
| "loss": 0.1629, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.40336134453781514, | |
| "grad_norm": 4.390197594681171, | |
| "learning_rate": 1.9401182745692187e-06, | |
| "loss": 0.1913, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.4097639055622249, | |
| "grad_norm": 3.8567511370833594, | |
| "learning_rate": 1.936234870639737e-06, | |
| "loss": 0.204, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.41616646658663464, | |
| "grad_norm": 4.3670167230444354, | |
| "learning_rate": 1.9322336073880143e-06, | |
| "loss": 0.1925, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.4225690276110444, | |
| "grad_norm": 4.5446121221499505, | |
| "learning_rate": 1.928114988519039e-06, | |
| "loss": 0.2036, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.4289715886354542, | |
| "grad_norm": 4.492339684008934, | |
| "learning_rate": 1.9238795325112867e-06, | |
| "loss": 0.1976, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.43537414965986393, | |
| "grad_norm": 3.616344744906229, | |
| "learning_rate": 1.9195277725514506e-06, | |
| "loss": 0.2173, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.4417767106842737, | |
| "grad_norm": 4.807993813843196, | |
| "learning_rate": 1.91506025646732e-06, | |
| "loss": 0.2297, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.4481792717086835, | |
| "grad_norm": 3.9611147863165215, | |
| "learning_rate": 1.9104775466588157e-06, | |
| "loss": 0.1724, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.4545818327330932, | |
| "grad_norm": 4.038448965820852, | |
| "learning_rate": 1.905780220027194e-06, | |
| "loss": 0.1817, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.460984393757503, | |
| "grad_norm": 3.6234581360493308, | |
| "learning_rate": 1.9009688679024189e-06, | |
| "loss": 0.1898, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.4673869547819128, | |
| "grad_norm": 4.412888570956389, | |
| "learning_rate": 1.8960440959687252e-06, | |
| "loss": 0.2056, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.4737895158063225, | |
| "grad_norm": 4.438175504352378, | |
| "learning_rate": 1.8910065241883678e-06, | |
| "loss": 0.2206, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.4801920768307323, | |
| "grad_norm": 4.188755548423674, | |
| "learning_rate": 1.8858567867235798e-06, | |
| "loss": 0.1955, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.48659463785514206, | |
| "grad_norm": 4.043921178736117, | |
| "learning_rate": 1.8805955318567379e-06, | |
| "loss": 0.1948, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.49299719887955185, | |
| "grad_norm": 3.3626388865946195, | |
| "learning_rate": 1.8752234219087537e-06, | |
| "loss": 0.2156, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.49939975990396157, | |
| "grad_norm": 3.856090864925135, | |
| "learning_rate": 1.8697411331556953e-06, | |
| "loss": 0.1969, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.5058023209283713, | |
| "grad_norm": 3.902527444697349, | |
| "learning_rate": 1.8641493557436548e-06, | |
| "loss": 0.2156, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.5122048819527811, | |
| "grad_norm": 3.8994885011285776, | |
| "learning_rate": 1.858448793601866e-06, | |
| "loss": 0.1978, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5186074429771909, | |
| "grad_norm": 3.842402356308699, | |
| "learning_rate": 1.852640164354092e-06, | |
| "loss": 0.2271, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.5250100040016007, | |
| "grad_norm": 3.3752000095050376, | |
| "learning_rate": 1.8467241992282841e-06, | |
| "loss": 0.1707, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.5314125650260104, | |
| "grad_norm": 4.043279107262682, | |
| "learning_rate": 1.8407016429645302e-06, | |
| "loss": 0.2163, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.5378151260504201, | |
| "grad_norm": 4.403978560088287, | |
| "learning_rate": 1.8345732537213026e-06, | |
| "loss": 0.1739, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.54421768707483, | |
| "grad_norm": 3.8981585910721552, | |
| "learning_rate": 1.8283398029800164e-06, | |
| "loss": 0.1696, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.5506202480992397, | |
| "grad_norm": 4.932247696008996, | |
| "learning_rate": 1.82200207544791e-06, | |
| "loss": 0.1915, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.5570228091236494, | |
| "grad_norm": 4.099692261040826, | |
| "learning_rate": 1.8155608689592601e-06, | |
| "loss": 0.1785, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.5634253701480593, | |
| "grad_norm": 3.7071427170962585, | |
| "learning_rate": 1.8090169943749474e-06, | |
| "loss": 0.1643, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.569827931172469, | |
| "grad_norm": 4.198912758644434, | |
| "learning_rate": 1.802371275480378e-06, | |
| "loss": 0.2079, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.5762304921968787, | |
| "grad_norm": 4.08750592105122, | |
| "learning_rate": 1.795624548881781e-06, | |
| "loss": 0.1875, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5826330532212886, | |
| "grad_norm": 5.4368089983091945, | |
| "learning_rate": 1.7887776639008912e-06, | |
| "loss": 0.1842, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.5890356142456983, | |
| "grad_norm": 3.6668016857716, | |
| "learning_rate": 1.7818314824680298e-06, | |
| "loss": 0.1927, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.595438175270108, | |
| "grad_norm": 3.9325828012665482, | |
| "learning_rate": 1.774786879013601e-06, | |
| "loss": 0.1849, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.6018407362945178, | |
| "grad_norm": 4.2551345503780125, | |
| "learning_rate": 1.767644740358011e-06, | |
| "loss": 0.2103, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.6082432973189276, | |
| "grad_norm": 3.562360928332586, | |
| "learning_rate": 1.760405965600031e-06, | |
| "loss": 0.2292, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.6146458583433373, | |
| "grad_norm": 4.074513845673071, | |
| "learning_rate": 1.753071466003611e-06, | |
| "loss": 0.164, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.6210484193677471, | |
| "grad_norm": 4.264136629053828, | |
| "learning_rate": 1.7456421648831654e-06, | |
| "loss": 0.1919, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.6274509803921569, | |
| "grad_norm": 4.101904029046064, | |
| "learning_rate": 1.7381189974873407e-06, | |
| "loss": 0.2492, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.6338535414165666, | |
| "grad_norm": 4.129940049014058, | |
| "learning_rate": 1.7305029108812774e-06, | |
| "loss": 0.1899, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.6402561024409764, | |
| "grad_norm": 4.2688607303525705, | |
| "learning_rate": 1.7227948638273915e-06, | |
| "loss": 0.1802, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6466586634653861, | |
| "grad_norm": 3.931916755773599, | |
| "learning_rate": 1.7149958266646754e-06, | |
| "loss": 0.1524, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.6530612244897959, | |
| "grad_norm": 4.309265054679012, | |
| "learning_rate": 1.7071067811865474e-06, | |
| "loss": 0.2035, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.6594637855142057, | |
| "grad_norm": 3.4899838789545847, | |
| "learning_rate": 1.6991287205172574e-06, | |
| "loss": 0.1885, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.6658663465386154, | |
| "grad_norm": 4.603685696411418, | |
| "learning_rate": 1.6910626489868648e-06, | |
| "loss": 0.2008, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.6722689075630253, | |
| "grad_norm": 3.682716771358225, | |
| "learning_rate": 1.682909582004807e-06, | |
| "loss": 0.2069, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.678671468587435, | |
| "grad_norm": 5.35989531173202, | |
| "learning_rate": 1.6746705459320744e-06, | |
| "loss": 0.2185, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.6850740296118447, | |
| "grad_norm": 4.4810176716864225, | |
| "learning_rate": 1.6663465779520037e-06, | |
| "loss": 0.2179, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.6914765906362546, | |
| "grad_norm": 4.781878607782948, | |
| "learning_rate": 1.6579387259397126e-06, | |
| "loss": 0.2162, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.6978791516606643, | |
| "grad_norm": 3.578367620297612, | |
| "learning_rate": 1.6494480483301835e-06, | |
| "loss": 0.1864, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.704281712685074, | |
| "grad_norm": 4.938703350422557, | |
| "learning_rate": 1.640875613985024e-06, | |
| "loss": 0.1789, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.7106842737094838, | |
| "grad_norm": 4.712146890436173, | |
| "learning_rate": 1.6322225020579096e-06, | |
| "loss": 0.1971, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.7170868347338936, | |
| "grad_norm": 4.49092614751852, | |
| "learning_rate": 1.6234898018587336e-06, | |
| "loss": 0.1796, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.7234893957583033, | |
| "grad_norm": 4.123243996580139, | |
| "learning_rate": 1.6146786127164771e-06, | |
| "loss": 0.1703, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.7298919567827131, | |
| "grad_norm": 3.4610648898646623, | |
| "learning_rate": 1.6057900438408199e-06, | |
| "loss": 0.2069, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.7362945178071229, | |
| "grad_norm": 3.701626323345168, | |
| "learning_rate": 1.5968252141825035e-06, | |
| "loss": 0.2037, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.7426970788315326, | |
| "grad_norm": 4.5853683365418965, | |
| "learning_rate": 1.587785252292473e-06, | |
| "loss": 0.1867, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.7490996398559424, | |
| "grad_norm": 3.813531824888892, | |
| "learning_rate": 1.578671296179806e-06, | |
| "loss": 0.1923, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.7555022008803521, | |
| "grad_norm": 4.333755687391246, | |
| "learning_rate": 1.569484493168452e-06, | |
| "loss": 0.1917, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 3.5490254589371526, | |
| "learning_rate": 1.5602259997528027e-06, | |
| "loss": 0.21, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.7683073229291717, | |
| "grad_norm": 3.8349133617246665, | |
| "learning_rate": 1.5508969814521024e-06, | |
| "loss": 0.1839, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.7747098839535814, | |
| "grad_norm": 4.480906522239498, | |
| "learning_rate": 1.5414986126637257e-06, | |
| "loss": 0.1804, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.7811124449779911, | |
| "grad_norm": 4.063797691144723, | |
| "learning_rate": 1.5320320765153365e-06, | |
| "loss": 0.1872, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.787515006002401, | |
| "grad_norm": 3.837906572094775, | |
| "learning_rate": 1.5224985647159488e-06, | |
| "loss": 0.202, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.7939175670268107, | |
| "grad_norm": 4.226911617716516, | |
| "learning_rate": 1.5128992774059062e-06, | |
| "loss": 0.2067, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.8003201280512204, | |
| "grad_norm": 4.143910488552411, | |
| "learning_rate": 1.5032354230058002e-06, | |
| "loss": 0.1884, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.8067226890756303, | |
| "grad_norm": 3.996874874040649, | |
| "learning_rate": 1.4935082180643467e-06, | |
| "loss": 0.1797, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.81312525010004, | |
| "grad_norm": 4.241850731448795, | |
| "learning_rate": 1.4837188871052397e-06, | |
| "loss": 0.1973, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.8195278111244498, | |
| "grad_norm": 4.8128807497544015, | |
| "learning_rate": 1.4738686624729987e-06, | |
| "loss": 0.1888, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.8259303721488596, | |
| "grad_norm": 4.624440170471318, | |
| "learning_rate": 1.463958784177834e-06, | |
| "loss": 0.1855, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.8323329331732693, | |
| "grad_norm": 3.760993103519322, | |
| "learning_rate": 1.4539904997395467e-06, | |
| "loss": 0.1662, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.8387354941976791, | |
| "grad_norm": 4.333505468408259, | |
| "learning_rate": 1.4439650640304821e-06, | |
| "loss": 0.2089, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.8451380552220888, | |
| "grad_norm": 4.278934798609152, | |
| "learning_rate": 1.433883739117558e-06, | |
| "loss": 0.1879, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.8515406162464986, | |
| "grad_norm": 4.16072632298661, | |
| "learning_rate": 1.4237477941033886e-06, | |
| "loss": 0.1904, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.8579431772709084, | |
| "grad_norm": 3.5517189505372784, | |
| "learning_rate": 1.4135585049665206e-06, | |
| "loss": 0.2498, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.8643457382953181, | |
| "grad_norm": 3.9706094312160776, | |
| "learning_rate": 1.4033171544008051e-06, | |
| "loss": 0.1901, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.8707482993197279, | |
| "grad_norm": 4.171292985573587, | |
| "learning_rate": 1.3930250316539235e-06, | |
| "loss": 0.1835, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.8771508603441377, | |
| "grad_norm": 3.8333520124526896, | |
| "learning_rate": 1.3826834323650898e-06, | |
| "loss": 0.1747, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.8835534213685474, | |
| "grad_norm": 4.281605413414975, | |
| "learning_rate": 1.3722936584019451e-06, | |
| "loss": 0.2296, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.8899559823929571, | |
| "grad_norm": 4.001127332533811, | |
| "learning_rate": 1.3618570176966722e-06, | |
| "loss": 0.2248, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.896358543417367, | |
| "grad_norm": 3.2545123553526585, | |
| "learning_rate": 1.3513748240813427e-06, | |
| "loss": 0.2172, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.9027611044417767, | |
| "grad_norm": 3.6921559524457073, | |
| "learning_rate": 1.3408483971225249e-06, | |
| "loss": 0.1865, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.9091636654661864, | |
| "grad_norm": 4.729238595946433, | |
| "learning_rate": 1.3302790619551672e-06, | |
| "loss": 0.1511, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.9155662264905963, | |
| "grad_norm": 4.27641471239857, | |
| "learning_rate": 1.3196681491157816e-06, | |
| "loss": 0.1775, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.921968787515006, | |
| "grad_norm": 4.195425093653204, | |
| "learning_rate": 1.3090169943749473e-06, | |
| "loss": 0.2102, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.9283713485394157, | |
| "grad_norm": 4.4927187865344305, | |
| "learning_rate": 1.298326938569156e-06, | |
| "loss": 0.1794, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.9347739095638256, | |
| "grad_norm": 3.6780116242547467, | |
| "learning_rate": 1.2875993274320173e-06, | |
| "loss": 0.1716, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 3.5052737613672926, | |
| "learning_rate": 1.2768355114248492e-06, | |
| "loss": 0.1909, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.947579031612645, | |
| "grad_norm": 5.4003852624340976, | |
| "learning_rate": 1.266036845566675e-06, | |
| "loss": 0.1828, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.9539815926370548, | |
| "grad_norm": 4.293449029987786, | |
| "learning_rate": 1.2552046892636426e-06, | |
| "loss": 0.1934, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.9603841536614646, | |
| "grad_norm": 4.213735783478494, | |
| "learning_rate": 1.244340406137894e-06, | |
| "loss": 0.1635, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9667867146858744, | |
| "grad_norm": 5.682803899024664, | |
| "learning_rate": 1.2334453638559054e-06, | |
| "loss": 0.2282, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.9731892757102841, | |
| "grad_norm": 5.125716971978602, | |
| "learning_rate": 1.2225209339563143e-06, | |
| "loss": 0.2124, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.9795918367346939, | |
| "grad_norm": 3.6296567686595558, | |
| "learning_rate": 1.211568491677263e-06, | |
| "loss": 0.1686, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.9859943977591037, | |
| "grad_norm": 4.032149136826441, | |
| "learning_rate": 1.2005894157832728e-06, | |
| "loss": 0.1801, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.9923969587835134, | |
| "grad_norm": 4.328877120814621, | |
| "learning_rate": 1.1895850883916785e-06, | |
| "loss": 0.2116, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.9987995198079231, | |
| "grad_norm": 4.42679277844411, | |
| "learning_rate": 1.1785568947986366e-06, | |
| "loss": 0.197, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.005202080832333, | |
| "grad_norm": 2.995825299760741, | |
| "learning_rate": 1.1675062233047363e-06, | |
| "loss": 0.1754, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.0116046418567426, | |
| "grad_norm": 3.314198590902652, | |
| "learning_rate": 1.156434465040231e-06, | |
| "loss": 0.1591, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.0180072028811524, | |
| "grad_norm": 3.282342144727049, | |
| "learning_rate": 1.1453430137899128e-06, | |
| "loss": 0.1463, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.0244097639055623, | |
| "grad_norm": 3.3852348105131327, | |
| "learning_rate": 1.1342332658176555e-06, | |
| "loss": 0.1417, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.0308123249299719, | |
| "grad_norm": 2.5846365344808224, | |
| "learning_rate": 1.123106619690643e-06, | |
| "loss": 0.1579, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.0372148859543817, | |
| "grad_norm": 3.4112665397765407, | |
| "learning_rate": 1.1119644761033077e-06, | |
| "loss": 0.1569, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.0436174469787916, | |
| "grad_norm": 3.057068352071886, | |
| "learning_rate": 1.1008082377010045e-06, | |
| "loss": 0.1588, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.0500200080032012, | |
| "grad_norm": 2.480458097820055, | |
| "learning_rate": 1.0896393089034335e-06, | |
| "loss": 0.1604, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.056422569027611, | |
| "grad_norm": 2.5238894472575, | |
| "learning_rate": 1.078459095727845e-06, | |
| "loss": 0.1738, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.0628251300520208, | |
| "grad_norm": 2.702306669134078, | |
| "learning_rate": 1.0672690056120398e-06, | |
| "loss": 0.1416, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.0692276910764307, | |
| "grad_norm": 2.715141146658483, | |
| "learning_rate": 1.0560704472371917e-06, | |
| "loss": 0.127, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.0756302521008403, | |
| "grad_norm": 2.698908112656233, | |
| "learning_rate": 1.044864830350515e-06, | |
| "loss": 0.1769, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.0820328131252501, | |
| "grad_norm": 2.393553347708748, | |
| "learning_rate": 1.033653565587794e-06, | |
| "loss": 0.1595, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.08843537414966, | |
| "grad_norm": 2.515931323925557, | |
| "learning_rate": 1.022438064295805e-06, | |
| "loss": 0.1509, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.0948379351740696, | |
| "grad_norm": 3.122208532437112, | |
| "learning_rate": 1.0112197383546459e-06, | |
| "loss": 0.1362, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.1012404961984794, | |
| "grad_norm": 2.9259499260174437, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1393, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.1076430572228892, | |
| "grad_norm": 2.6643868148585783, | |
| "learning_rate": 9.88780261645354e-07, | |
| "loss": 0.1602, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.1140456182472989, | |
| "grad_norm": 2.529601808890674, | |
| "learning_rate": 9.77561935704195e-07, | |
| "loss": 0.16, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.1204481792717087, | |
| "grad_norm": 2.4798313875614038, | |
| "learning_rate": 9.663464344122063e-07, | |
| "loss": 0.1667, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.1268507402961185, | |
| "grad_norm": 2.528807676927116, | |
| "learning_rate": 9.551351696494853e-07, | |
| "loss": 0.142, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.1332533013205282, | |
| "grad_norm": 2.222384060658359, | |
| "learning_rate": 9.43929552762808e-07, | |
| "loss": 0.1273, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.139655862344938, | |
| "grad_norm": 2.6020312291887993, | |
| "learning_rate": 9.327309943879603e-07, | |
| "loss": 0.1209, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.1460584233693478, | |
| "grad_norm": 2.5865819736528213, | |
| "learning_rate": 9.215409042721551e-07, | |
| "loss": 0.1147, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.1524609843937574, | |
| "grad_norm": 2.6984970538621047, | |
| "learning_rate": 9.103606910965665e-07, | |
| "loss": 0.1267, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.1588635454181673, | |
| "grad_norm": 3.064928156142861, | |
| "learning_rate": 8.991917622989955e-07, | |
| "loss": 0.1472, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.165266106442577, | |
| "grad_norm": 2.6751065232729787, | |
| "learning_rate": 8.880355238966921e-07, | |
| "loss": 0.1481, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.1716686674669867, | |
| "grad_norm": 2.4919821908211137, | |
| "learning_rate": 8.768933803093572e-07, | |
| "loss": 0.1582, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.1780712284913966, | |
| "grad_norm": 2.7496916062023926, | |
| "learning_rate": 8.657667341823448e-07, | |
| "loss": 0.1132, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.1844737895158064, | |
| "grad_norm": 2.8683066132542065, | |
| "learning_rate": 8.546569862100875e-07, | |
| "loss": 0.1537, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.190876350540216, | |
| "grad_norm": 3.3497753630792504, | |
| "learning_rate": 8.435655349597689e-07, | |
| "loss": 0.1498, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.1972789115646258, | |
| "grad_norm": 2.8175991760028776, | |
| "learning_rate": 8.324937766952636e-07, | |
| "loss": 0.1344, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.2036814725890357, | |
| "grad_norm": 3.5078725354724978, | |
| "learning_rate": 8.214431052013634e-07, | |
| "loss": 0.1794, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.2100840336134453, | |
| "grad_norm": 3.134230661450569, | |
| "learning_rate": 8.104149116083216e-07, | |
| "loss": 0.1636, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.2164865946378551, | |
| "grad_norm": 2.789058239974061, | |
| "learning_rate": 7.994105842167272e-07, | |
| "loss": 0.1384, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.222889155662265, | |
| "grad_norm": 3.7547713671098966, | |
| "learning_rate": 7.884315083227372e-07, | |
| "loss": 0.1774, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.2292917166866746, | |
| "grad_norm": 3.664307191430093, | |
| "learning_rate": 7.774790660436857e-07, | |
| "loss": 0.1481, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.2356942777110844, | |
| "grad_norm": 2.771171440914553, | |
| "learning_rate": 7.665546361440949e-07, | |
| "loss": 0.1996, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.2420968387354943, | |
| "grad_norm": 2.6893659053745815, | |
| "learning_rate": 7.556595938621058e-07, | |
| "loss": 0.1411, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.2484993997599039, | |
| "grad_norm": 3.678898327438179, | |
| "learning_rate": 7.447953107363574e-07, | |
| "loss": 0.1864, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.2549019607843137, | |
| "grad_norm": 3.520031085298818, | |
| "learning_rate": 7.33963154433325e-07, | |
| "loss": 0.1681, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.2613045218087235, | |
| "grad_norm": 3.1974510727727354, | |
| "learning_rate": 7.231644885751507e-07, | |
| "loss": 0.1197, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.2677070828331334, | |
| "grad_norm": 3.0861033592576703, | |
| "learning_rate": 7.124006725679828e-07, | |
| "loss": 0.1852, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.274109643857543, | |
| "grad_norm": 2.392211457828515, | |
| "learning_rate": 7.016730614308439e-07, | |
| "loss": 0.1478, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.2805122048819528, | |
| "grad_norm": 2.6382593348580636, | |
| "learning_rate": 6.909830056250526e-07, | |
| "loss": 0.1499, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.2869147659063627, | |
| "grad_norm": 3.310685530214126, | |
| "learning_rate": 6.803318508842186e-07, | |
| "loss": 0.1892, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.2933173269307723, | |
| "grad_norm": 2.9330195000820054, | |
| "learning_rate": 6.697209380448332e-07, | |
| "loss": 0.1853, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.2997198879551821, | |
| "grad_norm": 2.8596301823674115, | |
| "learning_rate": 6.59151602877475e-07, | |
| "loss": 0.1328, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.306122448979592, | |
| "grad_norm": 2.794306696992077, | |
| "learning_rate": 6.486251759186572e-07, | |
| "loss": 0.1807, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.3125250100040016, | |
| "grad_norm": 4.390407508756913, | |
| "learning_rate": 6.381429823033279e-07, | |
| "loss": 0.1536, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.3189275710284114, | |
| "grad_norm": 2.6300011319364915, | |
| "learning_rate": 6.277063415980548e-07, | |
| "loss": 0.1431, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.3253301320528212, | |
| "grad_norm": 2.62332174376127, | |
| "learning_rate": 6.173165676349102e-07, | |
| "loss": 0.1482, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.3317326930772309, | |
| "grad_norm": 2.733993586202797, | |
| "learning_rate": 6.069749683460764e-07, | |
| "loss": 0.1228, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.3381352541016407, | |
| "grad_norm": 3.315910975674784, | |
| "learning_rate": 5.96682845599195e-07, | |
| "loss": 0.2054, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.3445378151260505, | |
| "grad_norm": 2.930927185763075, | |
| "learning_rate": 5.864414950334795e-07, | |
| "loss": 0.159, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.3509403761504601, | |
| "grad_norm": 3.1317606911937568, | |
| "learning_rate": 5.762522058966113e-07, | |
| "loss": 0.1586, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.35734293717487, | |
| "grad_norm": 4.040118089874495, | |
| "learning_rate": 5.661162608824419e-07, | |
| "loss": 0.1505, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.3637454981992798, | |
| "grad_norm": 2.746598755946971, | |
| "learning_rate": 5.56034935969518e-07, | |
| "loss": 0.1761, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.3701480592236894, | |
| "grad_norm": 2.9141623052335137, | |
| "learning_rate": 5.460095002604532e-07, | |
| "loss": 0.19, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.3765506202480993, | |
| "grad_norm": 4.001391421545072, | |
| "learning_rate": 5.36041215822166e-07, | |
| "loss": 0.1321, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.382953181272509, | |
| "grad_norm": 3.6428702220535736, | |
| "learning_rate": 5.261313375270013e-07, | |
| "loss": 0.1905, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.3893557422969187, | |
| "grad_norm": 3.324099015382822, | |
| "learning_rate": 5.162811128947602e-07, | |
| "loss": 0.1669, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.3957583033213286, | |
| "grad_norm": 2.6236595277867294, | |
| "learning_rate": 5.064917819356531e-07, | |
| "loss": 0.1235, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.4021608643457384, | |
| "grad_norm": 3.259881675873255, | |
| "learning_rate": 4.967645769941999e-07, | |
| "loss": 0.1814, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.408563425370148, | |
| "grad_norm": 2.8694964354069117, | |
| "learning_rate": 4.871007225940939e-07, | |
| "loss": 0.1495, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.4149659863945578, | |
| "grad_norm": 2.901840928815238, | |
| "learning_rate": 4.775014352840512e-07, | |
| "loss": 0.1951, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.4213685474189677, | |
| "grad_norm": 3.418089743568751, | |
| "learning_rate": 4.6796792348466353e-07, | |
| "loss": 0.1547, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.4277711084433773, | |
| "grad_norm": 2.830949019249368, | |
| "learning_rate": 4.585013873362743e-07, | |
| "loss": 0.1046, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.4341736694677871, | |
| "grad_norm": 3.5740356720869206, | |
| "learning_rate": 4.4910301854789755e-07, | |
| "loss": 0.1482, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.440576230492197, | |
| "grad_norm": 2.7706265692202217, | |
| "learning_rate": 4.397740002471972e-07, | |
| "loss": 0.1842, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.4469787915166066, | |
| "grad_norm": 2.705953973970954, | |
| "learning_rate": 4.3051550683154804e-07, | |
| "loss": 0.173, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.4533813525410164, | |
| "grad_norm": 3.5200476993463976, | |
| "learning_rate": 4.2132870382019427e-07, | |
| "loss": 0.1268, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.4597839135654262, | |
| "grad_norm": 2.980462619255481, | |
| "learning_rate": 4.1221474770752696e-07, | |
| "loss": 0.1454, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.4661864745898359, | |
| "grad_norm": 3.0488417880599274, | |
| "learning_rate": 4.031747858174964e-07, | |
| "loss": 0.1573, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.4725890356142457, | |
| "grad_norm": 2.866558859217372, | |
| "learning_rate": 3.942099561591802e-07, | |
| "loss": 0.1354, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.4789915966386555, | |
| "grad_norm": 3.3643880310904017, | |
| "learning_rate": 3.853213872835228e-07, | |
| "loss": 0.1527, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.4853941576630652, | |
| "grad_norm": 2.3493487438896463, | |
| "learning_rate": 3.765101981412665e-07, | |
| "loss": 0.1483, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.491796718687475, | |
| "grad_norm": 2.482947067390714, | |
| "learning_rate": 3.677774979420903e-07, | |
| "loss": 0.1302, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.4981992797118848, | |
| "grad_norm": 2.599751282623492, | |
| "learning_rate": 3.5912438601497584e-07, | |
| "loss": 0.1522, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.5046018407362944, | |
| "grad_norm": 3.191973535201136, | |
| "learning_rate": 3.5055195166981646e-07, | |
| "loss": 0.1615, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.5110044017607043, | |
| "grad_norm": 3.15562179961526, | |
| "learning_rate": 3.420612740602874e-07, | |
| "loss": 0.1478, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.517406962785114, | |
| "grad_norm": 2.8880563361860823, | |
| "learning_rate": 3.3365342204799606e-07, | |
| "loss": 0.1397, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.5238095238095237, | |
| "grad_norm": 3.688725379316976, | |
| "learning_rate": 3.253294540679257e-07, | |
| "loss": 0.1875, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.5302120848339336, | |
| "grad_norm": 2.817507541571056, | |
| "learning_rate": 3.170904179951931e-07, | |
| "loss": 0.1216, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.5366146458583434, | |
| "grad_norm": 2.9412647550371975, | |
| "learning_rate": 3.0893735101313535e-07, | |
| "loss": 0.1398, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.543017206882753, | |
| "grad_norm": 2.762627387485447, | |
| "learning_rate": 3.008712794827426e-07, | |
| "loss": 0.1537, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.5494197679071628, | |
| "grad_norm": 2.8082508660384633, | |
| "learning_rate": 2.9289321881345254e-07, | |
| "loss": 0.1367, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.5558223289315727, | |
| "grad_norm": 3.1843132713343314, | |
| "learning_rate": 2.850041733353247e-07, | |
| "loss": 0.1451, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.5622248899559823, | |
| "grad_norm": 2.549326946513764, | |
| "learning_rate": 2.7720513617260855e-07, | |
| "loss": 0.1578, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.5686274509803921, | |
| "grad_norm": 2.035464821513073, | |
| "learning_rate": 2.6949708911872247e-07, | |
| "loss": 0.1271, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.575030012004802, | |
| "grad_norm": 2.494783934906446, | |
| "learning_rate": 2.6188100251265943e-07, | |
| "loss": 0.1446, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.5814325730292116, | |
| "grad_norm": 4.226886500584243, | |
| "learning_rate": 2.543578351168344e-07, | |
| "loss": 0.1417, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.5878351340536214, | |
| "grad_norm": 2.495677422705392, | |
| "learning_rate": 2.4692853399638913e-07, | |
| "loss": 0.1364, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.5942376950780313, | |
| "grad_norm": 3.1917365323900446, | |
| "learning_rate": 2.395940343999691e-07, | |
| "loss": 0.1626, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.6006402561024409, | |
| "grad_norm": 3.0344619777547868, | |
| "learning_rate": 2.3235525964198888e-07, | |
| "loss": 0.1623, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.6070428171268507, | |
| "grad_norm": 3.688983622501882, | |
| "learning_rate": 2.252131209863991e-07, | |
| "loss": 0.1654, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.6134453781512605, | |
| "grad_norm": 2.760266999319, | |
| "learning_rate": 2.181685175319702e-07, | |
| "loss": 0.1757, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.6198479391756702, | |
| "grad_norm": 3.3493718404784816, | |
| "learning_rate": 2.11222336099109e-07, | |
| "loss": 0.1877, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.62625050020008, | |
| "grad_norm": 2.7059015648474767, | |
| "learning_rate": 2.043754511182191e-07, | |
| "loss": 0.1302, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.6326530612244898, | |
| "grad_norm": 2.6710609583138636, | |
| "learning_rate": 1.9762872451962208e-07, | |
| "loss": 0.1376, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.6390556222488994, | |
| "grad_norm": 2.636600973542151, | |
| "learning_rate": 1.9098300562505264e-07, | |
| "loss": 0.1667, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.6454581832733093, | |
| "grad_norm": 2.7163444391021647, | |
| "learning_rate": 1.8443913104073982e-07, | |
| "loss": 0.121, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.6518607442977191, | |
| "grad_norm": 5.493549571701205, | |
| "learning_rate": 1.7799792455209016e-07, | |
| "loss": 0.1702, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.6582633053221287, | |
| "grad_norm": 2.795867026435785, | |
| "learning_rate": 1.716601970199836e-07, | |
| "loss": 0.1657, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.6646658663465386, | |
| "grad_norm": 2.697456578390499, | |
| "learning_rate": 1.6542674627869734e-07, | |
| "loss": 0.1108, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.6710684273709484, | |
| "grad_norm": 3.700105341120021, | |
| "learning_rate": 1.592983570354699e-07, | |
| "loss": 0.1526, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.677470988395358, | |
| "grad_norm": 2.8767911269966957, | |
| "learning_rate": 1.5327580077171588e-07, | |
| "loss": 0.1581, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.6838735494197679, | |
| "grad_norm": 2.8392553114328183, | |
| "learning_rate": 1.473598356459078e-07, | |
| "loss": 0.1387, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.6902761104441777, | |
| "grad_norm": 2.848243119062683, | |
| "learning_rate": 1.415512063981339e-07, | |
| "loss": 0.1352, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.6966786714685873, | |
| "grad_norm": 4.008263998101646, | |
| "learning_rate": 1.358506442563454e-07, | |
| "loss": 0.1717, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.7030812324929971, | |
| "grad_norm": 3.3644795533763747, | |
| "learning_rate": 1.3025886684430465e-07, | |
| "loss": 0.1851, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.709483793517407, | |
| "grad_norm": 2.1927113979713457, | |
| "learning_rate": 1.2477657809124632e-07, | |
| "loss": 0.1529, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.7158863545418166, | |
| "grad_norm": 4.20748377167738, | |
| "learning_rate": 1.19404468143262e-07, | |
| "loss": 0.1506, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.7222889155662267, | |
| "grad_norm": 2.6119161837028897, | |
| "learning_rate": 1.1414321327642019e-07, | |
| "loss": 0.1692, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.7286914765906363, | |
| "grad_norm": 3.25899049531815, | |
| "learning_rate": 1.089934758116322e-07, | |
| "loss": 0.1552, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.7350940376150459, | |
| "grad_norm": 2.4457520889715085, | |
| "learning_rate": 1.0395590403127486e-07, | |
| "loss": 0.1072, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.741496598639456, | |
| "grad_norm": 3.725879684312294, | |
| "learning_rate": 9.903113209758096e-08, | |
| "loss": 0.1297, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.7478991596638656, | |
| "grad_norm": 4.086760833693879, | |
| "learning_rate": 9.421977997280594e-08, | |
| "loss": 0.134, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.7543017206882752, | |
| "grad_norm": 2.8228633836720944, | |
| "learning_rate": 8.952245334118413e-08, | |
| "loss": 0.1429, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.7607042817126852, | |
| "grad_norm": 2.5768217353103906, | |
| "learning_rate": 8.493974353268019e-08, | |
| "loss": 0.138, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.7671068427370948, | |
| "grad_norm": 3.920988018065823, | |
| "learning_rate": 8.047222744854942e-08, | |
| "loss": 0.168, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.7735094037615045, | |
| "grad_norm": 2.665420920196723, | |
| "learning_rate": 7.612046748871326e-08, | |
| "loss": 0.1198, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.7799119647859145, | |
| "grad_norm": 2.84264633693513, | |
| "learning_rate": 7.188501148096116e-08, | |
| "loss": 0.1724, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.7863145258103241, | |
| "grad_norm": 2.7132283377188386, | |
| "learning_rate": 6.77663926119858e-08, | |
| "loss": 0.1369, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.7927170868347337, | |
| "grad_norm": 2.8632960787364494, | |
| "learning_rate": 6.376512936026279e-08, | |
| "loss": 0.1606, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.7991196478591438, | |
| "grad_norm": 2.738598278709088, | |
| "learning_rate": 5.988172543078096e-08, | |
| "loss": 0.1679, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.8055222088835534, | |
| "grad_norm": 2.311034901841601, | |
| "learning_rate": 5.611666969163242e-08, | |
| "loss": 0.1644, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.811924769907963, | |
| "grad_norm": 2.8470945573238837, | |
| "learning_rate": 5.2470436112471264e-08, | |
| "loss": 0.1817, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.818327330932373, | |
| "grad_norm": 2.9917534547915476, | |
| "learning_rate": 4.8943483704846465e-08, | |
| "loss": 0.1472, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.8247298919567827, | |
| "grad_norm": 2.6305242577097685, | |
| "learning_rate": 4.553625646441928e-08, | |
| "loss": 0.1289, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.8311324529811923, | |
| "grad_norm": 2.5380081332282014, | |
| "learning_rate": 4.224918331506955e-08, | |
| "loss": 0.1379, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.8375350140056024, | |
| "grad_norm": 3.110480752833179, | |
| "learning_rate": 3.908267805490051e-08, | |
| "loss": 0.1623, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.843937575030012, | |
| "grad_norm": 3.1789215828737074, | |
| "learning_rate": 3.6037139304146756e-08, | |
| "loss": 0.1441, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.8503401360544216, | |
| "grad_norm": 3.2716298643823927, | |
| "learning_rate": 3.3112950454993625e-08, | |
| "loss": 0.166, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.8567426970788317, | |
| "grad_norm": 4.5463007787733725, | |
| "learning_rate": 3.0310479623313125e-08, | |
| "loss": 0.1391, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.8631452581032413, | |
| "grad_norm": 3.0644978198504966, | |
| "learning_rate": 2.7630079602323443e-08, | |
| "loss": 0.1359, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.8695478191276511, | |
| "grad_norm": 2.915540706565416, | |
| "learning_rate": 2.507208781817638e-08, | |
| "loss": 0.1854, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.875950380152061, | |
| "grad_norm": 2.7471867583483958, | |
| "learning_rate": 2.263682628748087e-08, | |
| "loss": 0.1593, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.8823529411764706, | |
| "grad_norm": 3.3996282117269323, | |
| "learning_rate": 2.032460157676452e-08, | |
| "loss": 0.1655, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.8887555022008804, | |
| "grad_norm": 2.782949881418279, | |
| "learning_rate": 1.8135704763881598e-08, | |
| "loss": 0.1411, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.8951580632252902, | |
| "grad_norm": 2.817009087504296, | |
| "learning_rate": 1.607041140137033e-08, | |
| "loss": 0.1741, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.9015606242496998, | |
| "grad_norm": 2.514339605431768, | |
| "learning_rate": 1.4128981481764113e-08, | |
| "loss": 0.1639, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.9079631852741097, | |
| "grad_norm": 2.502723430413198, | |
| "learning_rate": 1.231165940486234e-08, | |
| "loss": 0.1458, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.9143657462985195, | |
| "grad_norm": 2.9756989992004, | |
| "learning_rate": 1.0618673946963364e-08, | |
| "loss": 0.1899, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.9207683073229291, | |
| "grad_norm": 3.029713811900019, | |
| "learning_rate": 9.050238232065299e-09, | |
| "loss": 0.1194, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.927170868347339, | |
| "grad_norm": 2.8129240654015555, | |
| "learning_rate": 7.606549705035935e-09, | |
| "loss": 0.1564, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.9335734293717488, | |
| "grad_norm": 3.5641124301233904, | |
| "learning_rate": 6.2877901067573955e-09, | |
| "loss": 0.1272, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.9399759903961584, | |
| "grad_norm": 2.5825170270283024, | |
| "learning_rate": 5.094125451247655e-09, | |
| "loss": 0.1864, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.9463785514205683, | |
| "grad_norm": 2.75200131179709, | |
| "learning_rate": 4.025706004760931e-09, | |
| "loss": 0.1568, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.952781112444978, | |
| "grad_norm": 3.1581337599381527, | |
| "learning_rate": 3.082666266872036e-09, | |
| "loss": 0.1256, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.9591836734693877, | |
| "grad_norm": 3.4365108386154786, | |
| "learning_rate": 2.2651249535439177e-09, | |
| "loss": 0.1516, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.9655862344937975, | |
| "grad_norm": 2.552383623057842, | |
| "learning_rate": 1.5731849821833953e-09, | |
| "loss": 0.1362, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.9719887955182074, | |
| "grad_norm": 2.5409532995249657, | |
| "learning_rate": 1.0069334586854105e-09, | |
| "loss": 0.1238, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.978391356542617, | |
| "grad_norm": 4.158643521191336, | |
| "learning_rate": 5.664416664666882e-10, | |
| "loss": 0.1615, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.9847939175670268, | |
| "grad_norm": 3.0805263918986983, | |
| "learning_rate": 2.517650574934693e-10, | |
| "loss": 0.1513, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.9911964785914367, | |
| "grad_norm": 3.1343235054789926, | |
| "learning_rate": 6.29432452994294e-11, | |
| "loss": 0.1493, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.9975990396158463, | |
| "grad_norm": 4.792054679066387, | |
| "learning_rate": 0.0, | |
| "loss": 0.167, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.9975990396158463, | |
| "step": 312, | |
| "total_flos": 211327818203136.0, | |
| "train_loss": 0.17435487460058469, | |
| "train_runtime": 7374.5307, | |
| "train_samples_per_second": 10.842, | |
| "train_steps_per_second": 0.042 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 312, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 211327818203136.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |