{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.733952049497293, "eval_steps": 500, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025779840164990978, "grad_norm": 0.5347822308540344, "learning_rate": 0.0002, "loss": 1.4509, "step": 100 }, { "epoch": 0.051559680329981955, "grad_norm": 0.4712078273296356, "learning_rate": 0.0002, "loss": 1.1744, "step": 200 }, { "epoch": 0.07733952049497293, "grad_norm": 0.5031601786613464, "learning_rate": 0.0002, "loss": 1.096, "step": 300 }, { "epoch": 0.10311936065996391, "grad_norm": 0.49241065979003906, "learning_rate": 0.0002, "loss": 0.9847, "step": 400 }, { "epoch": 0.12889920082495487, "grad_norm": 0.9957050681114197, "learning_rate": 0.0002, "loss": 0.9928, "step": 500 }, { "epoch": 0.15467904098994587, "grad_norm": 0.38163048028945923, "learning_rate": 0.0002, "loss": 0.9008, "step": 600 }, { "epoch": 0.18045888115493683, "grad_norm": 0.4322434663772583, "learning_rate": 0.0002, "loss": 0.9108, "step": 700 }, { "epoch": 0.20623872131992782, "grad_norm": 0.4072737395763397, "learning_rate": 0.0002, "loss": 0.8713, "step": 800 }, { "epoch": 0.23201856148491878, "grad_norm": 0.5637839436531067, "learning_rate": 0.0002, "loss": 0.8538, "step": 900 }, { "epoch": 0.25779840164990975, "grad_norm": 0.6094131469726562, "learning_rate": 0.0002, "loss": 0.8154, "step": 1000 }, { "epoch": 0.28357824181490077, "grad_norm": 0.4212701618671417, "learning_rate": 0.0002, "loss": 0.7897, "step": 1100 }, { "epoch": 0.30935808197989173, "grad_norm": 0.4663824737071991, "learning_rate": 0.0002, "loss": 0.8021, "step": 1200 }, { "epoch": 0.3351379221448827, "grad_norm": 0.3774861693382263, "learning_rate": 0.0002, "loss": 0.7452, "step": 1300 }, { "epoch": 0.36091776230987366, "grad_norm": 0.19446992874145508, "learning_rate": 0.0002, "loss": 0.737, "step": 1400 }, { "epoch": 0.3866976024748647, "grad_norm": 0.25984033942222595, "learning_rate": 0.0002, "loss": 0.6966, "step": 1500 }, { "epoch": 0.41247744263985564, "grad_norm": 0.3495163023471832, "learning_rate": 0.0002, "loss": 0.7179, "step": 1600 }, { "epoch": 0.4382572828048466, "grad_norm": 0.5092929601669312, "learning_rate": 0.0002, "loss": 0.7132, "step": 1700 }, { "epoch": 0.46403712296983757, "grad_norm": 0.16095790266990662, "learning_rate": 0.0002, "loss": 0.6652, "step": 1800 }, { "epoch": 0.4898169631348286, "grad_norm": 0.38502034544944763, "learning_rate": 0.0002, "loss": 0.6564, "step": 1900 }, { "epoch": 0.5155968032998195, "grad_norm": 0.3100506067276001, "learning_rate": 0.0002, "loss": 0.6082, "step": 2000 }, { "epoch": 0.5413766434648105, "grad_norm": 0.4585016965866089, "learning_rate": 0.0002, "loss": 0.6491, "step": 2100 }, { "epoch": 0.5671564836298015, "grad_norm": 0.35394927859306335, "learning_rate": 0.0002, "loss": 0.6136, "step": 2200 }, { "epoch": 0.5929363237947924, "grad_norm": 0.4828909933567047, "learning_rate": 0.0002, "loss": 0.5639, "step": 2300 }, { "epoch": 0.6187161639597835, "grad_norm": 0.7377568483352661, "learning_rate": 0.0002, "loss": 0.5998, "step": 2400 }, { "epoch": 0.6444960041247745, "grad_norm": 0.33992356061935425, "learning_rate": 0.0002, "loss": 0.5535, "step": 2500 }, { "epoch": 0.6702758442897654, "grad_norm": 0.40880173444747925, "learning_rate": 0.0002, "loss": 0.5839, "step": 2600 }, { "epoch": 0.6960556844547564, "grad_norm": 0.6135886907577515, "learning_rate": 0.0002, "loss": 0.5697, "step": 2700 }, { "epoch": 0.7218355246197473, "grad_norm": 0.14242181181907654, "learning_rate": 0.0002, "loss": 0.562, "step": 2800 }, { "epoch": 0.7476153647847383, "grad_norm": 0.1636349856853485, "learning_rate": 0.0002, "loss": 0.5301, "step": 2900 }, { "epoch": 0.7733952049497294, "grad_norm": 0.5300703644752502, "learning_rate": 0.0002, "loss": 0.5428, "step": 3000 }, { "epoch": 0.7991750451147203, "grad_norm": 0.2816906273365021, "learning_rate": 0.0002, "loss": 0.5319, "step": 3100 }, { "epoch": 0.8249548852797113, "grad_norm": 0.4165875315666199, "learning_rate": 0.0002, "loss": 0.5073, "step": 3200 }, { "epoch": 0.8507347254447022, "grad_norm": 0.46957316994667053, "learning_rate": 0.0002, "loss": 0.4973, "step": 3300 }, { "epoch": 0.8765145656096932, "grad_norm": 0.22382797300815582, "learning_rate": 0.0002, "loss": 0.5091, "step": 3400 }, { "epoch": 0.9022944057746842, "grad_norm": 0.517814576625824, "learning_rate": 0.0002, "loss": 0.4879, "step": 3500 }, { "epoch": 0.9280742459396751, "grad_norm": 0.44171011447906494, "learning_rate": 0.0002, "loss": 0.4711, "step": 3600 }, { "epoch": 0.9538540861046662, "grad_norm": 0.3107047379016876, "learning_rate": 0.0002, "loss": 0.465, "step": 3700 }, { "epoch": 0.9796339262696572, "grad_norm": 0.09984863549470901, "learning_rate": 0.0002, "loss": 0.4485, "step": 3800 }, { "epoch": 1.005413766434648, "grad_norm": 0.43100592494010925, "learning_rate": 0.0002, "loss": 0.4752, "step": 3900 }, { "epoch": 1.031193606599639, "grad_norm": 0.5259262919425964, "learning_rate": 0.0002, "loss": 0.3621, "step": 4000 }, { "epoch": 1.0569734467646301, "grad_norm": 0.47033509612083435, "learning_rate": 0.0002, "loss": 0.3569, "step": 4100 }, { "epoch": 1.082753286929621, "grad_norm": 0.5318751931190491, "learning_rate": 0.0002, "loss": 0.3512, "step": 4200 }, { "epoch": 1.108533127094612, "grad_norm": 0.5434057116508484, "learning_rate": 0.0002, "loss": 0.3504, "step": 4300 }, { "epoch": 1.134312967259603, "grad_norm": 0.47843560576438904, "learning_rate": 0.0002, "loss": 0.3712, "step": 4400 }, { "epoch": 1.160092807424594, "grad_norm": 0.5956776142120361, "learning_rate": 0.0002, "loss": 0.3511, "step": 4500 }, { "epoch": 1.1858726475895849, "grad_norm": 0.5072950720787048, "learning_rate": 0.0002, "loss": 0.3445, "step": 4600 }, { "epoch": 1.211652487754576, "grad_norm": 0.5608052611351013, "learning_rate": 0.0002, "loss": 0.3377, "step": 4700 }, { "epoch": 1.237432327919567, "grad_norm": 0.474223256111145, "learning_rate": 0.0002, "loss": 0.3276, "step": 4800 }, { "epoch": 1.2632121680845578, "grad_norm": 0.5215118527412415, "learning_rate": 0.0002, "loss": 0.3375, "step": 4900 }, { "epoch": 1.288992008249549, "grad_norm": 0.3922516405582428, "learning_rate": 0.0002, "loss": 0.342, "step": 5000 }, { "epoch": 1.3147718484145399, "grad_norm": 0.4958643615245819, "learning_rate": 0.0002, "loss": 0.3553, "step": 5100 }, { "epoch": 1.3405516885795308, "grad_norm": 0.564983069896698, "learning_rate": 0.0002, "loss": 0.3389, "step": 5200 }, { "epoch": 1.3663315287445217, "grad_norm": 0.5662856698036194, "learning_rate": 0.0002, "loss": 0.3382, "step": 5300 }, { "epoch": 1.3921113689095128, "grad_norm": 0.5040738582611084, "learning_rate": 0.0002, "loss": 0.3408, "step": 5400 }, { "epoch": 1.4178912090745037, "grad_norm": 0.27346768975257874, "learning_rate": 0.0002, "loss": 0.3266, "step": 5500 }, { "epoch": 1.4436710492394949, "grad_norm": 0.5055024027824402, "learning_rate": 0.0002, "loss": 0.3561, "step": 5600 }, { "epoch": 1.4694508894044858, "grad_norm": 0.5442714691162109, "learning_rate": 0.0002, "loss": 0.3241, "step": 5700 }, { "epoch": 1.4952307295694767, "grad_norm": 0.4862806499004364, "learning_rate": 0.0002, "loss": 0.344, "step": 5800 }, { "epoch": 1.5210105697344676, "grad_norm": 0.6346714496612549, "learning_rate": 0.0002, "loss": 0.3195, "step": 5900 }, { "epoch": 1.5467904098994585, "grad_norm": 0.5846338272094727, "learning_rate": 0.0002, "loss": 0.3232, "step": 6000 }, { "epoch": 1.5725702500644496, "grad_norm": 0.41255345940589905, "learning_rate": 0.0002, "loss": 0.3379, "step": 6100 }, { "epoch": 1.5983500902294405, "grad_norm": 0.6396617293357849, "learning_rate": 0.0002, "loss": 0.3099, "step": 6200 }, { "epoch": 1.6241299303944317, "grad_norm": 0.3450670540332794, "learning_rate": 0.0002, "loss": 0.3129, "step": 6300 }, { "epoch": 1.6499097705594226, "grad_norm": 0.30461055040359497, "learning_rate": 0.0002, "loss": 0.2978, "step": 6400 }, { "epoch": 1.6756896107244135, "grad_norm": 0.4209739863872528, "learning_rate": 0.0002, "loss": 0.3323, "step": 6500 }, { "epoch": 1.7014694508894044, "grad_norm": 0.3296062648296356, "learning_rate": 0.0002, "loss": 0.3047, "step": 6600 }, { "epoch": 1.7272492910543955, "grad_norm": 0.9009484648704529, "learning_rate": 0.0002, "loss": 0.3046, "step": 6700 }, { "epoch": 1.7530291312193864, "grad_norm": 0.7505986094474792, "learning_rate": 0.0002, "loss": 0.3123, "step": 6800 }, { "epoch": 1.7788089713843775, "grad_norm": 0.3542492389678955, "learning_rate": 0.0002, "loss": 0.3259, "step": 6900 }, { "epoch": 1.8045888115493685, "grad_norm": 0.4935378432273865, "learning_rate": 0.0002, "loss": 0.3262, "step": 7000 }, { "epoch": 1.8303686517143594, "grad_norm": 0.3000539541244507, "learning_rate": 0.0002, "loss": 0.2887, "step": 7100 }, { "epoch": 1.8561484918793503, "grad_norm": 0.2680779695510864, "learning_rate": 0.0002, "loss": 0.3108, "step": 7200 }, { "epoch": 1.8819283320443412, "grad_norm": 0.5922934412956238, "learning_rate": 0.0002, "loss": 0.3211, "step": 7300 }, { "epoch": 1.9077081722093323, "grad_norm": 0.38349688053131104, "learning_rate": 0.0002, "loss": 0.316, "step": 7400 }, { "epoch": 1.9334880123743234, "grad_norm": 0.7654793858528137, "learning_rate": 0.0002, "loss": 0.3111, "step": 7500 }, { "epoch": 1.9592678525393143, "grad_norm": 0.2399352639913559, "learning_rate": 0.0002, "loss": 0.3042, "step": 7600 }, { "epoch": 1.9850476927043053, "grad_norm": 0.42787912487983704, "learning_rate": 0.0002, "loss": 0.2928, "step": 7700 }, { "epoch": 2.010827532869296, "grad_norm": 0.4771544933319092, "learning_rate": 0.0002, "loss": 0.2487, "step": 7800 }, { "epoch": 2.036607373034287, "grad_norm": 0.6133277416229248, "learning_rate": 0.0002, "loss": 0.2219, "step": 7900 }, { "epoch": 2.062387213199278, "grad_norm": 0.43137651681900024, "learning_rate": 0.0002, "loss": 0.2158, "step": 8000 }, { "epoch": 2.0881670533642693, "grad_norm": 0.41038885712623596, "learning_rate": 0.0002, "loss": 0.2127, "step": 8100 }, { "epoch": 2.1139468935292602, "grad_norm": 0.351235568523407, "learning_rate": 0.0002, "loss": 0.2185, "step": 8200 }, { "epoch": 2.139726733694251, "grad_norm": 0.41089433431625366, "learning_rate": 0.0002, "loss": 0.2346, "step": 8300 }, { "epoch": 2.165506573859242, "grad_norm": 0.3464137613773346, "learning_rate": 0.0002, "loss": 0.2273, "step": 8400 }, { "epoch": 2.191286414024233, "grad_norm": 0.2753762900829315, "learning_rate": 0.0002, "loss": 0.2359, "step": 8500 }, { "epoch": 2.217066254189224, "grad_norm": 0.3630015552043915, "learning_rate": 0.0002, "loss": 0.2351, "step": 8600 }, { "epoch": 2.2428460943542152, "grad_norm": 0.5501378178596497, "learning_rate": 0.0002, "loss": 0.2273, "step": 8700 }, { "epoch": 2.268625934519206, "grad_norm": 0.31958362460136414, "learning_rate": 0.0002, "loss": 0.2306, "step": 8800 }, { "epoch": 2.294405774684197, "grad_norm": 0.4495809078216553, "learning_rate": 0.0002, "loss": 0.2283, "step": 8900 }, { "epoch": 2.320185614849188, "grad_norm": 0.45789313316345215, "learning_rate": 0.0002, "loss": 0.2191, "step": 9000 }, { "epoch": 2.345965455014179, "grad_norm": 0.2430783361196518, "learning_rate": 0.0002, "loss": 0.2266, "step": 9100 }, { "epoch": 2.3717452951791698, "grad_norm": 0.512585461139679, "learning_rate": 0.0002, "loss": 0.2293, "step": 9200 }, { "epoch": 2.3975251353441607, "grad_norm": 0.42088598012924194, "learning_rate": 0.0002, "loss": 0.2388, "step": 9300 }, { "epoch": 2.423304975509152, "grad_norm": 0.4196650981903076, "learning_rate": 0.0002, "loss": 0.2305, "step": 9400 }, { "epoch": 2.449084815674143, "grad_norm": 0.45856234431266785, "learning_rate": 0.0002, "loss": 0.2294, "step": 9500 }, { "epoch": 2.474864655839134, "grad_norm": 0.5690295100212097, "learning_rate": 0.0002, "loss": 0.2237, "step": 9600 }, { "epoch": 2.5006444960041248, "grad_norm": 0.5325428247451782, "learning_rate": 0.0002, "loss": 0.2125, "step": 9700 }, { "epoch": 2.5264243361691157, "grad_norm": 0.4254339933395386, "learning_rate": 0.0002, "loss": 0.2335, "step": 9800 }, { "epoch": 2.5522041763341066, "grad_norm": 0.44463545083999634, "learning_rate": 0.0002, "loss": 0.2247, "step": 9900 }, { "epoch": 2.577984016499098, "grad_norm": 0.4192294776439667, "learning_rate": 0.0002, "loss": 0.2328, "step": 10000 }, { "epoch": 2.603763856664089, "grad_norm": 0.39080777764320374, "learning_rate": 0.0002, "loss": 0.2229, "step": 10100 }, { "epoch": 2.6295436968290797, "grad_norm": 0.3375299870967865, "learning_rate": 0.0002, "loss": 0.2374, "step": 10200 }, { "epoch": 2.6553235369940706, "grad_norm": 0.6126553416252136, "learning_rate": 0.0002, "loss": 0.2283, "step": 10300 }, { "epoch": 2.6811033771590616, "grad_norm": 0.21654823422431946, "learning_rate": 0.0002, "loss": 0.2265, "step": 10400 }, { "epoch": 2.7068832173240525, "grad_norm": 0.41668832302093506, "learning_rate": 0.0002, "loss": 0.2267, "step": 10500 }, { "epoch": 2.7326630574890434, "grad_norm": 0.5655872225761414, "learning_rate": 0.0002, "loss": 0.2331, "step": 10600 }, { "epoch": 2.7584428976540343, "grad_norm": 0.49956533312797546, "learning_rate": 0.0002, "loss": 0.2323, "step": 10700 }, { "epoch": 2.7842227378190256, "grad_norm": 0.4230547547340393, "learning_rate": 0.0002, "loss": 0.2157, "step": 10800 }, { "epoch": 2.8100025779840165, "grad_norm": 0.5253151655197144, "learning_rate": 0.0002, "loss": 0.2189, "step": 10900 }, { "epoch": 2.8357824181490074, "grad_norm": 0.3807348906993866, "learning_rate": 0.0002, "loss": 0.2285, "step": 11000 }, { "epoch": 2.8615622583139984, "grad_norm": 0.6454833149909973, "learning_rate": 0.0002, "loss": 0.228, "step": 11100 }, { "epoch": 2.8873420984789897, "grad_norm": 0.2508118450641632, "learning_rate": 0.0002, "loss": 0.2139, "step": 11200 }, { "epoch": 2.9131219386439806, "grad_norm": 0.32768428325653076, "learning_rate": 0.0002, "loss": 0.2206, "step": 11300 }, { "epoch": 2.9389017788089715, "grad_norm": 0.4850573241710663, "learning_rate": 0.0002, "loss": 0.2235, "step": 11400 }, { "epoch": 2.9646816189739624, "grad_norm": 0.6089478135108948, "learning_rate": 0.0002, "loss": 0.2081, "step": 11500 }, { "epoch": 2.9904614591389533, "grad_norm": 0.47153401374816895, "learning_rate": 0.0002, "loss": 0.2463, "step": 11600 }, { "epoch": 3.0162412993039442, "grad_norm": 0.3843853771686554, "learning_rate": 0.0002, "loss": 0.1911, "step": 11700 }, { "epoch": 3.042021139468935, "grad_norm": 0.21224769949913025, "learning_rate": 0.0002, "loss": 0.1753, "step": 11800 }, { "epoch": 3.067800979633926, "grad_norm": 0.3223534822463989, "learning_rate": 0.0002, "loss": 0.1799, "step": 11900 }, { "epoch": 3.0935808197989174, "grad_norm": 0.399443656206131, "learning_rate": 0.0002, "loss": 0.1755, "step": 12000 }, { "epoch": 3.1193606599639083, "grad_norm": 0.253034770488739, "learning_rate": 0.0002, "loss": 0.177, "step": 12100 }, { "epoch": 3.1451405001288992, "grad_norm": 0.318568617105484, "learning_rate": 0.0002, "loss": 0.1772, "step": 12200 }, { "epoch": 3.17092034029389, "grad_norm": 0.2624630928039551, "learning_rate": 0.0002, "loss": 0.1876, "step": 12300 }, { "epoch": 3.196700180458881, "grad_norm": 0.46422523260116577, "learning_rate": 0.0002, "loss": 0.1717, "step": 12400 }, { "epoch": 3.222480020623872, "grad_norm": 0.4504973888397217, "learning_rate": 0.0002, "loss": 0.1862, "step": 12500 }, { "epoch": 3.2482598607888633, "grad_norm": 0.44676682353019714, "learning_rate": 0.0002, "loss": 0.1865, "step": 12600 }, { "epoch": 3.274039700953854, "grad_norm": 0.44682949781417847, "learning_rate": 0.0002, "loss": 0.1797, "step": 12700 }, { "epoch": 3.299819541118845, "grad_norm": 0.22240401804447174, "learning_rate": 0.0002, "loss": 0.1823, "step": 12800 }, { "epoch": 3.325599381283836, "grad_norm": 0.3457636535167694, "learning_rate": 0.0002, "loss": 0.1839, "step": 12900 }, { "epoch": 3.351379221448827, "grad_norm": 0.5065191388130188, "learning_rate": 0.0002, "loss": 0.1823, "step": 13000 }, { "epoch": 3.377159061613818, "grad_norm": 0.516930341720581, "learning_rate": 0.0002, "loss": 0.1812, "step": 13100 }, { "epoch": 3.4029389017788088, "grad_norm": 0.5823391079902649, "learning_rate": 0.0002, "loss": 0.1851, "step": 13200 }, { "epoch": 3.4287187419438, "grad_norm": 0.4604497253894806, "learning_rate": 0.0002, "loss": 0.1897, "step": 13300 }, { "epoch": 3.454498582108791, "grad_norm": 0.3871957063674927, "learning_rate": 0.0002, "loss": 0.1778, "step": 13400 }, { "epoch": 3.480278422273782, "grad_norm": 0.40806278586387634, "learning_rate": 0.0002, "loss": 0.1854, "step": 13500 }, { "epoch": 3.506058262438773, "grad_norm": 0.24849525094032288, "learning_rate": 0.0002, "loss": 0.1825, "step": 13600 }, { "epoch": 3.5318381026037637, "grad_norm": 0.28265008330345154, "learning_rate": 0.0002, "loss": 0.1914, "step": 13700 }, { "epoch": 3.557617942768755, "grad_norm": 0.18643364310264587, "learning_rate": 0.0002, "loss": 0.1728, "step": 13800 }, { "epoch": 3.583397782933746, "grad_norm": 0.36125150322914124, "learning_rate": 0.0002, "loss": 0.184, "step": 13900 }, { "epoch": 3.609177623098737, "grad_norm": 0.35003572702407837, "learning_rate": 0.0002, "loss": 0.1834, "step": 14000 }, { "epoch": 3.634957463263728, "grad_norm": 0.29175901412963867, "learning_rate": 0.0002, "loss": 0.1845, "step": 14100 }, { "epoch": 3.6607373034287187, "grad_norm": 0.37868496775627136, "learning_rate": 0.0002, "loss": 0.1893, "step": 14200 }, { "epoch": 3.6865171435937096, "grad_norm": 0.3279033899307251, "learning_rate": 0.0002, "loss": 0.1908, "step": 14300 }, { "epoch": 3.7122969837587005, "grad_norm": 0.31007370352745056, "learning_rate": 0.0002, "loss": 0.1832, "step": 14400 }, { "epoch": 3.7380768239236914, "grad_norm": 0.298289030790329, "learning_rate": 0.0002, "loss": 0.1948, "step": 14500 }, { "epoch": 3.763856664088683, "grad_norm": 0.6039551496505737, "learning_rate": 0.0002, "loss": 0.1828, "step": 14600 }, { "epoch": 3.7896365042536737, "grad_norm": 0.449587345123291, "learning_rate": 0.0002, "loss": 0.1891, "step": 14700 }, { "epoch": 3.8154163444186646, "grad_norm": 0.6465901136398315, "learning_rate": 0.0002, "loss": 0.1895, "step": 14800 }, { "epoch": 3.8411961845836555, "grad_norm": 0.5226249098777771, "learning_rate": 0.0002, "loss": 0.1767, "step": 14900 }, { "epoch": 3.8669760247486464, "grad_norm": 0.29470816254615784, "learning_rate": 0.0002, "loss": 0.1958, "step": 15000 }, { "epoch": 3.892755864913638, "grad_norm": 0.4997386336326599, "learning_rate": 0.0002, "loss": 0.1984, "step": 15100 }, { "epoch": 3.9185357050786287, "grad_norm": 0.35381177067756653, "learning_rate": 0.0002, "loss": 0.1839, "step": 15200 }, { "epoch": 3.9443155452436196, "grad_norm": 0.29231759905815125, "learning_rate": 0.0002, "loss": 0.1812, "step": 15300 }, { "epoch": 3.9700953854086105, "grad_norm": 0.40497833490371704, "learning_rate": 0.0002, "loss": 0.1798, "step": 15400 }, { "epoch": 3.9958752255736014, "grad_norm": 0.1775328516960144, "learning_rate": 0.0002, "loss": 0.1931, "step": 15500 }, { "epoch": 4.021655065738592, "grad_norm": 0.2625548243522644, "learning_rate": 0.0002, "loss": 0.1513, "step": 15600 }, { "epoch": 4.047434905903583, "grad_norm": 0.47476592659950256, "learning_rate": 0.0002, "loss": 0.1607, "step": 15700 }, { "epoch": 4.073214746068574, "grad_norm": 0.4454491138458252, "learning_rate": 0.0002, "loss": 0.1529, "step": 15800 }, { "epoch": 4.098994586233565, "grad_norm": 0.12239188700914383, "learning_rate": 0.0002, "loss": 0.1539, "step": 15900 }, { "epoch": 4.124774426398556, "grad_norm": 0.2339598536491394, "learning_rate": 0.0002, "loss": 0.1572, "step": 16000 }, { "epoch": 4.150554266563548, "grad_norm": 0.19658803939819336, "learning_rate": 0.0002, "loss": 0.1571, "step": 16100 }, { "epoch": 4.176334106728539, "grad_norm": 0.25842776894569397, "learning_rate": 0.0002, "loss": 0.155, "step": 16200 }, { "epoch": 4.20211394689353, "grad_norm": 0.4655442535877228, "learning_rate": 0.0002, "loss": 0.1584, "step": 16300 }, { "epoch": 4.2278937870585205, "grad_norm": 0.3778013586997986, "learning_rate": 0.0002, "loss": 0.1587, "step": 16400 }, { "epoch": 4.253673627223511, "grad_norm": 0.22199797630310059, "learning_rate": 0.0002, "loss": 0.1573, "step": 16500 }, { "epoch": 4.279453467388502, "grad_norm": 0.23724961280822754, "learning_rate": 0.0002, "loss": 0.1649, "step": 16600 }, { "epoch": 4.305233307553493, "grad_norm": 0.4558769166469574, "learning_rate": 0.0002, "loss": 0.1633, "step": 16700 }, { "epoch": 4.331013147718484, "grad_norm": 0.27720391750335693, "learning_rate": 0.0002, "loss": 0.1613, "step": 16800 }, { "epoch": 4.356792987883475, "grad_norm": 0.3628349304199219, "learning_rate": 0.0002, "loss": 0.16, "step": 16900 }, { "epoch": 4.382572828048466, "grad_norm": 0.6290438175201416, "learning_rate": 0.0002, "loss": 0.1658, "step": 17000 }, { "epoch": 4.408352668213457, "grad_norm": 0.14983007311820984, "learning_rate": 0.0002, "loss": 0.1629, "step": 17100 }, { "epoch": 4.434132508378448, "grad_norm": 0.30865323543548584, "learning_rate": 0.0002, "loss": 0.1603, "step": 17200 }, { "epoch": 4.459912348543439, "grad_norm": 0.5674950480461121, "learning_rate": 0.0002, "loss": 0.1674, "step": 17300 }, { "epoch": 4.4856921887084305, "grad_norm": 0.40429455041885376, "learning_rate": 0.0002, "loss": 0.1677, "step": 17400 }, { "epoch": 4.511472028873421, "grad_norm": 0.27213749289512634, "learning_rate": 0.0002, "loss": 0.1642, "step": 17500 }, { "epoch": 4.537251869038412, "grad_norm": 0.40964949131011963, "learning_rate": 0.0002, "loss": 0.1626, "step": 17600 }, { "epoch": 4.563031709203403, "grad_norm": 0.3955250382423401, "learning_rate": 0.0002, "loss": 0.1564, "step": 17700 }, { "epoch": 4.588811549368394, "grad_norm": 0.3900775611400604, "learning_rate": 0.0002, "loss": 0.1605, "step": 17800 }, { "epoch": 4.614591389533385, "grad_norm": 0.2436327487230301, "learning_rate": 0.0002, "loss": 0.1603, "step": 17900 }, { "epoch": 4.640371229698376, "grad_norm": 0.4188991189002991, "learning_rate": 0.0002, "loss": 0.163, "step": 18000 }, { "epoch": 4.666151069863367, "grad_norm": 0.15686850249767303, "learning_rate": 0.0002, "loss": 0.1656, "step": 18100 }, { "epoch": 4.691930910028358, "grad_norm": 0.30334389209747314, "learning_rate": 0.0002, "loss": 0.1612, "step": 18200 }, { "epoch": 4.717710750193349, "grad_norm": 0.33619073033332825, "learning_rate": 0.0002, "loss": 0.1626, "step": 18300 }, { "epoch": 4.7434905903583395, "grad_norm": 0.20497629046440125, "learning_rate": 0.0002, "loss": 0.1647, "step": 18400 }, { "epoch": 4.76927043052333, "grad_norm": 0.20428726077079773, "learning_rate": 0.0002, "loss": 0.1726, "step": 18500 }, { "epoch": 4.795050270688321, "grad_norm": 0.3606746196746826, "learning_rate": 0.0002, "loss": 0.1638, "step": 18600 }, { "epoch": 4.820830110853313, "grad_norm": 0.3441687226295471, "learning_rate": 0.0002, "loss": 0.1676, "step": 18700 }, { "epoch": 4.846609951018304, "grad_norm": 0.3479159474372864, "learning_rate": 0.0002, "loss": 0.1654, "step": 18800 }, { "epoch": 4.872389791183295, "grad_norm": 0.39751461148262024, "learning_rate": 0.0002, "loss": 0.1592, "step": 18900 }, { "epoch": 4.898169631348286, "grad_norm": 0.1793346256017685, "learning_rate": 0.0002, "loss": 0.1683, "step": 19000 }, { "epoch": 4.923949471513277, "grad_norm": 0.100714772939682, "learning_rate": 0.0002, "loss": 0.1592, "step": 19100 }, { "epoch": 4.949729311678268, "grad_norm": 0.6268895864486694, "learning_rate": 0.0002, "loss": 0.1667, "step": 19200 }, { "epoch": 4.975509151843259, "grad_norm": 0.32232895493507385, "learning_rate": 0.0002, "loss": 0.1615, "step": 19300 }, { "epoch": 5.0012889920082495, "grad_norm": 0.3094789683818817, "learning_rate": 0.0002, "loss": 0.1648, "step": 19400 }, { "epoch": 5.02706883217324, "grad_norm": 0.3806459307670593, "learning_rate": 0.0002, "loss": 0.149, "step": 19500 }, { "epoch": 5.052848672338231, "grad_norm": 0.28195375204086304, "learning_rate": 0.0002, "loss": 0.1409, "step": 19600 }, { "epoch": 5.078628512503222, "grad_norm": 0.1819002479314804, "learning_rate": 0.0002, "loss": 0.1403, "step": 19700 }, { "epoch": 5.104408352668213, "grad_norm": 0.27728572487831116, "learning_rate": 0.0002, "loss": 0.1426, "step": 19800 }, { "epoch": 5.130188192833204, "grad_norm": 0.21889761090278625, "learning_rate": 0.0002, "loss": 0.1499, "step": 19900 }, { "epoch": 5.155968032998196, "grad_norm": 0.3974555432796478, "learning_rate": 0.0002, "loss": 0.1427, "step": 20000 }, { "epoch": 5.181747873163187, "grad_norm": 0.48159608244895935, "learning_rate": 0.0002, "loss": 0.1477, "step": 20100 }, { "epoch": 5.207527713328178, "grad_norm": 0.3865210711956024, "learning_rate": 0.0002, "loss": 0.1424, "step": 20200 }, { "epoch": 5.233307553493169, "grad_norm": 0.26485195755958557, "learning_rate": 0.0002, "loss": 0.1486, "step": 20300 }, { "epoch": 5.2590873936581595, "grad_norm": 0.41939619183540344, "learning_rate": 0.0002, "loss": 0.151, "step": 20400 }, { "epoch": 5.28486723382315, "grad_norm": 0.3483380973339081, "learning_rate": 0.0002, "loss": 0.1475, "step": 20500 }, { "epoch": 5.310647073988141, "grad_norm": 0.40975695848464966, "learning_rate": 0.0002, "loss": 0.1461, "step": 20600 }, { "epoch": 5.336426914153132, "grad_norm": 0.27101436257362366, "learning_rate": 0.0002, "loss": 0.1528, "step": 20700 }, { "epoch": 5.362206754318123, "grad_norm": 0.27852606773376465, "learning_rate": 0.0002, "loss": 0.1484, "step": 20800 }, { "epoch": 5.387986594483114, "grad_norm": 0.4176689684391022, "learning_rate": 0.0002, "loss": 0.1485, "step": 20900 }, { "epoch": 5.413766434648105, "grad_norm": 0.4901387691497803, "learning_rate": 0.0002, "loss": 0.1479, "step": 21000 }, { "epoch": 5.439546274813096, "grad_norm": 0.33768975734710693, "learning_rate": 0.0002, "loss": 0.15, "step": 21100 }, { "epoch": 5.465326114978087, "grad_norm": 0.5349870324134827, "learning_rate": 0.0002, "loss": 0.1485, "step": 21200 }, { "epoch": 5.4911059551430785, "grad_norm": 0.24405865371227264, "learning_rate": 0.0002, "loss": 0.146, "step": 21300 }, { "epoch": 5.516885795308069, "grad_norm": 0.2870001494884491, "learning_rate": 0.0002, "loss": 0.1482, "step": 21400 }, { "epoch": 5.54266563547306, "grad_norm": 0.34606364369392395, "learning_rate": 0.0002, "loss": 0.1535, "step": 21500 }, { "epoch": 5.568445475638051, "grad_norm": 0.4999238848686218, "learning_rate": 0.0002, "loss": 0.1523, "step": 21600 }, { "epoch": 5.594225315803042, "grad_norm": 0.2526559829711914, "learning_rate": 0.0002, "loss": 0.1524, "step": 21700 }, { "epoch": 5.620005155968033, "grad_norm": 0.270786315202713, "learning_rate": 0.0002, "loss": 0.1511, "step": 21800 }, { "epoch": 5.645784996133024, "grad_norm": 0.4440493881702423, "learning_rate": 0.0002, "loss": 0.1539, "step": 21900 }, { "epoch": 5.671564836298015, "grad_norm": 0.4871107041835785, "learning_rate": 0.0002, "loss": 0.1505, "step": 22000 }, { "epoch": 5.697344676463006, "grad_norm": 0.40973493456840515, "learning_rate": 0.0002, "loss": 0.1553, "step": 22100 }, { "epoch": 5.723124516627997, "grad_norm": 0.4365851581096649, "learning_rate": 0.0002, "loss": 0.1502, "step": 22200 }, { "epoch": 5.748904356792988, "grad_norm": 0.5478639602661133, "learning_rate": 0.0002, "loss": 0.1611, "step": 22300 }, { "epoch": 5.7746841969579785, "grad_norm": 0.29485803842544556, "learning_rate": 0.0002, "loss": 0.157, "step": 22400 }, { "epoch": 5.800464037122969, "grad_norm": 0.20778502523899078, "learning_rate": 0.0002, "loss": 0.1489, "step": 22500 }, { "epoch": 5.826243877287961, "grad_norm": 0.1795939952135086, "learning_rate": 0.0002, "loss": 0.1517, "step": 22600 }, { "epoch": 5.852023717452952, "grad_norm": 0.4165894687175751, "learning_rate": 0.0002, "loss": 0.1464, "step": 22700 }, { "epoch": 5.877803557617943, "grad_norm": 0.35076722502708435, "learning_rate": 0.0002, "loss": 0.1499, "step": 22800 }, { "epoch": 5.903583397782934, "grad_norm": 0.3190014362335205, "learning_rate": 0.0002, "loss": 0.1474, "step": 22900 }, { "epoch": 5.929363237947925, "grad_norm": 0.6232258081436157, "learning_rate": 0.0002, "loss": 0.1521, "step": 23000 }, { "epoch": 5.955143078112916, "grad_norm": 0.41889217495918274, "learning_rate": 0.0002, "loss": 0.1553, "step": 23100 }, { "epoch": 5.980922918277907, "grad_norm": 0.4977259635925293, "learning_rate": 0.0002, "loss": 0.1543, "step": 23200 }, { "epoch": 6.006702758442898, "grad_norm": 0.3092762231826782, "learning_rate": 0.0002, "loss": 0.145, "step": 23300 }, { "epoch": 6.0324825986078885, "grad_norm": 0.15745452046394348, "learning_rate": 0.0002, "loss": 0.138, "step": 23400 }, { "epoch": 6.058262438772879, "grad_norm": 0.10685788840055466, "learning_rate": 0.0002, "loss": 0.1345, "step": 23500 }, { "epoch": 6.08404227893787, "grad_norm": 0.41699907183647156, "learning_rate": 0.0002, "loss": 0.1379, "step": 23600 }, { "epoch": 6.109822119102861, "grad_norm": 0.18783129751682281, "learning_rate": 0.0002, "loss": 0.1306, "step": 23700 }, { "epoch": 6.135601959267852, "grad_norm": 0.15569710731506348, "learning_rate": 0.0002, "loss": 0.1372, "step": 23800 }, { "epoch": 6.161381799432844, "grad_norm": 0.4492259919643402, "learning_rate": 0.0002, "loss": 0.1414, "step": 23900 }, { "epoch": 6.187161639597835, "grad_norm": 0.1448894888162613, "learning_rate": 0.0002, "loss": 0.1376, "step": 24000 }, { "epoch": 6.212941479762826, "grad_norm": 0.2028491050004959, "learning_rate": 0.0002, "loss": 0.1349, "step": 24100 }, { "epoch": 6.238721319927817, "grad_norm": 0.19205012917518616, "learning_rate": 0.0002, "loss": 0.1396, "step": 24200 }, { "epoch": 6.2645011600928076, "grad_norm": 0.29885369539260864, "learning_rate": 0.0002, "loss": 0.1449, "step": 24300 }, { "epoch": 6.2902810002577985, "grad_norm": 0.15814617276191711, "learning_rate": 0.0002, "loss": 0.1438, "step": 24400 }, { "epoch": 6.316060840422789, "grad_norm": 0.2691551148891449, "learning_rate": 0.0002, "loss": 0.1406, "step": 24500 }, { "epoch": 6.34184068058778, "grad_norm": 0.543335497379303, "learning_rate": 0.0002, "loss": 0.1389, "step": 24600 }, { "epoch": 6.367620520752771, "grad_norm": 0.33116665482521057, "learning_rate": 0.0002, "loss": 0.1403, "step": 24700 }, { "epoch": 6.393400360917762, "grad_norm": 0.5159612894058228, "learning_rate": 0.0002, "loss": 0.1408, "step": 24800 }, { "epoch": 6.419180201082753, "grad_norm": 0.30205056071281433, "learning_rate": 0.0002, "loss": 0.1409, "step": 24900 }, { "epoch": 6.444960041247744, "grad_norm": 0.44916966557502747, "learning_rate": 0.0002, "loss": 0.1432, "step": 25000 }, { "epoch": 6.470739881412735, "grad_norm": 0.18665899336338043, "learning_rate": 0.0002, "loss": 0.1434, "step": 25100 }, { "epoch": 6.496519721577727, "grad_norm": 0.4078758656978607, "learning_rate": 0.0002, "loss": 0.1411, "step": 25200 }, { "epoch": 6.5222995617427175, "grad_norm": 0.39813536405563354, "learning_rate": 0.0002, "loss": 0.1445, "step": 25300 }, { "epoch": 6.548079401907708, "grad_norm": 0.2587377727031708, "learning_rate": 0.0002, "loss": 0.1463, "step": 25400 }, { "epoch": 6.573859242072699, "grad_norm": 0.41181057691574097, "learning_rate": 0.0002, "loss": 0.1487, "step": 25500 }, { "epoch": 6.59963908223769, "grad_norm": 0.3136518597602844, "learning_rate": 0.0002, "loss": 0.1414, "step": 25600 }, { "epoch": 6.625418922402681, "grad_norm": 0.4114777445793152, "learning_rate": 0.0002, "loss": 0.1434, "step": 25700 }, { "epoch": 6.651198762567672, "grad_norm": 0.17142866551876068, "learning_rate": 0.0002, "loss": 0.1411, "step": 25800 }, { "epoch": 6.676978602732663, "grad_norm": 0.5585296750068665, "learning_rate": 0.0002, "loss": 0.148, "step": 25900 }, { "epoch": 6.702758442897654, "grad_norm": 0.23773185908794403, "learning_rate": 0.0002, "loss": 0.1468, "step": 26000 }, { "epoch": 6.728538283062645, "grad_norm": 0.38246840238571167, "learning_rate": 0.0002, "loss": 0.1426, "step": 26100 }, { "epoch": 6.754318123227636, "grad_norm": 0.5393186807632446, "learning_rate": 0.0002, "loss": 0.1456, "step": 26200 }, { "epoch": 6.780097963392627, "grad_norm": 0.21433015167713165, "learning_rate": 0.0002, "loss": 0.1456, "step": 26300 }, { "epoch": 6.8058778035576175, "grad_norm": 0.4375258982181549, "learning_rate": 0.0002, "loss": 0.1461, "step": 26400 }, { "epoch": 6.831657643722609, "grad_norm": 0.515832781791687, "learning_rate": 0.0002, "loss": 0.1484, "step": 26500 }, { "epoch": 6.8574374838876, "grad_norm": 0.496559739112854, "learning_rate": 0.0002, "loss": 0.1461, "step": 26600 }, { "epoch": 6.883217324052591, "grad_norm": 0.30182015895843506, "learning_rate": 0.0002, "loss": 0.1471, "step": 26700 }, { "epoch": 6.908997164217582, "grad_norm": 0.3858971893787384, "learning_rate": 0.0002, "loss": 0.1469, "step": 26800 }, { "epoch": 6.934777004382573, "grad_norm": 0.30368533730506897, "learning_rate": 0.0002, "loss": 0.1466, "step": 26900 }, { "epoch": 6.960556844547564, "grad_norm": 0.29557520151138306, "learning_rate": 0.0002, "loss": 0.1446, "step": 27000 }, { "epoch": 6.986336684712555, "grad_norm": 0.34702664613723755, "learning_rate": 0.0002, "loss": 0.143, "step": 27100 }, { "epoch": 7.012116524877546, "grad_norm": 0.18182627856731415, "learning_rate": 0.0002, "loss": 0.1467, "step": 27200 }, { "epoch": 7.037896365042537, "grad_norm": 0.48641154170036316, "learning_rate": 0.0002, "loss": 0.1337, "step": 27300 }, { "epoch": 7.0636762052075275, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "loss": 0.1291, "step": 27400 }, { "epoch": 7.089456045372518, "grad_norm": 0.20399855077266693, "learning_rate": 0.0002, "loss": 0.1372, "step": 27500 }, { "epoch": 7.115235885537509, "grad_norm": 0.12141354382038116, "learning_rate": 0.0002, "loss": 0.1359, "step": 27600 }, { "epoch": 7.1410157257025, "grad_norm": 0.13764117658138275, "learning_rate": 0.0002, "loss": 0.1276, "step": 27700 }, { "epoch": 7.166795565867492, "grad_norm": 0.21888123452663422, "learning_rate": 0.0002, "loss": 0.1337, "step": 27800 }, { "epoch": 7.192575406032483, "grad_norm": 0.1562834531068802, "learning_rate": 0.0002, "loss": 0.133, "step": 27900 }, { "epoch": 7.218355246197474, "grad_norm": 0.3367880880832672, "learning_rate": 0.0002, "loss": 0.1335, "step": 28000 }, { "epoch": 7.244135086362465, "grad_norm": 0.1075579896569252, "learning_rate": 0.0002, "loss": 0.1334, "step": 28100 }, { "epoch": 7.269914926527456, "grad_norm": 0.11283877491950989, "learning_rate": 0.0002, "loss": 0.1356, "step": 28200 }, { "epoch": 7.2956947666924465, "grad_norm": 0.24768362939357758, "learning_rate": 0.0002, "loss": 0.1374, "step": 28300 }, { "epoch": 7.3214746068574375, "grad_norm": 0.22776305675506592, "learning_rate": 0.0002, "loss": 0.1307, "step": 28400 }, { "epoch": 7.347254447022428, "grad_norm": 0.13827867805957794, "learning_rate": 0.0002, "loss": 0.1396, "step": 28500 }, { "epoch": 7.373034287187419, "grad_norm": 0.2935916781425476, "learning_rate": 0.0002, "loss": 0.1355, "step": 28600 }, { "epoch": 7.39881412735241, "grad_norm": 0.10991048812866211, "learning_rate": 0.0002, "loss": 0.1349, "step": 28700 }, { "epoch": 7.424593967517401, "grad_norm": 0.30149704217910767, "learning_rate": 0.0002, "loss": 0.1374, "step": 28800 }, { "epoch": 7.450373807682392, "grad_norm": 0.13918708264827728, "learning_rate": 0.0002, "loss": 0.141, "step": 28900 }, { "epoch": 7.476153647847383, "grad_norm": 0.13292869925498962, "learning_rate": 0.0002, "loss": 0.1386, "step": 29000 }, { "epoch": 7.501933488012375, "grad_norm": 0.5602275729179382, "learning_rate": 0.0002, "loss": 0.1421, "step": 29100 }, { "epoch": 7.527713328177366, "grad_norm": 0.12204320728778839, "learning_rate": 0.0002, "loss": 0.1334, "step": 29200 }, { "epoch": 7.5534931683423565, "grad_norm": 0.17424637079238892, "learning_rate": 0.0002, "loss": 0.1372, "step": 29300 }, { "epoch": 7.579273008507347, "grad_norm": 0.4190254509449005, "learning_rate": 0.0002, "loss": 0.1458, "step": 29400 }, { "epoch": 7.605052848672338, "grad_norm": 0.13242638111114502, "learning_rate": 0.0002, "loss": 0.1421, "step": 29500 }, { "epoch": 7.630832688837329, "grad_norm": 0.23242244124412537, "learning_rate": 0.0002, "loss": 0.1429, "step": 29600 }, { "epoch": 7.65661252900232, "grad_norm": 0.4323575794696808, "learning_rate": 0.0002, "loss": 0.1402, "step": 29700 }, { "epoch": 7.682392369167311, "grad_norm": 0.1595413088798523, "learning_rate": 0.0002, "loss": 0.1403, "step": 29800 }, { "epoch": 7.708172209332302, "grad_norm": 0.1448589414358139, "learning_rate": 0.0002, "loss": 0.136, "step": 29900 }, { "epoch": 7.733952049497293, "grad_norm": 0.5433810353279114, "learning_rate": 0.0002, "loss": 0.139, "step": 30000 } ], "logging_steps": 100, "max_steps": 31032, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.186321886206116e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }