{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.733952049497293,
  "eval_steps": 500,
  "global_step": 30000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025779840164990978,
      "grad_norm": 0.5347822308540344,
      "learning_rate": 0.0002,
      "loss": 1.4509,
      "step": 100
    },
    {
      "epoch": 0.051559680329981955,
      "grad_norm": 0.4712078273296356,
      "learning_rate": 0.0002,
      "loss": 1.1744,
      "step": 200
    },
    {
      "epoch": 0.07733952049497293,
      "grad_norm": 0.5031601786613464,
      "learning_rate": 0.0002,
      "loss": 1.096,
      "step": 300
    },
    {
      "epoch": 0.10311936065996391,
      "grad_norm": 0.49241065979003906,
      "learning_rate": 0.0002,
      "loss": 0.9847,
      "step": 400
    },
    {
      "epoch": 0.12889920082495487,
      "grad_norm": 0.9957050681114197,
      "learning_rate": 0.0002,
      "loss": 0.9928,
      "step": 500
    },
    {
      "epoch": 0.15467904098994587,
      "grad_norm": 0.38163048028945923,
      "learning_rate": 0.0002,
      "loss": 0.9008,
      "step": 600
    },
    {
      "epoch": 0.18045888115493683,
      "grad_norm": 0.4322434663772583,
      "learning_rate": 0.0002,
      "loss": 0.9108,
      "step": 700
    },
    {
      "epoch": 0.20623872131992782,
      "grad_norm": 0.4072737395763397,
      "learning_rate": 0.0002,
      "loss": 0.8713,
      "step": 800
    },
    {
      "epoch": 0.23201856148491878,
      "grad_norm": 0.5637839436531067,
      "learning_rate": 0.0002,
      "loss": 0.8538,
      "step": 900
    },
    {
      "epoch": 0.25779840164990975,
      "grad_norm": 0.6094131469726562,
      "learning_rate": 0.0002,
      "loss": 0.8154,
      "step": 1000
    },
    {
      "epoch": 0.28357824181490077,
      "grad_norm": 0.4212701618671417,
      "learning_rate": 0.0002,
      "loss": 0.7897,
      "step": 1100
    },
    {
      "epoch": 0.30935808197989173,
      "grad_norm": 0.4663824737071991,
      "learning_rate": 0.0002,
      "loss": 0.8021,
      "step": 1200
    },
    {
      "epoch": 0.3351379221448827,
      "grad_norm": 0.3774861693382263,
      "learning_rate": 0.0002,
      "loss": 0.7452,
      "step": 1300
    },
    {
      "epoch": 0.36091776230987366,
      "grad_norm": 0.19446992874145508,
      "learning_rate": 0.0002,
      "loss": 0.737,
      "step": 1400
    },
    {
      "epoch": 0.3866976024748647,
      "grad_norm": 0.25984033942222595,
      "learning_rate": 0.0002,
      "loss": 0.6966,
      "step": 1500
    },
    {
      "epoch": 0.41247744263985564,
      "grad_norm": 0.3495163023471832,
      "learning_rate": 0.0002,
      "loss": 0.7179,
      "step": 1600
    },
    {
      "epoch": 0.4382572828048466,
      "grad_norm": 0.5092929601669312,
      "learning_rate": 0.0002,
      "loss": 0.7132,
      "step": 1700
    },
    {
      "epoch": 0.46403712296983757,
      "grad_norm": 0.16095790266990662,
      "learning_rate": 0.0002,
      "loss": 0.6652,
      "step": 1800
    },
    {
      "epoch": 0.4898169631348286,
      "grad_norm": 0.38502034544944763,
      "learning_rate": 0.0002,
      "loss": 0.6564,
      "step": 1900
    },
    {
      "epoch": 0.5155968032998195,
      "grad_norm": 0.3100506067276001,
      "learning_rate": 0.0002,
      "loss": 0.6082,
      "step": 2000
    },
    {
      "epoch": 0.5413766434648105,
      "grad_norm": 0.4585016965866089,
      "learning_rate": 0.0002,
      "loss": 0.6491,
      "step": 2100
    },
    {
      "epoch": 0.5671564836298015,
      "grad_norm": 0.35394927859306335,
      "learning_rate": 0.0002,
      "loss": 0.6136,
      "step": 2200
    },
    {
      "epoch": 0.5929363237947924,
      "grad_norm": 0.4828909933567047,
      "learning_rate": 0.0002,
      "loss": 0.5639,
      "step": 2300
    },
    {
      "epoch": 0.6187161639597835,
      "grad_norm": 0.7377568483352661,
      "learning_rate": 0.0002,
      "loss": 0.5998,
      "step": 2400
    },
    {
      "epoch": 0.6444960041247745,
      "grad_norm": 0.33992356061935425,
      "learning_rate": 0.0002,
      "loss": 0.5535,
      "step": 2500
    },
    {
      "epoch": 0.6702758442897654,
      "grad_norm": 0.40880173444747925,
      "learning_rate": 0.0002,
      "loss": 0.5839,
      "step": 2600
    },
    {
      "epoch": 0.6960556844547564,
      "grad_norm": 0.6135886907577515,
      "learning_rate": 0.0002,
      "loss": 0.5697,
      "step": 2700
    },
    {
      "epoch": 0.7218355246197473,
      "grad_norm": 0.14242181181907654,
      "learning_rate": 0.0002,
      "loss": 0.562,
      "step": 2800
    },
    {
      "epoch": 0.7476153647847383,
      "grad_norm": 0.1636349856853485,
      "learning_rate": 0.0002,
      "loss": 0.5301,
      "step": 2900
    },
    {
      "epoch": 0.7733952049497294,
      "grad_norm": 0.5300703644752502,
      "learning_rate": 0.0002,
      "loss": 0.5428,
      "step": 3000
    },
    {
      "epoch": 0.7991750451147203,
      "grad_norm": 0.2816906273365021,
      "learning_rate": 0.0002,
      "loss": 0.5319,
      "step": 3100
    },
    {
      "epoch": 0.8249548852797113,
      "grad_norm": 0.4165875315666199,
      "learning_rate": 0.0002,
      "loss": 0.5073,
      "step": 3200
    },
    {
      "epoch": 0.8507347254447022,
      "grad_norm": 0.46957316994667053,
      "learning_rate": 0.0002,
      "loss": 0.4973,
      "step": 3300
    },
    {
      "epoch": 0.8765145656096932,
      "grad_norm": 0.22382797300815582,
      "learning_rate": 0.0002,
      "loss": 0.5091,
      "step": 3400
    },
    {
      "epoch": 0.9022944057746842,
      "grad_norm": 0.517814576625824,
      "learning_rate": 0.0002,
      "loss": 0.4879,
      "step": 3500
    },
    {
      "epoch": 0.9280742459396751,
      "grad_norm": 0.44171011447906494,
      "learning_rate": 0.0002,
      "loss": 0.4711,
      "step": 3600
    },
    {
      "epoch": 0.9538540861046662,
      "grad_norm": 0.3107047379016876,
      "learning_rate": 0.0002,
      "loss": 0.465,
      "step": 3700
    },
    {
      "epoch": 0.9796339262696572,
      "grad_norm": 0.09984863549470901,
      "learning_rate": 0.0002,
      "loss": 0.4485,
      "step": 3800
    },
    {
      "epoch": 1.005413766434648,
      "grad_norm": 0.43100592494010925,
      "learning_rate": 0.0002,
      "loss": 0.4752,
      "step": 3900
    },
    {
      "epoch": 1.031193606599639,
      "grad_norm": 0.5259262919425964,
      "learning_rate": 0.0002,
      "loss": 0.3621,
      "step": 4000
    },
    {
      "epoch": 1.0569734467646301,
      "grad_norm": 0.47033509612083435,
      "learning_rate": 0.0002,
      "loss": 0.3569,
      "step": 4100
    },
    {
      "epoch": 1.082753286929621,
      "grad_norm": 0.5318751931190491,
      "learning_rate": 0.0002,
      "loss": 0.3512,
      "step": 4200
    },
    {
      "epoch": 1.108533127094612,
      "grad_norm": 0.5434057116508484,
      "learning_rate": 0.0002,
      "loss": 0.3504,
      "step": 4300
    },
    {
      "epoch": 1.134312967259603,
      "grad_norm": 0.47843560576438904,
      "learning_rate": 0.0002,
      "loss": 0.3712,
      "step": 4400
    },
    {
      "epoch": 1.160092807424594,
      "grad_norm": 0.5956776142120361,
      "learning_rate": 0.0002,
      "loss": 0.3511,
      "step": 4500
    },
    {
      "epoch": 1.1858726475895849,
      "grad_norm": 0.5072950720787048,
      "learning_rate": 0.0002,
      "loss": 0.3445,
      "step": 4600
    },
    {
      "epoch": 1.211652487754576,
      "grad_norm": 0.5608052611351013,
      "learning_rate": 0.0002,
      "loss": 0.3377,
      "step": 4700
    },
    {
      "epoch": 1.237432327919567,
      "grad_norm": 0.474223256111145,
      "learning_rate": 0.0002,
      "loss": 0.3276,
      "step": 4800
    },
    {
      "epoch": 1.2632121680845578,
      "grad_norm": 0.5215118527412415,
      "learning_rate": 0.0002,
      "loss": 0.3375,
      "step": 4900
    },
    {
      "epoch": 1.288992008249549,
      "grad_norm": 0.3922516405582428,
      "learning_rate": 0.0002,
      "loss": 0.342,
      "step": 5000
    },
    {
      "epoch": 1.3147718484145399,
      "grad_norm": 0.4958643615245819,
      "learning_rate": 0.0002,
      "loss": 0.3553,
      "step": 5100
    },
    {
      "epoch": 1.3405516885795308,
      "grad_norm": 0.564983069896698,
      "learning_rate": 0.0002,
      "loss": 0.3389,
      "step": 5200
    },
    {
      "epoch": 1.3663315287445217,
      "grad_norm": 0.5662856698036194,
      "learning_rate": 0.0002,
      "loss": 0.3382,
      "step": 5300
    },
    {
      "epoch": 1.3921113689095128,
      "grad_norm": 0.5040738582611084,
      "learning_rate": 0.0002,
      "loss": 0.3408,
      "step": 5400
    },
    {
      "epoch": 1.4178912090745037,
      "grad_norm": 0.27346768975257874,
      "learning_rate": 0.0002,
      "loss": 0.3266,
      "step": 5500
    },
    {
      "epoch": 1.4436710492394949,
      "grad_norm": 0.5055024027824402,
      "learning_rate": 0.0002,
      "loss": 0.3561,
      "step": 5600
    },
    {
      "epoch": 1.4694508894044858,
      "grad_norm": 0.5442714691162109,
      "learning_rate": 0.0002,
      "loss": 0.3241,
      "step": 5700
    },
    {
      "epoch": 1.4952307295694767,
      "grad_norm": 0.4862806499004364,
      "learning_rate": 0.0002,
      "loss": 0.344,
      "step": 5800
    },
    {
      "epoch": 1.5210105697344676,
      "grad_norm": 0.6346714496612549,
      "learning_rate": 0.0002,
      "loss": 0.3195,
      "step": 5900
    },
    {
      "epoch": 1.5467904098994585,
      "grad_norm": 0.5846338272094727,
      "learning_rate": 0.0002,
      "loss": 0.3232,
      "step": 6000
    },
    {
      "epoch": 1.5725702500644496,
      "grad_norm": 0.41255345940589905,
      "learning_rate": 0.0002,
      "loss": 0.3379,
      "step": 6100
    },
    {
      "epoch": 1.5983500902294405,
      "grad_norm": 0.6396617293357849,
      "learning_rate": 0.0002,
      "loss": 0.3099,
      "step": 6200
    },
    {
      "epoch": 1.6241299303944317,
      "grad_norm": 0.3450670540332794,
      "learning_rate": 0.0002,
      "loss": 0.3129,
      "step": 6300
    },
    {
      "epoch": 1.6499097705594226,
      "grad_norm": 0.30461055040359497,
      "learning_rate": 0.0002,
      "loss": 0.2978,
      "step": 6400
    },
    {
      "epoch": 1.6756896107244135,
      "grad_norm": 0.4209739863872528,
      "learning_rate": 0.0002,
      "loss": 0.3323,
      "step": 6500
    },
    {
      "epoch": 1.7014694508894044,
      "grad_norm": 0.3296062648296356,
      "learning_rate": 0.0002,
      "loss": 0.3047,
      "step": 6600
    },
    {
      "epoch": 1.7272492910543955,
      "grad_norm": 0.9009484648704529,
      "learning_rate": 0.0002,
      "loss": 0.3046,
      "step": 6700
    },
    {
      "epoch": 1.7530291312193864,
      "grad_norm": 0.7505986094474792,
      "learning_rate": 0.0002,
      "loss": 0.3123,
      "step": 6800
    },
    {
      "epoch": 1.7788089713843775,
      "grad_norm": 0.3542492389678955,
      "learning_rate": 0.0002,
      "loss": 0.3259,
      "step": 6900
    },
    {
      "epoch": 1.8045888115493685,
      "grad_norm": 0.4935378432273865,
      "learning_rate": 0.0002,
      "loss": 0.3262,
      "step": 7000
    },
    {
      "epoch": 1.8303686517143594,
      "grad_norm": 0.3000539541244507,
      "learning_rate": 0.0002,
      "loss": 0.2887,
      "step": 7100
    },
    {
      "epoch": 1.8561484918793503,
      "grad_norm": 0.2680779695510864,
      "learning_rate": 0.0002,
      "loss": 0.3108,
      "step": 7200
    },
    {
      "epoch": 1.8819283320443412,
      "grad_norm": 0.5922934412956238,
      "learning_rate": 0.0002,
      "loss": 0.3211,
      "step": 7300
    },
    {
      "epoch": 1.9077081722093323,
      "grad_norm": 0.38349688053131104,
      "learning_rate": 0.0002,
      "loss": 0.316,
      "step": 7400
    },
    {
      "epoch": 1.9334880123743234,
      "grad_norm": 0.7654793858528137,
      "learning_rate": 0.0002,
      "loss": 0.3111,
      "step": 7500
    },
    {
      "epoch": 1.9592678525393143,
      "grad_norm": 0.2399352639913559,
      "learning_rate": 0.0002,
      "loss": 0.3042,
      "step": 7600
    },
    {
      "epoch": 1.9850476927043053,
      "grad_norm": 0.42787912487983704,
      "learning_rate": 0.0002,
      "loss": 0.2928,
      "step": 7700
    },
    {
      "epoch": 2.010827532869296,
      "grad_norm": 0.4771544933319092,
      "learning_rate": 0.0002,
      "loss": 0.2487,
      "step": 7800
    },
    {
      "epoch": 2.036607373034287,
      "grad_norm": 0.6133277416229248,
      "learning_rate": 0.0002,
      "loss": 0.2219,
      "step": 7900
    },
    {
      "epoch": 2.062387213199278,
      "grad_norm": 0.43137651681900024,
      "learning_rate": 0.0002,
      "loss": 0.2158,
      "step": 8000
    },
    {
      "epoch": 2.0881670533642693,
      "grad_norm": 0.41038885712623596,
      "learning_rate": 0.0002,
      "loss": 0.2127,
      "step": 8100
    },
    {
      "epoch": 2.1139468935292602,
      "grad_norm": 0.351235568523407,
      "learning_rate": 0.0002,
      "loss": 0.2185,
      "step": 8200
    },
    {
      "epoch": 2.139726733694251,
      "grad_norm": 0.41089433431625366,
      "learning_rate": 0.0002,
      "loss": 0.2346,
      "step": 8300
    },
    {
      "epoch": 2.165506573859242,
      "grad_norm": 0.3464137613773346,
      "learning_rate": 0.0002,
      "loss": 0.2273,
      "step": 8400
    },
    {
      "epoch": 2.191286414024233,
      "grad_norm": 0.2753762900829315,
      "learning_rate": 0.0002,
      "loss": 0.2359,
      "step": 8500
    },
    {
      "epoch": 2.217066254189224,
      "grad_norm": 0.3630015552043915,
      "learning_rate": 0.0002,
      "loss": 0.2351,
      "step": 8600
    },
    {
      "epoch": 2.2428460943542152,
      "grad_norm": 0.5501378178596497,
      "learning_rate": 0.0002,
      "loss": 0.2273,
      "step": 8700
    },
    {
      "epoch": 2.268625934519206,
      "grad_norm": 0.31958362460136414,
      "learning_rate": 0.0002,
      "loss": 0.2306,
      "step": 8800
    },
    {
      "epoch": 2.294405774684197,
      "grad_norm": 0.4495809078216553,
      "learning_rate": 0.0002,
      "loss": 0.2283,
      "step": 8900
    },
    {
      "epoch": 2.320185614849188,
      "grad_norm": 0.45789313316345215,
      "learning_rate": 0.0002,
      "loss": 0.2191,
      "step": 9000
    },
    {
      "epoch": 2.345965455014179,
      "grad_norm": 0.2430783361196518,
      "learning_rate": 0.0002,
      "loss": 0.2266,
      "step": 9100
    },
    {
      "epoch": 2.3717452951791698,
      "grad_norm": 0.512585461139679,
      "learning_rate": 0.0002,
      "loss": 0.2293,
      "step": 9200
    },
    {
      "epoch": 2.3975251353441607,
      "grad_norm": 0.42088598012924194,
      "learning_rate": 0.0002,
      "loss": 0.2388,
      "step": 9300
    },
    {
      "epoch": 2.423304975509152,
      "grad_norm": 0.4196650981903076,
      "learning_rate": 0.0002,
      "loss": 0.2305,
      "step": 9400
    },
    {
      "epoch": 2.449084815674143,
      "grad_norm": 0.45856234431266785,
      "learning_rate": 0.0002,
      "loss": 0.2294,
      "step": 9500
    },
    {
      "epoch": 2.474864655839134,
      "grad_norm": 0.5690295100212097,
      "learning_rate": 0.0002,
      "loss": 0.2237,
      "step": 9600
    },
    {
      "epoch": 2.5006444960041248,
      "grad_norm": 0.5325428247451782,
      "learning_rate": 0.0002,
      "loss": 0.2125,
      "step": 9700
    },
    {
      "epoch": 2.5264243361691157,
      "grad_norm": 0.4254339933395386,
      "learning_rate": 0.0002,
      "loss": 0.2335,
      "step": 9800
    },
    {
      "epoch": 2.5522041763341066,
      "grad_norm": 0.44463545083999634,
      "learning_rate": 0.0002,
      "loss": 0.2247,
      "step": 9900
    },
    {
      "epoch": 2.577984016499098,
      "grad_norm": 0.4192294776439667,
      "learning_rate": 0.0002,
      "loss": 0.2328,
      "step": 10000
    },
    {
      "epoch": 2.603763856664089,
      "grad_norm": 0.39080777764320374,
      "learning_rate": 0.0002,
      "loss": 0.2229,
      "step": 10100
    },
    {
      "epoch": 2.6295436968290797,
      "grad_norm": 0.3375299870967865,
      "learning_rate": 0.0002,
      "loss": 0.2374,
      "step": 10200
    },
    {
      "epoch": 2.6553235369940706,
      "grad_norm": 0.6126553416252136,
      "learning_rate": 0.0002,
      "loss": 0.2283,
      "step": 10300
    },
    {
      "epoch": 2.6811033771590616,
      "grad_norm": 0.21654823422431946,
      "learning_rate": 0.0002,
      "loss": 0.2265,
      "step": 10400
    },
    {
      "epoch": 2.7068832173240525,
      "grad_norm": 0.41668832302093506,
      "learning_rate": 0.0002,
      "loss": 0.2267,
      "step": 10500
    },
    {
      "epoch": 2.7326630574890434,
      "grad_norm": 0.5655872225761414,
      "learning_rate": 0.0002,
      "loss": 0.2331,
      "step": 10600
    },
    {
      "epoch": 2.7584428976540343,
      "grad_norm": 0.49956533312797546,
      "learning_rate": 0.0002,
      "loss": 0.2323,
      "step": 10700
    },
    {
      "epoch": 2.7842227378190256,
      "grad_norm": 0.4230547547340393,
      "learning_rate": 0.0002,
      "loss": 0.2157,
      "step": 10800
    },
    {
      "epoch": 2.8100025779840165,
      "grad_norm": 0.5253151655197144,
      "learning_rate": 0.0002,
      "loss": 0.2189,
      "step": 10900
    },
    {
      "epoch": 2.8357824181490074,
      "grad_norm": 0.3807348906993866,
      "learning_rate": 0.0002,
      "loss": 0.2285,
      "step": 11000
    },
    {
      "epoch": 2.8615622583139984,
      "grad_norm": 0.6454833149909973,
      "learning_rate": 0.0002,
      "loss": 0.228,
      "step": 11100
    },
    {
      "epoch": 2.8873420984789897,
      "grad_norm": 0.2508118450641632,
      "learning_rate": 0.0002,
      "loss": 0.2139,
      "step": 11200
    },
    {
      "epoch": 2.9131219386439806,
      "grad_norm": 0.32768428325653076,
      "learning_rate": 0.0002,
      "loss": 0.2206,
      "step": 11300
    },
    {
      "epoch": 2.9389017788089715,
      "grad_norm": 0.4850573241710663,
      "learning_rate": 0.0002,
      "loss": 0.2235,
      "step": 11400
    },
    {
      "epoch": 2.9646816189739624,
      "grad_norm": 0.6089478135108948,
      "learning_rate": 0.0002,
      "loss": 0.2081,
      "step": 11500
    },
    {
      "epoch": 2.9904614591389533,
      "grad_norm": 0.47153401374816895,
      "learning_rate": 0.0002,
      "loss": 0.2463,
      "step": 11600
    },
    {
      "epoch": 3.0162412993039442,
      "grad_norm": 0.3843853771686554,
      "learning_rate": 0.0002,
      "loss": 0.1911,
      "step": 11700
    },
    {
      "epoch": 3.042021139468935,
      "grad_norm": 0.21224769949913025,
      "learning_rate": 0.0002,
      "loss": 0.1753,
      "step": 11800
    },
    {
      "epoch": 3.067800979633926,
      "grad_norm": 0.3223534822463989,
      "learning_rate": 0.0002,
      "loss": 0.1799,
      "step": 11900
    },
    {
      "epoch": 3.0935808197989174,
      "grad_norm": 0.399443656206131,
      "learning_rate": 0.0002,
      "loss": 0.1755,
      "step": 12000
    },
    {
      "epoch": 3.1193606599639083,
      "grad_norm": 0.253034770488739,
      "learning_rate": 0.0002,
      "loss": 0.177,
      "step": 12100
    },
    {
      "epoch": 3.1451405001288992,
      "grad_norm": 0.318568617105484,
      "learning_rate": 0.0002,
      "loss": 0.1772,
      "step": 12200
    },
    {
      "epoch": 3.17092034029389,
      "grad_norm": 0.2624630928039551,
      "learning_rate": 0.0002,
      "loss": 0.1876,
      "step": 12300
    },
    {
      "epoch": 3.196700180458881,
      "grad_norm": 0.46422523260116577,
      "learning_rate": 0.0002,
      "loss": 0.1717,
      "step": 12400
    },
    {
      "epoch": 3.222480020623872,
      "grad_norm": 0.4504973888397217,
      "learning_rate": 0.0002,
      "loss": 0.1862,
      "step": 12500
    },
    {
      "epoch": 3.2482598607888633,
      "grad_norm": 0.44676682353019714,
      "learning_rate": 0.0002,
      "loss": 0.1865,
      "step": 12600
    },
    {
      "epoch": 3.274039700953854,
      "grad_norm": 0.44682949781417847,
      "learning_rate": 0.0002,
      "loss": 0.1797,
      "step": 12700
    },
    {
      "epoch": 3.299819541118845,
      "grad_norm": 0.22240401804447174,
      "learning_rate": 0.0002,
      "loss": 0.1823,
      "step": 12800
    },
    {
      "epoch": 3.325599381283836,
      "grad_norm": 0.3457636535167694,
      "learning_rate": 0.0002,
      "loss": 0.1839,
      "step": 12900
    },
    {
      "epoch": 3.351379221448827,
      "grad_norm": 0.5065191388130188,
      "learning_rate": 0.0002,
      "loss": 0.1823,
      "step": 13000
    },
    {
      "epoch": 3.377159061613818,
      "grad_norm": 0.516930341720581,
      "learning_rate": 0.0002,
      "loss": 0.1812,
      "step": 13100
    },
    {
      "epoch": 3.4029389017788088,
      "grad_norm": 0.5823391079902649,
      "learning_rate": 0.0002,
      "loss": 0.1851,
      "step": 13200
    },
    {
      "epoch": 3.4287187419438,
      "grad_norm": 0.4604497253894806,
      "learning_rate": 0.0002,
      "loss": 0.1897,
      "step": 13300
    },
    {
      "epoch": 3.454498582108791,
      "grad_norm": 0.3871957063674927,
      "learning_rate": 0.0002,
      "loss": 0.1778,
      "step": 13400
    },
    {
      "epoch": 3.480278422273782,
      "grad_norm": 0.40806278586387634,
      "learning_rate": 0.0002,
      "loss": 0.1854,
      "step": 13500
    },
    {
      "epoch": 3.506058262438773,
      "grad_norm": 0.24849525094032288,
      "learning_rate": 0.0002,
      "loss": 0.1825,
      "step": 13600
    },
    {
      "epoch": 3.5318381026037637,
      "grad_norm": 0.28265008330345154,
      "learning_rate": 0.0002,
      "loss": 0.1914,
      "step": 13700
    },
    {
      "epoch": 3.557617942768755,
      "grad_norm": 0.18643364310264587,
      "learning_rate": 0.0002,
      "loss": 0.1728,
      "step": 13800
    },
    {
      "epoch": 3.583397782933746,
      "grad_norm": 0.36125150322914124,
      "learning_rate": 0.0002,
      "loss": 0.184,
      "step": 13900
    },
    {
      "epoch": 3.609177623098737,
      "grad_norm": 0.35003572702407837,
      "learning_rate": 0.0002,
      "loss": 0.1834,
      "step": 14000
    },
    {
      "epoch": 3.634957463263728,
      "grad_norm": 0.29175901412963867,
      "learning_rate": 0.0002,
      "loss": 0.1845,
      "step": 14100
    },
    {
      "epoch": 3.6607373034287187,
      "grad_norm": 0.37868496775627136,
      "learning_rate": 0.0002,
      "loss": 0.1893,
      "step": 14200
    },
    {
      "epoch": 3.6865171435937096,
      "grad_norm": 0.3279033899307251,
      "learning_rate": 0.0002,
      "loss": 0.1908,
      "step": 14300
    },
    {
      "epoch": 3.7122969837587005,
      "grad_norm": 0.31007370352745056,
      "learning_rate": 0.0002,
      "loss": 0.1832,
      "step": 14400
    },
    {
      "epoch": 3.7380768239236914,
      "grad_norm": 0.298289030790329,
      "learning_rate": 0.0002,
      "loss": 0.1948,
      "step": 14500
    },
    {
      "epoch": 3.763856664088683,
      "grad_norm": 0.6039551496505737,
      "learning_rate": 0.0002,
      "loss": 0.1828,
      "step": 14600
    },
    {
      "epoch": 3.7896365042536737,
      "grad_norm": 0.449587345123291,
      "learning_rate": 0.0002,
      "loss": 0.1891,
      "step": 14700
    },
    {
      "epoch": 3.8154163444186646,
      "grad_norm": 0.6465901136398315,
      "learning_rate": 0.0002,
      "loss": 0.1895,
      "step": 14800
    },
    {
      "epoch": 3.8411961845836555,
      "grad_norm": 0.5226249098777771,
      "learning_rate": 0.0002,
      "loss": 0.1767,
      "step": 14900
    },
    {
      "epoch": 3.8669760247486464,
      "grad_norm": 0.29470816254615784,
      "learning_rate": 0.0002,
      "loss": 0.1958,
      "step": 15000
    },
    {
      "epoch": 3.892755864913638,
      "grad_norm": 0.4997386336326599,
      "learning_rate": 0.0002,
      "loss": 0.1984,
      "step": 15100
    },
    {
      "epoch": 3.9185357050786287,
      "grad_norm": 0.35381177067756653,
      "learning_rate": 0.0002,
      "loss": 0.1839,
      "step": 15200
    },
    {
      "epoch": 3.9443155452436196,
      "grad_norm": 0.29231759905815125,
      "learning_rate": 0.0002,
      "loss": 0.1812,
      "step": 15300
    },
    {
      "epoch": 3.9700953854086105,
      "grad_norm": 0.40497833490371704,
      "learning_rate": 0.0002,
      "loss": 0.1798,
      "step": 15400
    },
    {
      "epoch": 3.9958752255736014,
      "grad_norm": 0.1775328516960144,
      "learning_rate": 0.0002,
      "loss": 0.1931,
      "step": 15500
    },
    {
      "epoch": 4.021655065738592,
      "grad_norm": 0.2625548243522644,
      "learning_rate": 0.0002,
      "loss": 0.1513,
      "step": 15600
    },
    {
      "epoch": 4.047434905903583,
      "grad_norm": 0.47476592659950256,
      "learning_rate": 0.0002,
      "loss": 0.1607,
      "step": 15700
    },
    {
      "epoch": 4.073214746068574,
      "grad_norm": 0.4454491138458252,
      "learning_rate": 0.0002,
      "loss": 0.1529,
      "step": 15800
    },
    {
      "epoch": 4.098994586233565,
      "grad_norm": 0.12239188700914383,
      "learning_rate": 0.0002,
      "loss": 0.1539,
      "step": 15900
    },
    {
      "epoch": 4.124774426398556,
      "grad_norm": 0.2339598536491394,
      "learning_rate": 0.0002,
      "loss": 0.1572,
      "step": 16000
    },
    {
      "epoch": 4.150554266563548,
      "grad_norm": 0.19658803939819336,
      "learning_rate": 0.0002,
      "loss": 0.1571,
      "step": 16100
    },
    {
      "epoch": 4.176334106728539,
      "grad_norm": 0.25842776894569397,
      "learning_rate": 0.0002,
      "loss": 0.155,
      "step": 16200
    },
    {
      "epoch": 4.20211394689353,
      "grad_norm": 0.4655442535877228,
      "learning_rate": 0.0002,
      "loss": 0.1584,
      "step": 16300
    },
    {
      "epoch": 4.2278937870585205,
      "grad_norm": 0.3778013586997986,
      "learning_rate": 0.0002,
      "loss": 0.1587,
      "step": 16400
    },
    {
      "epoch": 4.253673627223511,
      "grad_norm": 0.22199797630310059,
      "learning_rate": 0.0002,
      "loss": 0.1573,
      "step": 16500
    },
    {
      "epoch": 4.279453467388502,
      "grad_norm": 0.23724961280822754,
      "learning_rate": 0.0002,
      "loss": 0.1649,
      "step": 16600
    },
    {
      "epoch": 4.305233307553493,
      "grad_norm": 0.4558769166469574,
      "learning_rate": 0.0002,
      "loss": 0.1633,
      "step": 16700
    },
    {
      "epoch": 4.331013147718484,
      "grad_norm": 0.27720391750335693,
      "learning_rate": 0.0002,
      "loss": 0.1613,
      "step": 16800
    },
    {
      "epoch": 4.356792987883475,
      "grad_norm": 0.3628349304199219,
      "learning_rate": 0.0002,
      "loss": 0.16,
      "step": 16900
    },
    {
      "epoch": 4.382572828048466,
      "grad_norm": 0.6290438175201416,
      "learning_rate": 0.0002,
      "loss": 0.1658,
      "step": 17000
    },
    {
      "epoch": 4.408352668213457,
      "grad_norm": 0.14983007311820984,
      "learning_rate": 0.0002,
      "loss": 0.1629,
      "step": 17100
    },
    {
      "epoch": 4.434132508378448,
      "grad_norm": 0.30865323543548584,
      "learning_rate": 0.0002,
      "loss": 0.1603,
      "step": 17200
    },
    {
      "epoch": 4.459912348543439,
      "grad_norm": 0.5674950480461121,
      "learning_rate": 0.0002,
      "loss": 0.1674,
      "step": 17300
    },
    {
      "epoch": 4.4856921887084305,
      "grad_norm": 0.40429455041885376,
      "learning_rate": 0.0002,
      "loss": 0.1677,
      "step": 17400
    },
    {
      "epoch": 4.511472028873421,
      "grad_norm": 0.27213749289512634,
      "learning_rate": 0.0002,
      "loss": 0.1642,
      "step": 17500
    },
    {
      "epoch": 4.537251869038412,
      "grad_norm": 0.40964949131011963,
      "learning_rate": 0.0002,
      "loss": 0.1626,
      "step": 17600
    },
    {
      "epoch": 4.563031709203403,
      "grad_norm": 0.3955250382423401,
      "learning_rate": 0.0002,
      "loss": 0.1564,
      "step": 17700
    },
    {
      "epoch": 4.588811549368394,
      "grad_norm": 0.3900775611400604,
      "learning_rate": 0.0002,
      "loss": 0.1605,
      "step": 17800
    },
    {
      "epoch": 4.614591389533385,
      "grad_norm": 0.2436327487230301,
      "learning_rate": 0.0002,
      "loss": 0.1603,
      "step": 17900
    },
    {
      "epoch": 4.640371229698376,
      "grad_norm": 0.4188991189002991,
      "learning_rate": 0.0002,
      "loss": 0.163,
      "step": 18000
    },
    {
      "epoch": 4.666151069863367,
      "grad_norm": 0.15686850249767303,
      "learning_rate": 0.0002,
      "loss": 0.1656,
      "step": 18100
    },
    {
      "epoch": 4.691930910028358,
      "grad_norm": 0.30334389209747314,
      "learning_rate": 0.0002,
      "loss": 0.1612,
      "step": 18200
    },
    {
      "epoch": 4.717710750193349,
      "grad_norm": 0.33619073033332825,
      "learning_rate": 0.0002,
      "loss": 0.1626,
      "step": 18300
    },
    {
      "epoch": 4.7434905903583395,
      "grad_norm": 0.20497629046440125,
      "learning_rate": 0.0002,
      "loss": 0.1647,
      "step": 18400
    },
    {
      "epoch": 4.76927043052333,
      "grad_norm": 0.20428726077079773,
      "learning_rate": 0.0002,
      "loss": 0.1726,
      "step": 18500
    },
    {
      "epoch": 4.795050270688321,
      "grad_norm": 0.3606746196746826,
      "learning_rate": 0.0002,
      "loss": 0.1638,
      "step": 18600
    },
    {
      "epoch": 4.820830110853313,
      "grad_norm": 0.3441687226295471,
      "learning_rate": 0.0002,
      "loss": 0.1676,
      "step": 18700
    },
    {
      "epoch": 4.846609951018304,
      "grad_norm": 0.3479159474372864,
      "learning_rate": 0.0002,
      "loss": 0.1654,
      "step": 18800
    },
    {
      "epoch": 4.872389791183295,
      "grad_norm": 0.39751461148262024,
      "learning_rate": 0.0002,
      "loss": 0.1592,
      "step": 18900
    },
    {
      "epoch": 4.898169631348286,
      "grad_norm": 0.1793346256017685,
      "learning_rate": 0.0002,
      "loss": 0.1683,
      "step": 19000
    },
    {
      "epoch": 4.923949471513277,
      "grad_norm": 0.100714772939682,
      "learning_rate": 0.0002,
      "loss": 0.1592,
      "step": 19100
    },
    {
      "epoch": 4.949729311678268,
      "grad_norm": 0.6268895864486694,
      "learning_rate": 0.0002,
      "loss": 0.1667,
      "step": 19200
    },
    {
      "epoch": 4.975509151843259,
      "grad_norm": 0.32232895493507385,
      "learning_rate": 0.0002,
      "loss": 0.1615,
      "step": 19300
    },
    {
      "epoch": 5.0012889920082495,
      "grad_norm": 0.3094789683818817,
      "learning_rate": 0.0002,
      "loss": 0.1648,
      "step": 19400
    },
    {
      "epoch": 5.02706883217324,
      "grad_norm": 0.3806459307670593,
      "learning_rate": 0.0002,
      "loss": 0.149,
      "step": 19500
    },
    {
      "epoch": 5.052848672338231,
      "grad_norm": 0.28195375204086304,
      "learning_rate": 0.0002,
      "loss": 0.1409,
      "step": 19600
    },
    {
      "epoch": 5.078628512503222,
      "grad_norm": 0.1819002479314804,
      "learning_rate": 0.0002,
      "loss": 0.1403,
      "step": 19700
    },
    {
      "epoch": 5.104408352668213,
      "grad_norm": 0.27728572487831116,
      "learning_rate": 0.0002,
      "loss": 0.1426,
      "step": 19800
    },
    {
      "epoch": 5.130188192833204,
      "grad_norm": 0.21889761090278625,
      "learning_rate": 0.0002,
      "loss": 0.1499,
      "step": 19900
    },
    {
      "epoch": 5.155968032998196,
      "grad_norm": 0.3974555432796478,
      "learning_rate": 0.0002,
      "loss": 0.1427,
      "step": 20000
    },
    {
      "epoch": 5.181747873163187,
      "grad_norm": 0.48159608244895935,
      "learning_rate": 0.0002,
      "loss": 0.1477,
      "step": 20100
    },
    {
      "epoch": 5.207527713328178,
      "grad_norm": 0.3865210711956024,
      "learning_rate": 0.0002,
      "loss": 0.1424,
      "step": 20200
    },
    {
      "epoch": 5.233307553493169,
      "grad_norm": 0.26485195755958557,
      "learning_rate": 0.0002,
      "loss": 0.1486,
      "step": 20300
    },
    {
      "epoch": 5.2590873936581595,
      "grad_norm": 0.41939619183540344,
      "learning_rate": 0.0002,
      "loss": 0.151,
      "step": 20400
    },
    {
      "epoch": 5.28486723382315,
      "grad_norm": 0.3483380973339081,
      "learning_rate": 0.0002,
      "loss": 0.1475,
      "step": 20500
    },
    {
      "epoch": 5.310647073988141,
      "grad_norm": 0.40975695848464966,
      "learning_rate": 0.0002,
      "loss": 0.1461,
      "step": 20600
    },
    {
      "epoch": 5.336426914153132,
      "grad_norm": 0.27101436257362366,
      "learning_rate": 0.0002,
      "loss": 0.1528,
      "step": 20700
    },
    {
      "epoch": 5.362206754318123,
      "grad_norm": 0.27852606773376465,
      "learning_rate": 0.0002,
      "loss": 0.1484,
      "step": 20800
    },
    {
      "epoch": 5.387986594483114,
      "grad_norm": 0.4176689684391022,
      "learning_rate": 0.0002,
      "loss": 0.1485,
      "step": 20900
    },
    {
      "epoch": 5.413766434648105,
      "grad_norm": 0.4901387691497803,
      "learning_rate": 0.0002,
      "loss": 0.1479,
      "step": 21000
    },
    {
      "epoch": 5.439546274813096,
      "grad_norm": 0.33768975734710693,
      "learning_rate": 0.0002,
      "loss": 0.15,
      "step": 21100
    },
    {
      "epoch": 5.465326114978087,
      "grad_norm": 0.5349870324134827,
      "learning_rate": 0.0002,
      "loss": 0.1485,
      "step": 21200
    },
    {
      "epoch": 5.4911059551430785,
      "grad_norm": 0.24405865371227264,
      "learning_rate": 0.0002,
      "loss": 0.146,
      "step": 21300
    },
    {
      "epoch": 5.516885795308069,
      "grad_norm": 0.2870001494884491,
      "learning_rate": 0.0002,
      "loss": 0.1482,
      "step": 21400
    },
    {
      "epoch": 5.54266563547306,
      "grad_norm": 0.34606364369392395,
      "learning_rate": 0.0002,
      "loss": 0.1535,
      "step": 21500
    },
    {
      "epoch": 5.568445475638051,
      "grad_norm": 0.4999238848686218,
      "learning_rate": 0.0002,
      "loss": 0.1523,
      "step": 21600
    },
    {
      "epoch": 5.594225315803042,
      "grad_norm": 0.2526559829711914,
      "learning_rate": 0.0002,
      "loss": 0.1524,
      "step": 21700
    },
    {
      "epoch": 5.620005155968033,
      "grad_norm": 0.270786315202713,
      "learning_rate": 0.0002,
      "loss": 0.1511,
      "step": 21800
    },
    {
      "epoch": 5.645784996133024,
      "grad_norm": 0.4440493881702423,
      "learning_rate": 0.0002,
      "loss": 0.1539,
      "step": 21900
    },
    {
      "epoch": 5.671564836298015,
      "grad_norm": 0.4871107041835785,
      "learning_rate": 0.0002,
      "loss": 0.1505,
      "step": 22000
    },
    {
      "epoch": 5.697344676463006,
      "grad_norm": 0.40973493456840515,
      "learning_rate": 0.0002,
      "loss": 0.1553,
      "step": 22100
    },
    {
      "epoch": 5.723124516627997,
      "grad_norm": 0.4365851581096649,
      "learning_rate": 0.0002,
      "loss": 0.1502,
      "step": 22200
    },
    {
      "epoch": 5.748904356792988,
      "grad_norm": 0.5478639602661133,
      "learning_rate": 0.0002,
      "loss": 0.1611,
      "step": 22300
    },
    {
      "epoch": 5.7746841969579785,
      "grad_norm": 0.29485803842544556,
      "learning_rate": 0.0002,
      "loss": 0.157,
      "step": 22400
    },
    {
      "epoch": 5.800464037122969,
      "grad_norm": 0.20778502523899078,
      "learning_rate": 0.0002,
      "loss": 0.1489,
      "step": 22500
    },
    {
      "epoch": 5.826243877287961,
      "grad_norm": 0.1795939952135086,
      "learning_rate": 0.0002,
      "loss": 0.1517,
      "step": 22600
    },
    {
      "epoch": 5.852023717452952,
      "grad_norm": 0.4165894687175751,
      "learning_rate": 0.0002,
      "loss": 0.1464,
      "step": 22700
    },
    {
      "epoch": 5.877803557617943,
      "grad_norm": 0.35076722502708435,
      "learning_rate": 0.0002,
      "loss": 0.1499,
      "step": 22800
    },
    {
      "epoch": 5.903583397782934,
      "grad_norm": 0.3190014362335205,
      "learning_rate": 0.0002,
      "loss": 0.1474,
      "step": 22900
    },
    {
      "epoch": 5.929363237947925,
      "grad_norm": 0.6232258081436157,
      "learning_rate": 0.0002,
      "loss": 0.1521,
      "step": 23000
    },
    {
      "epoch": 5.955143078112916,
      "grad_norm": 0.41889217495918274,
      "learning_rate": 0.0002,
      "loss": 0.1553,
      "step": 23100
    },
    {
      "epoch": 5.980922918277907,
      "grad_norm": 0.4977259635925293,
      "learning_rate": 0.0002,
      "loss": 0.1543,
      "step": 23200
    },
    {
      "epoch": 6.006702758442898,
      "grad_norm": 0.3092762231826782,
      "learning_rate": 0.0002,
      "loss": 0.145,
      "step": 23300
    },
    {
      "epoch": 6.0324825986078885,
      "grad_norm": 0.15745452046394348,
      "learning_rate": 0.0002,
      "loss": 0.138,
      "step": 23400
    },
    {
      "epoch": 6.058262438772879,
      "grad_norm": 0.10685788840055466,
      "learning_rate": 0.0002,
      "loss": 0.1345,
      "step": 23500
    },
    {
      "epoch": 6.08404227893787,
      "grad_norm": 0.41699907183647156,
      "learning_rate": 0.0002,
      "loss": 0.1379,
      "step": 23600
    },
    {
      "epoch": 6.109822119102861,
      "grad_norm": 0.18783129751682281,
      "learning_rate": 0.0002,
      "loss": 0.1306,
      "step": 23700
    },
    {
      "epoch": 6.135601959267852,
      "grad_norm": 0.15569710731506348,
      "learning_rate": 0.0002,
      "loss": 0.1372,
      "step": 23800
    },
    {
      "epoch": 6.161381799432844,
      "grad_norm": 0.4492259919643402,
      "learning_rate": 0.0002,
      "loss": 0.1414,
      "step": 23900
    },
    {
      "epoch": 6.187161639597835,
      "grad_norm": 0.1448894888162613,
      "learning_rate": 0.0002,
      "loss": 0.1376,
      "step": 24000
    },
    {
      "epoch": 6.212941479762826,
      "grad_norm": 0.2028491050004959,
      "learning_rate": 0.0002,
      "loss": 0.1349,
      "step": 24100
    },
    {
      "epoch": 6.238721319927817,
      "grad_norm": 0.19205012917518616,
      "learning_rate": 0.0002,
      "loss": 0.1396,
      "step": 24200
    },
    {
      "epoch": 6.2645011600928076,
      "grad_norm": 0.29885369539260864,
      "learning_rate": 0.0002,
      "loss": 0.1449,
      "step": 24300
    },
    {
      "epoch": 6.2902810002577985,
      "grad_norm": 0.15814617276191711,
      "learning_rate": 0.0002,
      "loss": 0.1438,
      "step": 24400
    },
    {
      "epoch": 6.316060840422789,
      "grad_norm": 0.2691551148891449,
      "learning_rate": 0.0002,
      "loss": 0.1406,
      "step": 24500
    },
    {
      "epoch": 6.34184068058778,
      "grad_norm": 0.543335497379303,
      "learning_rate": 0.0002,
      "loss": 0.1389,
      "step": 24600
    },
    {
      "epoch": 6.367620520752771,
      "grad_norm": 0.33116665482521057,
      "learning_rate": 0.0002,
      "loss": 0.1403,
      "step": 24700
    },
    {
      "epoch": 6.393400360917762,
      "grad_norm": 0.5159612894058228,
      "learning_rate": 0.0002,
      "loss": 0.1408,
      "step": 24800
    },
    {
      "epoch": 6.419180201082753,
      "grad_norm": 0.30205056071281433,
      "learning_rate": 0.0002,
      "loss": 0.1409,
      "step": 24900
    },
    {
      "epoch": 6.444960041247744,
      "grad_norm": 0.44916966557502747,
      "learning_rate": 0.0002,
      "loss": 0.1432,
      "step": 25000
    },
    {
      "epoch": 6.470739881412735,
      "grad_norm": 0.18665899336338043,
      "learning_rate": 0.0002,
      "loss": 0.1434,
      "step": 25100
    },
    {
      "epoch": 6.496519721577727,
      "grad_norm": 0.4078758656978607,
      "learning_rate": 0.0002,
      "loss": 0.1411,
      "step": 25200
    },
    {
      "epoch": 6.5222995617427175,
      "grad_norm": 0.39813536405563354,
      "learning_rate": 0.0002,
      "loss": 0.1445,
      "step": 25300
    },
    {
      "epoch": 6.548079401907708,
      "grad_norm": 0.2587377727031708,
      "learning_rate": 0.0002,
      "loss": 0.1463,
      "step": 25400
    },
    {
      "epoch": 6.573859242072699,
      "grad_norm": 0.41181057691574097,
      "learning_rate": 0.0002,
      "loss": 0.1487,
      "step": 25500
    },
    {
      "epoch": 6.59963908223769,
      "grad_norm": 0.3136518597602844,
      "learning_rate": 0.0002,
      "loss": 0.1414,
      "step": 25600
    },
    {
      "epoch": 6.625418922402681,
      "grad_norm": 0.4114777445793152,
      "learning_rate": 0.0002,
      "loss": 0.1434,
      "step": 25700
    },
    {
      "epoch": 6.651198762567672,
      "grad_norm": 0.17142866551876068,
      "learning_rate": 0.0002,
      "loss": 0.1411,
      "step": 25800
    },
    {
      "epoch": 6.676978602732663,
      "grad_norm": 0.5585296750068665,
      "learning_rate": 0.0002,
      "loss": 0.148,
      "step": 25900
    },
    {
      "epoch": 6.702758442897654,
      "grad_norm": 0.23773185908794403,
      "learning_rate": 0.0002,
      "loss": 0.1468,
      "step": 26000
    },
    {
      "epoch": 6.728538283062645,
      "grad_norm": 0.38246840238571167,
      "learning_rate": 0.0002,
      "loss": 0.1426,
      "step": 26100
    },
    {
      "epoch": 6.754318123227636,
      "grad_norm": 0.5393186807632446,
      "learning_rate": 0.0002,
      "loss": 0.1456,
      "step": 26200
    },
    {
      "epoch": 6.780097963392627,
      "grad_norm": 0.21433015167713165,
      "learning_rate": 0.0002,
      "loss": 0.1456,
      "step": 26300
    },
    {
      "epoch": 6.8058778035576175,
      "grad_norm": 0.4375258982181549,
      "learning_rate": 0.0002,
      "loss": 0.1461,
      "step": 26400
    },
    {
      "epoch": 6.831657643722609,
      "grad_norm": 0.515832781791687,
      "learning_rate": 0.0002,
      "loss": 0.1484,
      "step": 26500
    },
    {
      "epoch": 6.8574374838876,
      "grad_norm": 0.496559739112854,
      "learning_rate": 0.0002,
      "loss": 0.1461,
      "step": 26600
    },
    {
      "epoch": 6.883217324052591,
      "grad_norm": 0.30182015895843506,
      "learning_rate": 0.0002,
      "loss": 0.1471,
      "step": 26700
    },
    {
      "epoch": 6.908997164217582,
      "grad_norm": 0.3858971893787384,
      "learning_rate": 0.0002,
      "loss": 0.1469,
      "step": 26800
    },
    {
      "epoch": 6.934777004382573,
      "grad_norm": 0.30368533730506897,
      "learning_rate": 0.0002,
      "loss": 0.1466,
      "step": 26900
    },
    {
      "epoch": 6.960556844547564,
      "grad_norm": 0.29557520151138306,
      "learning_rate": 0.0002,
      "loss": 0.1446,
      "step": 27000
    },
    {
      "epoch": 6.986336684712555,
      "grad_norm": 0.34702664613723755,
      "learning_rate": 0.0002,
      "loss": 0.143,
      "step": 27100
    },
    {
      "epoch": 7.012116524877546,
      "grad_norm": 0.18182627856731415,
      "learning_rate": 0.0002,
      "loss": 0.1467,
      "step": 27200
    },
    {
      "epoch": 7.037896365042537,
      "grad_norm": 0.48641154170036316,
      "learning_rate": 0.0002,
      "loss": 0.1337,
      "step": 27300
    },
    {
      "epoch": 7.0636762052075275,
      "grad_norm": 0.5797538757324219,
      "learning_rate": 0.0002,
      "loss": 0.1291,
      "step": 27400
    },
    {
      "epoch": 7.089456045372518,
      "grad_norm": 0.20399855077266693,
      "learning_rate": 0.0002,
      "loss": 0.1372,
      "step": 27500
    },
    {
      "epoch": 7.115235885537509,
      "grad_norm": 0.12141354382038116,
      "learning_rate": 0.0002,
      "loss": 0.1359,
      "step": 27600
    },
    {
      "epoch": 7.1410157257025,
      "grad_norm": 0.13764117658138275,
      "learning_rate": 0.0002,
      "loss": 0.1276,
      "step": 27700
    },
    {
      "epoch": 7.166795565867492,
      "grad_norm": 0.21888123452663422,
      "learning_rate": 0.0002,
      "loss": 0.1337,
      "step": 27800
    },
    {
      "epoch": 7.192575406032483,
      "grad_norm": 0.1562834531068802,
      "learning_rate": 0.0002,
      "loss": 0.133,
      "step": 27900
    },
    {
      "epoch": 7.218355246197474,
      "grad_norm": 0.3367880880832672,
      "learning_rate": 0.0002,
      "loss": 0.1335,
      "step": 28000
    },
    {
      "epoch": 7.244135086362465,
      "grad_norm": 0.1075579896569252,
      "learning_rate": 0.0002,
      "loss": 0.1334,
      "step": 28100
    },
    {
      "epoch": 7.269914926527456,
      "grad_norm": 0.11283877491950989,
      "learning_rate": 0.0002,
      "loss": 0.1356,
      "step": 28200
    },
    {
      "epoch": 7.2956947666924465,
      "grad_norm": 0.24768362939357758,
      "learning_rate": 0.0002,
      "loss": 0.1374,
      "step": 28300
    },
    {
      "epoch": 7.3214746068574375,
      "grad_norm": 0.22776305675506592,
      "learning_rate": 0.0002,
      "loss": 0.1307,
      "step": 28400
    },
    {
      "epoch": 7.347254447022428,
      "grad_norm": 0.13827867805957794,
      "learning_rate": 0.0002,
      "loss": 0.1396,
      "step": 28500
    },
    {
      "epoch": 7.373034287187419,
      "grad_norm": 0.2935916781425476,
      "learning_rate": 0.0002,
      "loss": 0.1355,
      "step": 28600
    },
    {
      "epoch": 7.39881412735241,
      "grad_norm": 0.10991048812866211,
      "learning_rate": 0.0002,
      "loss": 0.1349,
      "step": 28700
    },
    {
      "epoch": 7.424593967517401,
      "grad_norm": 0.30149704217910767,
      "learning_rate": 0.0002,
      "loss": 0.1374,
      "step": 28800
    },
    {
      "epoch": 7.450373807682392,
      "grad_norm": 0.13918708264827728,
      "learning_rate": 0.0002,
      "loss": 0.141,
      "step": 28900
    },
    {
      "epoch": 7.476153647847383,
      "grad_norm": 0.13292869925498962,
      "learning_rate": 0.0002,
      "loss": 0.1386,
      "step": 29000
    },
    {
      "epoch": 7.501933488012375,
      "grad_norm": 0.5602275729179382,
      "learning_rate": 0.0002,
      "loss": 0.1421,
      "step": 29100
    },
    {
      "epoch": 7.527713328177366,
      "grad_norm": 0.12204320728778839,
      "learning_rate": 0.0002,
      "loss": 0.1334,
      "step": 29200
    },
    {
      "epoch": 7.5534931683423565,
      "grad_norm": 0.17424637079238892,
      "learning_rate": 0.0002,
      "loss": 0.1372,
      "step": 29300
    },
    {
      "epoch": 7.579273008507347,
      "grad_norm": 0.4190254509449005,
      "learning_rate": 0.0002,
      "loss": 0.1458,
      "step": 29400
    },
    {
      "epoch": 7.605052848672338,
      "grad_norm": 0.13242638111114502,
      "learning_rate": 0.0002,
      "loss": 0.1421,
      "step": 29500
    },
    {
      "epoch": 7.630832688837329,
      "grad_norm": 0.23242244124412537,
      "learning_rate": 0.0002,
      "loss": 0.1429,
      "step": 29600
    },
    {
      "epoch": 7.65661252900232,
      "grad_norm": 0.4323575794696808,
      "learning_rate": 0.0002,
      "loss": 0.1402,
      "step": 29700
    },
    {
      "epoch": 7.682392369167311,
      "grad_norm": 0.1595413088798523,
      "learning_rate": 0.0002,
      "loss": 0.1403,
      "step": 29800
    },
    {
      "epoch": 7.708172209332302,
      "grad_norm": 0.1448589414358139,
      "learning_rate": 0.0002,
      "loss": 0.136,
      "step": 29900
    },
    {
      "epoch": 7.733952049497293,
      "grad_norm": 0.5433810353279114,
      "learning_rate": 0.0002,
      "loss": 0.139,
      "step": 30000
    }
  ],
  "logging_steps": 100,
  "max_steps": 31032,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 3000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.186321886206116e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}