{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 525,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05730659025787966,
      "grad_norm": 2.0758858092387618,
      "learning_rate": 1.6981132075471698e-06,
      "loss": 0.5683,
      "step": 10
    },
    {
      "epoch": 0.11461318051575932,
      "grad_norm": 1.032745783560496,
      "learning_rate": 3.5849056603773586e-06,
      "loss": 0.511,
      "step": 20
    },
    {
      "epoch": 0.17191977077363896,
      "grad_norm": 0.6337965187055514,
      "learning_rate": 5.4716981132075475e-06,
      "loss": 0.4697,
      "step": 30
    },
    {
      "epoch": 0.22922636103151864,
      "grad_norm": 0.3341583249899249,
      "learning_rate": 7.358490566037736e-06,
      "loss": 0.4383,
      "step": 40
    },
    {
      "epoch": 0.28653295128939826,
      "grad_norm": 0.24105124040802722,
      "learning_rate": 9.245283018867926e-06,
      "loss": 0.415,
      "step": 50
    },
    {
      "epoch": 0.3438395415472779,
      "grad_norm": 0.1979988380418763,
      "learning_rate": 9.996013419506035e-06,
      "loss": 0.4052,
      "step": 60
    },
    {
      "epoch": 0.40114613180515757,
      "grad_norm": 0.2042121825495575,
      "learning_rate": 9.971674001050687e-06,
      "loss": 0.4027,
      "step": 70
    },
    {
      "epoch": 0.4584527220630373,
      "grad_norm": 0.20586101864992135,
      "learning_rate": 9.925317587058516e-06,
      "loss": 0.3908,
      "step": 80
    },
    {
      "epoch": 0.5157593123209169,
      "grad_norm": 0.19753266562175947,
      "learning_rate": 9.85714946632355e-06,
      "loss": 0.3824,
      "step": 90
    },
    {
      "epoch": 0.5730659025787965,
      "grad_norm": 0.2014408335897292,
      "learning_rate": 9.767471520507713e-06,
      "loss": 0.3843,
      "step": 100
    },
    {
      "epoch": 0.6303724928366762,
      "grad_norm": 0.21455878509840362,
      "learning_rate": 9.656680887261693e-06,
      "loss": 0.3797,
      "step": 110
    },
    {
      "epoch": 0.6876790830945558,
      "grad_norm": 0.19677946400066818,
      "learning_rate": 9.52526820150588e-06,
      "loss": 0.3763,
      "step": 120
    },
    {
      "epoch": 0.7449856733524355,
      "grad_norm": 0.18157944977618248,
      "learning_rate": 9.373815422659806e-06,
      "loss": 0.3756,
      "step": 130
    },
    {
      "epoch": 0.8022922636103151,
      "grad_norm": 0.20079167101829795,
      "learning_rate": 9.202993257442216e-06,
      "loss": 0.3735,
      "step": 140
    },
    {
      "epoch": 0.8595988538681948,
      "grad_norm": 0.22524995002273018,
      "learning_rate": 9.013558189654819e-06,
      "loss": 0.3704,
      "step": 150
    },
    {
      "epoch": 0.9169054441260746,
      "grad_norm": 0.20366626027514875,
      "learning_rate": 8.806349130103334e-06,
      "loss": 0.3649,
      "step": 160
    },
    {
      "epoch": 0.9742120343839542,
      "grad_norm": 0.20483109220703685,
      "learning_rate": 8.582283701491576e-06,
      "loss": 0.3726,
      "step": 170
    },
    {
      "epoch": 1.0286532951289398,
      "grad_norm": 0.1861813989832312,
      "learning_rate": 8.342354174740904e-06,
      "loss": 0.3613,
      "step": 180
    },
    {
      "epoch": 1.0859598853868195,
      "grad_norm": 0.18944792927613582,
      "learning_rate": 8.08762307473096e-06,
      "loss": 0.3477,
      "step": 190
    },
    {
      "epoch": 1.143266475644699,
      "grad_norm": 0.19241082013181077,
      "learning_rate": 7.81921847492168e-06,
      "loss": 0.3528,
      "step": 200
    },
    {
      "epoch": 1.2005730659025788,
      "grad_norm": 0.2154742217299199,
      "learning_rate": 7.5383290016942e-06,
      "loss": 0.351,
      "step": 210
    },
    {
      "epoch": 1.2578796561604584,
      "grad_norm": 0.20901304048619337,
      "learning_rate": 7.246198570533944e-06,
      "loss": 0.351,
      "step": 220
    },
    {
      "epoch": 1.3151862464183381,
      "grad_norm": 0.19073549086926014,
      "learning_rate": 6.944120877366605e-06,
      "loss": 0.3557,
      "step": 230
    },
    {
      "epoch": 1.3724928366762177,
      "grad_norm": 0.17646170009433357,
      "learning_rate": 6.633433669442066e-06,
      "loss": 0.35,
      "step": 240
    },
    {
      "epoch": 1.4297994269340975,
      "grad_norm": 0.19599623413727513,
      "learning_rate": 6.315512821137606e-06,
      "loss": 0.3473,
      "step": 250
    },
    {
      "epoch": 1.487106017191977,
      "grad_norm": 0.1780821708268571,
      "learning_rate": 5.9917662409155896e-06,
      "loss": 0.3516,
      "step": 260
    },
    {
      "epoch": 1.5444126074498568,
      "grad_norm": 0.17861634863865428,
      "learning_rate": 5.663627636418611e-06,
      "loss": 0.3501,
      "step": 270
    },
    {
      "epoch": 1.6017191977077365,
      "grad_norm": 0.19324138154633005,
      "learning_rate": 5.332550165313312e-06,
      "loss": 0.3482,
      "step": 280
    },
    {
      "epoch": 1.659025787965616,
      "grad_norm": 0.1756126120758286,
      "learning_rate": 5e-06,
      "loss": 0.3438,
      "step": 290
    },
    {
      "epoch": 1.7163323782234956,
      "grad_norm": 0.17188834689286137,
      "learning_rate": 4.667449834686689e-06,
      "loss": 0.3452,
      "step": 300
    },
    {
      "epoch": 1.7736389684813754,
      "grad_norm": 0.1837742901594931,
      "learning_rate": 4.336372363581391e-06,
      "loss": 0.3473,
      "step": 310
    },
    {
      "epoch": 1.8309455587392551,
      "grad_norm": 0.1672330512230618,
      "learning_rate": 4.00823375908441e-06,
      "loss": 0.3497,
      "step": 320
    },
    {
      "epoch": 1.8882521489971347,
      "grad_norm": 0.17943679283278077,
      "learning_rate": 3.6844871788623946e-06,
      "loss": 0.3422,
      "step": 330
    },
    {
      "epoch": 1.9455587392550142,
      "grad_norm": 0.17561326371630695,
      "learning_rate": 3.366566330557935e-06,
      "loss": 0.3434,
      "step": 340
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.21808711950541695,
      "learning_rate": 3.0558791226333974e-06,
      "loss": 0.3411,
      "step": 350
    },
    {
      "epoch": 2.0573065902578795,
      "grad_norm": 0.16805885653283237,
      "learning_rate": 2.7538014294660564e-06,
      "loss": 0.3338,
      "step": 360
    },
    {
      "epoch": 2.1146131805157595,
      "grad_norm": 0.1571105011704873,
      "learning_rate": 2.461670998305802e-06,
      "loss": 0.3365,
      "step": 370
    },
    {
      "epoch": 2.171919770773639,
      "grad_norm": 0.15487232581993202,
      "learning_rate": 2.1807815250783194e-06,
      "loss": 0.3266,
      "step": 380
    },
    {
      "epoch": 2.2292263610315186,
      "grad_norm": 0.15903611974395213,
      "learning_rate": 1.912376925269041e-06,
      "loss": 0.3306,
      "step": 390
    },
    {
      "epoch": 2.286532951289398,
      "grad_norm": 0.1602582517852452,
      "learning_rate": 1.6576458252590988e-06,
      "loss": 0.3338,
      "step": 400
    },
    {
      "epoch": 2.343839541547278,
      "grad_norm": 0.153892591517447,
      "learning_rate": 1.4177162985084242e-06,
      "loss": 0.3391,
      "step": 410
    },
    {
      "epoch": 2.4011461318051577,
      "grad_norm": 0.1670245531773294,
      "learning_rate": 1.1936508698966664e-06,
      "loss": 0.3368,
      "step": 420
    },
    {
      "epoch": 2.458452722063037,
      "grad_norm": 0.15231997853829518,
      "learning_rate": 9.86441810345183e-07,
      "loss": 0.3366,
      "step": 430
    },
    {
      "epoch": 2.5157593123209168,
      "grad_norm": 0.16138543102686964,
      "learning_rate": 7.970067425577849e-07,
      "loss": 0.3345,
      "step": 440
    },
    {
      "epoch": 2.5730659025787963,
      "grad_norm": 0.16815491218506493,
      "learning_rate": 6.261845773401936e-07,
      "loss": 0.3308,
      "step": 450
    },
    {
      "epoch": 2.6303724928366763,
      "grad_norm": 0.1645142556469246,
      "learning_rate": 4.747317984941213e-07,
      "loss": 0.3291,
      "step": 460
    },
    {
      "epoch": 2.687679083094556,
      "grad_norm": 0.1554646689171431,
      "learning_rate": 3.433191127383079e-07,
      "loss": 0.3341,
      "step": 470
    },
    {
      "epoch": 2.7449856733524354,
      "grad_norm": 0.14919476246429758,
      "learning_rate": 2.325284794922883e-07,
      "loss": 0.3337,
      "step": 480
    },
    {
      "epoch": 2.8022922636103154,
      "grad_norm": 0.1381795881550593,
      "learning_rate": 1.4285053367645074e-07,
      "loss": 0.3353,
      "step": 490
    },
    {
      "epoch": 2.859598853868195,
      "grad_norm": 0.1417249974490203,
      "learning_rate": 7.468241294148471e-08,
      "loss": 0.3307,
      "step": 500
    },
    {
      "epoch": 2.9169054441260744,
      "grad_norm": 0.14380681025290207,
      "learning_rate": 2.8325998949314536e-08,
      "loss": 0.3309,
      "step": 510
    },
    {
      "epoch": 2.974212034383954,
      "grad_norm": 0.14738174330263265,
      "learning_rate": 3.9865804939659414e-09,
      "loss": 0.3333,
      "step": 520
    }
  ],
  "logging_steps": 10,
  "max_steps": 525,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5069159780057088.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}