{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 525, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05730659025787966, "grad_norm": 2.0758858092387618, "learning_rate": 1.6981132075471698e-06, "loss": 0.5683, "step": 10 }, { "epoch": 0.11461318051575932, "grad_norm": 1.032745783560496, "learning_rate": 3.5849056603773586e-06, "loss": 0.511, "step": 20 }, { "epoch": 0.17191977077363896, "grad_norm": 0.6337965187055514, "learning_rate": 5.4716981132075475e-06, "loss": 0.4697, "step": 30 }, { "epoch": 0.22922636103151864, "grad_norm": 0.3341583249899249, "learning_rate": 7.358490566037736e-06, "loss": 0.4383, "step": 40 }, { "epoch": 0.28653295128939826, "grad_norm": 0.24105124040802722, "learning_rate": 9.245283018867926e-06, "loss": 0.415, "step": 50 }, { "epoch": 0.3438395415472779, "grad_norm": 0.1979988380418763, "learning_rate": 9.996013419506035e-06, "loss": 0.4052, "step": 60 }, { "epoch": 0.40114613180515757, "grad_norm": 0.2042121825495575, "learning_rate": 9.971674001050687e-06, "loss": 0.4027, "step": 70 }, { "epoch": 0.4584527220630373, "grad_norm": 0.20586101864992135, "learning_rate": 9.925317587058516e-06, "loss": 0.3908, "step": 80 }, { "epoch": 0.5157593123209169, "grad_norm": 0.19753266562175947, "learning_rate": 9.85714946632355e-06, "loss": 0.3824, "step": 90 }, { "epoch": 0.5730659025787965, "grad_norm": 0.2014408335897292, "learning_rate": 9.767471520507713e-06, "loss": 0.3843, "step": 100 }, { "epoch": 0.6303724928366762, "grad_norm": 0.21455878509840362, "learning_rate": 9.656680887261693e-06, "loss": 0.3797, "step": 110 }, { "epoch": 0.6876790830945558, "grad_norm": 0.19677946400066818, "learning_rate": 9.52526820150588e-06, "loss": 0.3763, "step": 120 }, { "epoch": 0.7449856733524355, "grad_norm": 0.18157944977618248, "learning_rate": 9.373815422659806e-06, "loss": 0.3756, "step": 130 }, { "epoch": 0.8022922636103151, "grad_norm": 0.20079167101829795, "learning_rate": 9.202993257442216e-06, "loss": 0.3735, "step": 140 }, { "epoch": 0.8595988538681948, "grad_norm": 0.22524995002273018, "learning_rate": 9.013558189654819e-06, "loss": 0.3704, "step": 150 }, { "epoch": 0.9169054441260746, "grad_norm": 0.20366626027514875, "learning_rate": 8.806349130103334e-06, "loss": 0.3649, "step": 160 }, { "epoch": 0.9742120343839542, "grad_norm": 0.20483109220703685, "learning_rate": 8.582283701491576e-06, "loss": 0.3726, "step": 170 }, { "epoch": 1.0286532951289398, "grad_norm": 0.1861813989832312, "learning_rate": 8.342354174740904e-06, "loss": 0.3613, "step": 180 }, { "epoch": 1.0859598853868195, "grad_norm": 0.18944792927613582, "learning_rate": 8.08762307473096e-06, "loss": 0.3477, "step": 190 }, { "epoch": 1.143266475644699, "grad_norm": 0.19241082013181077, "learning_rate": 7.81921847492168e-06, "loss": 0.3528, "step": 200 }, { "epoch": 1.2005730659025788, "grad_norm": 0.2154742217299199, "learning_rate": 7.5383290016942e-06, "loss": 0.351, "step": 210 }, { "epoch": 1.2578796561604584, "grad_norm": 0.20901304048619337, "learning_rate": 7.246198570533944e-06, "loss": 0.351, "step": 220 }, { "epoch": 1.3151862464183381, "grad_norm": 0.19073549086926014, "learning_rate": 6.944120877366605e-06, "loss": 0.3557, "step": 230 }, { "epoch": 1.3724928366762177, "grad_norm": 0.17646170009433357, "learning_rate": 6.633433669442066e-06, "loss": 0.35, "step": 240 }, { "epoch": 1.4297994269340975, "grad_norm": 0.19599623413727513, "learning_rate": 6.315512821137606e-06, "loss": 0.3473, "step": 250 }, { "epoch": 1.487106017191977, "grad_norm": 0.1780821708268571, "learning_rate": 5.9917662409155896e-06, "loss": 0.3516, "step": 260 }, { "epoch": 1.5444126074498568, "grad_norm": 0.17861634863865428, "learning_rate": 5.663627636418611e-06, "loss": 0.3501, "step": 270 }, { "epoch": 1.6017191977077365, "grad_norm": 0.19324138154633005, "learning_rate": 5.332550165313312e-06, "loss": 0.3482, "step": 280 }, { "epoch": 1.659025787965616, "grad_norm": 0.1756126120758286, "learning_rate": 5e-06, "loss": 0.3438, "step": 290 }, { "epoch": 1.7163323782234956, "grad_norm": 0.17188834689286137, "learning_rate": 4.667449834686689e-06, "loss": 0.3452, "step": 300 }, { "epoch": 1.7736389684813754, "grad_norm": 0.1837742901594931, "learning_rate": 4.336372363581391e-06, "loss": 0.3473, "step": 310 }, { "epoch": 1.8309455587392551, "grad_norm": 0.1672330512230618, "learning_rate": 4.00823375908441e-06, "loss": 0.3497, "step": 320 }, { "epoch": 1.8882521489971347, "grad_norm": 0.17943679283278077, "learning_rate": 3.6844871788623946e-06, "loss": 0.3422, "step": 330 }, { "epoch": 1.9455587392550142, "grad_norm": 0.17561326371630695, "learning_rate": 3.366566330557935e-06, "loss": 0.3434, "step": 340 }, { "epoch": 2.0, "grad_norm": 0.21808711950541695, "learning_rate": 3.0558791226333974e-06, "loss": 0.3411, "step": 350 }, { "epoch": 2.0573065902578795, "grad_norm": 0.16805885653283237, "learning_rate": 2.7538014294660564e-06, "loss": 0.3338, "step": 360 }, { "epoch": 2.1146131805157595, "grad_norm": 0.1571105011704873, "learning_rate": 2.461670998305802e-06, "loss": 0.3365, "step": 370 }, { "epoch": 2.171919770773639, "grad_norm": 0.15487232581993202, "learning_rate": 2.1807815250783194e-06, "loss": 0.3266, "step": 380 }, { "epoch": 2.2292263610315186, "grad_norm": 0.15903611974395213, "learning_rate": 1.912376925269041e-06, "loss": 0.3306, "step": 390 }, { "epoch": 2.286532951289398, "grad_norm": 0.1602582517852452, "learning_rate": 1.6576458252590988e-06, "loss": 0.3338, "step": 400 }, { "epoch": 2.343839541547278, "grad_norm": 0.153892591517447, "learning_rate": 1.4177162985084242e-06, "loss": 0.3391, "step": 410 }, { "epoch": 2.4011461318051577, "grad_norm": 0.1670245531773294, "learning_rate": 1.1936508698966664e-06, "loss": 0.3368, "step": 420 }, { "epoch": 2.458452722063037, "grad_norm": 0.15231997853829518, "learning_rate": 9.86441810345183e-07, "loss": 0.3366, "step": 430 }, { "epoch": 2.5157593123209168, "grad_norm": 0.16138543102686964, "learning_rate": 7.970067425577849e-07, "loss": 0.3345, "step": 440 }, { "epoch": 2.5730659025787963, "grad_norm": 0.16815491218506493, "learning_rate": 6.261845773401936e-07, "loss": 0.3308, "step": 450 }, { "epoch": 2.6303724928366763, "grad_norm": 0.1645142556469246, "learning_rate": 4.747317984941213e-07, "loss": 0.3291, "step": 460 }, { "epoch": 2.687679083094556, "grad_norm": 0.1554646689171431, "learning_rate": 3.433191127383079e-07, "loss": 0.3341, "step": 470 }, { "epoch": 2.7449856733524354, "grad_norm": 0.14919476246429758, "learning_rate": 2.325284794922883e-07, "loss": 0.3337, "step": 480 }, { "epoch": 2.8022922636103154, "grad_norm": 0.1381795881550593, "learning_rate": 1.4285053367645074e-07, "loss": 0.3353, "step": 490 }, { "epoch": 2.859598853868195, "grad_norm": 0.1417249974490203, "learning_rate": 7.468241294148471e-08, "loss": 0.3307, "step": 500 }, { "epoch": 2.9169054441260744, "grad_norm": 0.14380681025290207, "learning_rate": 2.8325998949314536e-08, "loss": 0.3309, "step": 510 }, { "epoch": 2.974212034383954, "grad_norm": 0.14738174330263265, "learning_rate": 3.9865804939659414e-09, "loss": 0.3333, "step": 520 } ], "logging_steps": 10, "max_steps": 525, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5069159780057088.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }