{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 525,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05730659025787966,
"grad_norm": 2.0758858092387618,
"learning_rate": 1.6981132075471698e-06,
"loss": 0.5683,
"step": 10
},
{
"epoch": 0.11461318051575932,
"grad_norm": 1.032745783560496,
"learning_rate": 3.5849056603773586e-06,
"loss": 0.511,
"step": 20
},
{
"epoch": 0.17191977077363896,
"grad_norm": 0.6337965187055514,
"learning_rate": 5.4716981132075475e-06,
"loss": 0.4697,
"step": 30
},
{
"epoch": 0.22922636103151864,
"grad_norm": 0.3341583249899249,
"learning_rate": 7.358490566037736e-06,
"loss": 0.4383,
"step": 40
},
{
"epoch": 0.28653295128939826,
"grad_norm": 0.24105124040802722,
"learning_rate": 9.245283018867926e-06,
"loss": 0.415,
"step": 50
},
{
"epoch": 0.3438395415472779,
"grad_norm": 0.1979988380418763,
"learning_rate": 9.996013419506035e-06,
"loss": 0.4052,
"step": 60
},
{
"epoch": 0.40114613180515757,
"grad_norm": 0.2042121825495575,
"learning_rate": 9.971674001050687e-06,
"loss": 0.4027,
"step": 70
},
{
"epoch": 0.4584527220630373,
"grad_norm": 0.20586101864992135,
"learning_rate": 9.925317587058516e-06,
"loss": 0.3908,
"step": 80
},
{
"epoch": 0.5157593123209169,
"grad_norm": 0.19753266562175947,
"learning_rate": 9.85714946632355e-06,
"loss": 0.3824,
"step": 90
},
{
"epoch": 0.5730659025787965,
"grad_norm": 0.2014408335897292,
"learning_rate": 9.767471520507713e-06,
"loss": 0.3843,
"step": 100
},
{
"epoch": 0.6303724928366762,
"grad_norm": 0.21455878509840362,
"learning_rate": 9.656680887261693e-06,
"loss": 0.3797,
"step": 110
},
{
"epoch": 0.6876790830945558,
"grad_norm": 0.19677946400066818,
"learning_rate": 9.52526820150588e-06,
"loss": 0.3763,
"step": 120
},
{
"epoch": 0.7449856733524355,
"grad_norm": 0.18157944977618248,
"learning_rate": 9.373815422659806e-06,
"loss": 0.3756,
"step": 130
},
{
"epoch": 0.8022922636103151,
"grad_norm": 0.20079167101829795,
"learning_rate": 9.202993257442216e-06,
"loss": 0.3735,
"step": 140
},
{
"epoch": 0.8595988538681948,
"grad_norm": 0.22524995002273018,
"learning_rate": 9.013558189654819e-06,
"loss": 0.3704,
"step": 150
},
{
"epoch": 0.9169054441260746,
"grad_norm": 0.20366626027514875,
"learning_rate": 8.806349130103334e-06,
"loss": 0.3649,
"step": 160
},
{
"epoch": 0.9742120343839542,
"grad_norm": 0.20483109220703685,
"learning_rate": 8.582283701491576e-06,
"loss": 0.3726,
"step": 170
},
{
"epoch": 1.0286532951289398,
"grad_norm": 0.1861813989832312,
"learning_rate": 8.342354174740904e-06,
"loss": 0.3613,
"step": 180
},
{
"epoch": 1.0859598853868195,
"grad_norm": 0.18944792927613582,
"learning_rate": 8.08762307473096e-06,
"loss": 0.3477,
"step": 190
},
{
"epoch": 1.143266475644699,
"grad_norm": 0.19241082013181077,
"learning_rate": 7.81921847492168e-06,
"loss": 0.3528,
"step": 200
},
{
"epoch": 1.2005730659025788,
"grad_norm": 0.2154742217299199,
"learning_rate": 7.5383290016942e-06,
"loss": 0.351,
"step": 210
},
{
"epoch": 1.2578796561604584,
"grad_norm": 0.20901304048619337,
"learning_rate": 7.246198570533944e-06,
"loss": 0.351,
"step": 220
},
{
"epoch": 1.3151862464183381,
"grad_norm": 0.19073549086926014,
"learning_rate": 6.944120877366605e-06,
"loss": 0.3557,
"step": 230
},
{
"epoch": 1.3724928366762177,
"grad_norm": 0.17646170009433357,
"learning_rate": 6.633433669442066e-06,
"loss": 0.35,
"step": 240
},
{
"epoch": 1.4297994269340975,
"grad_norm": 0.19599623413727513,
"learning_rate": 6.315512821137606e-06,
"loss": 0.3473,
"step": 250
},
{
"epoch": 1.487106017191977,
"grad_norm": 0.1780821708268571,
"learning_rate": 5.9917662409155896e-06,
"loss": 0.3516,
"step": 260
},
{
"epoch": 1.5444126074498568,
"grad_norm": 0.17861634863865428,
"learning_rate": 5.663627636418611e-06,
"loss": 0.3501,
"step": 270
},
{
"epoch": 1.6017191977077365,
"grad_norm": 0.19324138154633005,
"learning_rate": 5.332550165313312e-06,
"loss": 0.3482,
"step": 280
},
{
"epoch": 1.659025787965616,
"grad_norm": 0.1756126120758286,
"learning_rate": 5e-06,
"loss": 0.3438,
"step": 290
},
{
"epoch": 1.7163323782234956,
"grad_norm": 0.17188834689286137,
"learning_rate": 4.667449834686689e-06,
"loss": 0.3452,
"step": 300
},
{
"epoch": 1.7736389684813754,
"grad_norm": 0.1837742901594931,
"learning_rate": 4.336372363581391e-06,
"loss": 0.3473,
"step": 310
},
{
"epoch": 1.8309455587392551,
"grad_norm": 0.1672330512230618,
"learning_rate": 4.00823375908441e-06,
"loss": 0.3497,
"step": 320
},
{
"epoch": 1.8882521489971347,
"grad_norm": 0.17943679283278077,
"learning_rate": 3.6844871788623946e-06,
"loss": 0.3422,
"step": 330
},
{
"epoch": 1.9455587392550142,
"grad_norm": 0.17561326371630695,
"learning_rate": 3.366566330557935e-06,
"loss": 0.3434,
"step": 340
},
{
"epoch": 2.0,
"grad_norm": 0.21808711950541695,
"learning_rate": 3.0558791226333974e-06,
"loss": 0.3411,
"step": 350
},
{
"epoch": 2.0573065902578795,
"grad_norm": 0.16805885653283237,
"learning_rate": 2.7538014294660564e-06,
"loss": 0.3338,
"step": 360
},
{
"epoch": 2.1146131805157595,
"grad_norm": 0.1571105011704873,
"learning_rate": 2.461670998305802e-06,
"loss": 0.3365,
"step": 370
},
{
"epoch": 2.171919770773639,
"grad_norm": 0.15487232581993202,
"learning_rate": 2.1807815250783194e-06,
"loss": 0.3266,
"step": 380
},
{
"epoch": 2.2292263610315186,
"grad_norm": 0.15903611974395213,
"learning_rate": 1.912376925269041e-06,
"loss": 0.3306,
"step": 390
},
{
"epoch": 2.286532951289398,
"grad_norm": 0.1602582517852452,
"learning_rate": 1.6576458252590988e-06,
"loss": 0.3338,
"step": 400
},
{
"epoch": 2.343839541547278,
"grad_norm": 0.153892591517447,
"learning_rate": 1.4177162985084242e-06,
"loss": 0.3391,
"step": 410
},
{
"epoch": 2.4011461318051577,
"grad_norm": 0.1670245531773294,
"learning_rate": 1.1936508698966664e-06,
"loss": 0.3368,
"step": 420
},
{
"epoch": 2.458452722063037,
"grad_norm": 0.15231997853829518,
"learning_rate": 9.86441810345183e-07,
"loss": 0.3366,
"step": 430
},
{
"epoch": 2.5157593123209168,
"grad_norm": 0.16138543102686964,
"learning_rate": 7.970067425577849e-07,
"loss": 0.3345,
"step": 440
},
{
"epoch": 2.5730659025787963,
"grad_norm": 0.16815491218506493,
"learning_rate": 6.261845773401936e-07,
"loss": 0.3308,
"step": 450
},
{
"epoch": 2.6303724928366763,
"grad_norm": 0.1645142556469246,
"learning_rate": 4.747317984941213e-07,
"loss": 0.3291,
"step": 460
},
{
"epoch": 2.687679083094556,
"grad_norm": 0.1554646689171431,
"learning_rate": 3.433191127383079e-07,
"loss": 0.3341,
"step": 470
},
{
"epoch": 2.7449856733524354,
"grad_norm": 0.14919476246429758,
"learning_rate": 2.325284794922883e-07,
"loss": 0.3337,
"step": 480
},
{
"epoch": 2.8022922636103154,
"grad_norm": 0.1381795881550593,
"learning_rate": 1.4285053367645074e-07,
"loss": 0.3353,
"step": 490
},
{
"epoch": 2.859598853868195,
"grad_norm": 0.1417249974490203,
"learning_rate": 7.468241294148471e-08,
"loss": 0.3307,
"step": 500
},
{
"epoch": 2.9169054441260744,
"grad_norm": 0.14380681025290207,
"learning_rate": 2.8325998949314536e-08,
"loss": 0.3309,
"step": 510
},
{
"epoch": 2.974212034383954,
"grad_norm": 0.14738174330263265,
"learning_rate": 3.9865804939659414e-09,
"loss": 0.3333,
"step": 520
}
],
"logging_steps": 10,
"max_steps": 525,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5069159780057088.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}