{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1047,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02865329512893983,
"grad_norm": 2.6186984732280814,
"learning_rate": 8.571428571428572e-07,
"loss": 0.5731,
"step": 10
},
{
"epoch": 0.05730659025787966,
"grad_norm": 1.0232811311533683,
"learning_rate": 1.8095238095238097e-06,
"loss": 0.5412,
"step": 20
},
{
"epoch": 0.08595988538681948,
"grad_norm": 0.9701094427215733,
"learning_rate": 2.7619047619047625e-06,
"loss": 0.4937,
"step": 30
},
{
"epoch": 0.11461318051575932,
"grad_norm": 0.5323948276277348,
"learning_rate": 3.7142857142857146e-06,
"loss": 0.4582,
"step": 40
},
{
"epoch": 0.14326647564469913,
"grad_norm": 0.36885209030841964,
"learning_rate": 4.666666666666667e-06,
"loss": 0.4392,
"step": 50
},
{
"epoch": 0.17191977077363896,
"grad_norm": 0.27190324169109104,
"learning_rate": 5.619047619047619e-06,
"loss": 0.4255,
"step": 60
},
{
"epoch": 0.20057306590257878,
"grad_norm": 0.284362183163526,
"learning_rate": 6.571428571428572e-06,
"loss": 0.42,
"step": 70
},
{
"epoch": 0.22922636103151864,
"grad_norm": 0.2415087591622169,
"learning_rate": 7.523809523809524e-06,
"loss": 0.4006,
"step": 80
},
{
"epoch": 0.25787965616045844,
"grad_norm": 0.2832959959768393,
"learning_rate": 8.476190476190477e-06,
"loss": 0.3976,
"step": 90
},
{
"epoch": 0.28653295128939826,
"grad_norm": 0.2819249571240325,
"learning_rate": 9.42857142857143e-06,
"loss": 0.3887,
"step": 100
},
{
"epoch": 0.3151862464183381,
"grad_norm": 0.26289638357241263,
"learning_rate": 9.999555111181558e-06,
"loss": 0.3914,
"step": 110
},
{
"epoch": 0.3438395415472779,
"grad_norm": 0.31346359091894765,
"learning_rate": 9.994551021152415e-06,
"loss": 0.3847,
"step": 120
},
{
"epoch": 0.37249283667621774,
"grad_norm": 0.25447340280629915,
"learning_rate": 9.983992313852776e-06,
"loss": 0.3883,
"step": 130
},
{
"epoch": 0.40114613180515757,
"grad_norm": 0.2713615676238601,
"learning_rate": 9.967890731995383e-06,
"loss": 0.3885,
"step": 140
},
{
"epoch": 0.4297994269340974,
"grad_norm": 0.29785890393022935,
"learning_rate": 9.946264182720295e-06,
"loss": 0.3811,
"step": 150
},
{
"epoch": 0.4584527220630373,
"grad_norm": 0.31068169293825476,
"learning_rate": 9.919136717679723e-06,
"loss": 0.3755,
"step": 160
},
{
"epoch": 0.4871060171919771,
"grad_norm": 0.291914065891215,
"learning_rate": 9.88653850628933e-06,
"loss": 0.3727,
"step": 170
},
{
"epoch": 0.5157593123209169,
"grad_norm": 0.28645514791771215,
"learning_rate": 9.848505802175762e-06,
"loss": 0.3694,
"step": 180
},
{
"epoch": 0.5444126074498568,
"grad_norm": 0.26191461156518464,
"learning_rate": 9.8050809028577e-06,
"loss": 0.3704,
"step": 190
},
{
"epoch": 0.5730659025787965,
"grad_norm": 0.2782451749750521,
"learning_rate": 9.756312102705284e-06,
"loss": 0.3766,
"step": 200
},
{
"epoch": 0.6017191977077364,
"grad_norm": 0.31746617801856314,
"learning_rate": 9.702253639230246e-06,
"loss": 0.3639,
"step": 210
},
{
"epoch": 0.6303724928366762,
"grad_norm": 0.3104703712769699,
"learning_rate": 9.642965632766437e-06,
"loss": 0.3749,
"step": 220
},
{
"epoch": 0.6590257879656161,
"grad_norm": 0.25481045697197613,
"learning_rate": 9.57851401960788e-06,
"loss": 0.3691,
"step": 230
},
{
"epoch": 0.6876790830945558,
"grad_norm": 0.2569207157034273,
"learning_rate": 9.508970478678676e-06,
"loss": 0.364,
"step": 240
},
{
"epoch": 0.7163323782234957,
"grad_norm": 0.3010191468789677,
"learning_rate": 9.434412351816329e-06,
"loss": 0.3699,
"step": 250
},
{
"epoch": 0.7449856733524355,
"grad_norm": 0.2845353099007207,
"learning_rate": 9.354922557757153e-06,
"loss": 0.3626,
"step": 260
},
{
"epoch": 0.7736389684813754,
"grad_norm": 0.2583262736091065,
"learning_rate": 9.270589499919405e-06,
"loss": 0.367,
"step": 270
},
{
"epoch": 0.8022922636103151,
"grad_norm": 0.2734910827357129,
"learning_rate": 9.181506968086696e-06,
"loss": 0.3616,
"step": 280
},
{
"epoch": 0.830945558739255,
"grad_norm": 0.3163373871358028,
"learning_rate": 9.087774034101069e-06,
"loss": 0.3603,
"step": 290
},
{
"epoch": 0.8595988538681948,
"grad_norm": 0.27019612276333577,
"learning_rate": 8.989494941681672e-06,
"loss": 0.3625,
"step": 300
},
{
"epoch": 0.8882521489971347,
"grad_norm": 0.26011182825983586,
"learning_rate": 8.886778990491632e-06,
"loss": 0.3571,
"step": 310
},
{
"epoch": 0.9169054441260746,
"grad_norm": 0.2602289671922979,
"learning_rate": 8.77974041458202e-06,
"loss": 0.3551,
"step": 320
},
{
"epoch": 0.9455587392550143,
"grad_norm": 0.28489190685342874,
"learning_rate": 8.668498255348119e-06,
"loss": 0.3651,
"step": 330
},
{
"epoch": 0.9742120343839542,
"grad_norm": 0.2907841661352505,
"learning_rate": 8.553176229139262e-06,
"loss": 0.3621,
"step": 340
},
{
"epoch": 1.002865329512894,
"grad_norm": 0.29410444361118165,
"learning_rate": 8.433902589669489e-06,
"loss": 0.3526,
"step": 350
},
{
"epoch": 1.0315186246418337,
"grad_norm": 0.2629411999849927,
"learning_rate": 8.310809985382059e-06,
"loss": 0.3428,
"step": 360
},
{
"epoch": 1.0601719197707737,
"grad_norm": 0.2664030042868273,
"learning_rate": 8.184035311926397e-06,
"loss": 0.3352,
"step": 370
},
{
"epoch": 1.0888252148997135,
"grad_norm": 0.27448456132450433,
"learning_rate": 8.053719559911605e-06,
"loss": 0.3313,
"step": 380
},
{
"epoch": 1.1174785100286533,
"grad_norm": 0.280680513900802,
"learning_rate": 7.92000765810579e-06,
"loss": 0.3372,
"step": 390
},
{
"epoch": 1.146131805157593,
"grad_norm": 0.2696702891136822,
"learning_rate": 7.783048312255653e-06,
"loss": 0.3418,
"step": 400
},
{
"epoch": 1.174785100286533,
"grad_norm": 0.31145780582148586,
"learning_rate": 7.642993839705557e-06,
"loss": 0.3374,
"step": 410
},
{
"epoch": 1.2034383954154728,
"grad_norm": 0.2869659364327292,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3363,
"step": 420
},
{
"epoch": 1.2320916905444126,
"grad_norm": 0.25325335643256375,
"learning_rate": 7.3542258216579136e-06,
"loss": 0.3316,
"step": 430
},
{
"epoch": 1.2607449856733524,
"grad_norm": 0.2772339994096704,
"learning_rate": 7.205833425311394e-06,
"loss": 0.3436,
"step": 440
},
{
"epoch": 1.2893982808022924,
"grad_norm": 0.29278050839602937,
"learning_rate": 7.0549878434056155e-06,
"loss": 0.3406,
"step": 450
},
{
"epoch": 1.3180515759312321,
"grad_norm": 0.28430937748006735,
"learning_rate": 6.901856836660386e-06,
"loss": 0.3432,
"step": 460
},
{
"epoch": 1.346704871060172,
"grad_norm": 0.2803846351745633,
"learning_rate": 6.746610707497511e-06,
"loss": 0.34,
"step": 470
},
{
"epoch": 1.3753581661891117,
"grad_norm": 0.27906165695648083,
"learning_rate": 6.58942211064142e-06,
"loss": 0.3353,
"step": 480
},
{
"epoch": 1.4040114613180517,
"grad_norm": 0.28984790937034516,
"learning_rate": 6.43046586110374e-06,
"loss": 0.3309,
"step": 490
},
{
"epoch": 1.4326647564469914,
"grad_norm": 0.23182165178077277,
"learning_rate": 6.269918739765313e-06,
"loss": 0.3355,
"step": 500
},
{
"epoch": 1.4613180515759312,
"grad_norm": 0.2488774130896262,
"learning_rate": 6.107959296771915e-06,
"loss": 0.3329,
"step": 510
},
{
"epoch": 1.4899713467048712,
"grad_norm": 0.24874727627825863,
"learning_rate": 5.944767652962309e-06,
"loss": 0.3438,
"step": 520
},
{
"epoch": 1.518624641833811,
"grad_norm": 0.2911542320121491,
"learning_rate": 5.780525299549473e-06,
"loss": 0.3359,
"step": 530
},
{
"epoch": 1.5472779369627507,
"grad_norm": 0.28119928239607705,
"learning_rate": 5.615414896277786e-06,
"loss": 0.336,
"step": 540
},
{
"epoch": 1.5759312320916905,
"grad_norm": 0.2765768847959203,
"learning_rate": 5.44962006828065e-06,
"loss": 0.3404,
"step": 550
},
{
"epoch": 1.6045845272206303,
"grad_norm": 0.27806174427298036,
"learning_rate": 5.283325201864475e-06,
"loss": 0.3304,
"step": 560
},
{
"epoch": 1.63323782234957,
"grad_norm": 0.2404022014715492,
"learning_rate": 5.116715239446121e-06,
"loss": 0.3295,
"step": 570
},
{
"epoch": 1.66189111747851,
"grad_norm": 0.2513689234570333,
"learning_rate": 4.9499754738718835e-06,
"loss": 0.3342,
"step": 580
},
{
"epoch": 1.6905444126074498,
"grad_norm": 0.2429044062755473,
"learning_rate": 4.7832913423467555e-06,
"loss": 0.3364,
"step": 590
},
{
"epoch": 1.7191977077363898,
"grad_norm": 0.23681124768665515,
"learning_rate": 4.616848220203124e-06,
"loss": 0.3275,
"step": 600
},
{
"epoch": 1.7478510028653296,
"grad_norm": 0.2491649241512219,
"learning_rate": 4.450831214738303e-06,
"loss": 0.3385,
"step": 610
},
{
"epoch": 1.7765042979942693,
"grad_norm": 0.7732756792555575,
"learning_rate": 4.285424959350139e-06,
"loss": 0.3303,
"step": 620
},
{
"epoch": 1.8051575931232091,
"grad_norm": 0.24958098955725797,
"learning_rate": 4.1208134081996625e-06,
"loss": 0.3382,
"step": 630
},
{
"epoch": 1.8338108882521489,
"grad_norm": 0.2713258221481738,
"learning_rate": 3.957179631629148e-06,
"loss": 0.3348,
"step": 640
},
{
"epoch": 1.8624641833810889,
"grad_norm": 0.23269097420301693,
"learning_rate": 3.7947056125630904e-06,
"loss": 0.3332,
"step": 650
},
{
"epoch": 1.8911174785100286,
"grad_norm": 0.2800452866317965,
"learning_rate": 3.6335720441185474e-06,
"loss": 0.3263,
"step": 660
},
{
"epoch": 1.9197707736389686,
"grad_norm": 0.2244027553245592,
"learning_rate": 3.4739581286499147e-06,
"loss": 0.3347,
"step": 670
},
{
"epoch": 1.9484240687679084,
"grad_norm": 0.36236606952760386,
"learning_rate": 3.3160413784516342e-06,
"loss": 0.3251,
"step": 680
},
{
"epoch": 1.9770773638968482,
"grad_norm": 0.2560138762411745,
"learning_rate": 3.1599974183404784e-06,
"loss": 0.3311,
"step": 690
},
{
"epoch": 2.005730659025788,
"grad_norm": 0.24846783217066923,
"learning_rate": 3.0059997903369658e-06,
"loss": 0.3263,
"step": 700
},
{
"epoch": 2.0343839541547277,
"grad_norm": 0.2561038844246594,
"learning_rate": 2.854219760663125e-06,
"loss": 0.3194,
"step": 710
},
{
"epoch": 2.0630372492836675,
"grad_norm": 0.21888324162969877,
"learning_rate": 2.704826129271257e-06,
"loss": 0.3106,
"step": 720
},
{
"epoch": 2.0916905444126073,
"grad_norm": 0.23634859060639352,
"learning_rate": 2.5579850421155294e-06,
"loss": 0.3215,
"step": 730
},
{
"epoch": 2.1203438395415475,
"grad_norm": 0.40366184729596755,
"learning_rate": 2.413859806375159e-06,
"loss": 0.3119,
"step": 740
},
{
"epoch": 2.1489971346704873,
"grad_norm": 0.21759429767669508,
"learning_rate": 2.272610708834719e-06,
"loss": 0.3094,
"step": 750
},
{
"epoch": 2.177650429799427,
"grad_norm": 0.21797932422789995,
"learning_rate": 2.1343948376235146e-06,
"loss": 0.3081,
"step": 760
},
{
"epoch": 2.206303724928367,
"grad_norm": 0.22015734090400496,
"learning_rate": 1.9993659075123117e-06,
"loss": 0.3118,
"step": 770
},
{
"epoch": 2.2349570200573066,
"grad_norm": 0.5859620771202967,
"learning_rate": 1.8676740889616835e-06,
"loss": 0.3194,
"step": 780
},
{
"epoch": 2.2636103151862463,
"grad_norm": 0.21809409710426375,
"learning_rate": 1.739465841112125e-06,
"loss": 0.3156,
"step": 790
},
{
"epoch": 2.292263610315186,
"grad_norm": 0.311384088738318,
"learning_rate": 1.6148837489016406e-06,
"loss": 0.3105,
"step": 800
},
{
"epoch": 2.3209169054441263,
"grad_norm": 0.22764092498020874,
"learning_rate": 1.49406636449199e-06,
"loss": 0.3209,
"step": 810
},
{
"epoch": 2.349570200573066,
"grad_norm": 0.22336147023416364,
"learning_rate": 1.3771480531799054e-06,
"loss": 0.3217,
"step": 820
},
{
"epoch": 2.378223495702006,
"grad_norm": 0.22717728346077168,
"learning_rate": 1.2642588439646951e-06,
"loss": 0.3211,
"step": 830
},
{
"epoch": 2.4068767908309456,
"grad_norm": 0.208665334531538,
"learning_rate": 1.1555242849383668e-06,
"loss": 0.3183,
"step": 840
},
{
"epoch": 2.4355300859598854,
"grad_norm": 0.21821518140433077,
"learning_rate": 1.0510653036591583e-06,
"loss": 0.3188,
"step": 850
},
{
"epoch": 2.464183381088825,
"grad_norm": 0.22416726569351633,
"learning_rate": 9.509980726637003e-07,
"loss": 0.3167,
"step": 860
},
{
"epoch": 2.492836676217765,
"grad_norm": 0.2135708085936856,
"learning_rate": 8.5543388026743e-07,
"loss": 0.315,
"step": 870
},
{
"epoch": 2.5214899713467047,
"grad_norm": 0.20626691858117885,
"learning_rate": 7.644790067969005e-07,
"loss": 0.3151,
"step": 880
},
{
"epoch": 2.5501432664756445,
"grad_norm": 0.203689315528671,
"learning_rate": 6.7823460639167e-07,
"loss": 0.3122,
"step": 890
},
{
"epoch": 2.5787965616045847,
"grad_norm": 0.20939291314273653,
"learning_rate": 5.967965945071896e-07,
"loss": 0.3153,
"step": 900
},
{
"epoch": 2.6074498567335245,
"grad_norm": 0.23023262138795553,
"learning_rate": 5.202555412438309e-07,
"loss": 0.3094,
"step": 910
},
{
"epoch": 2.6361031518624642,
"grad_norm": 0.235572721264662,
"learning_rate": 4.486965706206597e-07,
"loss": 0.3146,
"step": 920
},
{
"epoch": 2.664756446991404,
"grad_norm": 0.2315057700635808,
"learning_rate": 3.8219926590600365e-07,
"loss": 0.3144,
"step": 930
},
{
"epoch": 2.693409742120344,
"grad_norm": 0.21729980205720664,
"learning_rate": 3.2083758111006946e-07,
"loss": 0.3191,
"step": 940
},
{
"epoch": 2.7220630372492836,
"grad_norm": 0.2031412349579613,
"learning_rate": 2.6467975873807617e-07,
"loss": 0.3127,
"step": 950
},
{
"epoch": 2.7507163323782233,
"grad_norm": 0.19878302412351165,
"learning_rate": 2.1378825389533508e-07,
"loss": 0.3169,
"step": 960
},
{
"epoch": 2.7793696275071635,
"grad_norm": 0.20562940798947288,
"learning_rate": 1.6821966482872264e-07,
"loss": 0.3197,
"step": 970
},
{
"epoch": 2.8080229226361033,
"grad_norm": 0.20192515439147846,
"learning_rate": 1.28024669981755e-07,
"loss": 0.3154,
"step": 980
},
{
"epoch": 2.836676217765043,
"grad_norm": 0.20191838585812194,
"learning_rate": 9.324797163330012e-08,
"loss": 0.3125,
"step": 990
},
{
"epoch": 2.865329512893983,
"grad_norm": 0.2153477946539778,
"learning_rate": 6.39282461825852e-08,
"loss": 0.3119,
"step": 1000
},
{
"epoch": 2.8939828080229226,
"grad_norm": 0.21913342629281218,
"learning_rate": 4.009810113580426e-08,
"loss": 0.3175,
"step": 1010
},
{
"epoch": 2.9226361031518624,
"grad_norm": 0.19798983439143433,
"learning_rate": 2.178403884215141e-08,
"loss": 0.3081,
"step": 1020
},
{
"epoch": 2.951289398280802,
"grad_norm": 0.22183768018570765,
"learning_rate": 9.006427019622177e-09,
"loss": 0.3181,
"step": 1030
},
{
"epoch": 2.9799426934097424,
"grad_norm": 0.1912655799602591,
"learning_rate": 1.7794761033496089e-09,
"loss": 0.3146,
"step": 1040
}
],
"logging_steps": 10,
"max_steps": 1047,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5074006013116416.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}