{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 708,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02824858757062147,
"grad_norm": 5.842401568474987,
"learning_rate": 5e-06,
"loss": 2.0396,
"step": 10
},
{
"epoch": 0.05649717514124294,
"grad_norm": 3.6388768595262957,
"learning_rate": 4.997468222143782e-06,
"loss": 1.7131,
"step": 20
},
{
"epoch": 0.0847457627118644,
"grad_norm": 1.8710689080902667,
"learning_rate": 4.989878016494418e-06,
"loss": 1.5907,
"step": 30
},
{
"epoch": 0.11299435028248588,
"grad_norm": 1.880666572553662,
"learning_rate": 4.977244756423578e-06,
"loss": 1.502,
"step": 40
},
{
"epoch": 0.14124293785310735,
"grad_norm": 1.6520564729299156,
"learning_rate": 4.959594029617741e-06,
"loss": 1.4357,
"step": 50
},
{
"epoch": 0.1694915254237288,
"grad_norm": 1.7284626155394651,
"learning_rate": 4.9369615862523266e-06,
"loss": 1.3794,
"step": 60
},
{
"epoch": 0.1977401129943503,
"grad_norm": 1.7131842535751252,
"learning_rate": 4.90939326658249e-06,
"loss": 1.3396,
"step": 70
},
{
"epoch": 0.22598870056497175,
"grad_norm": 1.6872621752920653,
"learning_rate": 4.876944908097249e-06,
"loss": 1.3119,
"step": 80
},
{
"epoch": 0.2542372881355932,
"grad_norm": 1.6781047658941155,
"learning_rate": 4.8396822324249915e-06,
"loss": 1.2932,
"step": 90
},
{
"epoch": 0.2824858757062147,
"grad_norm": 1.8225514648316858,
"learning_rate": 4.797680712219421e-06,
"loss": 1.2533,
"step": 100
},
{
"epoch": 0.3107344632768362,
"grad_norm": 1.4304345855914073,
"learning_rate": 4.751025418295565e-06,
"loss": 1.2581,
"step": 110
},
{
"epoch": 0.3389830508474576,
"grad_norm": 2.489693154849688,
"learning_rate": 4.699810847325449e-06,
"loss": 1.2615,
"step": 120
},
{
"epoch": 0.3672316384180791,
"grad_norm": 1.497470889635536,
"learning_rate": 4.644140730442432e-06,
"loss": 1.2385,
"step": 130
},
{
"epoch": 0.3954802259887006,
"grad_norm": 1.4852922561551125,
"learning_rate": 4.584127823141855e-06,
"loss": 1.2228,
"step": 140
},
{
"epoch": 0.423728813559322,
"grad_norm": 1.4374567982916069,
"learning_rate": 4.5198936769035504e-06,
"loss": 1.2254,
"step": 150
},
{
"epoch": 0.4519774011299435,
"grad_norm": 1.4496375733325404,
"learning_rate": 4.451568392998767e-06,
"loss": 1.2265,
"step": 160
},
{
"epoch": 0.480225988700565,
"grad_norm": 1.2754388779607286,
"learning_rate": 4.3792903589801515e-06,
"loss": 1.1846,
"step": 170
},
{
"epoch": 0.5084745762711864,
"grad_norm": 1.3975405752824914,
"learning_rate": 4.30320596838852e-06,
"loss": 1.178,
"step": 180
},
{
"epoch": 0.536723163841808,
"grad_norm": 1.3749489429241677,
"learning_rate": 4.223469324244115e-06,
"loss": 1.1717,
"step": 190
},
{
"epoch": 0.5649717514124294,
"grad_norm": 1.393315372114993,
"learning_rate": 4.140241926922916e-06,
"loss": 1.181,
"step": 200
},
{
"epoch": 0.5932203389830508,
"grad_norm": 1.505577319035518,
"learning_rate": 4.0536923470501775e-06,
"loss": 1.1744,
"step": 210
},
{
"epoch": 0.6214689265536724,
"grad_norm": 1.457055249771263,
"learning_rate": 3.96399588407373e-06,
"loss": 1.151,
"step": 220
},
{
"epoch": 0.6497175141242938,
"grad_norm": 1.3812712992322103,
"learning_rate": 3.8713342112085685e-06,
"loss": 1.1787,
"step": 230
},
{
"epoch": 0.6779661016949152,
"grad_norm": 1.528224921984006,
"learning_rate": 3.775895007471876e-06,
"loss": 1.158,
"step": 240
},
{
"epoch": 0.7062146892655368,
"grad_norm": 1.4254969486921814,
"learning_rate": 3.677871577553763e-06,
"loss": 1.1662,
"step": 250
},
{
"epoch": 0.7344632768361582,
"grad_norm": 1.4029062285962588,
"learning_rate": 3.5774624602936344e-06,
"loss": 1.1716,
"step": 260
},
{
"epoch": 0.7627118644067796,
"grad_norm": 1.4621797650744357,
"learning_rate": 3.474871026555204e-06,
"loss": 1.1523,
"step": 270
},
{
"epoch": 0.7909604519774012,
"grad_norm": 1.576846783408568,
"learning_rate": 3.370305067314612e-06,
"loss": 1.169,
"step": 280
},
{
"epoch": 0.8192090395480226,
"grad_norm": 1.4553855180258597,
"learning_rate": 3.2639763727959554e-06,
"loss": 1.1374,
"step": 290
},
{
"epoch": 0.847457627118644,
"grad_norm": 1.3037242619402352,
"learning_rate": 3.1561003035066435e-06,
"loss": 1.144,
"step": 300
},
{
"epoch": 0.8757062146892656,
"grad_norm": 1.4064392305723843,
"learning_rate": 3.0468953540414304e-06,
"loss": 1.1396,
"step": 310
},
{
"epoch": 0.903954802259887,
"grad_norm": 1.2072866070703256,
"learning_rate": 2.936582710538593e-06,
"loss": 1.1388,
"step": 320
},
{
"epoch": 0.9322033898305084,
"grad_norm": 1.234511949962314,
"learning_rate": 2.8253858026845958e-06,
"loss": 1.149,
"step": 330
},
{
"epoch": 0.96045197740113,
"grad_norm": 1.3682697011912646,
"learning_rate": 2.7135298511746276e-06,
"loss": 1.1463,
"step": 340
},
{
"epoch": 0.9887005649717514,
"grad_norm": 1.4595117952046124,
"learning_rate": 2.6012414115455826e-06,
"loss": 1.1339,
"step": 350
},
{
"epoch": 1.0169491525423728,
"grad_norm": 1.365355679432521,
"learning_rate": 2.488747915305431e-06,
"loss": 1.0592,
"step": 360
},
{
"epoch": 1.0451977401129944,
"grad_norm": 1.4547925198477407,
"learning_rate": 2.376277209288372e-06,
"loss": 1.0288,
"step": 370
},
{
"epoch": 1.073446327683616,
"grad_norm": 1.2787445662747539,
"learning_rate": 2.2640570941687794e-06,
"loss": 1.022,
"step": 380
},
{
"epoch": 1.1016949152542372,
"grad_norm": 1.3427132830957944,
"learning_rate": 2.1523148630686397e-06,
"loss": 1.0311,
"step": 390
},
{
"epoch": 1.1299435028248588,
"grad_norm": 1.446710010618482,
"learning_rate": 2.0412768411929948e-06,
"loss": 1.017,
"step": 400
},
{
"epoch": 1.1581920903954803,
"grad_norm": 1.34481917501583,
"learning_rate": 1.931167927425832e-06,
"loss": 1.0266,
"step": 410
},
{
"epoch": 1.1864406779661016,
"grad_norm": 1.3341116732215694,
"learning_rate": 1.8222111388148678e-06,
"loss": 1.0236,
"step": 420
},
{
"epoch": 1.2146892655367232,
"grad_norm": 1.345613554920642,
"learning_rate": 1.714627158867857e-06,
"loss": 1.0109,
"step": 430
},
{
"epoch": 1.2429378531073447,
"grad_norm": 1.2833581931894298,
"learning_rate": 1.6086338905752883e-06,
"loss": 0.9996,
"step": 440
},
{
"epoch": 1.271186440677966,
"grad_norm": 1.3699670070856527,
"learning_rate": 1.50444601506482e-06,
"loss": 1.0128,
"step": 450
},
{
"epoch": 1.2994350282485876,
"grad_norm": 1.2610914382885658,
"learning_rate": 1.4022745567813334e-06,
"loss": 1.0094,
"step": 460
},
{
"epoch": 1.327683615819209,
"grad_norm": 1.2627951088836191,
"learning_rate": 1.3023264560733268e-06,
"loss": 1.0206,
"step": 470
},
{
"epoch": 1.3559322033898304,
"grad_norm": 1.268968791060816,
"learning_rate": 1.2048041500513136e-06,
"loss": 1.0136,
"step": 480
},
{
"epoch": 1.384180790960452,
"grad_norm": 1.2875833787086686,
"learning_rate": 1.1099051625671928e-06,
"loss": 1.0029,
"step": 490
},
{
"epoch": 1.4124293785310735,
"grad_norm": 1.2562663969235288,
"learning_rate": 1.0178217041450355e-06,
"loss": 1.012,
"step": 500
},
{
"epoch": 1.4406779661016949,
"grad_norm": 1.2156705609077063,
"learning_rate": 9.287402826736089e-07,
"loss": 1.0057,
"step": 510
},
{
"epoch": 1.4689265536723164,
"grad_norm": 2.005409061506591,
"learning_rate": 8.428413256491386e-07,
"loss": 1.0042,
"step": 520
},
{
"epoch": 1.497175141242938,
"grad_norm": 1.2904127978163176,
"learning_rate": 7.602988147334372e-07,
"loss": 1.0029,
"step": 530
},
{
"epoch": 1.5254237288135593,
"grad_norm": 1.2327199709726766,
"learning_rate": 6.81279933367571e-07,
"loss": 0.999,
"step": 540
},
{
"epoch": 1.5536723163841808,
"grad_norm": 1.3251476089228396,
"learning_rate": 6.059447281547929e-07,
"loss": 1.0063,
"step": 550
},
{
"epoch": 1.5819209039548023,
"grad_norm": 1.2684033556573027,
"learning_rate": 5.344457846985837e-07,
"loss": 1.0041,
"step": 560
},
{
"epoch": 1.6101694915254239,
"grad_norm": 1.274213451781422,
"learning_rate": 4.6692791855237144e-07,
"loss": 1.0102,
"step": 570
},
{
"epoch": 1.6384180790960452,
"grad_norm": 1.2004747141578425,
"learning_rate": 4.0352788190688245e-07,
"loss": 0.9986,
"step": 580
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.2138867752979392,
"learning_rate": 3.443740866092074e-07,
"loss": 0.9936,
"step": 590
},
{
"epoch": 1.694915254237288,
"grad_norm": 1.3657230357705068,
"learning_rate": 2.895863440745822e-07,
"loss": 1.0085,
"step": 600
},
{
"epoch": 1.7231638418079096,
"grad_norm": 1.2178452975739655,
"learning_rate": 2.3927562261768095e-07,
"loss": 1.0114,
"step": 610
},
{
"epoch": 1.7514124293785311,
"grad_norm": 1.201511284318748,
"learning_rate": 1.935438226949146e-07,
"loss": 1.011,
"step": 620
},
{
"epoch": 1.7796610169491527,
"grad_norm": 1.2103707783122326,
"learning_rate": 1.5248357051297957e-07,
"loss": 1.0053,
"step": 630
},
{
"epoch": 1.807909604519774,
"grad_norm": 1.206125451520313,
"learning_rate": 1.1617803042167142e-07,
"loss": 0.9949,
"step": 640
},
{
"epoch": 1.8361581920903953,
"grad_norm": 1.327611608679995,
"learning_rate": 8.4700736470959e-08,
"loss": 1.0107,
"step": 650
},
{
"epoch": 1.8644067796610169,
"grad_norm": 1.2479460675046632,
"learning_rate": 5.811544347348097e-08,
"loss": 0.9932,
"step": 660
},
{
"epoch": 1.8926553672316384,
"grad_norm": 1.2150911321344626,
"learning_rate": 3.647599787412692e-08,
"loss": 1.0028,
"step": 670
},
{
"epoch": 1.92090395480226,
"grad_norm": 1.1712546508255446,
"learning_rate": 1.9826228688248073e-08,
"loss": 0.9951,
"step": 680
},
{
"epoch": 1.9491525423728815,
"grad_norm": 1.2304982193614624,
"learning_rate": 8.19985872939355e-09,
"loss": 0.9928,
"step": 690
},
{
"epoch": 1.9774011299435028,
"grad_norm": 1.2367698151490922,
"learning_rate": 1.6204363063712647e-09,
"loss": 0.9979,
"step": 700
}
],
"logging_steps": 10,
"max_steps": 708,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 230913547960320.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}