{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 833,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012005702708786673,
"grad_norm": 13.904192924499512,
"learning_rate": 1.0714285714285714e-06,
"loss": 2.0907,
"step": 10
},
{
"epoch": 0.024011405417573347,
"grad_norm": 10.684447288513184,
"learning_rate": 2.261904761904762e-06,
"loss": 1.7351,
"step": 20
},
{
"epoch": 0.03601710812636002,
"grad_norm": 13.9859619140625,
"learning_rate": 3.4523809523809528e-06,
"loss": 1.9817,
"step": 30
},
{
"epoch": 0.048022810835146694,
"grad_norm": 11.632477760314941,
"learning_rate": 4.642857142857144e-06,
"loss": 1.8382,
"step": 40
},
{
"epoch": 0.06002851354393337,
"grad_norm": 10.015055656433105,
"learning_rate": 5.833333333333334e-06,
"loss": 2.1399,
"step": 50
},
{
"epoch": 0.07203421625272004,
"grad_norm": 11.713295936584473,
"learning_rate": 7.023809523809524e-06,
"loss": 1.8619,
"step": 60
},
{
"epoch": 0.08403991896150671,
"grad_norm": 9.750036239624023,
"learning_rate": 8.214285714285714e-06,
"loss": 1.9983,
"step": 70
},
{
"epoch": 0.09604562167029339,
"grad_norm": 11.98967170715332,
"learning_rate": 9.404761904761905e-06,
"loss": 1.8564,
"step": 80
},
{
"epoch": 0.10805132437908006,
"grad_norm": 7.032362461090088,
"learning_rate": 9.998900487402431e-06,
"loss": 1.8123,
"step": 90
},
{
"epoch": 0.12005702708786674,
"grad_norm": 9.9423246383667,
"learning_rate": 9.99010728783628e-06,
"loss": 1.7612,
"step": 100
},
{
"epoch": 0.1320627297966534,
"grad_norm": 10.866604804992676,
"learning_rate": 9.972536356177037e-06,
"loss": 1.8318,
"step": 110
},
{
"epoch": 0.14406843250544007,
"grad_norm": 10.427411079406738,
"learning_rate": 9.94621860016312e-06,
"loss": 1.9897,
"step": 120
},
{
"epoch": 0.15607413521422675,
"grad_norm": 8.520286560058594,
"learning_rate": 9.911200313430767e-06,
"loss": 1.9808,
"step": 130
},
{
"epoch": 0.16807983792301343,
"grad_norm": 12.630261421203613,
"learning_rate": 9.8675430940823e-06,
"loss": 1.9811,
"step": 140
},
{
"epoch": 0.1800855406318001,
"grad_norm": 9.4483060836792,
"learning_rate": 9.815323736333405e-06,
"loss": 2.0145,
"step": 150
},
{
"epoch": 0.19209124334058678,
"grad_norm": 8.699251174926758,
"learning_rate": 9.754634095430062e-06,
"loss": 2.0973,
"step": 160
},
{
"epoch": 0.20409694604937345,
"grad_norm": 9.745418548583984,
"learning_rate": 9.685580926072713e-06,
"loss": 2.1629,
"step": 170
},
{
"epoch": 0.21610264875816013,
"grad_norm": 11.316314697265625,
"learning_rate": 9.608285694631884e-06,
"loss": 1.8444,
"step": 180
},
{
"epoch": 0.2281083514669468,
"grad_norm": 10.36503791809082,
"learning_rate": 9.522884365485599e-06,
"loss": 1.9405,
"step": 190
},
{
"epoch": 0.24011405417573348,
"grad_norm": 11.719226837158203,
"learning_rate": 9.429527161854402e-06,
"loss": 1.8054,
"step": 200
},
{
"epoch": 0.25211975688452015,
"grad_norm": 7.179232120513916,
"learning_rate": 9.328378301554698e-06,
"loss": 2.1386,
"step": 210
},
{
"epoch": 0.2641254595933068,
"grad_norm": 9.562182426452637,
"learning_rate": 9.219615708135226e-06,
"loss": 1.8712,
"step": 220
},
{
"epoch": 0.2761311623020935,
"grad_norm": 12.569293022155762,
"learning_rate": 9.103430697904776e-06,
"loss": 1.9458,
"step": 230
},
{
"epoch": 0.28813686501088015,
"grad_norm": 8.965167045593262,
"learning_rate": 8.980027643401694e-06,
"loss": 1.9557,
"step": 240
},
{
"epoch": 0.30014256771966685,
"grad_norm": 9.079051971435547,
"learning_rate": 8.849623613897126e-06,
"loss": 1.8958,
"step": 250
},
{
"epoch": 0.3121482704284535,
"grad_norm": 12.004711151123047,
"learning_rate": 8.712447993564362e-06,
"loss": 2.0377,
"step": 260
},
{
"epoch": 0.3241539731372402,
"grad_norm": 12.883252143859863,
"learning_rate": 8.568742077985945e-06,
"loss": 1.9571,
"step": 270
},
{
"epoch": 0.33615967584602685,
"grad_norm": 10.449505805969238,
"learning_rate": 8.418758649708299e-06,
"loss": 1.8825,
"step": 280
},
{
"epoch": 0.34816537855481355,
"grad_norm": 9.432299613952637,
"learning_rate": 8.262761533590468e-06,
"loss": 1.8399,
"step": 290
},
{
"epoch": 0.3601710812636002,
"grad_norm": 7.1271185874938965,
"learning_rate": 8.101025132729139e-06,
"loss": 2.0495,
"step": 300
},
{
"epoch": 0.3721767839723869,
"grad_norm": 10.691527366638184,
"learning_rate": 7.933833945776257e-06,
"loss": 2.1008,
"step": 310
},
{
"epoch": 0.38418248668117355,
"grad_norm": 9.862756729125977,
"learning_rate": 7.761482066498298e-06,
"loss": 2.1887,
"step": 320
},
{
"epoch": 0.39618818938996025,
"grad_norm": 8.47981071472168,
"learning_rate": 7.584272666457471e-06,
"loss": 2.0397,
"step": 330
},
{
"epoch": 0.4081938920987469,
"grad_norm": 10.607376098632812,
"learning_rate": 7.402517461724839e-06,
"loss": 2.044,
"step": 340
},
{
"epoch": 0.4201995948075336,
"grad_norm": 10.644391059875488,
"learning_rate": 7.2165361645634105e-06,
"loss": 1.943,
"step": 350
},
{
"epoch": 0.43220529751632025,
"grad_norm": 10.169779777526855,
"learning_rate": 7.026655921045736e-06,
"loss": 1.9137,
"step": 360
},
{
"epoch": 0.4442110002251069,
"grad_norm": 8.738324165344238,
"learning_rate": 6.833210735595215e-06,
"loss": 2.0836,
"step": 370
},
{
"epoch": 0.4562167029338936,
"grad_norm": 11.044781684875488,
"learning_rate": 6.636540883463385e-06,
"loss": 1.8944,
"step": 380
},
{
"epoch": 0.46822240564268025,
"grad_norm": 10.515331268310547,
"learning_rate": 6.436992312176669e-06,
"loss": 1.8942,
"step": 390
},
{
"epoch": 0.48022810835146695,
"grad_norm": 8.095483779907227,
"learning_rate": 6.234916033005421e-06,
"loss": 2.1919,
"step": 400
},
{
"epoch": 0.4922338110602536,
"grad_norm": 11.837594032287598,
"learning_rate": 6.030667503525726e-06,
"loss": 2.1104,
"step": 410
},
{
"epoch": 0.5042395137690403,
"grad_norm": 10.554974555969238,
"learning_rate": 5.82460600236002e-06,
"loss": 2.0049,
"step": 420
},
{
"epoch": 0.516245216477827,
"grad_norm": 8.677346229553223,
"learning_rate": 5.617093997196392e-06,
"loss": 2.0641,
"step": 430
},
{
"epoch": 0.5282509191866136,
"grad_norm": 9.323301315307617,
"learning_rate": 5.408496507198229e-06,
"loss": 1.888,
"step": 440
},
{
"epoch": 0.5402566218954004,
"grad_norm": 10.618653297424316,
"learning_rate": 5.199180460925757e-06,
"loss": 1.8445,
"step": 450
},
{
"epoch": 0.552262324604187,
"grad_norm": 12.810103416442871,
"learning_rate": 4.989514050898893e-06,
"loss": 1.8746,
"step": 460
},
{
"epoch": 0.5642680273129737,
"grad_norm": 9.233360290527344,
"learning_rate": 4.779866085936762e-06,
"loss": 1.9861,
"step": 470
},
{
"epoch": 0.5762737300217603,
"grad_norm": 10.848196983337402,
"learning_rate": 4.5706053424131285e-06,
"loss": 1.9709,
"step": 480
},
{
"epoch": 0.5882794327305471,
"grad_norm": 11.675863265991211,
"learning_rate": 4.362099915568894e-06,
"loss": 2.0604,
"step": 490
},
{
"epoch": 0.6002851354393337,
"grad_norm": 10.185128211975098,
"learning_rate": 4.15471657202274e-06,
"loss": 1.9789,
"step": 500
},
{
"epoch": 0.6002851354393337,
"eval_loss": 1.6066299676895142,
"eval_runtime": 41.652,
"eval_samples_per_second": 35.557,
"eval_steps_per_second": 35.557,
"step": 500
},
{
"epoch": 0.6122908381481204,
"grad_norm": 11.495535850524902,
"learning_rate": 3.948820104618828e-06,
"loss": 1.9084,
"step": 510
},
{
"epoch": 0.624296540856907,
"grad_norm": 9.106337547302246,
"learning_rate": 3.744772690746448e-06,
"loss": 1.9345,
"step": 520
},
{
"epoch": 0.6363022435656938,
"grad_norm": 10.831775665283203,
"learning_rate": 3.542933255260309e-06,
"loss": 2.0163,
"step": 530
},
{
"epoch": 0.6483079462744804,
"grad_norm": 9.374225616455078,
"learning_rate": 3.3436568391221215e-06,
"loss": 2.0138,
"step": 540
},
{
"epoch": 0.660313648983267,
"grad_norm": 8.958917617797852,
"learning_rate": 3.1472939748740604e-06,
"loss": 2.0027,
"step": 550
},
{
"epoch": 0.6723193516920537,
"grad_norm": 6.963953018188477,
"learning_rate": 2.954190070042654e-06,
"loss": 1.8467,
"step": 560
},
{
"epoch": 0.6843250544008403,
"grad_norm": 6.840718746185303,
"learning_rate": 2.764684799557714e-06,
"loss": 1.963,
"step": 570
},
{
"epoch": 0.6963307571096271,
"grad_norm": 12.296991348266602,
"learning_rate": 2.5791115082550495e-06,
"loss": 1.8294,
"step": 580
},
{
"epoch": 0.7083364598184138,
"grad_norm": 9.745269775390625,
"learning_rate": 2.397796624514001e-06,
"loss": 1.8554,
"step": 590
},
{
"epoch": 0.7203421625272004,
"grad_norm": 10.261857986450195,
"learning_rate": 2.2210590860611835e-06,
"loss": 1.9552,
"step": 600
},
{
"epoch": 0.732347865235987,
"grad_norm": 10.252482414245605,
"learning_rate": 2.049209778950518e-06,
"loss": 1.9781,
"step": 610
},
{
"epoch": 0.7443535679447738,
"grad_norm": 7.251546382904053,
"learning_rate": 1.8825509907063328e-06,
"loss": 1.6812,
"step": 620
},
{
"epoch": 0.7563592706535605,
"grad_norm": 10.077775955200195,
"learning_rate": 1.7213758785915508e-06,
"loss": 1.9387,
"step": 630
},
{
"epoch": 0.7683649733623471,
"grad_norm": 11.14145565032959,
"learning_rate": 1.5659679539362071e-06,
"loss": 1.9589,
"step": 640
},
{
"epoch": 0.7803706760711338,
"grad_norm": 11.107977867126465,
"learning_rate": 1.4166005834334607e-06,
"loss": 1.9859,
"step": 650
},
{
"epoch": 0.7923763787799205,
"grad_norm": 7.929206371307373,
"learning_rate": 1.2735365082802642e-06,
"loss": 1.9113,
"step": 660
},
{
"epoch": 0.8043820814887072,
"grad_norm": 9.809419631958008,
"learning_rate": 1.1370273820085731e-06,
"loss": 1.9373,
"step": 670
},
{
"epoch": 0.8163877841974938,
"grad_norm": 10.714163780212402,
"learning_rate": 1.0073133278200702e-06,
"loss": 1.8766,
"step": 680
},
{
"epoch": 0.8283934869062805,
"grad_norm": 8.51906967163086,
"learning_rate": 8.846225162030204e-07,
"loss": 1.8467,
"step": 690
},
{
"epoch": 0.8403991896150672,
"grad_norm": 10.252246856689453,
"learning_rate": 7.691707635742957e-07,
"loss": 1.9715,
"step": 700
},
{
"epoch": 0.8524048923238539,
"grad_norm": 10.066018104553223,
"learning_rate": 6.611611526525214e-07,
"loss": 2.0223,
"step": 710
},
{
"epoch": 0.8644105950326405,
"grad_norm": 10.5,
"learning_rate": 5.607836752301527e-07,
"loss": 2.0496,
"step": 720
},
{
"epoch": 0.8764162977414272,
"grad_norm": 9.800883293151855,
"learning_rate": 4.6821489797282624e-07,
"loss": 1.9503,
"step": 730
},
{
"epoch": 0.8884220004502138,
"grad_norm": 11.5232515335083,
"learning_rate": 3.8361765183388466e-07,
"loss": 1.9714,
"step": 740
},
{
"epoch": 0.9004277031590006,
"grad_norm": 10.428476333618164,
"learning_rate": 3.0714074563037043e-07,
"loss": 1.8059,
"step": 750
},
{
"epoch": 0.9124334058677872,
"grad_norm": 10.010412216186523,
"learning_rate": 2.389187042843416e-07,
"loss": 2.0123,
"step": 760
},
{
"epoch": 0.9244391085765739,
"grad_norm": 9.442755699157715,
"learning_rate": 1.7907153218994166e-07,
"loss": 1.7247,
"step": 770
},
{
"epoch": 0.9364448112853605,
"grad_norm": 11.165854454040527,
"learning_rate": 1.277045021224571e-07,
"loss": 1.9639,
"step": 780
},
{
"epoch": 0.9484505139941473,
"grad_norm": 10.688188552856445,
"learning_rate": 8.490797006069696e-08,
"loss": 1.9314,
"step": 790
},
{
"epoch": 0.9604562167029339,
"grad_norm": 8.110855102539062,
"learning_rate": 5.075721624840713e-08,
"loss": 1.8037,
"step": 800
},
{
"epoch": 0.9724619194117206,
"grad_norm": 7.74312162399292,
"learning_rate": 2.5312312774313784e-08,
"loss": 1.7496,
"step": 810
},
{
"epoch": 0.9844676221205072,
"grad_norm": 10.240447998046875,
"learning_rate": 8.618017903708198e-09,
"loss": 1.8276,
"step": 820
},
{
"epoch": 0.996473324829294,
"grad_norm": 9.714399337768555,
"learning_rate": 7.036973474688901e-10,
"loss": 1.9989,
"step": 830
},
{
"epoch": 1.0,
"step": 833,
"total_flos": 5523238276368384.0,
"train_loss": 1.9459604552957046,
"train_runtime": 1842.4954,
"train_samples_per_second": 7.233,
"train_steps_per_second": 0.452
}
],
"logging_steps": 10,
"max_steps": 833,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5523238276368384.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}