{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1068,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.028089887640449437,
"grad_norm": 44.3452575996642,
"learning_rate": 8.411214953271029e-07,
"loss": 4.2299,
"step": 10
},
{
"epoch": 0.056179775280898875,
"grad_norm": 16.696144417512347,
"learning_rate": 1.7757009345794394e-06,
"loss": 3.4728,
"step": 20
},
{
"epoch": 0.08426966292134831,
"grad_norm": 5.245647070749203,
"learning_rate": 2.7102803738317757e-06,
"loss": 2.2396,
"step": 30
},
{
"epoch": 0.11235955056179775,
"grad_norm": 3.747589322083775,
"learning_rate": 3.6448598130841123e-06,
"loss": 1.9973,
"step": 40
},
{
"epoch": 0.1404494382022472,
"grad_norm": 3.43796713948135,
"learning_rate": 4.579439252336449e-06,
"loss": 1.8521,
"step": 50
},
{
"epoch": 0.16853932584269662,
"grad_norm": 3.514480101837343,
"learning_rate": 5.514018691588785e-06,
"loss": 1.8285,
"step": 60
},
{
"epoch": 0.19662921348314608,
"grad_norm": 3.392412234015579,
"learning_rate": 6.448598130841122e-06,
"loss": 1.7364,
"step": 70
},
{
"epoch": 0.2247191011235955,
"grad_norm": 4.233842511487955,
"learning_rate": 7.383177570093458e-06,
"loss": 1.6781,
"step": 80
},
{
"epoch": 0.25280898876404495,
"grad_norm": 3.715743054676569,
"learning_rate": 8.317757009345795e-06,
"loss": 1.6416,
"step": 90
},
{
"epoch": 0.2808988764044944,
"grad_norm": 3.143134814755476,
"learning_rate": 9.252336448598132e-06,
"loss": 1.6064,
"step": 100
},
{
"epoch": 0.3089887640449438,
"grad_norm": 2.7477799274995416,
"learning_rate": 9.999893131079397e-06,
"loss": 1.4984,
"step": 110
},
{
"epoch": 0.33707865168539325,
"grad_norm": 2.3802092180915273,
"learning_rate": 9.996153198516951e-06,
"loss": 1.5675,
"step": 120
},
{
"epoch": 0.3651685393258427,
"grad_norm": 2.672410802886858,
"learning_rate": 9.987074387433024e-06,
"loss": 1.4511,
"step": 130
},
{
"epoch": 0.39325842696629215,
"grad_norm": 2.624658269919838,
"learning_rate": 9.972666399425538e-06,
"loss": 1.4829,
"step": 140
},
{
"epoch": 0.42134831460674155,
"grad_norm": 2.222084135278492,
"learning_rate": 9.952944630839371e-06,
"loss": 1.5174,
"step": 150
},
{
"epoch": 0.449438202247191,
"grad_norm": 2.373569909234205,
"learning_rate": 9.92793015631386e-06,
"loss": 1.4802,
"step": 160
},
{
"epoch": 0.47752808988764045,
"grad_norm": 2.6373847502517203,
"learning_rate": 9.897649706262474e-06,
"loss": 1.4245,
"step": 170
},
{
"epoch": 0.5056179775280899,
"grad_norm": 2.849217346030171,
"learning_rate": 9.862135638308763e-06,
"loss": 1.4519,
"step": 180
},
{
"epoch": 0.5337078651685393,
"grad_norm": 2.8063076239177334,
"learning_rate": 9.821425902709072e-06,
"loss": 1.3854,
"step": 190
},
{
"epoch": 0.5617977528089888,
"grad_norm": 2.4489682971962914,
"learning_rate": 9.775564001798973e-06,
"loss": 1.3687,
"step": 200
},
{
"epoch": 0.5898876404494382,
"grad_norm": 2.6550254325393516,
"learning_rate": 9.724598943506762e-06,
"loss": 1.3376,
"step": 210
},
{
"epoch": 0.6179775280898876,
"grad_norm": 2.5844459484825153,
"learning_rate": 9.6685851889837e-06,
"loss": 1.3022,
"step": 220
},
{
"epoch": 0.6460674157303371,
"grad_norm": 2.8049982042093564,
"learning_rate": 9.607582594406941e-06,
"loss": 1.3088,
"step": 230
},
{
"epoch": 0.6741573033707865,
"grad_norm": 2.4864422239085604,
"learning_rate": 9.541656347017345e-06,
"loss": 1.2908,
"step": 240
},
{
"epoch": 0.702247191011236,
"grad_norm": 2.964748175555427,
"learning_rate": 9.470876895460545e-06,
"loss": 1.2868,
"step": 250
},
{
"epoch": 0.7303370786516854,
"grad_norm": 2.604536489470611,
"learning_rate": 9.395319874505661e-06,
"loss": 1.277,
"step": 260
},
{
"epoch": 0.7584269662921348,
"grad_norm": 2.5620862270285185,
"learning_rate": 9.315066024222163e-06,
"loss": 1.2643,
"step": 270
},
{
"epoch": 0.7865168539325843,
"grad_norm": 2.653270358757971,
"learning_rate": 9.230201103701201e-06,
"loss": 1.205,
"step": 280
},
{
"epoch": 0.8146067415730337,
"grad_norm": 2.5808237804954923,
"learning_rate": 9.140815799413624e-06,
"loss": 1.2222,
"step": 290
},
{
"epoch": 0.8426966292134831,
"grad_norm": 2.898027686306709,
"learning_rate": 9.047005628302617e-06,
"loss": 1.1676,
"step": 300
},
{
"epoch": 0.8707865168539326,
"grad_norm": 3.518431492195722,
"learning_rate": 8.948870835714491e-06,
"loss": 1.1993,
"step": 310
},
{
"epoch": 0.898876404494382,
"grad_norm": 2.925246568058356,
"learning_rate": 8.846516288276743e-06,
"loss": 1.1115,
"step": 320
},
{
"epoch": 0.9269662921348315,
"grad_norm": 2.9427782034508527,
"learning_rate": 8.740051361837786e-06,
"loss": 1.1041,
"step": 330
},
{
"epoch": 0.9550561797752809,
"grad_norm": 2.5514195156882518,
"learning_rate": 8.629589824588158e-06,
"loss": 1.143,
"step": 340
},
{
"epoch": 0.9831460674157303,
"grad_norm": 3.015436546535006,
"learning_rate": 8.515249715488085e-06,
"loss": 1.0505,
"step": 350
},
{
"epoch": 1.0112359550561798,
"grad_norm": 3.2863183728567975,
"learning_rate": 8.397153218131297e-06,
"loss": 0.8597,
"step": 360
},
{
"epoch": 1.0393258426966292,
"grad_norm": 3.2434695923935033,
"learning_rate": 8.2754265301799e-06,
"loss": 0.6543,
"step": 370
},
{
"epoch": 1.0674157303370786,
"grad_norm": 4.0641803528122145,
"learning_rate": 8.150199728509844e-06,
"loss": 0.6335,
"step": 380
},
{
"epoch": 1.095505617977528,
"grad_norm": 3.2752657889372885,
"learning_rate": 8.02160663021103e-06,
"loss": 0.6254,
"step": 390
},
{
"epoch": 1.1235955056179776,
"grad_norm": 3.326866921104294,
"learning_rate": 7.889784649590673e-06,
"loss": 0.6102,
"step": 400
},
{
"epoch": 1.151685393258427,
"grad_norm": 3.4005649282793846,
"learning_rate": 7.754874651332671e-06,
"loss": 0.5881,
"step": 410
},
{
"epoch": 1.1797752808988764,
"grad_norm": 3.138502719677173,
"learning_rate": 7.617020799969895e-06,
"loss": 0.5858,
"step": 420
},
{
"epoch": 1.2078651685393258,
"grad_norm": 3.343328236660075,
"learning_rate": 7.476370405830293e-06,
"loss": 0.6526,
"step": 430
},
{
"epoch": 1.2359550561797752,
"grad_norm": 3.435537024900103,
"learning_rate": 7.333073767621385e-06,
"loss": 0.5759,
"step": 440
},
{
"epoch": 1.2640449438202248,
"grad_norm": 3.0857557861079643,
"learning_rate": 7.18728401182139e-06,
"loss": 0.5646,
"step": 450
},
{
"epoch": 1.2921348314606742,
"grad_norm": 3.5691357734753724,
"learning_rate": 7.039156929048603e-06,
"loss": 0.5574,
"step": 460
},
{
"epoch": 1.3202247191011236,
"grad_norm": 3.449734373780758,
"learning_rate": 6.888850807583875e-06,
"loss": 0.5308,
"step": 470
},
{
"epoch": 1.348314606741573,
"grad_norm": 3.8917107638385215,
"learning_rate": 6.736526264224101e-06,
"loss": 0.5457,
"step": 480
},
{
"epoch": 1.3764044943820224,
"grad_norm": 3.530441704545947,
"learning_rate": 6.582346072647455e-06,
"loss": 0.5429,
"step": 490
},
{
"epoch": 1.404494382022472,
"grad_norm": 3.2288633404653657,
"learning_rate": 6.426474989473785e-06,
"loss": 0.5216,
"step": 500
},
{
"epoch": 1.4325842696629214,
"grad_norm": 4.041005860018691,
"learning_rate": 6.2690795782060535e-06,
"loss": 0.4646,
"step": 510
},
{
"epoch": 1.4606741573033708,
"grad_norm": 4.067173197200558,
"learning_rate": 6.1103280312409355e-06,
"loss": 0.4637,
"step": 520
},
{
"epoch": 1.4887640449438202,
"grad_norm": 3.326339667248253,
"learning_rate": 5.950389990138774e-06,
"loss": 0.4783,
"step": 530
},
{
"epoch": 1.5168539325842696,
"grad_norm": 3.83419253395647,
"learning_rate": 5.789436364344998e-06,
"loss": 0.4588,
"step": 540
},
{
"epoch": 1.5449438202247192,
"grad_norm": 3.139254395895648,
"learning_rate": 5.627639148556638e-06,
"loss": 0.4374,
"step": 550
},
{
"epoch": 1.5730337078651684,
"grad_norm": 3.9341533967371403,
"learning_rate": 5.465171238929173e-06,
"loss": 0.455,
"step": 560
},
{
"epoch": 1.601123595505618,
"grad_norm": 2.9583490118609435,
"learning_rate": 5.30220624832007e-06,
"loss": 0.3843,
"step": 570
},
{
"epoch": 1.6292134831460674,
"grad_norm": 3.7095899864138606,
"learning_rate": 5.13891832076646e-06,
"loss": 0.4115,
"step": 580
},
{
"epoch": 1.6573033707865168,
"grad_norm": 3.2360479692986153,
"learning_rate": 4.9754819453951986e-06,
"loss": 0.3814,
"step": 590
},
{
"epoch": 1.6853932584269664,
"grad_norm": 4.383809713794338,
"learning_rate": 4.8120717699641535e-06,
"loss": 0.3791,
"step": 600
},
{
"epoch": 1.7134831460674156,
"grad_norm": 3.4036732065921993,
"learning_rate": 4.648862414233998e-06,
"loss": 0.3517,
"step": 610
},
{
"epoch": 1.7415730337078652,
"grad_norm": 3.8998605138651325,
"learning_rate": 4.486028283369901e-06,
"loss": 0.3603,
"step": 620
},
{
"epoch": 1.7696629213483146,
"grad_norm": 3.6292532967844835,
"learning_rate": 4.323743381572557e-06,
"loss": 0.3184,
"step": 630
},
{
"epoch": 1.797752808988764,
"grad_norm": 4.428693489997381,
"learning_rate": 4.162181126137658e-06,
"loss": 0.3807,
"step": 640
},
{
"epoch": 1.8258426966292136,
"grad_norm": 3.955306267340941,
"learning_rate": 4.001514162142559e-06,
"loss": 0.3074,
"step": 650
},
{
"epoch": 1.8539325842696628,
"grad_norm": 3.5747162718635197,
"learning_rate": 3.84191417795811e-06,
"loss": 0.3111,
"step": 660
},
{
"epoch": 1.8820224719101124,
"grad_norm": 3.9923845873645742,
"learning_rate": 3.6835517217828442e-06,
"loss": 0.3005,
"step": 670
},
{
"epoch": 1.9101123595505618,
"grad_norm": 3.65999276518314,
"learning_rate": 3.5265960193955338e-06,
"loss": 0.2559,
"step": 680
},
{
"epoch": 1.9382022471910112,
"grad_norm": 3.7068235753264123,
"learning_rate": 3.3712147933208885e-06,
"loss": 0.2737,
"step": 690
},
{
"epoch": 1.9662921348314608,
"grad_norm": 4.090893668780354,
"learning_rate": 3.2175740836016323e-06,
"loss": 0.231,
"step": 700
},
{
"epoch": 1.99438202247191,
"grad_norm": 3.90282406209805,
"learning_rate": 3.065838070368469e-06,
"loss": 0.2496,
"step": 710
},
{
"epoch": 2.0224719101123596,
"grad_norm": 2.796358555674757,
"learning_rate": 2.9161688983975466e-06,
"loss": 0.1056,
"step": 720
},
{
"epoch": 2.050561797752809,
"grad_norm": 1.840479273209187,
"learning_rate": 2.7687265038429074e-06,
"loss": 0.063,
"step": 730
},
{
"epoch": 2.0786516853932584,
"grad_norm": 2.5759762149420924,
"learning_rate": 2.6236684433290494e-06,
"loss": 0.0596,
"step": 740
},
{
"epoch": 2.106741573033708,
"grad_norm": 1.9486659149321488,
"learning_rate": 2.4811497255862634e-06,
"loss": 0.0632,
"step": 750
},
{
"epoch": 2.134831460674157,
"grad_norm": 2.2514702517323926,
"learning_rate": 2.341322645808642e-06,
"loss": 0.0658,
"step": 760
},
{
"epoch": 2.162921348314607,
"grad_norm": 2.1967246146223345,
"learning_rate": 2.204336622911753e-06,
"loss": 0.0604,
"step": 770
},
{
"epoch": 2.191011235955056,
"grad_norm": 2.486502088549764,
"learning_rate": 2.070338039863917e-06,
"loss": 0.0627,
"step": 780
},
{
"epoch": 2.2191011235955056,
"grad_norm": 1.687530407531317,
"learning_rate": 1.9394700872616856e-06,
"loss": 0.0581,
"step": 790
},
{
"epoch": 2.247191011235955,
"grad_norm": 2.6462802513977897,
"learning_rate": 1.8118726103166706e-06,
"loss": 0.061,
"step": 800
},
{
"epoch": 2.2752808988764044,
"grad_norm": 2.016987283249722,
"learning_rate": 1.6876819594172578e-06,
"loss": 0.0619,
"step": 810
},
{
"epoch": 2.303370786516854,
"grad_norm": 2.0842256218362163,
"learning_rate": 1.5670308444248777e-06,
"loss": 0.0584,
"step": 820
},
{
"epoch": 2.331460674157303,
"grad_norm": 1.8494552666391486,
"learning_rate": 1.4500481928605304e-06,
"loss": 0.0506,
"step": 830
},
{
"epoch": 2.359550561797753,
"grad_norm": 2.2638744740418755,
"learning_rate": 1.3368590121331166e-06,
"loss": 0.0544,
"step": 840
},
{
"epoch": 2.3876404494382024,
"grad_norm": 2.2496173269208737,
"learning_rate": 1.2275842559567947e-06,
"loss": 0.0508,
"step": 850
},
{
"epoch": 2.4157303370786516,
"grad_norm": 1.5930060349264639,
"learning_rate": 1.1223406951000936e-06,
"loss": 0.0501,
"step": 860
},
{
"epoch": 2.443820224719101,
"grad_norm": 1.8725457508959784,
"learning_rate": 1.021240792604929e-06,
"loss": 0.048,
"step": 870
},
{
"epoch": 2.4719101123595504,
"grad_norm": 1.9419600970183988,
"learning_rate": 9.243925836088386e-07,
"loss": 0.045,
"step": 880
},
{
"epoch": 2.5,
"grad_norm": 1.8713563535913174,
"learning_rate": 8.318995598988649e-07,
"loss": 0.0461,
"step": 890
},
{
"epoch": 2.5280898876404496,
"grad_norm": 1.995760237288878,
"learning_rate": 7.438605593204562e-07,
"loss": 0.0481,
"step": 900
},
{
"epoch": 2.556179775280899,
"grad_norm": 2.109074394489752,
"learning_rate": 6.603696601595577e-07,
"loss": 0.0396,
"step": 910
},
{
"epoch": 2.5842696629213484,
"grad_norm": 3.85749040186571,
"learning_rate": 5.8151608061076e-07,
"loss": 0.05,
"step": 920
},
{
"epoch": 2.6123595505617976,
"grad_norm": 2.4295027083907246,
"learning_rate": 5.073840834389293e-07,
"loss": 0.0448,
"step": 930
},
{
"epoch": 2.640449438202247,
"grad_norm": 3.3753210520643893,
"learning_rate": 4.380528859361954e-07,
"loss": 0.0502,
"step": 940
},
{
"epoch": 2.668539325842697,
"grad_norm": 1.4432588529836197,
"learning_rate": 3.735965752705256e-07,
"loss": 0.0448,
"step": 950
},
{
"epoch": 2.696629213483146,
"grad_norm": 2.3912332708241815,
"learning_rate": 3.1408402931634163e-07,
"loss": 0.0446,
"step": 960
},
{
"epoch": 2.7247191011235956,
"grad_norm": 1.6107794104662452,
"learning_rate": 2.595788430517637e-07,
"loss": 0.038,
"step": 970
},
{
"epoch": 2.752808988764045,
"grad_norm": 1.6211044234112233,
"learning_rate": 2.1013926060116042e-07,
"loss": 0.0389,
"step": 980
},
{
"epoch": 2.7808988764044944,
"grad_norm": 2.5135992092458546,
"learning_rate": 1.6581811299560212e-07,
"loss": 0.0417,
"step": 990
},
{
"epoch": 2.808988764044944,
"grad_norm": 1.392970025455271,
"learning_rate": 1.2666276171773073e-07,
"loss": 0.0396,
"step": 1000
},
{
"epoch": 2.837078651685393,
"grad_norm": 1.8357472070480059,
"learning_rate": 9.271504809138854e-08,
"loss": 0.0377,
"step": 1010
},
{
"epoch": 2.865168539325843,
"grad_norm": 1.5659783547206103,
"learning_rate": 6.401124857006502e-08,
"loss": 0.0369,
"step": 1020
},
{
"epoch": 2.893258426966292,
"grad_norm": 1.6270134391199909,
"learning_rate": 4.058203597195831e-08,
"loss": 0.0334,
"step": 1030
},
{
"epoch": 2.9213483146067416,
"grad_norm": 1.5836178259212397,
"learning_rate": 2.2452446703067897e-08,
"loss": 0.0332,
"step": 1040
},
{
"epoch": 2.949438202247191,
"grad_norm": 1.5726583876755116,
"learning_rate": 9.641854003346607e-09,
"loss": 0.0341,
"step": 1050
},
{
"epoch": 2.9775280898876404,
"grad_norm": 1.720292896166018,
"learning_rate": 2.1639472444956454e-09,
"loss": 0.0336,
"step": 1060
},
{
"epoch": 3.0,
"step": 1068,
"total_flos": 49945662455808.0,
"train_loss": 0.6897849787152215,
"train_runtime": 5575.3809,
"train_samples_per_second": 12.252,
"train_steps_per_second": 0.192
}
],
"logging_steps": 10,
"max_steps": 1068,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 49945662455808.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}