{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 782,
"global_step": 3910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 186.5393524169922,
"learning_rate": 1.2787723785166243e-08,
"loss": 9.9143,
"step": 1
},
{
"epoch": 0.04,
"grad_norm": 79.81620788574219,
"learning_rate": 4.0920716112531976e-07,
"loss": 9.1648,
"step": 32
},
{
"epoch": 0.08,
"grad_norm": 42.39503860473633,
"learning_rate": 8.184143222506395e-07,
"loss": 7.594,
"step": 64
},
{
"epoch": 0.12,
"grad_norm": 39.50730514526367,
"learning_rate": 1.2276214833759592e-06,
"loss": 7.1797,
"step": 96
},
{
"epoch": 0.16,
"grad_norm": 66.16382598876953,
"learning_rate": 1.636828644501279e-06,
"loss": 6.8731,
"step": 128
},
{
"epoch": 0.2,
"grad_norm": 73.76081848144531,
"learning_rate": 2.0460358056265987e-06,
"loss": 6.2698,
"step": 160
},
{
"epoch": 0.25,
"grad_norm": 83.69901275634766,
"learning_rate": 2.4552429667519184e-06,
"loss": 5.4165,
"step": 192
},
{
"epoch": 0.29,
"grad_norm": 83.07150268554688,
"learning_rate": 2.864450127877238e-06,
"loss": 4.1512,
"step": 224
},
{
"epoch": 0.33,
"grad_norm": 89.9841079711914,
"learning_rate": 3.273657289002558e-06,
"loss": 2.7473,
"step": 256
},
{
"epoch": 0.37,
"grad_norm": 47.78603744506836,
"learning_rate": 3.6828644501278778e-06,
"loss": 1.3029,
"step": 288
},
{
"epoch": 0.41,
"grad_norm": 16.27153205871582,
"learning_rate": 4.092071611253197e-06,
"loss": 0.5736,
"step": 320
},
{
"epoch": 0.45,
"grad_norm": 8.598386764526367,
"learning_rate": 4.501278772378517e-06,
"loss": 0.4362,
"step": 352
},
{
"epoch": 0.49,
"grad_norm": 6.162290096282959,
"learning_rate": 4.910485933503837e-06,
"loss": 0.3712,
"step": 384
},
{
"epoch": 0.53,
"grad_norm": 4.307323455810547,
"learning_rate": 4.999377365452712e-06,
"loss": 0.3602,
"step": 416
},
{
"epoch": 0.57,
"grad_norm": 3.5536632537841797,
"learning_rate": 4.996763860622537e-06,
"loss": 0.3408,
"step": 448
},
{
"epoch": 0.61,
"grad_norm": 2.9953174591064453,
"learning_rate": 4.992112801917064e-06,
"loss": 0.3275,
"step": 480
},
{
"epoch": 0.65,
"grad_norm": 2.5501198768615723,
"learning_rate": 4.985427984962641e-06,
"loss": 0.3398,
"step": 512
},
{
"epoch": 0.7,
"grad_norm": 2.0887036323547363,
"learning_rate": 4.976714865090827e-06,
"loss": 0.3333,
"step": 544
},
{
"epoch": 0.74,
"grad_norm": 2.0213687419891357,
"learning_rate": 4.965980552886427e-06,
"loss": 0.3231,
"step": 576
},
{
"epoch": 0.78,
"grad_norm": 1.822816252708435,
"learning_rate": 4.953233808384689e-06,
"loss": 0.3235,
"step": 608
},
{
"epoch": 0.82,
"grad_norm": 1.6860909461975098,
"learning_rate": 4.9384850339224405e-06,
"loss": 0.3146,
"step": 640
},
{
"epoch": 0.86,
"grad_norm": 1.4786981344223022,
"learning_rate": 4.92174626564897e-06,
"loss": 0.3147,
"step": 672
},
{
"epoch": 0.9,
"grad_norm": 1.4971543550491333,
"learning_rate": 4.903031163703588e-06,
"loss": 0.3159,
"step": 704
},
{
"epoch": 0.94,
"grad_norm": 1.6120744943618774,
"learning_rate": 4.882355001067892e-06,
"loss": 0.3172,
"step": 736
},
{
"epoch": 0.98,
"grad_norm": 1.7643611431121826,
"learning_rate": 4.859734651101821e-06,
"loss": 0.3179,
"step": 768
},
{
"epoch": 1.02,
"grad_norm": 1.4471312761306763,
"learning_rate": 4.835188573773681e-06,
"loss": 0.3146,
"step": 800
},
{
"epoch": 1.06,
"grad_norm": 1.5208021402359009,
"learning_rate": 4.808736800595372e-06,
"loss": 0.2978,
"step": 832
},
{
"epoch": 1.1,
"grad_norm": 1.2985948324203491,
"learning_rate": 4.78040091827511e-06,
"loss": 0.2945,
"step": 864
},
{
"epoch": 1.15,
"grad_norm": 1.3271950483322144,
"learning_rate": 4.750204051100996e-06,
"loss": 0.2864,
"step": 896
},
{
"epoch": 1.19,
"grad_norm": 1.5903184413909912,
"learning_rate": 4.718170842069793e-06,
"loss": 0.3022,
"step": 928
},
{
"epoch": 1.23,
"grad_norm": 1.446515679359436,
"learning_rate": 4.6843274327763165e-06,
"loss": 0.3016,
"step": 960
},
{
"epoch": 1.27,
"grad_norm": 1.1784732341766357,
"learning_rate": 4.648701442079864e-06,
"loss": 0.2862,
"step": 992
},
{
"epoch": 1.31,
"grad_norm": 1.4098988771438599,
"learning_rate": 4.611321943565065e-06,
"loss": 0.2882,
"step": 1024
},
{
"epoch": 1.35,
"grad_norm": 1.194462537765503,
"learning_rate": 4.5722194418155756e-06,
"loss": 0.2867,
"step": 1056
},
{
"epoch": 1.39,
"grad_norm": 1.4424467086791992,
"learning_rate": 4.531425847519958e-06,
"loss": 0.2979,
"step": 1088
},
{
"epoch": 1.43,
"grad_norm": 1.1310840845108032,
"learning_rate": 4.488974451430077e-06,
"loss": 0.304,
"step": 1120
},
{
"epoch": 1.47,
"grad_norm": 1.291769027709961,
"learning_rate": 4.444899897193247e-06,
"loss": 0.295,
"step": 1152
},
{
"epoch": 1.51,
"grad_norm": 1.2577590942382812,
"learning_rate": 4.399238153080317e-06,
"loss": 0.2827,
"step": 1184
},
{
"epoch": 1.55,
"grad_norm": 1.5202428102493286,
"learning_rate": 4.352026482632762e-06,
"loss": 0.2936,
"step": 1216
},
{
"epoch": 1.6,
"grad_norm": 1.2438831329345703,
"learning_rate": 4.303303414252724e-06,
"loss": 0.2861,
"step": 1248
},
{
"epoch": 1.64,
"grad_norm": 1.170016884803772,
"learning_rate": 4.253108709760838e-06,
"loss": 0.2949,
"step": 1280
},
{
"epoch": 1.68,
"grad_norm": 1.502139925956726,
"learning_rate": 4.201483331947488e-06,
"loss": 0.29,
"step": 1312
},
{
"epoch": 1.72,
"grad_norm": 1.021010398864746,
"learning_rate": 4.148469411143973e-06,
"loss": 0.2832,
"step": 1344
},
{
"epoch": 1.76,
"grad_norm": 1.2760310173034668,
"learning_rate": 4.094110210840879e-06,
"loss": 0.2855,
"step": 1376
},
{
"epoch": 1.8,
"grad_norm": 1.6488782167434692,
"learning_rate": 4.038450092381697e-06,
"loss": 0.2897,
"step": 1408
},
{
"epoch": 1.84,
"grad_norm": 1.4734816551208496,
"learning_rate": 3.981534478760508e-06,
"loss": 0.2945,
"step": 1440
},
{
"epoch": 1.88,
"grad_norm": 1.2717450857162476,
"learning_rate": 3.923409817553284e-06,
"loss": 0.2865,
"step": 1472
},
{
"epoch": 1.92,
"grad_norm": 1.449500322341919,
"learning_rate": 3.864123543013044e-06,
"loss": 0.2913,
"step": 1504
},
{
"epoch": 1.96,
"grad_norm": 0.9587951302528381,
"learning_rate": 3.8037240373598077e-06,
"loss": 0.2798,
"step": 1536
},
{
"epoch": 2.01,
"grad_norm": 1.0608307123184204,
"learning_rate": 3.7422605912969334e-06,
"loss": 0.2832,
"step": 1568
},
{
"epoch": 2.05,
"grad_norm": 1.510848045349121,
"learning_rate": 3.679783363786063e-06,
"loss": 0.27,
"step": 1600
},
{
"epoch": 2.09,
"grad_norm": 1.4769214391708374,
"learning_rate": 3.6163433411135003e-06,
"loss": 0.2616,
"step": 1632
},
{
"epoch": 2.13,
"grad_norm": 1.3239907026290894,
"learning_rate": 3.551992295281431e-06,
"loss": 0.2653,
"step": 1664
},
{
"epoch": 2.17,
"grad_norm": 1.6486111879348755,
"learning_rate": 3.48678274175793e-06,
"loss": 0.2579,
"step": 1696
},
{
"epoch": 2.21,
"grad_norm": 1.3881120681762695,
"learning_rate": 3.420767896620249e-06,
"loss": 0.2657,
"step": 1728
},
{
"epoch": 2.25,
"grad_norm": 1.0850417613983154,
"learning_rate": 3.3540016331263526e-06,
"loss": 0.2594,
"step": 1760
},
{
"epoch": 2.29,
"grad_norm": 1.369093656539917,
"learning_rate": 3.2865384377501385e-06,
"loss": 0.2605,
"step": 1792
},
{
"epoch": 2.33,
"grad_norm": 1.3315595388412476,
"learning_rate": 3.2184333657162297e-06,
"loss": 0.2538,
"step": 1824
},
{
"epoch": 2.37,
"grad_norm": 1.4239180088043213,
"learning_rate": 3.1497419960706235e-06,
"loss": 0.2632,
"step": 1856
},
{
"epoch": 2.41,
"grad_norm": 1.4137816429138184,
"learning_rate": 3.080520386323853e-06,
"loss": 0.2639,
"step": 1888
},
{
"epoch": 2.46,
"grad_norm": 1.2701876163482666,
"learning_rate": 3.0108250267036976e-06,
"loss": 0.2664,
"step": 1920
},
{
"epoch": 2.5,
"grad_norm": 1.0828937292099,
"learning_rate": 2.9407127940547485e-06,
"loss": 0.2667,
"step": 1952
},
{
"epoch": 2.54,
"grad_norm": 1.4425878524780273,
"learning_rate": 2.870240905422476e-06,
"loss": 0.2587,
"step": 1984
},
{
"epoch": 2.58,
"grad_norm": 1.6281284093856812,
"learning_rate": 2.7994668713596598e-06,
"loss": 0.2593,
"step": 2016
},
{
"epoch": 2.62,
"grad_norm": 1.3322839736938477,
"learning_rate": 2.728448448993292e-06,
"loss": 0.2596,
"step": 2048
},
{
"epoch": 2.66,
"grad_norm": 1.1276755332946777,
"learning_rate": 2.65724359489027e-06,
"loss": 0.2562,
"step": 2080
},
{
"epoch": 2.7,
"grad_norm": 1.6861284971237183,
"learning_rate": 2.5859104177603146e-06,
"loss": 0.2667,
"step": 2112
},
{
"epoch": 2.74,
"grad_norm": 1.2902480363845825,
"learning_rate": 2.514507131034735e-06,
"loss": 0.2605,
"step": 2144
},
{
"epoch": 2.78,
"grad_norm": 1.439092755317688,
"learning_rate": 2.443092005359736e-06,
"loss": 0.2616,
"step": 2176
},
{
"epoch": 2.82,
"grad_norm": 1.6176005601882935,
"learning_rate": 2.3717233210430258e-06,
"loss": 0.2627,
"step": 2208
},
{
"epoch": 2.86,
"grad_norm": 1.1382880210876465,
"learning_rate": 2.300459320492547e-06,
"loss": 0.2536,
"step": 2240
},
{
"epoch": 2.91,
"grad_norm": 1.4830528497695923,
"learning_rate": 2.2293581606861298e-06,
"loss": 0.2613,
"step": 2272
},
{
"epoch": 2.95,
"grad_norm": 1.468441367149353,
"learning_rate": 2.158477865710868e-06,
"loss": 0.2661,
"step": 2304
},
{
"epoch": 2.99,
"grad_norm": 1.5543335676193237,
"learning_rate": 2.087876279410942e-06,
"loss": 0.2628,
"step": 2336
},
{
"epoch": 3.03,
"grad_norm": 1.4273053407669067,
"learning_rate": 2.017611018182533e-06,
"loss": 0.2549,
"step": 2368
},
{
"epoch": 3.07,
"grad_norm": 1.910224199295044,
"learning_rate": 1.94773942395436e-06,
"loss": 0.2345,
"step": 2400
},
{
"epoch": 3.11,
"grad_norm": 1.6112534999847412,
"learning_rate": 1.8783185173921847e-06,
"loss": 0.2335,
"step": 2432
},
{
"epoch": 3.15,
"grad_norm": 1.7274821996688843,
"learning_rate": 1.8094049513655191e-06,
"loss": 0.2391,
"step": 2464
},
{
"epoch": 3.19,
"grad_norm": 1.3582186698913574,
"learning_rate": 1.7410549647144598e-06,
"loss": 0.2339,
"step": 2496
},
{
"epoch": 3.23,
"grad_norm": 1.3842142820358276,
"learning_rate": 1.6733243363544154e-06,
"loss": 0.2326,
"step": 2528
},
{
"epoch": 3.27,
"grad_norm": 1.635377287864685,
"learning_rate": 1.606268339756166e-06,
"loss": 0.2416,
"step": 2560
},
{
"epoch": 3.31,
"grad_norm": 1.555497407913208,
"learning_rate": 1.5399416978383985e-06,
"loss": 0.2285,
"step": 2592
},
{
"epoch": 3.36,
"grad_norm": 1.3637293577194214,
"learning_rate": 1.4743985383095478e-06,
"loss": 0.2283,
"step": 2624
},
{
"epoch": 3.4,
"grad_norm": 1.5432817935943604,
"learning_rate": 1.409692349495363e-06,
"loss": 0.2302,
"step": 2656
},
{
"epoch": 3.44,
"grad_norm": 1.5474470853805542,
"learning_rate": 1.345875936688268e-06,
"loss": 0.2356,
"step": 2688
},
{
"epoch": 3.48,
"grad_norm": 1.5453095436096191,
"learning_rate": 1.283001379054128e-06,
"loss": 0.2421,
"step": 2720
},
{
"epoch": 3.52,
"grad_norm": 1.3231130838394165,
"learning_rate": 1.2211199871315932e-06,
"loss": 0.2303,
"step": 2752
},
{
"epoch": 3.56,
"grad_norm": 1.3008874654769897,
"learning_rate": 1.160282260958692e-06,
"loss": 0.2337,
"step": 2784
},
{
"epoch": 3.6,
"grad_norm": 1.9900566339492798,
"learning_rate": 1.1005378488608783e-06,
"loss": 0.2331,
"step": 2816
},
{
"epoch": 3.64,
"grad_norm": 1.4739086627960205,
"learning_rate": 1.0419355069341206e-06,
"loss": 0.2363,
"step": 2848
},
{
"epoch": 3.68,
"grad_norm": 1.4675333499908447,
"learning_rate": 9.845230592561273e-07,
"loss": 0.2207,
"step": 2880
},
{
"epoch": 3.72,
"grad_norm": 1.4848682880401611,
"learning_rate": 9.283473588581784e-07,
"loss": 0.2287,
"step": 2912
},
{
"epoch": 3.76,
"grad_norm": 1.7442777156829834,
"learning_rate": 8.734542494893955e-07,
"loss": 0.2341,
"step": 2944
},
{
"epoch": 3.81,
"grad_norm": 1.6904102563858032,
"learning_rate": 8.198885282046712e-07,
"loss": 0.2304,
"step": 2976
},
{
"epoch": 3.85,
"grad_norm": 1.308547019958496,
"learning_rate": 7.676939088067847e-07,
"loss": 0.231,
"step": 3008
},
{
"epoch": 3.89,
"grad_norm": 1.2075304985046387,
"learning_rate": 7.169129861725297e-07,
"loss": 0.2337,
"step": 3040
},
{
"epoch": 3.93,
"grad_norm": 1.1886279582977295,
"learning_rate": 6.675872014919738e-07,
"loss": 0.2263,
"step": 3072
},
{
"epoch": 3.97,
"grad_norm": 1.3249859809875488,
"learning_rate": 6.197568084492203e-07,
"loss": 0.2287,
"step": 3104
},
{
"epoch": 4.01,
"grad_norm": 1.322100281715393,
"learning_rate": 5.734608403722674e-07,
"loss": 0.2245,
"step": 3136
},
{
"epoch": 4.05,
"grad_norm": 1.4475897550582886,
"learning_rate": 5.287370783787649e-07,
"loss": 0.2136,
"step": 3168
},
{
"epoch": 4.09,
"grad_norm": 1.3953217267990112,
"learning_rate": 4.856220205436834e-07,
"loss": 0.2062,
"step": 3200
},
{
"epoch": 4.13,
"grad_norm": 1.5530751943588257,
"learning_rate": 4.441508521140392e-07,
"loss": 0.2133,
"step": 3232
},
{
"epoch": 4.17,
"grad_norm": 1.3993407487869263,
"learning_rate": 4.043574167949893e-07,
"loss": 0.2086,
"step": 3264
},
{
"epoch": 4.21,
"grad_norm": 1.5585782527923584,
"learning_rate": 3.66274189130732e-07,
"loss": 0.2176,
"step": 3296
},
{
"epoch": 4.26,
"grad_norm": 1.8112726211547852,
"learning_rate": 3.299322480027498e-07,
"loss": 0.2095,
"step": 3328
},
{
"epoch": 4.3,
"grad_norm": 1.3818844556808472,
"learning_rate": 2.9536125126701565e-07,
"loss": 0.2083,
"step": 3360
},
{
"epoch": 4.34,
"grad_norm": 2.006455898284912,
"learning_rate": 2.6258941155087783e-07,
"loss": 0.2165,
"step": 3392
},
{
"epoch": 4.38,
"grad_norm": 1.5955146551132202,
"learning_rate": 2.3164347322935127e-07,
"loss": 0.2116,
"step": 3424
},
{
"epoch": 4.42,
"grad_norm": 1.6406195163726807,
"learning_rate": 2.0254869059962628e-07,
"loss": 0.2085,
"step": 3456
},
{
"epoch": 4.46,
"grad_norm": 1.5686984062194824,
"learning_rate": 1.75328807271595e-07,
"loss": 0.2083,
"step": 3488
},
{
"epoch": 4.5,
"grad_norm": 1.7452423572540283,
"learning_rate": 1.5000603679121456e-07,
"loss": 0.2067,
"step": 3520
},
{
"epoch": 4.54,
"grad_norm": 1.6517360210418701,
"learning_rate": 1.2660104451252043e-07,
"loss": 0.2091,
"step": 3552
},
{
"epoch": 4.58,
"grad_norm": 1.559053897857666,
"learning_rate": 1.0513293073309089e-07,
"loss": 0.2133,
"step": 3584
},
{
"epoch": 4.62,
"grad_norm": 1.491980791091919,
"learning_rate": 8.561921510671312e-08,
"loss": 0.2109,
"step": 3616
},
{
"epoch": 4.66,
"grad_norm": 1.7280569076538086,
"learning_rate": 6.807582234598042e-08,
"loss": 0.2195,
"step": 3648
},
{
"epoch": 4.71,
"grad_norm": 1.3292275667190552,
"learning_rate": 5.2517069226488694e-08,
"loss": 0.2126,
"step": 3680
},
{
"epoch": 4.75,
"grad_norm": 1.9652115106582642,
"learning_rate": 3.8955652903228114e-08,
"loss": 0.2184,
"step": 3712
},
{
"epoch": 4.79,
"grad_norm": 1.3413430452346802,
"learning_rate": 2.7402640548717107e-08,
"loss": 0.2089,
"step": 3744
},
{
"epoch": 4.83,
"grad_norm": 1.6649166345596313,
"learning_rate": 1.786746032132747e-08,
"loss": 0.2077,
"step": 3776
},
{
"epoch": 4.87,
"grad_norm": 1.5024484395980835,
"learning_rate": 1.0357893671171793e-08,
"loss": 0.2167,
"step": 3808
},
{
"epoch": 4.91,
"grad_norm": 1.7963902950286865,
"learning_rate": 4.8800689898337304e-09,
"loss": 0.2073,
"step": 3840
},
{
"epoch": 4.95,
"grad_norm": 1.2430423498153687,
"learning_rate": 1.4384566091227293e-09,
"loss": 0.2086,
"step": 3872
},
{
"epoch": 4.99,
"grad_norm": 1.498355507850647,
"learning_rate": 3.586515293557691e-11,
"loss": 0.2099,
"step": 3904
}
],
"logging_steps": 32,
"max_steps": 3910,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 782,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}