{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 1827,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016420361247947456,
"grad_norm": 0.24344406443731442,
"learning_rate": 1.092896174863388e-05,
"loss": 0.9892,
"step": 10
},
{
"epoch": 0.03284072249589491,
"grad_norm": 0.4125241392130406,
"learning_rate": 2.185792349726776e-05,
"loss": 1.0156,
"step": 20
},
{
"epoch": 0.04926108374384237,
"grad_norm": 0.6820350190299901,
"learning_rate": 3.2786885245901635e-05,
"loss": 0.9011,
"step": 30
},
{
"epoch": 0.06568144499178982,
"grad_norm": 0.3337377325006549,
"learning_rate": 4.371584699453552e-05,
"loss": 0.7712,
"step": 40
},
{
"epoch": 0.08210180623973727,
"grad_norm": 0.25141334294130874,
"learning_rate": 5.4644808743169406e-05,
"loss": 0.6344,
"step": 50
},
{
"epoch": 0.09852216748768473,
"grad_norm": 0.24065515098304474,
"learning_rate": 6.557377049180327e-05,
"loss": 0.5561,
"step": 60
},
{
"epoch": 0.11494252873563218,
"grad_norm": 0.22706179761932083,
"learning_rate": 7.650273224043716e-05,
"loss": 0.5197,
"step": 70
},
{
"epoch": 0.13136288998357964,
"grad_norm": 0.27088874717723194,
"learning_rate": 8.743169398907104e-05,
"loss": 0.5356,
"step": 80
},
{
"epoch": 0.1477832512315271,
"grad_norm": 0.20612532388807284,
"learning_rate": 9.836065573770493e-05,
"loss": 0.4876,
"step": 90
},
{
"epoch": 0.16420361247947454,
"grad_norm": 0.19862741522784605,
"learning_rate": 0.00010928961748633881,
"loss": 0.4525,
"step": 100
},
{
"epoch": 0.16420361247947454,
"eval_loss": 0.46812304854393005,
"eval_runtime": 183.2125,
"eval_samples_per_second": 23.639,
"eval_steps_per_second": 2.958,
"step": 100
},
{
"epoch": 0.180623973727422,
"grad_norm": 0.2101845619328537,
"learning_rate": 0.00012021857923497268,
"loss": 0.506,
"step": 110
},
{
"epoch": 0.19704433497536947,
"grad_norm": 0.2349100061009304,
"learning_rate": 0.00013114754098360654,
"loss": 0.4962,
"step": 120
},
{
"epoch": 0.2134646962233169,
"grad_norm": 0.28742618935606645,
"learning_rate": 0.00014207650273224045,
"loss": 0.4713,
"step": 130
},
{
"epoch": 0.22988505747126436,
"grad_norm": 0.29754352649151433,
"learning_rate": 0.0001530054644808743,
"loss": 0.4533,
"step": 140
},
{
"epoch": 0.24630541871921183,
"grad_norm": 0.24471382470916278,
"learning_rate": 0.0001639344262295082,
"loss": 0.4405,
"step": 150
},
{
"epoch": 0.2627257799671593,
"grad_norm": 0.25817141257326837,
"learning_rate": 0.00017486338797814208,
"loss": 0.4467,
"step": 160
},
{
"epoch": 0.2791461412151067,
"grad_norm": 0.2989945379301817,
"learning_rate": 0.00018579234972677597,
"loss": 0.4422,
"step": 170
},
{
"epoch": 0.2955665024630542,
"grad_norm": 0.3228970636746939,
"learning_rate": 0.00019672131147540985,
"loss": 0.4353,
"step": 180
},
{
"epoch": 0.31198686371100165,
"grad_norm": 0.20939874727194543,
"learning_rate": 0.00019999105344723812,
"loss": 0.4237,
"step": 190
},
{
"epoch": 0.3284072249589491,
"grad_norm": 0.18763922543710346,
"learning_rate": 0.0001999472374506253,
"loss": 0.4298,
"step": 200
},
{
"epoch": 0.3284072249589491,
"eval_loss": 0.421655535697937,
"eval_runtime": 183.1202,
"eval_samples_per_second": 23.651,
"eval_steps_per_second": 2.96,
"step": 200
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.20346107785301223,
"learning_rate": 0.00019986692474561292,
"loss": 0.4229,
"step": 210
},
{
"epoch": 0.361247947454844,
"grad_norm": 0.2148306578056735,
"learning_rate": 0.00019975014465916825,
"loss": 0.4065,
"step": 220
},
{
"epoch": 0.37766830870279144,
"grad_norm": 0.232976318701319,
"learning_rate": 0.00019959693983467874,
"loss": 0.4122,
"step": 230
},
{
"epoch": 0.39408866995073893,
"grad_norm": 0.26419924004160084,
"learning_rate": 0.00019940736621638,
"loss": 0.4045,
"step": 240
},
{
"epoch": 0.41050903119868637,
"grad_norm": 0.26529960716769463,
"learning_rate": 0.00019918149302892746,
"loss": 0.4303,
"step": 250
},
{
"epoch": 0.4269293924466338,
"grad_norm": 0.2070163052593257,
"learning_rate": 0.0001989194027521181,
"loss": 0.4064,
"step": 260
},
{
"epoch": 0.4433497536945813,
"grad_norm": 0.22108521642249446,
"learning_rate": 0.00019862119109077223,
"loss": 0.3908,
"step": 270
},
{
"epoch": 0.45977011494252873,
"grad_norm": 0.22405910167315063,
"learning_rate": 0.00019828696693978615,
"loss": 0.4068,
"step": 280
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.2185643225410028,
"learning_rate": 0.00019791685234436771,
"loss": 0.3992,
"step": 290
},
{
"epoch": 0.49261083743842365,
"grad_norm": 0.19006575606804949,
"learning_rate": 0.0001975109824554707,
"loss": 0.4128,
"step": 300
},
{
"epoch": 0.49261083743842365,
"eval_loss": 0.4074872136116028,
"eval_runtime": 182.3826,
"eval_samples_per_second": 23.747,
"eval_steps_per_second": 2.972,
"step": 300
},
{
"epoch": 0.5090311986863711,
"grad_norm": 0.19145583439083957,
"learning_rate": 0.0001970695054804429,
"loss": 0.4034,
"step": 310
},
{
"epoch": 0.5254515599343186,
"grad_norm": 0.2273380197603789,
"learning_rate": 0.00019659258262890683,
"loss": 0.4197,
"step": 320
},
{
"epoch": 0.541871921182266,
"grad_norm": 0.22091852959390795,
"learning_rate": 0.00019608038805389252,
"loss": 0.3991,
"step": 330
},
{
"epoch": 0.5582922824302134,
"grad_norm": 0.22367242517683747,
"learning_rate": 0.00019553310878824373,
"loss": 0.3861,
"step": 340
},
{
"epoch": 0.5747126436781609,
"grad_norm": 0.17219186625619032,
"learning_rate": 0.00019495094467632113,
"loss": 0.3979,
"step": 350
},
{
"epoch": 0.5911330049261084,
"grad_norm": 0.19351650812107563,
"learning_rate": 0.00019433410830102722,
"loss": 0.3815,
"step": 360
},
{
"epoch": 0.6075533661740559,
"grad_norm": 0.23951228194879498,
"learning_rate": 0.00019368282490617964,
"loss": 0.4016,
"step": 370
},
{
"epoch": 0.6239737274220033,
"grad_norm": 0.18151574403855258,
"learning_rate": 0.000192997332314261,
"loss": 0.4116,
"step": 380
},
{
"epoch": 0.6403940886699507,
"grad_norm": 0.2062847844928879,
"learning_rate": 0.0001922778808395759,
"loss": 0.4316,
"step": 390
},
{
"epoch": 0.6568144499178982,
"grad_norm": 0.2100697627990869,
"learning_rate": 0.0001915247331968461,
"loss": 0.4247,
"step": 400
},
{
"epoch": 0.6568144499178982,
"eval_loss": 0.3982411026954651,
"eval_runtime": 182.3128,
"eval_samples_per_second": 23.756,
"eval_steps_per_second": 2.973,
"step": 400
},
{
"epoch": 0.6732348111658456,
"grad_norm": 0.2045711407622735,
"learning_rate": 0.00019073816440527778,
"loss": 0.4004,
"step": 410
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.2127829647814462,
"learning_rate": 0.00018991846168813544,
"loss": 0.3842,
"step": 420
},
{
"epoch": 0.7060755336617406,
"grad_norm": 0.21955839561937393,
"learning_rate": 0.00018906592436785966,
"loss": 0.3999,
"step": 430
},
{
"epoch": 0.722495894909688,
"grad_norm": 0.1655887989642121,
"learning_rate": 0.00018818086375676653,
"loss": 0.4066,
"step": 440
},
{
"epoch": 0.7389162561576355,
"grad_norm": 0.24085315390514417,
"learning_rate": 0.00018726360304336894,
"loss": 0.3908,
"step": 450
},
{
"epoch": 0.7553366174055829,
"grad_norm": 0.21190935166305344,
"learning_rate": 0.00018631447717436115,
"loss": 0.4083,
"step": 460
},
{
"epoch": 0.7717569786535303,
"grad_norm": 0.20895160435862614,
"learning_rate": 0.00018533383273230966,
"loss": 0.3995,
"step": 470
},
{
"epoch": 0.7881773399014779,
"grad_norm": 0.2711577365645202,
"learning_rate": 0.0001843220278090954,
"loss": 0.3645,
"step": 480
},
{
"epoch": 0.8045977011494253,
"grad_norm": 0.18438724587907176,
"learning_rate": 0.00018327943187515278,
"loss": 0.3823,
"step": 490
},
{
"epoch": 0.8210180623973727,
"grad_norm": 0.2604449097960011,
"learning_rate": 0.000182206425644554,
"loss": 0.3922,
"step": 500
},
{
"epoch": 0.8210180623973727,
"eval_loss": 0.39160048961639404,
"eval_runtime": 182.2512,
"eval_samples_per_second": 23.764,
"eval_steps_per_second": 2.974,
"step": 500
},
{
"epoch": 0.8374384236453202,
"grad_norm": 0.17015787500883076,
"learning_rate": 0.0001811034009359877,
"loss": 0.3849,
"step": 510
},
{
"epoch": 0.8538587848932676,
"grad_norm": 0.24475801339167882,
"learning_rate": 0.0001799707605296825,
"loss": 0.3985,
"step": 520
},
{
"epoch": 0.8702791461412152,
"grad_norm": 0.19669186817709053,
"learning_rate": 0.00017880891802032775,
"loss": 0.3912,
"step": 530
},
{
"epoch": 0.8866995073891626,
"grad_norm": 0.21566193043161921,
"learning_rate": 0.00017761829766604556,
"loss": 0.3618,
"step": 540
},
{
"epoch": 0.90311986863711,
"grad_norm": 0.16411309691606532,
"learning_rate": 0.0001763993342334688,
"loss": 0.3742,
"step": 550
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.20572670194937387,
"learning_rate": 0.00017515247283898165,
"loss": 0.4028,
"step": 560
},
{
"epoch": 0.9359605911330049,
"grad_norm": 0.18095400431338843,
"learning_rate": 0.0001738781687861812,
"loss": 0.3687,
"step": 570
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.17975479637736744,
"learning_rate": 0.0001725768873996188,
"loss": 0.409,
"step": 580
},
{
"epoch": 0.9688013136288999,
"grad_norm": 0.18067906778640103,
"learning_rate": 0.00017124910385488238,
"loss": 0.3695,
"step": 590
},
{
"epoch": 0.9852216748768473,
"grad_norm": 0.28537458222870044,
"learning_rate": 0.00016989530300508124,
"loss": 0.3634,
"step": 600
},
{
"epoch": 0.9852216748768473,
"eval_loss": 0.3875725269317627,
"eval_runtime": 184.7983,
"eval_samples_per_second": 23.436,
"eval_steps_per_second": 2.933,
"step": 600
},
{
"epoch": 1.0016420361247949,
"grad_norm": 0.24626304392948783,
"learning_rate": 0.00016851597920379741,
"loss": 0.3756,
"step": 610
},
{
"epoch": 1.0180623973727423,
"grad_norm": 0.2377389610644995,
"learning_rate": 0.00016711163612456758,
"loss": 0.3631,
"step": 620
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.2001960352295221,
"learning_rate": 0.00016568278657696164,
"loss": 0.3469,
"step": 630
},
{
"epoch": 1.0509031198686372,
"grad_norm": 0.24763506441381955,
"learning_rate": 0.00016422995231932548,
"loss": 0.3675,
"step": 640
},
{
"epoch": 1.0673234811165846,
"grad_norm": 0.2770882265665396,
"learning_rate": 0.00016275366386825572,
"loss": 0.3556,
"step": 650
},
{
"epoch": 1.083743842364532,
"grad_norm": 0.22949474804382808,
"learning_rate": 0.00016125446030487643,
"loss": 0.3837,
"step": 660
},
{
"epoch": 1.1001642036124795,
"grad_norm": 0.2312352658660111,
"learning_rate": 0.00015973288907798842,
"loss": 0.3399,
"step": 670
},
{
"epoch": 1.116584564860427,
"grad_norm": 0.3016093942936665,
"learning_rate": 0.0001581895058041629,
"loss": 0.353,
"step": 680
},
{
"epoch": 1.1330049261083743,
"grad_norm": 0.20329768179994662,
"learning_rate": 0.00015662487406485273,
"loss": 0.3567,
"step": 690
},
{
"epoch": 1.1494252873563218,
"grad_norm": 0.24197160472933935,
"learning_rate": 0.00015503956520059525,
"loss": 0.351,
"step": 700
},
{
"epoch": 1.1494252873563218,
"eval_loss": 0.3857228755950928,
"eval_runtime": 182.3099,
"eval_samples_per_second": 23.756,
"eval_steps_per_second": 2.973,
"step": 700
},
{
"epoch": 1.1658456486042692,
"grad_norm": 0.2224497243857267,
"learning_rate": 0.0001534341581023814,
"loss": 0.3603,
"step": 710
},
{
"epoch": 1.1822660098522166,
"grad_norm": 0.20517751734717224,
"learning_rate": 0.00015180923900026848,
"loss": 0.3681,
"step": 720
},
{
"epoch": 1.1986863711001643,
"grad_norm": 0.2731449513925482,
"learning_rate": 0.0001501654012493121,
"loss": 0.3812,
"step": 730
},
{
"epoch": 1.2151067323481117,
"grad_norm": 0.24191327156963474,
"learning_rate": 0.0001485032451128971,
"loss": 0.3665,
"step": 740
},
{
"epoch": 1.2315270935960592,
"grad_norm": 0.21627312539889124,
"learning_rate": 0.00014682337754354534,
"loss": 0.3564,
"step": 750
},
{
"epoch": 1.2479474548440066,
"grad_norm": 0.17826400495436648,
"learning_rate": 0.00014512641196128115,
"loss": 0.3705,
"step": 760
},
{
"epoch": 1.264367816091954,
"grad_norm": 0.24384067958647468,
"learning_rate": 0.000143412968029635,
"loss": 0.3659,
"step": 770
},
{
"epoch": 1.2807881773399015,
"grad_norm": 0.2622553973852724,
"learning_rate": 0.00014168367142936735,
"loss": 0.3777,
"step": 780
},
{
"epoch": 1.297208538587849,
"grad_norm": 0.24127081497225922,
"learning_rate": 0.00013993915362999515,
"loss": 0.3636,
"step": 790
},
{
"epoch": 1.3136288998357963,
"grad_norm": 0.20872541872479608,
"learning_rate": 0.00013818005165920467,
"loss": 0.3613,
"step": 800
},
{
"epoch": 1.3136288998357963,
"eval_loss": 0.3825320899486542,
"eval_runtime": 181.276,
"eval_samples_per_second": 23.892,
"eval_steps_per_second": 2.99,
"step": 800
},
{
"epoch": 1.3300492610837438,
"grad_norm": 0.22884530054863364,
"learning_rate": 0.00013640700787023464,
"loss": 0.3487,
"step": 810
},
{
"epoch": 1.3464696223316914,
"grad_norm": 0.3192682013485456,
"learning_rate": 0.00013462066970731454,
"loss": 0.3644,
"step": 820
},
{
"epoch": 1.3628899835796386,
"grad_norm": 0.24655216410147654,
"learning_rate": 0.00013282168946924424,
"loss": 0.3571,
"step": 830
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.18787989160895535,
"learning_rate": 0.00013101072407120057,
"loss": 0.351,
"step": 840
},
{
"epoch": 1.3957307060755337,
"grad_norm": 0.1848697738688275,
"learning_rate": 0.0001291884348048584,
"loss": 0.3891,
"step": 850
},
{
"epoch": 1.4121510673234812,
"grad_norm": 0.22237495334933807,
"learning_rate": 0.00012735548709691356,
"loss": 0.3774,
"step": 860
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.2466680297782571,
"learning_rate": 0.0001255125502660958,
"loss": 0.3733,
"step": 870
},
{
"epoch": 1.444991789819376,
"grad_norm": 0.2335451857601313,
"learning_rate": 0.0001236602972787604,
"loss": 0.3826,
"step": 880
},
{
"epoch": 1.4614121510673235,
"grad_norm": 0.26118360054137263,
"learning_rate": 0.00012179940450314816,
"loss": 0.3742,
"step": 890
},
{
"epoch": 1.477832512315271,
"grad_norm": 0.2378914948570509,
"learning_rate": 0.00011993055146240273,
"loss": 0.3655,
"step": 900
},
{
"epoch": 1.477832512315271,
"eval_loss": 0.3792349696159363,
"eval_runtime": 181.1939,
"eval_samples_per_second": 23.903,
"eval_steps_per_second": 2.991,
"step": 900
},
{
"epoch": 1.4942528735632183,
"grad_norm": 0.255487972388616,
"learning_rate": 0.00011805442058643621,
"loss": 0.3614,
"step": 910
},
{
"epoch": 1.5106732348111658,
"grad_norm": 0.19677148734311173,
"learning_rate": 0.00011617169696273325,
"loss": 0.3607,
"step": 920
},
{
"epoch": 1.5270935960591134,
"grad_norm": 0.2034148205786053,
"learning_rate": 0.00011428306808618456,
"loss": 0.3581,
"step": 930
},
{
"epoch": 1.5435139573070606,
"grad_norm": 0.2172139938997764,
"learning_rate": 0.00011238922360804159,
"loss": 0.3513,
"step": 940
},
{
"epoch": 1.5599343185550083,
"grad_norm": 0.2086331597480039,
"learning_rate": 0.00011049085508408348,
"loss": 0.3405,
"step": 950
},
{
"epoch": 1.5763546798029555,
"grad_norm": 0.21489224754509587,
"learning_rate": 0.00010858865572208892,
"loss": 0.3696,
"step": 960
},
{
"epoch": 1.5927750410509032,
"grad_norm": 0.18831018208877479,
"learning_rate": 0.00010668332012870437,
"loss": 0.3422,
"step": 970
},
{
"epoch": 1.6091954022988506,
"grad_norm": 0.25159259320461275,
"learning_rate": 0.00010477554405580183,
"loss": 0.3452,
"step": 980
},
{
"epoch": 1.625615763546798,
"grad_norm": 0.21031599290249967,
"learning_rate": 0.00010286602414641817,
"loss": 0.3521,
"step": 990
},
{
"epoch": 1.6420361247947455,
"grad_norm": 0.19651891254691545,
"learning_rate": 0.00010095545768036913,
"loss": 0.3849,
"step": 1000
},
{
"epoch": 1.6420361247947455,
"eval_loss": 0.3762163817882538,
"eval_runtime": 181.2331,
"eval_samples_per_second": 23.897,
"eval_steps_per_second": 2.991,
"step": 1000
},
{
"epoch": 1.658456486042693,
"grad_norm": 0.19968900716509402,
"learning_rate": 9.904454231963089e-05,
"loss": 0.3854,
"step": 1010
},
{
"epoch": 1.6748768472906403,
"grad_norm": 0.20362598590099687,
"learning_rate": 9.713397585358188e-05,
"loss": 0.3768,
"step": 1020
},
{
"epoch": 1.6912972085385878,
"grad_norm": 0.22197361327539747,
"learning_rate": 9.52244559441982e-05,
"loss": 0.345,
"step": 1030
},
{
"epoch": 1.7077175697865354,
"grad_norm": 0.21984040718616688,
"learning_rate": 9.331667987129567e-05,
"loss": 0.3391,
"step": 1040
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.19128872792880933,
"learning_rate": 9.14113442779111e-05,
"loss": 0.3483,
"step": 1050
},
{
"epoch": 1.7405582922824303,
"grad_norm": 0.1799425595659442,
"learning_rate": 8.950914491591653e-05,
"loss": 0.3581,
"step": 1060
},
{
"epoch": 1.7569786535303775,
"grad_norm": 0.24469075185624548,
"learning_rate": 8.761077639195845e-05,
"loss": 0.3624,
"step": 1070
},
{
"epoch": 1.7733990147783252,
"grad_norm": 0.19922186890571247,
"learning_rate": 8.571693191381545e-05,
"loss": 0.3406,
"step": 1080
},
{
"epoch": 1.7898193760262726,
"grad_norm": 0.21371381860320704,
"learning_rate": 8.38283030372668e-05,
"loss": 0.3924,
"step": 1090
},
{
"epoch": 1.80623973727422,
"grad_norm": 0.22498092502689765,
"learning_rate": 8.194557941356382e-05,
"loss": 0.3373,
"step": 1100
},
{
"epoch": 1.80623973727422,
"eval_loss": 0.37362968921661377,
"eval_runtime": 182.0338,
"eval_samples_per_second": 23.792,
"eval_steps_per_second": 2.977,
"step": 1100
},
{
"epoch": 1.8226600985221675,
"grad_norm": 0.20923612184407323,
"learning_rate": 8.006944853759732e-05,
"loss": 0.3539,
"step": 1110
},
{
"epoch": 1.839080459770115,
"grad_norm": 0.197165934554251,
"learning_rate": 7.820059549685185e-05,
"loss": 0.3366,
"step": 1120
},
{
"epoch": 1.8555008210180624,
"grad_norm": 0.2479628844731751,
"learning_rate": 7.63397027212396e-05,
"loss": 0.3533,
"step": 1130
},
{
"epoch": 1.8719211822660098,
"grad_norm": 0.2048613348077339,
"learning_rate": 7.448744973390422e-05,
"loss": 0.3433,
"step": 1140
},
{
"epoch": 1.8883415435139574,
"grad_norm": 0.1978616785725276,
"learning_rate": 7.264451290308642e-05,
"loss": 0.358,
"step": 1150
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.23907776822210447,
"learning_rate": 7.081156519514162e-05,
"loss": 0.3373,
"step": 1160
},
{
"epoch": 1.9211822660098523,
"grad_norm": 0.22697517505792655,
"learning_rate": 6.898927592879945e-05,
"loss": 0.3677,
"step": 1170
},
{
"epoch": 1.9376026272577995,
"grad_norm": 0.19952355876882955,
"learning_rate": 6.71783105307558e-05,
"loss": 0.3586,
"step": 1180
},
{
"epoch": 1.9540229885057472,
"grad_norm": 0.23973505798542807,
"learning_rate": 6.537933029268545e-05,
"loss": 0.351,
"step": 1190
},
{
"epoch": 1.9704433497536946,
"grad_norm": 0.21045951152349507,
"learning_rate": 6.359299212976534e-05,
"loss": 0.358,
"step": 1200
},
{
"epoch": 1.9704433497536946,
"eval_loss": 0.37114256620407104,
"eval_runtime": 181.5295,
"eval_samples_per_second": 23.858,
"eval_steps_per_second": 2.986,
"step": 1200
},
{
"epoch": 1.986863711001642,
"grad_norm": 0.2178513568906315,
"learning_rate": 6.181994834079534e-05,
"loss": 0.3661,
"step": 1210
},
{
"epoch": 2.0032840722495897,
"grad_norm": 0.22255069074338651,
"learning_rate": 6.006084637000486e-05,
"loss": 0.3468,
"step": 1220
},
{
"epoch": 2.019704433497537,
"grad_norm": 0.2146784712653626,
"learning_rate": 5.8316328570632706e-05,
"loss": 0.3288,
"step": 1230
},
{
"epoch": 2.0361247947454846,
"grad_norm": 0.19842848576771832,
"learning_rate": 5.6587031970365034e-05,
"loss": 0.3428,
"step": 1240
},
{
"epoch": 2.052545155993432,
"grad_norm": 0.2480996901650553,
"learning_rate": 5.487358803871887e-05,
"loss": 0.317,
"step": 1250
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.25320522417631036,
"learning_rate": 5.3176622456454693e-05,
"loss": 0.3458,
"step": 1260
},
{
"epoch": 2.0853858784893267,
"grad_norm": 0.2761010309663585,
"learning_rate": 5.1496754887102924e-05,
"loss": 0.344,
"step": 1270
},
{
"epoch": 2.1018062397372743,
"grad_norm": 0.207714571300336,
"learning_rate": 4.98345987506879e-05,
"loss": 0.3346,
"step": 1280
},
{
"epoch": 2.1182266009852215,
"grad_norm": 0.26011454554235863,
"learning_rate": 4.8190760999731524e-05,
"loss": 0.308,
"step": 1290
},
{
"epoch": 2.134646962233169,
"grad_norm": 0.22344608598475113,
"learning_rate": 4.6565841897618615e-05,
"loss": 0.3476,
"step": 1300
},
{
"epoch": 2.134646962233169,
"eval_loss": 0.37267637252807617,
"eval_runtime": 181.518,
"eval_samples_per_second": 23.86,
"eval_steps_per_second": 2.986,
"step": 1300
},
{
"epoch": 2.1510673234811164,
"grad_norm": 0.2967215583254955,
"learning_rate": 4.496043479940478e-05,
"loss": 0.3257,
"step": 1310
},
{
"epoch": 2.167487684729064,
"grad_norm": 0.2606814382277013,
"learning_rate": 4.337512593514729e-05,
"loss": 0.3386,
"step": 1320
},
{
"epoch": 2.1839080459770113,
"grad_norm": 0.23459530147231222,
"learning_rate": 4.181049419583713e-05,
"loss": 0.3222,
"step": 1330
},
{
"epoch": 2.200328407224959,
"grad_norm": 0.22931006977757953,
"learning_rate": 4.026711092201162e-05,
"loss": 0.3485,
"step": 1340
},
{
"epoch": 2.2167487684729066,
"grad_norm": 0.21406254032335828,
"learning_rate": 3.8745539695123575e-05,
"loss": 0.3317,
"step": 1350
},
{
"epoch": 2.233169129720854,
"grad_norm": 0.2246772369192269,
"learning_rate": 3.724633613174429e-05,
"loss": 0.3195,
"step": 1360
},
{
"epoch": 2.2495894909688015,
"grad_norm": 0.20058597054661914,
"learning_rate": 3.577004768067456e-05,
"loss": 0.3353,
"step": 1370
},
{
"epoch": 2.2660098522167487,
"grad_norm": 0.19548049551690252,
"learning_rate": 3.431721342303839e-05,
"loss": 0.3435,
"step": 1380
},
{
"epoch": 2.2824302134646963,
"grad_norm": 0.3209884551646414,
"learning_rate": 3.288836387543247e-05,
"loss": 0.3065,
"step": 1390
},
{
"epoch": 2.2988505747126435,
"grad_norm": 0.22671041629202113,
"learning_rate": 3.148402079620261e-05,
"loss": 0.3318,
"step": 1400
},
{
"epoch": 2.2988505747126435,
"eval_loss": 0.3717256188392639,
"eval_runtime": 181.4415,
"eval_samples_per_second": 23.87,
"eval_steps_per_second": 2.987,
"step": 1400
},
{
"epoch": 2.315270935960591,
"grad_norm": 0.2030483834077384,
"learning_rate": 3.01046969949188e-05,
"loss": 0.3064,
"step": 1410
},
{
"epoch": 2.3316912972085384,
"grad_norm": 0.23788365593483962,
"learning_rate": 2.8750896145117657e-05,
"loss": 0.3179,
"step": 1420
},
{
"epoch": 2.348111658456486,
"grad_norm": 0.19692059961575673,
"learning_rate": 2.7423112600381206e-05,
"loss": 0.3131,
"step": 1430
},
{
"epoch": 2.3645320197044333,
"grad_norm": 0.2237818052949346,
"learning_rate": 2.6121831213818827e-05,
"loss": 0.3253,
"step": 1440
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.19504196499026782,
"learning_rate": 2.4847527161018357e-05,
"loss": 0.3357,
"step": 1450
},
{
"epoch": 2.3973727422003286,
"grad_norm": 0.22539099442687743,
"learning_rate": 2.3600665766531217e-05,
"loss": 0.3046,
"step": 1460
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.22439054668185862,
"learning_rate": 2.2381702333954434e-05,
"loss": 0.3077,
"step": 1470
},
{
"epoch": 2.4302134646962235,
"grad_norm": 0.2712680693548686,
"learning_rate": 2.119108197967228e-05,
"loss": 0.3363,
"step": 1480
},
{
"epoch": 2.4466338259441707,
"grad_norm": 0.24707851488677393,
"learning_rate": 2.002923947031753e-05,
"loss": 0.3383,
"step": 1490
},
{
"epoch": 2.4630541871921183,
"grad_norm": 0.2441408851266546,
"learning_rate": 1.8896599064012298e-05,
"loss": 0.3309,
"step": 1500
},
{
"epoch": 2.4630541871921183,
"eval_loss": 0.3708588480949402,
"eval_runtime": 183.0453,
"eval_samples_per_second": 23.661,
"eval_steps_per_second": 2.961,
"step": 1500
},
{
"epoch": 2.4794745484400655,
"grad_norm": 0.33265270775799227,
"learning_rate": 1.779357435544603e-05,
"loss": 0.3127,
"step": 1510
},
{
"epoch": 2.495894909688013,
"grad_norm": 0.27253862679732843,
"learning_rate": 1.6720568124847245e-05,
"loss": 0.3165,
"step": 1520
},
{
"epoch": 2.512315270935961,
"grad_norm": 0.2346747331686747,
"learning_rate": 1.5677972190904622e-05,
"loss": 0.3288,
"step": 1530
},
{
"epoch": 2.528735632183908,
"grad_norm": 0.22806719713684484,
"learning_rate": 1.4666167267690345e-05,
"loss": 0.3314,
"step": 1540
},
{
"epoch": 2.5451559934318553,
"grad_norm": 0.29265105480331116,
"learning_rate": 1.3685522825638897e-05,
"loss": 0.3397,
"step": 1550
},
{
"epoch": 2.561576354679803,
"grad_norm": 0.23724928573204634,
"learning_rate": 1.273639695663108e-05,
"loss": 0.3218,
"step": 1560
},
{
"epoch": 2.5779967159277506,
"grad_norm": 0.26141011452015106,
"learning_rate": 1.1819136243233487e-05,
"loss": 0.3177,
"step": 1570
},
{
"epoch": 2.594417077175698,
"grad_norm": 0.2031906424566249,
"learning_rate": 1.093407563214036e-05,
"loss": 0.3231,
"step": 1580
},
{
"epoch": 2.6108374384236455,
"grad_norm": 0.21755775030563373,
"learning_rate": 1.0081538311864569e-05,
"loss": 0.3339,
"step": 1590
},
{
"epoch": 2.6272577996715927,
"grad_norm": 0.21842386508748884,
"learning_rate": 9.261835594722213e-06,
"loss": 0.3141,
"step": 1600
},
{
"epoch": 2.6272577996715927,
"eval_loss": 0.3702554702758789,
"eval_runtime": 182.4061,
"eval_samples_per_second": 23.744,
"eval_steps_per_second": 2.971,
"step": 1600
},
{
"epoch": 2.6436781609195403,
"grad_norm": 0.236951072287543,
"learning_rate": 8.475266803153891e-06,
"loss": 0.3401,
"step": 1610
},
{
"epoch": 2.6600985221674875,
"grad_norm": 0.2525139633376538,
"learning_rate": 7.722119160424112e-06,
"loss": 0.3085,
"step": 1620
},
{
"epoch": 2.676518883415435,
"grad_norm": 0.2516229325471898,
"learning_rate": 7.002667685739006e-06,
"loss": 0.3022,
"step": 1630
},
{
"epoch": 2.692939244663383,
"grad_norm": 0.26050295140139745,
"learning_rate": 6.317175093820371e-06,
"loss": 0.3267,
"step": 1640
},
{
"epoch": 2.70935960591133,
"grad_norm": 0.18224262926232945,
"learning_rate": 5.6658916989727695e-06,
"loss": 0.303,
"step": 1650
},
{
"epoch": 2.7257799671592773,
"grad_norm": 0.24832596165336826,
"learning_rate": 5.049055323678886e-06,
"loss": 0.321,
"step": 1660
},
{
"epoch": 2.742200328407225,
"grad_norm": 0.2590753148743458,
"learning_rate": 4.466891211756297e-06,
"loss": 0.3262,
"step": 1670
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.2428540816617657,
"learning_rate": 3.919611946107493e-06,
"loss": 0.3349,
"step": 1680
},
{
"epoch": 2.77504105090312,
"grad_norm": 0.26458866020693184,
"learning_rate": 3.40741737109318e-06,
"loss": 0.3162,
"step": 1690
},
{
"epoch": 2.7914614121510675,
"grad_norm": 0.19323777891652238,
"learning_rate": 2.930494519557114e-06,
"loss": 0.3252,
"step": 1700
},
{
"epoch": 2.7914614121510675,
"eval_loss": 0.3698328137397766,
"eval_runtime": 180.1089,
"eval_samples_per_second": 24.047,
"eval_steps_per_second": 3.009,
"step": 1700
},
{
"epoch": 2.8078817733990147,
"grad_norm": 0.22306709767178198,
"learning_rate": 2.489017544529315e-06,
"loss": 0.3294,
"step": 1710
},
{
"epoch": 2.8243021346469623,
"grad_norm": 0.21362629340229825,
"learning_rate": 2.083147655632289e-06,
"loss": 0.3452,
"step": 1720
},
{
"epoch": 2.8407224958949095,
"grad_norm": 0.2451092641924001,
"learning_rate": 1.7130330602138644e-06,
"loss": 0.3244,
"step": 1730
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.24087531125607498,
"learning_rate": 1.378808909227769e-06,
"loss": 0.3331,
"step": 1740
},
{
"epoch": 2.873563218390805,
"grad_norm": 0.2263212098087392,
"learning_rate": 1.0805972478819425e-06,
"loss": 0.3214,
"step": 1750
},
{
"epoch": 2.889983579638752,
"grad_norm": 0.23088744114799653,
"learning_rate": 8.185069710725524e-07,
"loss": 0.3132,
"step": 1760
},
{
"epoch": 2.9064039408866993,
"grad_norm": 0.24275972838675813,
"learning_rate": 5.926337836199891e-07,
"loss": 0.3223,
"step": 1770
},
{
"epoch": 2.922824302134647,
"grad_norm": 0.23934161415457397,
"learning_rate": 4.0306016532126734e-07,
"loss": 0.3322,
"step": 1780
},
{
"epoch": 2.9392446633825946,
"grad_norm": 0.21022506463435206,
"learning_rate": 2.4985534083176166e-07,
"loss": 0.3284,
"step": 1790
},
{
"epoch": 2.955665024630542,
"grad_norm": 0.21555995324889698,
"learning_rate": 1.330752543871161e-07,
"loss": 0.3446,
"step": 1800
},
{
"epoch": 2.955665024630542,
"eval_loss": 0.3698555529117584,
"eval_runtime": 177.5603,
"eval_samples_per_second": 24.392,
"eval_steps_per_second": 3.052,
"step": 1800
},
{
"epoch": 2.9720853858784895,
"grad_norm": 0.2518284713990051,
"learning_rate": 5.2762549374685275e-08,
"loss": 0.3281,
"step": 1810
},
{
"epoch": 2.9885057471264367,
"grad_norm": 0.2209423779742796,
"learning_rate": 8.946552761890382e-09,
"loss": 0.3178,
"step": 1820
},
{
"epoch": 3.0,
"step": 1827,
"total_flos": 5848557328269312.0,
"train_loss": 0.38001893711142287,
"train_runtime": 19748.2381,
"train_samples_per_second": 5.92,
"train_steps_per_second": 0.093
}
],
"logging_steps": 10,
"max_steps": 1827,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5848557328269312.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}