rootxhacker's picture
Upload folder using huggingface_hub
58d2467 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9980806142034548,
"eval_steps": 500,
"global_step": 780,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025591810620601407,
"grad_norm": 2.7761764526367188,
"learning_rate": 1.976923076923077e-05,
"loss": 4.4287,
"mean_token_accuracy": 0.22894721738994123,
"num_tokens": 894781.0,
"step": 10
},
{
"epoch": 0.05118362124120281,
"grad_norm": 2.137922525405884,
"learning_rate": 1.9512820512820515e-05,
"loss": 4.1941,
"mean_token_accuracy": 0.2595341790467501,
"num_tokens": 1805355.0,
"step": 20
},
{
"epoch": 0.07677543186180422,
"grad_norm": 2.520721673965454,
"learning_rate": 1.9256410256410258e-05,
"loss": 4.0212,
"mean_token_accuracy": 0.2825487531721592,
"num_tokens": 2699319.0,
"step": 30
},
{
"epoch": 0.10236724248240563,
"grad_norm": 1.547048807144165,
"learning_rate": 1.9e-05,
"loss": 3.8859,
"mean_token_accuracy": 0.30224928483366964,
"num_tokens": 3588550.0,
"step": 40
},
{
"epoch": 0.12795905310300704,
"grad_norm": 1.5576727390289307,
"learning_rate": 1.8743589743589744e-05,
"loss": 3.7614,
"mean_token_accuracy": 0.320540402084589,
"num_tokens": 4483812.0,
"step": 50
},
{
"epoch": 0.15355086372360843,
"grad_norm": 1.8138513565063477,
"learning_rate": 1.848717948717949e-05,
"loss": 3.6902,
"mean_token_accuracy": 0.33278593569993975,
"num_tokens": 5372889.0,
"step": 60
},
{
"epoch": 0.17914267434420986,
"grad_norm": 1.1237279176712036,
"learning_rate": 1.823076923076923e-05,
"loss": 3.5589,
"mean_token_accuracy": 0.347703804820776,
"num_tokens": 6277651.0,
"step": 70
},
{
"epoch": 0.20473448496481125,
"grad_norm": 0.9200040698051453,
"learning_rate": 1.7974358974358977e-05,
"loss": 3.4994,
"mean_token_accuracy": 0.35774786919355395,
"num_tokens": 7172606.0,
"step": 80
},
{
"epoch": 0.23032629558541268,
"grad_norm": 1.232081413269043,
"learning_rate": 1.7717948717948717e-05,
"loss": 3.4412,
"mean_token_accuracy": 0.3667620047926903,
"num_tokens": 8065928.0,
"step": 90
},
{
"epoch": 0.2559181062060141,
"grad_norm": 1.0582596063613892,
"learning_rate": 1.7461538461538464e-05,
"loss": 3.3965,
"mean_token_accuracy": 0.37178524360060694,
"num_tokens": 8952519.0,
"step": 100
},
{
"epoch": 0.28150991682661547,
"grad_norm": 0.8860539793968201,
"learning_rate": 1.7205128205128207e-05,
"loss": 3.3236,
"mean_token_accuracy": 0.37989369705319403,
"num_tokens": 9847582.0,
"step": 110
},
{
"epoch": 0.30710172744721687,
"grad_norm": 5.248290538787842,
"learning_rate": 1.694871794871795e-05,
"loss": 3.3115,
"mean_token_accuracy": 0.38339284956455233,
"num_tokens": 10742170.0,
"step": 120
},
{
"epoch": 0.3326935380678183,
"grad_norm": 1.340420126914978,
"learning_rate": 1.6692307692307694e-05,
"loss": 3.2598,
"mean_token_accuracy": 0.3880648836493492,
"num_tokens": 11637955.0,
"step": 130
},
{
"epoch": 0.3582853486884197,
"grad_norm": 0.9110859036445618,
"learning_rate": 1.6435897435897437e-05,
"loss": 3.2249,
"mean_token_accuracy": 0.3904557466506958,
"num_tokens": 12537106.0,
"step": 140
},
{
"epoch": 0.3838771593090211,
"grad_norm": 0.9896584749221802,
"learning_rate": 1.617948717948718e-05,
"loss": 3.1932,
"mean_token_accuracy": 0.39675025418400767,
"num_tokens": 13426821.0,
"step": 150
},
{
"epoch": 0.4094689699296225,
"grad_norm": 0.8233770728111267,
"learning_rate": 1.5923076923076924e-05,
"loss": 3.1574,
"mean_token_accuracy": 0.399780885130167,
"num_tokens": 14313654.0,
"step": 160
},
{
"epoch": 0.4350607805502239,
"grad_norm": 1.1695690155029297,
"learning_rate": 1.5666666666666667e-05,
"loss": 3.1298,
"mean_token_accuracy": 0.40367085859179497,
"num_tokens": 15225624.0,
"step": 170
},
{
"epoch": 0.46065259117082535,
"grad_norm": 0.776798665523529,
"learning_rate": 1.5410256410256414e-05,
"loss": 3.1148,
"mean_token_accuracy": 0.4049819767475128,
"num_tokens": 16121750.0,
"step": 180
},
{
"epoch": 0.48624440179142675,
"grad_norm": 0.8073769807815552,
"learning_rate": 1.5153846153846155e-05,
"loss": 3.0744,
"mean_token_accuracy": 0.4091757610440254,
"num_tokens": 17023953.0,
"step": 190
},
{
"epoch": 0.5118362124120281,
"grad_norm": 0.9491840600967407,
"learning_rate": 1.4897435897435898e-05,
"loss": 3.0747,
"mean_token_accuracy": 0.40831351578235625,
"num_tokens": 17922120.0,
"step": 200
},
{
"epoch": 0.5374280230326296,
"grad_norm": 0.9127055406570435,
"learning_rate": 1.4641025641025642e-05,
"loss": 3.0611,
"mean_token_accuracy": 0.4100494936108589,
"num_tokens": 18823381.0,
"step": 210
},
{
"epoch": 0.5630198336532309,
"grad_norm": 0.7686980366706848,
"learning_rate": 1.4384615384615387e-05,
"loss": 3.0241,
"mean_token_accuracy": 0.4129943989217281,
"num_tokens": 19724559.0,
"step": 220
},
{
"epoch": 0.5886116442738324,
"grad_norm": 1.0187700986862183,
"learning_rate": 1.412820512820513e-05,
"loss": 3.0255,
"mean_token_accuracy": 0.41373484060168264,
"num_tokens": 20614866.0,
"step": 230
},
{
"epoch": 0.6142034548944337,
"grad_norm": 0.7729346752166748,
"learning_rate": 1.3871794871794873e-05,
"loss": 2.9829,
"mean_token_accuracy": 0.42028677016496657,
"num_tokens": 21514400.0,
"step": 240
},
{
"epoch": 0.6397952655150352,
"grad_norm": 6.206243991851807,
"learning_rate": 1.3615384615384616e-05,
"loss": 2.9783,
"mean_token_accuracy": 0.42037207037210467,
"num_tokens": 22404222.0,
"step": 250
},
{
"epoch": 0.6653870761356366,
"grad_norm": 0.6800546050071716,
"learning_rate": 1.335897435897436e-05,
"loss": 2.9643,
"mean_token_accuracy": 0.4217521704733372,
"num_tokens": 23302463.0,
"step": 260
},
{
"epoch": 0.690978886756238,
"grad_norm": 1.250061273574829,
"learning_rate": 1.3102564102564103e-05,
"loss": 2.9509,
"mean_token_accuracy": 0.4231578640639782,
"num_tokens": 24196568.0,
"step": 270
},
{
"epoch": 0.7165706973768394,
"grad_norm": 0.8788368701934814,
"learning_rate": 1.2846153846153848e-05,
"loss": 2.9386,
"mean_token_accuracy": 0.4258805990219116,
"num_tokens": 25084205.0,
"step": 280
},
{
"epoch": 0.7421625079974408,
"grad_norm": 2.8421788215637207,
"learning_rate": 1.2589743589743591e-05,
"loss": 2.9336,
"mean_token_accuracy": 0.4254921153187752,
"num_tokens": 25975991.0,
"step": 290
},
{
"epoch": 0.7677543186180422,
"grad_norm": 0.8528095483779907,
"learning_rate": 1.2333333333333334e-05,
"loss": 2.926,
"mean_token_accuracy": 0.426171388477087,
"num_tokens": 26866598.0,
"step": 300
},
{
"epoch": 0.7933461292386437,
"grad_norm": 0.7494258880615234,
"learning_rate": 1.2076923076923078e-05,
"loss": 2.9165,
"mean_token_accuracy": 0.42724777534604075,
"num_tokens": 27765200.0,
"step": 310
},
{
"epoch": 0.818937939859245,
"grad_norm": 1.1572643518447876,
"learning_rate": 1.1820512820512821e-05,
"loss": 2.8875,
"mean_token_accuracy": 0.43193317875266074,
"num_tokens": 28663860.0,
"step": 320
},
{
"epoch": 0.8445297504798465,
"grad_norm": 0.7244220972061157,
"learning_rate": 1.1564102564102566e-05,
"loss": 2.8848,
"mean_token_accuracy": 0.43283705189824107,
"num_tokens": 29561062.0,
"step": 330
},
{
"epoch": 0.8701215611004478,
"grad_norm": 0.9269903898239136,
"learning_rate": 1.1307692307692309e-05,
"loss": 2.8685,
"mean_token_accuracy": 0.4354383051395416,
"num_tokens": 30454457.0,
"step": 340
},
{
"epoch": 0.8957133717210493,
"grad_norm": 1.2391330003738403,
"learning_rate": 1.1051282051282052e-05,
"loss": 2.8566,
"mean_token_accuracy": 0.4357985772192478,
"num_tokens": 31357895.0,
"step": 350
},
{
"epoch": 0.9213051823416507,
"grad_norm": 0.60429447889328,
"learning_rate": 1.0794871794871796e-05,
"loss": 2.8737,
"mean_token_accuracy": 0.43362868800759313,
"num_tokens": 32253327.0,
"step": 360
},
{
"epoch": 0.946896992962252,
"grad_norm": 0.7335871458053589,
"learning_rate": 1.0538461538461539e-05,
"loss": 2.8323,
"mean_token_accuracy": 0.4392602853477001,
"num_tokens": 33148158.0,
"step": 370
},
{
"epoch": 0.9724888035828535,
"grad_norm": 0.8495442867279053,
"learning_rate": 1.0282051282051282e-05,
"loss": 2.8493,
"mean_token_accuracy": 0.4362662024796009,
"num_tokens": 34042486.0,
"step": 380
},
{
"epoch": 0.9980806142034548,
"grad_norm": 1.2509167194366455,
"learning_rate": 1.0025641025641027e-05,
"loss": 2.8359,
"mean_token_accuracy": 0.44098760187625885,
"num_tokens": 34924151.0,
"step": 390
},
{
"epoch": 1.0255918106206015,
"grad_norm": 0.5853167176246643,
"learning_rate": 9.76923076923077e-06,
"loss": 3.0907,
"mean_token_accuracy": 0.44089484498614356,
"num_tokens": 35865261.0,
"step": 400
},
{
"epoch": 1.051183621241203,
"grad_norm": 0.8962728381156921,
"learning_rate": 9.512820512820514e-06,
"loss": 2.81,
"mean_token_accuracy": 0.44187747687101364,
"num_tokens": 36762778.0,
"step": 410
},
{
"epoch": 1.0767754318618041,
"grad_norm": 2.0159361362457275,
"learning_rate": 9.256410256410257e-06,
"loss": 2.8121,
"mean_token_accuracy": 0.44206427708268164,
"num_tokens": 37666100.0,
"step": 420
},
{
"epoch": 1.1023672424824056,
"grad_norm": 1.0803464651107788,
"learning_rate": 9e-06,
"loss": 2.8162,
"mean_token_accuracy": 0.4421087481081486,
"num_tokens": 38570373.0,
"step": 430
},
{
"epoch": 1.127959053103007,
"grad_norm": 1.8244508504867554,
"learning_rate": 8.743589743589743e-06,
"loss": 2.7878,
"mean_token_accuracy": 0.443483180552721,
"num_tokens": 39463722.0,
"step": 440
},
{
"epoch": 1.1535508637236085,
"grad_norm": 1.025512933731079,
"learning_rate": 8.487179487179488e-06,
"loss": 2.7997,
"mean_token_accuracy": 0.44501683712005613,
"num_tokens": 40356114.0,
"step": 450
},
{
"epoch": 1.17914267434421,
"grad_norm": 1.3213328123092651,
"learning_rate": 8.230769230769232e-06,
"loss": 2.7966,
"mean_token_accuracy": 0.44384807869791987,
"num_tokens": 41253467.0,
"step": 460
},
{
"epoch": 1.2047344849648112,
"grad_norm": 0.6755945086479187,
"learning_rate": 7.974358974358975e-06,
"loss": 2.7788,
"mean_token_accuracy": 0.44546112343668937,
"num_tokens": 42147423.0,
"step": 470
},
{
"epoch": 1.2303262955854126,
"grad_norm": 0.815871000289917,
"learning_rate": 7.717948717948718e-06,
"loss": 2.7702,
"mean_token_accuracy": 0.4467897318303585,
"num_tokens": 43043613.0,
"step": 480
},
{
"epoch": 1.255918106206014,
"grad_norm": 0.8647878766059875,
"learning_rate": 7.461538461538462e-06,
"loss": 2.7593,
"mean_token_accuracy": 0.4478010691702366,
"num_tokens": 43928134.0,
"step": 490
},
{
"epoch": 1.2815099168266155,
"grad_norm": 0.5179564356803894,
"learning_rate": 7.205128205128206e-06,
"loss": 2.7684,
"mean_token_accuracy": 0.4487785018980503,
"num_tokens": 44810604.0,
"step": 500
},
{
"epoch": 1.307101727447217,
"grad_norm": 1.2333202362060547,
"learning_rate": 6.948717948717949e-06,
"loss": 2.755,
"mean_token_accuracy": 0.4496650531888008,
"num_tokens": 45704269.0,
"step": 510
},
{
"epoch": 1.3326935380678182,
"grad_norm": 0.9590532779693604,
"learning_rate": 6.692307692307692e-06,
"loss": 2.7565,
"mean_token_accuracy": 0.44860857501626017,
"num_tokens": 46599574.0,
"step": 520
},
{
"epoch": 1.3582853486884197,
"grad_norm": 0.5098512172698975,
"learning_rate": 6.435897435897437e-06,
"loss": 2.7424,
"mean_token_accuracy": 0.4517396934330463,
"num_tokens": 47500789.0,
"step": 530
},
{
"epoch": 1.383877159309021,
"grad_norm": 0.6403014659881592,
"learning_rate": 6.17948717948718e-06,
"loss": 2.7484,
"mean_token_accuracy": 0.45020677894353867,
"num_tokens": 48397237.0,
"step": 540
},
{
"epoch": 1.4094689699296226,
"grad_norm": 2.570819854736328,
"learning_rate": 5.923076923076924e-06,
"loss": 2.7382,
"mean_token_accuracy": 0.45186189860105513,
"num_tokens": 49288748.0,
"step": 550
},
{
"epoch": 1.435060780550224,
"grad_norm": 0.6260067224502563,
"learning_rate": 5.666666666666667e-06,
"loss": 2.7378,
"mean_token_accuracy": 0.45138209462165835,
"num_tokens": 50172619.0,
"step": 560
},
{
"epoch": 1.4606525911708252,
"grad_norm": 0.6413472294807434,
"learning_rate": 5.41025641025641e-06,
"loss": 2.7317,
"mean_token_accuracy": 0.45287573114037516,
"num_tokens": 51083437.0,
"step": 570
},
{
"epoch": 1.4862444017914267,
"grad_norm": 0.9748353362083435,
"learning_rate": 5.1538461538461534e-06,
"loss": 2.7163,
"mean_token_accuracy": 0.4528719700872898,
"num_tokens": 51987422.0,
"step": 580
},
{
"epoch": 1.5118362124120281,
"grad_norm": 0.7616235017776489,
"learning_rate": 4.8974358974358975e-06,
"loss": 2.7247,
"mean_token_accuracy": 0.45309568718075754,
"num_tokens": 52875548.0,
"step": 590
},
{
"epoch": 1.5374280230326296,
"grad_norm": 1.3642046451568604,
"learning_rate": 4.641025641025642e-06,
"loss": 2.7189,
"mean_token_accuracy": 0.4543063327670097,
"num_tokens": 53777731.0,
"step": 600
},
{
"epoch": 1.563019833653231,
"grad_norm": 2.9087612628936768,
"learning_rate": 4.384615384615385e-06,
"loss": 2.7221,
"mean_token_accuracy": 0.4528664395213127,
"num_tokens": 54672687.0,
"step": 610
},
{
"epoch": 1.5886116442738323,
"grad_norm": 1.5256329774856567,
"learning_rate": 4.128205128205128e-06,
"loss": 2.7249,
"mean_token_accuracy": 0.4534047245979309,
"num_tokens": 55567899.0,
"step": 620
},
{
"epoch": 1.6142034548944337,
"grad_norm": 0.6487675905227661,
"learning_rate": 3.871794871794872e-06,
"loss": 2.715,
"mean_token_accuracy": 0.45388809889554976,
"num_tokens": 56472732.0,
"step": 630
},
{
"epoch": 1.6397952655150352,
"grad_norm": 0.6692759990692139,
"learning_rate": 3.6153846153846156e-06,
"loss": 2.7033,
"mean_token_accuracy": 0.45569391921162605,
"num_tokens": 57378122.0,
"step": 640
},
{
"epoch": 1.6653870761356366,
"grad_norm": 0.5085057020187378,
"learning_rate": 3.358974358974359e-06,
"loss": 2.7164,
"mean_token_accuracy": 0.45427701622247696,
"num_tokens": 58274016.0,
"step": 650
},
{
"epoch": 1.690978886756238,
"grad_norm": 0.6099756360054016,
"learning_rate": 3.102564102564103e-06,
"loss": 2.7047,
"mean_token_accuracy": 0.45659793838858603,
"num_tokens": 59171947.0,
"step": 660
},
{
"epoch": 1.7165706973768393,
"grad_norm": 0.7250745892524719,
"learning_rate": 2.846153846153846e-06,
"loss": 2.7102,
"mean_token_accuracy": 0.45642822831869123,
"num_tokens": 60068960.0,
"step": 670
},
{
"epoch": 1.7421625079974408,
"grad_norm": 0.6664665937423706,
"learning_rate": 2.5897435897435903e-06,
"loss": 2.7096,
"mean_token_accuracy": 0.4557917319238186,
"num_tokens": 60968874.0,
"step": 680
},
{
"epoch": 1.7677543186180422,
"grad_norm": 1.3182893991470337,
"learning_rate": 2.3333333333333336e-06,
"loss": 2.7186,
"mean_token_accuracy": 0.4538013473153114,
"num_tokens": 61865465.0,
"step": 690
},
{
"epoch": 1.7933461292386437,
"grad_norm": 0.7066709995269775,
"learning_rate": 2.0769230769230773e-06,
"loss": 2.7083,
"mean_token_accuracy": 0.4556036002933979,
"num_tokens": 62751515.0,
"step": 700
},
{
"epoch": 1.8189379398592451,
"grad_norm": 0.633770227432251,
"learning_rate": 1.8205128205128205e-06,
"loss": 2.708,
"mean_token_accuracy": 0.4562342181801796,
"num_tokens": 63653226.0,
"step": 710
},
{
"epoch": 1.8445297504798464,
"grad_norm": 0.6355544924736023,
"learning_rate": 1.5641025641025642e-06,
"loss": 2.6983,
"mean_token_accuracy": 0.45753874629735947,
"num_tokens": 64553626.0,
"step": 720
},
{
"epoch": 1.8701215611004478,
"grad_norm": 0.6770824193954468,
"learning_rate": 1.307692307692308e-06,
"loss": 2.7195,
"mean_token_accuracy": 0.45460380911827086,
"num_tokens": 65440945.0,
"step": 730
},
{
"epoch": 1.8957133717210493,
"grad_norm": 0.9229477643966675,
"learning_rate": 1.0512820512820514e-06,
"loss": 2.7056,
"mean_token_accuracy": 0.4557946674525738,
"num_tokens": 66330885.0,
"step": 740
},
{
"epoch": 1.9213051823416507,
"grad_norm": 0.6344442367553711,
"learning_rate": 7.948717948717949e-07,
"loss": 2.6962,
"mean_token_accuracy": 0.4569088116288185,
"num_tokens": 67222508.0,
"step": 750
},
{
"epoch": 1.9468969929622522,
"grad_norm": 0.6643022894859314,
"learning_rate": 5.384615384615386e-07,
"loss": 2.6917,
"mean_token_accuracy": 0.45792855247855185,
"num_tokens": 68113106.0,
"step": 760
},
{
"epoch": 1.9724888035828534,
"grad_norm": 0.7332549095153809,
"learning_rate": 2.820512820512821e-07,
"loss": 2.7041,
"mean_token_accuracy": 0.4560040533542633,
"num_tokens": 69005071.0,
"step": 770
},
{
"epoch": 1.9980806142034548,
"grad_norm": 0.48741206526756287,
"learning_rate": 2.5641025641025643e-08,
"loss": 2.6798,
"mean_token_accuracy": 0.4586730174720287,
"num_tokens": 69895190.0,
"step": 780
}
],
"logging_steps": 10,
"max_steps": 780,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.2469045882126336e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}