alextsiak's picture
Upload folder using huggingface_hub
388ae51 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9822904368358913,
"eval_steps": 500,
"global_step": 844,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.047225501770956316,
"grad_norm": 2.2224767208099365,
"learning_rate": 0.00019786729857819907,
"loss": 1.4046,
"mean_token_accuracy": 0.6860443703830242,
"num_tokens": 11258.0,
"step": 10
},
{
"epoch": 0.09445100354191263,
"grad_norm": 2.9072351455688477,
"learning_rate": 0.00019549763033175358,
"loss": 0.9967,
"mean_token_accuracy": 0.796217393875122,
"num_tokens": 22389.0,
"step": 20
},
{
"epoch": 0.14167650531286896,
"grad_norm": 1.7167646884918213,
"learning_rate": 0.00019312796208530806,
"loss": 0.4636,
"mean_token_accuracy": 0.9008516699075699,
"num_tokens": 33839.0,
"step": 30
},
{
"epoch": 0.18890200708382526,
"grad_norm": 1.540899395942688,
"learning_rate": 0.00019075829383886258,
"loss": 0.4207,
"mean_token_accuracy": 0.9101088687777519,
"num_tokens": 45227.0,
"step": 40
},
{
"epoch": 0.2361275088547816,
"grad_norm": 1.7775837182998657,
"learning_rate": 0.0001883886255924171,
"loss": 0.3663,
"mean_token_accuracy": 0.9167132675647736,
"num_tokens": 56969.0,
"step": 50
},
{
"epoch": 0.2833530106257379,
"grad_norm": 1.2004725933074951,
"learning_rate": 0.00018601895734597157,
"loss": 0.3187,
"mean_token_accuracy": 0.9333658754825592,
"num_tokens": 68254.0,
"step": 60
},
{
"epoch": 0.3305785123966942,
"grad_norm": 1.32843816280365,
"learning_rate": 0.0001836492890995261,
"loss": 0.3664,
"mean_token_accuracy": 0.9262594923377037,
"num_tokens": 79400.0,
"step": 70
},
{
"epoch": 0.3778040141676505,
"grad_norm": 1.8211090564727783,
"learning_rate": 0.00018127962085308057,
"loss": 0.3767,
"mean_token_accuracy": 0.9262291938066483,
"num_tokens": 90951.0,
"step": 80
},
{
"epoch": 0.42502951593860683,
"grad_norm": 1.4291584491729736,
"learning_rate": 0.00017890995260663508,
"loss": 0.2829,
"mean_token_accuracy": 0.936517083644867,
"num_tokens": 102424.0,
"step": 90
},
{
"epoch": 0.4722550177095632,
"grad_norm": 1.4524176120758057,
"learning_rate": 0.0001765402843601896,
"loss": 0.3404,
"mean_token_accuracy": 0.9263656228780747,
"num_tokens": 114139.0,
"step": 100
},
{
"epoch": 0.5194805194805194,
"grad_norm": 1.2539514303207397,
"learning_rate": 0.00017417061611374408,
"loss": 0.367,
"mean_token_accuracy": 0.9276282742619515,
"num_tokens": 125361.0,
"step": 110
},
{
"epoch": 0.5667060212514758,
"grad_norm": 0.8698514103889465,
"learning_rate": 0.0001718009478672986,
"loss": 0.2972,
"mean_token_accuracy": 0.9385714828968048,
"num_tokens": 136673.0,
"step": 120
},
{
"epoch": 0.6139315230224321,
"grad_norm": 1.112260341644287,
"learning_rate": 0.00016943127962085308,
"loss": 0.2655,
"mean_token_accuracy": 0.9430030316114426,
"num_tokens": 147942.0,
"step": 130
},
{
"epoch": 0.6611570247933884,
"grad_norm": 1.448964238166809,
"learning_rate": 0.0001670616113744076,
"loss": 0.2198,
"mean_token_accuracy": 0.9480510011315346,
"num_tokens": 159547.0,
"step": 140
},
{
"epoch": 0.7083825265643447,
"grad_norm": 1.364487886428833,
"learning_rate": 0.0001646919431279621,
"loss": 0.2652,
"mean_token_accuracy": 0.9435501232743263,
"num_tokens": 170741.0,
"step": 150
},
{
"epoch": 0.755608028335301,
"grad_norm": 0.9347731471061707,
"learning_rate": 0.0001623222748815166,
"loss": 0.2316,
"mean_token_accuracy": 0.950811243057251,
"num_tokens": 181969.0,
"step": 160
},
{
"epoch": 0.8028335301062574,
"grad_norm": 0.9598692655563354,
"learning_rate": 0.0001599526066350711,
"loss": 0.1798,
"mean_token_accuracy": 0.956840255856514,
"num_tokens": 193046.0,
"step": 170
},
{
"epoch": 0.8500590318772137,
"grad_norm": 0.7902966737747192,
"learning_rate": 0.0001575829383886256,
"loss": 0.2031,
"mean_token_accuracy": 0.9510332688689231,
"num_tokens": 204429.0,
"step": 180
},
{
"epoch": 0.89728453364817,
"grad_norm": 0.6174350380897522,
"learning_rate": 0.0001552132701421801,
"loss": 0.2149,
"mean_token_accuracy": 0.9503797248005867,
"num_tokens": 215842.0,
"step": 190
},
{
"epoch": 0.9445100354191264,
"grad_norm": 1.0609785318374634,
"learning_rate": 0.00015284360189573462,
"loss": 0.1987,
"mean_token_accuracy": 0.9518642231822014,
"num_tokens": 227422.0,
"step": 200
},
{
"epoch": 0.9917355371900827,
"grad_norm": 0.8327723741531372,
"learning_rate": 0.0001504739336492891,
"loss": 0.1987,
"mean_token_accuracy": 0.9499309301376343,
"num_tokens": 238784.0,
"step": 210
},
{
"epoch": 1.0,
"eval_loss": 0.19764918088912964,
"eval_mean_token_accuracy": 0.9525814542063961,
"eval_num_tokens": 240751.0,
"eval_runtime": 8.6341,
"eval_samples_per_second": 24.554,
"eval_steps_per_second": 3.127,
"step": 212
},
{
"epoch": 1.037780401416765,
"grad_norm": 0.6004464626312256,
"learning_rate": 0.0001481042654028436,
"loss": 0.1459,
"mean_token_accuracy": 0.9623932639757792,
"num_tokens": 249811.0,
"step": 220
},
{
"epoch": 1.0850059031877213,
"grad_norm": 0.5023283362388611,
"learning_rate": 0.0001457345971563981,
"loss": 0.1085,
"mean_token_accuracy": 0.9685453534126282,
"num_tokens": 261566.0,
"step": 230
},
{
"epoch": 1.1322314049586777,
"grad_norm": 0.9379103183746338,
"learning_rate": 0.0001433649289099526,
"loss": 0.0836,
"mean_token_accuracy": 0.9732989251613617,
"num_tokens": 273201.0,
"step": 240
},
{
"epoch": 1.179456906729634,
"grad_norm": 1.0074262619018555,
"learning_rate": 0.00014099526066350712,
"loss": 0.1364,
"mean_token_accuracy": 0.9613454505801201,
"num_tokens": 284542.0,
"step": 250
},
{
"epoch": 1.2266824085005903,
"grad_norm": 1.0366216897964478,
"learning_rate": 0.0001386255924170616,
"loss": 0.1628,
"mean_token_accuracy": 0.9535841554403305,
"num_tokens": 295925.0,
"step": 260
},
{
"epoch": 1.2739079102715467,
"grad_norm": 0.7236562371253967,
"learning_rate": 0.00013625592417061612,
"loss": 0.124,
"mean_token_accuracy": 0.9644495561718941,
"num_tokens": 307173.0,
"step": 270
},
{
"epoch": 1.321133412042503,
"grad_norm": 1.424338459968567,
"learning_rate": 0.0001338862559241706,
"loss": 0.1094,
"mean_token_accuracy": 0.9684907376766205,
"num_tokens": 318836.0,
"step": 280
},
{
"epoch": 1.3683589138134593,
"grad_norm": 0.9334771037101746,
"learning_rate": 0.00013151658767772512,
"loss": 0.1226,
"mean_token_accuracy": 0.9643323451280594,
"num_tokens": 330284.0,
"step": 290
},
{
"epoch": 1.4155844155844157,
"grad_norm": 1.1074577569961548,
"learning_rate": 0.00012914691943127963,
"loss": 0.1415,
"mean_token_accuracy": 0.9657251536846161,
"num_tokens": 341636.0,
"step": 300
},
{
"epoch": 1.462809917355372,
"grad_norm": 0.4959653317928314,
"learning_rate": 0.00012677725118483412,
"loss": 0.0863,
"mean_token_accuracy": 0.9732676669955254,
"num_tokens": 353014.0,
"step": 310
},
{
"epoch": 1.510035419126328,
"grad_norm": 1.0306661128997803,
"learning_rate": 0.00012440758293838863,
"loss": 0.0992,
"mean_token_accuracy": 0.9696009412407876,
"num_tokens": 364630.0,
"step": 320
},
{
"epoch": 1.5572609208972845,
"grad_norm": 0.907910943031311,
"learning_rate": 0.00012203791469194314,
"loss": 0.0965,
"mean_token_accuracy": 0.9723138064146042,
"num_tokens": 376367.0,
"step": 330
},
{
"epoch": 1.604486422668241,
"grad_norm": 0.7785394787788391,
"learning_rate": 0.00011966824644549763,
"loss": 0.1407,
"mean_token_accuracy": 0.9637758329510688,
"num_tokens": 387837.0,
"step": 340
},
{
"epoch": 1.6517119244391971,
"grad_norm": 1.0611106157302856,
"learning_rate": 0.00011729857819905214,
"loss": 0.1064,
"mean_token_accuracy": 0.9677139699459076,
"num_tokens": 398928.0,
"step": 350
},
{
"epoch": 1.6989374262101535,
"grad_norm": 0.7576094269752502,
"learning_rate": 0.00011492890995260664,
"loss": 0.1181,
"mean_token_accuracy": 0.9652754247188569,
"num_tokens": 410374.0,
"step": 360
},
{
"epoch": 1.7461629279811097,
"grad_norm": 0.7488318085670471,
"learning_rate": 0.00011255924170616114,
"loss": 0.1012,
"mean_token_accuracy": 0.9696858197450637,
"num_tokens": 421264.0,
"step": 370
},
{
"epoch": 1.7933884297520661,
"grad_norm": 0.6903710961341858,
"learning_rate": 0.00011018957345971565,
"loss": 0.1249,
"mean_token_accuracy": 0.964360211789608,
"num_tokens": 432294.0,
"step": 380
},
{
"epoch": 1.8406139315230226,
"grad_norm": 1.0426132678985596,
"learning_rate": 0.00010781990521327015,
"loss": 0.1058,
"mean_token_accuracy": 0.9673917979001999,
"num_tokens": 443662.0,
"step": 390
},
{
"epoch": 1.8878394332939787,
"grad_norm": 0.6279439330101013,
"learning_rate": 0.00010545023696682465,
"loss": 0.1265,
"mean_token_accuracy": 0.9684036031365395,
"num_tokens": 454594.0,
"step": 400
},
{
"epoch": 1.935064935064935,
"grad_norm": 1.036986231803894,
"learning_rate": 0.00010308056872037915,
"loss": 0.078,
"mean_token_accuracy": 0.9755039691925049,
"num_tokens": 465724.0,
"step": 410
},
{
"epoch": 1.9822904368358913,
"grad_norm": 0.7609688639640808,
"learning_rate": 0.00010071090047393366,
"loss": 0.0922,
"mean_token_accuracy": 0.9699518546462059,
"num_tokens": 477077.0,
"step": 420
},
{
"epoch": 2.0,
"eval_loss": 0.15497758984565735,
"eval_mean_token_accuracy": 0.9626010656356812,
"eval_num_tokens": 481502.0,
"eval_runtime": 8.619,
"eval_samples_per_second": 24.597,
"eval_steps_per_second": 3.133,
"step": 424
},
{
"epoch": 2.0283353010625738,
"grad_norm": 0.832304835319519,
"learning_rate": 9.834123222748816e-05,
"loss": 0.1073,
"mean_token_accuracy": 0.9689378264622811,
"num_tokens": 488024.0,
"step": 430
},
{
"epoch": 2.07556080283353,
"grad_norm": 0.5486078858375549,
"learning_rate": 9.597156398104266e-05,
"loss": 0.0484,
"mean_token_accuracy": 0.9826492935419082,
"num_tokens": 499131.0,
"step": 440
},
{
"epoch": 2.1227863046044866,
"grad_norm": 0.40537288784980774,
"learning_rate": 9.360189573459716e-05,
"loss": 0.0566,
"mean_token_accuracy": 0.9807873949408531,
"num_tokens": 510734.0,
"step": 450
},
{
"epoch": 2.1700118063754426,
"grad_norm": 0.5876602530479431,
"learning_rate": 9.123222748815167e-05,
"loss": 0.0578,
"mean_token_accuracy": 0.9810668498277664,
"num_tokens": 521828.0,
"step": 460
},
{
"epoch": 2.217237308146399,
"grad_norm": 0.5273938775062561,
"learning_rate": 8.886255924170617e-05,
"loss": 0.055,
"mean_token_accuracy": 0.9809592545032502,
"num_tokens": 533020.0,
"step": 470
},
{
"epoch": 2.2644628099173554,
"grad_norm": 0.5727369785308838,
"learning_rate": 8.649289099526067e-05,
"loss": 0.0495,
"mean_token_accuracy": 0.9824301272630691,
"num_tokens": 544287.0,
"step": 480
},
{
"epoch": 2.311688311688312,
"grad_norm": 0.609664261341095,
"learning_rate": 8.412322274881517e-05,
"loss": 0.0553,
"mean_token_accuracy": 0.9817688629031182,
"num_tokens": 555880.0,
"step": 490
},
{
"epoch": 2.358913813459268,
"grad_norm": 0.48904091119766235,
"learning_rate": 8.175355450236967e-05,
"loss": 0.0561,
"mean_token_accuracy": 0.9802302822470665,
"num_tokens": 567454.0,
"step": 500
},
{
"epoch": 2.406139315230224,
"grad_norm": 0.48052382469177246,
"learning_rate": 7.938388625592418e-05,
"loss": 0.049,
"mean_token_accuracy": 0.983304688334465,
"num_tokens": 578751.0,
"step": 510
},
{
"epoch": 2.4533648170011806,
"grad_norm": 0.6199146509170532,
"learning_rate": 7.701421800947868e-05,
"loss": 0.0602,
"mean_token_accuracy": 0.9780631363391876,
"num_tokens": 590469.0,
"step": 520
},
{
"epoch": 2.500590318772137,
"grad_norm": 0.753097414970398,
"learning_rate": 7.464454976303318e-05,
"loss": 0.0509,
"mean_token_accuracy": 0.9815936490893364,
"num_tokens": 602058.0,
"step": 530
},
{
"epoch": 2.5478158205430934,
"grad_norm": 0.7676092386245728,
"learning_rate": 7.227488151658768e-05,
"loss": 0.052,
"mean_token_accuracy": 0.981397558748722,
"num_tokens": 613415.0,
"step": 540
},
{
"epoch": 2.5950413223140494,
"grad_norm": 0.49483609199523926,
"learning_rate": 6.990521327014218e-05,
"loss": 0.051,
"mean_token_accuracy": 0.9827686205506325,
"num_tokens": 625010.0,
"step": 550
},
{
"epoch": 2.642266824085006,
"grad_norm": 0.6355498433113098,
"learning_rate": 6.753554502369669e-05,
"loss": 0.0563,
"mean_token_accuracy": 0.980421070754528,
"num_tokens": 636527.0,
"step": 560
},
{
"epoch": 2.689492325855962,
"grad_norm": 0.6222267150878906,
"learning_rate": 6.516587677725119e-05,
"loss": 0.0566,
"mean_token_accuracy": 0.9805225148797035,
"num_tokens": 647992.0,
"step": 570
},
{
"epoch": 2.7367178276269186,
"grad_norm": 0.5963544845581055,
"learning_rate": 6.279620853080569e-05,
"loss": 0.0484,
"mean_token_accuracy": 0.9810224324464798,
"num_tokens": 659364.0,
"step": 580
},
{
"epoch": 2.783943329397875,
"grad_norm": 0.48161807656288147,
"learning_rate": 6.0426540284360186e-05,
"loss": 0.0495,
"mean_token_accuracy": 0.9829053461551667,
"num_tokens": 670493.0,
"step": 590
},
{
"epoch": 2.8311688311688314,
"grad_norm": 0.6928554773330688,
"learning_rate": 5.80568720379147e-05,
"loss": 0.0474,
"mean_token_accuracy": 0.9838150143623352,
"num_tokens": 681980.0,
"step": 600
},
{
"epoch": 2.8783943329397874,
"grad_norm": 0.9183737635612488,
"learning_rate": 5.56872037914692e-05,
"loss": 0.0513,
"mean_token_accuracy": 0.9831859543919563,
"num_tokens": 692889.0,
"step": 610
},
{
"epoch": 2.925619834710744,
"grad_norm": 0.6690914034843445,
"learning_rate": 5.33175355450237e-05,
"loss": 0.0549,
"mean_token_accuracy": 0.9816744804382325,
"num_tokens": 704348.0,
"step": 620
},
{
"epoch": 2.9728453364817002,
"grad_norm": 0.3908725678920746,
"learning_rate": 5.09478672985782e-05,
"loss": 0.0481,
"mean_token_accuracy": 0.9822039097547531,
"num_tokens": 715695.0,
"step": 630
},
{
"epoch": 3.0,
"eval_loss": 0.14640700817108154,
"eval_mean_token_accuracy": 0.967608372370402,
"eval_num_tokens": 722253.0,
"eval_runtime": 8.5998,
"eval_samples_per_second": 24.652,
"eval_steps_per_second": 3.14,
"step": 636
},
{
"epoch": 3.0188902007083827,
"grad_norm": 0.39226505160331726,
"learning_rate": 4.857819905213271e-05,
"loss": 0.0402,
"mean_token_accuracy": 0.9841358707501338,
"num_tokens": 726613.0,
"step": 640
},
{
"epoch": 3.0661157024793386,
"grad_norm": 0.34782007336616516,
"learning_rate": 4.620853080568721e-05,
"loss": 0.0322,
"mean_token_accuracy": 0.9865604758262634,
"num_tokens": 737963.0,
"step": 650
},
{
"epoch": 3.113341204250295,
"grad_norm": 0.7617977261543274,
"learning_rate": 4.383886255924171e-05,
"loss": 0.0352,
"mean_token_accuracy": 0.9863895252346992,
"num_tokens": 748987.0,
"step": 660
},
{
"epoch": 3.1605667060212514,
"grad_norm": 0.5488002896308899,
"learning_rate": 4.146919431279621e-05,
"loss": 0.0323,
"mean_token_accuracy": 0.9870850175619126,
"num_tokens": 760488.0,
"step": 670
},
{
"epoch": 3.207792207792208,
"grad_norm": 0.44978898763656616,
"learning_rate": 3.909952606635071e-05,
"loss": 0.0357,
"mean_token_accuracy": 0.9864787235856056,
"num_tokens": 771677.0,
"step": 680
},
{
"epoch": 3.2550177095631643,
"grad_norm": 0.44440773129463196,
"learning_rate": 3.672985781990522e-05,
"loss": 0.0363,
"mean_token_accuracy": 0.9861913770437241,
"num_tokens": 783243.0,
"step": 690
},
{
"epoch": 3.3022432113341202,
"grad_norm": 0.41815730929374695,
"learning_rate": 3.4360189573459716e-05,
"loss": 0.0374,
"mean_token_accuracy": 0.9860232338309288,
"num_tokens": 794711.0,
"step": 700
},
{
"epoch": 3.3494687131050767,
"grad_norm": 0.2978448271751404,
"learning_rate": 3.1990521327014215e-05,
"loss": 0.0308,
"mean_token_accuracy": 0.9869714677333832,
"num_tokens": 806385.0,
"step": 710
},
{
"epoch": 3.396694214876033,
"grad_norm": 0.46016019582748413,
"learning_rate": 2.962085308056872e-05,
"loss": 0.0343,
"mean_token_accuracy": 0.9866258546710014,
"num_tokens": 817664.0,
"step": 720
},
{
"epoch": 3.4439197166469895,
"grad_norm": 0.4907480478286743,
"learning_rate": 2.7251184834123224e-05,
"loss": 0.0356,
"mean_token_accuracy": 0.9860621899366379,
"num_tokens": 829118.0,
"step": 730
},
{
"epoch": 3.4911452184179455,
"grad_norm": 0.5607575178146362,
"learning_rate": 2.4881516587677726e-05,
"loss": 0.0375,
"mean_token_accuracy": 0.9857015043497086,
"num_tokens": 840540.0,
"step": 740
},
{
"epoch": 3.538370720188902,
"grad_norm": 0.5227943062782288,
"learning_rate": 2.251184834123223e-05,
"loss": 0.0371,
"mean_token_accuracy": 0.9858236253261566,
"num_tokens": 851987.0,
"step": 750
},
{
"epoch": 3.5855962219598583,
"grad_norm": 0.28605079650878906,
"learning_rate": 2.014218009478673e-05,
"loss": 0.0306,
"mean_token_accuracy": 0.9882108762860298,
"num_tokens": 863423.0,
"step": 760
},
{
"epoch": 3.6328217237308147,
"grad_norm": 0.33974796533584595,
"learning_rate": 1.7772511848341233e-05,
"loss": 0.0381,
"mean_token_accuracy": 0.9858895480632782,
"num_tokens": 874997.0,
"step": 770
},
{
"epoch": 3.680047225501771,
"grad_norm": 0.5480939149856567,
"learning_rate": 1.5402843601895736e-05,
"loss": 0.0344,
"mean_token_accuracy": 0.9871362060308456,
"num_tokens": 886148.0,
"step": 780
},
{
"epoch": 3.7272727272727275,
"grad_norm": 0.4544774889945984,
"learning_rate": 1.3033175355450238e-05,
"loss": 0.0364,
"mean_token_accuracy": 0.9863489225506783,
"num_tokens": 897137.0,
"step": 790
},
{
"epoch": 3.7744982290436835,
"grad_norm": 0.6491249799728394,
"learning_rate": 1.066350710900474e-05,
"loss": 0.0336,
"mean_token_accuracy": 0.9858677625656128,
"num_tokens": 908688.0,
"step": 800
},
{
"epoch": 3.82172373081464,
"grad_norm": 0.4520932137966156,
"learning_rate": 8.293838862559241e-06,
"loss": 0.0337,
"mean_token_accuracy": 0.9875034481287003,
"num_tokens": 920183.0,
"step": 810
},
{
"epoch": 3.8689492325855963,
"grad_norm": 0.45541301369667053,
"learning_rate": 5.924170616113745e-06,
"loss": 0.0337,
"mean_token_accuracy": 0.986977969110012,
"num_tokens": 931127.0,
"step": 820
},
{
"epoch": 3.9161747343565523,
"grad_norm": 0.4386422634124756,
"learning_rate": 3.5545023696682464e-06,
"loss": 0.0354,
"mean_token_accuracy": 0.9872412413358689,
"num_tokens": 942690.0,
"step": 830
},
{
"epoch": 3.9634002361275087,
"grad_norm": 0.5566153526306152,
"learning_rate": 1.1848341232227488e-06,
"loss": 0.0351,
"mean_token_accuracy": 0.9845622256398201,
"num_tokens": 954344.0,
"step": 840
}
],
"logging_steps": 10,
"max_steps": 844,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.631693863816397e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}