MazgaBERT / checkpoint-10000 /trainer_state.json
Eraly-ml's picture
Upload folder using huggingface_hub
2f781fe verified
Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity, "... is not valid JSON
{
"best_global_step": 10000,
"best_metric": 0.8816914583342934,
"best_model_checkpoint": "./checkpoint-10000",
"epoch": 2.844201095072175,
"eval_steps": 1000,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00028443433122377873,
"grad_norm": Infinity,
"learning_rate": 0.0,
"loss": 6.8942,
"step": 1
},
{
"epoch": 0.028443433122377872,
"grad_norm": 10.016256332397461,
"learning_rate": 1.8767772511848342e-06,
"loss": 5.0217,
"step": 100
},
{
"epoch": 0.056886866244755745,
"grad_norm": 29.21646499633789,
"learning_rate": 3.772511848341233e-06,
"loss": 3.3137,
"step": 200
},
{
"epoch": 0.08533029936713361,
"grad_norm": 33.183448791503906,
"learning_rate": 5.66824644549763e-06,
"loss": 2.5012,
"step": 300
},
{
"epoch": 0.11377373248951149,
"grad_norm": 35.1544303894043,
"learning_rate": 7.563981042654029e-06,
"loss": 2.0729,
"step": 400
},
{
"epoch": 0.14221716561188935,
"grad_norm": 40.57954406738281,
"learning_rate": 9.459715639810427e-06,
"loss": 1.8169,
"step": 500
},
{
"epoch": 0.17066059873426723,
"grad_norm": 28.46094512939453,
"learning_rate": 1.1355450236966825e-05,
"loss": 1.7012,
"step": 600
},
{
"epoch": 0.1991040318566451,
"grad_norm": 18.60147476196289,
"learning_rate": 1.3251184834123222e-05,
"loss": 1.5425,
"step": 700
},
{
"epoch": 0.22754746497902298,
"grad_norm": 32.00541687011719,
"learning_rate": 1.5146919431279623e-05,
"loss": 1.4402,
"step": 800
},
{
"epoch": 0.25599089810140085,
"grad_norm": 18.643096923828125,
"learning_rate": 1.704265402843602e-05,
"loss": 1.38,
"step": 900
},
{
"epoch": 0.2844343312237787,
"grad_norm": 19.019792556762695,
"learning_rate": 1.8938388625592418e-05,
"loss": 1.3005,
"step": 1000
},
{
"epoch": 0.2844343312237787,
"eval_f1_macro": 0.7974631141388782,
"eval_loss": 0.3369969427585602,
"eval_runtime": 407.0416,
"eval_samples_per_second": 491.35,
"eval_steps_per_second": 3.84,
"step": 1000
},
{
"epoch": 0.3128777643461566,
"grad_norm": 15.35057544708252,
"learning_rate": 1.9907300115874858e-05,
"loss": 1.3325,
"step": 1100
},
{
"epoch": 0.34132119746853445,
"grad_norm": 7.621375560760498,
"learning_rate": 1.969661856104498e-05,
"loss": 1.2425,
"step": 1200
},
{
"epoch": 0.3697646305909123,
"grad_norm": 28.559072494506836,
"learning_rate": 1.9485937006215108e-05,
"loss": 1.2189,
"step": 1300
},
{
"epoch": 0.3982080637132902,
"grad_norm": 11.210355758666992,
"learning_rate": 1.927525545138523e-05,
"loss": 1.166,
"step": 1400
},
{
"epoch": 0.42665149683566805,
"grad_norm": 16.360652923583984,
"learning_rate": 1.906457389655536e-05,
"loss": 1.1281,
"step": 1500
},
{
"epoch": 0.45509492995804596,
"grad_norm": 17.10007095336914,
"learning_rate": 1.8853892341725482e-05,
"loss": 1.1306,
"step": 1600
},
{
"epoch": 0.4835383630804238,
"grad_norm": 11.677529335021973,
"learning_rate": 1.864321078689561e-05,
"loss": 1.1031,
"step": 1700
},
{
"epoch": 0.5119817962028017,
"grad_norm": 9.072050094604492,
"learning_rate": 1.8432529232065736e-05,
"loss": 1.0715,
"step": 1800
},
{
"epoch": 0.5404252293251796,
"grad_norm": 11.437026023864746,
"learning_rate": 1.822184767723586e-05,
"loss": 1.0483,
"step": 1900
},
{
"epoch": 0.5688686624475574,
"grad_norm": 7.232166290283203,
"learning_rate": 1.8011166122405986e-05,
"loss": 1.0246,
"step": 2000
},
{
"epoch": 0.5688686624475574,
"eval_f1_macro": 0.8172822685871202,
"eval_loss": 0.26557663083076477,
"eval_runtime": 404.6294,
"eval_samples_per_second": 494.279,
"eval_steps_per_second": 3.863,
"step": 2000
},
{
"epoch": 0.5973120955699353,
"grad_norm": 11.33175277709961,
"learning_rate": 1.780048456757611e-05,
"loss": 1.0286,
"step": 2100
},
{
"epoch": 0.6257555286923132,
"grad_norm": 10.749075889587402,
"learning_rate": 1.7589803012746237e-05,
"loss": 1.0027,
"step": 2200
},
{
"epoch": 0.6541989618146911,
"grad_norm": 17.69948387145996,
"learning_rate": 1.737912145791636e-05,
"loss": 0.9903,
"step": 2300
},
{
"epoch": 0.6826423949370689,
"grad_norm": 9.61713695526123,
"learning_rate": 1.7168439903086487e-05,
"loss": 0.9816,
"step": 2400
},
{
"epoch": 0.7110858280594468,
"grad_norm": 10.537064552307129,
"learning_rate": 1.695775834825661e-05,
"loss": 0.9797,
"step": 2500
},
{
"epoch": 0.7395292611818246,
"grad_norm": 10.092955589294434,
"learning_rate": 1.6747076793426738e-05,
"loss": 0.9582,
"step": 2600
},
{
"epoch": 0.7679726943042026,
"grad_norm": 9.091156959533691,
"learning_rate": 1.653639523859686e-05,
"loss": 0.9423,
"step": 2700
},
{
"epoch": 0.7964161274265804,
"grad_norm": 7.700159549713135,
"learning_rate": 1.632571368376699e-05,
"loss": 0.9291,
"step": 2800
},
{
"epoch": 0.8248595605489583,
"grad_norm": 8.272307395935059,
"learning_rate": 1.6115032128937115e-05,
"loss": 0.9524,
"step": 2900
},
{
"epoch": 0.8533029936713361,
"grad_norm": 11.422605514526367,
"learning_rate": 1.590435057410724e-05,
"loss": 0.9236,
"step": 3000
},
{
"epoch": 0.8533029936713361,
"eval_f1_macro": 0.85234873705105,
"eval_loss": 0.23829074203968048,
"eval_runtime": 404.6948,
"eval_samples_per_second": 494.2,
"eval_steps_per_second": 3.862,
"step": 3000
},
{
"epoch": 0.881746426793714,
"grad_norm": 8.847649574279785,
"learning_rate": 1.5693669019277366e-05,
"loss": 0.9048,
"step": 3100
},
{
"epoch": 0.9101898599160919,
"grad_norm": 10.125356674194336,
"learning_rate": 1.548298746444749e-05,
"loss": 0.9432,
"step": 3200
},
{
"epoch": 0.9386332930384698,
"grad_norm": 10.186013221740723,
"learning_rate": 1.5272305909617616e-05,
"loss": 0.903,
"step": 3300
},
{
"epoch": 0.9670767261608476,
"grad_norm": 8.035357475280762,
"learning_rate": 1.506162435478774e-05,
"loss": 0.9075,
"step": 3400
},
{
"epoch": 0.9955201592832255,
"grad_norm": 8.40256404876709,
"learning_rate": 1.4850942799957867e-05,
"loss": 0.9129,
"step": 3500
},
{
"epoch": 1.0238924838227974,
"grad_norm": 8.256072998046875,
"learning_rate": 1.4640261245127992e-05,
"loss": 0.8138,
"step": 3600
},
{
"epoch": 1.0523359169451754,
"grad_norm": 8.351125717163086,
"learning_rate": 1.4429579690298117e-05,
"loss": 0.8174,
"step": 3700
},
{
"epoch": 1.080779350067553,
"grad_norm": 16.844223022460938,
"learning_rate": 1.4218898135468239e-05,
"loss": 0.7941,
"step": 3800
},
{
"epoch": 1.109222783189931,
"grad_norm": 7.787734031677246,
"learning_rate": 1.4008216580638366e-05,
"loss": 0.7983,
"step": 3900
},
{
"epoch": 1.1376662163123088,
"grad_norm": 10.029424667358398,
"learning_rate": 1.3797535025808491e-05,
"loss": 0.8016,
"step": 4000
},
{
"epoch": 1.1376662163123088,
"eval_f1_macro": 0.8639591530686195,
"eval_loss": 0.21851009130477905,
"eval_runtime": 404.5114,
"eval_samples_per_second": 494.424,
"eval_steps_per_second": 3.864,
"step": 4000
},
{
"epoch": 1.1661096494346868,
"grad_norm": 10.865072250366211,
"learning_rate": 1.3586853470978616e-05,
"loss": 0.8246,
"step": 4100
},
{
"epoch": 1.1945530825570647,
"grad_norm": 6.806798934936523,
"learning_rate": 1.3376171916148742e-05,
"loss": 0.8112,
"step": 4200
},
{
"epoch": 1.2229965156794425,
"grad_norm": 5.789618492126465,
"learning_rate": 1.3165490361318867e-05,
"loss": 0.8061,
"step": 4300
},
{
"epoch": 1.2514399488018204,
"grad_norm": 12.199695587158203,
"learning_rate": 1.2954808806488992e-05,
"loss": 0.8079,
"step": 4400
},
{
"epoch": 1.2798833819241984,
"grad_norm": 10.48000431060791,
"learning_rate": 1.2744127251659117e-05,
"loss": 0.8118,
"step": 4500
},
{
"epoch": 1.308326815046576,
"grad_norm": 11.554473876953125,
"learning_rate": 1.2533445696829243e-05,
"loss": 0.7698,
"step": 4600
},
{
"epoch": 1.336770248168954,
"grad_norm": 7.0890374183654785,
"learning_rate": 1.2322764141999368e-05,
"loss": 0.7983,
"step": 4700
},
{
"epoch": 1.3652136812913318,
"grad_norm": 7.440741539001465,
"learning_rate": 1.2112082587169493e-05,
"loss": 0.7862,
"step": 4800
},
{
"epoch": 1.3936571144137098,
"grad_norm": 12.46674919128418,
"learning_rate": 1.1901401032339618e-05,
"loss": 0.766,
"step": 4900
},
{
"epoch": 1.4221005475360875,
"grad_norm": 11.290918350219727,
"learning_rate": 1.1690719477509744e-05,
"loss": 0.7744,
"step": 5000
},
{
"epoch": 1.4221005475360875,
"eval_f1_macro": 0.8615043778768627,
"eval_loss": 0.22264312207698822,
"eval_runtime": 404.4096,
"eval_samples_per_second": 494.548,
"eval_steps_per_second": 3.865,
"step": 5000
},
{
"epoch": 1.4505439806584655,
"grad_norm": 7.385248184204102,
"learning_rate": 1.148003792267987e-05,
"loss": 0.7632,
"step": 5100
},
{
"epoch": 1.4789874137808434,
"grad_norm": 10.126408576965332,
"learning_rate": 1.1269356367849996e-05,
"loss": 0.772,
"step": 5200
},
{
"epoch": 1.5074308469032212,
"grad_norm": 8.623499870300293,
"learning_rate": 1.1058674813020121e-05,
"loss": 0.7809,
"step": 5300
},
{
"epoch": 1.5358742800255991,
"grad_norm": 10.418437004089355,
"learning_rate": 1.0847993258190246e-05,
"loss": 0.7777,
"step": 5400
},
{
"epoch": 1.564317713147977,
"grad_norm": 11.511435508728027,
"learning_rate": 1.0637311703360371e-05,
"loss": 0.7481,
"step": 5500
},
{
"epoch": 1.5927611462703548,
"grad_norm": 7.2818121910095215,
"learning_rate": 1.0426630148530497e-05,
"loss": 0.7572,
"step": 5600
},
{
"epoch": 1.6212045793927325,
"grad_norm": 10.107643127441406,
"learning_rate": 1.0215948593700622e-05,
"loss": 0.7687,
"step": 5700
},
{
"epoch": 1.6496480125151107,
"grad_norm": 11.408272743225098,
"learning_rate": 1.0005267038870747e-05,
"loss": 0.7693,
"step": 5800
},
{
"epoch": 1.6780914456374885,
"grad_norm": 12.303543090820312,
"learning_rate": 9.794585484040872e-06,
"loss": 0.7338,
"step": 5900
},
{
"epoch": 1.7065348787598662,
"grad_norm": 10.1310396194458,
"learning_rate": 9.583903929210998e-06,
"loss": 0.7476,
"step": 6000
},
{
"epoch": 1.7065348787598662,
"eval_f1_macro": 0.8754489727729019,
"eval_loss": 0.20082785189151764,
"eval_runtime": 404.7772,
"eval_samples_per_second": 494.099,
"eval_steps_per_second": 3.861,
"step": 6000
},
{
"epoch": 1.7349783118822442,
"grad_norm": 8.645033836364746,
"learning_rate": 9.373222374381123e-06,
"loss": 0.7637,
"step": 6100
},
{
"epoch": 1.7634217450046221,
"grad_norm": 9.703550338745117,
"learning_rate": 9.162540819551248e-06,
"loss": 0.7443,
"step": 6200
},
{
"epoch": 1.7918651781269999,
"grad_norm": 11.156991958618164,
"learning_rate": 8.951859264721375e-06,
"loss": 0.7523,
"step": 6300
},
{
"epoch": 1.8203086112493778,
"grad_norm": 9.254724502563477,
"learning_rate": 8.7411777098915e-06,
"loss": 0.7513,
"step": 6400
},
{
"epoch": 1.8487520443717558,
"grad_norm": 8.819207191467285,
"learning_rate": 8.530496155061625e-06,
"loss": 0.7536,
"step": 6500
},
{
"epoch": 1.8771954774941335,
"grad_norm": 10.501325607299805,
"learning_rate": 8.31981460023175e-06,
"loss": 0.7411,
"step": 6600
},
{
"epoch": 1.9056389106165113,
"grad_norm": 20.624738693237305,
"learning_rate": 8.109133045401876e-06,
"loss": 0.7514,
"step": 6700
},
{
"epoch": 1.9340823437388894,
"grad_norm": 8.438063621520996,
"learning_rate": 7.898451490572001e-06,
"loss": 0.737,
"step": 6800
},
{
"epoch": 1.9625257768612672,
"grad_norm": 7.91304874420166,
"learning_rate": 7.687769935742126e-06,
"loss": 0.7252,
"step": 6900
},
{
"epoch": 1.990969209983645,
"grad_norm": 12.667023658752441,
"learning_rate": 7.477088380912252e-06,
"loss": 0.7187,
"step": 7000
},
{
"epoch": 1.990969209983645,
"eval_f1_macro": 0.8785000192236985,
"eval_loss": 0.1944791078567505,
"eval_runtime": 404.7549,
"eval_samples_per_second": 494.126,
"eval_steps_per_second": 3.862,
"step": 7000
},
{
"epoch": 2.019341534523217,
"grad_norm": 7.297705173492432,
"learning_rate": 7.266406826082377e-06,
"loss": 0.6465,
"step": 7100
},
{
"epoch": 2.047784967645595,
"grad_norm": 10.342308044433594,
"learning_rate": 7.055725271252503e-06,
"loss": 0.6034,
"step": 7200
},
{
"epoch": 2.0762284007679725,
"grad_norm": 16.366321563720703,
"learning_rate": 6.845043716422628e-06,
"loss": 0.6199,
"step": 7300
},
{
"epoch": 2.1046718338903507,
"grad_norm": 9.647150993347168,
"learning_rate": 6.6343621615927535e-06,
"loss": 0.5877,
"step": 7400
},
{
"epoch": 2.1331152670127285,
"grad_norm": 10.496694564819336,
"learning_rate": 6.423680606762879e-06,
"loss": 0.6048,
"step": 7500
},
{
"epoch": 2.161558700135106,
"grad_norm": 8.25317668914795,
"learning_rate": 6.212999051933004e-06,
"loss": 0.6036,
"step": 7600
},
{
"epoch": 2.1900021332574844,
"grad_norm": 7.672098159790039,
"learning_rate": 6.002317497103129e-06,
"loss": 0.5875,
"step": 7700
},
{
"epoch": 2.218445566379862,
"grad_norm": 11.453436851501465,
"learning_rate": 5.791635942273255e-06,
"loss": 0.5878,
"step": 7800
},
{
"epoch": 2.24688899950224,
"grad_norm": 8.48521900177002,
"learning_rate": 5.5809543874433805e-06,
"loss": 0.5879,
"step": 7900
},
{
"epoch": 2.2753324326246176,
"grad_norm": 10.552648544311523,
"learning_rate": 5.370272832613506e-06,
"loss": 0.6051,
"step": 8000
},
{
"epoch": 2.2753324326246176,
"eval_f1_macro": 0.8787007793384074,
"eval_loss": 0.20028316974639893,
"eval_runtime": 404.6697,
"eval_samples_per_second": 494.23,
"eval_steps_per_second": 3.862,
"step": 8000
},
{
"epoch": 2.3037758657469958,
"grad_norm": 7.606287479400635,
"learning_rate": 5.159591277783631e-06,
"loss": 0.591,
"step": 8100
},
{
"epoch": 2.3322192988693735,
"grad_norm": 10.84433364868164,
"learning_rate": 4.948909722953756e-06,
"loss": 0.5676,
"step": 8200
},
{
"epoch": 2.3606627319917513,
"grad_norm": 7.784737586975098,
"learning_rate": 4.7382281681238814e-06,
"loss": 0.5582,
"step": 8300
},
{
"epoch": 2.3891061651141294,
"grad_norm": 10.117920875549316,
"learning_rate": 4.527546613294007e-06,
"loss": 0.5748,
"step": 8400
},
{
"epoch": 2.417549598236507,
"grad_norm": 10.901618003845215,
"learning_rate": 4.316865058464132e-06,
"loss": 0.5832,
"step": 8500
},
{
"epoch": 2.445993031358885,
"grad_norm": 8.637004852294922,
"learning_rate": 4.106183503634257e-06,
"loss": 0.5916,
"step": 8600
},
{
"epoch": 2.474436464481263,
"grad_norm": 13.190328598022461,
"learning_rate": 3.895501948804382e-06,
"loss": 0.5849,
"step": 8700
},
{
"epoch": 2.502879897603641,
"grad_norm": 10.942352294921875,
"learning_rate": 3.684820393974508e-06,
"loss": 0.5709,
"step": 8800
},
{
"epoch": 2.5313233307260186,
"grad_norm": 10.253789901733398,
"learning_rate": 3.474138839144633e-06,
"loss": 0.5772,
"step": 8900
},
{
"epoch": 2.5597667638483967,
"grad_norm": 6.905132293701172,
"learning_rate": 3.263457284314758e-06,
"loss": 0.579,
"step": 9000
},
{
"epoch": 2.5597667638483967,
"eval_f1_macro": 0.8794205074585332,
"eval_loss": 0.19818730652332306,
"eval_runtime": 404.4575,
"eval_samples_per_second": 494.49,
"eval_steps_per_second": 3.864,
"step": 9000
},
{
"epoch": 2.5882101969707745,
"grad_norm": 8.60560131072998,
"learning_rate": 3.0527757294848838e-06,
"loss": 0.5877,
"step": 9100
},
{
"epoch": 2.616653630093152,
"grad_norm": 9.271297454833984,
"learning_rate": 2.842094174655009e-06,
"loss": 0.6023,
"step": 9200
},
{
"epoch": 2.64509706321553,
"grad_norm": 9.618608474731445,
"learning_rate": 2.6314126198251342e-06,
"loss": 0.5977,
"step": 9300
},
{
"epoch": 2.673540496337908,
"grad_norm": 7.832092761993408,
"learning_rate": 2.42073106499526e-06,
"loss": 0.5764,
"step": 9400
},
{
"epoch": 2.701983929460286,
"grad_norm": 9.95085620880127,
"learning_rate": 2.210049510165385e-06,
"loss": 0.5693,
"step": 9500
},
{
"epoch": 2.7304273625826636,
"grad_norm": 9.252172470092773,
"learning_rate": 1.9993679553355104e-06,
"loss": 0.5547,
"step": 9600
},
{
"epoch": 2.7588707957050413,
"grad_norm": 9.34287166595459,
"learning_rate": 1.7886864005056358e-06,
"loss": 0.5801,
"step": 9700
},
{
"epoch": 2.7873142288274195,
"grad_norm": 8.971467018127441,
"learning_rate": 1.5780048456757613e-06,
"loss": 0.5675,
"step": 9800
},
{
"epoch": 2.8157576619497973,
"grad_norm": 9.46047306060791,
"learning_rate": 1.3673232908458867e-06,
"loss": 0.5669,
"step": 9900
},
{
"epoch": 2.844201095072175,
"grad_norm": 8.73759651184082,
"learning_rate": 1.156641736016012e-06,
"loss": 0.5556,
"step": 10000
},
{
"epoch": 2.844201095072175,
"eval_f1_macro": 0.8816914583342934,
"eval_loss": 0.19616882503032684,
"eval_runtime": 404.736,
"eval_samples_per_second": 494.149,
"eval_steps_per_second": 3.862,
"step": 10000
}
],
"logging_steps": 100,
"max_steps": 10548,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7445735094230712e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}