Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

config.json +32 -0
generation_config.json +7 -0
model.safetensors +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +613 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_name_or_path": "/mnt/d/f0_kmeans_T5/checkpoint-2000",
+  "architectures": [
+    "MT5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 1024,
+  "d_kv": 64,
+  "d_model": 512,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "gelu_new",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "mt5",
+  "num_decoder_layers": 8,
+  "num_heads": 6,
+  "num_layers": 8,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "tokenizer_class": "T5Tokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.2",
+  "use_cache": true,
+  "vocab_size": 250101
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.40.2"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43226b3a6968b718374df385ca06a4426e699e2dd890eeafdf6f79819d782a6d
+size 1200684456

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e64dfc14c3a15656d3863d5cb576e968dff0c46e35fd78558dd031a89092f9f1
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c5fb67bbe6b7e3f079d8158118ffcdf8ca1fb16112ad5b1903035ffc3876c7d5
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,613 @@

+{
+  "best_metric": 1.3676984310150146,
+  "best_model_checkpoint": "/mnt/d/f0_kmeans_T5_resume/checkpoint-8000",
+  "epoch": 0.8033287936888481,
+  "eval_steps": 2000,
+  "global_step": 8000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.010041609921110603,
+      "grad_norm": 0.6225447654724121,
+      "learning_rate": 2e-05,
+      "loss": 1.8574,
+      "step": 100
+    },
+    {
+      "epoch": 0.020083219842221205,
+      "grad_norm": 0.58293217420578,
+      "learning_rate": 4e-05,
+      "loss": 1.8565,
+      "step": 200
+    },
+    {
+      "epoch": 0.030124829763331806,
+      "grad_norm": 0.7323440313339233,
+      "learning_rate": 6e-05,
+      "loss": 1.8463,
+      "step": 300
+    },
+    {
+      "epoch": 0.04016643968444241,
+      "grad_norm": 0.501658022403717,
+      "learning_rate": 8e-05,
+      "loss": 1.8284,
+      "step": 400
+    },
+    {
+      "epoch": 0.05020804960555301,
+      "grad_norm": 0.971801221370697,
+      "learning_rate": 0.0001,
+      "loss": 1.8107,
+      "step": 500
+    },
+    {
+      "epoch": 0.06024965952666361,
+      "grad_norm": 0.6389101147651672,
+      "learning_rate": 9.894269401564813e-05,
+      "loss": 1.7857,
+      "step": 600
+    },
+    {
+      "epoch": 0.07029126944777421,
+      "grad_norm": 0.6149382591247559,
+      "learning_rate": 9.788538803129626e-05,
+      "loss": 1.7686,
+      "step": 700
+    },
+    {
+      "epoch": 0.08033287936888482,
+      "grad_norm": 0.9680651426315308,
+      "learning_rate": 9.68280820469444e-05,
+      "loss": 1.7481,
+      "step": 800
+    },
+    {
+      "epoch": 0.09037448928999542,
+      "grad_norm": 0.5966058969497681,
+      "learning_rate": 9.577077606259251e-05,
+      "loss": 1.7298,
+      "step": 900
+    },
+    {
+      "epoch": 0.10041609921110602,
+      "grad_norm": 0.806626558303833,
+      "learning_rate": 9.471347007824066e-05,
+      "loss": 1.7173,
+      "step": 1000
+    },
+    {
+      "epoch": 0.11045770913221663,
+      "grad_norm": 0.6925487518310547,
+      "learning_rate": 9.365616409388877e-05,
+      "loss": 1.6905,
+      "step": 1100
+    },
+    {
+      "epoch": 0.12049931905332723,
+      "grad_norm": 0.7050909399986267,
+      "learning_rate": 9.25988581095369e-05,
+      "loss": 1.679,
+      "step": 1200
+    },
+    {
+      "epoch": 0.13054092897443784,
+      "grad_norm": 0.560664713382721,
+      "learning_rate": 9.154155212518503e-05,
+      "loss": 1.6691,
+      "step": 1300
+    },
+    {
+      "epoch": 0.14058253889554842,
+      "grad_norm": 0.7096071243286133,
+      "learning_rate": 9.048424614083316e-05,
+      "loss": 1.6542,
+      "step": 1400
+    },
+    {
+      "epoch": 0.15062414881665903,
+      "grad_norm": 0.8381897211074829,
+      "learning_rate": 8.942694015648129e-05,
+      "loss": 1.6545,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16066575873776964,
+      "grad_norm": 0.8832806944847107,
+      "learning_rate": 8.836963417212942e-05,
+      "loss": 1.6378,
+      "step": 1600
+    },
+    {
+      "epoch": 0.17070736865888023,
+      "grad_norm": 0.5661273002624512,
+      "learning_rate": 8.731232818777755e-05,
+      "loss": 1.625,
+      "step": 1700
+    },
+    {
+      "epoch": 0.18074897857999084,
+      "grad_norm": 0.4389723241329193,
+      "learning_rate": 8.625502220342567e-05,
+      "loss": 1.6253,
+      "step": 1800
+    },
+    {
+      "epoch": 0.19079058850110145,
+      "grad_norm": 0.45324140787124634,
+      "learning_rate": 8.519771621907381e-05,
+      "loss": 1.6048,
+      "step": 1900
+    },
+    {
+      "epoch": 0.20083219842221203,
+      "grad_norm": 0.6926701068878174,
+      "learning_rate": 8.414041023472193e-05,
+      "loss": 1.6103,
+      "step": 2000
+    },
+    {
+      "epoch": 0.20083219842221203,
+      "eval_loss": 1.4620566368103027,
+      "eval_runtime": 407.2516,
+      "eval_samples_per_second": 173.89,
+      "eval_steps_per_second": 21.738,
+      "step": 2000
+    },
+    {
+      "epoch": 0.21087380834332264,
+      "grad_norm": 0.5207750797271729,
+      "learning_rate": 8.308310425037006e-05,
+      "loss": 1.6101,
+      "step": 2100
+    },
+    {
+      "epoch": 0.22091541826443326,
+      "grad_norm": 0.564079225063324,
+      "learning_rate": 8.202579826601819e-05,
+      "loss": 1.5943,
+      "step": 2200
+    },
+    {
+      "epoch": 0.23095702818554384,
+      "grad_norm": 0.6724056601524353,
+      "learning_rate": 8.096849228166632e-05,
+      "loss": 1.5963,
+      "step": 2300
+    },
+    {
+      "epoch": 0.24099863810665445,
+      "grad_norm": 0.42919212579727173,
+      "learning_rate": 7.991118629731445e-05,
+      "loss": 1.5863,
+      "step": 2400
+    },
+    {
+      "epoch": 0.25104024802776503,
+      "grad_norm": 0.49685391783714294,
+      "learning_rate": 7.885388031296258e-05,
+      "loss": 1.5833,
+      "step": 2500
+    },
+    {
+      "epoch": 0.2610818579488757,
+      "grad_norm": 0.784831702709198,
+      "learning_rate": 7.779657432861071e-05,
+      "loss": 1.5759,
+      "step": 2600
+    },
+    {
+      "epoch": 0.27112346786998626,
+      "grad_norm": 0.3932148814201355,
+      "learning_rate": 7.673926834425883e-05,
+      "loss": 1.5764,
+      "step": 2700
+    },
+    {
+      "epoch": 0.28116507779109684,
+      "grad_norm": 0.7318882346153259,
+      "learning_rate": 7.568196235990697e-05,
+      "loss": 1.5531,
+      "step": 2800
+    },
+    {
+      "epoch": 0.2912066877122075,
+      "grad_norm": 0.46883219480514526,
+      "learning_rate": 7.462465637555509e-05,
+      "loss": 1.5597,
+      "step": 2900
+    },
+    {
+      "epoch": 0.30124829763331806,
+      "grad_norm": 0.5420709252357483,
+      "learning_rate": 7.356735039120322e-05,
+      "loss": 1.5607,
+      "step": 3000
+    },
+    {
+      "epoch": 0.31128990755442865,
+      "grad_norm": 0.4739568829536438,
+      "learning_rate": 7.251004440685135e-05,
+      "loss": 1.5571,
+      "step": 3100
+    },
+    {
+      "epoch": 0.3213315174755393,
+      "grad_norm": 0.524760365486145,
+      "learning_rate": 7.145273842249948e-05,
+      "loss": 1.5413,
+      "step": 3200
+    },
+    {
+      "epoch": 0.33137312739664987,
+      "grad_norm": 0.40299034118652344,
+      "learning_rate": 7.039543243814761e-05,
+      "loss": 1.544,
+      "step": 3300
+    },
+    {
+      "epoch": 0.34141473731776045,
+      "grad_norm": 0.6930205225944519,
+      "learning_rate": 6.933812645379572e-05,
+      "loss": 1.5407,
+      "step": 3400
+    },
+    {
+      "epoch": 0.3514563472388711,
+      "grad_norm": 0.4104674160480499,
+      "learning_rate": 6.828082046944387e-05,
+      "loss": 1.539,
+      "step": 3500
+    },
+    {
+      "epoch": 0.3614979571599817,
+      "grad_norm": 0.4443911910057068,
+      "learning_rate": 6.722351448509198e-05,
+      "loss": 1.534,
+      "step": 3600
+    },
+    {
+      "epoch": 0.37153956708109226,
+      "grad_norm": 0.3941650092601776,
+      "learning_rate": 6.616620850074011e-05,
+      "loss": 1.5338,
+      "step": 3700
+    },
+    {
+      "epoch": 0.3815811770022029,
+      "grad_norm": 0.530119001865387,
+      "learning_rate": 6.510890251638824e-05,
+      "loss": 1.5244,
+      "step": 3800
+    },
+    {
+      "epoch": 0.3916227869233135,
+      "grad_norm": 0.5141203999519348,
+      "learning_rate": 6.405159653203637e-05,
+      "loss": 1.5242,
+      "step": 3900
+    },
+    {
+      "epoch": 0.40166439684442407,
+      "grad_norm": 0.5742161870002747,
+      "learning_rate": 6.29942905476845e-05,
+      "loss": 1.5233,
+      "step": 4000
+    },
+    {
+      "epoch": 0.40166439684442407,
+      "eval_loss": 1.409444808959961,
+      "eval_runtime": 406.7484,
+      "eval_samples_per_second": 174.105,
+      "eval_steps_per_second": 21.765,
+      "step": 4000
+    },
+    {
+      "epoch": 0.4117060067655347,
+      "grad_norm": 0.49223774671554565,
+      "learning_rate": 6.193698456333263e-05,
+      "loss": 1.5254,
+      "step": 4100
+    },
+    {
+      "epoch": 0.4217476166866453,
+      "grad_norm": 0.4922013580799103,
+      "learning_rate": 6.087967857898076e-05,
+      "loss": 1.5187,
+      "step": 4200
+    },
+    {
+      "epoch": 0.43178922660775587,
+      "grad_norm": 0.5500440001487732,
+      "learning_rate": 5.982237259462888e-05,
+      "loss": 1.5217,
+      "step": 4300
+    },
+    {
+      "epoch": 0.4418308365288665,
+      "grad_norm": 0.5224810242652893,
+      "learning_rate": 5.876506661027702e-05,
+      "loss": 1.5187,
+      "step": 4400
+    },
+    {
+      "epoch": 0.4518724464499771,
+      "grad_norm": 0.42916256189346313,
+      "learning_rate": 5.770776062592515e-05,
+      "loss": 1.5064,
+      "step": 4500
+    },
+    {
+      "epoch": 0.4619140563710877,
+      "grad_norm": 0.4909166693687439,
+      "learning_rate": 5.665045464157327e-05,
+      "loss": 1.5065,
+      "step": 4600
+    },
+    {
+      "epoch": 0.4719556662921983,
+      "grad_norm": 0.4582947790622711,
+      "learning_rate": 5.559314865722141e-05,
+      "loss": 1.5095,
+      "step": 4700
+    },
+    {
+      "epoch": 0.4819972762133089,
+      "grad_norm": 0.4197562038898468,
+      "learning_rate": 5.453584267286953e-05,
+      "loss": 1.501,
+      "step": 4800
+    },
+    {
+      "epoch": 0.4920388861344195,
+      "grad_norm": 0.42700713872909546,
+      "learning_rate": 5.3478536688517655e-05,
+      "loss": 1.5088,
+      "step": 4900
+    },
+    {
+      "epoch": 0.5020804960555301,
+      "grad_norm": 0.6198418140411377,
+      "learning_rate": 5.242123070416579e-05,
+      "loss": 1.4986,
+      "step": 5000
+    },
+    {
+      "epoch": 0.5121221059766407,
+      "grad_norm": 0.46520060300827026,
+      "learning_rate": 5.1363924719813915e-05,
+      "loss": 1.5118,
+      "step": 5100
+    },
+    {
+      "epoch": 0.5221637158977513,
+      "grad_norm": 0.36571305990219116,
+      "learning_rate": 5.030661873546204e-05,
+      "loss": 1.5023,
+      "step": 5200
+    },
+    {
+      "epoch": 0.5322053258188619,
+      "grad_norm": 0.3751266300678253,
+      "learning_rate": 4.924931275111017e-05,
+      "loss": 1.4921,
+      "step": 5300
+    },
+    {
+      "epoch": 0.5422469357399725,
+      "grad_norm": 0.37080907821655273,
+      "learning_rate": 4.8192006766758305e-05,
+      "loss": 1.492,
+      "step": 5400
+    },
+    {
+      "epoch": 0.5522885456610831,
+      "grad_norm": 0.43024304509162903,
+      "learning_rate": 4.7134700782406435e-05,
+      "loss": 1.4859,
+      "step": 5500
+    },
+    {
+      "epoch": 0.5623301555821937,
+      "grad_norm": 0.40753471851348877,
+      "learning_rate": 4.607739479805456e-05,
+      "loss": 1.4853,
+      "step": 5600
+    },
+    {
+      "epoch": 0.5723717655033043,
+      "grad_norm": 0.40475383400917053,
+      "learning_rate": 4.502008881370269e-05,
+      "loss": 1.489,
+      "step": 5700
+    },
+    {
+      "epoch": 0.582413375424415,
+      "grad_norm": 0.3590254485607147,
+      "learning_rate": 4.396278282935082e-05,
+      "loss": 1.4892,
+      "step": 5800
+    },
+    {
+      "epoch": 0.5924549853455255,
+      "grad_norm": 0.4830130338668823,
+      "learning_rate": 4.290547684499894e-05,
+      "loss": 1.4922,
+      "step": 5900
+    },
+    {
+      "epoch": 0.6024965952666361,
+      "grad_norm": 0.42672935128211975,
+      "learning_rate": 4.184817086064707e-05,
+      "loss": 1.483,
+      "step": 6000
+    },
+    {
+      "epoch": 0.6024965952666361,
+      "eval_loss": 1.3820070028305054,
+      "eval_runtime": 408.5783,
+      "eval_samples_per_second": 173.325,
+      "eval_steps_per_second": 21.668,
+      "step": 6000
+    },
+    {
+      "epoch": 0.6125382051877467,
+      "grad_norm": 0.39759695529937744,
+      "learning_rate": 4.07908648762952e-05,
+      "loss": 1.4778,
+      "step": 6100
+    },
+    {
+      "epoch": 0.6225798151088573,
+      "grad_norm": 0.43023234605789185,
+      "learning_rate": 3.9733558891943326e-05,
+      "loss": 1.4774,
+      "step": 6200
+    },
+    {
+      "epoch": 0.6326214250299679,
+      "grad_norm": 0.4756367802619934,
+      "learning_rate": 3.867625290759146e-05,
+      "loss": 1.4879,
+      "step": 6300
+    },
+    {
+      "epoch": 0.6426630349510786,
+      "grad_norm": 0.46574530005455017,
+      "learning_rate": 3.761894692323959e-05,
+      "loss": 1.4765,
+      "step": 6400
+    },
+    {
+      "epoch": 0.6527046448721892,
+      "grad_norm": 0.6158186197280884,
+      "learning_rate": 3.6561640938887716e-05,
+      "loss": 1.4741,
+      "step": 6500
+    },
+    {
+      "epoch": 0.6627462547932997,
+      "grad_norm": 0.42495304346084595,
+      "learning_rate": 3.5504334954535846e-05,
+      "loss": 1.4764,
+      "step": 6600
+    },
+    {
+      "epoch": 0.6727878647144103,
+      "grad_norm": 0.44197046756744385,
+      "learning_rate": 3.4447028970183976e-05,
+      "loss": 1.4688,
+      "step": 6700
+    },
+    {
+      "epoch": 0.6828294746355209,
+      "grad_norm": 0.38747721910476685,
+      "learning_rate": 3.33897229858321e-05,
+      "loss": 1.4772,
+      "step": 6800
+    },
+    {
+      "epoch": 0.6928710845566315,
+      "grad_norm": 0.3641980290412903,
+      "learning_rate": 3.233241700148023e-05,
+      "loss": 1.47,
+      "step": 6900
+    },
+    {
+      "epoch": 0.7029126944777422,
+      "grad_norm": 0.39210596680641174,
+      "learning_rate": 3.127511101712835e-05,
+      "loss": 1.4734,
+      "step": 7000
+    },
+    {
+      "epoch": 0.7129543043988528,
+      "grad_norm": 0.4860726594924927,
+      "learning_rate": 3.0217805032776486e-05,
+      "loss": 1.4815,
+      "step": 7100
+    },
+    {
+      "epoch": 0.7229959143199634,
+      "grad_norm": 0.4656274616718292,
+      "learning_rate": 2.9160499048424616e-05,
+      "loss": 1.464,
+      "step": 7200
+    },
+    {
+      "epoch": 0.7330375242410739,
+      "grad_norm": 0.37134116888046265,
+      "learning_rate": 2.810319306407274e-05,
+      "loss": 1.4716,
+      "step": 7300
+    },
+    {
+      "epoch": 0.7430791341621845,
+      "grad_norm": 0.46362295746803284,
+      "learning_rate": 2.7045887079720873e-05,
+      "loss": 1.4725,
+      "step": 7400
+    },
+    {
+      "epoch": 0.7531207440832951,
+      "grad_norm": 0.3737928867340088,
+      "learning_rate": 2.5988581095369003e-05,
+      "loss": 1.4673,
+      "step": 7500
+    },
+    {
+      "epoch": 0.7631623540044058,
+      "grad_norm": 0.3332815170288086,
+      "learning_rate": 2.493127511101713e-05,
+      "loss": 1.4608,
+      "step": 7600
+    },
+    {
+      "epoch": 0.7732039639255164,
+      "grad_norm": 0.44841036200523376,
+      "learning_rate": 2.3873969126665257e-05,
+      "loss": 1.466,
+      "step": 7700
+    },
+    {
+      "epoch": 0.783245573846627,
+      "grad_norm": 0.5019286274909973,
+      "learning_rate": 2.2816663142313387e-05,
+      "loss": 1.4651,
+      "step": 7800
+    },
+    {
+      "epoch": 0.7932871837677375,
+      "grad_norm": 0.3360118567943573,
+      "learning_rate": 2.1759357157961517e-05,
+      "loss": 1.4706,
+      "step": 7900
+    },
+    {
+      "epoch": 0.8033287936888481,
+      "grad_norm": 0.38790610432624817,
+      "learning_rate": 2.0702051173609644e-05,
+      "loss": 1.4646,
+      "step": 8000
+    },
+    {
+      "epoch": 0.8033287936888481,
+      "eval_loss": 1.3676984310150146,
+      "eval_runtime": 408.474,
+      "eval_samples_per_second": 173.37,
+      "eval_steps_per_second": 21.673,
+      "step": 8000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 9958,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 2000,
+  "total_flos": 1.35355491680256e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:698e0e256724fb8c8e561a1465421623b9a309763586162a64221d91635c40fd
+size 5112