Training in progress, step 2, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/config.json +5 -5
last-checkpoint/model.safetensors +2 -2
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state.pth +3 -0
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +18 -844
last-checkpoint/training_args.bin +1 -1

last-checkpoint/config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "tattabio/gLM2_650M",
   "architectures": [
     "gLM2ForMaskedLM"
   ],
@@ -8,14 +8,14 @@
     "AutoModel": "modeling_glm2.gLM2Model",
     "AutoModelForMaskedLM": "modeling_glm2.gLM2ForMaskedLM"
   },
-  "depth": 33,
-  "dim": 1280,
   "ffn_dim_multiplier": null,
-  "heads": 20,
   "model_type": "gLM2",
   "norm_eps": 1e-05,
   "swiglu_multiple_of": 256,
   "torch_dtype": "float32",
-  "transformers_version": "4.45.2",
   "vocab_size": 37
 }

 {
+  "_name_or_path": "tattabio/gLM2_150M",
   "architectures": [
     "gLM2ForMaskedLM"
   ],
     "AutoModel": "modeling_glm2.gLM2Model",
     "AutoModelForMaskedLM": "modeling_glm2.gLM2ForMaskedLM"
   },
+  "depth": 30,
+  "dim": 640,
   "ffn_dim_multiplier": null,
+  "heads": 10,
   "model_type": "gLM2",
   "norm_eps": 1e-05,
   "swiglu_multiple_of": 256,
   "torch_dtype": "float32",
+  "transformers_version": "4.46.0",
   "vocab_size": 37
 }

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9eae45eb43651c4ce612c5b264270a3ccdfbc48e1be2784320e0059c614c3cab
-size 2682482800

 version https://git-lfs.github.com/spec/v1
+oid sha256:516558ed7782de66fc542438abb1c93e159afd70a2aeb6571ce83cca423452b0
+size 609855088

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36970303513d3e205403c36051106bf22e33ef86f3a1e71a2f1e2cba961b8110
-size 5365108834

 version https://git-lfs.github.com/spec/v1
+oid sha256:812c91eacfd5aea68d8b5decb8b50302d3944860c0aa6ecd636549bd4f072a92
+size 1219840058

last-checkpoint/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd261f24a3b802018daa7344b83f247d318502ad53b463783d522e7fc68f088e
+size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:214df0e2d0c96471516754f237b8e237791d4cac9a44207b49ae1586ecbb810a
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:59630a3df2ec5543c18897bf2cb0562e6bac8d472d75091b8f7ddabcb069715a
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,859 +1,33 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.46051392874746205,
-  "eval_steps": 500,
-  "global_step": 6000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.003837616072895517,
-      "grad_norm": 0.660070538520813,
-      "learning_rate": 9.961621123733497e-05,
-      "loss": 1.2329,
-      "step": 50
     },
     {
-      "epoch": 0.007675232145791034,
-      "grad_norm": 0.795362114906311,
-      "learning_rate": 9.923242247466995e-05,
-      "loss": 1.1816,
-      "step": 100
-    },
-    {
-      "epoch": 0.011512848218686552,
-      "grad_norm": 0.9296917915344238,
-      "learning_rate": 9.884863371200491e-05,
-      "loss": 1.1603,
-      "step": 150
-    },
-    {
-      "epoch": 0.015350464291582069,
-      "grad_norm": 0.6324566006660461,
-      "learning_rate": 9.846484494933989e-05,
-      "loss": 1.1505,
-      "step": 200
-    },
-    {
-      "epoch": 0.019188080364477587,
-      "grad_norm": 0.8173375725746155,
-      "learning_rate": 9.808105618667486e-05,
-      "loss": 1.1454,
-      "step": 250
-    },
-    {
-      "epoch": 0.023025696437373105,
-      "grad_norm": 0.6401157975196838,
-      "learning_rate": 9.769726742400983e-05,
-      "loss": 1.1346,
-      "step": 300
-    },
-    {
-      "epoch": 0.026863312510268623,
-      "grad_norm": 0.6105799674987793,
-      "learning_rate": 9.731347866134481e-05,
-      "loss": 1.1263,
-      "step": 350
-    },
-    {
-      "epoch": 0.030700928583164137,
-      "grad_norm": 0.3902953863143921,
-      "learning_rate": 9.692968989867978e-05,
-      "loss": 1.1201,
-      "step": 400
-    },
-    {
-      "epoch": 0.034538544656059655,
-      "grad_norm": 0.8294398188591003,
-      "learning_rate": 9.654590113601474e-05,
-      "loss": 1.1123,
-      "step": 450
-    },
-    {
-      "epoch": 0.03837616072895517,
-      "grad_norm": 0.7488604187965393,
-      "learning_rate": 9.616211237334971e-05,
-      "loss": 1.1099,
-      "step": 500
-    },
-    {
-      "epoch": 0.04221377680185069,
-      "grad_norm": 0.32585829496383667,
-      "learning_rate": 9.577832361068468e-05,
-      "loss": 1.1056,
-      "step": 550
-    },
-    {
-      "epoch": 0.04605139287474621,
-      "grad_norm": 0.48506152629852295,
-      "learning_rate": 9.539453484801966e-05,
-      "loss": 1.1021,
-      "step": 600
-    },
-    {
-      "epoch": 0.04988900894764173,
-      "grad_norm": 0.8694831132888794,
-      "learning_rate": 9.501074608535463e-05,
-      "loss": 1.0931,
-      "step": 650
-    },
-    {
-      "epoch": 0.053726625020537246,
-      "grad_norm": 1.0110464096069336,
-      "learning_rate": 9.462695732268958e-05,
-      "loss": 1.0903,
-      "step": 700
-    },
-    {
-      "epoch": 0.05756424109343276,
-      "grad_norm": 0.5741580128669739,
-      "learning_rate": 9.424316856002457e-05,
-      "loss": 1.0871,
-      "step": 750
-    },
-    {
-      "epoch": 0.061401857166328275,
-      "grad_norm": 1.2655494213104248,
-      "learning_rate": 9.385937979735953e-05,
-      "loss": 1.0825,
-      "step": 800
-    },
-    {
-      "epoch": 0.0652394732392238,
-      "grad_norm": 0.6292811036109924,
-      "learning_rate": 9.347559103469452e-05,
-      "loss": 1.0787,
-      "step": 850
-    },
-    {
-      "epoch": 0.06907708931211931,
-      "grad_norm": 0.6818861961364746,
-      "learning_rate": 9.309180227202948e-05,
-      "loss": 1.0738,
-      "step": 900
-    },
-    {
-      "epoch": 0.07291470538501482,
-      "grad_norm": 0.5532193183898926,
-      "learning_rate": 9.270801350936445e-05,
-      "loss": 1.0714,
-      "step": 950
-    },
-    {
-      "epoch": 0.07675232145791035,
-      "grad_norm": 0.6174580454826355,
-      "learning_rate": 9.232422474669942e-05,
-      "loss": 1.0674,
-      "step": 1000
-    },
-    {
-      "epoch": 0.08058993753080586,
-      "grad_norm": 0.44129157066345215,
-      "learning_rate": 9.194043598403439e-05,
-      "loss": 1.0663,
-      "step": 1050
-    },
-    {
-      "epoch": 0.08442755360370138,
-      "grad_norm": 0.6179072856903076,
-      "learning_rate": 9.155664722136937e-05,
-      "loss": 1.0639,
-      "step": 1100
-    },
-    {
-      "epoch": 0.0882651696765969,
-      "grad_norm": 0.847963273525238,
-      "learning_rate": 9.117285845870434e-05,
-      "loss": 1.0601,
-      "step": 1150
-    },
-    {
-      "epoch": 0.09210278574949242,
-      "grad_norm": 0.8802704215049744,
-      "learning_rate": 9.07890696960393e-05,
-      "loss": 1.0536,
-      "step": 1200
-    },
-    {
-      "epoch": 0.09594040182238793,
-      "grad_norm": 0.837968111038208,
-      "learning_rate": 9.040528093337427e-05,
-      "loss": 1.0525,
-      "step": 1250
-    },
-    {
-      "epoch": 0.09977801789528346,
-      "grad_norm": 0.8432613611221313,
-      "learning_rate": 9.002149217070924e-05,
-      "loss": 1.0482,
-      "step": 1300
-    },
-    {
-      "epoch": 0.10361563396817897,
-      "grad_norm": 0.5921217799186707,
-      "learning_rate": 8.963770340804421e-05,
-      "loss": 1.0449,
-      "step": 1350
-    },
-    {
-      "epoch": 0.10745325004107449,
-      "grad_norm": 0.45630180835723877,
-      "learning_rate": 8.925391464537919e-05,
-      "loss": 1.0416,
-      "step": 1400
-    },
-    {
-      "epoch": 0.11129086611397,
-      "grad_norm": 0.45162612199783325,
-      "learning_rate": 8.887012588271416e-05,
-      "loss": 1.0384,
-      "step": 1450
-    },
-    {
-      "epoch": 0.11512848218686551,
-      "grad_norm": 0.9114300012588501,
-      "learning_rate": 8.848633712004914e-05,
-      "loss": 1.0355,
-      "step": 1500
-    },
-    {
-      "epoch": 0.11896609825976104,
-      "grad_norm": 1.0031269788742065,
-      "learning_rate": 8.81025483573841e-05,
-      "loss": 1.0336,
-      "step": 1550
-    },
-    {
-      "epoch": 0.12280371433265655,
-      "grad_norm": 0.6353164911270142,
-      "learning_rate": 8.771875959471906e-05,
-      "loss": 1.0284,
-      "step": 1600
-    },
-    {
-      "epoch": 0.12664133040555206,
-      "grad_norm": 0.5234444737434387,
-      "learning_rate": 8.733497083205404e-05,
-      "loss": 1.0277,
-      "step": 1650
-    },
-    {
-      "epoch": 0.1304789464784476,
-      "grad_norm": 1.0591809749603271,
-      "learning_rate": 8.695118206938901e-05,
-      "loss": 1.0262,
-      "step": 1700
-    },
-    {
-      "epoch": 0.1343165625513431,
-      "grad_norm": 0.3141140341758728,
-      "learning_rate": 8.656739330672398e-05,
-      "loss": 1.0253,
-      "step": 1750
-    },
-    {
-      "epoch": 0.13815417862423862,
-      "grad_norm": 0.6443182229995728,
-      "learning_rate": 8.618360454405895e-05,
-      "loss": 1.0202,
-      "step": 1800
-    },
-    {
-      "epoch": 0.14199179469713413,
-      "grad_norm": 0.7155106067657471,
-      "learning_rate": 8.579981578139392e-05,
-      "loss": 1.0202,
-      "step": 1850
-    },
-    {
-      "epoch": 0.14582941077002964,
-      "grad_norm": 0.7634809017181396,
-      "learning_rate": 8.54160270187289e-05,
-      "loss": 1.0165,
-      "step": 1900
-    },
-    {
-      "epoch": 0.14966702684292518,
-      "grad_norm": 0.4034004509449005,
-      "learning_rate": 8.503223825606387e-05,
-      "loss": 1.0157,
-      "step": 1950
-    },
-    {
-      "epoch": 0.1535046429158207,
-      "grad_norm": 1.0218342542648315,
-      "learning_rate": 8.464844949339883e-05,
-      "loss": 1.0122,
-      "step": 2000
-    },
-    {
-      "epoch": 0.1573422589887162,
-      "grad_norm": 0.9840885996818542,
-      "learning_rate": 8.426466073073382e-05,
-      "loss": 1.0112,
-      "step": 2050
-    },
-    {
-      "epoch": 0.16117987506161172,
-      "grad_norm": 0.6855255961418152,
-      "learning_rate": 8.388087196806877e-05,
-      "loss": 1.007,
-      "step": 2100
-    },
-    {
-      "epoch": 0.16501749113450725,
-      "grad_norm": 1.3098440170288086,
-      "learning_rate": 8.349708320540375e-05,
-      "loss": 1.0063,
-      "step": 2150
-    },
-    {
-      "epoch": 0.16885510720740277,
-      "grad_norm": 0.7584977149963379,
-      "learning_rate": 8.311329444273872e-05,
-      "loss": 1.0068,
-      "step": 2200
-    },
-    {
-      "epoch": 0.17269272328029828,
-      "grad_norm": 0.5157095193862915,
-      "learning_rate": 8.272950568007369e-05,
-      "loss": 1.0043,
-      "step": 2250
-    },
-    {
-      "epoch": 0.1765303393531938,
-      "grad_norm": 0.78426194190979,
-      "learning_rate": 8.234571691740867e-05,
-      "loss": 0.9992,
-      "step": 2300
-    },
-    {
-      "epoch": 0.18036795542608933,
-      "grad_norm": 0.729180097579956,
-      "learning_rate": 8.196192815474362e-05,
-      "loss": 0.9974,
-      "step": 2350
-    },
-    {
-      "epoch": 0.18420557149898484,
-      "grad_norm": 0.8251708149909973,
-      "learning_rate": 8.15781393920786e-05,
-      "loss": 0.9944,
-      "step": 2400
-    },
-    {
-      "epoch": 0.18804318757188035,
-      "grad_norm": 0.4832938015460968,
-      "learning_rate": 8.119435062941357e-05,
-      "loss": 0.9959,
-      "step": 2450
-    },
-    {
-      "epoch": 0.19188080364477586,
-      "grad_norm": 1.06001877784729,
-      "learning_rate": 8.081056186674854e-05,
-      "loss": 0.9944,
-      "step": 2500
-    },
-    {
-      "epoch": 0.19571841971767137,
-      "grad_norm": 0.8641451597213745,
-      "learning_rate": 8.042677310408352e-05,
-      "loss": 0.989,
-      "step": 2550
-    },
-    {
-      "epoch": 0.1995560357905669,
-      "grad_norm": 0.3354558050632477,
-      "learning_rate": 8.004298434141849e-05,
-      "loss": 0.9891,
-      "step": 2600
-    },
-    {
-      "epoch": 0.20339365186346242,
-      "grad_norm": 0.26871201395988464,
-      "learning_rate": 7.965919557875346e-05,
-      "loss": 0.9845,
-      "step": 2650
-    },
-    {
-      "epoch": 0.20723126793635793,
-      "grad_norm": 0.5861389636993408,
-      "learning_rate": 7.927540681608843e-05,
-      "loss": 0.982,
-      "step": 2700
-    },
-    {
-      "epoch": 0.21106888400925344,
-      "grad_norm": 0.3075869679450989,
-      "learning_rate": 7.88916180534234e-05,
-      "loss": 0.9814,
-      "step": 2750
-    },
-    {
-      "epoch": 0.21490650008214898,
-      "grad_norm": 0.3018185496330261,
-      "learning_rate": 7.850782929075838e-05,
-      "loss": 0.9814,
-      "step": 2800
-    },
-    {
-      "epoch": 0.2187441161550445,
-      "grad_norm": 0.7530673742294312,
-      "learning_rate": 7.812404052809335e-05,
-      "loss": 0.9807,
-      "step": 2850
-    },
-    {
-      "epoch": 0.22258173222794,
-      "grad_norm": 0.8647517561912537,
-      "learning_rate": 7.774025176542831e-05,
-      "loss": 0.9749,
-      "step": 2900
-    },
-    {
-      "epoch": 0.22641934830083552,
-      "grad_norm": 0.3062540888786316,
-      "learning_rate": 7.735646300276328e-05,
-      "loss": 0.9761,
-      "step": 2950
-    },
-    {
-      "epoch": 0.23025696437373103,
-      "grad_norm": 0.8063308596611023,
-      "learning_rate": 7.697267424009825e-05,
-      "loss": 0.9705,
-      "step": 3000
-    },
-    {
-      "epoch": 0.23409458044662657,
-      "grad_norm": 1.2951929569244385,
-      "learning_rate": 7.658888547743323e-05,
-      "loss": 0.9741,
-      "step": 3050
-    },
-    {
-      "epoch": 0.23793219651952208,
-      "grad_norm": 0.57526695728302,
-      "learning_rate": 7.62050967147682e-05,
-      "loss": 0.9667,
-      "step": 3100
-    },
-    {
-      "epoch": 0.2417698125924176,
-      "grad_norm": 0.21630573272705078,
-      "learning_rate": 7.582130795210317e-05,
-      "loss": 0.9658,
-      "step": 3150
-    },
-    {
-      "epoch": 0.2456074286653131,
-      "grad_norm": 0.44468560814857483,
-      "learning_rate": 7.543751918943814e-05,
-      "loss": 0.9652,
-      "step": 3200
-    },
-    {
-      "epoch": 0.24944504473820864,
-      "grad_norm": 0.484546959400177,
-      "learning_rate": 7.50537304267731e-05,
-      "loss": 0.9632,
-      "step": 3250
-    },
-    {
-      "epoch": 0.2532826608111041,
-      "grad_norm": 0.6433889865875244,
-      "learning_rate": 7.466994166410808e-05,
-      "loss": 0.9623,
-      "step": 3300
-    },
-    {
-      "epoch": 0.25712027688399963,
-      "grad_norm": 0.43382543325424194,
-      "learning_rate": 7.428615290144305e-05,
-      "loss": 0.9607,
-      "step": 3350
-    },
-    {
-      "epoch": 0.2609578929568952,
-      "grad_norm": 0.524282693862915,
-      "learning_rate": 7.390236413877802e-05,
-      "loss": 0.9604,
-      "step": 3400
-    },
-    {
-      "epoch": 0.2647955090297907,
-      "grad_norm": 0.8118641972541809,
-      "learning_rate": 7.351857537611299e-05,
-      "loss": 0.9571,
-      "step": 3450
-    },
-    {
-      "epoch": 0.2686331251026862,
-      "grad_norm": 0.7356590628623962,
-      "learning_rate": 7.313478661344796e-05,
-      "loss": 0.9535,
-      "step": 3500
-    },
-    {
-      "epoch": 0.27247074117558173,
-      "grad_norm": 0.27771323919296265,
-      "learning_rate": 7.275099785078293e-05,
-      "loss": 0.9557,
-      "step": 3550
-    },
-    {
-      "epoch": 0.27630835724847724,
-      "grad_norm": 0.5177915692329407,
-      "learning_rate": 7.23672090881179e-05,
-      "loss": 0.9542,
-      "step": 3600
-    },
-    {
-      "epoch": 0.28014597332137275,
-      "grad_norm": 0.3106517195701599,
-      "learning_rate": 7.198342032545287e-05,
-      "loss": 0.9538,
-      "step": 3650
-    },
-    {
-      "epoch": 0.28398358939426827,
-      "grad_norm": 0.4957466125488281,
-      "learning_rate": 7.159963156278786e-05,
-      "loss": 0.9513,
-      "step": 3700
-    },
-    {
-      "epoch": 0.2878212054671638,
-      "grad_norm": 0.5112007856369019,
-      "learning_rate": 7.121584280012281e-05,
-      "loss": 0.949,
-      "step": 3750
-    },
-    {
-      "epoch": 0.2916588215400593,
-      "grad_norm": 0.6187167167663574,
-      "learning_rate": 7.083205403745778e-05,
-      "loss": 0.9477,
-      "step": 3800
-    },
-    {
-      "epoch": 0.29549643761295485,
-      "grad_norm": 0.29803329706192017,
-      "learning_rate": 7.044826527479276e-05,
-      "loss": 0.9497,
-      "step": 3850
-    },
-    {
-      "epoch": 0.29933405368585037,
-      "grad_norm": 0.7799173593521118,
-      "learning_rate": 7.006447651212773e-05,
-      "loss": 0.9432,
-      "step": 3900
-    },
-    {
-      "epoch": 0.3031716697587459,
-      "grad_norm": 0.37847256660461426,
-      "learning_rate": 6.96806877494627e-05,
-      "loss": 0.9396,
-      "step": 3950
-    },
-    {
-      "epoch": 0.3070092858316414,
-      "grad_norm": 0.4524092972278595,
-      "learning_rate": 6.929689898679766e-05,
-      "loss": 0.943,
-      "step": 4000
-    },
-    {
-      "epoch": 0.3108469019045369,
-      "grad_norm": 0.6535223126411438,
-      "learning_rate": 6.891311022413263e-05,
-      "loss": 0.944,
-      "step": 4050
-    },
-    {
-      "epoch": 0.3146845179774324,
-      "grad_norm": 0.5323360562324524,
-      "learning_rate": 6.852932146146761e-05,
-      "loss": 0.942,
-      "step": 4100
-    },
-    {
-      "epoch": 0.3185221340503279,
-      "grad_norm": 0.784599781036377,
-      "learning_rate": 6.814553269880258e-05,
-      "loss": 0.9386,
-      "step": 4150
-    },
-    {
-      "epoch": 0.32235975012322343,
-      "grad_norm": 0.4353456497192383,
-      "learning_rate": 6.776174393613755e-05,
-      "loss": 0.9406,
-      "step": 4200
-    },
-    {
-      "epoch": 0.326197366196119,
-      "grad_norm": 0.5127778053283691,
-      "learning_rate": 6.737795517347253e-05,
-      "loss": 0.9374,
-      "step": 4250
-    },
-    {
-      "epoch": 0.3300349822690145,
-      "grad_norm": 0.8174408674240112,
-      "learning_rate": 6.699416641080749e-05,
-      "loss": 0.932,
-      "step": 4300
-    },
-    {
-      "epoch": 0.33387259834191,
-      "grad_norm": 0.2989351153373718,
-      "learning_rate": 6.661037764814247e-05,
-      "loss": 0.9345,
-      "step": 4350
-    },
-    {
-      "epoch": 0.33771021441480553,
-      "grad_norm": 0.41601112484931946,
-      "learning_rate": 6.622658888547744e-05,
-      "loss": 0.9347,
-      "step": 4400
-    },
-    {
-      "epoch": 0.34154783048770104,
-      "grad_norm": 0.4532497525215149,
-      "learning_rate": 6.58428001228124e-05,
-      "loss": 0.932,
-      "step": 4450
-    },
-    {
-      "epoch": 0.34538544656059655,
-      "grad_norm": 0.4111255407333374,
-      "learning_rate": 6.545901136014739e-05,
-      "loss": 0.936,
-      "step": 4500
-    },
-    {
-      "epoch": 0.34922306263349207,
-      "grad_norm": 0.7033655047416687,
-      "learning_rate": 6.507522259748234e-05,
-      "loss": 0.9307,
-      "step": 4550
-    },
-    {
-      "epoch": 0.3530606787063876,
-      "grad_norm": 0.4548279643058777,
-      "learning_rate": 6.469143383481732e-05,
-      "loss": 0.9278,
-      "step": 4600
-    },
-    {
-      "epoch": 0.3568982947792831,
-      "grad_norm": 0.5447307229042053,
-      "learning_rate": 6.430764507215229e-05,
-      "loss": 0.9289,
-      "step": 4650
-    },
-    {
-      "epoch": 0.36073591085217865,
-      "grad_norm": 0.38505110144615173,
-      "learning_rate": 6.392385630948726e-05,
-      "loss": 0.9237,
-      "step": 4700
-    },
-    {
-      "epoch": 0.36457352692507417,
-      "grad_norm": 0.40116333961486816,
-      "learning_rate": 6.354006754682224e-05,
-      "loss": 0.9248,
-      "step": 4750
-    },
-    {
-      "epoch": 0.3684111429979697,
-      "grad_norm": 0.3748728930950165,
-      "learning_rate": 6.315627878415721e-05,
-      "loss": 0.9203,
-      "step": 4800
-    },
-    {
-      "epoch": 0.3722487590708652,
-      "grad_norm": 0.5459182858467102,
-      "learning_rate": 6.277249002149218e-05,
-      "loss": 0.9263,
-      "step": 4850
-    },
-    {
-      "epoch": 0.3760863751437607,
-      "grad_norm": 0.5564711093902588,
-      "learning_rate": 6.238870125882714e-05,
-      "loss": 0.9242,
-      "step": 4900
-    },
-    {
-      "epoch": 0.3799239912166562,
-      "grad_norm": 0.5415127873420715,
-      "learning_rate": 6.200491249616211e-05,
-      "loss": 0.9165,
-      "step": 4950
-    },
-    {
-      "epoch": 0.3837616072895517,
-      "grad_norm": 0.9149804711341858,
-      "learning_rate": 6.162112373349709e-05,
-      "loss": 0.92,
-      "step": 5000
-    },
-    {
-      "epoch": 0.38759922336244723,
-      "grad_norm": 0.6962669491767883,
-      "learning_rate": 6.123733497083206e-05,
-      "loss": 0.9186,
-      "step": 5050
-    },
-    {
-      "epoch": 0.39143683943534274,
-      "grad_norm": 0.6156628131866455,
-      "learning_rate": 6.085354620816702e-05,
-      "loss": 0.9139,
-      "step": 5100
-    },
-    {
-      "epoch": 0.3952744555082383,
-      "grad_norm": 0.4484277069568634,
-      "learning_rate": 6.0469757445502e-05,
-      "loss": 0.914,
-      "step": 5150
-    },
-    {
-      "epoch": 0.3991120715811338,
-      "grad_norm": 0.6082286834716797,
-      "learning_rate": 6.0085968682836965e-05,
-      "loss": 0.9148,
-      "step": 5200
-    },
-    {
-      "epoch": 0.40294968765402933,
-      "grad_norm": 0.6756613850593567,
-      "learning_rate": 5.970217992017194e-05,
-      "loss": 0.9137,
-      "step": 5250
-    },
-    {
-      "epoch": 0.40678730372692484,
-      "grad_norm": 0.6353741884231567,
-      "learning_rate": 5.9318391157506915e-05,
-      "loss": 0.9094,
-      "step": 5300
-    },
-    {
-      "epoch": 0.41062491979982035,
-      "grad_norm": 0.6543828845024109,
-      "learning_rate": 5.893460239484189e-05,
-      "loss": 0.9089,
-      "step": 5350
-    },
-    {
-      "epoch": 0.41446253587271586,
-      "grad_norm": 0.6633620262145996,
-      "learning_rate": 5.855081363217685e-05,
-      "loss": 0.9105,
-      "step": 5400
-    },
-    {
-      "epoch": 0.4183001519456114,
-      "grad_norm": 0.6769128441810608,
-      "learning_rate": 5.816702486951182e-05,
-      "loss": 0.9095,
-      "step": 5450
-    },
-    {
-      "epoch": 0.4221377680185069,
-      "grad_norm": 0.6803929209709167,
-      "learning_rate": 5.7783236106846794e-05,
-      "loss": 0.9085,
-      "step": 5500
-    },
-    {
-      "epoch": 0.4259753840914024,
-      "grad_norm": 0.4861834645271301,
-      "learning_rate": 5.739944734418177e-05,
-      "loss": 0.9067,
-      "step": 5550
-    },
-    {
-      "epoch": 0.42981300016429796,
-      "grad_norm": 0.24226143956184387,
-      "learning_rate": 5.7015658581516737e-05,
-      "loss": 0.9066,
-      "step": 5600
-    },
-    {
-      "epoch": 0.4336506162371935,
-      "grad_norm": 0.2108086198568344,
-      "learning_rate": 5.6631869818851705e-05,
-      "loss": 0.9061,
-      "step": 5650
-    },
-    {
-      "epoch": 0.437488232310089,
-      "grad_norm": 0.7616965770721436,
-      "learning_rate": 5.624808105618667e-05,
-      "loss": 0.9041,
-      "step": 5700
-    },
-    {
-      "epoch": 0.4413258483829845,
-      "grad_norm": 0.3760414719581604,
-      "learning_rate": 5.586429229352165e-05,
-      "loss": 0.9035,
-      "step": 5750
-    },
-    {
-      "epoch": 0.44516346445588,
-      "grad_norm": 0.4564415216445923,
-      "learning_rate": 5.548050353085662e-05,
-      "loss": 0.902,
-      "step": 5800
-    },
-    {
-      "epoch": 0.4490010805287755,
-      "grad_norm": 0.803648054599762,
-      "learning_rate": 5.509671476819159e-05,
-      "loss": 0.9011,
-      "step": 5850
-    },
-    {
-      "epoch": 0.45283869660167103,
-      "grad_norm": 0.7869254350662231,
-      "learning_rate": 5.4712926005526565e-05,
-      "loss": 0.9007,
-      "step": 5900
-    },
-    {
-      "epoch": 0.45667631267456654,
-      "grad_norm": 0.8484482765197754,
-      "learning_rate": 5.4329137242861526e-05,
-      "loss": 0.902,
-      "step": 5950
-    },
-    {
-      "epoch": 0.46051392874746205,
-      "grad_norm": 0.4946975111961365,
-      "learning_rate": 5.39453484801965e-05,
-      "loss": 0.8968,
-      "step": 6000
     }
   ],
-  "logging_steps": 50,
-  "max_steps": 13028,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
-  "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -866,8 +40,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.0284708509245243e+19,
-  "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
 }

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.0012275351477837237,
+  "eval_steps": 2,
+  "global_step": 2,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0006137675738918619,
+      "grad_norm": 158.1446075439453,
+      "learning_rate": 9.993861264579497e-05,
+      "loss": 100.2575,
+      "step": 1
     },
     {
+      "epoch": 0.0012275351477837237,
+      "grad_norm": 105.63041687011719,
+      "learning_rate": 9.987722529158994e-05,
+      "loss": 95.2722,
+      "step": 2
     }
   ],
+  "logging_steps": 1,
+  "max_steps": 1629,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
+  "save_steps": 2,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
       "attributes": {}
     }
   },
+  "total_flos": 919791151165440.0,
+  "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
 }

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:65de12ccf65227e16c5ac7d6f4de8c23b93867370e90dd502a95ed85503923fb
 size 5240

 version https://git-lfs.github.com/spec/v1
+oid sha256:064b240ea07b11fb2a55256aa70c4f515e16a1e7de5972e80b77b98e19219a68
 size 5240