Auto-save flat update: checkpoint-100

Browse files

Files changed (7) hide show

config.json +5 -5
model.safetensors +2 -2
optimizer.pt +2 -2
scheduler.pt +1 -1
tokenizer.json +101 -1
trainer_state.json +43 -323
training_args.bin +1 -1

config.json CHANGED Viewed

@@ -8,18 +8,18 @@
   "bos_token_id": 1,
   "dtype": "float32",
   "eos_token_id": 2,
-  "head_dim": 34,
   "hidden_act": "silu",
   "hidden_size": 256,
   "initializer_range": 0.02,
   "intermediate_size": 1024,
-  "max_position_embeddings": 512,
   "max_window_layers": 28,
   "mlp_bias": false,
   "model_type": "qwen2",
-  "num_attention_heads": 7,
   "num_hidden_layers": 1,
-  "num_key_value_heads": 7,
   "pad_token_id": 3,
   "pretraining_tp": 1,
   "rms_norm_eps": 1e-05,
@@ -31,5 +31,5 @@
   "transformers_version": "4.48.3",
   "use_cache": false,
   "use_sliding_window": false,
-  "vocab_size": 172
 }

   "bos_token_id": 1,
   "dtype": "float32",
   "eos_token_id": 2,
+  "head_dim": 32,
   "hidden_act": "silu",
   "hidden_size": 256,
   "initializer_range": 0.02,
   "intermediate_size": 1024,
+  "max_position_embeddings": 1024,
   "max_window_layers": 28,
   "mlp_bias": false,
   "model_type": "qwen2",
+  "num_attention_heads": 8,
   "num_hidden_layers": 1,
+  "num_key_value_heads": 8,
   "pad_token_id": 3,
   "pretraining_tp": 1,
   "rms_norm_eps": 1e-05,
   "transformers_version": "4.48.3",
   "use_cache": false,
   "use_sliding_window": false,
+  "vocab_size": 192
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:078f371374102fba70d8911c4e9ceee1c08b56600c236669096b0c5ae3d0b654
-size 4304112

 version https://git-lfs.github.com/spec/v1
+oid sha256:b6b0a725fafdc4cbc9ff3e3dd898c7b32faaea0147dd5188701fcf792ce45084
+size 4398536

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c2b4fc98d17e4290ad2188bc0aad59d772b90e7a0a2fc8dd9b4cb1188eae530c
-size 8617285

 version https://git-lfs.github.com/spec/v1
+oid sha256:cd6cabe1dde2585f2289245c3f51d734eea81900d782207f109b03f385742dd5
+size 8806533

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f65e5bda4d7f853068561455de53cd9248ace1e991b2f25b4956a5c05f7a8a2
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:d82c58c32b204ed6cf1be47fcccac4a2997bdd7e1431fe3a6ec925f0a86a9891
 size 1465

tokenizer.json CHANGED Viewed

@@ -231,7 +231,27 @@
       "Ġpro": 168,
       "ch": 169,
       "ow": 170,
-      "tic": 171
     },
     "merges": [
       [
@@ -525,6 +545,86 @@
       [
         "ti",
         "c"
       ]
     ]
   }

       "Ġpro": 168,
       "ch": 169,
       "ow": 170,
+      "tic": 171,
+      "Ġcon": 172,
+      "qu": 173,
+      "Ġh": 174,
+      "per": 175,
+      "Ġon": 176,
+      "ig": 177,
+      "am": 178,
+      "res": 179,
+      "Ġwith": 180,
+      "Ġthat": 181,
+      "ĠW": 182,
+      "ver": 183,
+      "um": 184,
+      "Ġ$": 185,
+      "il": 186,
+      "Ġex": 187,
+      "ut": 188,
+      "se": 189,
+      "ot": 190,
+      "ate": 191
     },
     "merges": [
       [
       [
         "ti",
         "c"
+      ],
+      [
+        "Ġc",
+        "on"
+      ],
+      [
+        "q",
+        "u"
+      ],
+      [
+        "Ġ",
+        "h"
+      ],
+      [
+        "p",
+        "er"
+      ],
+      [
+        "Ġ",
+        "on"
+      ],
+      [
+        "i",
+        "g"
+      ],
+      [
+        "a",
+        "m"
+      ],
+      [
+        "re",
+        "s"
+      ],
+      [
+        "Ġw",
+        "ith"
+      ],
+      [
+        "Ġth",
+        "at"
+      ],
+      [
+        "Ġ",
+        "W"
+      ],
+      [
+        "v",
+        "er"
+      ],
+      [
+        "u",
+        "m"
+      ],
+      [
+        "Ġ",
+        "$"
+      ],
+      [
+        "i",
+        "l"
+      ],
+      [
+        "Ġe",
+        "x"
+      ],
+      [
+        "u",
+        "t"
+      ],
+      [
+        "s",
+        "e"
+      ],
+      [
+        "o",
+        "t"
+      ],
+      [
+        "at",
+        "e"
       ]
     ]
   }

trainer_state.json CHANGED Viewed

@@ -1,432 +1,152 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.03,
   "eval_steps": 500,
-  "global_step": 300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0005,
-      "grad_norm": 1.4382522106170654,
       "learning_rate": 0.0001,
-      "loss": 5.1929,
       "step": 5
     },
     {
       "epoch": 0.001,
-      "grad_norm": 1.085871934890747,
       "learning_rate": 0.0002,
-      "loss": 4.9631,
       "step": 10
     },
     {
       "epoch": 0.0015,
-      "grad_norm": 0.8867707252502441,
       "learning_rate": 0.0001998998998998999,
-      "loss": 4.701,
       "step": 15
     },
     {
       "epoch": 0.002,
-      "grad_norm": 0.946327805519104,
       "learning_rate": 0.0001997997997997998,
-      "loss": 4.5336,
       "step": 20
     },
     {
       "epoch": 0.0025,
-      "grad_norm": 0.9437915086746216,
       "learning_rate": 0.0001996996996996997,
-      "loss": 4.3456,
       "step": 25
     },
     {
       "epoch": 0.003,
-      "grad_norm": 0.7022916078567505,
       "learning_rate": 0.0001995995995995996,
-      "loss": 4.2017,
       "step": 30
     },
     {
       "epoch": 0.0035,
-      "grad_norm": 0.5652568936347961,
       "learning_rate": 0.0001994994994994995,
-      "loss": 4.0888,
       "step": 35
     },
     {
       "epoch": 0.004,
-      "grad_norm": 0.5211153030395508,
       "learning_rate": 0.0001993993993993994,
-      "loss": 3.9942,
       "step": 40
     },
     {
       "epoch": 0.0045,
-      "grad_norm": 0.4528588056564331,
       "learning_rate": 0.00019929929929929932,
-      "loss": 3.9148,
       "step": 45
     },
     {
       "epoch": 0.005,
-      "grad_norm": 0.43602684140205383,
       "learning_rate": 0.0001991991991991992,
-      "loss": 3.8423,
       "step": 50
     },
     {
       "epoch": 0.0055,
-      "grad_norm": 0.40453559160232544,
       "learning_rate": 0.00019909909909909912,
-      "loss": 3.7929,
       "step": 55
     },
     {
       "epoch": 0.006,
-      "grad_norm": 0.3981894254684448,
       "learning_rate": 0.000198998998998999,
-      "loss": 3.7473,
       "step": 60
     },
     {
       "epoch": 0.0065,
-      "grad_norm": 0.4431403577327728,
       "learning_rate": 0.0001988988988988989,
-      "loss": 3.6961,
       "step": 65
     },
     {
       "epoch": 0.007,
-      "grad_norm": 0.4041431248188019,
       "learning_rate": 0.0001987987987987988,
-      "loss": 3.6709,
       "step": 70
     },
     {
       "epoch": 0.0075,
-      "grad_norm": 0.4025708734989166,
       "learning_rate": 0.0001986986986986987,
-      "loss": 3.6548,
       "step": 75
     },
     {
       "epoch": 0.008,
-      "grad_norm": 0.3811189830303192,
       "learning_rate": 0.0001985985985985986,
-      "loss": 3.6196,
       "step": 80
     },
     {
       "epoch": 0.0085,
-      "grad_norm": 0.35992950201034546,
       "learning_rate": 0.0001984984984984985,
-      "loss": 3.6011,
       "step": 85
     },
     {
       "epoch": 0.009,
-      "grad_norm": 0.35293370485305786,
       "learning_rate": 0.0001983983983983984,
-      "loss": 3.5855,
       "step": 90
     },
     {
       "epoch": 0.0095,
-      "grad_norm": 0.3603716194629669,
       "learning_rate": 0.00019829829829829833,
-      "loss": 3.5711,
       "step": 95
     },
     {
       "epoch": 0.01,
-      "grad_norm": 0.3005909025669098,
       "learning_rate": 0.0001981981981981982,
-      "loss": 3.5562,
       "step": 100
-    },
-    {
-      "epoch": 0.0105,
-      "grad_norm": 0.3497621715068817,
-      "learning_rate": 0.00019809809809809813,
-      "loss": 3.5253,
-      "step": 105
-    },
-    {
-      "epoch": 0.011,
-      "grad_norm": 0.3970584273338318,
-      "learning_rate": 0.000197997997997998,
-      "loss": 3.513,
-      "step": 110
-    },
-    {
-      "epoch": 0.0115,
-      "grad_norm": 0.47932690382003784,
-      "learning_rate": 0.0001978978978978979,
-      "loss": 3.4934,
-      "step": 115
-    },
-    {
-      "epoch": 0.012,
-      "grad_norm": 0.3744785487651825,
-      "learning_rate": 0.0001977977977977978,
-      "loss": 3.4994,
-      "step": 120
-    },
-    {
-      "epoch": 0.0125,
-      "grad_norm": 0.35583263635635376,
-      "learning_rate": 0.0001976976976976977,
-      "loss": 3.4676,
-      "step": 125
-    },
-    {
-      "epoch": 0.013,
-      "grad_norm": 0.3067843019962311,
-      "learning_rate": 0.0001975975975975976,
-      "loss": 3.4778,
-      "step": 130
-    },
-    {
-      "epoch": 0.0135,
-      "grad_norm": 0.4709765315055847,
-      "learning_rate": 0.0001974974974974975,
-      "loss": 3.4547,
-      "step": 135
-    },
-    {
-      "epoch": 0.014,
-      "grad_norm": 0.6164122223854065,
-      "learning_rate": 0.00019739739739739739,
-      "loss": 3.4351,
-      "step": 140
-    },
-    {
-      "epoch": 0.0145,
-      "grad_norm": 0.41007131338119507,
-      "learning_rate": 0.0001972972972972973,
-      "loss": 3.4244,
-      "step": 145
-    },
-    {
-      "epoch": 0.015,
-      "grad_norm": 0.6154835224151611,
-      "learning_rate": 0.0001971971971971972,
-      "loss": 3.4039,
-      "step": 150
-    },
-    {
-      "epoch": 0.0155,
-      "grad_norm": 0.4073669910430908,
-      "learning_rate": 0.00019709709709709713,
-      "loss": 3.395,
-      "step": 155
-    },
-    {
-      "epoch": 0.016,
-      "grad_norm": 0.5838276147842407,
-      "learning_rate": 0.00019699699699699701,
-      "loss": 3.3642,
-      "step": 160
-    },
-    {
-      "epoch": 0.0165,
-      "grad_norm": 0.49278542399406433,
-      "learning_rate": 0.0001968968968968969,
-      "loss": 3.3515,
-      "step": 165
-    },
-    {
-      "epoch": 0.017,
-      "grad_norm": 0.4297572374343872,
-      "learning_rate": 0.00019679679679679681,
-      "loss": 3.3261,
-      "step": 170
-    },
-    {
-      "epoch": 0.0175,
-      "grad_norm": 0.43436136841773987,
-      "learning_rate": 0.0001966966966966967,
-      "loss": 3.2953,
-      "step": 175
-    },
-    {
-      "epoch": 0.018,
-      "grad_norm": 0.4154890179634094,
-      "learning_rate": 0.00019659659659659661,
-      "loss": 3.2588,
-      "step": 180
-    },
-    {
-      "epoch": 0.0185,
-      "grad_norm": 0.6486464142799377,
-      "learning_rate": 0.0001964964964964965,
-      "loss": 3.229,
-      "step": 185
-    },
-    {
-      "epoch": 0.019,
-      "grad_norm": 0.5434504151344299,
-      "learning_rate": 0.0001963963963963964,
-      "loss": 3.2005,
-      "step": 190
-    },
-    {
-      "epoch": 0.0195,
-      "grad_norm": 0.6403669714927673,
-      "learning_rate": 0.0001962962962962963,
-      "loss": 3.1609,
-      "step": 195
-    },
-    {
-      "epoch": 0.02,
-      "grad_norm": 0.5148853063583374,
-      "learning_rate": 0.00019619619619619621,
-      "loss": 3.1362,
-      "step": 200
-    },
-    {
-      "epoch": 0.0205,
-      "grad_norm": 0.6012855768203735,
-      "learning_rate": 0.00019609609609609613,
-      "loss": 3.1118,
-      "step": 205
-    },
-    {
-      "epoch": 0.021,
-      "grad_norm": 0.6342504620552063,
-      "learning_rate": 0.00019599599599599602,
-      "loss": 3.0452,
-      "step": 210
-    },
-    {
-      "epoch": 0.0215,
-      "grad_norm": 0.7762932777404785,
-      "learning_rate": 0.0001958958958958959,
-      "loss": 3.0401,
-      "step": 215
-    },
-    {
-      "epoch": 0.022,
-      "grad_norm": 0.6487250924110413,
-      "learning_rate": 0.00019579579579579582,
-      "loss": 3.0074,
-      "step": 220
-    },
-    {
-      "epoch": 0.0225,
-      "grad_norm": 0.7411482334136963,
-      "learning_rate": 0.0001956956956956957,
-      "loss": 2.9665,
-      "step": 225
-    },
-    {
-      "epoch": 0.023,
-      "grad_norm": 0.727695643901825,
-      "learning_rate": 0.00019559559559559562,
-      "loss": 2.9418,
-      "step": 230
-    },
-    {
-      "epoch": 0.0235,
-      "grad_norm": 0.6558846235275269,
-      "learning_rate": 0.0001954954954954955,
-      "loss": 2.8922,
-      "step": 235
-    },
-    {
-      "epoch": 0.024,
-      "grad_norm": 0.7584027051925659,
-      "learning_rate": 0.0001953953953953954,
-      "loss": 2.8897,
-      "step": 240
-    },
-    {
-      "epoch": 0.0245,
-      "grad_norm": 0.6296901106834412,
-      "learning_rate": 0.0001952952952952953,
-      "loss": 2.8531,
-      "step": 245
-    },
-    {
-      "epoch": 0.025,
-      "grad_norm": 0.6529428362846375,
-      "learning_rate": 0.0001951951951951952,
-      "loss": 2.8375,
-      "step": 250
-    },
-    {
-      "epoch": 0.0255,
-      "grad_norm": 0.6653200387954712,
-      "learning_rate": 0.0001950950950950951,
-      "loss": 2.796,
-      "step": 255
-    },
-    {
-      "epoch": 0.026,
-      "grad_norm": 0.6050741076469421,
-      "learning_rate": 0.00019499499499499502,
-      "loss": 2.787,
-      "step": 260
-    },
-    {
-      "epoch": 0.0265,
-      "grad_norm": 0.6170589923858643,
-      "learning_rate": 0.0001948948948948949,
-      "loss": 2.7591,
-      "step": 265
-    },
-    {
-      "epoch": 0.027,
-      "grad_norm": 0.6681796908378601,
-      "learning_rate": 0.00019479479479479482,
-      "loss": 2.7431,
-      "step": 270
-    },
-    {
-      "epoch": 0.0275,
-      "grad_norm": 0.6189929246902466,
-      "learning_rate": 0.0001946946946946947,
-      "loss": 2.7374,
-      "step": 275
-    },
-    {
-      "epoch": 0.028,
-      "grad_norm": 0.6890608668327332,
-      "learning_rate": 0.00019459459459459462,
-      "loss": 2.6941,
-      "step": 280
-    },
-    {
-      "epoch": 0.0285,
-      "grad_norm": 0.6476343274116516,
-      "learning_rate": 0.0001944944944944945,
-      "loss": 2.6852,
-      "step": 285
-    },
-    {
-      "epoch": 0.029,
-      "grad_norm": 0.7976285815238953,
-      "learning_rate": 0.0001943943943943944,
-      "loss": 2.6704,
-      "step": 290
-    },
-    {
-      "epoch": 0.0295,
-      "grad_norm": 0.8300926089286804,
-      "learning_rate": 0.0001942942942942943,
-      "loss": 2.645,
-      "step": 295
-    },
-    {
-      "epoch": 0.03,
-      "grad_norm": 0.7338405251502991,
-      "learning_rate": 0.0001941941941941942,
-      "loss": 2.6236,
-      "step": 300
     }
   ],
   "logging_steps": 5,
@@ -446,7 +166,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 60847777382400.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.01,
   "eval_steps": 500,
+  "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0005,
+      "grad_norm": 1.354914903640747,
       "learning_rate": 0.0001,
+      "loss": 5.3068,
       "step": 5
     },
     {
       "epoch": 0.001,
+      "grad_norm": 1.0461070537567139,
       "learning_rate": 0.0002,
+      "loss": 5.0784,
       "step": 10
     },
     {
       "epoch": 0.0015,
+      "grad_norm": 0.7310259938240051,
       "learning_rate": 0.0001998998998998999,
+      "loss": 4.8251,
       "step": 15
     },
     {
       "epoch": 0.002,
+      "grad_norm": 0.82170170545578,
       "learning_rate": 0.0001997997997997998,
+      "loss": 4.6949,
       "step": 20
     },
     {
       "epoch": 0.0025,
+      "grad_norm": 0.9640143513679504,
       "learning_rate": 0.0001996996996996997,
+      "loss": 4.5294,
       "step": 25
     },
     {
       "epoch": 0.003,
+      "grad_norm": 0.6337556838989258,
       "learning_rate": 0.0001995995995995996,
+      "loss": 4.3776,
       "step": 30
     },
     {
       "epoch": 0.0035,
+      "grad_norm": 0.5715162754058838,
       "learning_rate": 0.0001994994994994995,
+      "loss": 4.251,
       "step": 35
     },
     {
       "epoch": 0.004,
+      "grad_norm": 0.47545069456100464,
       "learning_rate": 0.0001993993993993994,
+      "loss": 4.142,
       "step": 40
     },
     {
       "epoch": 0.0045,
+      "grad_norm": 0.43138620257377625,
       "learning_rate": 0.00019929929929929932,
+      "loss": 4.0538,
       "step": 45
     },
     {
       "epoch": 0.005,
+      "grad_norm": 0.41834330558776855,
       "learning_rate": 0.0001991991991991992,
+      "loss": 3.9896,
       "step": 50
     },
     {
       "epoch": 0.0055,
+      "grad_norm": 0.3807925283908844,
       "learning_rate": 0.00019909909909909912,
+      "loss": 3.9316,
       "step": 55
     },
     {
       "epoch": 0.006,
+      "grad_norm": 0.4051252603530884,
       "learning_rate": 0.000198998998998999,
+      "loss": 3.8816,
       "step": 60
     },
     {
       "epoch": 0.0065,
+      "grad_norm": 0.3600367307662964,
       "learning_rate": 0.0001988988988988989,
+      "loss": 3.8327,
       "step": 65
     },
     {
       "epoch": 0.007,
+      "grad_norm": 0.3089018762111664,
       "learning_rate": 0.0001987987987987988,
+      "loss": 3.7908,
       "step": 70
     },
     {
       "epoch": 0.0075,
+      "grad_norm": 0.2999509572982788,
       "learning_rate": 0.0001986986986986987,
+      "loss": 3.7632,
       "step": 75
     },
     {
       "epoch": 0.008,
+      "grad_norm": 0.29107317328453064,
       "learning_rate": 0.0001985985985985986,
+      "loss": 3.7366,
       "step": 80
     },
     {
       "epoch": 0.0085,
+      "grad_norm": 0.3126203417778015,
       "learning_rate": 0.0001984984984984985,
+      "loss": 3.7243,
       "step": 85
     },
     {
       "epoch": 0.009,
+      "grad_norm": 0.3028947710990906,
       "learning_rate": 0.0001983983983983984,
+      "loss": 3.6909,
       "step": 90
     },
     {
       "epoch": 0.0095,
+      "grad_norm": 0.3013005554676056,
       "learning_rate": 0.00019829829829829833,
+      "loss": 3.6686,
       "step": 95
     },
     {
       "epoch": 0.01,
+      "grad_norm": 0.26517948508262634,
       "learning_rate": 0.0001981981981981982,
+      "loss": 3.6513,
       "step": 100
     }
   ],
   "logging_steps": 5,
       "attributes": {}
     }
   },
+  "total_flos": 41292084019200.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90ddf80d128466488ddd874e714e6eae19dff7f2112c05fbb4f6f15228ab4bf4
 size 5713

 version https://git-lfs.github.com/spec/v1
+oid sha256:87f7094c9781b5c9394410d447866dce36653e1a7dc4508ca501767ea42b00ab
 size 5713