Ksjsjjdj committed on
Commit
da4d7f9
·
verified ·
1 Parent(s): c4fa8a4

Auto-save flat update: checkpoint-100

Browse files
Files changed (6) hide show
  1. config.json +1 -1
  2. model.safetensors +1 -1
  3. optimizer.pt +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +43 -183
  6. training_args.bin +1 -1
config.json CHANGED
@@ -13,7 +13,7 @@
13
  "hidden_size": 256,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 1024,
16
- "max_position_embeddings": 1024,
17
  "max_window_layers": 28,
18
  "mlp_bias": false,
19
  "model_type": "qwen2",
 
13
  "hidden_size": 256,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 1024,
16
+ "max_position_embeddings": 512,
17
  "max_window_layers": 28,
18
  "mlp_bias": false,
19
  "model_type": "qwen2",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c633054e8f62adb0c63a3f768c6f64adee2e808946e35fd25e0a4ff8fe9886cc
3
  size 4398536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc2024b06d6d177a54f5f514c8784fc3039cc0904d37d02dd58522a0d3362322
3
  size 4398536
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be20755d8a8f3672d8753c50dea6913d0e7fe725cef330124867906dd6d1d499
3
  size 8806533
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faee95f783ecf608464997a8200a1558c89d78e05ee6481dc796ec32c4bbbdca
3
  size 8806533
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac7010ca38527d647c6bec40d9e474292bd22ba1a7391c34323926f03e67d0ef
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d82c58c32b204ed6cf1be47fcccac4a2997bdd7e1431fe3a6ec925f0a86a9891
3
  size 1465
trainer_state.json CHANGED
@@ -1,292 +1,152 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.02,
5
  "eval_steps": 500,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0005,
13
- "grad_norm": 1.354914903640747,
14
  "learning_rate": 0.0001,
15
- "loss": 5.3068,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.001,
20
- "grad_norm": 1.0461070537567139,
21
  "learning_rate": 0.0002,
22
- "loss": 5.0784,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.0015,
27
- "grad_norm": 0.7310259938240051,
28
  "learning_rate": 0.0001998998998998999,
29
- "loss": 4.8251,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.002,
34
- "grad_norm": 0.82170170545578,
35
  "learning_rate": 0.0001997997997997998,
36
- "loss": 4.6949,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.0025,
41
- "grad_norm": 0.9640143513679504,
42
  "learning_rate": 0.0001996996996996997,
43
- "loss": 4.5294,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.003,
48
- "grad_norm": 0.6337556838989258,
49
  "learning_rate": 0.0001995995995995996,
50
- "loss": 4.3776,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0035,
55
- "grad_norm": 0.5715162754058838,
56
  "learning_rate": 0.0001994994994994995,
57
- "loss": 4.251,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.004,
62
- "grad_norm": 0.47545069456100464,
63
  "learning_rate": 0.0001993993993993994,
64
- "loss": 4.142,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.0045,
69
- "grad_norm": 0.43138620257377625,
70
  "learning_rate": 0.00019929929929929932,
71
- "loss": 4.0538,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.005,
76
- "grad_norm": 0.41834330558776855,
77
  "learning_rate": 0.0001991991991991992,
78
- "loss": 3.9896,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.0055,
83
- "grad_norm": 0.3807925283908844,
84
  "learning_rate": 0.00019909909909909912,
85
- "loss": 3.9316,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.006,
90
- "grad_norm": 0.4051252603530884,
91
  "learning_rate": 0.000198998998998999,
92
- "loss": 3.8816,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.0065,
97
- "grad_norm": 0.3600367307662964,
98
  "learning_rate": 0.0001988988988988989,
99
- "loss": 3.8327,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.007,
104
- "grad_norm": 0.3089018762111664,
105
  "learning_rate": 0.0001987987987987988,
106
- "loss": 3.7908,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.0075,
111
- "grad_norm": 0.2999509572982788,
112
  "learning_rate": 0.0001986986986986987,
113
- "loss": 3.7632,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.008,
118
- "grad_norm": 0.29107317328453064,
119
  "learning_rate": 0.0001985985985985986,
120
- "loss": 3.7366,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.0085,
125
- "grad_norm": 0.3126203417778015,
126
  "learning_rate": 0.0001984984984984985,
127
- "loss": 3.7243,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.009,
132
- "grad_norm": 0.3028947710990906,
133
  "learning_rate": 0.0001983983983983984,
134
- "loss": 3.6909,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.0095,
139
- "grad_norm": 0.3013005554676056,
140
  "learning_rate": 0.00019829829829829833,
141
- "loss": 3.6686,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.01,
146
- "grad_norm": 0.26517948508262634,
147
  "learning_rate": 0.0001981981981981982,
148
- "loss": 3.6513,
149
  "step": 100
150
- },
151
- {
152
- "epoch": 0.0105,
153
- "grad_norm": 0.283869206905365,
154
- "learning_rate": 0.00019809809809809813,
155
- "loss": 3.6389,
156
- "step": 105
157
- },
158
- {
159
- "epoch": 0.011,
160
- "grad_norm": 0.3128926455974579,
161
- "learning_rate": 0.000197997997997998,
162
- "loss": 3.6236,
163
- "step": 110
164
- },
165
- {
166
- "epoch": 0.0115,
167
- "grad_norm": 0.3017060458660126,
168
- "learning_rate": 0.0001978978978978979,
169
- "loss": 3.6056,
170
- "step": 115
171
- },
172
- {
173
- "epoch": 0.012,
174
- "grad_norm": 0.3050957918167114,
175
- "learning_rate": 0.0001977977977977978,
176
- "loss": 3.5945,
177
- "step": 120
178
- },
179
- {
180
- "epoch": 0.0125,
181
- "grad_norm": 0.39496731758117676,
182
- "learning_rate": 0.0001976976976976977,
183
- "loss": 3.576,
184
- "step": 125
185
- },
186
- {
187
- "epoch": 0.013,
188
- "grad_norm": 0.39083537459373474,
189
- "learning_rate": 0.0001975975975975976,
190
- "loss": 3.5746,
191
- "step": 130
192
- },
193
- {
194
- "epoch": 0.0135,
195
- "grad_norm": 0.30549755692481995,
196
- "learning_rate": 0.0001974974974974975,
197
- "loss": 3.5464,
198
- "step": 135
199
- },
200
- {
201
- "epoch": 0.014,
202
- "grad_norm": 0.30516958236694336,
203
- "learning_rate": 0.00019739739739739739,
204
- "loss": 3.5528,
205
- "step": 140
206
- },
207
- {
208
- "epoch": 0.0145,
209
- "grad_norm": 0.28228166699409485,
210
- "learning_rate": 0.0001972972972972973,
211
- "loss": 3.5414,
212
- "step": 145
213
- },
214
- {
215
- "epoch": 0.015,
216
- "grad_norm": 0.2340458333492279,
217
- "learning_rate": 0.0001971971971971972,
218
- "loss": 3.5297,
219
- "step": 150
220
- },
221
- {
222
- "epoch": 0.0155,
223
- "grad_norm": 0.3061468005180359,
224
- "learning_rate": 0.00019709709709709713,
225
- "loss": 3.5114,
226
- "step": 155
227
- },
228
- {
229
- "epoch": 0.016,
230
- "grad_norm": 0.3535705804824829,
231
- "learning_rate": 0.00019699699699699701,
232
- "loss": 3.4996,
233
- "step": 160
234
- },
235
- {
236
- "epoch": 0.0165,
237
- "grad_norm": 0.3399507403373718,
238
- "learning_rate": 0.0001968968968968969,
239
- "loss": 3.4855,
240
- "step": 165
241
- },
242
- {
243
- "epoch": 0.017,
244
- "grad_norm": 0.26981884241104126,
245
- "learning_rate": 0.00019679679679679681,
246
- "loss": 3.4712,
247
- "step": 170
248
- },
249
- {
250
- "epoch": 0.0175,
251
- "grad_norm": 0.3286713659763336,
252
- "learning_rate": 0.0001966966966966967,
253
- "loss": 3.4543,
254
- "step": 175
255
- },
256
- {
257
- "epoch": 0.018,
258
- "grad_norm": 0.31991562247276306,
259
- "learning_rate": 0.00019659659659659661,
260
- "loss": 3.4302,
261
- "step": 180
262
- },
263
- {
264
- "epoch": 0.0185,
265
- "grad_norm": 0.40395843982696533,
266
- "learning_rate": 0.0001964964964964965,
267
- "loss": 3.4062,
268
- "step": 185
269
- },
270
- {
271
- "epoch": 0.019,
272
- "grad_norm": 0.3666783571243286,
273
- "learning_rate": 0.0001963963963963964,
274
- "loss": 3.379,
275
- "step": 190
276
- },
277
- {
278
- "epoch": 0.0195,
279
- "grad_norm": 0.3933778703212738,
280
- "learning_rate": 0.0001962962962962963,
281
- "loss": 3.3496,
282
- "step": 195
283
- },
284
- {
285
- "epoch": 0.02,
286
- "grad_norm": 0.34942948818206787,
287
- "learning_rate": 0.00019619619619619621,
288
- "loss": 3.3259,
289
- "step": 200
290
  }
291
  ],
292
  "logging_steps": 5,
@@ -306,7 +166,7 @@
306
  "attributes": {}
307
  }
308
  },
309
- "total_flos": 82584168038400.0,
310
  "train_batch_size": 4,
311
  "trial_name": null,
312
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.01,
5
  "eval_steps": 500,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0005,
13
+ "grad_norm": 1.3692148923873901,
14
  "learning_rate": 0.0001,
15
+ "loss": 5.3023,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.001,
20
+ "grad_norm": 1.0570337772369385,
21
  "learning_rate": 0.0002,
22
+ "loss": 5.0871,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.0015,
27
+ "grad_norm": 0.7336458563804626,
28
  "learning_rate": 0.0001998998998998999,
29
+ "loss": 4.8384,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.002,
34
+ "grad_norm": 0.729788601398468,
35
  "learning_rate": 0.0001997997997997998,
36
+ "loss": 4.7071,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.0025,
41
+ "grad_norm": 0.8077158331871033,
42
  "learning_rate": 0.0001996996996996997,
43
+ "loss": 4.5564,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.003,
48
+ "grad_norm": 0.6561239361763,
49
  "learning_rate": 0.0001995995995995996,
50
+ "loss": 4.4024,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0035,
55
+ "grad_norm": 0.5824812650680542,
56
  "learning_rate": 0.0001994994994994995,
57
+ "loss": 4.2921,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.004,
62
+ "grad_norm": 0.5250737071037292,
63
  "learning_rate": 0.0001993993993993994,
64
+ "loss": 4.1845,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.0045,
69
+ "grad_norm": 0.5088778734207153,
70
  "learning_rate": 0.00019929929929929932,
71
+ "loss": 4.0933,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.005,
76
+ "grad_norm": 0.4544166326522827,
77
  "learning_rate": 0.0001991991991991992,
78
+ "loss": 4.0118,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.0055,
83
+ "grad_norm": 0.41549327969551086,
84
  "learning_rate": 0.00019909909909909912,
85
+ "loss": 3.9531,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.006,
90
+ "grad_norm": 0.3995205760002136,
91
  "learning_rate": 0.000198998998998999,
92
+ "loss": 3.8955,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.0065,
97
+ "grad_norm": 0.3810112178325653,
98
  "learning_rate": 0.0001988988988988989,
99
+ "loss": 3.8356,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.007,
104
+ "grad_norm": 0.3959825932979584,
105
  "learning_rate": 0.0001987987987987988,
106
+ "loss": 3.8059,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.0075,
111
+ "grad_norm": 0.34660765528678894,
112
  "learning_rate": 0.0001986986986986987,
113
+ "loss": 3.786,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.008,
118
+ "grad_norm": 0.35685837268829346,
119
  "learning_rate": 0.0001985985985985986,
120
+ "loss": 3.7469,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.0085,
125
+ "grad_norm": 0.3709333539009094,
126
  "learning_rate": 0.0001984984984984985,
127
+ "loss": 3.7236,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.009,
132
+ "grad_norm": 0.3217354118824005,
133
  "learning_rate": 0.0001983983983983984,
134
+ "loss": 3.7075,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.0095,
139
+ "grad_norm": 0.42025989294052124,
140
  "learning_rate": 0.00019829829829829833,
141
+ "loss": 3.682,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.01,
146
+ "grad_norm": 0.35580873489379883,
147
  "learning_rate": 0.0001981981981981982,
148
+ "loss": 3.6626,
149
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  }
151
  ],
152
  "logging_steps": 5,
 
166
  "attributes": {}
167
  }
168
  },
169
+ "total_flos": 20646042009600.0,
170
  "train_batch_size": 4,
171
  "trial_name": null,
172
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87f7094c9781b5c9394410d447866dce36653e1a7dc4508ca501767ea42b00ab
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b22be76b34e68797fdb33f5525668e9c928c7650cec1eef415c99efec1ffeb
3
  size 5713