Training in progress, step 3000, checkpoint

Browse files

Files changed (8) hide show

last-checkpoint/adapter_config.json +3 -3
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scaler.pt +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +315 -3
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -31,11 +31,11 @@
   "target_modules": [
     "v_proj",
     "k_proj",
     "gate_proj",
-    "o_proj",
     "up_proj",
-    "down_proj",
-    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "target_modules": [
     "v_proj",
     "k_proj",
+    "down_proj",
+    "q_proj",
     "gate_proj",
     "up_proj",
+    "o_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:df0b3c057589426de11702e8aa51f40578fbdc1c16b5298b4df1b3741a358543
 size 228140600

 version https://git-lfs.github.com/spec/v1
+oid sha256:1ddd49b9fa83b41042972589b0185429c9038b2514af8abc9c0ad4f6f229c6c8
 size 228140600

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7e1a2a35f3f40624f11f416233f78a070b1dea29da95a3a90a9a787a9173de3d
 size 117931203

 version https://git-lfs.github.com/spec/v1
+oid sha256:22dc5729293f37d17c0b6650d94819a21d18fab4c702a46d62401aec711792f3
 size 117931203

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54ee403e6e7f52e165fb91ab2843ca4f38ca3d3c64d81b59c5a39f9e4c098413
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:ce6193889ea75b9cef214b87184b6c99e6c6f661ab938ae5ad158be7367ecf8b
 size 14645

last-checkpoint/scaler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:88be0f049d620e88b111c309644f5ca8c552ca0e64dbf5a41f67ac4dd14016eb
 size 1383

 version https://git-lfs.github.com/spec/v1
+oid sha256:7ab8f7fae8c5bc945ba8d0476887328f81726abcc0550ee4572fa2d3eac0adcb
 size 1383

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6abcf0c15a7ba90c608cb1903d96b4ad18eb9806fb694a46be4e23a52b64410b
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:2a3a79343e37b2abae291bedd1957475ce7f9b47f8942adec4a76182dbe5dbf9
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": 750,
   "best_metric": 0.5089643597602844,
   "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
-  "epoch": 4.32,
   "eval_steps": 300,
-  "global_step": 2700,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2876,6 +2876,318 @@
       "eval_samples_per_second": 2.299,
       "eval_steps_per_second": 0.575,
       "step": 2700
     }
   ],
   "logging_steps": 10,
@@ -2895,7 +3207,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.639214588564275e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_global_step": 750,
   "best_metric": 0.5089643597602844,
   "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
+  "epoch": 4.8,
   "eval_steps": 300,
+  "global_step": 3000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 2.299,
       "eval_steps_per_second": 0.575,
       "step": 2700
+    },
+    {
+      "entropy": 0.24316317560151218,
+      "epoch": 4.336,
+      "grad_norm": 0.757876455783844,
+      "learning_rate": 1.3376e-05,
+      "loss": 0.2118,
+      "mean_token_accuracy": 0.9327260747551918,
+      "num_tokens": 39749.0,
+      "step": 2710
+    },
+    {
+      "entropy": 0.2465177897363901,
+      "epoch": 4.352,
+      "grad_norm": 0.73354172706604,
+      "learning_rate": 1.3056000000000002e-05,
+      "loss": 0.21,
+      "mean_token_accuracy": 0.9354286625981331,
+      "num_tokens": 68464.0,
+      "step": 2720
+    },
+    {
+      "entropy": 0.24799817334860563,
+      "epoch": 4.368,
+      "grad_norm": 0.9990701675415039,
+      "learning_rate": 1.2736000000000001e-05,
+      "loss": 0.2039,
+      "mean_token_accuracy": 0.940489636361599,
+      "num_tokens": 91656.0,
+      "step": 2730
+    },
+    {
+      "entropy": 0.26067384518682957,
+      "epoch": 4.384,
+      "grad_norm": 0.9379425644874573,
+      "learning_rate": 1.2416000000000001e-05,
+      "loss": 0.2182,
+      "mean_token_accuracy": 0.9411718167364598,
+      "num_tokens": 110505.0,
+      "step": 2740
+    },
+    {
+      "entropy": 0.3018894817214459,
+      "epoch": 4.4,
+      "grad_norm": 1.0026336908340454,
+      "learning_rate": 1.2096e-05,
+      "loss": 0.2267,
+      "mean_token_accuracy": 0.9386275008320808,
+      "num_tokens": 123324.0,
+      "step": 2750
+    },
+    {
+      "entropy": 0.21805389355868102,
+      "epoch": 4.416,
+      "grad_norm": 0.6372848153114319,
+      "learning_rate": 1.1776e-05,
+      "loss": 0.1861,
+      "mean_token_accuracy": 0.9427805945277214,
+      "num_tokens": 163777.0,
+      "step": 2760
+    },
+    {
+      "entropy": 0.21196621540002525,
+      "epoch": 4.432,
+      "grad_norm": 0.5572025179862976,
+      "learning_rate": 1.1456e-05,
+      "loss": 0.1581,
+      "mean_token_accuracy": 0.9551307797431946,
+      "num_tokens": 192177.0,
+      "step": 2770
+    },
+    {
+      "entropy": 0.20902398317120968,
+      "epoch": 4.448,
+      "grad_norm": 0.7340620756149292,
+      "learning_rate": 1.1136e-05,
+      "loss": 0.1582,
+      "mean_token_accuracy": 0.9570909071713686,
+      "num_tokens": 215456.0,
+      "step": 2780
+    },
+    {
+      "entropy": 0.2131565590389073,
+      "epoch": 4.464,
+      "grad_norm": 1.0014139413833618,
+      "learning_rate": 1.0816000000000001e-05,
+      "loss": 0.1583,
+      "mean_token_accuracy": 0.9551056247204542,
+      "num_tokens": 234122.0,
+      "step": 2790
+    },
+    {
+      "entropy": 0.25133530045859515,
+      "epoch": 4.48,
+      "grad_norm": 0.8922705054283142,
+      "learning_rate": 1.0496e-05,
+      "loss": 0.1818,
+      "mean_token_accuracy": 0.9524805508553982,
+      "num_tokens": 246749.0,
+      "step": 2800
+    },
+    {
+      "entropy": 0.19833970288746058,
+      "epoch": 4.496,
+      "grad_norm": 0.8713212609291077,
+      "learning_rate": 1.0176e-05,
+      "loss": 0.1667,
+      "mean_token_accuracy": 0.9479088947176934,
+      "num_tokens": 287475.0,
+      "step": 2810
+    },
+    {
+      "entropy": 0.18820378091186285,
+      "epoch": 4.5120000000000005,
+      "grad_norm": 0.782958984375,
+      "learning_rate": 9.856e-06,
+      "loss": 0.1507,
+      "mean_token_accuracy": 0.9564289052039385,
+      "num_tokens": 316228.0,
+      "step": 2820
+    },
+    {
+      "entropy": 0.1986434136983007,
+      "epoch": 4.5280000000000005,
+      "grad_norm": 0.9405664801597595,
+      "learning_rate": 9.536e-06,
+      "loss": 0.1652,
+      "mean_token_accuracy": 0.9527083396911621,
+      "num_tokens": 339312.0,
+      "step": 2830
+    },
+    {
+      "entropy": 0.20359546076506377,
+      "epoch": 4.5440000000000005,
+      "grad_norm": 1.8294662237167358,
+      "learning_rate": 9.216000000000001e-06,
+      "loss": 0.1605,
+      "mean_token_accuracy": 0.958249793574214,
+      "num_tokens": 357957.0,
+      "step": 2840
+    },
+    {
+      "entropy": 0.2478945675306022,
+      "epoch": 4.5600000000000005,
+      "grad_norm": 1.8756585121154785,
+      "learning_rate": 8.896000000000001e-06,
+      "loss": 0.1791,
+      "mean_token_accuracy": 0.9529225923120975,
+      "num_tokens": 371074.0,
+      "step": 2850
+    },
+    {
+      "entropy": 0.19137877360917627,
+      "epoch": 4.576,
+      "grad_norm": 0.7811349034309387,
+      "learning_rate": 8.576e-06,
+      "loss": 0.1603,
+      "mean_token_accuracy": 0.9505746208131314,
+      "num_tokens": 412461.0,
+      "step": 2860
+    },
+    {
+      "entropy": 0.19941019406542182,
+      "epoch": 4.592,
+      "grad_norm": 0.8849194645881653,
+      "learning_rate": 8.256e-06,
+      "loss": 0.1559,
+      "mean_token_accuracy": 0.9538026105612516,
+      "num_tokens": 441113.0,
+      "step": 2870
+    },
+    {
+      "entropy": 0.20037598102353513,
+      "epoch": 4.608,
+      "grad_norm": 1.007367730140686,
+      "learning_rate": 7.936e-06,
+      "loss": 0.1577,
+      "mean_token_accuracy": 0.9563030891120434,
+      "num_tokens": 464301.0,
+      "step": 2880
+    },
+    {
+      "entropy": 0.21458538975566627,
+      "epoch": 4.624,
+      "grad_norm": 1.0605765581130981,
+      "learning_rate": 7.616000000000001e-06,
+      "loss": 0.1636,
+      "mean_token_accuracy": 0.9558106277137994,
+      "num_tokens": 483422.0,
+      "step": 2890
+    },
+    {
+      "entropy": 0.2460995698813349,
+      "epoch": 4.64,
+      "grad_norm": 1.1102747917175293,
+      "learning_rate": 7.296e-06,
+      "loss": 0.178,
+      "mean_token_accuracy": 0.9527418158948422,
+      "num_tokens": 496524.0,
+      "step": 2900
+    },
+    {
+      "entropy": 0.1917059404309839,
+      "epoch": 4.656,
+      "grad_norm": 0.7104383111000061,
+      "learning_rate": 6.976000000000001e-06,
+      "loss": 0.1692,
+      "mean_token_accuracy": 0.9471572674810886,
+      "num_tokens": 537262.0,
+      "step": 2910
+    },
+    {
+      "entropy": 0.19903061082586646,
+      "epoch": 4.672,
+      "grad_norm": 0.8522951006889343,
+      "learning_rate": 6.688e-06,
+      "loss": 0.1668,
+      "mean_token_accuracy": 0.9495650254189968,
+      "num_tokens": 566118.0,
+      "step": 2920
+    },
+    {
+      "entropy": 0.20533090075477958,
+      "epoch": 4.688,
+      "grad_norm": 0.7692112326622009,
+      "learning_rate": 6.368000000000001e-06,
+      "loss": 0.1597,
+      "mean_token_accuracy": 0.9538190443068743,
+      "num_tokens": 589316.0,
+      "step": 2930
+    },
+    {
+      "entropy": 0.20868746675550937,
+      "epoch": 4.704,
+      "grad_norm": 0.8645059466362,
+      "learning_rate": 6.048e-06,
+      "loss": 0.1496,
+      "mean_token_accuracy": 0.9595503833144903,
+      "num_tokens": 607904.0,
+      "step": 2940
+    },
+    {
+      "entropy": 0.23888139198534192,
+      "epoch": 4.72,
+      "grad_norm": 1.08635413646698,
+      "learning_rate": 5.728e-06,
+      "loss": 0.1706,
+      "mean_token_accuracy": 0.9570875108242035,
+      "num_tokens": 620936.0,
+      "step": 2950
+    },
+    {
+      "entropy": 0.18963255980052054,
+      "epoch": 4.736,
+      "grad_norm": 0.7276900410652161,
+      "learning_rate": 5.4080000000000006e-06,
+      "loss": 0.1633,
+      "mean_token_accuracy": 0.9485368836671114,
+      "num_tokens": 661079.0,
+      "step": 2960
+    },
+    {
+      "entropy": 0.19404892213642597,
+      "epoch": 4.752,
+      "grad_norm": 0.8436645269393921,
+      "learning_rate": 5.088e-06,
+      "loss": 0.1523,
+      "mean_token_accuracy": 0.9547487128525972,
+      "num_tokens": 689649.0,
+      "step": 2970
+    },
+    {
+      "entropy": 0.20046764588914812,
+      "epoch": 4.768,
+      "grad_norm": 1.0704182386398315,
+      "learning_rate": 4.768e-06,
+      "loss": 0.1574,
+      "mean_token_accuracy": 0.9545170154422522,
+      "num_tokens": 712841.0,
+      "step": 2980
+    },
+    {
+      "entropy": 0.2065018493682146,
+      "epoch": 4.784,
+      "grad_norm": 0.9045215249061584,
+      "learning_rate": 4.4480000000000004e-06,
+      "loss": 0.155,
+      "mean_token_accuracy": 0.9589469760656357,
+      "num_tokens": 731548.0,
+      "step": 2990
+    },
+    {
+      "entropy": 0.2458665339741856,
+      "epoch": 4.8,
+      "grad_norm": 1.7165741920471191,
+      "learning_rate": 4.128e-06,
+      "loss": 0.173,
+      "mean_token_accuracy": 0.9542810652405024,
+      "num_tokens": 744375.0,
+      "step": 3000
+    },
+    {
+      "epoch": 4.8,
+      "eval_accuracy": 0.026236095361078154,
+      "eval_entropy": 0.3239293715655804,
+      "eval_loss": 0.6594926714897156,
+      "eval_mean_token_accuracy": 0.8544400478601456,
+      "eval_num_tokens": 744375.0,
+      "eval_runtime": 966.0583,
+      "eval_samples_per_second": 2.07,
+      "eval_steps_per_second": 0.518,
+      "step": 3000
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 5.158805165012275e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc0c9c43aae96575e8afc416e967ac5674d13cc1a38c487b69cd4534aafef005
 size 6353

 version https://git-lfs.github.com/spec/v1
+oid sha256:ab977af6525318ffc5b089ead4268f65e71f68e9d355f66185c43f4d771a6da2
 size 6353