Uploaded checkpoint-5000

Browse files

Files changed (5) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +711 -3

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dee94e938767a260117ef4c72990c915717f6a1428c66dfc2fa292feae4c1b55
 size 2836579040

 version https://git-lfs.github.com/spec/v1
+oid sha256:e0b03057c36e4ae3d1c8ad803821d19398ff358211eaf32a34baa7283fb29868
 size 2836579040

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc4db2b04f54904da67f0f6ed9acfd7baeb09ee856c56ef2cc5e7bc3840e567f
 size 5673376169

 version https://git-lfs.github.com/spec/v1
+oid sha256:f8b601cbd05f4af730d23f7a08329aa98d5112750e1bbc5c713b794aa64db464
 size 5673376169

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b0cfd838427005f3598246259455779f1c1e118bacce9f330af533309c11e6ad
 size 14180

 version https://git-lfs.github.com/spec/v1
+oid sha256:c24f333818658fb1a6ac065e8380ec5713b07987c477e3ef1d4a113c4ec403e3
 size 14180

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9925f5cdfedfb49cb76b21cdd6c0c2e868c58e55559c6dcbe5c03dc7caaf7e9e
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:506671c04dd848ffd9038cbb7a15f1988edb1a31beec0ecf80efaa4f06c169fa
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.599935001624959,
   "eval_steps": 1000,
-  "global_step": 4000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2839,6 +2839,714 @@
       "eval_samples_per_second": 13.901,
       "eval_steps_per_second": 13.901,
       "step": 4000
     }
   ],
   "logging_steps": 10,
@@ -2846,7 +3554,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
   "save_steps": 1000,
-  "total_flos": 6.455688167424e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 3.249918752031199,
   "eval_steps": 1000,
+  "global_step": 5000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 13.901,
       "eval_steps_per_second": 13.901,
       "step": 4000
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 6.6e-07,
+      "loss": 0.0016,
+      "step": 4010
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 6.533333333333334e-07,
+      "loss": 0.0132,
+      "step": 4020
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 0.046875,
+      "learning_rate": 6.466666666666667e-07,
+      "loss": 0.0025,
+      "step": 4030
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 6.4e-07,
+      "loss": 0.0039,
+      "step": 4040
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 6.333333333333333e-07,
+      "loss": 0.0049,
+      "step": 4050
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 6.266666666666668e-07,
+      "loss": 0.0064,
+      "step": 4060
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 6.2e-07,
+      "loss": 0.0044,
+      "step": 4070
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 0.310546875,
+      "learning_rate": 6.133333333333334e-07,
+      "loss": 0.0173,
+      "step": 4080
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 6.066666666666666e-07,
+      "loss": 0.0015,
+      "step": 4090
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 6.000000000000001e-07,
+      "loss": 0.0018,
+      "step": 4100
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 5.933333333333333e-07,
+      "loss": 0.0017,
+      "step": 4110
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 1.2109375,
+      "learning_rate": 5.866666666666667e-07,
+      "loss": 0.0099,
+      "step": 4120
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 5.8e-07,
+      "loss": 0.007,
+      "step": 4130
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 0.04345703125,
+      "learning_rate": 5.733333333333334e-07,
+      "loss": 0.0049,
+      "step": 4140
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 5.666666666666667e-07,
+      "loss": 0.0108,
+      "step": 4150
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 5.6e-07,
+      "loss": 0.0033,
+      "step": 4160
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 5.533333333333334e-07,
+      "loss": 0.0018,
+      "step": 4170
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 0.27734375,
+      "learning_rate": 5.466666666666667e-07,
+      "loss": 0.007,
+      "step": 4180
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 5.4e-07,
+      "loss": 0.0052,
+      "step": 4190
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 5.333333333333333e-07,
+      "loss": 0.0014,
+      "step": 4200
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 0.11328125,
+      "learning_rate": 5.266666666666667e-07,
+      "loss": 0.0051,
+      "step": 4210
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 5.2e-07,
+      "loss": 0.0015,
+      "step": 4220
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.30859375,
+      "learning_rate": 5.133333333333333e-07,
+      "loss": 0.0044,
+      "step": 4230
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.064453125,
+      "learning_rate": 5.066666666666667e-07,
+      "loss": 0.0023,
+      "step": 4240
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 5e-07,
+      "loss": 0.0073,
+      "step": 4250
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 4.933333333333334e-07,
+      "loss": 0.0131,
+      "step": 4260
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 0.044921875,
+      "learning_rate": 4.866666666666666e-07,
+      "loss": 0.0042,
+      "step": 4270
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 4.800000000000001e-07,
+      "loss": 0.0127,
+      "step": 4280
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 0.072265625,
+      "learning_rate": 4.7333333333333334e-07,
+      "loss": 0.0016,
+      "step": 4290
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 4.6666666666666666e-07,
+      "loss": 0.0019,
+      "step": 4300
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.044921875,
+      "learning_rate": 4.6e-07,
+      "loss": 0.0038,
+      "step": 4310
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 0.1015625,
+      "learning_rate": 4.5333333333333337e-07,
+      "loss": 0.0016,
+      "step": 4320
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 4.4666666666666664e-07,
+      "loss": 0.0051,
+      "step": 4330
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 4.4e-07,
+      "loss": 0.0088,
+      "step": 4340
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 4.333333333333333e-07,
+      "loss": 0.0026,
+      "step": 4350
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 0.061767578125,
+      "learning_rate": 4.266666666666667e-07,
+      "loss": 0.0017,
+      "step": 4360
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 4.2000000000000006e-07,
+      "loss": 0.0028,
+      "step": 4370
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 0.099609375,
+      "learning_rate": 4.1333333333333333e-07,
+      "loss": 0.0052,
+      "step": 4380
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 4.066666666666667e-07,
+      "loss": 0.0039,
+      "step": 4390
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 4e-07,
+      "loss": 0.0015,
+      "step": 4400
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 3.9333333333333336e-07,
+      "loss": 0.0029,
+      "step": 4410
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 3.866666666666667e-07,
+      "loss": 0.0024,
+      "step": 4420
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 3.8000000000000007e-07,
+      "loss": 0.0024,
+      "step": 4430
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 3.7333333333333334e-07,
+      "loss": 0.0024,
+      "step": 4440
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 1.703125,
+      "learning_rate": 3.6666666666666667e-07,
+      "loss": 0.0124,
+      "step": 4450
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 3.6e-07,
+      "loss": 0.0019,
+      "step": 4460
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 0.70703125,
+      "learning_rate": 3.533333333333333e-07,
+      "loss": 0.0053,
+      "step": 4470
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 0.31640625,
+      "learning_rate": 3.4666666666666665e-07,
+      "loss": 0.0053,
+      "step": 4480
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.4140625,
+      "learning_rate": 3.4e-07,
+      "loss": 0.0052,
+      "step": 4490
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 1.1640625,
+      "learning_rate": 3.333333333333333e-07,
+      "loss": 0.0138,
+      "step": 4500
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 3.266666666666667e-07,
+      "loss": 0.0057,
+      "step": 4510
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 3.2e-07,
+      "loss": 0.0073,
+      "step": 4520
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 0.29296875,
+      "learning_rate": 3.133333333333334e-07,
+      "loss": 0.0026,
+      "step": 4530
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 0.103515625,
+      "learning_rate": 3.066666666666667e-07,
+      "loss": 0.0044,
+      "step": 4540
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 3.0000000000000004e-07,
+      "loss": 0.0119,
+      "step": 4550
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 2.9333333333333337e-07,
+      "loss": 0.0042,
+      "step": 4560
+    },
+    {
+      "epoch": 2.97,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 2.866666666666667e-07,
+      "loss": 0.0114,
+      "step": 4570
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 2.8e-07,
+      "loss": 0.0052,
+      "step": 4580
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 0.064453125,
+      "learning_rate": 2.7333333333333335e-07,
+      "loss": 0.0061,
+      "step": 4590
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 2.6666666666666667e-07,
+      "loss": 0.0024,
+      "step": 4600
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 2.6e-07,
+      "loss": 0.0025,
+      "step": 4610
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.7421875,
+      "learning_rate": 2.533333333333333e-07,
+      "loss": 0.0151,
+      "step": 4620
+    },
+    {
+      "epoch": 3.01,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 2.466666666666667e-07,
+      "loss": 0.0015,
+      "step": 4630
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 1.1640625,
+      "learning_rate": 2.4000000000000003e-07,
+      "loss": 0.0147,
+      "step": 4640
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 2.3333333333333333e-07,
+      "loss": 0.0041,
+      "step": 4650
+    },
+    {
+      "epoch": 3.03,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 2.2666666666666668e-07,
+      "loss": 0.0033,
+      "step": 4660
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 2.2e-07,
+      "loss": 0.0018,
+      "step": 4670
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 2.1333333333333334e-07,
+      "loss": 0.0033,
+      "step": 4680
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 2.0666666666666666e-07,
+      "loss": 0.0078,
+      "step": 4690
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 0.12109375,
+      "learning_rate": 2e-07,
+      "loss": 0.0087,
+      "step": 4700
+    },
+    {
+      "epoch": 3.06,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 1.9333333333333334e-07,
+      "loss": 0.0061,
+      "step": 4710
+    },
+    {
+      "epoch": 3.07,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 1.8666666666666667e-07,
+      "loss": 0.0026,
+      "step": 4720
+    },
+    {
+      "epoch": 3.07,
+      "grad_norm": 0.068359375,
+      "learning_rate": 1.8e-07,
+      "loss": 0.0039,
+      "step": 4730
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 1.7333333333333332e-07,
+      "loss": 0.0017,
+      "step": 4740
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 1.6666666666666665e-07,
+      "loss": 0.0076,
+      "step": 4750
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 1.6e-07,
+      "loss": 0.0045,
+      "step": 4760
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 1.5333333333333336e-07,
+      "loss": 0.0021,
+      "step": 4770
+    },
+    {
+      "epoch": 3.11,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 1.4666666666666668e-07,
+      "loss": 0.0017,
+      "step": 4780
+    },
+    {
+      "epoch": 3.11,
+      "grad_norm": 0.0546875,
+      "learning_rate": 1.4e-07,
+      "loss": 0.0107,
+      "step": 4790
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 1.3333333333333334e-07,
+      "loss": 0.0045,
+      "step": 4800
+    },
+    {
+      "epoch": 3.13,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 1.2666666666666666e-07,
+      "loss": 0.0053,
+      "step": 4810
+    },
+    {
+      "epoch": 3.13,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 1.2000000000000002e-07,
+      "loss": 0.0112,
+      "step": 4820
+    },
+    {
+      "epoch": 3.14,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 1.1333333333333334e-07,
+      "loss": 0.0014,
+      "step": 4830
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 0.44921875,
+      "learning_rate": 1.0666666666666667e-07,
+      "loss": 0.0083,
+      "step": 4840
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1e-07,
+      "loss": 0.0019,
+      "step": 4850
+    },
+    {
+      "epoch": 3.16,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 9.333333333333334e-08,
+      "loss": 0.0107,
+      "step": 4860
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 8.666666666666666e-08,
+      "loss": 0.0014,
+      "step": 4870
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 8e-08,
+      "loss": 0.0024,
+      "step": 4880
+    },
+    {
+      "epoch": 3.18,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 7.333333333333334e-08,
+      "loss": 0.0048,
+      "step": 4890
+    },
+    {
+      "epoch": 3.18,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 6.666666666666667e-08,
+      "loss": 0.0026,
+      "step": 4900
+    },
+    {
+      "epoch": 3.19,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 6.000000000000001e-08,
+      "loss": 0.0054,
+      "step": 4910
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 5.3333333333333334e-08,
+      "loss": 0.0052,
+      "step": 4920
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 4.666666666666667e-08,
+      "loss": 0.0039,
+      "step": 4930
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 4e-08,
+      "loss": 0.0092,
+      "step": 4940
+    },
+    {
+      "epoch": 3.22,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 3.3333333333333334e-08,
+      "loss": 0.0035,
+      "step": 4950
+    },
+    {
+      "epoch": 3.22,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 2.6666666666666667e-08,
+      "loss": 0.0044,
+      "step": 4960
+    },
+    {
+      "epoch": 3.23,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 2e-08,
+      "loss": 0.0048,
+      "step": 4970
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 0.046875,
+      "learning_rate": 1.3333333333333334e-08,
+      "loss": 0.0015,
+      "step": 4980
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 0.294921875,
+      "learning_rate": 6.666666666666667e-09,
+      "loss": 0.0031,
+      "step": 4990
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0,
+      "loss": 0.0036,
+      "step": 5000
+    },
+    {
+      "epoch": 3.25,
+      "eval_loss": 0.004249705467373133,
+      "eval_runtime": 72.2773,
+      "eval_samples_per_second": 13.836,
+      "eval_steps_per_second": 13.836,
+      "step": 5000
     }
   ],
   "logging_steps": 10,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
   "save_steps": 1000,
+  "total_flos": 8.06961020928e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null