Training in progress, step 1500, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +361 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f6a6ed44cd2dfba870cb534237ba4896f3e7dc134dd8f5e7b12dba7ffa27c335
 size 328277848

 version https://git-lfs.github.com/spec/v1
+oid sha256:13b39b25712b1700516628197082779e670c991b5446245f1b02d4d7584d5995
 size 328277848

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a3d65452f71865cff75ce7bd9061bd7c195d9f7790eb08651bef46b28c8cf5db
 size 318646859

 version https://git-lfs.github.com/spec/v1
+oid sha256:8fa04a29b27b1343d5fb5458eddeb0052332c0d610a5e2af9e8f8706e9e6b91a
 size 318646859

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a141ddada80b12146ad2875b480471ca4604a84a507446df6ce95668765adaf4
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:abce81d7290a22f9b260f2e004a835c5fd7f98ca8d48012d38a32b582885319d
 size 14645

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:641712abd35039f810da46b5ecace55e8c31f5b5a7d2cfa0aaa8182597f8aad6
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:2219b103874c49a564cb9902ed8bfe290939ff6276f6750739e5f7ca5ec6aba7
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.16894745734076702,
   "eval_steps": 500,
-  "global_step": 1000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -731,6 +731,364 @@
       "eval_samples_per_second": 272.845,
       "eval_steps_per_second": 5.73,
       "step": 1000
     }
   ],
   "logging_steps": 10,
@@ -750,7 +1108,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.3445682085888e+16,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.25342118601115055,
   "eval_steps": 500,
+  "global_step": 1500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 272.845,
       "eval_steps_per_second": 5.73,
       "step": 1000
+    },
+    {
+      "epoch": 0.1706369319141747,
+      "grad_norm": 0.9963025450706482,
+      "learning_rate": 0.00015134999999999997,
+      "loss": 6.613154602050781,
+      "step": 1010
+    },
+    {
+      "epoch": 0.17232640648758235,
+      "grad_norm": 0.872097909450531,
+      "learning_rate": 0.00015284999999999997,
+      "loss": 6.613529968261719,
+      "step": 1020
+    },
+    {
+      "epoch": 0.17401588106099003,
+      "grad_norm": 1.2607650756835938,
+      "learning_rate": 0.00015434999999999998,
+      "loss": 6.587220001220703,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1757053556343977,
+      "grad_norm": 1.0194809436798096,
+      "learning_rate": 0.00015584999999999997,
+      "loss": 6.585498046875,
+      "step": 1040
+    },
+    {
+      "epoch": 0.17739483020780536,
+      "grad_norm": 0.9153720736503601,
+      "learning_rate": 0.00015734999999999998,
+      "loss": 6.5845489501953125,
+      "step": 1050
+    },
+    {
+      "epoch": 0.17908430478121304,
+      "grad_norm": 1.1903005838394165,
+      "learning_rate": 0.00015884999999999999,
+      "loss": 6.566903686523437,
+      "step": 1060
+    },
+    {
+      "epoch": 0.18077377935462072,
+      "grad_norm": 0.9262056350708008,
+      "learning_rate": 0.00016034999999999997,
+      "loss": 6.520059204101562,
+      "step": 1070
+    },
+    {
+      "epoch": 0.1824632539280284,
+      "grad_norm": 1.0881860256195068,
+      "learning_rate": 0.00016184999999999998,
+      "loss": 6.543362426757812,
+      "step": 1080
+    },
+    {
+      "epoch": 0.18415272850143605,
+      "grad_norm": 0.9753679633140564,
+      "learning_rate": 0.00016334999999999999,
+      "loss": 6.528910064697266,
+      "step": 1090
+    },
+    {
+      "epoch": 0.18584220307484373,
+      "grad_norm": 1.2809370756149292,
+      "learning_rate": 0.00016485,
+      "loss": 6.49705810546875,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1875316776482514,
+      "grad_norm": 1.0647395849227905,
+      "learning_rate": 0.00016634999999999998,
+      "loss": 6.508152008056641,
+      "step": 1110
+    },
+    {
+      "epoch": 0.18922115222165906,
+      "grad_norm": 0.9427017569541931,
+      "learning_rate": 0.00016785,
+      "loss": 6.492857360839844,
+      "step": 1120
+    },
+    {
+      "epoch": 0.19091062679506673,
+      "grad_norm": 1.1307021379470825,
+      "learning_rate": 0.00016935,
+      "loss": 6.474656677246093,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1926001013684744,
+      "grad_norm": 1.182411789894104,
+      "learning_rate": 0.00017084999999999998,
+      "loss": 6.457868194580078,
+      "step": 1140
+    },
+    {
+      "epoch": 0.19428957594188206,
+      "grad_norm": 1.1442158222198486,
+      "learning_rate": 0.00017235,
+      "loss": 6.443910217285156,
+      "step": 1150
+    },
+    {
+      "epoch": 0.19597905051528974,
+      "grad_norm": 1.2637932300567627,
+      "learning_rate": 0.00017385,
+      "loss": 6.428031158447266,
+      "step": 1160
+    },
+    {
+      "epoch": 0.19766852508869742,
+      "grad_norm": 1.334306001663208,
+      "learning_rate": 0.00017534999999999998,
+      "loss": 6.415740966796875,
+      "step": 1170
+    },
+    {
+      "epoch": 0.19935799966210507,
+      "grad_norm": 0.882560670375824,
+      "learning_rate": 0.00017685,
+      "loss": 6.413926696777343,
+      "step": 1180
+    },
+    {
+      "epoch": 0.20104747423551275,
+      "grad_norm": 0.9657256603240967,
+      "learning_rate": 0.00017835,
+      "loss": 6.425054931640625,
+      "step": 1190
+    },
+    {
+      "epoch": 0.20273694880892043,
+      "grad_norm": 1.0196014642715454,
+      "learning_rate": 0.00017984999999999998,
+      "loss": 6.391595077514649,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2044264233823281,
+      "grad_norm": 1.297837257385254,
+      "learning_rate": 0.00018135,
+      "loss": 6.382472991943359,
+      "step": 1210
+    },
+    {
+      "epoch": 0.20611589795573576,
+      "grad_norm": 1.1288139820098877,
+      "learning_rate": 0.00018285,
+      "loss": 6.358099746704101,
+      "step": 1220
+    },
+    {
+      "epoch": 0.20780537252914344,
+      "grad_norm": 0.9396995306015015,
+      "learning_rate": 0.00018435,
+      "loss": 6.355449676513672,
+      "step": 1230
+    },
+    {
+      "epoch": 0.20949484710255112,
+      "grad_norm": 1.1936787366867065,
+      "learning_rate": 0.00018585,
+      "loss": 6.356659698486328,
+      "step": 1240
+    },
+    {
+      "epoch": 0.21118432167595877,
+      "grad_norm": 0.9550564289093018,
+      "learning_rate": 0.00018735,
+      "loss": 6.337493515014648,
+      "step": 1250
+    },
+    {
+      "epoch": 0.21287379624936645,
+      "grad_norm": 1.2012646198272705,
+      "learning_rate": 0.00018884999999999996,
+      "loss": 6.317781829833985,
+      "step": 1260
+    },
+    {
+      "epoch": 0.21456327082277413,
+      "grad_norm": 1.0816755294799805,
+      "learning_rate": 0.00019034999999999996,
+      "loss": 6.316750335693359,
+      "step": 1270
+    },
+    {
+      "epoch": 0.21625274539618178,
+      "grad_norm": 1.3777987957000732,
+      "learning_rate": 0.00019184999999999997,
+      "loss": 6.3194934844970705,
+      "step": 1280
+    },
+    {
+      "epoch": 0.21794221996958946,
+      "grad_norm": 1.187603235244751,
+      "learning_rate": 0.00019334999999999998,
+      "loss": 6.30432357788086,
+      "step": 1290
+    },
+    {
+      "epoch": 0.21963169454299714,
+      "grad_norm": 1.0069150924682617,
+      "learning_rate": 0.00019484999999999997,
+      "loss": 6.2757713317871096,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2213211691164048,
+      "grad_norm": 1.2410210371017456,
+      "learning_rate": 0.00019634999999999998,
+      "loss": 6.2698211669921875,
+      "step": 1310
+    },
+    {
+      "epoch": 0.22301064368981247,
+      "grad_norm": 1.1892989873886108,
+      "learning_rate": 0.00019784999999999998,
+      "loss": 6.2431591033935545,
+      "step": 1320
+    },
+    {
+      "epoch": 0.22470011826322014,
+      "grad_norm": 1.1054743528366089,
+      "learning_rate": 0.00019934999999999997,
+      "loss": 6.26300163269043,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2263895928366278,
+      "grad_norm": 1.145757794380188,
+      "learning_rate": 0.00020084999999999998,
+      "loss": 6.226350021362305,
+      "step": 1340
+    },
+    {
+      "epoch": 0.22807906741003547,
+      "grad_norm": 1.0067166090011597,
+      "learning_rate": 0.00020234999999999999,
+      "loss": 6.2175750732421875,
+      "step": 1350
+    },
+    {
+      "epoch": 0.22976854198344315,
+      "grad_norm": 1.5041327476501465,
+      "learning_rate": 0.00020384999999999997,
+      "loss": 6.191579055786133,
+      "step": 1360
+    },
+    {
+      "epoch": 0.23145801655685083,
+      "grad_norm": 1.2780109643936157,
+      "learning_rate": 0.00020534999999999998,
+      "loss": 6.204021835327149,
+      "step": 1370
+    },
+    {
+      "epoch": 0.23314749113025848,
+      "grad_norm": 1.1531580686569214,
+      "learning_rate": 0.00020684999999999999,
+      "loss": 6.191404342651367,
+      "step": 1380
+    },
+    {
+      "epoch": 0.23483696570366616,
+      "grad_norm": 1.056857705116272,
+      "learning_rate": 0.00020835,
+      "loss": 6.17081298828125,
+      "step": 1390
+    },
+    {
+      "epoch": 0.23652644027707384,
+      "grad_norm": 1.1238850355148315,
+      "learning_rate": 0.00020984999999999998,
+      "loss": 6.153195190429687,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2382159148504815,
+      "grad_norm": 1.2115790843963623,
+      "learning_rate": 0.00021135,
+      "loss": 6.157797622680664,
+      "step": 1410
+    },
+    {
+      "epoch": 0.23990538942388917,
+      "grad_norm": 1.1303883790969849,
+      "learning_rate": 0.00021285,
+      "loss": 6.119416809082031,
+      "step": 1420
+    },
+    {
+      "epoch": 0.24159486399729685,
+      "grad_norm": 1.2523441314697266,
+      "learning_rate": 0.00021434999999999998,
+      "loss": 6.133832550048828,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2432843385707045,
+      "grad_norm": 1.1120916604995728,
+      "learning_rate": 0.00021585,
+      "loss": 6.122848129272461,
+      "step": 1440
+    },
+    {
+      "epoch": 0.24497381314411218,
+      "grad_norm": 1.239675521850586,
+      "learning_rate": 0.00021735,
+      "loss": 6.106191253662109,
+      "step": 1450
+    },
+    {
+      "epoch": 0.24666328771751986,
+      "grad_norm": 1.1382733583450317,
+      "learning_rate": 0.00021884999999999998,
+      "loss": 6.0912620544433596,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2483527622909275,
+      "grad_norm": 1.3199714422225952,
+      "learning_rate": 0.00022035,
+      "loss": 6.09831428527832,
+      "step": 1470
+    },
+    {
+      "epoch": 0.2500422368643352,
+      "grad_norm": 1.2705349922180176,
+      "learning_rate": 0.00022185,
+      "loss": 6.078111267089843,
+      "step": 1480
+    },
+    {
+      "epoch": 0.25173171143774287,
+      "grad_norm": 1.436306357383728,
+      "learning_rate": 0.00022335,
+      "loss": 6.058963012695313,
+      "step": 1490
+    },
+    {
+      "epoch": 0.25342118601115055,
+      "grad_norm": 1.179898977279663,
+      "learning_rate": 0.00022485,
+      "loss": 6.029299545288086,
+      "step": 1500
+    },
+    {
+      "epoch": 0.25342118601115055,
+      "eval_loss": 6.033608436584473,
+      "eval_runtime": 3.6064,
+      "eval_samples_per_second": 277.282,
+      "eval_steps_per_second": 5.823,
+      "step": 1500
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 5.0168523128832e+16,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": null