Training in progress, step 2000, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +361 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13b39b25712b1700516628197082779e670c991b5446245f1b02d4d7584d5995
 size 328277848

 version https://git-lfs.github.com/spec/v1
+oid sha256:8b30006c3c8ebdd220eda160d67d570192e678e4b938a46729d63d00fc226c89
 size 328277848

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8fa04a29b27b1343d5fb5458eddeb0052332c0d610a5e2af9e8f8706e9e6b91a
 size 318646859

 version https://git-lfs.github.com/spec/v1
+oid sha256:2d83b910297466c079691649d9d51db171a5eff2b984ed10840ddd4d5cf17b1d
 size 318646859

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:abce81d7290a22f9b260f2e004a835c5fd7f98ca8d48012d38a32b582885319d
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:8647979d889bb2b15d0a3e8961a7e547be28d07767d240f858bd959476bb870c
 size 14645

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2219b103874c49a564cb9902ed8bfe290939ff6276f6750739e5f7ca5ec6aba7
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:4a6e444c46ec49de792e4afbe9af4aa4613bca60425da2b0ac2cae225e516fcc
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.25342118601115055,
   "eval_steps": 500,
-  "global_step": 1500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1089,6 +1089,364 @@
       "eval_samples_per_second": 277.282,
       "eval_steps_per_second": 5.823,
       "step": 1500
     }
   ],
   "logging_steps": 10,
@@ -1108,7 +1466,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 5.0168523128832e+16,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.33789491468153404,
   "eval_steps": 500,
+  "global_step": 2000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 277.282,
       "eval_steps_per_second": 5.823,
       "step": 1500
+    },
+    {
+      "epoch": 0.2551106605845582,
+      "grad_norm": 1.3389363288879395,
+      "learning_rate": 0.00022634999999999997,
+      "loss": 6.027260589599609,
+      "step": 1510
+    },
+    {
+      "epoch": 0.25680013515796585,
+      "grad_norm": 1.2689851522445679,
+      "learning_rate": 0.00022784999999999995,
+      "loss": 6.00293083190918,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2584896097313735,
+      "grad_norm": 1.4860210418701172,
+      "learning_rate": 0.00022934999999999996,
+      "loss": 5.998868942260742,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2601790843047812,
+      "grad_norm": 1.2490425109863281,
+      "learning_rate": 0.00023084999999999997,
+      "loss": 5.984478759765625,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2618685588781889,
+      "grad_norm": 1.5586382150650024,
+      "learning_rate": 0.00023234999999999998,
+      "loss": 5.9672401428222654,
+      "step": 1550
+    },
+    {
+      "epoch": 0.26355803345159656,
+      "grad_norm": 1.3526853322982788,
+      "learning_rate": 0.00023384999999999997,
+      "loss": 5.982438278198242,
+      "step": 1560
+    },
+    {
+      "epoch": 0.26524750802500424,
+      "grad_norm": 1.3406753540039062,
+      "learning_rate": 0.00023534999999999997,
+      "loss": 5.938652801513672,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2669369825984119,
+      "grad_norm": 1.0397038459777832,
+      "learning_rate": 0.00023684999999999998,
+      "loss": 5.920218658447266,
+      "step": 1580
+    },
+    {
+      "epoch": 0.26862645717181954,
+      "grad_norm": 1.7000986337661743,
+      "learning_rate": 0.00023834999999999997,
+      "loss": 5.896316146850586,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2703159317452272,
+      "grad_norm": 1.1729341745376587,
+      "learning_rate": 0.00023984999999999998,
+      "loss": 5.8752281188964846,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2720054063186349,
+      "grad_norm": 1.3115921020507812,
+      "learning_rate": 0.00024134999999999998,
+      "loss": 5.877028274536133,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2736948808920426,
+      "grad_norm": 1.5481823682785034,
+      "learning_rate": 0.00024284999999999997,
+      "loss": 5.863247299194336,
+      "step": 1620
+    },
+    {
+      "epoch": 0.27538435546545026,
+      "grad_norm": 1.4173649549484253,
+      "learning_rate": 0.00024435,
+      "loss": 5.848538970947265,
+      "step": 1630
+    },
+    {
+      "epoch": 0.27707383003885794,
+      "grad_norm": 1.2587963342666626,
+      "learning_rate": 0.00024585,
+      "loss": 5.841713333129883,
+      "step": 1640
+    },
+    {
+      "epoch": 0.27876330461226556,
+      "grad_norm": 1.0922702550888062,
+      "learning_rate": 0.00024734999999999997,
+      "loss": 5.8486980438232425,
+      "step": 1650
+    },
+    {
+      "epoch": 0.28045277918567324,
+      "grad_norm": 1.6068239212036133,
+      "learning_rate": 0.00024885,
+      "loss": 5.819171142578125,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2821422537590809,
+      "grad_norm": 1.5260576009750366,
+      "learning_rate": 0.00025035,
+      "loss": 5.809968566894531,
+      "step": 1670
+    },
+    {
+      "epoch": 0.2838317283324886,
+      "grad_norm": 1.2246356010437012,
+      "learning_rate": 0.00025184999999999997,
+      "loss": 5.788796997070312,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2855212029058963,
+      "grad_norm": 1.0366030931472778,
+      "learning_rate": 0.00025335,
+      "loss": 5.78180160522461,
+      "step": 1690
+    },
+    {
+      "epoch": 0.28721067747930396,
+      "grad_norm": 1.2072358131408691,
+      "learning_rate": 0.00025485,
+      "loss": 5.770789337158203,
+      "step": 1700
+    },
+    {
+      "epoch": 0.28890015205271163,
+      "grad_norm": 1.3359684944152832,
+      "learning_rate": 0.00025634999999999997,
+      "loss": 5.737417221069336,
+      "step": 1710
+    },
+    {
+      "epoch": 0.29058962662611926,
+      "grad_norm": 1.355406403541565,
+      "learning_rate": 0.00025785,
+      "loss": 5.725430297851562,
+      "step": 1720
+    },
+    {
+      "epoch": 0.29227910119952694,
+      "grad_norm": 1.1998307704925537,
+      "learning_rate": 0.00025935,
+      "loss": 5.723165130615234,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2939685757729346,
+      "grad_norm": 1.0525386333465576,
+      "learning_rate": 0.00026084999999999997,
+      "loss": 5.720573043823242,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2956580503463423,
+      "grad_norm": 1.2880501747131348,
+      "learning_rate": 0.00026235,
+      "loss": 5.684521102905274,
+      "step": 1750
+    },
+    {
+      "epoch": 0.29734752491975,
+      "grad_norm": 1.2246838808059692,
+      "learning_rate": 0.00026384999999999994,
+      "loss": 5.670655059814453,
+      "step": 1760
+    },
+    {
+      "epoch": 0.29903699949315765,
+      "grad_norm": 1.2167463302612305,
+      "learning_rate": 0.00026534999999999997,
+      "loss": 5.690992736816407,
+      "step": 1770
+    },
+    {
+      "epoch": 0.3007264740665653,
+      "grad_norm": 1.2467341423034668,
+      "learning_rate": 0.00026684999999999995,
+      "loss": 5.694464492797851,
+      "step": 1780
+    },
+    {
+      "epoch": 0.30241594863997295,
+      "grad_norm": 1.2740100622177124,
+      "learning_rate": 0.00026835,
+      "loss": 5.679082870483398,
+      "step": 1790
+    },
+    {
+      "epoch": 0.30410542321338063,
+      "grad_norm": 1.2217073440551758,
+      "learning_rate": 0.00026984999999999997,
+      "loss": 5.650615692138672,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3057948977867883,
+      "grad_norm": 1.1172698736190796,
+      "learning_rate": 0.00027134999999999995,
+      "loss": 5.651753234863281,
+      "step": 1810
+    },
+    {
+      "epoch": 0.307484372360196,
+      "grad_norm": 1.1706960201263428,
+      "learning_rate": 0.00027285,
+      "loss": 5.6512096405029295,
+      "step": 1820
+    },
+    {
+      "epoch": 0.30917384693360367,
+      "grad_norm": 0.91384357213974,
+      "learning_rate": 0.00027435,
+      "loss": 5.63836784362793,
+      "step": 1830
+    },
+    {
+      "epoch": 0.3108633215070113,
+      "grad_norm": 1.1929048299789429,
+      "learning_rate": 0.00027584999999999996,
+      "loss": 5.628775787353516,
+      "step": 1840
+    },
+    {
+      "epoch": 0.31255279608041897,
+      "grad_norm": 1.023672103881836,
+      "learning_rate": 0.00027735,
+      "loss": 5.616031265258789,
+      "step": 1850
+    },
+    {
+      "epoch": 0.31424227065382665,
+      "grad_norm": 1.1450271606445312,
+      "learning_rate": 0.00027885,
+      "loss": 5.612253952026367,
+      "step": 1860
+    },
+    {
+      "epoch": 0.31593174522723433,
+      "grad_norm": 1.0316193103790283,
+      "learning_rate": 0.00028034999999999996,
+      "loss": 5.577928161621093,
+      "step": 1870
+    },
+    {
+      "epoch": 0.317621219800642,
+      "grad_norm": 1.1516318321228027,
+      "learning_rate": 0.00028185,
+      "loss": 5.589142227172852,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3193106943740497,
+      "grad_norm": 1.426249384880066,
+      "learning_rate": 0.00028335,
+      "loss": 5.594329071044922,
+      "step": 1890
+    },
+    {
+      "epoch": 0.32100016894745736,
+      "grad_norm": 1.0666186809539795,
+      "learning_rate": 0.00028484999999999996,
+      "loss": 5.582658386230468,
+      "step": 1900
+    },
+    {
+      "epoch": 0.322689643520865,
+      "grad_norm": 0.8879145979881287,
+      "learning_rate": 0.00028635,
+      "loss": 5.542075347900391,
+      "step": 1910
+    },
+    {
+      "epoch": 0.32437911809427267,
+      "grad_norm": 1.2985228300094604,
+      "learning_rate": 0.00028785,
+      "loss": 5.572188949584961,
+      "step": 1920
+    },
+    {
+      "epoch": 0.32606859266768035,
+      "grad_norm": 1.1801198720932007,
+      "learning_rate": 0.00028934999999999996,
+      "loss": 5.531465530395508,
+      "step": 1930
+    },
+    {
+      "epoch": 0.327758067241088,
+      "grad_norm": 1.3345341682434082,
+      "learning_rate": 0.00029085,
+      "loss": 5.5121315002441404,
+      "step": 1940
+    },
+    {
+      "epoch": 0.3294475418144957,
+      "grad_norm": 0.9832890629768372,
+      "learning_rate": 0.00029235,
+      "loss": 5.515644073486328,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3311370163879034,
+      "grad_norm": 1.379388689994812,
+      "learning_rate": 0.00029384999999999996,
+      "loss": 5.5223854064941404,
+      "step": 1960
+    },
+    {
+      "epoch": 0.332826490961311,
+      "grad_norm": 1.0441769361495972,
+      "learning_rate": 0.00029535,
+      "loss": 5.502047729492188,
+      "step": 1970
+    },
+    {
+      "epoch": 0.3345159655347187,
+      "grad_norm": 1.0386887788772583,
+      "learning_rate": 0.00029685,
+      "loss": 5.521197128295898,
+      "step": 1980
+    },
+    {
+      "epoch": 0.33620544010812636,
+      "grad_norm": 0.8223176598548889,
+      "learning_rate": 0.00029835,
+      "loss": 5.479276275634765,
+      "step": 1990
+    },
+    {
+      "epoch": 0.33789491468153404,
+      "grad_norm": 1.2531520128250122,
+      "learning_rate": 0.00029985,
+      "loss": 5.487053298950196,
+      "step": 2000
+    },
+    {
+      "epoch": 0.33789491468153404,
+      "eval_loss": 5.460203170776367,
+      "eval_runtime": 3.9099,
+      "eval_samples_per_second": 255.761,
+      "eval_steps_per_second": 5.371,
+      "step": 2000
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 6.6891364171776e+16,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": null