Training in progress, epoch 5, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +459 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2a231763b5faedaca18c9c191c07586b3550c6053a1eddc5ce3eee6f01c3b980
 size 852404428

 version https://git-lfs.github.com/spec/v1
+oid sha256:5f873b0e350ecae608ae2a7c6d5064a22a73d12e41a6adc123d385a128fd6ea9
 size 852404428

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db9aea2220afea8c93b03bf9c5dc034e1c9c9fdd3ba5f8b9d73dc58e1a4f72ef
 size 1705187266

 version https://git-lfs.github.com/spec/v1
+oid sha256:148de1705a5a44a135e6a613648d0a1b113e2d7acafe2bee645417f3d4b152db
 size 1705187266

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:66de308632efa7bbf1e857a5abd292ee172491d8a7d235c0db723e6bccfc78f2
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:dc39a926741056c260a97c86cef4ca60de38e0b73eeb05622c76a983f99f2c84
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7ac3e56631fd633144f95d1e4f53fccbe783b424998d247387be1c2ff61fe767
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:2ccdb98bd899bb8d04e411fe55a1b125ef862b3eaa3dc4b4994d14e842f255ee
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": 0.8315503597259521,
   "best_model_checkpoint": "./results/checkpoint-63554",
-  "epoch": 4.999996066352762,
   "eval_steps": 500,
-  "global_step": 158885,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2266,6 +2266,462 @@
       "eval_samples_per_second": 54.646,
       "eval_steps_per_second": 0.854,
       "step": 158885
     }
   ],
   "logging_steps": 500,
@@ -2294,7 +2750,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.216354771107147e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": 0.8315503597259521,
   "best_model_checkpoint": "./results/checkpoint-63554",
+  "epoch": 5.999996066352762,
   "eval_steps": 500,
+  "global_step": 190662,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 54.646,
       "eval_steps_per_second": 0.854,
       "step": 158885
+    },
+    {
+      "epoch": 5.003618955459312,
+      "grad_norm": 93093.9921875,
+      "learning_rate": 1.0088047426114604e-05,
+      "loss": 0.4392,
+      "step": 159000
+    },
+    {
+      "epoch": 5.019353544412844,
+      "grad_norm": 101550.875,
+      "learning_rate": 1.0038303518828214e-05,
+      "loss": 0.4355,
+      "step": 159500
+    },
+    {
+      "epoch": 5.035088133366376,
+      "grad_norm": 87909.828125,
+      "learning_rate": 9.988558663700153e-06,
+      "loss": 0.4364,
+      "step": 160000
+    },
+    {
+      "epoch": 5.050822722319908,
+      "grad_norm": 91443.0078125,
+      "learning_rate": 9.938814091694261e-06,
+      "loss": 0.4357,
+      "step": 160500
+    },
+    {
+      "epoch": 5.0665573112734394,
+      "grad_norm": 113526.984375,
+      "learning_rate": 9.889071033767369e-06,
+      "loss": 0.4365,
+      "step": 161000
+    },
+    {
+      "epoch": 5.082291900226972,
+      "grad_norm": 96782.1640625,
+      "learning_rate": 9.839330720838837e-06,
+      "loss": 0.4369,
+      "step": 161500
+    },
+    {
+      "epoch": 5.098026489180503,
+      "grad_norm": 94661.75,
+      "learning_rate": 9.789594383760112e-06,
+      "loss": 0.4345,
+      "step": 162000
+    },
+    {
+      "epoch": 5.1137610781340355,
+      "grad_norm": 88317.609375,
+      "learning_rate": 9.73986325328424e-06,
+      "loss": 0.4334,
+      "step": 162500
+    },
+    {
+      "epoch": 5.129495667087567,
+      "grad_norm": 85447.578125,
+      "learning_rate": 9.690138560035441e-06,
+      "loss": 0.4383,
+      "step": 163000
+    },
+    {
+      "epoch": 5.145230256041099,
+      "grad_norm": 88336.3125,
+      "learning_rate": 9.64042153447863e-06,
+      "loss": 0.4336,
+      "step": 163500
+    },
+    {
+      "epoch": 5.160964844994631,
+      "grad_norm": 106095.5,
+      "learning_rate": 9.59071340688899e-06,
+      "loss": 0.4359,
+      "step": 164000
+    },
+    {
+      "epoch": 5.176699433948162,
+      "grad_norm": 99273.8203125,
+      "learning_rate": 9.541015407321514e-06,
+      "loss": 0.4351,
+      "step": 164500
+    },
+    {
+      "epoch": 5.192434022901694,
+      "grad_norm": 90265.3125,
+      "learning_rate": 9.491328765580572e-06,
+      "loss": 0.4408,
+      "step": 165000
+    },
+    {
+      "epoch": 5.208168611855226,
+      "grad_norm": 79781.9140625,
+      "learning_rate": 9.441654711189482e-06,
+      "loss": 0.4333,
+      "step": 165500
+    },
+    {
+      "epoch": 5.223903200808758,
+      "grad_norm": 94186.921875,
+      "learning_rate": 9.391994473360074e-06,
+      "loss": 0.4368,
+      "step": 166000
+    },
+    {
+      "epoch": 5.239637789762289,
+      "grad_norm": 92022.53125,
+      "learning_rate": 9.342349280962287e-06,
+      "loss": 0.4363,
+      "step": 166500
+    },
+    {
+      "epoch": 5.255372378715822,
+      "grad_norm": 99226.21875,
+      "learning_rate": 9.292720362493748e-06,
+      "loss": 0.434,
+      "step": 167000
+    },
+    {
+      "epoch": 5.271106967669353,
+      "grad_norm": 84206.7421875,
+      "learning_rate": 9.24310894604938e-06,
+      "loss": 0.4369,
+      "step": 167500
+    },
+    {
+      "epoch": 5.2868415566228855,
+      "grad_norm": 99916.9921875,
+      "learning_rate": 9.193516259291002e-06,
+      "loss": 0.4353,
+      "step": 168000
+    },
+    {
+      "epoch": 5.302576145576417,
+      "grad_norm": 101395.203125,
+      "learning_rate": 9.143943529416966e-06,
+      "loss": 0.4351,
+      "step": 168500
+    },
+    {
+      "epoch": 5.318310734529949,
+      "grad_norm": 106872.765625,
+      "learning_rate": 9.09439198313177e-06,
+      "loss": 0.4349,
+      "step": 169000
+    },
+    {
+      "epoch": 5.334045323483481,
+      "grad_norm": 85570.0859375,
+      "learning_rate": 9.044862846615724e-06,
+      "loss": 0.4342,
+      "step": 169500
+    },
+    {
+      "epoch": 5.349779912437013,
+      "grad_norm": 89842.7421875,
+      "learning_rate": 8.995357345494588e-06,
+      "loss": 0.4376,
+      "step": 170000
+    },
+    {
+      "epoch": 5.365514501390544,
+      "grad_norm": 87297.84375,
+      "learning_rate": 8.94587670480925e-06,
+      "loss": 0.4342,
+      "step": 170500
+    },
+    {
+      "epoch": 5.381249090344076,
+      "grad_norm": 83102.15625,
+      "learning_rate": 8.896422148985418e-06,
+      "loss": 0.4337,
+      "step": 171000
+    },
+    {
+      "epoch": 5.396983679297608,
+      "grad_norm": 105961.4296875,
+      "learning_rate": 8.846994901803313e-06,
+      "loss": 0.4356,
+      "step": 171500
+    },
+    {
+      "epoch": 5.412718268251139,
+      "grad_norm": 88251.3046875,
+      "learning_rate": 8.797596186367387e-06,
+      "loss": 0.4358,
+      "step": 172000
+    },
+    {
+      "epoch": 5.428452857204672,
+      "grad_norm": 89459.875,
+      "learning_rate": 8.748227225076064e-06,
+      "loss": 0.4347,
+      "step": 172500
+    },
+    {
+      "epoch": 5.444187446158203,
+      "grad_norm": 85620.671875,
+      "learning_rate": 8.698889239591477e-06,
+      "loss": 0.4356,
+      "step": 173000
+    },
+    {
+      "epoch": 5.4599220351117355,
+      "grad_norm": 90309.140625,
+      "learning_rate": 8.649583450809254e-06,
+      "loss": 0.4331,
+      "step": 173500
+    },
+    {
+      "epoch": 5.475656624065267,
+      "grad_norm": 89333.3671875,
+      "learning_rate": 8.600311078828291e-06,
+      "loss": 0.4369,
+      "step": 174000
+    },
+    {
+      "epoch": 5.491391213018799,
+      "grad_norm": 99571.203125,
+      "learning_rate": 8.55107334292057e-06,
+      "loss": 0.4334,
+      "step": 174500
+    },
+    {
+      "epoch": 5.507125801972331,
+      "grad_norm": 103962.2734375,
+      "learning_rate": 8.501871461500981e-06,
+      "loss": 0.4339,
+      "step": 175000
+    },
+    {
+      "epoch": 5.522860390925863,
+      "grad_norm": 90352.78125,
+      "learning_rate": 8.452706652097187e-06,
+      "loss": 0.4311,
+      "step": 175500
+    },
+    {
+      "epoch": 5.538594979879394,
+      "grad_norm": 85944.9609375,
+      "learning_rate": 8.403580131319469e-06,
+      "loss": 0.4325,
+      "step": 176000
+    },
+    {
+      "epoch": 5.554329568832927,
+      "grad_norm": 81867.296875,
+      "learning_rate": 8.354493114830642e-06,
+      "loss": 0.4332,
+      "step": 176500
+    },
+    {
+      "epoch": 5.570064157786458,
+      "grad_norm": 100460.6171875,
+      "learning_rate": 8.305446817315961e-06,
+      "loss": 0.4351,
+      "step": 177000
+    },
+    {
+      "epoch": 5.585798746739989,
+      "grad_norm": 92339.1953125,
+      "learning_rate": 8.256442452453073e-06,
+      "loss": 0.4329,
+      "step": 177500
+    },
+    {
+      "epoch": 5.601533335693522,
+      "grad_norm": 89528.234375,
+      "learning_rate": 8.207481232881975e-06,
+      "loss": 0.4326,
+      "step": 178000
+    },
+    {
+      "epoch": 5.617267924647053,
+      "grad_norm": 98558.296875,
+      "learning_rate": 8.15856437017501e-06,
+      "loss": 0.4347,
+      "step": 178500
+    },
+    {
+      "epoch": 5.6330025136005855,
+      "grad_norm": 96948.5546875,
+      "learning_rate": 8.109693074806891e-06,
+      "loss": 0.4348,
+      "step": 179000
+    },
+    {
+      "epoch": 5.648737102554117,
+      "grad_norm": 99710.2890625,
+      "learning_rate": 8.060868556124735e-06,
+      "loss": 0.4342,
+      "step": 179500
+    },
+    {
+      "epoch": 5.664471691507649,
+      "grad_norm": 103463.359375,
+      "learning_rate": 8.012092022318148e-06,
+      "loss": 0.4304,
+      "step": 180000
+    },
+    {
+      "epoch": 5.680206280461181,
+      "grad_norm": 87654.609375,
+      "learning_rate": 7.963364680389322e-06,
+      "loss": 0.4281,
+      "step": 180500
+    },
+    {
+      "epoch": 5.695940869414713,
+      "grad_norm": 81925.9609375,
+      "learning_rate": 7.914687736123171e-06,
+      "loss": 0.435,
+      "step": 181000
+    },
+    {
+      "epoch": 5.711675458368244,
+      "grad_norm": 89614.90625,
+      "learning_rate": 7.866062394057486e-06,
+      "loss": 0.4346,
+      "step": 181500
+    },
+    {
+      "epoch": 5.727410047321777,
+      "grad_norm": 93440.1171875,
+      "learning_rate": 7.817489857453146e-06,
+      "loss": 0.4336,
+      "step": 182000
+    },
+    {
+      "epoch": 5.743144636275308,
+      "grad_norm": 93339.640625,
+      "learning_rate": 7.768971328264314e-06,
+      "loss": 0.4325,
+      "step": 182500
+    },
+    {
+      "epoch": 5.75887922522884,
+      "grad_norm": 107675.4296875,
+      "learning_rate": 7.720508007108721e-06,
+      "loss": 0.4275,
+      "step": 183000
+    },
+    {
+      "epoch": 5.774613814182372,
+      "grad_norm": 91257.9609375,
+      "learning_rate": 7.672101093237936e-06,
+      "loss": 0.4318,
+      "step": 183500
+    },
+    {
+      "epoch": 5.790348403135903,
+      "grad_norm": 93339.4453125,
+      "learning_rate": 7.623751784507707e-06,
+      "loss": 0.4306,
+      "step": 184000
+    },
+    {
+      "epoch": 5.8060829920894355,
+      "grad_norm": 95532.234375,
+      "learning_rate": 7.575461277348304e-06,
+      "loss": 0.431,
+      "step": 184500
+    },
+    {
+      "epoch": 5.821817581042967,
+      "grad_norm": 95540.015625,
+      "learning_rate": 7.527230766734925e-06,
+      "loss": 0.4297,
+      "step": 185000
+    },
+    {
+      "epoch": 5.837552169996499,
+      "grad_norm": 99399.796875,
+      "learning_rate": 7.479061446158119e-06,
+      "loss": 0.4273,
+      "step": 185500
+    },
+    {
+      "epoch": 5.853286758950031,
+      "grad_norm": 99299.875,
+      "learning_rate": 7.4309545075942494e-06,
+      "loss": 0.4295,
+      "step": 186000
+    },
+    {
+      "epoch": 5.869021347903563,
+      "grad_norm": 95743.7265625,
+      "learning_rate": 7.38291114147601e-06,
+      "loss": 0.4305,
+      "step": 186500
+    },
+    {
+      "epoch": 5.884755936857094,
+      "grad_norm": 97193.7421875,
+      "learning_rate": 7.334932536662957e-06,
+      "loss": 0.432,
+      "step": 187000
+    },
+    {
+      "epoch": 5.900490525810627,
+      "grad_norm": 88953.2421875,
+      "learning_rate": 7.2870198804120874e-06,
+      "loss": 0.4309,
+      "step": 187500
+    },
+    {
+      "epoch": 5.916225114764158,
+      "grad_norm": 101634.828125,
+      "learning_rate": 7.239174358348464e-06,
+      "loss": 0.4289,
+      "step": 188000
+    },
+    {
+      "epoch": 5.93195970371769,
+      "grad_norm": 94037.0,
+      "learning_rate": 7.191397154435893e-06,
+      "loss": 0.4262,
+      "step": 188500
+    },
+    {
+      "epoch": 5.947694292671222,
+      "grad_norm": 91032.2578125,
+      "learning_rate": 7.143689450947593e-06,
+      "loss": 0.4303,
+      "step": 189000
+    },
+    {
+      "epoch": 5.963428881624754,
+      "grad_norm": 100366.0,
+      "learning_rate": 7.096052428436962e-06,
+      "loss": 0.431,
+      "step": 189500
+    },
+    {
+      "epoch": 5.9791634705782855,
+      "grad_norm": 86355.7421875,
+      "learning_rate": 7.048487265708357e-06,
+      "loss": 0.4309,
+      "step": 190000
+    },
+    {
+      "epoch": 5.994898059531817,
+      "grad_norm": 88552.546875,
+      "learning_rate": 7.000995139787929e-06,
+      "loss": 0.4284,
+      "step": 190500
+    },
+    {
+      "epoch": 5.999996066352762,
+      "eval_loss": 0.8569527268409729,
+      "eval_runtime": 4685.7619,
+      "eval_samples_per_second": 54.61,
+      "eval_steps_per_second": 0.853,
+      "step": 190662
     }
   ],
   "logging_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 3.8596166159870546e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null