Training in progress, step 69000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c67ab3cac009a5afdc201af7f0117dd68a478413d54e0923fe125d5f63dd515
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d01209ddef39b46affb20fe03502cb8000499194b31764df158aa95dc134101e
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1397d04798a1fd86f4b074ba5cc769a269eab9bb0994d2bcfee86faa58f609a6
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bba508cada3fb6a2130ffab8142880b38ad6264731466b5965eb74743d23afc9
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -12112,11 +12112,189 @@
|
|
| 12112 |
"eval_steps_per_second": 23.197,
|
| 12113 |
"num_input_tokens_seen": 17825792000,
|
| 12114 |
"step": 68000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12115 |
}
|
| 12116 |
],
|
| 12117 |
"logging_steps": 50,
|
| 12118 |
"max_steps": 70000,
|
| 12119 |
-
"num_input_tokens_seen":
|
| 12120 |
"num_train_epochs": 1,
|
| 12121 |
"save_steps": 1000,
|
| 12122 |
"stateful_callbacks": {
|
|
@@ -12131,7 +12309,7 @@
|
|
| 12131 |
"attributes": {}
|
| 12132 |
}
|
| 12133 |
},
|
| 12134 |
-
"total_flos": 4.
|
| 12135 |
"train_batch_size": 64,
|
| 12136 |
"trial_name": null,
|
| 12137 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4641284223165053,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 69000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 12112 |
"eval_steps_per_second": 23.197,
|
| 12113 |
"num_input_tokens_seen": 17825792000,
|
| 12114 |
"step": 68000
|
| 12115 |
+
},
|
| 12116 |
+
{
|
| 12117 |
+
"epoch": 0.4577382483860607,
|
| 12118 |
+
"grad_norm": 0.14517797529697418,
|
| 12119 |
+
"learning_rate": 3.6204032372137984e-05,
|
| 12120 |
+
"loss": 2.9674,
|
| 12121 |
+
"num_input_tokens_seen": 17838899200,
|
| 12122 |
+
"step": 68050
|
| 12123 |
+
},
|
| 12124 |
+
{
|
| 12125 |
+
"epoch": 0.4580745733297683,
|
| 12126 |
+
"grad_norm": 0.14154207706451416,
|
| 12127 |
+
"learning_rate": 3.439253262059822e-05,
|
| 12128 |
+
"loss": 2.9627,
|
| 12129 |
+
"num_input_tokens_seen": 17852006400,
|
| 12130 |
+
"step": 68100
|
| 12131 |
+
},
|
| 12132 |
+
{
|
| 12133 |
+
"epoch": 0.4584108982734759,
|
| 12134 |
+
"grad_norm": 0.14251314103603363,
|
| 12135 |
+
"learning_rate": 3.2625909085853776e-05,
|
| 12136 |
+
"loss": 2.9681,
|
| 12137 |
+
"num_input_tokens_seen": 17865113600,
|
| 12138 |
+
"step": 68150
|
| 12139 |
+
},
|
| 12140 |
+
{
|
| 12141 |
+
"epoch": 0.45874722321718353,
|
| 12142 |
+
"grad_norm": 0.15670983493328094,
|
| 12143 |
+
"learning_rate": 3.0904332038757974e-05,
|
| 12144 |
+
"loss": 2.9708,
|
| 12145 |
+
"num_input_tokens_seen": 17878220800,
|
| 12146 |
+
"step": 68200
|
| 12147 |
+
},
|
| 12148 |
+
{
|
| 12149 |
+
"epoch": 0.45908354816089114,
|
| 12150 |
+
"grad_norm": 0.1453925371170044,
|
| 12151 |
+
"learning_rate": 2.9227967408489654e-05,
|
| 12152 |
+
"loss": 2.9686,
|
| 12153 |
+
"num_input_tokens_seen": 17891328000,
|
| 12154 |
+
"step": 68250
|
| 12155 |
+
},
|
| 12156 |
+
{
|
| 12157 |
+
"epoch": 0.45941987310459875,
|
| 12158 |
+
"grad_norm": 0.13307476043701172,
|
| 12159 |
+
"learning_rate": 2.7596976766560976e-05,
|
| 12160 |
+
"loss": 2.9595,
|
| 12161 |
+
"num_input_tokens_seen": 17904435200,
|
| 12162 |
+
"step": 68300
|
| 12163 |
+
},
|
| 12164 |
+
{
|
| 12165 |
+
"epoch": 0.45975619804830636,
|
| 12166 |
+
"grad_norm": 0.14958307147026062,
|
| 12167 |
+
"learning_rate": 2.6011517311244848e-05,
|
| 12168 |
+
"loss": 2.9661,
|
| 12169 |
+
"num_input_tokens_seen": 17917542400,
|
| 12170 |
+
"step": 68350
|
| 12171 |
+
},
|
| 12172 |
+
{
|
| 12173 |
+
"epoch": 0.460092522992014,
|
| 12174 |
+
"grad_norm": 0.14210085570812225,
|
| 12175 |
+
"learning_rate": 2.4471741852423235e-05,
|
| 12176 |
+
"loss": 2.9737,
|
| 12177 |
+
"num_input_tokens_seen": 17930649600,
|
| 12178 |
+
"step": 68400
|
| 12179 |
+
},
|
| 12180 |
+
{
|
| 12181 |
+
"epoch": 0.4604288479357216,
|
| 12182 |
+
"grad_norm": 0.15127155184745789,
|
| 12183 |
+
"learning_rate": 2.2977798796859794e-05,
|
| 12184 |
+
"loss": 2.9627,
|
| 12185 |
+
"num_input_tokens_seen": 17943756800,
|
| 12186 |
+
"step": 68450
|
| 12187 |
+
},
|
| 12188 |
+
{
|
| 12189 |
+
"epoch": 0.4607651728794292,
|
| 12190 |
+
"grad_norm": 0.14184921979904175,
|
| 12191 |
+
"learning_rate": 2.152983213389559e-05,
|
| 12192 |
+
"loss": 2.9732,
|
| 12193 |
+
"num_input_tokens_seen": 17956864000,
|
| 12194 |
+
"step": 68500
|
| 12195 |
+
},
|
| 12196 |
+
{
|
| 12197 |
+
"epoch": 0.4607651728794292,
|
| 12198 |
+
"eval_loss": 2.865307331085205,
|
| 12199 |
+
"eval_runtime": 53.2908,
|
| 12200 |
+
"eval_samples_per_second": 93.825,
|
| 12201 |
+
"eval_steps_per_second": 23.456,
|
| 12202 |
+
"num_input_tokens_seen": 17956864000,
|
| 12203 |
+
"step": 68500
|
| 12204 |
+
},
|
| 12205 |
+
{
|
| 12206 |
+
"epoch": 0.4611014978231368,
|
| 12207 |
+
"grad_norm": 0.14755961298942566,
|
| 12208 |
+
"learning_rate": 2.0127981421571295e-05,
|
| 12209 |
+
"loss": 2.9687,
|
| 12210 |
+
"num_input_tokens_seen": 17969971200,
|
| 12211 |
+
"step": 68550
|
| 12212 |
+
},
|
| 12213 |
+
{
|
| 12214 |
+
"epoch": 0.4614378227668444,
|
| 12215 |
+
"grad_norm": 0.1370965540409088,
|
| 12216 |
+
"learning_rate": 1.8772381773176416e-05,
|
| 12217 |
+
"loss": 2.9711,
|
| 12218 |
+
"num_input_tokens_seen": 17983078400,
|
| 12219 |
+
"step": 68600
|
| 12220 |
+
},
|
| 12221 |
+
{
|
| 12222 |
+
"epoch": 0.46177414771055203,
|
| 12223 |
+
"grad_norm": 0.14454130828380585,
|
| 12224 |
+
"learning_rate": 1.7463163844226305e-05,
|
| 12225 |
+
"loss": 2.9633,
|
| 12226 |
+
"num_input_tokens_seen": 17996185600,
|
| 12227 |
+
"step": 68650
|
| 12228 |
+
},
|
| 12229 |
+
{
|
| 12230 |
+
"epoch": 0.46211047265425964,
|
| 12231 |
+
"grad_norm": 0.13908445835113525,
|
| 12232 |
+
"learning_rate": 1.620045381987012e-05,
|
| 12233 |
+
"loss": 2.9662,
|
| 12234 |
+
"num_input_tokens_seen": 18009292800,
|
| 12235 |
+
"step": 68700
|
| 12236 |
+
},
|
| 12237 |
+
{
|
| 12238 |
+
"epoch": 0.46244679759796725,
|
| 12239 |
+
"grad_norm": 0.2359876185655594,
|
| 12240 |
+
"learning_rate": 1.4984373402728013e-05,
|
| 12241 |
+
"loss": 2.9671,
|
| 12242 |
+
"num_input_tokens_seen": 18022400000,
|
| 12243 |
+
"step": 68750
|
| 12244 |
+
},
|
| 12245 |
+
{
|
| 12246 |
+
"epoch": 0.46278312254167486,
|
| 12247 |
+
"grad_norm": 0.13809122145175934,
|
| 12248 |
+
"learning_rate": 1.3815039801161721e-05,
|
| 12249 |
+
"loss": 2.9684,
|
| 12250 |
+
"num_input_tokens_seen": 18035507200,
|
| 12251 |
+
"step": 68800
|
| 12252 |
+
},
|
| 12253 |
+
{
|
| 12254 |
+
"epoch": 0.4631194474853825,
|
| 12255 |
+
"grad_norm": 0.14375115931034088,
|
| 12256 |
+
"learning_rate": 1.26925657179775e-05,
|
| 12257 |
+
"loss": 2.9677,
|
| 12258 |
+
"num_input_tokens_seen": 18048614400,
|
| 12259 |
+
"step": 68850
|
| 12260 |
+
},
|
| 12261 |
+
{
|
| 12262 |
+
"epoch": 0.4634557724290901,
|
| 12263 |
+
"grad_norm": 0.14648525416851044,
|
| 12264 |
+
"learning_rate": 1.1617059339563806e-05,
|
| 12265 |
+
"loss": 2.9625,
|
| 12266 |
+
"num_input_tokens_seen": 18061721600,
|
| 12267 |
+
"step": 68900
|
| 12268 |
+
},
|
| 12269 |
+
{
|
| 12270 |
+
"epoch": 0.4637920973727977,
|
| 12271 |
+
"grad_norm": 0.1428016871213913,
|
| 12272 |
+
"learning_rate": 1.058862432546387e-05,
|
| 12273 |
+
"loss": 2.9717,
|
| 12274 |
+
"num_input_tokens_seen": 18074828800,
|
| 12275 |
+
"step": 68950
|
| 12276 |
+
},
|
| 12277 |
+
{
|
| 12278 |
+
"epoch": 0.4641284223165053,
|
| 12279 |
+
"grad_norm": 0.14518927037715912,
|
| 12280 |
+
"learning_rate": 9.607359798384786e-06,
|
| 12281 |
+
"loss": 2.9622,
|
| 12282 |
+
"num_input_tokens_seen": 18087936000,
|
| 12283 |
+
"step": 69000
|
| 12284 |
+
},
|
| 12285 |
+
{
|
| 12286 |
+
"epoch": 0.4641284223165053,
|
| 12287 |
+
"eval_loss": 2.8647797107696533,
|
| 12288 |
+
"eval_runtime": 53.1259,
|
| 12289 |
+
"eval_samples_per_second": 94.116,
|
| 12290 |
+
"eval_steps_per_second": 23.529,
|
| 12291 |
+
"num_input_tokens_seen": 18087936000,
|
| 12292 |
+
"step": 69000
|
| 12293 |
}
|
| 12294 |
],
|
| 12295 |
"logging_steps": 50,
|
| 12296 |
"max_steps": 70000,
|
| 12297 |
+
"num_input_tokens_seen": 18087936000,
|
| 12298 |
"num_train_epochs": 1,
|
| 12299 |
"save_steps": 1000,
|
| 12300 |
"stateful_callbacks": {
|
|
|
|
| 12309 |
"attributes": {}
|
| 12310 |
}
|
| 12311 |
},
|
| 12312 |
+
"total_flos": 4.83869941825536e+18,
|
| 12313 |
"train_batch_size": 64,
|
| 12314 |
"trial_name": null,
|
| 12315 |
"trial_params": null
|