Training in progress, step 69000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:255db0ba45691d582a9ee109acc81eb21c645c6dd171a1bf2c3e231a0982d734
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78af01c29bc4dc4815cd8cb2a0e12aac4f0221e2ad669f18caa4315a15ef83d7
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1397d04798a1fd86f4b074ba5cc769a269eab9bb0994d2bcfee86faa58f609a6
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16992573f9fe212ca32ce6bacf3d51d66103db65f351073047f24fab4f0f55af
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -12112,11 +12112,189 @@
|
|
| 12112 |
"eval_steps_per_second": 23.177,
|
| 12113 |
"num_input_tokens_seen": 17825787456,
|
| 12114 |
"step": 68000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12115 |
}
|
| 12116 |
],
|
| 12117 |
"logging_steps": 50,
|
| 12118 |
"max_steps": 70000,
|
| 12119 |
-
"num_input_tokens_seen":
|
| 12120 |
"num_train_epochs": 1,
|
| 12121 |
"save_steps": 1000,
|
| 12122 |
"stateful_callbacks": {
|
|
@@ -12131,7 +12309,7 @@
|
|
| 12131 |
"attributes": {}
|
| 12132 |
}
|
| 12133 |
},
|
| 12134 |
-
"total_flos": 4.
|
| 12135 |
"train_batch_size": 64,
|
| 12136 |
"trial_name": null,
|
| 12137 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.3291317361699083,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 69000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 12112 |
"eval_steps_per_second": 23.177,
|
| 12113 |
"num_input_tokens_seen": 17825787456,
|
| 12114 |
"step": 68000
|
| 12115 |
+
},
|
| 12116 |
+
{
|
| 12117 |
+
"epoch": 0.3246002122661197,
|
| 12118 |
+
"grad_norm": 0.15067367255687714,
|
| 12119 |
+
"learning_rate": 4.7109889986402973e-05,
|
| 12120 |
+
"loss": 2.5181,
|
| 12121 |
+
"num_input_tokens_seen": 17838894656,
|
| 12122 |
+
"step": 68050
|
| 12123 |
+
},
|
| 12124 |
+
{
|
| 12125 |
+
"epoch": 0.32483871352421384,
|
| 12126 |
+
"grad_norm": 0.1534261703491211,
|
| 12127 |
+
"learning_rate": 4.476122667059207e-05,
|
| 12128 |
+
"loss": 2.533,
|
| 12129 |
+
"num_input_tokens_seen": 17852001856,
|
| 12130 |
+
"step": 68100
|
| 12131 |
+
},
|
| 12132 |
+
{
|
| 12133 |
+
"epoch": 0.325077214782308,
|
| 12134 |
+
"grad_norm": 0.1585472822189331,
|
| 12135 |
+
"learning_rate": 4.2469871766340095e-05,
|
| 12136 |
+
"loss": 2.509,
|
| 12137 |
+
"num_input_tokens_seen": 17865109056,
|
| 12138 |
+
"step": 68150
|
| 12139 |
+
},
|
| 12140 |
+
{
|
| 12141 |
+
"epoch": 0.3253157160404021,
|
| 12142 |
+
"grad_norm": 0.15480853617191315,
|
| 12143 |
+
"learning_rate": 4.0236113724274713e-05,
|
| 12144 |
+
"loss": 2.524,
|
| 12145 |
+
"num_input_tokens_seen": 17878216256,
|
| 12146 |
+
"step": 68200
|
| 12147 |
+
},
|
| 12148 |
+
{
|
| 12149 |
+
"epoch": 0.32555421729849626,
|
| 12150 |
+
"grad_norm": 0.24341611564159393,
|
| 12151 |
+
"learning_rate": 3.806023374435663e-05,
|
| 12152 |
+
"loss": 2.5293,
|
| 12153 |
+
"num_input_tokens_seen": 17891323456,
|
| 12154 |
+
"step": 68250
|
| 12155 |
+
},
|
| 12156 |
+
{
|
| 12157 |
+
"epoch": 0.32579271855659037,
|
| 12158 |
+
"grad_norm": 0.15290473401546478,
|
| 12159 |
+
"learning_rate": 3.594250574048058e-05,
|
| 12160 |
+
"loss": 2.5149,
|
| 12161 |
+
"num_input_tokens_seen": 17904430656,
|
| 12162 |
+
"step": 68300
|
| 12163 |
+
},
|
| 12164 |
+
{
|
| 12165 |
+
"epoch": 0.3260312198146845,
|
| 12166 |
+
"grad_norm": 0.1606835126876831,
|
| 12167 |
+
"learning_rate": 3.3883196305992905e-05,
|
| 12168 |
+
"loss": 2.5327,
|
| 12169 |
+
"num_input_tokens_seen": 17917537856,
|
| 12170 |
+
"step": 68350
|
| 12171 |
+
},
|
| 12172 |
+
{
|
| 12173 |
+
"epoch": 0.3262697210727787,
|
| 12174 |
+
"grad_norm": 0.1537574976682663,
|
| 12175 |
+
"learning_rate": 3.18825646801314e-05,
|
| 12176 |
+
"loss": 2.5416,
|
| 12177 |
+
"num_input_tokens_seen": 17930645056,
|
| 12178 |
+
"step": 68400
|
| 12179 |
+
},
|
| 12180 |
+
{
|
| 12181 |
+
"epoch": 0.3265082223308728,
|
| 12182 |
+
"grad_norm": 0.16943201422691345,
|
| 12183 |
+
"learning_rate": 2.994086271539048e-05,
|
| 12184 |
+
"loss": 2.5233,
|
| 12185 |
+
"num_input_tokens_seen": 17943752256,
|
| 12186 |
+
"step": 68450
|
| 12187 |
+
},
|
| 12188 |
+
{
|
| 12189 |
+
"epoch": 0.32674672358896695,
|
| 12190 |
+
"grad_norm": 0.15832561254501343,
|
| 12191 |
+
"learning_rate": 2.8058334845816213e-05,
|
| 12192 |
+
"loss": 2.5439,
|
| 12193 |
+
"num_input_tokens_seen": 17956859456,
|
| 12194 |
+
"step": 68500
|
| 12195 |
+
},
|
| 12196 |
+
{
|
| 12197 |
+
"epoch": 0.32674672358896695,
|
| 12198 |
+
"eval_loss": 2.4128847122192383,
|
| 12199 |
+
"eval_runtime": 53.1054,
|
| 12200 |
+
"eval_samples_per_second": 94.152,
|
| 12201 |
+
"eval_steps_per_second": 23.538,
|
| 12202 |
+
"num_input_tokens_seen": 17956859456,
|
| 12203 |
+
"step": 68500
|
| 12204 |
+
},
|
| 12205 |
+
{
|
| 12206 |
+
"epoch": 0.32698522484706105,
|
| 12207 |
+
"grad_norm": 0.15245509147644043,
|
| 12208 |
+
"learning_rate": 2.6235218056235634e-05,
|
| 12209 |
+
"loss": 2.5209,
|
| 12210 |
+
"num_input_tokens_seen": 17969966656,
|
| 12211 |
+
"step": 68550
|
| 12212 |
+
},
|
| 12213 |
+
{
|
| 12214 |
+
"epoch": 0.3272237261051552,
|
| 12215 |
+
"grad_norm": 0.15148235857486725,
|
| 12216 |
+
"learning_rate": 2.4471741852423235e-05,
|
| 12217 |
+
"loss": 2.5284,
|
| 12218 |
+
"num_input_tokens_seen": 17983073856,
|
| 12219 |
+
"step": 68600
|
| 12220 |
+
},
|
| 12221 |
+
{
|
| 12222 |
+
"epoch": 0.3274622273632493,
|
| 12223 |
+
"grad_norm": 0.15678688883781433,
|
| 12224 |
+
"learning_rate": 2.276812823220964e-05,
|
| 12225 |
+
"loss": 2.537,
|
| 12226 |
+
"num_input_tokens_seen": 17996181056,
|
| 12227 |
+
"step": 68650
|
| 12228 |
+
},
|
| 12229 |
+
{
|
| 12230 |
+
"epoch": 0.3277007286213435,
|
| 12231 |
+
"grad_norm": 0.15105360746383667,
|
| 12232 |
+
"learning_rate": 2.1124591657534777e-05,
|
| 12233 |
+
"loss": 2.5321,
|
| 12234 |
+
"num_input_tokens_seen": 18009288256,
|
| 12235 |
+
"step": 68700
|
| 12236 |
+
},
|
| 12237 |
+
{
|
| 12238 |
+
"epoch": 0.32793922987943763,
|
| 12239 |
+
"grad_norm": 0.15369552373886108,
|
| 12240 |
+
"learning_rate": 1.9541339027450256e-05,
|
| 12241 |
+
"loss": 2.5291,
|
| 12242 |
+
"num_input_tokens_seen": 18022395456,
|
| 12243 |
+
"step": 68750
|
| 12244 |
+
},
|
| 12245 |
+
{
|
| 12246 |
+
"epoch": 0.32817773113753174,
|
| 12247 |
+
"grad_norm": 0.1551530808210373,
|
| 12248 |
+
"learning_rate": 1.801856965207338e-05,
|
| 12249 |
+
"loss": 2.5201,
|
| 12250 |
+
"num_input_tokens_seen": 18035502656,
|
| 12251 |
+
"step": 68800
|
| 12252 |
+
},
|
| 12253 |
+
{
|
| 12254 |
+
"epoch": 0.3284162323956259,
|
| 12255 |
+
"grad_norm": 0.14859162271022797,
|
| 12256 |
+
"learning_rate": 1.6556475227496815e-05,
|
| 12257 |
+
"loss": 2.5436,
|
| 12258 |
+
"num_input_tokens_seen": 18048609856,
|
| 12259 |
+
"step": 68850
|
| 12260 |
+
},
|
| 12261 |
+
{
|
| 12262 |
+
"epoch": 0.32865473365372,
|
| 12263 |
+
"grad_norm": 0.14972691237926483,
|
| 12264 |
+
"learning_rate": 1.5155239811656562e-05,
|
| 12265 |
+
"loss": 2.5221,
|
| 12266 |
+
"num_input_tokens_seen": 18061717056,
|
| 12267 |
+
"step": 68900
|
| 12268 |
+
},
|
| 12269 |
+
{
|
| 12270 |
+
"epoch": 0.32889323491181416,
|
| 12271 |
+
"grad_norm": 0.156805619597435,
|
| 12272 |
+
"learning_rate": 1.3815039801161721e-05,
|
| 12273 |
+
"loss": 2.5248,
|
| 12274 |
+
"num_input_tokens_seen": 18074824256,
|
| 12275 |
+
"step": 68950
|
| 12276 |
+
},
|
| 12277 |
+
{
|
| 12278 |
+
"epoch": 0.3291317361699083,
|
| 12279 |
+
"grad_norm": 0.148334801197052,
|
| 12280 |
+
"learning_rate": 1.2536043909088191e-05,
|
| 12281 |
+
"loss": 2.5361,
|
| 12282 |
+
"num_input_tokens_seen": 18087931456,
|
| 12283 |
+
"step": 69000
|
| 12284 |
+
},
|
| 12285 |
+
{
|
| 12286 |
+
"epoch": 0.3291317361699083,
|
| 12287 |
+
"eval_loss": 2.4120428562164307,
|
| 12288 |
+
"eval_runtime": 52.9258,
|
| 12289 |
+
"eval_samples_per_second": 94.472,
|
| 12290 |
+
"eval_steps_per_second": 23.618,
|
| 12291 |
+
"num_input_tokens_seen": 18087931456,
|
| 12292 |
+
"step": 69000
|
| 12293 |
}
|
| 12294 |
],
|
| 12295 |
"logging_steps": 50,
|
| 12296 |
"max_steps": 70000,
|
| 12297 |
+
"num_input_tokens_seen": 18087931456,
|
| 12298 |
"num_train_epochs": 1,
|
| 12299 |
"save_steps": 1000,
|
| 12300 |
"stateful_callbacks": {
|
|
|
|
| 12309 |
"attributes": {}
|
| 12310 |
}
|
| 12311 |
},
|
| 12312 |
+
"total_flos": 4.838698202691011e+18,
|
| 12313 |
"train_batch_size": 64,
|
| 12314 |
"trial_name": null,
|
| 12315 |
"trial_params": null
|