Training in progress, step 52000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3592942d50fd128f616a1b607af53de041def2895dde8221a2068841bbfc75f
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c28c8ab74c2ab24140a66eba7b08b4da3f0a1c0487aa3d24a61f15278b3cefdb
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:746267b8ba996549a033d105e363328c635034a7afa0e3070ea8447957aaca5a
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24b3fcbecd3d55078c913506015bb6e1182f04ee52bf4c0845fc043823a61161
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9086,11 +9086,189 @@
|
|
| 9086 |
"eval_steps_per_second": 23.341,
|
| 9087 |
"num_input_tokens_seen": 13369339456,
|
| 9088 |
"step": 51000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9089 |
}
|
| 9090 |
],
|
| 9091 |
"logging_steps": 50,
|
| 9092 |
"max_steps": 70000,
|
| 9093 |
-
"num_input_tokens_seen":
|
| 9094 |
"num_train_epochs": 1,
|
| 9095 |
"save_steps": 1000,
|
| 9096 |
"stateful_callbacks": {
|
|
@@ -9105,7 +9283,7 @@
|
|
| 9105 |
"attributes": {}
|
| 9106 |
}
|
| 9107 |
},
|
| 9108 |
-
"total_flos": 3.
|
| 9109 |
"train_batch_size": 64,
|
| 9110 |
"trial_name": null,
|
| 9111 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2480413084179019,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 52000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9086 |
"eval_steps_per_second": 23.341,
|
| 9087 |
"num_input_tokens_seen": 13369339456,
|
| 9088 |
"step": 51000
|
| 9089 |
+
},
|
| 9090 |
+
{
|
| 9091 |
+
"epoch": 0.24350978451411331,
|
| 9092 |
+
"grad_norm": 0.20502831041812897,
|
| 9093 |
+
"learning_rate": 0.001,
|
| 9094 |
+
"loss": 2.6059,
|
| 9095 |
+
"num_input_tokens_seen": 13382446656,
|
| 9096 |
+
"step": 51050
|
| 9097 |
+
},
|
| 9098 |
+
{
|
| 9099 |
+
"epoch": 0.24374828577220745,
|
| 9100 |
+
"grad_norm": 0.20750559866428375,
|
| 9101 |
+
"learning_rate": 0.001,
|
| 9102 |
+
"loss": 2.6056,
|
| 9103 |
+
"num_input_tokens_seen": 13395553856,
|
| 9104 |
+
"step": 51100
|
| 9105 |
+
},
|
| 9106 |
+
{
|
| 9107 |
+
"epoch": 0.24398678703030158,
|
| 9108 |
+
"grad_norm": 0.19882823526859283,
|
| 9109 |
+
"learning_rate": 0.001,
|
| 9110 |
+
"loss": 2.5983,
|
| 9111 |
+
"num_input_tokens_seen": 13408661056,
|
| 9112 |
+
"step": 51150
|
| 9113 |
+
},
|
| 9114 |
+
{
|
| 9115 |
+
"epoch": 0.2442252882883957,
|
| 9116 |
+
"grad_norm": 0.20900660753250122,
|
| 9117 |
+
"learning_rate": 0.001,
|
| 9118 |
+
"loss": 2.6087,
|
| 9119 |
+
"num_input_tokens_seen": 13421768256,
|
| 9120 |
+
"step": 51200
|
| 9121 |
+
},
|
| 9122 |
+
{
|
| 9123 |
+
"epoch": 0.24446378954648987,
|
| 9124 |
+
"grad_norm": 0.21428415179252625,
|
| 9125 |
+
"learning_rate": 0.001,
|
| 9126 |
+
"loss": 2.5901,
|
| 9127 |
+
"num_input_tokens_seen": 13434875456,
|
| 9128 |
+
"step": 51250
|
| 9129 |
+
},
|
| 9130 |
+
{
|
| 9131 |
+
"epoch": 0.244702290804584,
|
| 9132 |
+
"grad_norm": 0.19987250864505768,
|
| 9133 |
+
"learning_rate": 0.001,
|
| 9134 |
+
"loss": 2.5982,
|
| 9135 |
+
"num_input_tokens_seen": 13447982656,
|
| 9136 |
+
"step": 51300
|
| 9137 |
+
},
|
| 9138 |
+
{
|
| 9139 |
+
"epoch": 0.24494079206267813,
|
| 9140 |
+
"grad_norm": 0.2045862078666687,
|
| 9141 |
+
"learning_rate": 0.001,
|
| 9142 |
+
"loss": 2.6058,
|
| 9143 |
+
"num_input_tokens_seen": 13461089856,
|
| 9144 |
+
"step": 51350
|
| 9145 |
+
},
|
| 9146 |
+
{
|
| 9147 |
+
"epoch": 0.24517929332077226,
|
| 9148 |
+
"grad_norm": 0.22261273860931396,
|
| 9149 |
+
"learning_rate": 0.001,
|
| 9150 |
+
"loss": 2.5972,
|
| 9151 |
+
"num_input_tokens_seen": 13474197056,
|
| 9152 |
+
"step": 51400
|
| 9153 |
+
},
|
| 9154 |
+
{
|
| 9155 |
+
"epoch": 0.2454177945788664,
|
| 9156 |
+
"grad_norm": 0.20395706593990326,
|
| 9157 |
+
"learning_rate": 0.001,
|
| 9158 |
+
"loss": 2.6064,
|
| 9159 |
+
"num_input_tokens_seen": 13487304256,
|
| 9160 |
+
"step": 51450
|
| 9161 |
+
},
|
| 9162 |
+
{
|
| 9163 |
+
"epoch": 0.24565629583696055,
|
| 9164 |
+
"grad_norm": 0.21490858495235443,
|
| 9165 |
+
"learning_rate": 0.001,
|
| 9166 |
+
"loss": 2.5922,
|
| 9167 |
+
"num_input_tokens_seen": 13500411456,
|
| 9168 |
+
"step": 51500
|
| 9169 |
+
},
|
| 9170 |
+
{
|
| 9171 |
+
"epoch": 0.24565629583696055,
|
| 9172 |
+
"eval_loss": 2.488300085067749,
|
| 9173 |
+
"eval_runtime": 53.7972,
|
| 9174 |
+
"eval_samples_per_second": 92.942,
|
| 9175 |
+
"eval_steps_per_second": 23.235,
|
| 9176 |
+
"num_input_tokens_seen": 13500411456,
|
| 9177 |
+
"step": 51500
|
| 9178 |
+
},
|
| 9179 |
+
{
|
| 9180 |
+
"epoch": 0.24589479709505468,
|
| 9181 |
+
"grad_norm": 0.2039102464914322,
|
| 9182 |
+
"learning_rate": 0.001,
|
| 9183 |
+
"loss": 2.5894,
|
| 9184 |
+
"num_input_tokens_seen": 13513518656,
|
| 9185 |
+
"step": 51550
|
| 9186 |
+
},
|
| 9187 |
+
{
|
| 9188 |
+
"epoch": 0.24613329835314882,
|
| 9189 |
+
"grad_norm": 0.21426360309123993,
|
| 9190 |
+
"learning_rate": 0.001,
|
| 9191 |
+
"loss": 2.6089,
|
| 9192 |
+
"num_input_tokens_seen": 13526625856,
|
| 9193 |
+
"step": 51600
|
| 9194 |
+
},
|
| 9195 |
+
{
|
| 9196 |
+
"epoch": 0.24637179961124295,
|
| 9197 |
+
"grad_norm": 0.194682314991951,
|
| 9198 |
+
"learning_rate": 0.001,
|
| 9199 |
+
"loss": 2.5932,
|
| 9200 |
+
"num_input_tokens_seen": 13539733056,
|
| 9201 |
+
"step": 51650
|
| 9202 |
+
},
|
| 9203 |
+
{
|
| 9204 |
+
"epoch": 0.24661030086933708,
|
| 9205 |
+
"grad_norm": 0.1901472508907318,
|
| 9206 |
+
"learning_rate": 0.001,
|
| 9207 |
+
"loss": 2.6031,
|
| 9208 |
+
"num_input_tokens_seen": 13552840256,
|
| 9209 |
+
"step": 51700
|
| 9210 |
+
},
|
| 9211 |
+
{
|
| 9212 |
+
"epoch": 0.2468488021274312,
|
| 9213 |
+
"grad_norm": 0.20517823100090027,
|
| 9214 |
+
"learning_rate": 0.001,
|
| 9215 |
+
"loss": 2.5978,
|
| 9216 |
+
"num_input_tokens_seen": 13565947456,
|
| 9217 |
+
"step": 51750
|
| 9218 |
+
},
|
| 9219 |
+
{
|
| 9220 |
+
"epoch": 0.24708730338552537,
|
| 9221 |
+
"grad_norm": 0.23713302612304688,
|
| 9222 |
+
"learning_rate": 0.001,
|
| 9223 |
+
"loss": 2.6061,
|
| 9224 |
+
"num_input_tokens_seen": 13579054656,
|
| 9225 |
+
"step": 51800
|
| 9226 |
+
},
|
| 9227 |
+
{
|
| 9228 |
+
"epoch": 0.2473258046436195,
|
| 9229 |
+
"grad_norm": 0.2431441992521286,
|
| 9230 |
+
"learning_rate": 0.001,
|
| 9231 |
+
"loss": 2.6062,
|
| 9232 |
+
"num_input_tokens_seen": 13592161856,
|
| 9233 |
+
"step": 51850
|
| 9234 |
+
},
|
| 9235 |
+
{
|
| 9236 |
+
"epoch": 0.24756430590171363,
|
| 9237 |
+
"grad_norm": 0.20358557999134064,
|
| 9238 |
+
"learning_rate": 0.001,
|
| 9239 |
+
"loss": 2.6161,
|
| 9240 |
+
"num_input_tokens_seen": 13605269056,
|
| 9241 |
+
"step": 51900
|
| 9242 |
+
},
|
| 9243 |
+
{
|
| 9244 |
+
"epoch": 0.24780280715980776,
|
| 9245 |
+
"grad_norm": 0.21245016157627106,
|
| 9246 |
+
"learning_rate": 0.001,
|
| 9247 |
+
"loss": 2.6166,
|
| 9248 |
+
"num_input_tokens_seen": 13618376256,
|
| 9249 |
+
"step": 51950
|
| 9250 |
+
},
|
| 9251 |
+
{
|
| 9252 |
+
"epoch": 0.2480413084179019,
|
| 9253 |
+
"grad_norm": 0.24295999109745026,
|
| 9254 |
+
"learning_rate": 0.001,
|
| 9255 |
+
"loss": 2.6139,
|
| 9256 |
+
"num_input_tokens_seen": 13631483456,
|
| 9257 |
+
"step": 52000
|
| 9258 |
+
},
|
| 9259 |
+
{
|
| 9260 |
+
"epoch": 0.2480413084179019,
|
| 9261 |
+
"eval_loss": 2.4932186603546143,
|
| 9262 |
+
"eval_runtime": 53.6797,
|
| 9263 |
+
"eval_samples_per_second": 93.145,
|
| 9264 |
+
"eval_steps_per_second": 23.286,
|
| 9265 |
+
"num_input_tokens_seen": 13631483456,
|
| 9266 |
+
"step": 52000
|
| 9267 |
}
|
| 9268 |
],
|
| 9269 |
"logging_steps": 50,
|
| 9270 |
"max_steps": 70000,
|
| 9271 |
+
"num_input_tokens_seen": 13631483456,
|
| 9272 |
"num_train_epochs": 1,
|
| 9273 |
"save_steps": 1000,
|
| 9274 |
"stateful_callbacks": {
|
|
|
|
| 9283 |
"attributes": {}
|
| 9284 |
}
|
| 9285 |
},
|
| 9286 |
+
"total_flos": 3.6465548677585306e+18,
|
| 9287 |
"train_batch_size": 64,
|
| 9288 |
"trial_name": null,
|
| 9289 |
"trial_params": null
|