Azrail commited on
Commit
0907f49
·
verified ·
1 Parent(s): d983cfc

Training in progress, step 159000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:416215f0149ccdc8795d454a743c8eca2679e864ee093c2c92b9a7dc89d715bb
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae4822495b17518f3a55422be75e8f39fa5ce0b3a594c8d20dc409f424fc55f1
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f007001464c0735c2a1e81f89ec3fb177e1babb5501510e4e9c8bbe36278a009
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a748f752d89b80e6b8bbdf3f3863033a42fa4fd7d76ff87d364cfe1c8a3f1531
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09a1e45eb7b9bd5bee8831d08f28097d6e76d93bacd9f185db082ca81501cddf
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c359d76fc12146bca34b3d81bea11d1cbc59763f4362e664b11294254a3637f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30aae8d205cf2e798817f5244eb6202efaefc5672b52769c357c6911e1e312c2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcf0ec0a64c804385e2c0458be72943b2083978ccaa5abb266ef0e69fa674231
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.4706886977079106,
6
  "eval_steps": 500,
7
- "global_step": 158000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -28132,11 +28132,189 @@
28132
  "eval_steps_per_second": 15.499,
28133
  "num_input_tokens_seen": 92262378048,
28134
  "step": 158000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28135
  }
28136
  ],
28137
  "logging_steps": 50,
28138
  "max_steps": 200000,
28139
- "num_input_tokens_seen": 92262378048,
28140
  "num_train_epochs": 5,
28141
  "save_steps": 1000,
28142
  "stateful_callbacks": {
@@ -28151,7 +28329,7 @@
28151
  "attributes": {}
28152
  }
28153
  },
28154
- "total_flos": 1.632875804872041e+20,
28155
  "train_batch_size": 32,
28156
  "trial_name": null,
28157
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.492654756029168,
6
  "eval_steps": 500,
7
+ "global_step": 159000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
28132
  "eval_steps_per_second": 15.499,
28133
  "num_input_tokens_seen": 92262378048,
28134
  "step": 158000
28135
+ },
28136
+ {
28137
+ "epoch": 3.4717870006239733,
28138
+ "grad_norm": 0.09290427714586258,
28139
+ "learning_rate": 0.0001,
28140
+ "loss": 2.3419,
28141
+ "num_input_tokens_seen": 92314806848,
28142
+ "step": 158050
28143
+ },
28144
+ {
28145
+ "epoch": 3.4728853035400364,
28146
+ "grad_norm": 0.09033751487731934,
28147
+ "learning_rate": 0.0001,
28148
+ "loss": 2.344,
28149
+ "num_input_tokens_seen": 92367235648,
28150
+ "step": 158100
28151
+ },
28152
+ {
28153
+ "epoch": 3.473983606456099,
28154
+ "grad_norm": 0.0893242284655571,
28155
+ "learning_rate": 0.0001,
28156
+ "loss": 2.342,
28157
+ "num_input_tokens_seen": 92419664448,
28158
+ "step": 158150
28159
+ },
28160
+ {
28161
+ "epoch": 3.475081909372162,
28162
+ "grad_norm": 0.09877942502498627,
28163
+ "learning_rate": 0.0001,
28164
+ "loss": 2.3459,
28165
+ "num_input_tokens_seen": 92472093248,
28166
+ "step": 158200
28167
+ },
28168
+ {
28169
+ "epoch": 3.476180212288225,
28170
+ "grad_norm": 0.09732919931411743,
28171
+ "learning_rate": 0.0001,
28172
+ "loss": 2.3476,
28173
+ "num_input_tokens_seen": 92524522048,
28174
+ "step": 158250
28175
+ },
28176
+ {
28177
+ "epoch": 3.4772785152042878,
28178
+ "grad_norm": 0.08927954733371735,
28179
+ "learning_rate": 0.0001,
28180
+ "loss": 2.349,
28181
+ "num_input_tokens_seen": 92576950848,
28182
+ "step": 158300
28183
+ },
28184
+ {
28185
+ "epoch": 3.4783768181203505,
28186
+ "grad_norm": 0.09306230396032333,
28187
+ "learning_rate": 0.0001,
28188
+ "loss": 2.3396,
28189
+ "num_input_tokens_seen": 92629379648,
28190
+ "step": 158350
28191
+ },
28192
+ {
28193
+ "epoch": 3.4794751210364137,
28194
+ "grad_norm": 0.09522947669029236,
28195
+ "learning_rate": 0.0001,
28196
+ "loss": 2.3394,
28197
+ "num_input_tokens_seen": 92681808448,
28198
+ "step": 158400
28199
+ },
28200
+ {
28201
+ "epoch": 3.4805734239524764,
28202
+ "grad_norm": 0.09624941647052765,
28203
+ "learning_rate": 0.0001,
28204
+ "loss": 2.3475,
28205
+ "num_input_tokens_seen": 92734237248,
28206
+ "step": 158450
28207
+ },
28208
+ {
28209
+ "epoch": 3.481671726868539,
28210
+ "grad_norm": 0.09459653496742249,
28211
+ "learning_rate": 0.0001,
28212
+ "loss": 2.3431,
28213
+ "num_input_tokens_seen": 92786665216,
28214
+ "step": 158500
28215
+ },
28216
+ {
28217
+ "epoch": 3.481671726868539,
28218
+ "eval_loss": 2.257422685623169,
28219
+ "eval_runtime": 80.6973,
28220
+ "eval_samples_per_second": 61.96,
28221
+ "eval_steps_per_second": 15.49,
28222
+ "num_input_tokens_seen": 92786665216,
28223
+ "step": 158500
28224
+ },
28225
+ {
28226
+ "epoch": 3.4827700297846023,
28227
+ "grad_norm": 0.09564249962568283,
28228
+ "learning_rate": 0.0001,
28229
+ "loss": 2.341,
28230
+ "num_input_tokens_seen": 92839093024,
28231
+ "step": 158550
28232
+ },
28233
+ {
28234
+ "epoch": 3.483868332700665,
28235
+ "grad_norm": 0.10864699631929398,
28236
+ "learning_rate": 0.0001,
28237
+ "loss": 2.3405,
28238
+ "num_input_tokens_seen": 92891521824,
28239
+ "step": 158600
28240
+ },
28241
+ {
28242
+ "epoch": 3.484966635616728,
28243
+ "grad_norm": 0.09777586907148361,
28244
+ "learning_rate": 0.0001,
28245
+ "loss": 2.3445,
28246
+ "num_input_tokens_seen": 92943950624,
28247
+ "step": 158650
28248
+ },
28249
+ {
28250
+ "epoch": 3.486064938532791,
28251
+ "grad_norm": 0.09032690525054932,
28252
+ "learning_rate": 0.0001,
28253
+ "loss": 2.3423,
28254
+ "num_input_tokens_seen": 92996375712,
28255
+ "step": 158700
28256
+ },
28257
+ {
28258
+ "epoch": 3.4871632414488536,
28259
+ "grad_norm": 0.09027489274740219,
28260
+ "learning_rate": 0.0001,
28261
+ "loss": 2.3412,
28262
+ "num_input_tokens_seen": 93048803136,
28263
+ "step": 158750
28264
+ },
28265
+ {
28266
+ "epoch": 3.4882615443649168,
28267
+ "grad_norm": 0.09923077374696732,
28268
+ "learning_rate": 0.0001,
28269
+ "loss": 2.3455,
28270
+ "num_input_tokens_seen": 93101231936,
28271
+ "step": 158800
28272
+ },
28273
+ {
28274
+ "epoch": 3.4893598472809795,
28275
+ "grad_norm": 0.10047315806150436,
28276
+ "learning_rate": 0.0001,
28277
+ "loss": 2.3416,
28278
+ "num_input_tokens_seen": 93153660736,
28279
+ "step": 158850
28280
+ },
28281
+ {
28282
+ "epoch": 3.4904581501970426,
28283
+ "grad_norm": 0.0912187322974205,
28284
+ "learning_rate": 0.0001,
28285
+ "loss": 2.3437,
28286
+ "num_input_tokens_seen": 93206089536,
28287
+ "step": 158900
28288
+ },
28289
+ {
28290
+ "epoch": 3.4915564531131054,
28291
+ "grad_norm": 0.09997432678937912,
28292
+ "learning_rate": 0.0001,
28293
+ "loss": 2.341,
28294
+ "num_input_tokens_seen": 93258518336,
28295
+ "step": 158950
28296
+ },
28297
+ {
28298
+ "epoch": 3.492654756029168,
28299
+ "grad_norm": 0.09082050621509552,
28300
+ "learning_rate": 0.0001,
28301
+ "loss": 2.3338,
28302
+ "num_input_tokens_seen": 93310947136,
28303
+ "step": 159000
28304
+ },
28305
+ {
28306
+ "epoch": 3.492654756029168,
28307
+ "eval_loss": 2.2572686672210693,
28308
+ "eval_runtime": 80.2123,
28309
+ "eval_samples_per_second": 62.335,
28310
+ "eval_steps_per_second": 15.584,
28311
+ "num_input_tokens_seen": 93310947136,
28312
+ "step": 159000
28313
  }
28314
  ],
28315
  "logging_steps": 50,
28316
  "max_steps": 200000,
28317
+ "num_input_tokens_seen": 93310947136,
28318
  "num_train_epochs": 5,
28319
  "save_steps": 1000,
28320
  "stateful_callbacks": {
 
28329
  "attributes": {}
28330
  }
28331
  },
28332
+ "total_flos": 1.651433565139625e+20,
28333
  "train_batch_size": 32,
28334
  "trial_name": null,
28335
  "trial_params": null