Plofski commited on
Commit
d75a2fc
·
verified ·
1 Parent(s): d994af2

Training in progress, step 10500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25829a343b7e06cb4e4167e9b46a367935f8229a77e72a1421998542e27d1c90
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cc6d2ac14b136a0c5c39d3842c8290195765d0231c31019222880ab2ada323a
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d82f068e68971eb9728724c53cc1a345fe8d815fa606c2f3450b9b39b939104
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6c4e658acbdc5e0bc6eda245ab297a40c16a3c1814b13d63c1d7cae82962a95
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2aa24cd194618e57510eb16be4a4510b1af7e8497163286c5cb19c98f052ca0
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f1b6e95985cf829ad61f7f680a73f323339cc556ff96e0fd4cb8e86a2237898
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.014910336490026,
6
  "eval_steps": 500,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9008,6 +9008,456 @@
9008
  "mean_token_accuracy": 0.801843786239624,
9009
  "num_tokens": 11076275.0,
9010
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9011
  }
9012
  ],
9013
  "logging_steps": 10,
@@ -9027,7 +9477,7 @@
9027
  "attributes": {}
9028
  }
9029
  },
9030
- "total_flos": 1.3397361208068096e+16,
9031
  "train_batch_size": 8,
9032
  "trial_name": null,
9033
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.1156558533145273,
6
  "eval_steps": 500,
7
+ "global_step": 10500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9008
  "mean_token_accuracy": 0.801843786239624,
9009
  "num_tokens": 11076275.0,
9010
  "step": 10000
9011
+ },
9012
+ {
9013
+ "epoch": 2.016925246826516,
9014
+ "grad_norm": 12.0625,
9015
+ "learning_rate": 6.555174961380886e-06,
9016
+ "loss": 0.7623,
9017
+ "mean_token_accuracy": 0.809873354434967,
9018
+ "num_tokens": 11087022.0,
9019
+ "step": 10010
9020
+ },
9021
+ {
9022
+ "epoch": 2.0189401571630063,
9023
+ "grad_norm": 11.0625,
9024
+ "learning_rate": 6.5417422258042855e-06,
9025
+ "loss": 0.9055,
9026
+ "mean_token_accuracy": 0.7793718695640564,
9027
+ "num_tokens": 11098027.0,
9028
+ "step": 10020
9029
+ },
9030
+ {
9031
+ "epoch": 2.0209550674994965,
9032
+ "grad_norm": 12.1875,
9033
+ "learning_rate": 6.528309490227685e-06,
9034
+ "loss": 0.7758,
9035
+ "mean_token_accuracy": 0.8036232054233551,
9036
+ "num_tokens": 11110290.0,
9037
+ "step": 10030
9038
+ },
9039
+ {
9040
+ "epoch": 2.022969977835986,
9041
+ "grad_norm": 11.6875,
9042
+ "learning_rate": 6.514876754651085e-06,
9043
+ "loss": 0.7201,
9044
+ "mean_token_accuracy": 0.8190805375576019,
9045
+ "num_tokens": 11120919.0,
9046
+ "step": 10040
9047
+ },
9048
+ {
9049
+ "epoch": 2.0249848881724763,
9050
+ "grad_norm": 12.5,
9051
+ "learning_rate": 6.501444019074485e-06,
9052
+ "loss": 0.9213,
9053
+ "mean_token_accuracy": 0.7752482295036316,
9054
+ "num_tokens": 11132545.0,
9055
+ "step": 10050
9056
+ },
9057
+ {
9058
+ "epoch": 2.0269997985089665,
9059
+ "grad_norm": 12.6875,
9060
+ "learning_rate": 6.488011283497885e-06,
9061
+ "loss": 0.8841,
9062
+ "mean_token_accuracy": 0.7821820974349976,
9063
+ "num_tokens": 11143351.0,
9064
+ "step": 10060
9065
+ },
9066
+ {
9067
+ "epoch": 2.029014708845456,
9068
+ "grad_norm": 12.1875,
9069
+ "learning_rate": 6.474578547921284e-06,
9070
+ "loss": 0.7844,
9071
+ "mean_token_accuracy": 0.7987569034099579,
9072
+ "num_tokens": 11153881.0,
9073
+ "step": 10070
9074
+ },
9075
+ {
9076
+ "epoch": 2.0310296191819464,
9077
+ "grad_norm": 11.75,
9078
+ "learning_rate": 6.461145812344684e-06,
9079
+ "loss": 0.8333,
9080
+ "mean_token_accuracy": 0.790239280462265,
9081
+ "num_tokens": 11164316.0,
9082
+ "step": 10080
9083
+ },
9084
+ {
9085
+ "epoch": 2.0330445295184365,
9086
+ "grad_norm": 10.0,
9087
+ "learning_rate": 6.447713076768084e-06,
9088
+ "loss": 0.7952,
9089
+ "mean_token_accuracy": 0.798451715707779,
9090
+ "num_tokens": 11175390.0,
9091
+ "step": 10090
9092
+ },
9093
+ {
9094
+ "epoch": 2.0350594398549267,
9095
+ "grad_norm": 11.125,
9096
+ "learning_rate": 6.434280341191485e-06,
9097
+ "loss": 0.866,
9098
+ "mean_token_accuracy": 0.7892335176467895,
9099
+ "num_tokens": 11187303.0,
9100
+ "step": 10100
9101
+ },
9102
+ {
9103
+ "epoch": 2.0370743501914164,
9104
+ "grad_norm": 12.875,
9105
+ "learning_rate": 6.4208476056148835e-06,
9106
+ "loss": 0.8048,
9107
+ "mean_token_accuracy": 0.796100115776062,
9108
+ "num_tokens": 11198006.0,
9109
+ "step": 10110
9110
+ },
9111
+ {
9112
+ "epoch": 2.0390892605279065,
9113
+ "grad_norm": 11.8125,
9114
+ "learning_rate": 6.407414870038284e-06,
9115
+ "loss": 0.8066,
9116
+ "mean_token_accuracy": 0.796057403087616,
9117
+ "num_tokens": 11207940.0,
9118
+ "step": 10120
9119
+ },
9120
+ {
9121
+ "epoch": 2.0411041708643967,
9122
+ "grad_norm": 12.25,
9123
+ "learning_rate": 6.393982134461684e-06,
9124
+ "loss": 0.7275,
9125
+ "mean_token_accuracy": 0.818196564912796,
9126
+ "num_tokens": 11218927.0,
9127
+ "step": 10130
9128
+ },
9129
+ {
9130
+ "epoch": 2.0431190812008864,
9131
+ "grad_norm": 12.5,
9132
+ "learning_rate": 6.380549398885083e-06,
9133
+ "loss": 1.0617,
9134
+ "mean_token_accuracy": 0.755308473110199,
9135
+ "num_tokens": 11230139.0,
9136
+ "step": 10140
9137
+ },
9138
+ {
9139
+ "epoch": 2.0451339915373765,
9140
+ "grad_norm": 9.75,
9141
+ "learning_rate": 6.367116663308484e-06,
9142
+ "loss": 0.793,
9143
+ "mean_token_accuracy": 0.8068155586719513,
9144
+ "num_tokens": 11241453.0,
9145
+ "step": 10150
9146
+ },
9147
+ {
9148
+ "epoch": 2.0471489018738667,
9149
+ "grad_norm": 9.3125,
9150
+ "learning_rate": 6.353683927731883e-06,
9151
+ "loss": 0.9479,
9152
+ "mean_token_accuracy": 0.7669933021068573,
9153
+ "num_tokens": 11253740.0,
9154
+ "step": 10160
9155
+ },
9156
+ {
9157
+ "epoch": 2.049163812210357,
9158
+ "grad_norm": 9.3125,
9159
+ "learning_rate": 6.340251192155283e-06,
9160
+ "loss": 0.6941,
9161
+ "mean_token_accuracy": 0.8190494358539582,
9162
+ "num_tokens": 11266238.0,
9163
+ "step": 10170
9164
+ },
9165
+ {
9166
+ "epoch": 2.0511787225468465,
9167
+ "grad_norm": 10.9375,
9168
+ "learning_rate": 6.326818456578683e-06,
9169
+ "loss": 0.7265,
9170
+ "mean_token_accuracy": 0.8130114257335663,
9171
+ "num_tokens": 11277298.0,
9172
+ "step": 10180
9173
+ },
9174
+ {
9175
+ "epoch": 2.0531936328833367,
9176
+ "grad_norm": 10.5625,
9177
+ "learning_rate": 6.313385721002082e-06,
9178
+ "loss": 0.7497,
9179
+ "mean_token_accuracy": 0.8102090239524842,
9180
+ "num_tokens": 11289134.0,
9181
+ "step": 10190
9182
+ },
9183
+ {
9184
+ "epoch": 2.055208543219827,
9185
+ "grad_norm": 12.875,
9186
+ "learning_rate": 6.299952985425482e-06,
9187
+ "loss": 0.823,
9188
+ "mean_token_accuracy": 0.7950894236564636,
9189
+ "num_tokens": 11299969.0,
9190
+ "step": 10200
9191
+ },
9192
+ {
9193
+ "epoch": 2.0572234535563165,
9194
+ "grad_norm": 11.75,
9195
+ "learning_rate": 6.286520249848882e-06,
9196
+ "loss": 0.7238,
9197
+ "mean_token_accuracy": 0.8222428441047669,
9198
+ "num_tokens": 11311491.0,
9199
+ "step": 10210
9200
+ },
9201
+ {
9202
+ "epoch": 2.0592383638928067,
9203
+ "grad_norm": 13.4375,
9204
+ "learning_rate": 6.2730875142722825e-06,
9205
+ "loss": 0.8528,
9206
+ "mean_token_accuracy": 0.7913759410381317,
9207
+ "num_tokens": 11321862.0,
9208
+ "step": 10220
9209
+ },
9210
+ {
9211
+ "epoch": 2.061253274229297,
9212
+ "grad_norm": 10.6875,
9213
+ "learning_rate": 6.259654778695682e-06,
9214
+ "loss": 0.7371,
9215
+ "mean_token_accuracy": 0.8153777897357941,
9216
+ "num_tokens": 11333343.0,
9217
+ "step": 10230
9218
+ },
9219
+ {
9220
+ "epoch": 2.063268184565787,
9221
+ "grad_norm": 11.0,
9222
+ "learning_rate": 6.246222043119081e-06,
9223
+ "loss": 0.8531,
9224
+ "mean_token_accuracy": 0.7924916267395019,
9225
+ "num_tokens": 11345705.0,
9226
+ "step": 10240
9227
+ },
9228
+ {
9229
+ "epoch": 2.0652830949022767,
9230
+ "grad_norm": 11.125,
9231
+ "learning_rate": 6.2327893075424815e-06,
9232
+ "loss": 0.7911,
9233
+ "mean_token_accuracy": 0.8001395165920258,
9234
+ "num_tokens": 11356265.0,
9235
+ "step": 10250
9236
+ },
9237
+ {
9238
+ "epoch": 2.067298005238767,
9239
+ "grad_norm": 11.5625,
9240
+ "learning_rate": 6.219356571965881e-06,
9241
+ "loss": 0.7311,
9242
+ "mean_token_accuracy": 0.8132711887359619,
9243
+ "num_tokens": 11366725.0,
9244
+ "step": 10260
9245
+ },
9246
+ {
9247
+ "epoch": 2.069312915575257,
9248
+ "grad_norm": 11.4375,
9249
+ "learning_rate": 6.205923836389282e-06,
9250
+ "loss": 0.8873,
9251
+ "mean_token_accuracy": 0.7833378136157989,
9252
+ "num_tokens": 11377435.0,
9253
+ "step": 10270
9254
+ },
9255
+ {
9256
+ "epoch": 2.071327825911747,
9257
+ "grad_norm": 10.5625,
9258
+ "learning_rate": 6.192491100812681e-06,
9259
+ "loss": 0.802,
9260
+ "mean_token_accuracy": 0.805024367570877,
9261
+ "num_tokens": 11388919.0,
9262
+ "step": 10280
9263
+ },
9264
+ {
9265
+ "epoch": 2.073342736248237,
9266
+ "grad_norm": 9.9375,
9267
+ "learning_rate": 6.179058365236081e-06,
9268
+ "loss": 0.782,
9269
+ "mean_token_accuracy": 0.8056276500225067,
9270
+ "num_tokens": 11399842.0,
9271
+ "step": 10290
9272
+ },
9273
+ {
9274
+ "epoch": 2.075357646584727,
9275
+ "grad_norm": 12.125,
9276
+ "learning_rate": 6.165625629659481e-06,
9277
+ "loss": 0.7425,
9278
+ "mean_token_accuracy": 0.8137720346450805,
9279
+ "num_tokens": 11410189.0,
9280
+ "step": 10300
9281
+ },
9282
+ {
9283
+ "epoch": 2.077372556921217,
9284
+ "grad_norm": 13.8125,
9285
+ "learning_rate": 6.1521928940828805e-06,
9286
+ "loss": 0.7876,
9287
+ "mean_token_accuracy": 0.8061869978904724,
9288
+ "num_tokens": 11419892.0,
9289
+ "step": 10310
9290
+ },
9291
+ {
9292
+ "epoch": 2.079387467257707,
9293
+ "grad_norm": 11.125,
9294
+ "learning_rate": 6.138760158506281e-06,
9295
+ "loss": 0.8218,
9296
+ "mean_token_accuracy": 0.7978219330310822,
9297
+ "num_tokens": 11432030.0,
9298
+ "step": 10320
9299
+ },
9300
+ {
9301
+ "epoch": 2.081402377594197,
9302
+ "grad_norm": 12.125,
9303
+ "learning_rate": 6.12532742292968e-06,
9304
+ "loss": 0.9129,
9305
+ "mean_token_accuracy": 0.7785973668098449,
9306
+ "num_tokens": 11443216.0,
9307
+ "step": 10330
9308
+ },
9309
+ {
9310
+ "epoch": 2.083417287930687,
9311
+ "grad_norm": 14.25,
9312
+ "learning_rate": 6.11189468735308e-06,
9313
+ "loss": 0.8782,
9314
+ "mean_token_accuracy": 0.7902339398860931,
9315
+ "num_tokens": 11454439.0,
9316
+ "step": 10340
9317
+ },
9318
+ {
9319
+ "epoch": 2.0854321982671773,
9320
+ "grad_norm": 13.1875,
9321
+ "learning_rate": 6.09846195177648e-06,
9322
+ "loss": 0.7751,
9323
+ "mean_token_accuracy": 0.8015202224254608,
9324
+ "num_tokens": 11465006.0,
9325
+ "step": 10350
9326
+ },
9327
+ {
9328
+ "epoch": 2.087447108603667,
9329
+ "grad_norm": 10.9375,
9330
+ "learning_rate": 6.085029216199879e-06,
9331
+ "loss": 0.8718,
9332
+ "mean_token_accuracy": 0.788787704706192,
9333
+ "num_tokens": 11474763.0,
9334
+ "step": 10360
9335
+ },
9336
+ {
9337
+ "epoch": 2.089462018940157,
9338
+ "grad_norm": 11.875,
9339
+ "learning_rate": 6.071596480623279e-06,
9340
+ "loss": 0.8359,
9341
+ "mean_token_accuracy": 0.7963649094104767,
9342
+ "num_tokens": 11485808.0,
9343
+ "step": 10370
9344
+ },
9345
+ {
9346
+ "epoch": 2.0914769292766473,
9347
+ "grad_norm": 11.4375,
9348
+ "learning_rate": 6.058163745046679e-06,
9349
+ "loss": 0.7756,
9350
+ "mean_token_accuracy": 0.8061880767345428,
9351
+ "num_tokens": 11497158.0,
9352
+ "step": 10380
9353
+ },
9354
+ {
9355
+ "epoch": 2.093491839613137,
9356
+ "grad_norm": 12.1875,
9357
+ "learning_rate": 6.04473100947008e-06,
9358
+ "loss": 0.946,
9359
+ "mean_token_accuracy": 0.7709095120429993,
9360
+ "num_tokens": 11507759.0,
9361
+ "step": 10390
9362
+ },
9363
+ {
9364
+ "epoch": 2.095506749949627,
9365
+ "grad_norm": 10.5625,
9366
+ "learning_rate": 6.0312982738934785e-06,
9367
+ "loss": 0.7665,
9368
+ "mean_token_accuracy": 0.8077448666095733,
9369
+ "num_tokens": 11518936.0,
9370
+ "step": 10400
9371
+ },
9372
+ {
9373
+ "epoch": 2.0975216602861173,
9374
+ "grad_norm": 12.125,
9375
+ "learning_rate": 6.017865538316878e-06,
9376
+ "loss": 0.8428,
9377
+ "mean_token_accuracy": 0.7939082264900208,
9378
+ "num_tokens": 11529031.0,
9379
+ "step": 10410
9380
+ },
9381
+ {
9382
+ "epoch": 2.0995365706226075,
9383
+ "grad_norm": 12.4375,
9384
+ "learning_rate": 6.004432802740279e-06,
9385
+ "loss": 0.8232,
9386
+ "mean_token_accuracy": 0.8003376543521881,
9387
+ "num_tokens": 11541092.0,
9388
+ "step": 10420
9389
+ },
9390
+ {
9391
+ "epoch": 2.101551480959097,
9392
+ "grad_norm": 13.1875,
9393
+ "learning_rate": 5.991000067163678e-06,
9394
+ "loss": 0.7565,
9395
+ "mean_token_accuracy": 0.8153795897960663,
9396
+ "num_tokens": 11550995.0,
9397
+ "step": 10430
9398
+ },
9399
+ {
9400
+ "epoch": 2.1035663912955873,
9401
+ "grad_norm": 11.1875,
9402
+ "learning_rate": 5.977567331587079e-06,
9403
+ "loss": 0.7814,
9404
+ "mean_token_accuracy": 0.8018035531044007,
9405
+ "num_tokens": 11560893.0,
9406
+ "step": 10440
9407
+ },
9408
+ {
9409
+ "epoch": 2.1055813016320775,
9410
+ "grad_norm": 13.125,
9411
+ "learning_rate": 5.964134596010478e-06,
9412
+ "loss": 0.8012,
9413
+ "mean_token_accuracy": 0.7951594650745392,
9414
+ "num_tokens": 11571484.0,
9415
+ "step": 10450
9416
+ },
9417
+ {
9418
+ "epoch": 2.1075962119685676,
9419
+ "grad_norm": 11.5,
9420
+ "learning_rate": 5.950701860433877e-06,
9421
+ "loss": 0.8872,
9422
+ "mean_token_accuracy": 0.7848295509815216,
9423
+ "num_tokens": 11581689.0,
9424
+ "step": 10460
9425
+ },
9426
+ {
9427
+ "epoch": 2.1096111223050573,
9428
+ "grad_norm": 11.4375,
9429
+ "learning_rate": 5.937269124857278e-06,
9430
+ "loss": 0.7238,
9431
+ "mean_token_accuracy": 0.8154263854026794,
9432
+ "num_tokens": 11591503.0,
9433
+ "step": 10470
9434
+ },
9435
+ {
9436
+ "epoch": 2.1116260326415475,
9437
+ "grad_norm": 11.25,
9438
+ "learning_rate": 5.923836389280677e-06,
9439
+ "loss": 0.8675,
9440
+ "mean_token_accuracy": 0.7881495654582977,
9441
+ "num_tokens": 11602668.0,
9442
+ "step": 10480
9443
+ },
9444
+ {
9445
+ "epoch": 2.1136409429780376,
9446
+ "grad_norm": 14.25,
9447
+ "learning_rate": 5.910403653704077e-06,
9448
+ "loss": 0.7428,
9449
+ "mean_token_accuracy": 0.8135782480239868,
9450
+ "num_tokens": 11613507.0,
9451
+ "step": 10490
9452
+ },
9453
+ {
9454
+ "epoch": 2.1156558533145273,
9455
+ "grad_norm": 10.5,
9456
+ "learning_rate": 5.896970918127477e-06,
9457
+ "loss": 0.7815,
9458
+ "mean_token_accuracy": 0.807522964477539,
9459
+ "num_tokens": 11623915.0,
9460
+ "step": 10500
9461
  }
9462
  ],
9463
  "logging_steps": 10,
 
9477
  "attributes": {}
9478
  }
9479
  },
9480
+ "total_flos": 1.4062792370479104e+16,
9481
  "train_batch_size": 8,
9482
  "trial_name": null,
9483
  "trial_params": null