Azrail commited on
Commit
423c24c
·
verified ·
1 Parent(s): c1edb51

Training in progress, step 114000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:129f26bf285c927fb1ca67bf7975ab174e3ba9305c910bf5556605aeaa81c78e
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b778ecb426d78f0896855e8fb4aad5b0ed64f4bb1e53aede2d8069fdd044f83f
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:139c4b8aa767d3344e8dd6530590ff617e5f940f55af80e164711a5e937099df
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e40a86136eefe7a52f906d32b10df1f61bc2559012b7bd8d21fd2f6358ab1422
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef617a95cce573c223584c3ba54aeae3c5fde1db1b14c5d13506c2f7079cec61
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f9d7695201cafd8e529bbb705c4e86352c97146b7f2c1d17b903edf259b2912
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c067dcd97b2d11c79b34cd5a1dfeb3c320d7856efa0a4a62a589309e4cce1b3d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a3df12db58d0a78ce660a6cf049d113e8861e8aa8611c9714bf603dc61fb3a9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0780280715980777,
6
  "eval_steps": 500,
7
- "global_step": 113000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -20122,11 +20122,189 @@
20122
  "eval_steps_per_second": 11.42,
20123
  "num_input_tokens_seen": 59235047232,
20124
  "step": 113000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20125
  }
20126
  ],
20127
  "logging_steps": 50,
20128
  "max_steps": 140000,
20129
- "num_input_tokens_seen": 59235047232,
20130
  "num_train_epochs": 2,
20131
  "save_steps": 1000,
20132
  "stateful_callbacks": {
@@ -20141,7 +20319,7 @@
20141
  "attributes": {}
20142
  }
20143
  },
20144
- "total_flos": 1.0483522912802488e+20,
20145
  "train_batch_size": 32,
20146
  "trial_name": null,
20147
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0875681219218432,
6
  "eval_steps": 500,
7
+ "global_step": 114000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
20122
  "eval_steps_per_second": 11.42,
20123
  "num_input_tokens_seen": 59235047232,
20124
  "step": 113000
20125
+ },
20126
+ {
20127
+ "epoch": 1.0785050741142659,
20128
+ "grad_norm": 0.14696183800697327,
20129
+ "learning_rate": 0.0009965342284774632,
20130
+ "loss": 2.084,
20131
+ "num_input_tokens_seen": 59261256096,
20132
+ "step": 113050
20133
+ },
20134
+ {
20135
+ "epoch": 1.0789820766304543,
20136
+ "grad_norm": 0.1535506546497345,
20137
+ "learning_rate": 0.0009961967251474822,
20138
+ "loss": 2.0905,
20139
+ "num_input_tokens_seen": 59287464384,
20140
+ "step": 113100
20141
+ },
20142
+ {
20143
+ "epoch": 1.0794590791466425,
20144
+ "grad_norm": 0.14321501553058624,
20145
+ "learning_rate": 0.000995843605578539,
20146
+ "loss": 2.0971,
20147
+ "num_input_tokens_seen": 59313669856,
20148
+ "step": 113150
20149
+ },
20150
+ {
20151
+ "epoch": 1.0799360816628307,
20152
+ "grad_norm": 0.15687337517738342,
20153
+ "learning_rate": 0.0009954748808839674,
20154
+ "loss": 2.0864,
20155
+ "num_input_tokens_seen": 59339879328,
20156
+ "step": 113200
20157
+ },
20158
+ {
20159
+ "epoch": 1.080413084179019,
20160
+ "grad_norm": 0.16271081566810608,
20161
+ "learning_rate": 0.000995090562668223,
20162
+ "loss": 2.0948,
20163
+ "num_input_tokens_seen": 59366089088,
20164
+ "step": 113250
20165
+ },
20166
+ {
20167
+ "epoch": 1.0808900866952074,
20168
+ "grad_norm": 0.14683839678764343,
20169
+ "learning_rate": 0.0009946906630265184,
20170
+ "loss": 2.105,
20171
+ "num_input_tokens_seen": 59392300448,
20172
+ "step": 113300
20173
+ },
20174
+ {
20175
+ "epoch": 1.0813670892113956,
20176
+ "grad_norm": 0.15148819983005524,
20177
+ "learning_rate": 0.0009942751945444437,
20178
+ "loss": 2.0814,
20179
+ "num_input_tokens_seen": 59418514560,
20180
+ "step": 113350
20181
+ },
20182
+ {
20183
+ "epoch": 1.0818440917275838,
20184
+ "grad_norm": 0.14587359130382538,
20185
+ "learning_rate": 0.0009938441702975688,
20186
+ "loss": 2.0943,
20187
+ "num_input_tokens_seen": 59444719360,
20188
+ "step": 113400
20189
+ },
20190
+ {
20191
+ "epoch": 1.0823210942437722,
20192
+ "grad_norm": 0.14699944853782654,
20193
+ "learning_rate": 0.0009933976038510332,
20194
+ "loss": 2.0927,
20195
+ "num_input_tokens_seen": 59470933600,
20196
+ "step": 113450
20197
+ },
20198
+ {
20199
+ "epoch": 1.0827980967599604,
20200
+ "grad_norm": 0.14229649305343628,
20201
+ "learning_rate": 0.0009929355092591179,
20202
+ "loss": 2.0985,
20203
+ "num_input_tokens_seen": 59497148000,
20204
+ "step": 113500
20205
+ },
20206
+ {
20207
+ "epoch": 1.0827980967599604,
20208
+ "eval_loss": 2.009983539581299,
20209
+ "eval_runtime": 82.6823,
20210
+ "eval_samples_per_second": 60.472,
20211
+ "eval_steps_per_second": 15.118,
20212
+ "num_input_tokens_seen": 59497148000,
20213
+ "step": 113500
20214
+ },
20215
+ {
20216
+ "epoch": 1.0832750992761486,
20217
+ "grad_norm": 0.14160077273845673,
20218
+ "learning_rate": 0.0009924579010648041,
20219
+ "loss": 2.0935,
20220
+ "num_input_tokens_seen": 59523359584,
20221
+ "step": 113550
20222
+ },
20223
+ {
20224
+ "epoch": 1.083752101792337,
20225
+ "grad_norm": 0.1411445587873459,
20226
+ "learning_rate": 0.0009919647942993148,
20227
+ "loss": 2.093,
20228
+ "num_input_tokens_seen": 59549569568,
20229
+ "step": 113600
20230
+ },
20231
+ {
20232
+ "epoch": 1.0842291043085253,
20233
+ "grad_norm": 0.13501347601413727,
20234
+ "learning_rate": 0.0009914562044816423,
20235
+ "loss": 2.0919,
20236
+ "num_input_tokens_seen": 59575783200,
20237
+ "step": 113650
20238
+ },
20239
+ {
20240
+ "epoch": 1.0847061068247135,
20241
+ "grad_norm": 0.14355099201202393,
20242
+ "learning_rate": 0.0009909321476180592,
20243
+ "loss": 2.0913,
20244
+ "num_input_tokens_seen": 59601990304,
20245
+ "step": 113700
20246
+ },
20247
+ {
20248
+ "epoch": 1.0851831093409017,
20249
+ "grad_norm": 0.13246339559555054,
20250
+ "learning_rate": 0.0009903926402016153,
20251
+ "loss": 2.0803,
20252
+ "num_input_tokens_seen": 59628197120,
20253
+ "step": 113750
20254
+ },
20255
+ {
20256
+ "epoch": 1.08566011185709,
20257
+ "grad_norm": 0.13418996334075928,
20258
+ "learning_rate": 0.0009898376992116178,
20259
+ "loss": 2.1042,
20260
+ "num_input_tokens_seen": 59654409856,
20261
+ "step": 113800
20262
+ },
20263
+ {
20264
+ "epoch": 1.0861371143732783,
20265
+ "grad_norm": 0.15235918760299683,
20266
+ "learning_rate": 0.0009892673421130977,
20267
+ "loss": 2.0987,
20268
+ "num_input_tokens_seen": 59680620096,
20269
+ "step": 113850
20270
+ },
20271
+ {
20272
+ "epoch": 1.0866141168894665,
20273
+ "grad_norm": 0.1395738422870636,
20274
+ "learning_rate": 0.0009886815868562597,
20275
+ "loss": 2.0932,
20276
+ "num_input_tokens_seen": 59706827264,
20277
+ "step": 113900
20278
+ },
20279
+ {
20280
+ "epoch": 1.087091119405655,
20281
+ "grad_norm": 0.1433008313179016,
20282
+ "learning_rate": 0.000988080451875917,
20283
+ "loss": 2.0943,
20284
+ "num_input_tokens_seen": 59733034688,
20285
+ "step": 113950
20286
+ },
20287
+ {
20288
+ "epoch": 1.0875681219218432,
20289
+ "grad_norm": 0.14490137994289398,
20290
+ "learning_rate": 0.0009874639560909118,
20291
+ "loss": 2.1012,
20292
+ "num_input_tokens_seen": 59759249088,
20293
+ "step": 114000
20294
+ },
20295
+ {
20296
+ "epoch": 1.0875681219218432,
20297
+ "eval_loss": 2.0104737281799316,
20298
+ "eval_runtime": 82.5956,
20299
+ "eval_samples_per_second": 60.536,
20300
+ "eval_steps_per_second": 15.134,
20301
+ "num_input_tokens_seen": 59759249088,
20302
+ "step": 114000
20303
  }
20304
  ],
20305
  "logging_steps": 50,
20306
  "max_steps": 140000,
20307
+ "num_input_tokens_seen": 59759249088,
20308
  "num_train_epochs": 2,
20309
  "save_steps": 1000,
20310
  "stateful_callbacks": {
 
20319
  "attributes": {}
20320
  }
20321
  },
20322
+ "total_flos": 1.0576297079872635e+20,
20323
  "train_batch_size": 32,
20324
  "trial_name": null,
20325
  "trial_params": null