Azrail commited on
Commit
eb4f930
·
verified ·
1 Parent(s): a306db1

Training in progress, step 69000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88085ee37b0edacc225a0fb86ed3cfd9ddce1ecb2e83ddb9feeeb81a70bb80bd
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c67ab3cac009a5afdc201af7f0117dd68a478413d54e0923fe125d5f63dd515
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b2f336afb5813ccf452282223e763afdce040692a315590bb908f2063975a3f
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d01209ddef39b46affb20fe03502cb8000499194b31764df158aa95dc134101e
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb3d1fb9e8324a04c98053fb02a6fde8d1a865fd7ced6a674f76811c1bbb259f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1397d04798a1fd86f4b074ba5cc769a269eab9bb0994d2bcfee86faa58f609a6
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:215e906fb9e492afed15b6bbd2ab828199f0238620feca89e4e09f3e2ffc4109
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba508cada3fb6a2130ffab8142880b38ad6264731466b5965eb74743d23afc9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4574019234423531,
6
  "eval_steps": 500,
7
- "global_step": 68000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12112,11 +12112,189 @@
12112
  "eval_steps_per_second": 23.197,
12113
  "num_input_tokens_seen": 17825792000,
12114
  "step": 68000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12115
  }
12116
  ],
12117
  "logging_steps": 50,
12118
  "max_steps": 70000,
12119
- "num_input_tokens_seen": 17825792000,
12120
  "num_train_epochs": 1,
12121
  "save_steps": 1000,
12122
  "stateful_callbacks": {
@@ -12131,7 +12309,7 @@
12131
  "attributes": {}
12132
  }
12133
  },
12134
- "total_flos": 4.76857333972992e+18,
12135
  "train_batch_size": 64,
12136
  "trial_name": null,
12137
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4641284223165053,
6
  "eval_steps": 500,
7
+ "global_step": 69000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12112
  "eval_steps_per_second": 23.197,
12113
  "num_input_tokens_seen": 17825792000,
12114
  "step": 68000
12115
+ },
12116
+ {
12117
+ "epoch": 0.4577382483860607,
12118
+ "grad_norm": 0.14517797529697418,
12119
+ "learning_rate": 3.6204032372137984e-05,
12120
+ "loss": 2.9674,
12121
+ "num_input_tokens_seen": 17838899200,
12122
+ "step": 68050
12123
+ },
12124
+ {
12125
+ "epoch": 0.4580745733297683,
12126
+ "grad_norm": 0.14154207706451416,
12127
+ "learning_rate": 3.439253262059822e-05,
12128
+ "loss": 2.9627,
12129
+ "num_input_tokens_seen": 17852006400,
12130
+ "step": 68100
12131
+ },
12132
+ {
12133
+ "epoch": 0.4584108982734759,
12134
+ "grad_norm": 0.14251314103603363,
12135
+ "learning_rate": 3.2625909085853776e-05,
12136
+ "loss": 2.9681,
12137
+ "num_input_tokens_seen": 17865113600,
12138
+ "step": 68150
12139
+ },
12140
+ {
12141
+ "epoch": 0.45874722321718353,
12142
+ "grad_norm": 0.15670983493328094,
12143
+ "learning_rate": 3.0904332038757974e-05,
12144
+ "loss": 2.9708,
12145
+ "num_input_tokens_seen": 17878220800,
12146
+ "step": 68200
12147
+ },
12148
+ {
12149
+ "epoch": 0.45908354816089114,
12150
+ "grad_norm": 0.1453925371170044,
12151
+ "learning_rate": 2.9227967408489654e-05,
12152
+ "loss": 2.9686,
12153
+ "num_input_tokens_seen": 17891328000,
12154
+ "step": 68250
12155
+ },
12156
+ {
12157
+ "epoch": 0.45941987310459875,
12158
+ "grad_norm": 0.13307476043701172,
12159
+ "learning_rate": 2.7596976766560976e-05,
12160
+ "loss": 2.9595,
12161
+ "num_input_tokens_seen": 17904435200,
12162
+ "step": 68300
12163
+ },
12164
+ {
12165
+ "epoch": 0.45975619804830636,
12166
+ "grad_norm": 0.14958307147026062,
12167
+ "learning_rate": 2.6011517311244848e-05,
12168
+ "loss": 2.9661,
12169
+ "num_input_tokens_seen": 17917542400,
12170
+ "step": 68350
12171
+ },
12172
+ {
12173
+ "epoch": 0.460092522992014,
12174
+ "grad_norm": 0.14210085570812225,
12175
+ "learning_rate": 2.4471741852423235e-05,
12176
+ "loss": 2.9737,
12177
+ "num_input_tokens_seen": 17930649600,
12178
+ "step": 68400
12179
+ },
12180
+ {
12181
+ "epoch": 0.4604288479357216,
12182
+ "grad_norm": 0.15127155184745789,
12183
+ "learning_rate": 2.2977798796859794e-05,
12184
+ "loss": 2.9627,
12185
+ "num_input_tokens_seen": 17943756800,
12186
+ "step": 68450
12187
+ },
12188
+ {
12189
+ "epoch": 0.4607651728794292,
12190
+ "grad_norm": 0.14184921979904175,
12191
+ "learning_rate": 2.152983213389559e-05,
12192
+ "loss": 2.9732,
12193
+ "num_input_tokens_seen": 17956864000,
12194
+ "step": 68500
12195
+ },
12196
+ {
12197
+ "epoch": 0.4607651728794292,
12198
+ "eval_loss": 2.865307331085205,
12199
+ "eval_runtime": 53.2908,
12200
+ "eval_samples_per_second": 93.825,
12201
+ "eval_steps_per_second": 23.456,
12202
+ "num_input_tokens_seen": 17956864000,
12203
+ "step": 68500
12204
+ },
12205
+ {
12206
+ "epoch": 0.4611014978231368,
12207
+ "grad_norm": 0.14755961298942566,
12208
+ "learning_rate": 2.0127981421571295e-05,
12209
+ "loss": 2.9687,
12210
+ "num_input_tokens_seen": 17969971200,
12211
+ "step": 68550
12212
+ },
12213
+ {
12214
+ "epoch": 0.4614378227668444,
12215
+ "grad_norm": 0.1370965540409088,
12216
+ "learning_rate": 1.8772381773176416e-05,
12217
+ "loss": 2.9711,
12218
+ "num_input_tokens_seen": 17983078400,
12219
+ "step": 68600
12220
+ },
12221
+ {
12222
+ "epoch": 0.46177414771055203,
12223
+ "grad_norm": 0.14454130828380585,
12224
+ "learning_rate": 1.7463163844226305e-05,
12225
+ "loss": 2.9633,
12226
+ "num_input_tokens_seen": 17996185600,
12227
+ "step": 68650
12228
+ },
12229
+ {
12230
+ "epoch": 0.46211047265425964,
12231
+ "grad_norm": 0.13908445835113525,
12232
+ "learning_rate": 1.620045381987012e-05,
12233
+ "loss": 2.9662,
12234
+ "num_input_tokens_seen": 18009292800,
12235
+ "step": 68700
12236
+ },
12237
+ {
12238
+ "epoch": 0.46244679759796725,
12239
+ "grad_norm": 0.2359876185655594,
12240
+ "learning_rate": 1.4984373402728013e-05,
12241
+ "loss": 2.9671,
12242
+ "num_input_tokens_seen": 18022400000,
12243
+ "step": 68750
12244
+ },
12245
+ {
12246
+ "epoch": 0.46278312254167486,
12247
+ "grad_norm": 0.13809122145175934,
12248
+ "learning_rate": 1.3815039801161721e-05,
12249
+ "loss": 2.9684,
12250
+ "num_input_tokens_seen": 18035507200,
12251
+ "step": 68800
12252
+ },
12253
+ {
12254
+ "epoch": 0.4631194474853825,
12255
+ "grad_norm": 0.14375115931034088,
12256
+ "learning_rate": 1.26925657179775e-05,
12257
+ "loss": 2.9677,
12258
+ "num_input_tokens_seen": 18048614400,
12259
+ "step": 68850
12260
+ },
12261
+ {
12262
+ "epoch": 0.4634557724290901,
12263
+ "grad_norm": 0.14648525416851044,
12264
+ "learning_rate": 1.1617059339563806e-05,
12265
+ "loss": 2.9625,
12266
+ "num_input_tokens_seen": 18061721600,
12267
+ "step": 68900
12268
+ },
12269
+ {
12270
+ "epoch": 0.4637920973727977,
12271
+ "grad_norm": 0.1428016871213913,
12272
+ "learning_rate": 1.058862432546387e-05,
12273
+ "loss": 2.9717,
12274
+ "num_input_tokens_seen": 18074828800,
12275
+ "step": 68950
12276
+ },
12277
+ {
12278
+ "epoch": 0.4641284223165053,
12279
+ "grad_norm": 0.14518927037715912,
12280
+ "learning_rate": 9.607359798384786e-06,
12281
+ "loss": 2.9622,
12282
+ "num_input_tokens_seen": 18087936000,
12283
+ "step": 69000
12284
+ },
12285
+ {
12286
+ "epoch": 0.4641284223165053,
12287
+ "eval_loss": 2.8647797107696533,
12288
+ "eval_runtime": 53.1259,
12289
+ "eval_samples_per_second": 94.116,
12290
+ "eval_steps_per_second": 23.529,
12291
+ "num_input_tokens_seen": 18087936000,
12292
+ "step": 69000
12293
  }
12294
  ],
12295
  "logging_steps": 50,
12296
  "max_steps": 70000,
12297
+ "num_input_tokens_seen": 18087936000,
12298
  "num_train_epochs": 1,
12299
  "save_steps": 1000,
12300
  "stateful_callbacks": {
 
12309
  "attributes": {}
12310
  }
12311
  },
12312
+ "total_flos": 4.83869941825536e+18,
12313
  "train_batch_size": 64,
12314
  "trial_name": null,
12315
  "trial_params": null