Azrail committed on
Commit
2f6caad
·
verified ·
1 Parent(s): 05c8389

Training in progress, step 69000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b26db7188c89cde52f93cc8f561f4529a8702aaa52ce9c883892b96769dd603
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:255db0ba45691d582a9ee109acc81eb21c645c6dd171a1bf2c3e231a0982d734
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec885f087630fd98da5aea6a3b9af5bf67a1e0daf9ab5c57e09d7f1ac7385946
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78af01c29bc4dc4815cd8cb2a0e12aac4f0221e2ad669f18caa4315a15ef83d7
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb3d1fb9e8324a04c98053fb02a6fde8d1a865fd7ced6a674f76811c1bbb259f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1397d04798a1fd86f4b074ba5cc769a269eab9bb0994d2bcfee86faa58f609a6
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fb0106671a29e67305a03ecdd422ffd62f40cc2f3e19327fe3581d2d1603d90
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16992573f9fe212ca32ce6bacf3d51d66103db65f351073047f24fab4f0f55af
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3243617110080256,
6
  "eval_steps": 500,
7
- "global_step": 68000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12112,11 +12112,189 @@
12112
  "eval_steps_per_second": 23.177,
12113
  "num_input_tokens_seen": 17825787456,
12114
  "step": 68000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12115
  }
12116
  ],
12117
  "logging_steps": 50,
12118
  "max_steps": 70000,
12119
- "num_input_tokens_seen": 17825787456,
12120
  "num_train_epochs": 1,
12121
  "save_steps": 1000,
12122
  "stateful_callbacks": {
@@ -12131,7 +12309,7 @@
12131
  "attributes": {}
12132
  }
12133
  },
12134
- "total_flos": 4.768572124165571e+18,
12135
  "train_batch_size": 64,
12136
  "trial_name": null,
12137
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3291317361699083,
6
  "eval_steps": 500,
7
+ "global_step": 69000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12112
  "eval_steps_per_second": 23.177,
12113
  "num_input_tokens_seen": 17825787456,
12114
  "step": 68000
12115
+ },
12116
+ {
12117
+ "epoch": 0.3246002122661197,
12118
+ "grad_norm": 0.15067367255687714,
12119
+ "learning_rate": 4.7109889986402973e-05,
12120
+ "loss": 2.5181,
12121
+ "num_input_tokens_seen": 17838894656,
12122
+ "step": 68050
12123
+ },
12124
+ {
12125
+ "epoch": 0.32483871352421384,
12126
+ "grad_norm": 0.1534261703491211,
12127
+ "learning_rate": 4.476122667059207e-05,
12128
+ "loss": 2.533,
12129
+ "num_input_tokens_seen": 17852001856,
12130
+ "step": 68100
12131
+ },
12132
+ {
12133
+ "epoch": 0.325077214782308,
12134
+ "grad_norm": 0.1585472822189331,
12135
+ "learning_rate": 4.2469871766340095e-05,
12136
+ "loss": 2.509,
12137
+ "num_input_tokens_seen": 17865109056,
12138
+ "step": 68150
12139
+ },
12140
+ {
12141
+ "epoch": 0.3253157160404021,
12142
+ "grad_norm": 0.15480853617191315,
12143
+ "learning_rate": 4.0236113724274713e-05,
12144
+ "loss": 2.524,
12145
+ "num_input_tokens_seen": 17878216256,
12146
+ "step": 68200
12147
+ },
12148
+ {
12149
+ "epoch": 0.32555421729849626,
12150
+ "grad_norm": 0.24341611564159393,
12151
+ "learning_rate": 3.806023374435663e-05,
12152
+ "loss": 2.5293,
12153
+ "num_input_tokens_seen": 17891323456,
12154
+ "step": 68250
12155
+ },
12156
+ {
12157
+ "epoch": 0.32579271855659037,
12158
+ "grad_norm": 0.15290473401546478,
12159
+ "learning_rate": 3.594250574048058e-05,
12160
+ "loss": 2.5149,
12161
+ "num_input_tokens_seen": 17904430656,
12162
+ "step": 68300
12163
+ },
12164
+ {
12165
+ "epoch": 0.3260312198146845,
12166
+ "grad_norm": 0.1606835126876831,
12167
+ "learning_rate": 3.3883196305992905e-05,
12168
+ "loss": 2.5327,
12169
+ "num_input_tokens_seen": 17917537856,
12170
+ "step": 68350
12171
+ },
12172
+ {
12173
+ "epoch": 0.3262697210727787,
12174
+ "grad_norm": 0.1537574976682663,
12175
+ "learning_rate": 3.18825646801314e-05,
12176
+ "loss": 2.5416,
12177
+ "num_input_tokens_seen": 17930645056,
12178
+ "step": 68400
12179
+ },
12180
+ {
12181
+ "epoch": 0.3265082223308728,
12182
+ "grad_norm": 0.16943201422691345,
12183
+ "learning_rate": 2.994086271539048e-05,
12184
+ "loss": 2.5233,
12185
+ "num_input_tokens_seen": 17943752256,
12186
+ "step": 68450
12187
+ },
12188
+ {
12189
+ "epoch": 0.32674672358896695,
12190
+ "grad_norm": 0.15832561254501343,
12191
+ "learning_rate": 2.8058334845816213e-05,
12192
+ "loss": 2.5439,
12193
+ "num_input_tokens_seen": 17956859456,
12194
+ "step": 68500
12195
+ },
12196
+ {
12197
+ "epoch": 0.32674672358896695,
12198
+ "eval_loss": 2.4128847122192383,
12199
+ "eval_runtime": 53.1054,
12200
+ "eval_samples_per_second": 94.152,
12201
+ "eval_steps_per_second": 23.538,
12202
+ "num_input_tokens_seen": 17956859456,
12203
+ "step": 68500
12204
+ },
12205
+ {
12206
+ "epoch": 0.32698522484706105,
12207
+ "grad_norm": 0.15245509147644043,
12208
+ "learning_rate": 2.6235218056235634e-05,
12209
+ "loss": 2.5209,
12210
+ "num_input_tokens_seen": 17969966656,
12211
+ "step": 68550
12212
+ },
12213
+ {
12214
+ "epoch": 0.3272237261051552,
12215
+ "grad_norm": 0.15148235857486725,
12216
+ "learning_rate": 2.4471741852423235e-05,
12217
+ "loss": 2.5284,
12218
+ "num_input_tokens_seen": 17983073856,
12219
+ "step": 68600
12220
+ },
12221
+ {
12222
+ "epoch": 0.3274622273632493,
12223
+ "grad_norm": 0.15678688883781433,
12224
+ "learning_rate": 2.276812823220964e-05,
12225
+ "loss": 2.537,
12226
+ "num_input_tokens_seen": 17996181056,
12227
+ "step": 68650
12228
+ },
12229
+ {
12230
+ "epoch": 0.3277007286213435,
12231
+ "grad_norm": 0.15105360746383667,
12232
+ "learning_rate": 2.1124591657534777e-05,
12233
+ "loss": 2.5321,
12234
+ "num_input_tokens_seen": 18009288256,
12235
+ "step": 68700
12236
+ },
12237
+ {
12238
+ "epoch": 0.32793922987943763,
12239
+ "grad_norm": 0.15369552373886108,
12240
+ "learning_rate": 1.9541339027450256e-05,
12241
+ "loss": 2.5291,
12242
+ "num_input_tokens_seen": 18022395456,
12243
+ "step": 68750
12244
+ },
12245
+ {
12246
+ "epoch": 0.32817773113753174,
12247
+ "grad_norm": 0.1551530808210373,
12248
+ "learning_rate": 1.801856965207338e-05,
12249
+ "loss": 2.5201,
12250
+ "num_input_tokens_seen": 18035502656,
12251
+ "step": 68800
12252
+ },
12253
+ {
12254
+ "epoch": 0.3284162323956259,
12255
+ "grad_norm": 0.14859162271022797,
12256
+ "learning_rate": 1.6556475227496815e-05,
12257
+ "loss": 2.5436,
12258
+ "num_input_tokens_seen": 18048609856,
12259
+ "step": 68850
12260
+ },
12261
+ {
12262
+ "epoch": 0.32865473365372,
12263
+ "grad_norm": 0.14972691237926483,
12264
+ "learning_rate": 1.5155239811656562e-05,
12265
+ "loss": 2.5221,
12266
+ "num_input_tokens_seen": 18061717056,
12267
+ "step": 68900
12268
+ },
12269
+ {
12270
+ "epoch": 0.32889323491181416,
12271
+ "grad_norm": 0.156805619597435,
12272
+ "learning_rate": 1.3815039801161721e-05,
12273
+ "loss": 2.5248,
12274
+ "num_input_tokens_seen": 18074824256,
12275
+ "step": 68950
12276
+ },
12277
+ {
12278
+ "epoch": 0.3291317361699083,
12279
+ "grad_norm": 0.148334801197052,
12280
+ "learning_rate": 1.2536043909088191e-05,
12281
+ "loss": 2.5361,
12282
+ "num_input_tokens_seen": 18087931456,
12283
+ "step": 69000
12284
+ },
12285
+ {
12286
+ "epoch": 0.3291317361699083,
12287
+ "eval_loss": 2.4120428562164307,
12288
+ "eval_runtime": 52.9258,
12289
+ "eval_samples_per_second": 94.472,
12290
+ "eval_steps_per_second": 23.618,
12291
+ "num_input_tokens_seen": 18087931456,
12292
+ "step": 69000
12293
  }
12294
  ],
12295
  "logging_steps": 50,
12296
  "max_steps": 70000,
12297
+ "num_input_tokens_seen": 18087931456,
12298
  "num_train_epochs": 1,
12299
  "save_steps": 1000,
12300
  "stateful_callbacks": {
 
12309
  "attributes": {}
12310
  }
12311
  },
12312
+ "total_flos": 4.838698202691011e+18,
12313
  "train_batch_size": 64,
12314
  "trial_name": null,
12315
  "trial_params": null