Azrail commited on
Commit
a488be6
·
verified ·
1 Parent(s): 3c88b6c

Training in progress, step 41000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b7729248601047f8e38f3850e7e18cf7889b4308d9f9580a679de009332f3da
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb18d4c27c64f6607996dc76ab059b3274f96bf50194e20861ca91446bac906
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:358495fc0c32209eebada050e3f21202035c40fdaedcd448ad4300b16ac6f351
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4a71156c2d2f2da1c265821c7ca99486fbc72cc466c418215c7150c425f5836
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88b68a8a714da2056a995b3e2624d11f9159cf571839f94eec4ca36bd56bea1f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:028c63076d3d8e5d0c73e4da1b6fc8793d1c56810af68c19f7f253b3016ce7ac
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:adaad33497c37859c99058576b7822dd684fa1b3784f6f798b4f9dc171601aff
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bafdd2692f3ffed299379761090a99347b59a938d0713ea16130141db6dd54e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.19080100647530915,
6
  "eval_steps": 500,
7
- "global_step": 40000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7128,11 +7128,189 @@
7128
  "eval_steps_per_second": 24.51,
7129
  "num_input_tokens_seen": 10485755456,
7130
  "step": 40000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7131
  }
7132
  ],
7133
  "logging_steps": 50,
7134
  "max_steps": 70000,
7135
- "num_input_tokens_seen": 10485755456,
7136
  "num_train_epochs": 1,
7137
  "save_steps": 1000,
7138
  "stateful_callbacks": {
@@ -7147,7 +7325,7 @@
7147
  "attributes": {}
7148
  }
7149
  },
7150
- "total_flos": 2.8050419254532506e+18,
7151
  "train_batch_size": 64,
7152
  "trial_name": null,
7153
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.1955710316371919,
6
  "eval_steps": 500,
7
+ "global_step": 41000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7128
  "eval_steps_per_second": 24.51,
7129
  "num_input_tokens_seen": 10485755456,
7130
  "step": 40000
7131
+ },
7132
+ {
7133
+ "epoch": 0.19103950773340328,
7134
+ "grad_norm": 0.18717694282531738,
7135
+ "learning_rate": 0.001,
7136
+ "loss": 2.6512,
7137
+ "num_input_tokens_seen": 10498862656,
7138
+ "step": 40050
7139
+ },
7140
+ {
7141
+ "epoch": 0.19127800899149744,
7142
+ "grad_norm": 0.2009858638048172,
7143
+ "learning_rate": 0.001,
7144
+ "loss": 2.6289,
7145
+ "num_input_tokens_seen": 10511969856,
7146
+ "step": 40100
7147
+ },
7148
+ {
7149
+ "epoch": 0.19151651024959157,
7150
+ "grad_norm": 0.2515949010848999,
7151
+ "learning_rate": 0.001,
7152
+ "loss": 2.6342,
7153
+ "num_input_tokens_seen": 10525077056,
7154
+ "step": 40150
7155
+ },
7156
+ {
7157
+ "epoch": 0.1917550115076857,
7158
+ "grad_norm": 0.19864948093891144,
7159
+ "learning_rate": 0.001,
7160
+ "loss": 2.6191,
7161
+ "num_input_tokens_seen": 10538184256,
7162
+ "step": 40200
7163
+ },
7164
+ {
7165
+ "epoch": 0.19199351276577983,
7166
+ "grad_norm": 0.17704185843467712,
7167
+ "learning_rate": 0.001,
7168
+ "loss": 2.6176,
7169
+ "num_input_tokens_seen": 10551291456,
7170
+ "step": 40250
7171
+ },
7172
+ {
7173
+ "epoch": 0.19223201402387396,
7174
+ "grad_norm": 0.2097242772579193,
7175
+ "learning_rate": 0.001,
7176
+ "loss": 2.6509,
7177
+ "num_input_tokens_seen": 10564398656,
7178
+ "step": 40300
7179
+ },
7180
+ {
7181
+ "epoch": 0.19247051528196812,
7182
+ "grad_norm": 0.18630579113960266,
7183
+ "learning_rate": 0.001,
7184
+ "loss": 2.6273,
7185
+ "num_input_tokens_seen": 10577505856,
7186
+ "step": 40350
7187
+ },
7188
+ {
7189
+ "epoch": 0.19270901654006226,
7190
+ "grad_norm": 0.24162743985652924,
7191
+ "learning_rate": 0.001,
7192
+ "loss": 2.6405,
7193
+ "num_input_tokens_seen": 10590613056,
7194
+ "step": 40400
7195
+ },
7196
+ {
7197
+ "epoch": 0.1929475177981564,
7198
+ "grad_norm": 0.19576874375343323,
7199
+ "learning_rate": 0.001,
7200
+ "loss": 2.6403,
7201
+ "num_input_tokens_seen": 10603720256,
7202
+ "step": 40450
7203
+ },
7204
+ {
7205
+ "epoch": 0.19318601905625052,
7206
+ "grad_norm": 0.18408045172691345,
7207
+ "learning_rate": 0.001,
7208
+ "loss": 2.6149,
7209
+ "num_input_tokens_seen": 10616827456,
7210
+ "step": 40500
7211
+ },
7212
+ {
7213
+ "epoch": 0.19318601905625052,
7214
+ "eval_loss": 2.511899709701538,
7215
+ "eval_runtime": 51.5326,
7216
+ "eval_samples_per_second": 97.026,
7217
+ "eval_steps_per_second": 24.257,
7218
+ "num_input_tokens_seen": 10616827456,
7219
+ "step": 40500
7220
+ },
7221
+ {
7222
+ "epoch": 0.19342452031434465,
7223
+ "grad_norm": 0.20845313370227814,
7224
+ "learning_rate": 0.001,
7225
+ "loss": 2.6242,
7226
+ "num_input_tokens_seen": 10629934656,
7227
+ "step": 40550
7228
+ },
7229
+ {
7230
+ "epoch": 0.19366302157243878,
7231
+ "grad_norm": 0.20603816211223602,
7232
+ "learning_rate": 0.001,
7233
+ "loss": 2.6305,
7234
+ "num_input_tokens_seen": 10643041856,
7235
+ "step": 40600
7236
+ },
7237
+ {
7238
+ "epoch": 0.19390152283053294,
7239
+ "grad_norm": 0.2180013507604599,
7240
+ "learning_rate": 0.001,
7241
+ "loss": 2.6271,
7242
+ "num_input_tokens_seen": 10656149056,
7243
+ "step": 40650
7244
+ },
7245
+ {
7246
+ "epoch": 0.19414002408862707,
7247
+ "grad_norm": 0.22217005491256714,
7248
+ "learning_rate": 0.001,
7249
+ "loss": 2.6407,
7250
+ "num_input_tokens_seen": 10669256256,
7251
+ "step": 40700
7252
+ },
7253
+ {
7254
+ "epoch": 0.1943785253467212,
7255
+ "grad_norm": 0.21379347145557404,
7256
+ "learning_rate": 0.001,
7257
+ "loss": 2.6209,
7258
+ "num_input_tokens_seen": 10682363456,
7259
+ "step": 40750
7260
+ },
7261
+ {
7262
+ "epoch": 0.19461702660481534,
7263
+ "grad_norm": 0.2011626958847046,
7264
+ "learning_rate": 0.001,
7265
+ "loss": 2.6471,
7266
+ "num_input_tokens_seen": 10695470656,
7267
+ "step": 40800
7268
+ },
7269
+ {
7270
+ "epoch": 0.19485552786290947,
7271
+ "grad_norm": 0.1946493685245514,
7272
+ "learning_rate": 0.001,
7273
+ "loss": 2.6267,
7274
+ "num_input_tokens_seen": 10708577856,
7275
+ "step": 40850
7276
+ },
7277
+ {
7278
+ "epoch": 0.19509402912100363,
7279
+ "grad_norm": 0.19157454371452332,
7280
+ "learning_rate": 0.001,
7281
+ "loss": 2.6362,
7282
+ "num_input_tokens_seen": 10721685056,
7283
+ "step": 40900
7284
+ },
7285
+ {
7286
+ "epoch": 0.19533253037909776,
7287
+ "grad_norm": 0.1978122442960739,
7288
+ "learning_rate": 0.001,
7289
+ "loss": 2.6448,
7290
+ "num_input_tokens_seen": 10734792256,
7291
+ "step": 40950
7292
+ },
7293
+ {
7294
+ "epoch": 0.1955710316371919,
7295
+ "grad_norm": 0.19996555149555206,
7296
+ "learning_rate": 0.001,
7297
+ "loss": 2.626,
7298
+ "num_input_tokens_seen": 10747899456,
7299
+ "step": 41000
7300
+ },
7301
+ {
7302
+ "epoch": 0.1955710316371919,
7303
+ "eval_loss": 2.5084941387176514,
7304
+ "eval_runtime": 51.6987,
7305
+ "eval_samples_per_second": 96.714,
7306
+ "eval_steps_per_second": 24.179,
7307
+ "num_input_tokens_seen": 10747899456,
7308
+ "step": 41000
7309
  }
7310
  ],
7311
  "logging_steps": 50,
7312
  "max_steps": 70000,
7313
+ "num_input_tokens_seen": 10747899456,
7314
  "num_train_epochs": 1,
7315
  "save_steps": 1000,
7316
  "stateful_callbacks": {
 
7325
  "attributes": {}
7326
  }
7327
  },
7328
+ "total_flos": 2.8751680039786906e+18,
7329
  "train_batch_size": 64,
7330
  "trial_name": null,
7331
  "trial_params": null