Azrail commited on
Commit
2742ab3
·
verified ·
1 Parent(s): 38b495b

Training in progress, step 52000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b71ed16611cd95fe8479b9b5158a65681e32cd86fc06fd6104792dca5e0ea90c
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3592942d50fd128f616a1b607af53de041def2895dde8221a2068841bbfc75f
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4d0121bc94172a095cdea5c65ddbc39cc2a2d68c3e7dea1521191e5bf66d6e4
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c28c8ab74c2ab24140a66eba7b08b4da3f0a1c0487aa3d24a61f15278b3cefdb
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:871241677306799dd94bb012f99e77b35a49885274956fc7cf6b8c017fdd6180
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:746267b8ba996549a033d105e363328c635034a7afa0e3070ea8447957aaca5a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38e628546b6b3793b4db9c04b0c48bd7f457b5c91e760c9c29b133754fb90815
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b3fcbecd3d55078c913506015bb6e1182f04ee52bf4c0845fc043823a61161
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.24327128325601918,
6
  "eval_steps": 500,
7
- "global_step": 51000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9086,11 +9086,189 @@
9086
  "eval_steps_per_second": 23.341,
9087
  "num_input_tokens_seen": 13369339456,
9088
  "step": 51000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9089
  }
9090
  ],
9091
  "logging_steps": 50,
9092
  "max_steps": 70000,
9093
- "num_input_tokens_seen": 13369339456,
9094
  "num_train_epochs": 1,
9095
  "save_steps": 1000,
9096
  "stateful_callbacks": {
@@ -9105,7 +9283,7 @@
9105
  "attributes": {}
9106
  }
9107
  },
9108
- "total_flos": 3.5764287892330906e+18,
9109
  "train_batch_size": 64,
9110
  "trial_name": null,
9111
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2480413084179019,
6
  "eval_steps": 500,
7
+ "global_step": 52000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9086
  "eval_steps_per_second": 23.341,
9087
  "num_input_tokens_seen": 13369339456,
9088
  "step": 51000
9089
+ },
9090
+ {
9091
+ "epoch": 0.24350978451411331,
9092
+ "grad_norm": 0.20502831041812897,
9093
+ "learning_rate": 0.001,
9094
+ "loss": 2.6059,
9095
+ "num_input_tokens_seen": 13382446656,
9096
+ "step": 51050
9097
+ },
9098
+ {
9099
+ "epoch": 0.24374828577220745,
9100
+ "grad_norm": 0.20750559866428375,
9101
+ "learning_rate": 0.001,
9102
+ "loss": 2.6056,
9103
+ "num_input_tokens_seen": 13395553856,
9104
+ "step": 51100
9105
+ },
9106
+ {
9107
+ "epoch": 0.24398678703030158,
9108
+ "grad_norm": 0.19882823526859283,
9109
+ "learning_rate": 0.001,
9110
+ "loss": 2.5983,
9111
+ "num_input_tokens_seen": 13408661056,
9112
+ "step": 51150
9113
+ },
9114
+ {
9115
+ "epoch": 0.2442252882883957,
9116
+ "grad_norm": 0.20900660753250122,
9117
+ "learning_rate": 0.001,
9118
+ "loss": 2.6087,
9119
+ "num_input_tokens_seen": 13421768256,
9120
+ "step": 51200
9121
+ },
9122
+ {
9123
+ "epoch": 0.24446378954648987,
9124
+ "grad_norm": 0.21428415179252625,
9125
+ "learning_rate": 0.001,
9126
+ "loss": 2.5901,
9127
+ "num_input_tokens_seen": 13434875456,
9128
+ "step": 51250
9129
+ },
9130
+ {
9131
+ "epoch": 0.244702290804584,
9132
+ "grad_norm": 0.19987250864505768,
9133
+ "learning_rate": 0.001,
9134
+ "loss": 2.5982,
9135
+ "num_input_tokens_seen": 13447982656,
9136
+ "step": 51300
9137
+ },
9138
+ {
9139
+ "epoch": 0.24494079206267813,
9140
+ "grad_norm": 0.2045862078666687,
9141
+ "learning_rate": 0.001,
9142
+ "loss": 2.6058,
9143
+ "num_input_tokens_seen": 13461089856,
9144
+ "step": 51350
9145
+ },
9146
+ {
9147
+ "epoch": 0.24517929332077226,
9148
+ "grad_norm": 0.22261273860931396,
9149
+ "learning_rate": 0.001,
9150
+ "loss": 2.5972,
9151
+ "num_input_tokens_seen": 13474197056,
9152
+ "step": 51400
9153
+ },
9154
+ {
9155
+ "epoch": 0.2454177945788664,
9156
+ "grad_norm": 0.20395706593990326,
9157
+ "learning_rate": 0.001,
9158
+ "loss": 2.6064,
9159
+ "num_input_tokens_seen": 13487304256,
9160
+ "step": 51450
9161
+ },
9162
+ {
9163
+ "epoch": 0.24565629583696055,
9164
+ "grad_norm": 0.21490858495235443,
9165
+ "learning_rate": 0.001,
9166
+ "loss": 2.5922,
9167
+ "num_input_tokens_seen": 13500411456,
9168
+ "step": 51500
9169
+ },
9170
+ {
9171
+ "epoch": 0.24565629583696055,
9172
+ "eval_loss": 2.488300085067749,
9173
+ "eval_runtime": 53.7972,
9174
+ "eval_samples_per_second": 92.942,
9175
+ "eval_steps_per_second": 23.235,
9176
+ "num_input_tokens_seen": 13500411456,
9177
+ "step": 51500
9178
+ },
9179
+ {
9180
+ "epoch": 0.24589479709505468,
9181
+ "grad_norm": 0.2039102464914322,
9182
+ "learning_rate": 0.001,
9183
+ "loss": 2.5894,
9184
+ "num_input_tokens_seen": 13513518656,
9185
+ "step": 51550
9186
+ },
9187
+ {
9188
+ "epoch": 0.24613329835314882,
9189
+ "grad_norm": 0.21426360309123993,
9190
+ "learning_rate": 0.001,
9191
+ "loss": 2.6089,
9192
+ "num_input_tokens_seen": 13526625856,
9193
+ "step": 51600
9194
+ },
9195
+ {
9196
+ "epoch": 0.24637179961124295,
9197
+ "grad_norm": 0.194682314991951,
9198
+ "learning_rate": 0.001,
9199
+ "loss": 2.5932,
9200
+ "num_input_tokens_seen": 13539733056,
9201
+ "step": 51650
9202
+ },
9203
+ {
9204
+ "epoch": 0.24661030086933708,
9205
+ "grad_norm": 0.1901472508907318,
9206
+ "learning_rate": 0.001,
9207
+ "loss": 2.6031,
9208
+ "num_input_tokens_seen": 13552840256,
9209
+ "step": 51700
9210
+ },
9211
+ {
9212
+ "epoch": 0.2468488021274312,
9213
+ "grad_norm": 0.20517823100090027,
9214
+ "learning_rate": 0.001,
9215
+ "loss": 2.5978,
9216
+ "num_input_tokens_seen": 13565947456,
9217
+ "step": 51750
9218
+ },
9219
+ {
9220
+ "epoch": 0.24708730338552537,
9221
+ "grad_norm": 0.23713302612304688,
9222
+ "learning_rate": 0.001,
9223
+ "loss": 2.6061,
9224
+ "num_input_tokens_seen": 13579054656,
9225
+ "step": 51800
9226
+ },
9227
+ {
9228
+ "epoch": 0.2473258046436195,
9229
+ "grad_norm": 0.2431441992521286,
9230
+ "learning_rate": 0.001,
9231
+ "loss": 2.6062,
9232
+ "num_input_tokens_seen": 13592161856,
9233
+ "step": 51850
9234
+ },
9235
+ {
9236
+ "epoch": 0.24756430590171363,
9237
+ "grad_norm": 0.20358557999134064,
9238
+ "learning_rate": 0.001,
9239
+ "loss": 2.6161,
9240
+ "num_input_tokens_seen": 13605269056,
9241
+ "step": 51900
9242
+ },
9243
+ {
9244
+ "epoch": 0.24780280715980776,
9245
+ "grad_norm": 0.21245016157627106,
9246
+ "learning_rate": 0.001,
9247
+ "loss": 2.6166,
9248
+ "num_input_tokens_seen": 13618376256,
9249
+ "step": 51950
9250
+ },
9251
+ {
9252
+ "epoch": 0.2480413084179019,
9253
+ "grad_norm": 0.24295999109745026,
9254
+ "learning_rate": 0.001,
9255
+ "loss": 2.6139,
9256
+ "num_input_tokens_seen": 13631483456,
9257
+ "step": 52000
9258
+ },
9259
+ {
9260
+ "epoch": 0.2480413084179019,
9261
+ "eval_loss": 2.4932186603546143,
9262
+ "eval_runtime": 53.6797,
9263
+ "eval_samples_per_second": 93.145,
9264
+ "eval_steps_per_second": 23.286,
9265
+ "num_input_tokens_seen": 13631483456,
9266
+ "step": 52000
9267
  }
9268
  ],
9269
  "logging_steps": 50,
9270
  "max_steps": 70000,
9271
+ "num_input_tokens_seen": 13631483456,
9272
  "num_train_epochs": 1,
9273
  "save_steps": 1000,
9274
  "stateful_callbacks": {
 
9283
  "attributes": {}
9284
  }
9285
  },
9286
+ "total_flos": 3.6465548677585306e+18,
9287
  "train_batch_size": 64,
9288
  "trial_name": null,
9289
  "trial_params": null