Azrail commited on
Commit
a7c6d2d
·
verified ·
1 Parent(s): 3355861

Training in progress, step 35000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f5f555ad08e2b9af2d37e75627331d1470cb9ef7a24ec49aa6c8151031e160d
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e47431790297c8d1ac0d590138e540ff35b008c08f15b4fec92555b68b3ca0
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fc47eddc881f02400a6f527d01f719f196c43685429cccc1e20595fe30e54fb
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cb4360f6e3ef0a4db7ef43d5c8060cb784d63688538fb77fe4f179313685acd
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e52b818002a3e168d692336e37ab38a41e96baec8febde4371f1197492708c49
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3505914cea5cefe31834749326fbe845962aa02c10480cbc9f90524db4d28f1f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14f558fa3ebdf8eded534006aef123b31a5895b4203fad741964c0be3f8e2ef2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c432826b41d4d9850a94ad79c80845280b64911bf27c831beef66a783066385f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7468454702586639,
6
  "eval_steps": 500,
7
- "global_step": 34000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6060,11 +6060,189 @@
6060
  "eval_steps_per_second": 18.763,
6061
  "num_input_tokens_seen": 35651580160,
6062
  "step": 34000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6063
  }
6064
  ],
6065
  "logging_steps": 50,
6066
  "max_steps": 200000,
6067
- "num_input_tokens_seen": 35651580160,
6068
  "num_train_epochs": 5,
6069
  "save_steps": 1000,
6070
  "stateful_callbacks": {
@@ -6079,7 +6257,7 @@
6079
  "attributes": {}
6080
  }
6081
  },
6082
- "total_flos": 2.0303836155899412e+19,
6083
  "train_batch_size": 64,
6084
  "trial_name": null,
6085
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7688115135015657,
6
  "eval_steps": 500,
7
+ "global_step": 35000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6060
  "eval_steps_per_second": 18.763,
6061
  "num_input_tokens_seen": 35651580160,
6062
  "step": 34000
6063
+ },
6064
+ {
6065
+ "epoch": 0.747943772420809,
6066
+ "grad_norm": 0.1586858183145523,
6067
+ "learning_rate": 0.001,
6068
+ "loss": 2.6654,
6069
+ "num_input_tokens_seen": 35704008960,
6070
+ "step": 34050
6071
+ },
6072
+ {
6073
+ "epoch": 0.749042074582954,
6074
+ "grad_norm": 0.1376073956489563,
6075
+ "learning_rate": 0.001,
6076
+ "loss": 2.6627,
6077
+ "num_input_tokens_seen": 35756437760,
6078
+ "step": 34100
6079
+ },
6080
+ {
6081
+ "epoch": 0.7501403767450991,
6082
+ "grad_norm": 0.13904818892478943,
6083
+ "learning_rate": 0.001,
6084
+ "loss": 2.6605,
6085
+ "num_input_tokens_seen": 35808866560,
6086
+ "step": 34150
6087
+ },
6088
+ {
6089
+ "epoch": 0.7512386789072443,
6090
+ "grad_norm": 0.14543947577476501,
6091
+ "learning_rate": 0.001,
6092
+ "loss": 2.6589,
6093
+ "num_input_tokens_seen": 35861295360,
6094
+ "step": 34200
6095
+ },
6096
+ {
6097
+ "epoch": 0.7523369810693894,
6098
+ "grad_norm": 0.14855198562145233,
6099
+ "learning_rate": 0.001,
6100
+ "loss": 2.6612,
6101
+ "num_input_tokens_seen": 35913724160,
6102
+ "step": 34250
6103
+ },
6104
+ {
6105
+ "epoch": 0.7534352832315344,
6106
+ "grad_norm": 0.14492908120155334,
6107
+ "learning_rate": 0.001,
6108
+ "loss": 2.6561,
6109
+ "num_input_tokens_seen": 35966152960,
6110
+ "step": 34300
6111
+ },
6112
+ {
6113
+ "epoch": 0.7545335853936795,
6114
+ "grad_norm": 0.1388978660106659,
6115
+ "learning_rate": 0.001,
6116
+ "loss": 2.6551,
6117
+ "num_input_tokens_seen": 36018581760,
6118
+ "step": 34350
6119
+ },
6120
+ {
6121
+ "epoch": 0.7556318875558247,
6122
+ "grad_norm": 0.14582422375679016,
6123
+ "learning_rate": 0.001,
6124
+ "loss": 2.6521,
6125
+ "num_input_tokens_seen": 36071010560,
6126
+ "step": 34400
6127
+ },
6128
+ {
6129
+ "epoch": 0.7567301897179697,
6130
+ "grad_norm": 0.17488695681095123,
6131
+ "learning_rate": 0.001,
6132
+ "loss": 2.6516,
6133
+ "num_input_tokens_seen": 36123439360,
6134
+ "step": 34450
6135
+ },
6136
+ {
6137
+ "epoch": 0.7578284918801148,
6138
+ "grad_norm": 0.12302416563034058,
6139
+ "learning_rate": 0.001,
6140
+ "loss": 2.6617,
6141
+ "num_input_tokens_seen": 36175868160,
6142
+ "step": 34500
6143
+ },
6144
+ {
6145
+ "epoch": 0.7578284918801148,
6146
+ "eval_loss": 2.5549991130828857,
6147
+ "eval_runtime": 67.5095,
6148
+ "eval_samples_per_second": 74.064,
6149
+ "eval_steps_per_second": 18.516,
6150
+ "num_input_tokens_seen": 36175868160,
6151
+ "step": 34500
6152
+ },
6153
+ {
6154
+ "epoch": 0.7589267940422599,
6155
+ "grad_norm": 0.14238396286964417,
6156
+ "learning_rate": 0.001,
6157
+ "loss": 2.6609,
6158
+ "num_input_tokens_seen": 36228296960,
6159
+ "step": 34550
6160
+ },
6161
+ {
6162
+ "epoch": 0.7600250962044051,
6163
+ "grad_norm": 0.17919403314590454,
6164
+ "learning_rate": 0.001,
6165
+ "loss": 2.6621,
6166
+ "num_input_tokens_seen": 36280725760,
6167
+ "step": 34600
6168
+ },
6169
+ {
6170
+ "epoch": 0.7611233983665501,
6171
+ "grad_norm": 0.13188666105270386,
6172
+ "learning_rate": 0.001,
6173
+ "loss": 2.6529,
6174
+ "num_input_tokens_seen": 36333154560,
6175
+ "step": 34650
6176
+ },
6177
+ {
6178
+ "epoch": 0.7622217005286952,
6179
+ "grad_norm": 0.16191646456718445,
6180
+ "learning_rate": 0.001,
6181
+ "loss": 2.6584,
6182
+ "num_input_tokens_seen": 36385583360,
6183
+ "step": 34700
6184
+ },
6185
+ {
6186
+ "epoch": 0.7633200026908403,
6187
+ "grad_norm": 0.14606165885925293,
6188
+ "learning_rate": 0.001,
6189
+ "loss": 2.6567,
6190
+ "num_input_tokens_seen": 36438012160,
6191
+ "step": 34750
6192
+ },
6193
+ {
6194
+ "epoch": 0.7644183048529853,
6195
+ "grad_norm": 0.1648443192243576,
6196
+ "learning_rate": 0.001,
6197
+ "loss": 2.6587,
6198
+ "num_input_tokens_seen": 36490440960,
6199
+ "step": 34800
6200
+ },
6201
+ {
6202
+ "epoch": 0.7655166070151305,
6203
+ "grad_norm": 0.19523674249649048,
6204
+ "learning_rate": 0.001,
6205
+ "loss": 2.6662,
6206
+ "num_input_tokens_seen": 36542869760,
6207
+ "step": 34850
6208
+ },
6209
+ {
6210
+ "epoch": 0.7666149091772756,
6211
+ "grad_norm": 0.1713179498910904,
6212
+ "learning_rate": 0.001,
6213
+ "loss": 2.6683,
6214
+ "num_input_tokens_seen": 36595298560,
6215
+ "step": 34900
6216
+ },
6217
+ {
6218
+ "epoch": 0.7677132113394207,
6219
+ "grad_norm": 0.14923711121082306,
6220
+ "learning_rate": 0.001,
6221
+ "loss": 2.6629,
6222
+ "num_input_tokens_seen": 36647727360,
6223
+ "step": 34950
6224
+ },
6225
+ {
6226
+ "epoch": 0.7688115135015657,
6227
+ "grad_norm": 0.13948023319244385,
6228
+ "learning_rate": 0.001,
6229
+ "loss": 2.6619,
6230
+ "num_input_tokens_seen": 36700156160,
6231
+ "step": 35000
6232
+ },
6233
+ {
6234
+ "epoch": 0.7688115135015657,
6235
+ "eval_loss": 2.5569379329681396,
6236
+ "eval_runtime": 67.9393,
6237
+ "eval_samples_per_second": 73.595,
6238
+ "eval_steps_per_second": 18.399,
6239
+ "num_input_tokens_seen": 36700156160,
6240
+ "step": 35000
6241
  }
6242
  ],
6243
  "logging_steps": 50,
6244
  "max_steps": 200000,
6245
+ "num_input_tokens_seen": 36700156160,
6246
  "num_train_epochs": 5,
6247
  "save_steps": 1000,
6248
  "stateful_callbacks": {
 
6257
  "attributes": {}
6258
  }
6259
  },
6260
+ "total_flos": 2.090100787186434e+19,
6261
  "train_batch_size": 64,
6262
  "trial_name": null,
6263
  "trial_params": null