Azrail commited on
Commit
37536e5
·
verified ·
1 Parent(s): cf5f2d6

Training in progress, step 46000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3ffbf5a816a6aa824466bdde4390b737dfef3183acb26f39844f7b4017bf30d
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa89571eb3340eba1a67ab65cc95a52de52c688ab135a582ba9671de6b4b9b2b
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6b19fbbf1f84052b99affd1a4abf045aa0dc4dae5e3396c29093fc71d96182f
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee8593e17fbb590b6be9983a2252f2eb629b591782e538eabf2da48b5e3443f7
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2efdaece0c1a392cf0dde4c3fd595f174e50c13358c4a6e5301669f684c3b3b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fdfef9e83b1fd0865026b3e547285feb0ce1b439ee58282cde4fbaa3e21a682
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4373b3ab47408a8ab65ab61c7aee7bfdf3c940344f36a198973da2bfc9da86a8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c85b77405559b6f9d3b974ee441baee89ea00505d86e9a6015f23da9cbeb2cb5
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2146511322847228,
6
  "eval_steps": 500,
7
- "global_step": 45000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8018,11 +8018,189 @@
8018
  "eval_steps_per_second": 24.43,
8019
  "num_input_tokens_seen": 11796475456,
8020
  "step": 45000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8021
  }
8022
  ],
8023
  "logging_steps": 50,
8024
  "max_steps": 70000,
8025
- "num_input_tokens_seen": 11796475456,
8026
  "num_train_epochs": 1,
8027
  "save_steps": 1000,
8028
  "stateful_callbacks": {
@@ -8037,7 +8215,7 @@
8037
  "attributes": {}
8038
  }
8039
  },
8040
- "total_flos": 3.1556723180804506e+18,
8041
  "train_batch_size": 64,
8042
  "trial_name": null,
8043
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.21942115744660554,
6
  "eval_steps": 500,
7
+ "global_step": 46000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8018
  "eval_steps_per_second": 24.43,
8019
  "num_input_tokens_seen": 11796475456,
8020
  "step": 45000
8021
+ },
8022
+ {
8023
+ "epoch": 0.21488963354281693,
8024
+ "grad_norm": 0.226406991481781,
8025
+ "learning_rate": 0.001,
8026
+ "loss": 2.626,
8027
+ "num_input_tokens_seen": 11809582656,
8028
+ "step": 45050
8029
+ },
8030
+ {
8031
+ "epoch": 0.21512813480091109,
8032
+ "grad_norm": 0.20505741238594055,
8033
+ "learning_rate": 0.001,
8034
+ "loss": 2.6095,
8035
+ "num_input_tokens_seen": 11822689856,
8036
+ "step": 45100
8037
+ },
8038
+ {
8039
+ "epoch": 0.21536663605900522,
8040
+ "grad_norm": 0.2917146682739258,
8041
+ "learning_rate": 0.001,
8042
+ "loss": 2.6439,
8043
+ "num_input_tokens_seen": 11835797056,
8044
+ "step": 45150
8045
+ },
8046
+ {
8047
+ "epoch": 0.21560513731709935,
8048
+ "grad_norm": 0.24030283093452454,
8049
+ "learning_rate": 0.001,
8050
+ "loss": 2.6386,
8051
+ "num_input_tokens_seen": 11848904256,
8052
+ "step": 45200
8053
+ },
8054
+ {
8055
+ "epoch": 0.21584363857519348,
8056
+ "grad_norm": 0.1799454241991043,
8057
+ "learning_rate": 0.001,
8058
+ "loss": 2.6344,
8059
+ "num_input_tokens_seen": 11862011456,
8060
+ "step": 45250
8061
+ },
8062
+ {
8063
+ "epoch": 0.2160821398332876,
8064
+ "grad_norm": 0.2093718945980072,
8065
+ "learning_rate": 0.001,
8066
+ "loss": 2.6152,
8067
+ "num_input_tokens_seen": 11875118656,
8068
+ "step": 45300
8069
+ },
8070
+ {
8071
+ "epoch": 0.21632064109138174,
8072
+ "grad_norm": 0.19477079808712006,
8073
+ "learning_rate": 0.001,
8074
+ "loss": 2.622,
8075
+ "num_input_tokens_seen": 11888225856,
8076
+ "step": 45350
8077
+ },
8078
+ {
8079
+ "epoch": 0.2165591423494759,
8080
+ "grad_norm": 0.2764741778373718,
8081
+ "learning_rate": 0.001,
8082
+ "loss": 2.5951,
8083
+ "num_input_tokens_seen": 11901333056,
8084
+ "step": 45400
8085
+ },
8086
+ {
8087
+ "epoch": 0.21679764360757003,
8088
+ "grad_norm": 0.2127208709716797,
8089
+ "learning_rate": 0.001,
8090
+ "loss": 2.6231,
8091
+ "num_input_tokens_seen": 11914440256,
8092
+ "step": 45450
8093
+ },
8094
+ {
8095
+ "epoch": 0.21703614486566417,
8096
+ "grad_norm": 0.21089383959770203,
8097
+ "learning_rate": 0.001,
8098
+ "loss": 2.6099,
8099
+ "num_input_tokens_seen": 11927547456,
8100
+ "step": 45500
8101
+ },
8102
+ {
8103
+ "epoch": 0.21703614486566417,
8104
+ "eval_loss": 2.502464771270752,
8105
+ "eval_runtime": 50.946,
8106
+ "eval_samples_per_second": 98.143,
8107
+ "eval_steps_per_second": 24.536,
8108
+ "num_input_tokens_seen": 11927547456,
8109
+ "step": 45500
8110
+ },
8111
+ {
8112
+ "epoch": 0.2172746461237583,
8113
+ "grad_norm": 0.19550016522407532,
8114
+ "learning_rate": 0.001,
8115
+ "loss": 2.6365,
8116
+ "num_input_tokens_seen": 11940654656,
8117
+ "step": 45550
8118
+ },
8119
+ {
8120
+ "epoch": 0.21751314738185243,
8121
+ "grad_norm": 0.18284358084201813,
8122
+ "learning_rate": 0.001,
8123
+ "loss": 2.6358,
8124
+ "num_input_tokens_seen": 11953761856,
8125
+ "step": 45600
8126
+ },
8127
+ {
8128
+ "epoch": 0.2177516486399466,
8129
+ "grad_norm": 0.21821847558021545,
8130
+ "learning_rate": 0.001,
8131
+ "loss": 2.607,
8132
+ "num_input_tokens_seen": 11966869056,
8133
+ "step": 45650
8134
+ },
8135
+ {
8136
+ "epoch": 0.21799014989804072,
8137
+ "grad_norm": 0.2195073515176773,
8138
+ "learning_rate": 0.001,
8139
+ "loss": 2.6195,
8140
+ "num_input_tokens_seen": 11979976256,
8141
+ "step": 45700
8142
+ },
8143
+ {
8144
+ "epoch": 0.21822865115613485,
8145
+ "grad_norm": 0.19679750502109528,
8146
+ "learning_rate": 0.001,
8147
+ "loss": 2.6259,
8148
+ "num_input_tokens_seen": 11993083456,
8149
+ "step": 45750
8150
+ },
8151
+ {
8152
+ "epoch": 0.21846715241422898,
8153
+ "grad_norm": 0.1985604166984558,
8154
+ "learning_rate": 0.001,
8155
+ "loss": 2.6224,
8156
+ "num_input_tokens_seen": 12006190656,
8157
+ "step": 45800
8158
+ },
8159
+ {
8160
+ "epoch": 0.2187056536723231,
8161
+ "grad_norm": 0.18398787081241608,
8162
+ "learning_rate": 0.001,
8163
+ "loss": 2.6215,
8164
+ "num_input_tokens_seen": 12019297856,
8165
+ "step": 45850
8166
+ },
8167
+ {
8168
+ "epoch": 0.21894415493041725,
8169
+ "grad_norm": 0.2306145578622818,
8170
+ "learning_rate": 0.001,
8171
+ "loss": 2.6346,
8172
+ "num_input_tokens_seen": 12032405056,
8173
+ "step": 45900
8174
+ },
8175
+ {
8176
+ "epoch": 0.2191826561885114,
8177
+ "grad_norm": 0.21335257589817047,
8178
+ "learning_rate": 0.001,
8179
+ "loss": 2.6232,
8180
+ "num_input_tokens_seen": 12045512256,
8181
+ "step": 45950
8182
+ },
8183
+ {
8184
+ "epoch": 0.21942115744660554,
8185
+ "grad_norm": 0.22988814115524292,
8186
+ "learning_rate": 0.001,
8187
+ "loss": 2.6132,
8188
+ "num_input_tokens_seen": 12058619456,
8189
+ "step": 46000
8190
+ },
8191
+ {
8192
+ "epoch": 0.21942115744660554,
8193
+ "eval_loss": 2.499041795730591,
8194
+ "eval_runtime": 50.6868,
8195
+ "eval_samples_per_second": 98.645,
8196
+ "eval_steps_per_second": 24.661,
8197
+ "num_input_tokens_seen": 12058619456,
8198
+ "step": 46000
8199
  }
8200
  ],
8201
  "logging_steps": 50,
8202
  "max_steps": 70000,
8203
+ "num_input_tokens_seen": 12058619456,
8204
  "num_train_epochs": 1,
8205
  "save_steps": 1000,
8206
  "stateful_callbacks": {
 
8215
  "attributes": {}
8216
  }
8217
  },
8218
+ "total_flos": 3.2257983966058906e+18,
8219
  "train_batch_size": 64,
8220
  "trial_name": null,
8221
  "trial_params": null