Azrail commited on
Commit
1008508
·
verified ·
1 Parent(s): d579a14

Training in progress, step 7000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f12e4b4cd151eaa16ec86c3f95ae395991c7622f58c2e7d2e74c474e3b36e760
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96bc77814a9f0609f35da88bd498f09fe67c0ec6e846f95d060a6c4b10a82b17
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37d9c729674e95fb1e29d44967d84df6c7f3c27e76670f3f7d480a455ded0987
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:124d39c4cbb90f633e078a9fc2ec3d64bd1a263157af6537f15cbfbe1041beb8
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f55de071972b9763cbcf2a8de91813bfc16f9cfb1e09299e92ce7c238a6f40c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f709c9b1d30b7448b89cdbd3384985d3e5e1529a919cf635b9e29a00476e9e36
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fae5b4037609f8d0983c46601237411cdbc2481ef3d858df1d7dd4ab2f6d6072
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cc38559b8af23697273496301060a7de66bceefb1f96fb5cbe737d317e8e471
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.13179625945741127,
6
  "eval_steps": 500,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1076,11 +1076,189 @@
1076
  "eval_steps_per_second": 18.973,
1077
  "num_input_tokens_seen": 6291456000,
1078
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1079
  }
1080
  ],
1081
  "logging_steps": 50,
1082
  "max_steps": 200000,
1083
- "num_input_tokens_seen": 6291456000,
1084
  "num_train_epochs": 5,
1085
  "save_steps": 1000,
1086
  "stateful_callbacks": {
@@ -1095,7 +1273,7 @@
1095
  "attributes": {}
1096
  }
1097
  },
1098
- "total_flos": 3.583030295789568e+18,
1099
  "train_batch_size": 64,
1100
  "trial_name": null,
1101
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.15376230270031316,
6
  "eval_steps": 500,
7
+ "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1076
  "eval_steps_per_second": 18.973,
1077
  "num_input_tokens_seen": 6291456000,
1078
  "step": 6000
1079
+ },
1080
+ {
1081
+ "epoch": 0.13289456161955637,
1082
+ "grad_norm": 0.11629872024059296,
1083
+ "learning_rate": 0.001,
1084
+ "loss": 2.9406,
1085
+ "num_input_tokens_seen": 6343884800,
1086
+ "step": 6050
1087
+ },
1088
+ {
1089
+ "epoch": 0.13399286378170147,
1090
+ "grad_norm": 0.13740529119968414,
1091
+ "learning_rate": 0.001,
1092
+ "loss": 2.9343,
1093
+ "num_input_tokens_seen": 6396313600,
1094
+ "step": 6100
1095
+ },
1096
+ {
1097
+ "epoch": 0.13509116594384657,
1098
+ "grad_norm": 0.11548039317131042,
1099
+ "learning_rate": 0.001,
1100
+ "loss": 2.9374,
1101
+ "num_input_tokens_seen": 6448742400,
1102
+ "step": 6150
1103
+ },
1104
+ {
1105
+ "epoch": 0.13618946810599164,
1106
+ "grad_norm": 0.11710146814584732,
1107
+ "learning_rate": 0.001,
1108
+ "loss": 2.9376,
1109
+ "num_input_tokens_seen": 6501171200,
1110
+ "step": 6200
1111
+ },
1112
+ {
1113
+ "epoch": 0.13728777026813674,
1114
+ "grad_norm": 0.11223472654819489,
1115
+ "learning_rate": 0.001,
1116
+ "loss": 2.9284,
1117
+ "num_input_tokens_seen": 6553600000,
1118
+ "step": 6250
1119
+ },
1120
+ {
1121
+ "epoch": 0.13838607243028184,
1122
+ "grad_norm": 0.12880656123161316,
1123
+ "learning_rate": 0.001,
1124
+ "loss": 2.9303,
1125
+ "num_input_tokens_seen": 6606028800,
1126
+ "step": 6300
1127
+ },
1128
+ {
1129
+ "epoch": 0.13948437459242694,
1130
+ "grad_norm": 0.11898139119148254,
1131
+ "learning_rate": 0.001,
1132
+ "loss": 2.9246,
1133
+ "num_input_tokens_seen": 6658457600,
1134
+ "step": 6350
1135
+ },
1136
+ {
1137
+ "epoch": 0.14058267675457203,
1138
+ "grad_norm": 0.11154898256063461,
1139
+ "learning_rate": 0.001,
1140
+ "loss": 2.9254,
1141
+ "num_input_tokens_seen": 6710886400,
1142
+ "step": 6400
1143
+ },
1144
+ {
1145
+ "epoch": 0.14168097891671713,
1146
+ "grad_norm": 0.12669232487678528,
1147
+ "learning_rate": 0.001,
1148
+ "loss": 2.9162,
1149
+ "num_input_tokens_seen": 6763315200,
1150
+ "step": 6450
1151
+ },
1152
+ {
1153
+ "epoch": 0.1427792810788622,
1154
+ "grad_norm": 0.12259842455387115,
1155
+ "learning_rate": 0.001,
1156
+ "loss": 2.9179,
1157
+ "num_input_tokens_seen": 6815744000,
1158
+ "step": 6500
1159
+ },
1160
+ {
1161
+ "epoch": 0.1427792810788622,
1162
+ "eval_loss": 2.8220207691192627,
1163
+ "eval_runtime": 65.2868,
1164
+ "eval_samples_per_second": 76.585,
1165
+ "eval_steps_per_second": 19.146,
1166
+ "num_input_tokens_seen": 6815744000,
1167
+ "step": 6500
1168
+ },
1169
+ {
1170
+ "epoch": 0.1438775832410073,
1171
+ "grad_norm": 0.13403092324733734,
1172
+ "learning_rate": 0.001,
1173
+ "loss": 2.9102,
1174
+ "num_input_tokens_seen": 6868172800,
1175
+ "step": 6550
1176
+ },
1177
+ {
1178
+ "epoch": 0.1449758854031524,
1179
+ "grad_norm": 0.13063696026802063,
1180
+ "learning_rate": 0.001,
1181
+ "loss": 2.9112,
1182
+ "num_input_tokens_seen": 6920601600,
1183
+ "step": 6600
1184
+ },
1185
+ {
1186
+ "epoch": 0.1460741875652975,
1187
+ "grad_norm": 0.11871635168790817,
1188
+ "learning_rate": 0.001,
1189
+ "loss": 2.9085,
1190
+ "num_input_tokens_seen": 6973030400,
1191
+ "step": 6650
1192
+ },
1193
+ {
1194
+ "epoch": 0.1471724897274426,
1195
+ "grad_norm": 0.11007633060216904,
1196
+ "learning_rate": 0.001,
1197
+ "loss": 2.9098,
1198
+ "num_input_tokens_seen": 7025459200,
1199
+ "step": 6700
1200
+ },
1201
+ {
1202
+ "epoch": 0.1482707918895877,
1203
+ "grad_norm": 0.10521857440471649,
1204
+ "learning_rate": 0.001,
1205
+ "loss": 2.9086,
1206
+ "num_input_tokens_seen": 7077888000,
1207
+ "step": 6750
1208
+ },
1209
+ {
1210
+ "epoch": 0.14936909405173276,
1211
+ "grad_norm": 0.11179310083389282,
1212
+ "learning_rate": 0.001,
1213
+ "loss": 2.9066,
1214
+ "num_input_tokens_seen": 7130316800,
1215
+ "step": 6800
1216
+ },
1217
+ {
1218
+ "epoch": 0.15046739621387786,
1219
+ "grad_norm": 0.1192353144288063,
1220
+ "learning_rate": 0.001,
1221
+ "loss": 2.9135,
1222
+ "num_input_tokens_seen": 7182745600,
1223
+ "step": 6850
1224
+ },
1225
+ {
1226
+ "epoch": 0.15156569837602296,
1227
+ "grad_norm": 0.11084350198507309,
1228
+ "learning_rate": 0.001,
1229
+ "loss": 2.9054,
1230
+ "num_input_tokens_seen": 7235174400,
1231
+ "step": 6900
1232
+ },
1233
+ {
1234
+ "epoch": 0.15266400053816806,
1235
+ "grad_norm": 0.11826325207948685,
1236
+ "learning_rate": 0.001,
1237
+ "loss": 2.9054,
1238
+ "num_input_tokens_seen": 7287603200,
1239
+ "step": 6950
1240
+ },
1241
+ {
1242
+ "epoch": 0.15376230270031316,
1243
+ "grad_norm": 0.12597590684890747,
1244
+ "learning_rate": 0.001,
1245
+ "loss": 2.8945,
1246
+ "num_input_tokens_seen": 7340032000,
1247
+ "step": 7000
1248
+ },
1249
+ {
1250
+ "epoch": 0.15376230270031316,
1251
+ "eval_loss": 2.802734851837158,
1252
+ "eval_runtime": 65.3332,
1253
+ "eval_samples_per_second": 76.531,
1254
+ "eval_steps_per_second": 19.133,
1255
+ "num_input_tokens_seen": 7340032000,
1256
+ "step": 7000
1257
  }
1258
  ],
1259
  "logging_steps": 50,
1260
  "max_steps": 200000,
1261
+ "num_input_tokens_seen": 7340032000,
1262
  "num_train_epochs": 5,
1263
  "save_steps": 1000,
1264
  "stateful_callbacks": {
 
1273
  "attributes": {}
1274
  }
1275
  },
1276
+ "total_flos": 4.180202011754496e+18,
1277
  "train_batch_size": 64,
1278
  "trial_name": null,
1279
  "trial_params": null