ljcamargo commited on
Commit
d8c697f
·
verified ·
1 Parent(s): cada83a

Training in progress, step 1800, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e35f58cfc186debe53f8ca77f3187fcc171f64260bf63f1275d8d0b0ab69bede
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b08b1672e2ea4211707e7ae1fc3be628d1c4cfcbac08051e5ed075820a85d750
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef2fb6b56d26498118cb8b387fcedcb4debb46e3fe9c3c47660644efe86198ea
3
  size 2062272049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74263b5f8e059e873949491dec9e7a943acdde886eacc4fbfc309d2296ab82b6
3
  size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6eae40f4428968ab5083d1a5e4e97daade1451ea492899254cef072ae8e7b9d7
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c31bfa9c06956c0b54891b4da88a92b0061c8af3e34c97336d1d69755faea146
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4c90e73b569a38f99c2197447433676c2eaa22ce221aeecf0a7d6e7d0501c17
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1547aae10ac7691e1716f567b08e3b4d274fa923879a48af8c2bb55c815a28a2
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0d3e74929cb15c68f9b787eaa5631a6b89640ebdbca5e2e73c4cb4aa37e0203
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64f93a5d98422b9aaabc9ecb62e3fb6f0d27288e6198f54c3576af914532e165
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4304778303917348,
6
  "eval_steps": 300,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1058,6 +1058,216 @@
1058
  "learning_rate": 0.0001255560358785219,
1059
  "loss": 0.7828,
1060
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1061
  }
1062
  ],
1063
  "logging_steps": 10,
@@ -1077,7 +1287,7 @@
1077
  "attributes": {}
1078
  }
1079
  },
1080
- "total_flos": 6.137824149504e+19,
1081
  "train_batch_size": 6,
1082
  "trial_name": null,
1083
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5165733964700818,
6
  "eval_steps": 300,
7
+ "global_step": 1800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1058
  "learning_rate": 0.0001255560358785219,
1059
  "loss": 0.7828,
1060
  "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.4333476825943464,
1064
+ "grad_norm": 5.9140400886535645,
1065
+ "learning_rate": 0.00012466583707033832,
1066
+ "loss": 0.8044,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.43621753479695796,
1071
+ "grad_norm": 5.575759410858154,
1072
+ "learning_rate": 0.00012377355205793854,
1073
+ "loss": 0.7996,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.4390873869995695,
1078
+ "grad_norm": 6.771875381469727,
1079
+ "learning_rate": 0.00012287925630962107,
1080
+ "loss": 0.8261,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.4419572392021811,
1085
+ "grad_norm": 18.849271774291992,
1086
+ "learning_rate": 0.00012198302546374978,
1087
+ "loss": 0.8224,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.44482709140479265,
1092
+ "grad_norm": 5.645337104797363,
1093
+ "learning_rate": 0.00012108493532235666,
1094
+ "loss": 0.8185,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.4476969436074042,
1099
+ "grad_norm": 4.3476481437683105,
1100
+ "learning_rate": 0.00012018506184473038,
1101
+ "loss": 0.7985,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.4505667958100158,
1106
+ "grad_norm": 8.391561508178711,
1107
+ "learning_rate": 0.00011928348114099195,
1108
+ "loss": 0.7965,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.45343664801262734,
1113
+ "grad_norm": 11.707796096801758,
1114
+ "learning_rate": 0.00011838026946565723,
1115
+ "loss": 0.8174,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.45630650021523894,
1120
+ "grad_norm": 9.046381950378418,
1121
+ "learning_rate": 0.00011747550321118763,
1122
+ "loss": 0.8,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.4591763524178505,
1127
+ "grad_norm": 8.26490306854248,
1128
+ "learning_rate": 0.00011656925890152877,
1129
+ "loss": 0.8229,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.46204620462046203,
1134
+ "grad_norm": 6.398012638092041,
1135
+ "learning_rate": 0.00011566161318563821,
1136
+ "loss": 0.8027,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.46491605682307363,
1141
+ "grad_norm": 5.92479133605957,
1142
+ "learning_rate": 0.0001147526428310027,
1143
+ "loss": 0.8094,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.4677859090256852,
1148
+ "grad_norm": 7.79962158203125,
1149
+ "learning_rate": 0.00011384242471714512,
1150
+ "loss": 0.8049,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.4706557612282967,
1155
+ "grad_norm": 4.564454078674316,
1156
+ "learning_rate": 0.00011293103582912221,
1157
+ "loss": 0.8382,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.4735256134309083,
1162
+ "grad_norm": 20.43712043762207,
1163
+ "learning_rate": 0.00011201855325101332,
1164
+ "loss": 0.829,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.47639546563351987,
1169
+ "grad_norm": 5.778446674346924,
1170
+ "learning_rate": 0.0001111050541594006,
1171
+ "loss": 0.8333,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.47926531783613147,
1176
+ "grad_norm": 5.030070781707764,
1177
+ "learning_rate": 0.00011019061581684165,
1178
+ "loss": 0.769,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.482135170038743,
1183
+ "grad_norm": 5.967840671539307,
1184
+ "learning_rate": 0.00010927531556533456,
1185
+ "loss": 0.8041,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.48500502224135456,
1190
+ "grad_norm": 4.707633972167969,
1191
+ "learning_rate": 0.00010835923081977673,
1192
+ "loss": 0.8105,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.48787487444396616,
1197
+ "grad_norm": 6.354760646820068,
1198
+ "learning_rate": 0.0001074424390614169,
1199
+ "loss": 0.8031,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.4907447266465777,
1204
+ "grad_norm": 6.2033915519714355,
1205
+ "learning_rate": 0.00010652501783130208,
1206
+ "loss": 0.7559,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.49361457884918924,
1211
+ "grad_norm": 3.7331125736236572,
1212
+ "learning_rate": 0.00010560704472371919,
1213
+ "loss": 0.8233,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.49648443105180085,
1218
+ "grad_norm": 9.511772155761719,
1219
+ "learning_rate": 0.00010468859737963217,
1220
+ "loss": 0.7945,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.4993542832544124,
1225
+ "grad_norm": 12.07361125946045,
1226
+ "learning_rate": 0.00010376975348011533,
1227
+ "loss": 0.8368,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.5022241354570239,
1232
+ "grad_norm": 4.957511901855469,
1233
+ "learning_rate": 0.00010285059073978312,
1234
+ "loss": 0.8241,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.5050939876596355,
1239
+ "grad_norm": 4.124336242675781,
1240
+ "learning_rate": 0.00010193118690021699,
1241
+ "loss": 0.807,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.5079638398622471,
1246
+ "grad_norm": 4.789161205291748,
1247
+ "learning_rate": 0.00010101161972339046,
1248
+ "loss": 0.8143,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.5108336920648586,
1253
+ "grad_norm": 5.026962757110596,
1254
+ "learning_rate": 0.00010009196698509173,
1255
+ "loss": 0.7765,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.5137035442674702,
1260
+ "grad_norm": 8.285078048706055,
1261
+ "learning_rate": 9.91723064683458e-05,
1262
+ "loss": 0.8053,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.5165733964700818,
1267
+ "grad_norm": 4.77803897857666,
1268
+ "learning_rate": 9.825271595683548e-05,
1269
+ "loss": 0.8072,
1270
+ "step": 1800
1271
  }
1272
  ],
1273
  "logging_steps": 10,
 
1287
  "attributes": {}
1288
  }
1289
  },
1290
+ "total_flos": 7.3653889794048e+19,
1291
  "train_batch_size": 6,
1292
  "trial_name": null,
1293
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c826a3ab5235a63f61a75099a41de538ae2f6fe824df40b96ea1279de029afd1
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb8fac145ce6d3b844b04932d52e4aba260f48f6c9dc5ba626561ea49a834bfb
3
  size 6033