ljcamargo commited on
Commit
ae3632b
·
verified ·
1 Parent(s): ff6ecd1

Training in progress, step 1800, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee0cd03d3a9be3023a0e3720a6e91db11f47e20d8e7bec88e3c1220ca8a10eaa
3
  size 3237818848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30cf4ddc8138dc0b63c04cf5856ccaefc44f54d57161548a2bcf67587713dfed
3
  size 3237818848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0523d436c3449c90448d00f0c9ea8840e7e341f44632cc2e10b78b0d80da3e7c
3
  size 2062251569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c88d4612f6436cb0270beb0bb2ab7cbb57317eafb7b87764e12d36ec083c260
3
  size 2062251569
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:191b991347426ecc0aa235378fd9d2fce0ab0d707a85beb25ac14245f68ee477
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3b789b883f13ca849e56997deda5a819a4b325b5d103e882990a667f22165d3
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4c90e73b569a38f99c2197447433676c2eaa22ce221aeecf0a7d6e7d0501c17
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1547aae10ac7691e1716f567b08e3b4d274fa923879a48af8c2bb55c815a28a2
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8c49d54f38ea4c21892dfde13ddaac2daecfb954dcbad06d74b64fe3dec95fd
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f5a59feb5a16bc7cf6785205b16a58a4ce06c6d1cd586567a10fcc2307ab6fc
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6666666666666666,
6
  "eval_steps": 300,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1070,6 +1070,216 @@
1070
  "learning_rate": 5.231958800515164e-05,
1071
  "loss": 1.0044,
1072
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1073
  }
1074
  ],
1075
  "logging_steps": 10,
@@ -1089,7 +1299,7 @@
1089
  "attributes": {}
1090
  }
1091
  },
1092
- "total_flos": 4.091882766336e+19,
1093
  "train_batch_size": 4,
1094
  "trial_name": null,
1095
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8,
6
  "eval_steps": 300,
7
+ "global_step": 1800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1070
  "learning_rate": 5.231958800515164e-05,
1071
  "loss": 1.0044,
1072
  "step": 1500
1073
+ },
1074
+ {
1075
+ "epoch": 0.6711111111111111,
1076
+ "grad_norm": 7.949609279632568,
1077
+ "learning_rate": 5.107265654859855e-05,
1078
+ "loss": 1.0194,
1079
+ "step": 1510
1080
+ },
1081
+ {
1082
+ "epoch": 0.6755555555555556,
1083
+ "grad_norm": 8.028242111206055,
1084
+ "learning_rate": 4.983564786433763e-05,
1085
+ "loss": 0.9705,
1086
+ "step": 1520
1087
+ },
1088
+ {
1089
+ "epoch": 0.68,
1090
+ "grad_norm": 8.18526840209961,
1091
+ "learning_rate": 4.860881282549285e-05,
1092
+ "loss": 0.9802,
1093
+ "step": 1530
1094
+ },
1095
+ {
1096
+ "epoch": 0.6844444444444444,
1097
+ "grad_norm": 9.321311950683594,
1098
+ "learning_rate": 4.739240024190904e-05,
1099
+ "loss": 0.9649,
1100
+ "step": 1540
1101
+ },
1102
+ {
1103
+ "epoch": 0.6888888888888889,
1104
+ "grad_norm": 10.959417343139648,
1105
+ "learning_rate": 4.618665680969163e-05,
1106
+ "loss": 0.9957,
1107
+ "step": 1550
1108
+ },
1109
+ {
1110
+ "epoch": 0.6933333333333334,
1111
+ "grad_norm": 9.302586555480957,
1112
+ "learning_rate": 4.49918270611752e-05,
1113
+ "loss": 0.9833,
1114
+ "step": 1560
1115
+ },
1116
+ {
1117
+ "epoch": 0.6977777777777778,
1118
+ "grad_norm": 7.047448635101318,
1119
+ "learning_rate": 4.380815331533088e-05,
1120
+ "loss": 1.0179,
1121
+ "step": 1570
1122
+ },
1123
+ {
1124
+ "epoch": 0.7022222222222222,
1125
+ "grad_norm": 9.307101249694824,
1126
+ "learning_rate": 4.2635875628622345e-05,
1127
+ "loss": 0.9883,
1128
+ "step": 1580
1129
+ },
1130
+ {
1131
+ "epoch": 0.7066666666666667,
1132
+ "grad_norm": 8.306827545166016,
1133
+ "learning_rate": 4.147523174632103e-05,
1134
+ "loss": 0.984,
1135
+ "step": 1590
1136
+ },
1137
+ {
1138
+ "epoch": 0.7111111111111111,
1139
+ "grad_norm": 9.073155403137207,
1140
+ "learning_rate": 4.032645705428985e-05,
1141
+ "loss": 0.9916,
1142
+ "step": 1600
1143
+ },
1144
+ {
1145
+ "epoch": 0.7155555555555555,
1146
+ "grad_norm": 11.148294448852539,
1147
+ "learning_rate": 3.9189784531245334e-05,
1148
+ "loss": 0.993,
1149
+ "step": 1610
1150
+ },
1151
+ {
1152
+ "epoch": 0.72,
1153
+ "grad_norm": 7.878681659698486,
1154
+ "learning_rate": 3.806544470150831e-05,
1155
+ "loss": 0.9733,
1156
+ "step": 1620
1157
+ },
1158
+ {
1159
+ "epoch": 0.7244444444444444,
1160
+ "grad_norm": 9.204869270324707,
1161
+ "learning_rate": 3.6953665588251984e-05,
1162
+ "loss": 0.9689,
1163
+ "step": 1630
1164
+ },
1165
+ {
1166
+ "epoch": 0.7288888888888889,
1167
+ "grad_norm": 8.391727447509766,
1168
+ "learning_rate": 3.585467266725737e-05,
1169
+ "loss": 0.9782,
1170
+ "step": 1640
1171
+ },
1172
+ {
1173
+ "epoch": 0.7333333333333333,
1174
+ "grad_norm": 6.572085857391357,
1175
+ "learning_rate": 3.4768688821185566e-05,
1176
+ "loss": 0.9548,
1177
+ "step": 1650
1178
+ },
1179
+ {
1180
+ "epoch": 0.7377777777777778,
1181
+ "grad_norm": 9.943083763122559,
1182
+ "learning_rate": 3.3695934294375544e-05,
1183
+ "loss": 0.9904,
1184
+ "step": 1660
1185
+ },
1186
+ {
1187
+ "epoch": 0.7422222222222222,
1188
+ "grad_norm": 8.165312767028809,
1189
+ "learning_rate": 3.263662664817728e-05,
1190
+ "loss": 0.9728,
1191
+ "step": 1670
1192
+ },
1193
+ {
1194
+ "epoch": 0.7466666666666667,
1195
+ "grad_norm": 9.635257720947266,
1196
+ "learning_rate": 3.15909807168291e-05,
1197
+ "loss": 0.961,
1198
+ "step": 1680
1199
+ },
1200
+ {
1201
+ "epoch": 0.7511111111111111,
1202
+ "grad_norm": 7.636417865753174,
1203
+ "learning_rate": 3.055920856388779e-05,
1204
+ "loss": 0.9403,
1205
+ "step": 1690
1206
+ },
1207
+ {
1208
+ "epoch": 0.7555555555555555,
1209
+ "grad_norm": 6.770568370819092,
1210
+ "learning_rate": 2.95415194392207e-05,
1211
+ "loss": 0.9484,
1212
+ "step": 1700
1213
+ },
1214
+ {
1215
+ "epoch": 0.76,
1216
+ "grad_norm": 7.254674434661865,
1217
+ "learning_rate": 2.8538119736568845e-05,
1218
+ "loss": 0.9701,
1219
+ "step": 1710
1220
+ },
1221
+ {
1222
+ "epoch": 0.7644444444444445,
1223
+ "grad_norm": 8.287463188171387,
1224
+ "learning_rate": 2.7549212951688598e-05,
1225
+ "loss": 0.9591,
1226
+ "step": 1720
1227
+ },
1228
+ {
1229
+ "epoch": 0.7688888888888888,
1230
+ "grad_norm": 8.489920616149902,
1231
+ "learning_rate": 2.6574999641081812e-05,
1232
+ "loss": 0.9285,
1233
+ "step": 1730
1234
+ },
1235
+ {
1236
+ "epoch": 0.7733333333333333,
1237
+ "grad_norm": 7.725697994232178,
1238
+ "learning_rate": 2.561567738132149e-05,
1239
+ "loss": 0.8912,
1240
+ "step": 1740
1241
+ },
1242
+ {
1243
+ "epoch": 0.7777777777777778,
1244
+ "grad_norm": 8.986964225769043,
1245
+ "learning_rate": 2.467144072898202e-05,
1246
+ "loss": 0.9386,
1247
+ "step": 1750
1248
+ },
1249
+ {
1250
+ "epoch": 0.7822222222222223,
1251
+ "grad_norm": 8.926631927490234,
1252
+ "learning_rate": 2.3742481181182065e-05,
1253
+ "loss": 0.9224,
1254
+ "step": 1760
1255
+ },
1256
+ {
1257
+ "epoch": 0.7866666666666666,
1258
+ "grad_norm": 7.921815395355225,
1259
+ "learning_rate": 2.2828987136747505e-05,
1260
+ "loss": 0.9393,
1261
+ "step": 1770
1262
+ },
1263
+ {
1264
+ "epoch": 0.7911111111111111,
1265
+ "grad_norm": 6.680901050567627,
1266
+ "learning_rate": 2.193114385800309e-05,
1267
+ "loss": 0.9359,
1268
+ "step": 1780
1269
+ },
1270
+ {
1271
+ "epoch": 0.7955555555555556,
1272
+ "grad_norm": 6.957186698913574,
1273
+ "learning_rate": 2.104913343320013e-05,
1274
+ "loss": 0.9285,
1275
+ "step": 1790
1276
+ },
1277
+ {
1278
+ "epoch": 0.8,
1279
+ "grad_norm": 7.6232008934021,
1280
+ "learning_rate": 2.0183134739587807e-05,
1281
+ "loss": 0.9083,
1282
+ "step": 1800
1283
  }
1284
  ],
1285
  "logging_steps": 10,
 
1299
  "attributes": {}
1300
  }
1301
  },
1302
+ "total_flos": 4.9102593196032e+19,
1303
  "train_batch_size": 4,
1304
  "trial_name": null,
1305
  "trial_params": null