tsor13 committed on
Commit
9cbdcd0
·
verified ·
1 Parent(s): 55f6af7

Initial upload of fine‑tuned Gemma + custom tokenizer

Browse files
model-00001-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b4985a44b48184c143af3be5ef245eb45ecbb681e2ec90a4cba5ffe6f095f1b
3
  size 4979902192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:482be2455e637e399de5e4c8afaec5d2675ec11f89137e526362b82b23cedb4b
3
  size 4979902192
model-00002-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7b2e6c318860d148d9bea44b00483547b2ea9dd51e5b1214e833c256e5246ff
3
  size 4931296592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43f80968dd4e284d30ed74416856e65c7344cb8ed822d3f833e711249692160e
3
  size 4931296592
model-00003-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:615d3804cc1fd49aa251b169db0ecb27ca6a6a7c35a47cea5b3d3b384302ee97
3
  size 4931296656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0eb6398002e8b6604cef8459094542dcccbfd027bacd105d4a9c82f95ea16192
3
  size 4931296656
model-00004-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b048f3a29f3dddc0fbbe3e90ecff68d76073458a01befffaa93289bca652f80
3
  size 4931296656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c9e1e56ed4e0108727c4c549ee28ec2af3da9f48369c8fad1527a2978d19e20
3
  size 4931296656
model-00005-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b7e987800cdca2218c8a4d14167ddb87ee3396cef0b40a282557ec708e92f2c
3
  size 4601000928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48f435f220bcf30ac3f7e864f113cdf55f1c2d86b35674047ab703a957ec75a5
3
  size 4601000928
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f614219902f7ff2c52abf6315720ed82231175199ef520d9eb861fdd96b6edb8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81e125f2538e139197e1ffe42d2e924ff12ed600a2b3c103b6ba772812143b1d
3
  size 1465
tokenizer_config.json CHANGED
@@ -1107,7 +1107,7 @@
1107
  "special": false
1108
  },
1109
  "138": {
1110
- "content": "\u2581\u2581",
1111
  "lstrip": false,
1112
  "normalized": false,
1113
  "rstrip": false,
@@ -1115,7 +1115,7 @@
1115
  "special": false
1116
  },
1117
  "139": {
1118
- "content": "\u2581\u2581\u2581",
1119
  "lstrip": false,
1120
  "normalized": false,
1121
  "rstrip": false,
@@ -1123,7 +1123,7 @@
1123
  "special": false
1124
  },
1125
  "140": {
1126
- "content": "\u2581\u2581\u2581\u2581",
1127
  "lstrip": false,
1128
  "normalized": false,
1129
  "rstrip": false,
@@ -1131,7 +1131,7 @@
1131
  "special": false
1132
  },
1133
  "141": {
1134
- "content": "\u2581\u2581\u2581\u2581\u2581",
1135
  "lstrip": false,
1136
  "normalized": false,
1137
  "rstrip": false,
@@ -1139,7 +1139,7 @@
1139
  "special": false
1140
  },
1141
  "142": {
1142
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581",
1143
  "lstrip": false,
1144
  "normalized": false,
1145
  "rstrip": false,
@@ -1147,7 +1147,7 @@
1147
  "special": false
1148
  },
1149
  "143": {
1150
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1151
  "lstrip": false,
1152
  "normalized": false,
1153
  "rstrip": false,
@@ -1155,7 +1155,7 @@
1155
  "special": false
1156
  },
1157
  "144": {
1158
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1159
  "lstrip": false,
1160
  "normalized": false,
1161
  "rstrip": false,
@@ -1163,7 +1163,7 @@
1163
  "special": false
1164
  },
1165
  "145": {
1166
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1167
  "lstrip": false,
1168
  "normalized": false,
1169
  "rstrip": false,
@@ -1171,7 +1171,7 @@
1171
  "special": false
1172
  },
1173
  "146": {
1174
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1175
  "lstrip": false,
1176
  "normalized": false,
1177
  "rstrip": false,
@@ -1179,7 +1179,7 @@
1179
  "special": false
1180
  },
1181
  "147": {
1182
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1183
  "lstrip": false,
1184
  "normalized": false,
1185
  "rstrip": false,
@@ -1187,7 +1187,7 @@
1187
  "special": false
1188
  },
1189
  "148": {
1190
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1191
  "lstrip": false,
1192
  "normalized": false,
1193
  "rstrip": false,
@@ -1195,7 +1195,7 @@
1195
  "special": false
1196
  },
1197
  "149": {
1198
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1199
  "lstrip": false,
1200
  "normalized": false,
1201
  "rstrip": false,
@@ -1203,7 +1203,7 @@
1203
  "special": false
1204
  },
1205
  "150": {
1206
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1207
  "lstrip": false,
1208
  "normalized": false,
1209
  "rstrip": false,
@@ -1211,7 +1211,7 @@
1211
  "special": false
1212
  },
1213
  "151": {
1214
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1215
  "lstrip": false,
1216
  "normalized": false,
1217
  "rstrip": false,
@@ -1219,7 +1219,7 @@
1219
  "special": false
1220
  },
1221
  "152": {
1222
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1223
  "lstrip": false,
1224
  "normalized": false,
1225
  "rstrip": false,
@@ -1227,7 +1227,7 @@
1227
  "special": false
1228
  },
1229
  "153": {
1230
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1231
  "lstrip": false,
1232
  "normalized": false,
1233
  "rstrip": false,
@@ -1235,7 +1235,7 @@
1235
  "special": false
1236
  },
1237
  "154": {
1238
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1239
  "lstrip": false,
1240
  "normalized": false,
1241
  "rstrip": false,
@@ -1243,7 +1243,7 @@
1243
  "special": false
1244
  },
1245
  "155": {
1246
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1247
  "lstrip": false,
1248
  "normalized": false,
1249
  "rstrip": false,
@@ -1251,7 +1251,7 @@
1251
  "special": false
1252
  },
1253
  "156": {
1254
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1255
  "lstrip": false,
1256
  "normalized": false,
1257
  "rstrip": false,
@@ -1259,7 +1259,7 @@
1259
  "special": false
1260
  },
1261
  "157": {
1262
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1263
  "lstrip": false,
1264
  "normalized": false,
1265
  "rstrip": false,
@@ -1267,7 +1267,7 @@
1267
  "special": false
1268
  },
1269
  "158": {
1270
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1271
  "lstrip": false,
1272
  "normalized": false,
1273
  "rstrip": false,
@@ -1275,7 +1275,7 @@
1275
  "special": false
1276
  },
1277
  "159": {
1278
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1279
  "lstrip": false,
1280
  "normalized": false,
1281
  "rstrip": false,
@@ -1283,7 +1283,7 @@
1283
  "special": false
1284
  },
1285
  "160": {
1286
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1287
  "lstrip": false,
1288
  "normalized": false,
1289
  "rstrip": false,
@@ -1291,7 +1291,7 @@
1291
  "special": false
1292
  },
1293
  "161": {
1294
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1295
  "lstrip": false,
1296
  "normalized": false,
1297
  "rstrip": false,
@@ -1299,7 +1299,7 @@
1299
  "special": false
1300
  },
1301
  "162": {
1302
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1303
  "lstrip": false,
1304
  "normalized": false,
1305
  "rstrip": false,
@@ -1307,7 +1307,7 @@
1307
  "special": false
1308
  },
1309
  "163": {
1310
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1311
  "lstrip": false,
1312
  "normalized": false,
1313
  "rstrip": false,
@@ -1315,7 +1315,7 @@
1315
  "special": false
1316
  },
1317
  "164": {
1318
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1319
  "lstrip": false,
1320
  "normalized": false,
1321
  "rstrip": false,
@@ -1323,7 +1323,7 @@
1323
  "special": false
1324
  },
1325
  "165": {
1326
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1327
  "lstrip": false,
1328
  "normalized": false,
1329
  "rstrip": false,
@@ -1331,7 +1331,7 @@
1331
  "special": false
1332
  },
1333
  "166": {
1334
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1335
  "lstrip": false,
1336
  "normalized": false,
1337
  "rstrip": false,
@@ -1339,7 +1339,7 @@
1339
  "special": false
1340
  },
1341
  "167": {
1342
- "content": "\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581",
1343
  "lstrip": false,
1344
  "normalized": false,
1345
  "rstrip": false,
@@ -51326,7 +51326,6 @@
51326
  "boi_token": "<start_of_image>",
51327
  "bos_token": "<bos>",
51328
  "clean_up_tokenization_spaces": false,
51329
- "end_string": "<end_of_turn>",
51330
  "eoi_token": "<end_of_image>",
51331
  "eos_token": "<eos>",
51332
  "extra_special_tokens": {
@@ -51337,16 +51336,10 @@
51337
  "image_token": "<image_soft_token>",
51338
  "model_max_length": 1000000000000000019884624838656,
51339
  "pad_token": "<pad>",
 
51340
  "sp_model_kwargs": null,
51341
  "spaces_between_special_tokens": false,
51342
- "start_string": "<start_of_turn>",
51343
- "tokenizer_class": "GemmaSpecialTokenizer",
51344
  "unk_token": "<unk>",
51345
- "use_default_system_prompt": false,
51346
- "auto_map": {
51347
- "AutoTokenizer": [
51348
- "gemma_special_tokenizer.GemmaSpecialTokenizer",
51349
- "gemma_special_tokenizer.GemmaSpecialTokenizer"
51350
- ]
51351
- }
51352
- }
 
1107
  "special": false
1108
  },
1109
  "138": {
1110
+ "content": "▁▁",
1111
  "lstrip": false,
1112
  "normalized": false,
1113
  "rstrip": false,
 
1115
  "special": false
1116
  },
1117
  "139": {
1118
+ "content": "▁▁▁",
1119
  "lstrip": false,
1120
  "normalized": false,
1121
  "rstrip": false,
 
1123
  "special": false
1124
  },
1125
  "140": {
1126
+ "content": "▁▁▁▁",
1127
  "lstrip": false,
1128
  "normalized": false,
1129
  "rstrip": false,
 
1131
  "special": false
1132
  },
1133
  "141": {
1134
+ "content": "▁▁▁▁▁",
1135
  "lstrip": false,
1136
  "normalized": false,
1137
  "rstrip": false,
 
1139
  "special": false
1140
  },
1141
  "142": {
1142
+ "content": "▁▁▁▁▁▁",
1143
  "lstrip": false,
1144
  "normalized": false,
1145
  "rstrip": false,
 
1147
  "special": false
1148
  },
1149
  "143": {
1150
+ "content": "▁▁▁▁▁▁▁",
1151
  "lstrip": false,
1152
  "normalized": false,
1153
  "rstrip": false,
 
1155
  "special": false
1156
  },
1157
  "144": {
1158
+ "content": "▁▁▁▁▁▁▁▁",
1159
  "lstrip": false,
1160
  "normalized": false,
1161
  "rstrip": false,
 
1163
  "special": false
1164
  },
1165
  "145": {
1166
+ "content": "▁▁▁▁▁▁▁▁▁",
1167
  "lstrip": false,
1168
  "normalized": false,
1169
  "rstrip": false,
 
1171
  "special": false
1172
  },
1173
  "146": {
1174
+ "content": "▁▁▁▁▁▁▁▁▁▁",
1175
  "lstrip": false,
1176
  "normalized": false,
1177
  "rstrip": false,
 
1179
  "special": false
1180
  },
1181
  "147": {
1182
+ "content": "▁▁▁▁▁▁▁▁▁▁▁",
1183
  "lstrip": false,
1184
  "normalized": false,
1185
  "rstrip": false,
 
1187
  "special": false
1188
  },
1189
  "148": {
1190
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁",
1191
  "lstrip": false,
1192
  "normalized": false,
1193
  "rstrip": false,
 
1195
  "special": false
1196
  },
1197
  "149": {
1198
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁",
1199
  "lstrip": false,
1200
  "normalized": false,
1201
  "rstrip": false,
 
1203
  "special": false
1204
  },
1205
  "150": {
1206
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1207
  "lstrip": false,
1208
  "normalized": false,
1209
  "rstrip": false,
 
1211
  "special": false
1212
  },
1213
  "151": {
1214
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1215
  "lstrip": false,
1216
  "normalized": false,
1217
  "rstrip": false,
 
1219
  "special": false
1220
  },
1221
  "152": {
1222
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1223
  "lstrip": false,
1224
  "normalized": false,
1225
  "rstrip": false,
 
1227
  "special": false
1228
  },
1229
  "153": {
1230
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1231
  "lstrip": false,
1232
  "normalized": false,
1233
  "rstrip": false,
 
1235
  "special": false
1236
  },
1237
  "154": {
1238
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1239
  "lstrip": false,
1240
  "normalized": false,
1241
  "rstrip": false,
 
1243
  "special": false
1244
  },
1245
  "155": {
1246
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1247
  "lstrip": false,
1248
  "normalized": false,
1249
  "rstrip": false,
 
1251
  "special": false
1252
  },
1253
  "156": {
1254
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1255
  "lstrip": false,
1256
  "normalized": false,
1257
  "rstrip": false,
 
1259
  "special": false
1260
  },
1261
  "157": {
1262
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1263
  "lstrip": false,
1264
  "normalized": false,
1265
  "rstrip": false,
 
1267
  "special": false
1268
  },
1269
  "158": {
1270
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1271
  "lstrip": false,
1272
  "normalized": false,
1273
  "rstrip": false,
 
1275
  "special": false
1276
  },
1277
  "159": {
1278
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1279
  "lstrip": false,
1280
  "normalized": false,
1281
  "rstrip": false,
 
1283
  "special": false
1284
  },
1285
  "160": {
1286
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1287
  "lstrip": false,
1288
  "normalized": false,
1289
  "rstrip": false,
 
1291
  "special": false
1292
  },
1293
  "161": {
1294
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1295
  "lstrip": false,
1296
  "normalized": false,
1297
  "rstrip": false,
 
1299
  "special": false
1300
  },
1301
  "162": {
1302
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1303
  "lstrip": false,
1304
  "normalized": false,
1305
  "rstrip": false,
 
1307
  "special": false
1308
  },
1309
  "163": {
1310
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1311
  "lstrip": false,
1312
  "normalized": false,
1313
  "rstrip": false,
 
1315
  "special": false
1316
  },
1317
  "164": {
1318
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1319
  "lstrip": false,
1320
  "normalized": false,
1321
  "rstrip": false,
 
1323
  "special": false
1324
  },
1325
  "165": {
1326
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1327
  "lstrip": false,
1328
  "normalized": false,
1329
  "rstrip": false,
 
1331
  "special": false
1332
  },
1333
  "166": {
1334
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1335
  "lstrip": false,
1336
  "normalized": false,
1337
  "rstrip": false,
 
1339
  "special": false
1340
  },
1341
  "167": {
1342
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1343
  "lstrip": false,
1344
  "normalized": false,
1345
  "rstrip": false,
 
51326
  "boi_token": "<start_of_image>",
51327
  "bos_token": "<bos>",
51328
  "clean_up_tokenization_spaces": false,
 
51329
  "eoi_token": "<end_of_image>",
51330
  "eos_token": "<eos>",
51331
  "extra_special_tokens": {
 
51336
  "image_token": "<image_soft_token>",
51337
  "model_max_length": 1000000000000000019884624838656,
51338
  "pad_token": "<pad>",
51339
+ "processor_class": "Gemma3Processor",
51340
  "sp_model_kwargs": null,
51341
  "spaces_between_special_tokens": false,
51342
+ "tokenizer_class": "GemmaTokenizerFast",
 
51343
  "unk_token": "<unk>",
51344
+ "use_default_system_prompt": false
51345
+ }
 
 
 
 
 
 
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.1959058733499139,
6
  "eval_steps": 32,
7
  "global_step": 8,
8
  "is_hyper_param_search": false,
@@ -10,80 +10,80 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.02448823416873924,
14
- "grad_norm": 29817.93994076611,
15
  "learning_rate": 3e-06,
16
- "loss": 604.7336,
17
- "mean_token_accuracy": 0.5841934891068377,
18
- "num_tokens": 834003.0,
19
  "step": 1
20
  },
21
  {
22
- "epoch": 0.04897646833747848,
23
- "grad_norm": 13378.067339306495,
24
- "learning_rate": 2.925e-06,
25
- "loss": 501.2048,
26
- "mean_token_accuracy": 0.5982158905826509,
27
- "num_tokens": 1643975.0,
28
  "step": 2
29
  },
30
  {
31
- "epoch": 0.07346470250621771,
32
- "grad_norm": 4566.448298220422,
33
- "learning_rate": 2.85e-06,
34
- "loss": 375.0333,
35
- "mean_token_accuracy": 0.6305847020121291,
36
- "num_tokens": 2439700.0,
37
  "step": 3
38
  },
39
  {
40
- "epoch": 0.09795293667495696,
41
- "grad_norm": 6019.372015429163,
42
- "learning_rate": 2.775e-06,
43
- "loss": 369.5743,
44
- "mean_token_accuracy": 0.6201298969099298,
45
- "num_tokens": 3224139.0,
46
  "step": 4
47
  },
48
  {
49
- "epoch": 0.1224411708436962,
50
- "grad_norm": 5316.727220921461,
51
- "learning_rate": 2.7e-06,
52
- "loss": 353.3683,
53
- "mean_token_accuracy": 0.6273049607407302,
54
- "num_tokens": 4025450.0,
55
  "step": 5
56
  },
57
  {
58
- "epoch": 0.14692940501243543,
59
- "grad_norm": 2361.5776737530314,
60
- "learning_rate": 2.6250000000000003e-06,
61
- "loss": 315.4971,
62
- "mean_token_accuracy": 0.6325392066501081,
63
- "num_tokens": 4818537.0,
64
  "step": 6
65
  },
66
  {
67
- "epoch": 0.17141763918117467,
68
- "grad_norm": 1222.350631373476,
69
- "learning_rate": 2.55e-06,
70
- "loss": 290.5888,
71
- "mean_token_accuracy": 0.6436414200579748,
72
- "num_tokens": 5621502.0,
73
  "step": 7
74
  },
75
  {
76
- "epoch": 0.1959058733499139,
77
- "grad_norm": 1481.1270011064446,
78
- "learning_rate": 2.475e-06,
79
- "loss": 304.7263,
80
- "mean_token_accuracy": 0.6306580123491585,
81
- "num_tokens": 6419957.0,
82
  "step": 8
83
  }
84
  ],
85
  "logging_steps": 1,
86
- "max_steps": 40,
87
  "num_input_tokens_seen": 0,
88
  "num_train_epochs": 1,
89
  "save_steps": 8,
@@ -99,7 +99,7 @@
99
  "attributes": {}
100
  }
101
  },
102
- "total_flos": 42919320092672.0,
103
  "train_batch_size": 1,
104
  "trial_name": null,
105
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5877457310948486,
6
  "eval_steps": 32,
7
  "global_step": 8,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.07346821638685608,
14
+ "grad_norm": 52224.03643740955,
15
  "learning_rate": 3e-06,
16
+ "loss": 1232.5903,
17
+ "mean_token_accuracy": 0.5698598268500064,
18
+ "num_tokens": 2439700.0,
19
  "step": 1
20
  },
21
  {
22
+ "epoch": 0.14693643277371216,
23
+ "grad_norm": 24564.28249818258,
24
+ "learning_rate": 2.7692307692307693e-06,
25
+ "loss": 1012.342,
26
+ "mean_token_accuracy": 0.5935460063046776,
27
+ "num_tokens": 4818537.0,
28
  "step": 2
29
  },
30
  {
31
+ "epoch": 0.22040464916056823,
32
+ "grad_norm": 10030.685256076777,
33
+ "learning_rate": 2.5384615384615385e-06,
34
+ "loss": 771.7506,
35
+ "mean_token_accuracy": 0.6206552012590691,
36
+ "num_tokens": 7222999.0,
37
  "step": 3
38
  },
39
  {
40
+ "epoch": 0.2938728655474243,
41
+ "grad_norm": 9797.578497343955,
42
+ "learning_rate": 2.307692307692308e-06,
43
+ "loss": 673.8949,
44
+ "mean_token_accuracy": 0.6229124862584285,
45
+ "num_tokens": 9603627.0,
46
  "step": 4
47
  },
48
  {
49
+ "epoch": 0.3673410819342804,
50
+ "grad_norm": 6906.536059477202,
51
+ "learning_rate": 2.076923076923077e-06,
52
+ "loss": 647.7118,
53
+ "mean_token_accuracy": 0.6252844646223821,
54
+ "num_tokens": 12031078.0,
55
  "step": 5
56
  },
57
  {
58
+ "epoch": 0.44080929832113647,
59
+ "grad_norm": 4121.501080449312,
60
+ "learning_rate": 1.8461538461538462e-06,
61
+ "loss": 616.8495,
62
+ "mean_token_accuracy": 0.6341593922115862,
63
+ "num_tokens": 14403431.0,
64
  "step": 6
65
  },
66
  {
67
+ "epoch": 0.5142775147079925,
68
+ "grad_norm": 2026.657249791317,
69
+ "learning_rate": 1.6153846153846154e-06,
70
+ "loss": 601.5251,
71
+ "mean_token_accuracy": 0.6352307246997952,
72
+ "num_tokens": 16811979.0,
73
  "step": 7
74
  },
75
  {
76
+ "epoch": 0.5877457310948486,
77
+ "grad_norm": 1892.3110141436696,
78
+ "learning_rate": 1.3846153846153846e-06,
79
+ "loss": 577.3281,
80
+ "mean_token_accuracy": 0.6342741788248532,
81
+ "num_tokens": 19217660.0,
82
  "step": 8
83
  }
84
  ],
85
  "logging_steps": 1,
86
+ "max_steps": 13,
87
  "num_input_tokens_seen": 0,
88
  "num_train_epochs": 1,
89
  "save_steps": 8,
 
99
  "attributes": {}
100
  }
101
  },
102
+ "total_flos": 128626689310720.0,
103
  "train_batch_size": 1,
104
  "trial_name": null,
105
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:120fb2dbabf3988fbddcfbd028d63dc7696d4bdf2d232abaed0310ab85177cb1
3
  size 7377
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e21928f1882517a4b05718e3914cb90eb7110650d66245962ef389df1995890
3
  size 7377