Training in progress, step 3500, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step3500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step3500/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/trainer_state.json +206 -6
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12017472
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:27d5b8ad136d1b37d0b53dfe5e54ffe63b01050eddd9539c59d73e1c91aa67b3
|
| 3 |
size 12017472
|
last-checkpoint/global_step3500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cda2758edb2fb2c8a863a8995389fecad1ddd0807037e9db5f15db85ead9758d
|
| 3 |
+
size 71982309
|
last-checkpoint/global_step3500/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2cbbf95b02a2067a2a4f20c353239d123a04495655920bfb42eb24afdd147c85
|
| 3 |
+
size 146356645
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step3500
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:327d808c225b6c78ea6a068082f3d00dba54671d88051aa14e820dd408eeac44
|
| 3 |
size 14709
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 0.
|
| 4 |
-
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 250,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1217,6 +1217,206 @@
|
|
| 1217 |
"eval_samples_per_second": 43.351,
|
| 1218 |
"eval_steps_per_second": 5.425,
|
| 1219 |
"step": 3000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1220 |
}
|
| 1221 |
],
|
| 1222 |
"logging_steps": 25,
|
|
@@ -1236,7 +1436,7 @@
|
|
| 1236 |
"attributes": {}
|
| 1237 |
}
|
| 1238 |
},
|
| 1239 |
-
"total_flos": 1.
|
| 1240 |
"train_batch_size": 4,
|
| 1241 |
"trial_name": null,
|
| 1242 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 3500,
|
| 3 |
+
"best_metric": 0.614472508430481,
|
| 4 |
+
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-3500",
|
| 5 |
+
"epoch": 2.543901108889293,
|
| 6 |
"eval_steps": 250,
|
| 7 |
+
"global_step": 3500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1217 |
"eval_samples_per_second": 43.351,
|
| 1218 |
"eval_steps_per_second": 5.425,
|
| 1219 |
"step": 3000
|
| 1220 |
+
},
|
| 1221 |
+
{
|
| 1222 |
+
"epoch": 2.1985093619341938,
|
| 1223 |
+
"grad_norm": 0.8226723074913025,
|
| 1224 |
+
"learning_rate": 7.389222583480705e-05,
|
| 1225 |
+
"loss": 0.6243,
|
| 1226 |
+
"mean_token_accuracy": 0.8068913269042969,
|
| 1227 |
+
"num_tokens": 66616416.0,
|
| 1228 |
+
"step": 3025
|
| 1229 |
+
},
|
| 1230 |
+
{
|
| 1231 |
+
"epoch": 2.2166878749318304,
|
| 1232 |
+
"grad_norm": 0.8199797868728638,
|
| 1233 |
+
"learning_rate": 7.379017366112587e-05,
|
| 1234 |
+
"loss": 0.628,
|
| 1235 |
+
"mean_token_accuracy": 0.8060924589633942,
|
| 1236 |
+
"num_tokens": 67170306.0,
|
| 1237 |
+
"step": 3050
|
| 1238 |
+
},
|
| 1239 |
+
{
|
| 1240 |
+
"epoch": 2.2348663879294675,
|
| 1241 |
+
"grad_norm": 0.8197723627090454,
|
| 1242 |
+
"learning_rate": 7.368734765605741e-05,
|
| 1243 |
+
"loss": 0.6125,
|
| 1244 |
+
"mean_token_accuracy": 0.8108021330833435,
|
| 1245 |
+
"num_tokens": 67707041.0,
|
| 1246 |
+
"step": 3075
|
| 1247 |
+
},
|
| 1248 |
+
{
|
| 1249 |
+
"epoch": 2.253044900927104,
|
| 1250 |
+
"grad_norm": 0.8319080471992493,
|
| 1251 |
+
"learning_rate": 7.358375017442797e-05,
|
| 1252 |
+
"loss": 0.6204,
|
| 1253 |
+
"mean_token_accuracy": 0.8081632897257804,
|
| 1254 |
+
"num_tokens": 68243518.0,
|
| 1255 |
+
"step": 3100
|
| 1256 |
+
},
|
| 1257 |
+
{
|
| 1258 |
+
"epoch": 2.271223413924741,
|
| 1259 |
+
"grad_norm": 0.8893775343894958,
|
| 1260 |
+
"learning_rate": 7.347938358873149e-05,
|
| 1261 |
+
"loss": 0.6138,
|
| 1262 |
+
"mean_token_accuracy": 0.8099391725659371,
|
| 1263 |
+
"num_tokens": 68787369.0,
|
| 1264 |
+
"step": 3125
|
| 1265 |
+
},
|
| 1266 |
+
{
|
| 1267 |
+
"epoch": 2.2894019269223778,
|
| 1268 |
+
"grad_norm": 0.8154735565185547,
|
| 1269 |
+
"learning_rate": 7.337425028907528e-05,
|
| 1270 |
+
"loss": 0.6178,
|
| 1271 |
+
"mean_token_accuracy": 0.8098280015587807,
|
| 1272 |
+
"num_tokens": 69334791.0,
|
| 1273 |
+
"step": 3150
|
| 1274 |
+
},
|
| 1275 |
+
{
|
| 1276 |
+
"epoch": 2.3075804399200144,
|
| 1277 |
+
"grad_norm": 0.8006751537322998,
|
| 1278 |
+
"learning_rate": 7.326835268312518e-05,
|
| 1279 |
+
"loss": 0.6158,
|
| 1280 |
+
"mean_token_accuracy": 0.8086746591329574,
|
| 1281 |
+
"num_tokens": 69884826.0,
|
| 1282 |
+
"step": 3175
|
| 1283 |
+
},
|
| 1284 |
+
{
|
| 1285 |
+
"epoch": 2.3257589529176514,
|
| 1286 |
+
"grad_norm": 0.8786169290542603,
|
| 1287 |
+
"learning_rate": 7.316169319605046e-05,
|
| 1288 |
+
"loss": 0.6269,
|
| 1289 |
+
"mean_token_accuracy": 0.8071727818250656,
|
| 1290 |
+
"num_tokens": 70442756.0,
|
| 1291 |
+
"step": 3200
|
| 1292 |
+
},
|
| 1293 |
+
{
|
| 1294 |
+
"epoch": 2.343937465915288,
|
| 1295 |
+
"grad_norm": 0.9075261950492859,
|
| 1296 |
+
"learning_rate": 7.30542742704683e-05,
|
| 1297 |
+
"loss": 0.6201,
|
| 1298 |
+
"mean_token_accuracy": 0.8087817251682281,
|
| 1299 |
+
"num_tokens": 70993300.0,
|
| 1300 |
+
"step": 3225
|
| 1301 |
+
},
|
| 1302 |
+
{
|
| 1303 |
+
"epoch": 2.362115978912925,
|
| 1304 |
+
"grad_norm": 0.8171051740646362,
|
| 1305 |
+
"learning_rate": 7.294609836638787e-05,
|
| 1306 |
+
"loss": 0.6188,
|
| 1307 |
+
"mean_token_accuracy": 0.8082248848676682,
|
| 1308 |
+
"num_tokens": 71543391.0,
|
| 1309 |
+
"step": 3250
|
| 1310 |
+
},
|
| 1311 |
+
{
|
| 1312 |
+
"epoch": 2.362115978912925,
|
| 1313 |
+
"eval_loss": 0.621147096157074,
|
| 1314 |
+
"eval_mean_token_accuracy": 0.8068399137141657,
|
| 1315 |
+
"eval_num_tokens": 71543391.0,
|
| 1316 |
+
"eval_runtime": 112.0822,
|
| 1317 |
+
"eval_samples_per_second": 43.629,
|
| 1318 |
+
"eval_steps_per_second": 5.46,
|
| 1319 |
+
"step": 3250
|
| 1320 |
+
},
|
| 1321 |
+
{
|
| 1322 |
+
"epoch": 2.3802944919105617,
|
| 1323 |
+
"grad_norm": 0.8513513207435608,
|
| 1324 |
+
"learning_rate": 7.283716796115393e-05,
|
| 1325 |
+
"loss": 0.6187,
|
| 1326 |
+
"mean_token_accuracy": 0.8077478906512261,
|
| 1327 |
+
"num_tokens": 72109371.0,
|
| 1328 |
+
"step": 3275
|
| 1329 |
+
},
|
| 1330 |
+
{
|
| 1331 |
+
"epoch": 2.3984730049081984,
|
| 1332 |
+
"grad_norm": 0.901434063911438,
|
| 1333 |
+
"learning_rate": 7.272748554939012e-05,
|
| 1334 |
+
"loss": 0.6135,
|
| 1335 |
+
"mean_token_accuracy": 0.8108441984653473,
|
| 1336 |
+
"num_tokens": 72661191.0,
|
| 1337 |
+
"step": 3300
|
| 1338 |
+
},
|
| 1339 |
+
{
|
| 1340 |
+
"epoch": 2.4166515179058354,
|
| 1341 |
+
"grad_norm": 0.8363370895385742,
|
| 1342 |
+
"learning_rate": 7.261705364294188e-05,
|
| 1343 |
+
"loss": 0.6124,
|
| 1344 |
+
"mean_token_accuracy": 0.8096053293347358,
|
| 1345 |
+
"num_tokens": 73213412.0,
|
| 1346 |
+
"step": 3325
|
| 1347 |
+
},
|
| 1348 |
+
{
|
| 1349 |
+
"epoch": 2.434830030903472,
|
| 1350 |
+
"grad_norm": 0.875728189945221,
|
| 1351 |
+
"learning_rate": 7.250587477081885e-05,
|
| 1352 |
+
"loss": 0.6215,
|
| 1353 |
+
"mean_token_accuracy": 0.8084959277510643,
|
| 1354 |
+
"num_tokens": 73764375.0,
|
| 1355 |
+
"step": 3350
|
| 1356 |
+
},
|
| 1357 |
+
{
|
| 1358 |
+
"epoch": 2.4530085439011087,
|
| 1359 |
+
"grad_norm": 0.7723637819290161,
|
| 1360 |
+
"learning_rate": 7.2393951479137e-05,
|
| 1361 |
+
"loss": 0.6066,
|
| 1362 |
+
"mean_token_accuracy": 0.8145261201262474,
|
| 1363 |
+
"num_tokens": 74309911.0,
|
| 1364 |
+
"step": 3375
|
| 1365 |
+
},
|
| 1366 |
+
{
|
| 1367 |
+
"epoch": 2.4711870568987457,
|
| 1368 |
+
"grad_norm": 0.8123798370361328,
|
| 1369 |
+
"learning_rate": 7.228128633106032e-05,
|
| 1370 |
+
"loss": 0.6111,
|
| 1371 |
+
"mean_token_accuracy": 0.8112738102674484,
|
| 1372 |
+
"num_tokens": 74856337.0,
|
| 1373 |
+
"step": 3400
|
| 1374 |
+
},
|
| 1375 |
+
{
|
| 1376 |
+
"epoch": 2.4893655698963824,
|
| 1377 |
+
"grad_norm": 0.8313596844673157,
|
| 1378 |
+
"learning_rate": 7.21678819067421e-05,
|
| 1379 |
+
"loss": 0.6258,
|
| 1380 |
+
"mean_token_accuracy": 0.8076113468408584,
|
| 1381 |
+
"num_tokens": 75410769.0,
|
| 1382 |
+
"step": 3425
|
| 1383 |
+
},
|
| 1384 |
+
{
|
| 1385 |
+
"epoch": 2.5075440828940194,
|
| 1386 |
+
"grad_norm": 0.8260684013366699,
|
| 1387 |
+
"learning_rate": 7.205374080326585e-05,
|
| 1388 |
+
"loss": 0.6147,
|
| 1389 |
+
"mean_token_accuracy": 0.8095012375712395,
|
| 1390 |
+
"num_tokens": 75963770.0,
|
| 1391 |
+
"step": 3450
|
| 1392 |
+
},
|
| 1393 |
+
{
|
| 1394 |
+
"epoch": 2.525722595891656,
|
| 1395 |
+
"grad_norm": 0.7737406492233276,
|
| 1396 |
+
"learning_rate": 7.193886563458585e-05,
|
| 1397 |
+
"loss": 0.6191,
|
| 1398 |
+
"mean_token_accuracy": 0.8077240213751793,
|
| 1399 |
+
"num_tokens": 76528809.0,
|
| 1400 |
+
"step": 3475
|
| 1401 |
+
},
|
| 1402 |
+
{
|
| 1403 |
+
"epoch": 2.543901108889293,
|
| 1404 |
+
"grad_norm": 0.7885979413986206,
|
| 1405 |
+
"learning_rate": 7.182325903146721e-05,
|
| 1406 |
+
"loss": 0.6168,
|
| 1407 |
+
"mean_token_accuracy": 0.8091749155521393,
|
| 1408 |
+
"num_tokens": 77090179.0,
|
| 1409 |
+
"step": 3500
|
| 1410 |
+
},
|
| 1411 |
+
{
|
| 1412 |
+
"epoch": 2.543901108889293,
|
| 1413 |
+
"eval_loss": 0.614472508430481,
|
| 1414 |
+
"eval_mean_token_accuracy": 0.8088948371168835,
|
| 1415 |
+
"eval_num_tokens": 77090179.0,
|
| 1416 |
+
"eval_runtime": 112.1372,
|
| 1417 |
+
"eval_samples_per_second": 43.607,
|
| 1418 |
+
"eval_steps_per_second": 5.458,
|
| 1419 |
+
"step": 3500
|
| 1420 |
}
|
| 1421 |
],
|
| 1422 |
"logging_steps": 25,
|
|
|
|
| 1436 |
"attributes": {}
|
| 1437 |
}
|
| 1438 |
},
|
| 1439 |
+
"total_flos": 1.9432793569440563e+17,
|
| 1440 |
"train_batch_size": 4,
|
| 1441 |
"trial_name": null,
|
| 1442 |
"trial_params": null
|