aghatage commited on
Commit
8eea47b
·
verified ·
1 Parent(s): 343f2f4

Training in progress, step 3500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e8f2e633fe08d7eb7ff6e133c7cb6469ced1b595814d785aa46e33c6e65f452
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27d5b8ad136d1b37d0b53dfe5e54ffe63b01050eddd9539c59d73e1c91aa67b3
3
  size 12017472
last-checkpoint/global_step3500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cda2758edb2fb2c8a863a8995389fecad1ddd0807037e9db5f15db85ead9758d
3
+ size 71982309
last-checkpoint/global_step3500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cbbf95b02a2067a2a4f20c353239d123a04495655920bfb42eb24afdd147c85
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step3000
 
1
+ global_step3500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7573584526c2fe8e68dba3ab40f8f3cffab852e01de9ee2eb0aaf2a4192e0852
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:327d808c225b6c78ea6a068082f3d00dba54671d88051aa14e820dd408eeac44
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 3000,
3
- "best_metric": 0.6261406540870667,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-3000",
5
- "epoch": 2.180330848936557,
6
  "eval_steps": 250,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1217,6 +1217,206 @@
1217
  "eval_samples_per_second": 43.351,
1218
  "eval_steps_per_second": 5.425,
1219
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1220
  }
1221
  ],
1222
  "logging_steps": 25,
@@ -1236,7 +1436,7 @@
1236
  "attributes": {}
1237
  }
1238
  },
1239
- "total_flos": 1.6655712651942298e+17,
1240
  "train_batch_size": 4,
1241
  "trial_name": null,
1242
  "trial_params": null
 
1
  {
2
+ "best_global_step": 3500,
3
+ "best_metric": 0.614472508430481,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-3500",
5
+ "epoch": 2.543901108889293,
6
  "eval_steps": 250,
7
+ "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1217
  "eval_samples_per_second": 43.351,
1218
  "eval_steps_per_second": 5.425,
1219
  "step": 3000
1220
+ },
1221
+ {
1222
+ "epoch": 2.1985093619341938,
1223
+ "grad_norm": 0.8226723074913025,
1224
+ "learning_rate": 7.389222583480705e-05,
1225
+ "loss": 0.6243,
1226
+ "mean_token_accuracy": 0.8068913269042969,
1227
+ "num_tokens": 66616416.0,
1228
+ "step": 3025
1229
+ },
1230
+ {
1231
+ "epoch": 2.2166878749318304,
1232
+ "grad_norm": 0.8199797868728638,
1233
+ "learning_rate": 7.379017366112587e-05,
1234
+ "loss": 0.628,
1235
+ "mean_token_accuracy": 0.8060924589633942,
1236
+ "num_tokens": 67170306.0,
1237
+ "step": 3050
1238
+ },
1239
+ {
1240
+ "epoch": 2.2348663879294675,
1241
+ "grad_norm": 0.8197723627090454,
1242
+ "learning_rate": 7.368734765605741e-05,
1243
+ "loss": 0.6125,
1244
+ "mean_token_accuracy": 0.8108021330833435,
1245
+ "num_tokens": 67707041.0,
1246
+ "step": 3075
1247
+ },
1248
+ {
1249
+ "epoch": 2.253044900927104,
1250
+ "grad_norm": 0.8319080471992493,
1251
+ "learning_rate": 7.358375017442797e-05,
1252
+ "loss": 0.6204,
1253
+ "mean_token_accuracy": 0.8081632897257804,
1254
+ "num_tokens": 68243518.0,
1255
+ "step": 3100
1256
+ },
1257
+ {
1258
+ "epoch": 2.271223413924741,
1259
+ "grad_norm": 0.8893775343894958,
1260
+ "learning_rate": 7.347938358873149e-05,
1261
+ "loss": 0.6138,
1262
+ "mean_token_accuracy": 0.8099391725659371,
1263
+ "num_tokens": 68787369.0,
1264
+ "step": 3125
1265
+ },
1266
+ {
1267
+ "epoch": 2.2894019269223778,
1268
+ "grad_norm": 0.8154735565185547,
1269
+ "learning_rate": 7.337425028907528e-05,
1270
+ "loss": 0.6178,
1271
+ "mean_token_accuracy": 0.8098280015587807,
1272
+ "num_tokens": 69334791.0,
1273
+ "step": 3150
1274
+ },
1275
+ {
1276
+ "epoch": 2.3075804399200144,
1277
+ "grad_norm": 0.8006751537322998,
1278
+ "learning_rate": 7.326835268312518e-05,
1279
+ "loss": 0.6158,
1280
+ "mean_token_accuracy": 0.8086746591329574,
1281
+ "num_tokens": 69884826.0,
1282
+ "step": 3175
1283
+ },
1284
+ {
1285
+ "epoch": 2.3257589529176514,
1286
+ "grad_norm": 0.8786169290542603,
1287
+ "learning_rate": 7.316169319605046e-05,
1288
+ "loss": 0.6269,
1289
+ "mean_token_accuracy": 0.8071727818250656,
1290
+ "num_tokens": 70442756.0,
1291
+ "step": 3200
1292
+ },
1293
+ {
1294
+ "epoch": 2.343937465915288,
1295
+ "grad_norm": 0.9075261950492859,
1296
+ "learning_rate": 7.30542742704683e-05,
1297
+ "loss": 0.6201,
1298
+ "mean_token_accuracy": 0.8087817251682281,
1299
+ "num_tokens": 70993300.0,
1300
+ "step": 3225
1301
+ },
1302
+ {
1303
+ "epoch": 2.362115978912925,
1304
+ "grad_norm": 0.8171051740646362,
1305
+ "learning_rate": 7.294609836638787e-05,
1306
+ "loss": 0.6188,
1307
+ "mean_token_accuracy": 0.8082248848676682,
1308
+ "num_tokens": 71543391.0,
1309
+ "step": 3250
1310
+ },
1311
+ {
1312
+ "epoch": 2.362115978912925,
1313
+ "eval_loss": 0.621147096157074,
1314
+ "eval_mean_token_accuracy": 0.8068399137141657,
1315
+ "eval_num_tokens": 71543391.0,
1316
+ "eval_runtime": 112.0822,
1317
+ "eval_samples_per_second": 43.629,
1318
+ "eval_steps_per_second": 5.46,
1319
+ "step": 3250
1320
+ },
1321
+ {
1322
+ "epoch": 2.3802944919105617,
1323
+ "grad_norm": 0.8513513207435608,
1324
+ "learning_rate": 7.283716796115393e-05,
1325
+ "loss": 0.6187,
1326
+ "mean_token_accuracy": 0.8077478906512261,
1327
+ "num_tokens": 72109371.0,
1328
+ "step": 3275
1329
+ },
1330
+ {
1331
+ "epoch": 2.3984730049081984,
1332
+ "grad_norm": 0.901434063911438,
1333
+ "learning_rate": 7.272748554939012e-05,
1334
+ "loss": 0.6135,
1335
+ "mean_token_accuracy": 0.8108441984653473,
1336
+ "num_tokens": 72661191.0,
1337
+ "step": 3300
1338
+ },
1339
+ {
1340
+ "epoch": 2.4166515179058354,
1341
+ "grad_norm": 0.8363370895385742,
1342
+ "learning_rate": 7.261705364294188e-05,
1343
+ "loss": 0.6124,
1344
+ "mean_token_accuracy": 0.8096053293347358,
1345
+ "num_tokens": 73213412.0,
1346
+ "step": 3325
1347
+ },
1348
+ {
1349
+ "epoch": 2.434830030903472,
1350
+ "grad_norm": 0.875728189945221,
1351
+ "learning_rate": 7.250587477081885e-05,
1352
+ "loss": 0.6215,
1353
+ "mean_token_accuracy": 0.8084959277510643,
1354
+ "num_tokens": 73764375.0,
1355
+ "step": 3350
1356
+ },
1357
+ {
1358
+ "epoch": 2.4530085439011087,
1359
+ "grad_norm": 0.7723637819290161,
1360
+ "learning_rate": 7.2393951479137e-05,
1361
+ "loss": 0.6066,
1362
+ "mean_token_accuracy": 0.8145261201262474,
1363
+ "num_tokens": 74309911.0,
1364
+ "step": 3375
1365
+ },
1366
+ {
1367
+ "epoch": 2.4711870568987457,
1368
+ "grad_norm": 0.8123798370361328,
1369
+ "learning_rate": 7.228128633106032e-05,
1370
+ "loss": 0.6111,
1371
+ "mean_token_accuracy": 0.8112738102674484,
1372
+ "num_tokens": 74856337.0,
1373
+ "step": 3400
1374
+ },
1375
+ {
1376
+ "epoch": 2.4893655698963824,
1377
+ "grad_norm": 0.8313596844673157,
1378
+ "learning_rate": 7.21678819067421e-05,
1379
+ "loss": 0.6258,
1380
+ "mean_token_accuracy": 0.8076113468408584,
1381
+ "num_tokens": 75410769.0,
1382
+ "step": 3425
1383
+ },
1384
+ {
1385
+ "epoch": 2.5075440828940194,
1386
+ "grad_norm": 0.8260684013366699,
1387
+ "learning_rate": 7.205374080326585e-05,
1388
+ "loss": 0.6147,
1389
+ "mean_token_accuracy": 0.8095012375712395,
1390
+ "num_tokens": 75963770.0,
1391
+ "step": 3450
1392
+ },
1393
+ {
1394
+ "epoch": 2.525722595891656,
1395
+ "grad_norm": 0.7737406492233276,
1396
+ "learning_rate": 7.193886563458585e-05,
1397
+ "loss": 0.6191,
1398
+ "mean_token_accuracy": 0.8077240213751793,
1399
+ "num_tokens": 76528809.0,
1400
+ "step": 3475
1401
+ },
1402
+ {
1403
+ "epoch": 2.543901108889293,
1404
+ "grad_norm": 0.7885979413986206,
1405
+ "learning_rate": 7.182325903146721e-05,
1406
+ "loss": 0.6168,
1407
+ "mean_token_accuracy": 0.8091749155521393,
1408
+ "num_tokens": 77090179.0,
1409
+ "step": 3500
1410
+ },
1411
+ {
1412
+ "epoch": 2.543901108889293,
1413
+ "eval_loss": 0.614472508430481,
1414
+ "eval_mean_token_accuracy": 0.8088948371168835,
1415
+ "eval_num_tokens": 77090179.0,
1416
+ "eval_runtime": 112.1372,
1417
+ "eval_samples_per_second": 43.607,
1418
+ "eval_steps_per_second": 5.458,
1419
+ "step": 3500
1420
  }
1421
  ],
1422
  "logging_steps": 25,
 
1436
  "attributes": {}
1437
  }
1438
  },
1439
+ "total_flos": 1.9432793569440563e+17,
1440
  "train_batch_size": 4,
1441
  "trial_name": null,
1442
  "trial_params": null