aghatage commited on
Commit
79a7dc7
·
verified ·
1 Parent(s): 1ae476f

Training in progress, step 16500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:748ac312d2e44467fcda4fec5d9101ba80583460b392693c16cb03d972d0d373
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84490a0ba5e68272e718e1b587f957c02192d8adb3af51f4ee9f3e16c57d8791
3
  size 12017472
last-checkpoint/global_step16500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecddbe1aff6e55389d660e2ef4b3e28b3ddd7f5e51d8ff2861b1f4dfb4439e89
3
+ size 71982309
last-checkpoint/global_step16500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a3e559e7998fe0b3e9109e5a1fcca3316d4f86d200dd5db4d8ea7b7da1c59c0
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step16000
 
1
+ global_step16500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e7f4f223e9a53b3be578c7196145efd3d2905fa7329c5f3661f540d7f3b0a13
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25f34d42adfb703043e77cccecb301eb61eb2fb980871ddcbacc2771de139a9c
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 16000,
3
  "best_metric": 0.5378558039665222,
4
  "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-16000",
5
- "epoch": 11.628249409198327,
6
  "eval_steps": 250,
7
- "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6417,6 +6417,206 @@
6417
  "eval_samples_per_second": 43.873,
6418
  "eval_steps_per_second": 5.491,
6419
  "step": 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6420
  }
6421
  ],
6422
  "logging_steps": 25,
@@ -6436,7 +6636,7 @@
6436
  "attributes": {}
6437
  }
6438
  },
6439
- "total_flos": 8.886881404730737e+17,
6440
  "train_batch_size": 4,
6441
  "trial_name": null,
6442
  "trial_params": null
 
2
  "best_global_step": 16000,
3
  "best_metric": 0.5378558039665222,
4
  "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-16000",
5
+ "epoch": 11.991819669151063,
6
  "eval_steps": 250,
7
+ "global_step": 16500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6417
  "eval_samples_per_second": 43.873,
6418
  "eval_steps_per_second": 5.491,
6419
  "step": 16000
6420
+ },
6421
+ {
6422
+ "epoch": 11.646427922195965,
6423
+ "grad_norm": 0.8521804213523865,
6424
+ "learning_rate": 1.7368012049631344e-07,
6425
+ "loss": 0.5054,
6426
+ "mean_token_accuracy": 0.8433844763040542,
6427
+ "num_tokens": 352828019.0,
6428
+ "step": 16025
6429
+ },
6430
+ {
6431
+ "epoch": 11.664606435193601,
6432
+ "grad_norm": 0.8544715642929077,
6433
+ "learning_rate": 1.5631750216815733e-07,
6434
+ "loss": 0.5127,
6435
+ "mean_token_accuracy": 0.8406408429145813,
6436
+ "num_tokens": 353383848.0,
6437
+ "step": 16050
6438
+ },
6439
+ {
6440
+ "epoch": 11.682784948191237,
6441
+ "grad_norm": 0.8408353924751282,
6442
+ "learning_rate": 1.398673471025891e-07,
6443
+ "loss": 0.5119,
6444
+ "mean_token_accuracy": 0.8391477057337761,
6445
+ "num_tokens": 353947699.0,
6446
+ "step": 16075
6447
+ },
6448
+ {
6449
+ "epoch": 11.700963461188875,
6450
+ "grad_norm": 0.9233898520469666,
6451
+ "learning_rate": 1.2433003202588113e-07,
6452
+ "loss": 0.5064,
6453
+ "mean_token_accuracy": 0.8429214411973953,
6454
+ "num_tokens": 354490412.0,
6455
+ "step": 16100
6456
+ },
6457
+ {
6458
+ "epoch": 11.719141974186511,
6459
+ "grad_norm": 0.8499307036399841,
6460
+ "learning_rate": 1.0970591275929476e-07,
6461
+ "loss": 0.4985,
6462
+ "mean_token_accuracy": 0.8445121836662293,
6463
+ "num_tokens": 355031092.0,
6464
+ "step": 16125
6465
+ },
6466
+ {
6467
+ "epoch": 11.737320487184148,
6468
+ "grad_norm": 0.814639687538147,
6469
+ "learning_rate": 9.599532421092239e-08,
6470
+ "loss": 0.4986,
6471
+ "mean_token_accuracy": 0.8441943645477294,
6472
+ "num_tokens": 355575121.0,
6473
+ "step": 16150
6474
+ },
6475
+ {
6476
+ "epoch": 11.755499000181786,
6477
+ "grad_norm": 0.9159456491470337,
6478
+ "learning_rate": 8.31985803680091e-08,
6479
+ "loss": 0.5077,
6480
+ "mean_token_accuracy": 0.8415706121921539,
6481
+ "num_tokens": 356143674.0,
6482
+ "step": 16175
6483
+ },
6484
+ {
6485
+ "epoch": 11.773677513179422,
6486
+ "grad_norm": 0.9033915996551514,
6487
+ "learning_rate": 7.13159742897851e-08,
6488
+ "loss": 0.5093,
6489
+ "mean_token_accuracy": 0.8420270484685898,
6490
+ "num_tokens": 356687741.0,
6491
+ "step": 16200
6492
+ },
6493
+ {
6494
+ "epoch": 11.791856026177058,
6495
+ "grad_norm": 0.9012133479118347,
6496
+ "learning_rate": 6.034777810072445e-08,
6497
+ "loss": 0.5157,
6498
+ "mean_token_accuracy": 0.8388536632061004,
6499
+ "num_tokens": 357249933.0,
6500
+ "step": 16225
6501
+ },
6502
+ {
6503
+ "epoch": 11.810034539174696,
6504
+ "grad_norm": 0.8706744909286499,
6505
+ "learning_rate": 5.0294242984345595e-08,
6506
+ "loss": 0.5077,
6507
+ "mean_token_accuracy": 0.8408184266090393,
6508
+ "num_tokens": 357820486.0,
6509
+ "step": 16250
6510
+ },
6511
+ {
6512
+ "epoch": 11.810034539174696,
6513
+ "eval_loss": 0.5378739237785339,
6514
+ "eval_mean_token_accuracy": 0.8327080023054984,
6515
+ "eval_num_tokens": 357820486.0,
6516
+ "eval_runtime": 111.4072,
6517
+ "eval_samples_per_second": 43.893,
6518
+ "eval_steps_per_second": 5.493,
6519
+ "step": 16250
6520
+ },
6521
+ {
6522
+ "epoch": 11.828213052172332,
6523
+ "grad_norm": 0.8344452381134033,
6524
+ "learning_rate": 4.1155599177433725e-08,
6525
+ "loss": 0.5054,
6526
+ "mean_token_accuracy": 0.842702434360981,
6527
+ "num_tokens": 358376267.0,
6528
+ "step": 16275
6529
+ },
6530
+ {
6531
+ "epoch": 11.846391565169968,
6532
+ "grad_norm": 0.9577372074127197,
6533
+ "learning_rate": 3.293205596477833e-08,
6534
+ "loss": 0.5119,
6535
+ "mean_token_accuracy": 0.8426962018013,
6536
+ "num_tokens": 358925005.0,
6537
+ "step": 16300
6538
+ },
6539
+ {
6540
+ "epoch": 11.864570078167606,
6541
+ "grad_norm": 0.8274015188217163,
6542
+ "learning_rate": 2.5623801674381498e-08,
6543
+ "loss": 0.5108,
6544
+ "mean_token_accuracy": 0.8408406323194504,
6545
+ "num_tokens": 359493299.0,
6546
+ "step": 16325
6547
+ },
6548
+ {
6549
+ "epoch": 11.882748591165242,
6550
+ "grad_norm": 0.8797160387039185,
6551
+ "learning_rate": 1.9231003673145788e-08,
6552
+ "loss": 0.5047,
6553
+ "mean_token_accuracy": 0.842636145055294,
6554
+ "num_tokens": 360053289.0,
6555
+ "step": 16350
6556
+ },
6557
+ {
6558
+ "epoch": 11.900927104162879,
6559
+ "grad_norm": 0.8720398545265198,
6560
+ "learning_rate": 1.375380836302398e-08,
6561
+ "loss": 0.5006,
6562
+ "mean_token_accuracy": 0.8434941950440407,
6563
+ "num_tokens": 360612686.0,
6564
+ "step": 16375
6565
+ },
6566
+ {
6567
+ "epoch": 11.919105617160517,
6568
+ "grad_norm": 0.951812207698822,
6569
+ "learning_rate": 9.192341177697295e-09,
6570
+ "loss": 0.5031,
6571
+ "mean_token_accuracy": 0.8426809054613114,
6572
+ "num_tokens": 361163960.0,
6573
+ "step": 16400
6574
+ },
6575
+ {
6576
+ "epoch": 11.937284130158153,
6577
+ "grad_norm": 0.8881998062133789,
6578
+ "learning_rate": 5.546706579679928e-09,
6579
+ "loss": 0.5146,
6580
+ "mean_token_accuracy": 0.8401396802067757,
6581
+ "num_tokens": 361709569.0,
6582
+ "step": 16425
6583
+ },
6584
+ {
6585
+ "epoch": 11.95546264315579,
6586
+ "grad_norm": 0.864645779132843,
6587
+ "learning_rate": 2.816988057929848e-09,
6588
+ "loss": 0.516,
6589
+ "mean_token_accuracy": 0.838928511440754,
6590
+ "num_tokens": 362266243.0,
6591
+ "step": 16450
6592
+ },
6593
+ {
6594
+ "epoch": 11.973641156153427,
6595
+ "grad_norm": 0.8667661547660828,
6596
+ "learning_rate": 1.0032481259436566e-09,
6597
+ "loss": 0.5195,
6598
+ "mean_token_accuracy": 0.8384470102190972,
6599
+ "num_tokens": 362824221.0,
6600
+ "step": 16475
6601
+ },
6602
+ {
6603
+ "epoch": 11.991819669151063,
6604
+ "grad_norm": 0.8674113154411316,
6605
+ "learning_rate": 1.0552832032217908e-10,
6606
+ "loss": 0.508,
6607
+ "mean_token_accuracy": 0.8425227817893028,
6608
+ "num_tokens": 363367808.0,
6609
+ "step": 16500
6610
+ },
6611
+ {
6612
+ "epoch": 11.991819669151063,
6613
+ "eval_loss": 0.5378732681274414,
6614
+ "eval_mean_token_accuracy": 0.8326478998840244,
6615
+ "eval_num_tokens": 363367808.0,
6616
+ "eval_runtime": 111.938,
6617
+ "eval_samples_per_second": 43.685,
6618
+ "eval_steps_per_second": 5.467,
6619
+ "step": 16500
6620
  }
6621
  ],
6622
  "logging_steps": 25,
 
6636
  "attributes": {}
6637
  }
6638
  },
6639
+ "total_flos": 9.164882254873231e+17,
6640
  "train_batch_size": 4,
6641
  "trial_name": null,
6642
  "trial_params": null