aghatage commited on
Commit
ceb48da
·
verified ·
1 Parent(s): 851d836

Training in progress, step 11500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12a26c5ae72d4aef0518c84efdc9e2d761d59aa9a43290c6e5e8c3c753145a81
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d739a46be07afc08058bcee6abb1772a84e044deaf39817666f3049bcf653c23
3
  size 12017472
last-checkpoint/global_step11500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d600239e729c4ded64651931f6cb684445e13a28cc9c8180766ac66dd15525f4
3
+ size 71982309
last-checkpoint/global_step11500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2820de752ec7bbd3367caf1e9cd13773cad956bad3832cf6ffd7af2da666c1
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step11000
 
1
+ global_step11500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5874e4cafeab6bbc56f1a9ea496d34e4351f44b1e8ee07a759c92cdeb51ddda5
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d17e6956d333adf450e550fb2bbfe82bc47be67acb5350845a13faa81c890b40
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 11000,
3
- "best_metric": 0.5450185537338257,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-11000",
5
- "epoch": 7.9947282312306855,
6
  "eval_steps": 250,
7
- "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4417,6 +4417,206 @@
4417
  "eval_samples_per_second": 43.083,
4418
  "eval_steps_per_second": 5.392,
4419
  "step": 11000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4420
  }
4421
  ],
4422
  "logging_steps": 25,
@@ -4436,7 +4636,7 @@
4436
  "attributes": {}
4437
  }
4438
  },
4439
- "total_flos": 6.109487891459604e+17,
4440
  "train_batch_size": 4,
4441
  "trial_name": null,
4442
  "trial_params": null
 
1
  {
2
+ "best_global_step": 11500,
3
+ "best_metric": 0.544745683670044,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-11500",
5
+ "epoch": 8.357753135793493,
6
  "eval_steps": 250,
7
+ "global_step": 11500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4417
  "eval_samples_per_second": 43.083,
4418
  "eval_steps_per_second": 5.392,
4419
  "step": 11000
4420
+ },
4421
+ {
4422
+ "epoch": 8.012361388838393,
4423
+ "grad_norm": 0.8800711035728455,
4424
+ "learning_rate": 2.0108403898867298e-05,
4425
+ "loss": 0.511,
4426
+ "mean_token_accuracy": 0.8405692936833372,
4427
+ "num_tokens": 242787188.0,
4428
+ "step": 11025
4429
+ },
4430
+ {
4431
+ "epoch": 8.030539901836029,
4432
+ "grad_norm": 0.8134773969650269,
4433
+ "learning_rate": 1.994255888710432e-05,
4434
+ "loss": 0.5161,
4435
+ "mean_token_accuracy": 0.8396863287687302,
4436
+ "num_tokens": 243323359.0,
4437
+ "step": 11050
4438
+ },
4439
+ {
4440
+ "epoch": 8.048718414833667,
4441
+ "grad_norm": 0.8506718277931213,
4442
+ "learning_rate": 1.977717321235564e-05,
4443
+ "loss": 0.5184,
4444
+ "mean_token_accuracy": 0.8389563143253327,
4445
+ "num_tokens": 243872329.0,
4446
+ "step": 11075
4447
+ },
4448
+ {
4449
+ "epoch": 8.066896927831303,
4450
+ "grad_norm": 0.8440150022506714,
4451
+ "learning_rate": 1.9612250662131406e-05,
4452
+ "loss": 0.5154,
4453
+ "mean_token_accuracy": 0.8382254421710968,
4454
+ "num_tokens": 244414078.0,
4455
+ "step": 11100
4456
+ },
4457
+ {
4458
+ "epoch": 8.08507544082894,
4459
+ "grad_norm": 0.8506153225898743,
4460
+ "learning_rate": 1.9447795013335734e-05,
4461
+ "loss": 0.5232,
4462
+ "mean_token_accuracy": 0.8369153061509133,
4463
+ "num_tokens": 244971752.0,
4464
+ "step": 11125
4465
+ },
4466
+ {
4467
+ "epoch": 8.103253953826577,
4468
+ "grad_norm": 0.8672150373458862,
4469
+ "learning_rate": 1.9283810032180205e-05,
4470
+ "loss": 0.512,
4471
+ "mean_token_accuracy": 0.8409202411770821,
4472
+ "num_tokens": 245520458.0,
4473
+ "step": 11150
4474
+ },
4475
+ {
4476
+ "epoch": 8.121432466824213,
4477
+ "grad_norm": 0.8128538727760315,
4478
+ "learning_rate": 1.9120299474097583e-05,
4479
+ "loss": 0.5209,
4480
+ "mean_token_accuracy": 0.837473659813404,
4481
+ "num_tokens": 246088615.0,
4482
+ "step": 11175
4483
+ },
4484
+ {
4485
+ "epoch": 8.139610979821851,
4486
+ "grad_norm": 0.7617402076721191,
4487
+ "learning_rate": 1.8957267083655835e-05,
4488
+ "loss": 0.5153,
4489
+ "mean_token_accuracy": 0.8394093406200409,
4490
+ "num_tokens": 246630403.0,
4491
+ "step": 11200
4492
+ },
4493
+ {
4494
+ "epoch": 8.157789492819488,
4495
+ "grad_norm": 0.8872790932655334,
4496
+ "learning_rate": 1.8794716594472376e-05,
4497
+ "loss": 0.5179,
4498
+ "mean_token_accuracy": 0.838570873439312,
4499
+ "num_tokens": 247175258.0,
4500
+ "step": 11225
4501
+ },
4502
+ {
4503
+ "epoch": 8.175968005817124,
4504
+ "grad_norm": 0.7966537475585938,
4505
+ "learning_rate": 1.8632651729128564e-05,
4506
+ "loss": 0.5209,
4507
+ "mean_token_accuracy": 0.8365825054049492,
4508
+ "num_tokens": 247743507.0,
4509
+ "step": 11250
4510
+ },
4511
+ {
4512
+ "epoch": 8.175968005817124,
4513
+ "eval_loss": 0.5448639392852783,
4514
+ "eval_mean_token_accuracy": 0.8303035672973184,
4515
+ "eval_num_tokens": 247743507.0,
4516
+ "eval_runtime": 113.7219,
4517
+ "eval_samples_per_second": 43.0,
4518
+ "eval_steps_per_second": 5.382,
4519
+ "step": 11250
4520
+ },
4521
+ {
4522
+ "epoch": 8.194146518814762,
4523
+ "grad_norm": 0.9003967642784119,
4524
+ "learning_rate": 1.847107619908445e-05,
4525
+ "loss": 0.5157,
4526
+ "mean_token_accuracy": 0.8391850134730339,
4527
+ "num_tokens": 248275961.0,
4528
+ "step": 11275
4529
+ },
4530
+ {
4531
+ "epoch": 8.212325031812398,
4532
+ "grad_norm": 0.7678829431533813,
4533
+ "learning_rate": 1.8309993704593756e-05,
4534
+ "loss": 0.5175,
4535
+ "mean_token_accuracy": 0.8387827044725418,
4536
+ "num_tokens": 248835571.0,
4537
+ "step": 11300
4538
+ },
4539
+ {
4540
+ "epoch": 8.230503544810034,
4541
+ "grad_norm": 0.8297247290611267,
4542
+ "learning_rate": 1.8149407934619215e-05,
4543
+ "loss": 0.5213,
4544
+ "mean_token_accuracy": 0.8382138457894325,
4545
+ "num_tokens": 249386561.0,
4546
+ "step": 11325
4547
+ },
4548
+ {
4549
+ "epoch": 8.248682057807672,
4550
+ "grad_norm": 0.8659992218017578,
4551
+ "learning_rate": 1.798932256674798e-05,
4552
+ "loss": 0.5181,
4553
+ "mean_token_accuracy": 0.8384436306357383,
4554
+ "num_tokens": 249964812.0,
4555
+ "step": 11350
4556
+ },
4557
+ {
4558
+ "epoch": 8.266860570805308,
4559
+ "grad_norm": 0.8487904071807861,
4560
+ "learning_rate": 1.782974126710748e-05,
4561
+ "loss": 0.5243,
4562
+ "mean_token_accuracy": 0.8366836148500443,
4563
+ "num_tokens": 250524273.0,
4564
+ "step": 11375
4565
+ },
4566
+ {
4567
+ "epoch": 8.285039083802944,
4568
+ "grad_norm": 0.8609278202056885,
4569
+ "learning_rate": 1.767066769028143e-05,
4570
+ "loss": 0.521,
4571
+ "mean_token_accuracy": 0.8375069627165794,
4572
+ "num_tokens": 251087296.0,
4573
+ "step": 11400
4574
+ },
4575
+ {
4576
+ "epoch": 8.303217596800582,
4577
+ "grad_norm": 0.8295932412147522,
4578
+ "learning_rate": 1.7512105479226144e-05,
4579
+ "loss": 0.5205,
4580
+ "mean_token_accuracy": 0.8369895967841149,
4581
+ "num_tokens": 251637750.0,
4582
+ "step": 11425
4583
+ },
4584
+ {
4585
+ "epoch": 8.321396109798219,
4586
+ "grad_norm": 0.8182777166366577,
4587
+ "learning_rate": 1.7354058265187116e-05,
4588
+ "loss": 0.5224,
4589
+ "mean_token_accuracy": 0.8378088471293449,
4590
+ "num_tokens": 252191575.0,
4591
+ "step": 11450
4592
+ },
4593
+ {
4594
+ "epoch": 8.339574622795855,
4595
+ "grad_norm": 0.7869584560394287,
4596
+ "learning_rate": 1.7196529667615838e-05,
4597
+ "loss": 0.518,
4598
+ "mean_token_accuracy": 0.8380302327871323,
4599
+ "num_tokens": 252747951.0,
4600
+ "step": 11475
4601
+ },
4602
+ {
4603
+ "epoch": 8.357753135793493,
4604
+ "grad_norm": 0.7901642918586731,
4605
+ "learning_rate": 1.7039523294086968e-05,
4606
+ "loss": 0.5188,
4607
+ "mean_token_accuracy": 0.8381170380115509,
4608
+ "num_tokens": 253308593.0,
4609
+ "step": 11500
4610
+ },
4611
+ {
4612
+ "epoch": 8.357753135793493,
4613
+ "eval_loss": 0.544745683670044,
4614
+ "eval_mean_token_accuracy": 0.8305268148386401,
4615
+ "eval_num_tokens": 253308593.0,
4616
+ "eval_runtime": 114.0858,
4617
+ "eval_samples_per_second": 42.862,
4618
+ "eval_steps_per_second": 5.364,
4619
+ "step": 11500
4620
  }
4621
  ],
4622
  "logging_steps": 25,
 
4636
  "attributes": {}
4637
  }
4638
  },
4639
+ "total_flos": 6.387220694035333e+17,
4640
  "train_batch_size": 4,
4641
  "trial_name": null,
4642
  "trial_params": null