Azrail commited on
Commit
ea8b40f
·
verified ·
1 Parent(s): 8b4ce0d

Training in progress, step 122000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71fae22dcd21758bd18c93255be6587d157b9938e670e9b4e1e58707f826293b
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81b6085fb8cdb1171b74b00e5808748cf92ce0ddf8ba548a106b9e635e652ce5
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c78fd0c407d20f07636b49b2421a64b67521b73a2c07508922e8bab006631080
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2584540bb683d62bf86744736defc5b1b50bc3492f528f85e121c6574fb37a99
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d05682589c4464dbd9ebcfc283944f7611626ce7745ad85f4042e5c5171b5198
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3efcdbc541e421955fc1801cd719c72805694f44c64389ef735698f77e94dcbf
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5601bca8adb9619336ad1a8f8dd5a3bb4b196a7ee7870568f8cb821d9554477
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd02e3ed8ffd9c6d891f91758bb97fdbe6142d1b35a6390b66d152313f44683b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1543484741882013,
6
  "eval_steps": 500,
7
- "global_step": 121000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -21546,11 +21546,189 @@
21546
  "eval_steps_per_second": 15.103,
21547
  "num_input_tokens_seen": 63428647904,
21548
  "step": 121000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21549
  }
21550
  ],
21551
  "logging_steps": 50,
21552
  "max_steps": 140000,
21553
- "num_input_tokens_seen": 63428647904,
21554
  "num_train_epochs": 2,
21555
  "save_steps": 1000,
21556
  "stateful_callbacks": {
@@ -21565,7 +21743,7 @@
21565
  "attributes": {}
21566
  }
21567
  },
21568
- "total_flos": 1.1225713740470231e+20,
21569
  "train_batch_size": 32,
21570
  "trial_name": null,
21571
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1638885245119668,
6
  "eval_steps": 500,
7
+ "global_step": 122000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
21546
  "eval_steps_per_second": 15.103,
21547
  "num_input_tokens_seen": 63428647904,
21548
  "step": 121000
21549
+ },
21550
+ {
21551
+ "epoch": 1.1548254767043895,
21552
+ "grad_norm": 0.1527141034603119,
21553
+ "learning_rate": 0.000763636808879545,
21554
+ "loss": 2.0812,
21555
+ "num_input_tokens_seen": 63454858592,
21556
+ "step": 121050
21557
+ },
21558
+ {
21559
+ "epoch": 1.155302479220578,
21560
+ "grad_norm": 0.14409616589546204,
21561
+ "learning_rate": 0.0007612492823579744,
21562
+ "loss": 2.0757,
21563
+ "num_input_tokens_seen": 63481069536,
21564
+ "step": 121100
21565
+ },
21566
+ {
21567
+ "epoch": 1.1557794817367661,
21568
+ "grad_norm": 0.1311630755662918,
21569
+ "learning_rate": 0.0007588535338328816,
21570
+ "loss": 2.0714,
21571
+ "num_input_tokens_seen": 63507276640,
21572
+ "step": 121150
21573
+ },
21574
+ {
21575
+ "epoch": 1.1562564842529544,
21576
+ "grad_norm": 0.12864112854003906,
21577
+ "learning_rate": 0.0007564496387029531,
21578
+ "loss": 2.0703,
21579
+ "num_input_tokens_seen": 63533491040,
21580
+ "step": 121200
21581
+ },
21582
+ {
21583
+ "epoch": 1.1567334867691428,
21584
+ "grad_norm": 0.1277550309896469,
21585
+ "learning_rate": 0.0007540376726232647,
21586
+ "loss": 2.0833,
21587
+ "num_input_tokens_seen": 63559699712,
21588
+ "step": 121250
21589
+ },
21590
+ {
21591
+ "epoch": 1.157210489285331,
21592
+ "grad_norm": 0.13141444325447083,
21593
+ "learning_rate": 0.0007516177115029001,
21594
+ "loss": 2.0755,
21595
+ "num_input_tokens_seen": 63585905408,
21596
+ "step": 121300
21597
+ },
21598
+ {
21599
+ "epoch": 1.1576874918015192,
21600
+ "grad_norm": 0.13436725735664368,
21601
+ "learning_rate": 0.0007491898315025615,
21602
+ "loss": 2.0716,
21603
+ "num_input_tokens_seen": 63612116704,
21604
+ "step": 121350
21605
+ },
21606
+ {
21607
+ "epoch": 1.1581644943177074,
21608
+ "grad_norm": 0.13668642938137054,
21609
+ "learning_rate": 0.0007467541090321735,
21610
+ "loss": 2.0766,
21611
+ "num_input_tokens_seen": 63638330048,
21612
+ "step": 121400
21613
+ },
21614
+ {
21615
+ "epoch": 1.1586414968338958,
21616
+ "grad_norm": 0.22589260339736938,
21617
+ "learning_rate": 0.0007443106207484776,
21618
+ "loss": 2.0793,
21619
+ "num_input_tokens_seen": 63664542944,
21620
+ "step": 121450
21621
+ },
21622
+ {
21623
+ "epoch": 1.159118499350084,
21624
+ "grad_norm": 0.14154261350631714,
21625
+ "learning_rate": 0.00074185944355262,
21626
+ "loss": 2.0938,
21627
+ "num_input_tokens_seen": 63690757024,
21628
+ "step": 121500
21629
+ },
21630
+ {
21631
+ "epoch": 1.159118499350084,
21632
+ "eval_loss": 1.9929685592651367,
21633
+ "eval_runtime": 82.8366,
21634
+ "eval_samples_per_second": 60.36,
21635
+ "eval_steps_per_second": 15.09,
21636
+ "num_input_tokens_seen": 63690757024,
21637
+ "step": 121500
21638
+ },
21639
+ {
21640
+ "epoch": 1.1595955018662722,
21641
+ "grad_norm": 0.13303405046463013,
21642
+ "learning_rate": 0.0007394006545877314,
21643
+ "loss": 2.078,
21644
+ "num_input_tokens_seen": 63716968288,
21645
+ "step": 121550
21646
+ },
21647
+ {
21648
+ "epoch": 1.1600725043824607,
21649
+ "grad_norm": 0.12762907147407532,
21650
+ "learning_rate": 0.0007369343312364993,
21651
+ "loss": 2.0757,
21652
+ "num_input_tokens_seen": 63743181728,
21653
+ "step": 121600
21654
+ },
21655
+ {
21656
+ "epoch": 1.1605495068986489,
21657
+ "grad_norm": 0.160507932305336,
21658
+ "learning_rate": 0.0007344605511187322,
21659
+ "loss": 2.076,
21660
+ "num_input_tokens_seen": 63769396128,
21661
+ "step": 121650
21662
+ },
21663
+ {
21664
+ "epoch": 1.161026509414837,
21665
+ "grad_norm": 0.14160197973251343,
21666
+ "learning_rate": 0.0007319793920889171,
21667
+ "loss": 2.0762,
21668
+ "num_input_tokens_seen": 63795607296,
21669
+ "step": 121700
21670
+ },
21671
+ {
21672
+ "epoch": 1.1615035119310255,
21673
+ "grad_norm": 0.15858200192451477,
21674
+ "learning_rate": 0.0007294909322337689,
21675
+ "loss": 2.08,
21676
+ "num_input_tokens_seen": 63821818336,
21677
+ "step": 121750
21678
+ },
21679
+ {
21680
+ "epoch": 1.1619805144472137,
21681
+ "grad_norm": 0.13940422236919403,
21682
+ "learning_rate": 0.0007269952498697733,
21683
+ "loss": 2.0816,
21684
+ "num_input_tokens_seen": 63848031552,
21685
+ "step": 121800
21686
+ },
21687
+ {
21688
+ "epoch": 1.162457516963402,
21689
+ "grad_norm": 0.13600219786167145,
21690
+ "learning_rate": 0.0007244924235407223,
21691
+ "loss": 2.0757,
21692
+ "num_input_tokens_seen": 63874245952,
21693
+ "step": 121850
21694
+ },
21695
+ {
21696
+ "epoch": 1.1629345194795904,
21697
+ "grad_norm": 0.14759120345115662,
21698
+ "learning_rate": 0.0007219825320152411,
21699
+ "loss": 2.0883,
21700
+ "num_input_tokens_seen": 63900453792,
21701
+ "step": 121900
21702
+ },
21703
+ {
21704
+ "epoch": 1.1634115219957786,
21705
+ "grad_norm": 0.12860442698001862,
21706
+ "learning_rate": 0.0007194656542843102,
21707
+ "loss": 2.0802,
21708
+ "num_input_tokens_seen": 63926661920,
21709
+ "step": 121950
21710
+ },
21711
+ {
21712
+ "epoch": 1.1638885245119668,
21713
+ "grad_norm": 0.13766394555568695,
21714
+ "learning_rate": 0.0007169418695587791,
21715
+ "loss": 2.072,
21716
+ "num_input_tokens_seen": 63952872768,
21717
+ "step": 122000
21718
+ },
21719
+ {
21720
+ "epoch": 1.1638885245119668,
21721
+ "eval_loss": 1.991066813468933,
21722
+ "eval_runtime": 82.2634,
21723
+ "eval_samples_per_second": 60.78,
21724
+ "eval_steps_per_second": 15.195,
21725
+ "num_input_tokens_seen": 63952872768,
21726
+ "step": 122000
21727
  }
21728
  ],
21729
  "logging_steps": 50,
21730
  "max_steps": 140000,
21731
+ "num_input_tokens_seen": 63952872768,
21732
  "num_train_epochs": 2,
21733
  "save_steps": 1000,
21734
  "stateful_callbacks": {
 
21743
  "attributes": {}
21744
  }
21745
  },
21746
+ "total_flos": 1.1318491979536712e+20,
21747
  "train_batch_size": 32,
21748
  "trial_name": null,
21749
  "trial_params": null