Wilsonwin commited on
Commit
4198228
·
verified ·
1 Parent(s): 51079db

Training in progress, step 9500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5822c5d51ff1a3f6c8d63d9491441c689004f44619d361568f98a19df1caeab
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb42262712eb0446298aefaf9502d1bce878381fa4256b4c412cb875cf7676dd
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2c88f007992dd9990ea0216c73aaca02a8b4aebfac4c43fbb77c941bb9cf18e
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37b61fef6d8dab3892dcb676937372c6938b18c4b8be84f3a00936c78dd241b6
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88595be53afbf68c948f838fbf4b1fa7776619d23de4baf3620fece471fafed5
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:482021b320968c1aef3bb227f66c018b401e7317860a8a4bae46f36ed2c71427
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:528ba9a1d2a5739586b1652bb1454f9e977f93a6ae9e9c38a71b51bc41c45de4
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4784f3b1ac308d4093c525f58ebfb1ed5c4e7ca17828bd58e2e6a8e2baed20b5
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.5205271160669032,
6
  "eval_steps": 500,
7
- "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6459,6 +6459,364 @@
6459
  "eval_samples_per_second": 240.409,
6460
  "eval_steps_per_second": 5.049,
6461
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6462
  }
6463
  ],
6464
  "logging_steps": 10,
@@ -6478,7 +6836,7 @@
6478
  "attributes": {}
6479
  }
6480
  },
6481
- "total_flos": 3.010090484178616e+17,
6482
  "train_batch_size": 48,
6483
  "trial_name": null,
6484
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.6050008447372868,
6
  "eval_steps": 500,
7
+ "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6459
  "eval_samples_per_second": 240.409,
6460
  "eval_steps_per_second": 5.049,
6461
  "step": 9000
6462
+ },
6463
+ {
6464
+ "epoch": 1.522216590640311,
6465
+ "grad_norm": 0.48225322365760803,
6466
+ "learning_rate": 5.715753870066455e-05,
6467
+ "loss": 4.3221698760986325,
6468
+ "step": 9010
6469
+ },
6470
+ {
6471
+ "epoch": 1.5239060652137186,
6472
+ "grad_norm": 0.5018514394760132,
6473
+ "learning_rate": 5.67817924971296e-05,
6474
+ "loss": 4.321614456176758,
6475
+ "step": 9020
6476
+ },
6477
+ {
6478
+ "epoch": 1.5255955397871261,
6479
+ "grad_norm": 0.5176340341567993,
6480
+ "learning_rate": 5.6406996868811885e-05,
6481
+ "loss": 4.335358810424805,
6482
+ "step": 9030
6483
+ },
6484
+ {
6485
+ "epoch": 1.5272850143605339,
6486
+ "grad_norm": 0.48670732975006104,
6487
+ "learning_rate": 5.60331556376197e-05,
6488
+ "loss": 4.33309326171875,
6489
+ "step": 9040
6490
+ },
6491
+ {
6492
+ "epoch": 1.5289744889339416,
6493
+ "grad_norm": 0.49554112553596497,
6494
+ "learning_rate": 5.566027261572907e-05,
6495
+ "loss": 4.316144943237305,
6496
+ "step": 9050
6497
+ },
6498
+ {
6499
+ "epoch": 1.5306639635073491,
6500
+ "grad_norm": 0.5128636956214905,
6501
+ "learning_rate": 5.528835160554475e-05,
6502
+ "loss": 4.335286712646484,
6503
+ "step": 9060
6504
+ },
6505
+ {
6506
+ "epoch": 1.5323534380807569,
6507
+ "grad_norm": 0.4977918863296509,
6508
+ "learning_rate": 5.491739639966153e-05,
6509
+ "loss": 4.342447662353516,
6510
+ "step": 9070
6511
+ },
6512
+ {
6513
+ "epoch": 1.5340429126541646,
6514
+ "grad_norm": 0.5133760571479797,
6515
+ "learning_rate": 5.454741078082578e-05,
6516
+ "loss": 4.342383956909179,
6517
+ "step": 9080
6518
+ },
6519
+ {
6520
+ "epoch": 1.535732387227572,
6521
+ "grad_norm": 0.4748549461364746,
6522
+ "learning_rate": 5.417839852189653e-05,
6523
+ "loss": 4.369438171386719,
6524
+ "step": 9090
6525
+ },
6526
+ {
6527
+ "epoch": 1.53742186180098,
6528
+ "grad_norm": 0.4682454466819763,
6529
+ "learning_rate": 5.381036338580718e-05,
6530
+ "loss": 4.323982238769531,
6531
+ "step": 9100
6532
+ },
6533
+ {
6534
+ "epoch": 1.5391113363743876,
6535
+ "grad_norm": 0.48015832901000977,
6536
+ "learning_rate": 5.344330912552703e-05,
6537
+ "loss": 4.3247119903564455,
6538
+ "step": 9110
6539
+ },
6540
+ {
6541
+ "epoch": 1.540800810947795,
6542
+ "grad_norm": 0.4660989046096802,
6543
+ "learning_rate": 5.3077239484023385e-05,
6544
+ "loss": 4.338339614868164,
6545
+ "step": 9120
6546
+ },
6547
+ {
6548
+ "epoch": 1.542490285521203,
6549
+ "grad_norm": 0.48378968238830566,
6550
+ "learning_rate": 5.271215819422277e-05,
6551
+ "loss": 4.342069244384765,
6552
+ "step": 9130
6553
+ },
6554
+ {
6555
+ "epoch": 1.5441797600946106,
6556
+ "grad_norm": 0.4721354842185974,
6557
+ "learning_rate": 5.234806897897328e-05,
6558
+ "loss": 4.35260009765625,
6559
+ "step": 9140
6560
+ },
6561
+ {
6562
+ "epoch": 1.5458692346680183,
6563
+ "grad_norm": 0.47296905517578125,
6564
+ "learning_rate": 5.1984975551006434e-05,
6565
+ "loss": 4.3343055725097654,
6566
+ "step": 9150
6567
+ },
6568
+ {
6569
+ "epoch": 1.547558709241426,
6570
+ "grad_norm": 0.49029457569122314,
6571
+ "learning_rate": 5.1622881612899635e-05,
6572
+ "loss": 4.331478881835937,
6573
+ "step": 9160
6574
+ },
6575
+ {
6576
+ "epoch": 1.5492481838148335,
6577
+ "grad_norm": 0.4729316234588623,
6578
+ "learning_rate": 5.126179085703794e-05,
6579
+ "loss": 4.309265899658203,
6580
+ "step": 9170
6581
+ },
6582
+ {
6583
+ "epoch": 1.5509376583882413,
6584
+ "grad_norm": 0.4636003375053406,
6585
+ "learning_rate": 5.090170696557667e-05,
6586
+ "loss": 4.332284164428711,
6587
+ "step": 9180
6588
+ },
6589
+ {
6590
+ "epoch": 1.552627132961649,
6591
+ "grad_norm": 0.4683416783809662,
6592
+ "learning_rate": 5.054263361040395e-05,
6593
+ "loss": 4.323814392089844,
6594
+ "step": 9190
6595
+ },
6596
+ {
6597
+ "epoch": 1.5543166075350565,
6598
+ "grad_norm": 0.48071300983428955,
6599
+ "learning_rate": 5.018457445310313e-05,
6600
+ "loss": 4.331411743164063,
6601
+ "step": 9200
6602
+ },
6603
+ {
6604
+ "epoch": 1.5560060821084643,
6605
+ "grad_norm": 0.48741987347602844,
6606
+ "learning_rate": 4.9827533144915384e-05,
6607
+ "loss": 4.315482711791992,
6608
+ "step": 9210
6609
+ },
6610
+ {
6611
+ "epoch": 1.557695556681872,
6612
+ "grad_norm": 0.47064927220344543,
6613
+ "learning_rate": 4.9471513326702544e-05,
6614
+ "loss": 4.333251571655273,
6615
+ "step": 9220
6616
+ },
6617
+ {
6618
+ "epoch": 1.5593850312552795,
6619
+ "grad_norm": 0.48281940817832947,
6620
+ "learning_rate": 4.911651862891014e-05,
6621
+ "loss": 4.332812118530273,
6622
+ "step": 9230
6623
+ },
6624
+ {
6625
+ "epoch": 1.5610745058286872,
6626
+ "grad_norm": 0.4713364541530609,
6627
+ "learning_rate": 4.876255267153011e-05,
6628
+ "loss": 4.334049224853516,
6629
+ "step": 9240
6630
+ },
6631
+ {
6632
+ "epoch": 1.562763980402095,
6633
+ "grad_norm": 0.47604429721832275,
6634
+ "learning_rate": 4.8409619064063965e-05,
6635
+ "loss": 4.322870254516602,
6636
+ "step": 9250
6637
+ },
6638
+ {
6639
+ "epoch": 1.5644534549755025,
6640
+ "grad_norm": 0.4858945608139038,
6641
+ "learning_rate": 4.805772140548613e-05,
6642
+ "loss": 4.333293914794922,
6643
+ "step": 9260
6644
+ },
6645
+ {
6646
+ "epoch": 1.5661429295489104,
6647
+ "grad_norm": 0.4747396409511566,
6648
+ "learning_rate": 4.770686328420713e-05,
6649
+ "loss": 4.309678649902343,
6650
+ "step": 9270
6651
+ },
6652
+ {
6653
+ "epoch": 1.567832404122318,
6654
+ "grad_norm": 0.46066993474960327,
6655
+ "learning_rate": 4.7357048278036944e-05,
6656
+ "loss": 4.335137176513672,
6657
+ "step": 9280
6658
+ },
6659
+ {
6660
+ "epoch": 1.5695218786957257,
6661
+ "grad_norm": 0.48828113079071045,
6662
+ "learning_rate": 4.700827995414853e-05,
6663
+ "loss": 4.319439315795899,
6664
+ "step": 9290
6665
+ },
6666
+ {
6667
+ "epoch": 1.5712113532691334,
6668
+ "grad_norm": 0.48410648107528687,
6669
+ "learning_rate": 4.666056186904168e-05,
6670
+ "loss": 4.3514057159423825,
6671
+ "step": 9300
6672
+ },
6673
+ {
6674
+ "epoch": 1.572900827842541,
6675
+ "grad_norm": 0.4797396957874298,
6676
+ "learning_rate": 4.63138975685064e-05,
6677
+ "loss": 4.323817443847656,
6678
+ "step": 9310
6679
+ },
6680
+ {
6681
+ "epoch": 1.5745903024159487,
6682
+ "grad_norm": 0.46725404262542725,
6683
+ "learning_rate": 4.596829058758694e-05,
6684
+ "loss": 4.341088104248047,
6685
+ "step": 9320
6686
+ },
6687
+ {
6688
+ "epoch": 1.5762797769893564,
6689
+ "grad_norm": 0.4728842079639435,
6690
+ "learning_rate": 4.5623744450545846e-05,
6691
+ "loss": 4.356753540039063,
6692
+ "step": 9330
6693
+ },
6694
+ {
6695
+ "epoch": 1.577969251562764,
6696
+ "grad_norm": 0.4818381071090698,
6697
+ "learning_rate": 4.528026267082786e-05,
6698
+ "loss": 4.344687652587891,
6699
+ "step": 9340
6700
+ },
6701
+ {
6702
+ "epoch": 1.5796587261361716,
6703
+ "grad_norm": 0.47536230087280273,
6704
+ "learning_rate": 4.493784875102409e-05,
6705
+ "loss": 4.327443695068359,
6706
+ "step": 9350
6707
+ },
6708
+ {
6709
+ "epoch": 1.5813482007095794,
6710
+ "grad_norm": 0.5105261206626892,
6711
+ "learning_rate": 4.45965061828363e-05,
6712
+ "loss": 4.336804962158203,
6713
+ "step": 9360
6714
+ },
6715
+ {
6716
+ "epoch": 1.583037675282987,
6717
+ "grad_norm": 0.46770450472831726,
6718
+ "learning_rate": 4.4256238447041556e-05,
6719
+ "loss": 4.3366447448730465,
6720
+ "step": 9370
6721
+ },
6722
+ {
6723
+ "epoch": 1.5847271498563946,
6724
+ "grad_norm": 0.508904218673706,
6725
+ "learning_rate": 4.39170490134563e-05,
6726
+ "loss": 4.325738143920899,
6727
+ "step": 9380
6728
+ },
6729
+ {
6730
+ "epoch": 1.5864166244298024,
6731
+ "grad_norm": 0.46618375182151794,
6732
+ "learning_rate": 4.3578941340901274e-05,
6733
+ "loss": 4.311971282958984,
6734
+ "step": 9390
6735
+ },
6736
+ {
6737
+ "epoch": 1.5881060990032099,
6738
+ "grad_norm": 0.4693259596824646,
6739
+ "learning_rate": 4.324191887716612e-05,
6740
+ "loss": 4.320106124877929,
6741
+ "step": 9400
6742
+ },
6743
+ {
6744
+ "epoch": 1.5897955735766178,
6745
+ "grad_norm": 0.4754733145236969,
6746
+ "learning_rate": 4.290598505897439e-05,
6747
+ "loss": 4.334828948974609,
6748
+ "step": 9410
6749
+ },
6750
+ {
6751
+ "epoch": 1.5914850481500253,
6752
+ "grad_norm": 0.4678189158439636,
6753
+ "learning_rate": 4.25711433119483e-05,
6754
+ "loss": 4.344146347045898,
6755
+ "step": 9420
6756
+ },
6757
+ {
6758
+ "epoch": 1.5931745227234329,
6759
+ "grad_norm": 0.48414650559425354,
6760
+ "learning_rate": 4.223739705057384e-05,
6761
+ "loss": 4.333245849609375,
6762
+ "step": 9430
6763
+ },
6764
+ {
6765
+ "epoch": 1.5948639972968408,
6766
+ "grad_norm": 0.4806137979030609,
6767
+ "learning_rate": 4.1904749678165965e-05,
6768
+ "loss": 4.320773315429688,
6769
+ "step": 9440
6770
+ },
6771
+ {
6772
+ "epoch": 1.5965534718702483,
6773
+ "grad_norm": 0.46673110127449036,
6774
+ "learning_rate": 4.157320458683409e-05,
6775
+ "loss": 4.282149887084961,
6776
+ "step": 9450
6777
+ },
6778
+ {
6779
+ "epoch": 1.598242946443656,
6780
+ "grad_norm": 0.48379746079444885,
6781
+ "learning_rate": 4.124276515744713e-05,
6782
+ "loss": 4.316770935058594,
6783
+ "step": 9460
6784
+ },
6785
+ {
6786
+ "epoch": 1.5999324210170638,
6787
+ "grad_norm": 0.467012882232666,
6788
+ "learning_rate": 4.091343475959928e-05,
6789
+ "loss": 4.311006164550781,
6790
+ "step": 9470
6791
+ },
6792
+ {
6793
+ "epoch": 1.6016218955904713,
6794
+ "grad_norm": 0.45999497175216675,
6795
+ "learning_rate": 4.058521675157563e-05,
6796
+ "loss": 4.31392822265625,
6797
+ "step": 9480
6798
+ },
6799
+ {
6800
+ "epoch": 1.603311370163879,
6801
+ "grad_norm": 0.49554598331451416,
6802
+ "learning_rate": 4.025811448031792e-05,
6803
+ "loss": 4.317913818359375,
6804
+ "step": 9490
6805
+ },
6806
+ {
6807
+ "epoch": 1.6050008447372868,
6808
+ "grad_norm": 0.4512559771537781,
6809
+ "learning_rate": 3.993213128139027e-05,
6810
+ "loss": 4.320844650268555,
6811
+ "step": 9500
6812
+ },
6813
+ {
6814
+ "epoch": 1.6050008447372868,
6815
+ "eval_loss": 4.294473648071289,
6816
+ "eval_runtime": 7.7128,
6817
+ "eval_samples_per_second": 129.654,
6818
+ "eval_steps_per_second": 2.723,
6819
+ "step": 9500
6820
  }
6821
  ],
6822
  "logging_steps": 10,
 
6836
  "attributes": {}
6837
  }
6838
  },
6839
+ "total_flos": 3.177318894608056e+17,
6840
  "train_batch_size": 48,
6841
  "trial_name": null,
6842
  "trial_params": null