mtzig commited on
Commit
4525150
·
verified ·
1 Parent(s): 995fbb1

Training in progress, step 6500, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8da094bb58aa315c0dbf65d109a4451df66be453e2431cf1a0c4ecdd9ceebd97
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b70b1bdd1598615c49c4b9a7faeeaa85e0df1ab80935c4de9703e337cbef5419
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96e263ebb8f2a41872ded496e1dc52ad7720376bb634867f9591a3794ab7d3a1
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc84cdd38d4fb81c57c92e318089a0050c59636f80d52d59c7e95dfd9fd62580
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:344a82b08f46e3470679d0297ce3f97fd02b801ccee0da6f53e77cf6d7ea9808
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4712250438ce35119c47f3071be3ca85a4fce51b421eda9263e5ccdc56ad810
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e62e952e34e8732a682ff9a8aa0dfece0ec8b2415897d03feab5a9570104b06d
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03a7d0375d4dc32e11bcee4d7faf50e1efa9d4c215c6763c2e4a46a246814940
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:948187477401863aff3f049c3c44b0abdb0be5c10934fb97375600a1ce977bb0
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f97937888bf353d4425445e26e6749a80bf045549b1996cc08838b4dfb4b8dc4
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25236c04c8f8da3260ea3459f91081dcddc5d5ea2cbe8eabd6054cce06f92faa
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d0834123efa7f652e2e631a76ccc6e13c613f625cb331ed1e2b81641ebca01
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:640b6e661958d243cae1ea127f269a99af317657fa786eb7dc174d158d645b7d
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352b2435cc0fdbc839b93fecae50d3830aa0717204cfab826aa5127ee89d2407
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1f1752f740240735a873bcf35d461dfe262e32638d88fc837774925e8080436
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58cce52d813acc463fc99594977081fcbdb55dfd090284d6dbc8cb7c0ca23dd0
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d98bdde773257c04b20ce747461dedaea858963e3b0ee34044400ab89897a43
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89592922c71a0801bf0d6fdc601852fa0221a03b1ab5fb935185066acc67448f
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8504bbd2ddff94c982c1681f6db902504da78219fcef7f6818b65d415605b80c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc83c14767c41315dc8ec259110c74b59ff4daddfa2add8ceb7d6ecfcf304840
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:196065f493df1d03858eb23e309a1135c0371a98a32ac2a517518646388ed9cf
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9011b070f191d0e0ac40350bb8e4c21dd15e660927e0930b9cd365a37b434167
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7fed0b4c620538bfc967033b7c4ab483c5214d361a87603ce37022eafee14fa
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd7e984441e517b75f1d23d418db3472b205bd6171ca12f9c999f36bc527e641
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60228bdd3999ee852e9677ac091321938441c1f39b0d501df20ea306992b3f39
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c58e283a30a20e60ecca74baeb4de711e3041934465bd25e1ee1e0167c92157
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9443009959424566,
5
  "eval_steps": 20,
6
- "global_step": 6400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -48659,6 +48659,766 @@
48659
  "eval_samples_per_second": 5.929,
48660
  "eval_steps_per_second": 0.204,
48661
  "step": 6400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48662
  }
48663
  ],
48664
  "logging_steps": 1,
@@ -48678,7 +49438,7 @@
48678
  "attributes": {}
48679
  }
48680
  },
48681
- "total_flos": 1.9720278275952476e+18,
48682
  "train_batch_size": 8,
48683
  "trial_name": null,
48684
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9590556990040575,
5
  "eval_steps": 20,
6
+ "global_step": 6500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
48659
  "eval_samples_per_second": 5.929,
48660
  "eval_steps_per_second": 0.204,
48661
  "step": 6400
48662
+ },
48663
+ {
48664
+ "epoch": 0.9444485429730727,
48665
+ "grad_norm": 5.365920543670654,
48666
+ "learning_rate": 1.8696924803641825e-07,
48667
+ "loss": 0.0539,
48668
+ "step": 6401
48669
+ },
48670
+ {
48671
+ "epoch": 0.9445960900036887,
48672
+ "grad_norm": 1.1807736158370972,
48673
+ "learning_rate": 1.8597914335757085e-07,
48674
+ "loss": 0.0485,
48675
+ "step": 6402
48676
+ },
48677
+ {
48678
+ "epoch": 0.9447436370343046,
48679
+ "grad_norm": 1.9345782995224,
48680
+ "learning_rate": 1.8499164260956548e-07,
48681
+ "loss": 0.025,
48682
+ "step": 6403
48683
+ },
48684
+ {
48685
+ "epoch": 0.9448911840649207,
48686
+ "grad_norm": 2.943807363510132,
48687
+ "learning_rate": 1.840067460544137e-07,
48688
+ "loss": 0.0545,
48689
+ "step": 6404
48690
+ },
48691
+ {
48692
+ "epoch": 0.9450387310955367,
48693
+ "grad_norm": 1.9922869205474854,
48694
+ "learning_rate": 1.830244539534376e-07,
48695
+ "loss": 0.0358,
48696
+ "step": 6405
48697
+ },
48698
+ {
48699
+ "epoch": 0.9451862781261527,
48700
+ "grad_norm": 2.3645715713500977,
48701
+ "learning_rate": 1.820447665672631e-07,
48702
+ "loss": 0.0791,
48703
+ "step": 6406
48704
+ },
48705
+ {
48706
+ "epoch": 0.9453338251567687,
48707
+ "grad_norm": 2.1656434535980225,
48708
+ "learning_rate": 1.810676841558301e-07,
48709
+ "loss": 0.0472,
48710
+ "step": 6407
48711
+ },
48712
+ {
48713
+ "epoch": 0.9454813721873847,
48714
+ "grad_norm": 1.5175650119781494,
48715
+ "learning_rate": 1.8009320697838672e-07,
48716
+ "loss": 0.024,
48717
+ "step": 6408
48718
+ },
48719
+ {
48720
+ "epoch": 0.9456289192180007,
48721
+ "grad_norm": 1.9532819986343384,
48722
+ "learning_rate": 1.791213352934851e-07,
48723
+ "loss": 0.045,
48724
+ "step": 6409
48725
+ },
48726
+ {
48727
+ "epoch": 0.9457764662486168,
48728
+ "grad_norm": 1.9861067533493042,
48729
+ "learning_rate": 1.7815206935899332e-07,
48730
+ "loss": 0.0689,
48731
+ "step": 6410
48732
+ },
48733
+ {
48734
+ "epoch": 0.9459240132792327,
48735
+ "grad_norm": 5.2447967529296875,
48736
+ "learning_rate": 1.771854094320835e-07,
48737
+ "loss": 0.0425,
48738
+ "step": 6411
48739
+ },
48740
+ {
48741
+ "epoch": 0.9460715603098487,
48742
+ "grad_norm": 2.146310567855835,
48743
+ "learning_rate": 1.7622135576923495e-07,
48744
+ "loss": 0.0601,
48745
+ "step": 6412
48746
+ },
48747
+ {
48748
+ "epoch": 0.9462191073404648,
48749
+ "grad_norm": 2.063035726547241,
48750
+ "learning_rate": 1.7525990862624188e-07,
48751
+ "loss": 0.0776,
48752
+ "step": 6413
48753
+ },
48754
+ {
48755
+ "epoch": 0.9463666543710808,
48756
+ "grad_norm": 2.2232484817504883,
48757
+ "learning_rate": 1.7430106825819804e-07,
48758
+ "loss": 0.0306,
48759
+ "step": 6414
48760
+ },
48761
+ {
48762
+ "epoch": 0.9465142014016968,
48763
+ "grad_norm": 1.885646939277649,
48764
+ "learning_rate": 1.7334483491951327e-07,
48765
+ "loss": 0.0365,
48766
+ "step": 6415
48767
+ },
48768
+ {
48769
+ "epoch": 0.9466617484323128,
48770
+ "grad_norm": 2.823607921600342,
48771
+ "learning_rate": 1.7239120886390347e-07,
48772
+ "loss": 0.0833,
48773
+ "step": 6416
48774
+ },
48775
+ {
48776
+ "epoch": 0.9468092954629288,
48777
+ "grad_norm": 3.014352798461914,
48778
+ "learning_rate": 1.7144019034438851e-07,
48779
+ "loss": 0.0913,
48780
+ "step": 6417
48781
+ },
48782
+ {
48783
+ "epoch": 0.9469568424935448,
48784
+ "grad_norm": 2.5719082355499268,
48785
+ "learning_rate": 1.7049177961330432e-07,
48786
+ "loss": 0.0422,
48787
+ "step": 6418
48788
+ },
48789
+ {
48790
+ "epoch": 0.9471043895241609,
48791
+ "grad_norm": 2.7025134563446045,
48792
+ "learning_rate": 1.6954597692228626e-07,
48793
+ "loss": 0.1112,
48794
+ "step": 6419
48795
+ },
48796
+ {
48797
+ "epoch": 0.9472519365547768,
48798
+ "grad_norm": 1.737754464149475,
48799
+ "learning_rate": 1.6860278252228358e-07,
48800
+ "loss": 0.0323,
48801
+ "step": 6420
48802
+ },
48803
+ {
48804
+ "epoch": 0.9472519365547768,
48805
+ "eval_accuracy": 0.9782923299565847,
48806
+ "eval_f1": 0.9629629629629629,
48807
+ "eval_loss": 0.05500521510839462,
48808
+ "eval_precision": 0.9798994974874372,
48809
+ "eval_recall": 0.9466019417475728,
48810
+ "eval_runtime": 48.669,
48811
+ "eval_samples_per_second": 5.979,
48812
+ "eval_steps_per_second": 0.205,
48813
+ "step": 6420
48814
+ },
48815
+ {
48816
+ "epoch": 0.9473994835853928,
48817
+ "grad_norm": 2.6450870037078857,
48818
+ "learning_rate": 1.6766219666355278e-07,
48819
+ "loss": 0.0535,
48820
+ "step": 6421
48821
+ },
48822
+ {
48823
+ "epoch": 0.9475470306160089,
48824
+ "grad_norm": 2.7876486778259277,
48825
+ "learning_rate": 1.6672421959565755e-07,
48826
+ "loss": 0.0498,
48827
+ "step": 6422
48828
+ },
48829
+ {
48830
+ "epoch": 0.9476945776466249,
48831
+ "grad_norm": 4.724228858947754,
48832
+ "learning_rate": 1.6578885156746548e-07,
48833
+ "loss": 0.0836,
48834
+ "step": 6423
48835
+ },
48836
+ {
48837
+ "epoch": 0.9478421246772408,
48838
+ "grad_norm": 1.4518014192581177,
48839
+ "learning_rate": 1.648560928271592e-07,
48840
+ "loss": 0.0291,
48841
+ "step": 6424
48842
+ },
48843
+ {
48844
+ "epoch": 0.9479896717078569,
48845
+ "grad_norm": 2.3440370559692383,
48846
+ "learning_rate": 1.6392594362222515e-07,
48847
+ "loss": 0.0662,
48848
+ "step": 6425
48849
+ },
48850
+ {
48851
+ "epoch": 0.9481372187384729,
48852
+ "grad_norm": 1.8037214279174805,
48853
+ "learning_rate": 1.629984041994559e-07,
48854
+ "loss": 0.0443,
48855
+ "step": 6426
48856
+ },
48857
+ {
48858
+ "epoch": 0.9482847657690889,
48859
+ "grad_norm": 0.7664615511894226,
48860
+ "learning_rate": 1.6207347480495462e-07,
48861
+ "loss": 0.0098,
48862
+ "step": 6427
48863
+ },
48864
+ {
48865
+ "epoch": 0.9484323127997049,
48866
+ "grad_norm": 3.4400036334991455,
48867
+ "learning_rate": 1.6115115568412942e-07,
48868
+ "loss": 0.084,
48869
+ "step": 6428
48870
+ },
48871
+ {
48872
+ "epoch": 0.9485798598303209,
48873
+ "grad_norm": 2.720244884490967,
48874
+ "learning_rate": 1.602314470816968e-07,
48875
+ "loss": 0.0446,
48876
+ "step": 6429
48877
+ },
48878
+ {
48879
+ "epoch": 0.9487274068609369,
48880
+ "grad_norm": 1.642297625541687,
48881
+ "learning_rate": 1.5931434924168377e-07,
48882
+ "loss": 0.0146,
48883
+ "step": 6430
48884
+ },
48885
+ {
48886
+ "epoch": 0.948874953891553,
48887
+ "grad_norm": 2.7700729370117188,
48888
+ "learning_rate": 1.583998624074179e-07,
48889
+ "loss": 0.0629,
48890
+ "step": 6431
48891
+ },
48892
+ {
48893
+ "epoch": 0.9490225009221689,
48894
+ "grad_norm": 4.600953578948975,
48895
+ "learning_rate": 1.5748798682154177e-07,
48896
+ "loss": 0.1125,
48897
+ "step": 6432
48898
+ },
48899
+ {
48900
+ "epoch": 0.9491700479527849,
48901
+ "grad_norm": 1.555990219116211,
48902
+ "learning_rate": 1.5657872272599738e-07,
48903
+ "loss": 0.0382,
48904
+ "step": 6433
48905
+ },
48906
+ {
48907
+ "epoch": 0.949317594983401,
48908
+ "grad_norm": 2.383833408355713,
48909
+ "learning_rate": 1.5567207036203957e-07,
48910
+ "loss": 0.0967,
48911
+ "step": 6434
48912
+ },
48913
+ {
48914
+ "epoch": 0.949465142014017,
48915
+ "grad_norm": 3.316439390182495,
48916
+ "learning_rate": 1.5476802997022812e-07,
48917
+ "loss": 0.1172,
48918
+ "step": 6435
48919
+ },
48920
+ {
48921
+ "epoch": 0.949612689044633,
48922
+ "grad_norm": 4.898162841796875,
48923
+ "learning_rate": 1.538666017904311e-07,
48924
+ "loss": 0.1416,
48925
+ "step": 6436
48926
+ },
48927
+ {
48928
+ "epoch": 0.949760236075249,
48929
+ "grad_norm": 2.7083256244659424,
48930
+ "learning_rate": 1.5296778606181839e-07,
48931
+ "loss": 0.057,
48932
+ "step": 6437
48933
+ },
48934
+ {
48935
+ "epoch": 0.949907783105865,
48936
+ "grad_norm": 2.0090880393981934,
48937
+ "learning_rate": 1.5207158302287472e-07,
48938
+ "loss": 0.0365,
48939
+ "step": 6438
48940
+ },
48941
+ {
48942
+ "epoch": 0.950055330136481,
48943
+ "grad_norm": 2.323190689086914,
48944
+ "learning_rate": 1.5117799291138657e-07,
48945
+ "loss": 0.0455,
48946
+ "step": 6439
48947
+ },
48948
+ {
48949
+ "epoch": 0.9502028771670971,
48950
+ "grad_norm": 2.2493746280670166,
48951
+ "learning_rate": 1.502870159644465e-07,
48952
+ "loss": 0.0496,
48953
+ "step": 6440
48954
+ },
48955
+ {
48956
+ "epoch": 0.9502028771670971,
48957
+ "eval_accuracy": 0.9782923299565847,
48958
+ "eval_f1": 0.9629629629629629,
48959
+ "eval_loss": 0.055333010852336884,
48960
+ "eval_precision": 0.9798994974874372,
48961
+ "eval_recall": 0.9466019417475728,
48962
+ "eval_runtime": 49.0651,
48963
+ "eval_samples_per_second": 5.931,
48964
+ "eval_steps_per_second": 0.204,
48965
+ "step": 6440
48966
+ },
48967
+ {
48968
+ "epoch": 0.950350424197713,
48969
+ "grad_norm": 9.028923034667969,
48970
+ "learning_rate": 1.4939865241845652e-07,
48971
+ "loss": 0.1114,
48972
+ "step": 6441
48973
+ },
48974
+ {
48975
+ "epoch": 0.950497971228329,
48976
+ "grad_norm": 3.4927148818969727,
48977
+ "learning_rate": 1.4851290250912365e-07,
48978
+ "loss": 0.0462,
48979
+ "step": 6442
48980
+ },
48981
+ {
48982
+ "epoch": 0.950645518258945,
48983
+ "grad_norm": 1.9196674823760986,
48984
+ "learning_rate": 1.476297664714621e-07,
48985
+ "loss": 0.0741,
48986
+ "step": 6443
48987
+ },
48988
+ {
48989
+ "epoch": 0.9507930652895611,
48990
+ "grad_norm": 0.9236531257629395,
48991
+ "learning_rate": 1.4674924453979223e-07,
48992
+ "loss": 0.0087,
48993
+ "step": 6444
48994
+ },
48995
+ {
48996
+ "epoch": 0.950940612320177,
48997
+ "grad_norm": 1.8691339492797852,
48998
+ "learning_rate": 1.4587133694774048e-07,
48999
+ "loss": 0.0449,
49000
+ "step": 6445
49001
+ },
49002
+ {
49003
+ "epoch": 0.951088159350793,
49004
+ "grad_norm": 3.3348920345306396,
49005
+ "learning_rate": 1.4499604392824052e-07,
49006
+ "loss": 0.0528,
49007
+ "step": 6446
49008
+ },
49009
+ {
49010
+ "epoch": 0.9512357063814091,
49011
+ "grad_norm": 3.1573545932769775,
49012
+ "learning_rate": 1.4412336571353103e-07,
49013
+ "loss": 0.1077,
49014
+ "step": 6447
49015
+ },
49016
+ {
49017
+ "epoch": 0.9513832534120251,
49018
+ "grad_norm": 1.6667567491531372,
49019
+ "learning_rate": 1.4325330253515902e-07,
49020
+ "loss": 0.0289,
49021
+ "step": 6448
49022
+ },
49023
+ {
49024
+ "epoch": 0.951530800442641,
49025
+ "grad_norm": 2.5860416889190674,
49026
+ "learning_rate": 1.4238585462397536e-07,
49027
+ "loss": 0.076,
49028
+ "step": 6449
49029
+ },
49030
+ {
49031
+ "epoch": 0.9516783474732571,
49032
+ "grad_norm": 2.118489980697632,
49033
+ "learning_rate": 1.4152102221013708e-07,
49034
+ "loss": 0.0561,
49035
+ "step": 6450
49036
+ },
49037
+ {
49038
+ "epoch": 0.9518258945038731,
49039
+ "grad_norm": 1.936941385269165,
49040
+ "learning_rate": 1.4065880552310952e-07,
49041
+ "loss": 0.0525,
49042
+ "step": 6451
49043
+ },
49044
+ {
49045
+ "epoch": 0.9519734415344892,
49046
+ "grad_norm": 1.3135408163070679,
49047
+ "learning_rate": 1.3979920479166187e-07,
49048
+ "loss": 0.0513,
49049
+ "step": 6452
49050
+ },
49051
+ {
49052
+ "epoch": 0.9521209885651051,
49053
+ "grad_norm": 2.240351915359497,
49054
+ "learning_rate": 1.389422202438706e-07,
49055
+ "loss": 0.0541,
49056
+ "step": 6453
49057
+ },
49058
+ {
49059
+ "epoch": 0.9522685355957211,
49060
+ "grad_norm": 4.377660751342773,
49061
+ "learning_rate": 1.3808785210711606e-07,
49062
+ "loss": 0.0642,
49063
+ "step": 6454
49064
+ },
49065
+ {
49066
+ "epoch": 0.9524160826263371,
49067
+ "grad_norm": 4.16725492477417,
49068
+ "learning_rate": 1.3723610060808801e-07,
49069
+ "loss": 0.0707,
49070
+ "step": 6455
49071
+ },
49072
+ {
49073
+ "epoch": 0.9525636296569532,
49074
+ "grad_norm": 1.5783486366271973,
49075
+ "learning_rate": 1.3638696597277678e-07,
49076
+ "loss": 0.0347,
49077
+ "step": 6456
49078
+ },
49079
+ {
49080
+ "epoch": 0.9527111766875692,
49081
+ "grad_norm": 0.9976248741149902,
49082
+ "learning_rate": 1.3554044842648217e-07,
49083
+ "loss": 0.0198,
49084
+ "step": 6457
49085
+ },
49086
+ {
49087
+ "epoch": 0.9528587237181851,
49088
+ "grad_norm": 2.819964647293091,
49089
+ "learning_rate": 1.3469654819381118e-07,
49090
+ "loss": 0.0753,
49091
+ "step": 6458
49092
+ },
49093
+ {
49094
+ "epoch": 0.9530062707488012,
49095
+ "grad_norm": 0.9899864196777344,
49096
+ "learning_rate": 1.3385526549867022e-07,
49097
+ "loss": 0.013,
49098
+ "step": 6459
49099
+ },
49100
+ {
49101
+ "epoch": 0.9531538177794172,
49102
+ "grad_norm": 2.6590030193328857,
49103
+ "learning_rate": 1.3301660056427745e-07,
49104
+ "loss": 0.0809,
49105
+ "step": 6460
49106
+ },
49107
+ {
49108
+ "epoch": 0.9531538177794172,
49109
+ "eval_accuracy": 0.9782923299565847,
49110
+ "eval_f1": 0.9629629629629629,
49111
+ "eval_loss": 0.05532016232609749,
49112
+ "eval_precision": 0.9798994974874372,
49113
+ "eval_recall": 0.9466019417475728,
49114
+ "eval_runtime": 48.7081,
49115
+ "eval_samples_per_second": 5.974,
49116
+ "eval_steps_per_second": 0.205,
49117
+ "step": 6460
49118
+ },
49119
+ {
49120
+ "epoch": 0.9533013648100332,
49121
+ "grad_norm": 2.229666233062744,
49122
+ "learning_rate": 1.3218055361315262e-07,
49123
+ "loss": 0.0684,
49124
+ "step": 6461
49125
+ },
49126
+ {
49127
+ "epoch": 0.9534489118406492,
49128
+ "grad_norm": 2.2530157566070557,
49129
+ "learning_rate": 1.3134712486712165e-07,
49130
+ "loss": 0.0549,
49131
+ "step": 6462
49132
+ },
49133
+ {
49134
+ "epoch": 0.9535964588712652,
49135
+ "grad_norm": 2.005972146987915,
49136
+ "learning_rate": 1.3051631454731873e-07,
49137
+ "loss": 0.0462,
49138
+ "step": 6463
49139
+ },
49140
+ {
49141
+ "epoch": 0.9537440059018812,
49142
+ "grad_norm": 3.3792154788970947,
49143
+ "learning_rate": 1.2968812287417753e-07,
49144
+ "loss": 0.1276,
49145
+ "step": 6464
49146
+ },
49147
+ {
49148
+ "epoch": 0.9538915529324973,
49149
+ "grad_norm": 3.6405813694000244,
49150
+ "learning_rate": 1.288625500674412e-07,
49151
+ "loss": 0.0681,
49152
+ "step": 6465
49153
+ },
49154
+ {
49155
+ "epoch": 0.9540390999631132,
49156
+ "grad_norm": 1.7894163131713867,
49157
+ "learning_rate": 1.2803959634615782e-07,
49158
+ "loss": 0.0496,
49159
+ "step": 6466
49160
+ },
49161
+ {
49162
+ "epoch": 0.9541866469937292,
49163
+ "grad_norm": 0.9998230934143066,
49164
+ "learning_rate": 1.272192619286805e-07,
49165
+ "loss": 0.0204,
49166
+ "step": 6467
49167
+ },
49168
+ {
49169
+ "epoch": 0.9543341940243453,
49170
+ "grad_norm": 2.581663131713867,
49171
+ "learning_rate": 1.2640154703266405e-07,
49172
+ "loss": 0.0972,
49173
+ "step": 6468
49174
+ },
49175
+ {
49176
+ "epoch": 0.9544817410549613,
49177
+ "grad_norm": 1.5313342809677124,
49178
+ "learning_rate": 1.2558645187507267e-07,
49179
+ "loss": 0.0412,
49180
+ "step": 6469
49181
+ },
49182
+ {
49183
+ "epoch": 0.9546292880855772,
49184
+ "grad_norm": 2.740792751312256,
49185
+ "learning_rate": 1.247739766721734e-07,
49186
+ "loss": 0.0527,
49187
+ "step": 6470
49188
+ },
49189
+ {
49190
+ "epoch": 0.9547768351161933,
49191
+ "grad_norm": 1.9213531017303467,
49192
+ "learning_rate": 1.2396412163953709e-07,
49193
+ "loss": 0.0462,
49194
+ "step": 6471
49195
+ },
49196
+ {
49197
+ "epoch": 0.9549243821468093,
49198
+ "grad_norm": 2.4002788066864014,
49199
+ "learning_rate": 1.2315688699204298e-07,
49200
+ "loss": 0.0869,
49201
+ "step": 6472
49202
+ },
49203
+ {
49204
+ "epoch": 0.9550719291774253,
49205
+ "grad_norm": 4.3059186935424805,
49206
+ "learning_rate": 1.2235227294387085e-07,
49207
+ "loss": 0.0794,
49208
+ "step": 6473
49209
+ },
49210
+ {
49211
+ "epoch": 0.9552194762080413,
49212
+ "grad_norm": 2.3202598094940186,
49213
+ "learning_rate": 1.2155027970850776e-07,
49214
+ "loss": 0.0262,
49215
+ "step": 6474
49216
+ },
49217
+ {
49218
+ "epoch": 0.9553670232386573,
49219
+ "grad_norm": 2.168534517288208,
49220
+ "learning_rate": 1.2075090749874451e-07,
49221
+ "loss": 0.0299,
49222
+ "step": 6475
49223
+ },
49224
+ {
49225
+ "epoch": 0.9555145702692733,
49226
+ "grad_norm": 4.975533485412598,
49227
+ "learning_rate": 1.1995415652667598e-07,
49228
+ "loss": 0.1115,
49229
+ "step": 6476
49230
+ },
49231
+ {
49232
+ "epoch": 0.9556621172998894,
49233
+ "grad_norm": 2.08109450340271,
49234
+ "learning_rate": 1.1916002700370411e-07,
49235
+ "loss": 0.0311,
49236
+ "step": 6477
49237
+ },
49238
+ {
49239
+ "epoch": 0.9558096643305053,
49240
+ "grad_norm": 3.003537654876709,
49241
+ "learning_rate": 1.183685191405315e-07,
49242
+ "loss": 0.033,
49243
+ "step": 6478
49244
+ },
49245
+ {
49246
+ "epoch": 0.9559572113611213,
49247
+ "grad_norm": 1.9444817304611206,
49248
+ "learning_rate": 1.1757963314716791e-07,
49249
+ "loss": 0.051,
49250
+ "step": 6479
49251
+ },
49252
+ {
49253
+ "epoch": 0.9561047583917374,
49254
+ "grad_norm": 2.8040788173675537,
49255
+ "learning_rate": 1.1679336923292594e-07,
49256
+ "loss": 0.0706,
49257
+ "step": 6480
49258
+ },
49259
+ {
49260
+ "epoch": 0.9561047583917374,
49261
+ "eval_accuracy": 0.9797395079594791,
49262
+ "eval_f1": 0.9653465346534653,
49263
+ "eval_loss": 0.05492851138114929,
49264
+ "eval_precision": 0.9848484848484849,
49265
+ "eval_recall": 0.9466019417475728,
49266
+ "eval_runtime": 48.3996,
49267
+ "eval_samples_per_second": 6.012,
49268
+ "eval_steps_per_second": 0.207,
49269
+ "step": 6480
49270
+ },
49271
+ {
49272
+ "epoch": 0.9562523054223534,
49273
+ "grad_norm": 5.251060485839844,
49274
+ "learning_rate": 1.1600972760642426e-07,
49275
+ "loss": 0.0943,
49276
+ "step": 6481
49277
+ },
49278
+ {
49279
+ "epoch": 0.9563998524529694,
49280
+ "grad_norm": 2.8643219470977783,
49281
+ "learning_rate": 1.1522870847558432e-07,
49282
+ "loss": 0.0305,
49283
+ "step": 6482
49284
+ },
49285
+ {
49286
+ "epoch": 0.9565473994835854,
49287
+ "grad_norm": 3.2723355293273926,
49288
+ "learning_rate": 1.1445031204763146e-07,
49289
+ "loss": 0.0637,
49290
+ "step": 6483
49291
+ },
49292
+ {
49293
+ "epoch": 0.9566949465142014,
49294
+ "grad_norm": 3.418752908706665,
49295
+ "learning_rate": 1.1367453852909493e-07,
49296
+ "loss": 0.0994,
49297
+ "step": 6484
49298
+ },
49299
+ {
49300
+ "epoch": 0.9568424935448174,
49301
+ "grad_norm": 0.9884876608848572,
49302
+ "learning_rate": 1.1290138812581009e-07,
49303
+ "loss": 0.0218,
49304
+ "step": 6485
49305
+ },
49306
+ {
49307
+ "epoch": 0.9569900405754335,
49308
+ "grad_norm": 2.2045938968658447,
49309
+ "learning_rate": 1.1213086104291615e-07,
49310
+ "loss": 0.032,
49311
+ "step": 6486
49312
+ },
49313
+ {
49314
+ "epoch": 0.9571375876060494,
49315
+ "grad_norm": 2.34030818939209,
49316
+ "learning_rate": 1.1136295748485293e-07,
49317
+ "loss": 0.0321,
49318
+ "step": 6487
49319
+ },
49320
+ {
49321
+ "epoch": 0.9572851346366654,
49322
+ "grad_norm": 2.480587959289551,
49323
+ "learning_rate": 1.1059767765536856e-07,
49324
+ "loss": 0.0331,
49325
+ "step": 6488
49326
+ },
49327
+ {
49328
+ "epoch": 0.9574326816672815,
49329
+ "grad_norm": 2.711378574371338,
49330
+ "learning_rate": 1.098350217575117e-07,
49331
+ "loss": 0.0592,
49332
+ "step": 6489
49333
+ },
49334
+ {
49335
+ "epoch": 0.9575802286978975,
49336
+ "grad_norm": 1.1604716777801514,
49337
+ "learning_rate": 1.0907498999363609e-07,
49338
+ "loss": 0.015,
49339
+ "step": 6490
49340
+ },
49341
+ {
49342
+ "epoch": 0.9577277757285134,
49343
+ "grad_norm": 1.9541465044021606,
49344
+ "learning_rate": 1.0831758256539925e-07,
49345
+ "loss": 0.043,
49346
+ "step": 6491
49347
+ },
49348
+ {
49349
+ "epoch": 0.9578753227591295,
49350
+ "grad_norm": 6.780413627624512,
49351
+ "learning_rate": 1.075627996737627e-07,
49352
+ "loss": 0.0828,
49353
+ "step": 6492
49354
+ },
49355
+ {
49356
+ "epoch": 0.9580228697897455,
49357
+ "grad_norm": 0.8017694354057312,
49358
+ "learning_rate": 1.0681064151899068e-07,
49359
+ "loss": 0.0084,
49360
+ "step": 6493
49361
+ },
49362
+ {
49363
+ "epoch": 0.9581704168203615,
49364
+ "grad_norm": 1.1130073070526123,
49365
+ "learning_rate": 1.0606110830065131e-07,
49366
+ "loss": 0.011,
49367
+ "step": 6494
49368
+ },
49369
+ {
49370
+ "epoch": 0.9583179638509775,
49371
+ "grad_norm": 2.2371890544891357,
49372
+ "learning_rate": 1.0531420021761662e-07,
49373
+ "loss": 0.0579,
49374
+ "step": 6495
49375
+ },
49376
+ {
49377
+ "epoch": 0.9584655108815935,
49378
+ "grad_norm": 2.2667269706726074,
49379
+ "learning_rate": 1.0456991746806366e-07,
49380
+ "loss": 0.03,
49381
+ "step": 6496
49382
+ },
49383
+ {
49384
+ "epoch": 0.9586130579122095,
49385
+ "grad_norm": 0.9488065242767334,
49386
+ "learning_rate": 1.0382826024946891e-07,
49387
+ "loss": 0.0154,
49388
+ "step": 6497
49389
+ },
49390
+ {
49391
+ "epoch": 0.9587606049428256,
49392
+ "grad_norm": 2.35026478767395,
49393
+ "learning_rate": 1.0308922875861493e-07,
49394
+ "loss": 0.0377,
49395
+ "step": 6498
49396
+ },
49397
+ {
49398
+ "epoch": 0.9589081519734415,
49399
+ "grad_norm": 1.9522687196731567,
49400
+ "learning_rate": 1.0235282319158823e-07,
49401
+ "loss": 0.0769,
49402
+ "step": 6499
49403
+ },
49404
+ {
49405
+ "epoch": 0.9590556990040575,
49406
+ "grad_norm": 4.399130344390869,
49407
+ "learning_rate": 1.0161904374377696e-07,
49408
+ "loss": 0.0716,
49409
+ "step": 6500
49410
+ },
49411
+ {
49412
+ "epoch": 0.9590556990040575,
49413
+ "eval_accuracy": 0.9782923299565847,
49414
+ "eval_f1": 0.9629629629629629,
49415
+ "eval_loss": 0.05545896664261818,
49416
+ "eval_precision": 0.9798994974874372,
49417
+ "eval_recall": 0.9466019417475728,
49418
+ "eval_runtime": 50.2134,
49419
+ "eval_samples_per_second": 5.795,
49420
+ "eval_steps_per_second": 0.199,
49421
+ "step": 6500
49422
  }
49423
  ],
49424
  "logging_steps": 1,
 
49438
  "attributes": {}
49439
  }
49440
  },
49441
+ "total_flos": 2.0029445559969382e+18,
49442
  "train_batch_size": 8,
49443
  "trial_name": null,
49444
  "trial_params": null