robertou2 commited on
Commit
e9ad8de
·
verified ·
1 Parent(s): 176748c

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5f2a4574a1cf3760f6f91ba60977d9722c117968695c26c25c621af59a4e41c
3
  size 369134112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b98ba6ac806c03c0409f8d783327298917bd9290b863f004e8c9f4949a49cab
3
  size 369134112
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca865aab08124c6a7502014773b63e03e2db1b57e13d98db1ce833c9c645aa41
3
  size 738417355
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7750a024ffc36f0b2b3d75b6d23a4abc45828022cd9fc314ed0ca873e7afc478
3
  size 738417355
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de69a2834426ff9ef8199d077e00892579278af31d8969d77f98235b5cfc010a
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:580cf0c8deda9a5cdf877c15cfecec4f5a37dd72edd01f252f4b56d158b7550a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2b8b314158649523e5cd4cc114f7b492743419645cb17f66610bf7539ffeb77
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.0,
6
  "eval_steps": 500,
7
- "global_step": 99,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -658,346 +658,6 @@
658
  "mean_token_accuracy": 0.7686784416437149,
659
  "num_tokens": 116981.0,
660
  "step": 65
661
- },
662
- {
663
- "entropy": 1.2168401956558228,
664
- "epoch": 2.0,
665
- "grad_norm": 18.375,
666
- "learning_rate": 1.0578672383836437e-05,
667
- "loss": 1.1399,
668
- "mean_token_accuracy": 0.6772964239120484,
669
- "num_tokens": 118284.0,
670
- "step": 66
671
- },
672
- {
673
- "entropy": 1.0973209738731384,
674
- "epoch": 2.0306513409961684,
675
- "grad_norm": 7.8125,
676
- "learning_rate": 1.0045814070672498e-05,
677
- "loss": 0.3245,
678
- "mean_token_accuracy": 0.9032263904809952,
679
- "num_tokens": 119663.0,
680
- "step": 67
681
- },
682
- {
683
- "entropy": 1.053741380572319,
684
- "epoch": 2.0613026819923372,
685
- "grad_norm": 6.0,
686
- "learning_rate": 9.519884634504074e-06,
687
- "loss": 0.2808,
688
- "mean_token_accuracy": 0.9356953203678131,
689
- "num_tokens": 121476.0,
690
- "step": 68
691
- },
692
- {
693
- "entropy": 0.9946238845586777,
694
- "epoch": 2.0919540229885056,
695
- "grad_norm": 5.375,
696
- "learning_rate": 9.001619635203889e-06,
697
- "loss": 0.2809,
698
- "mean_token_accuracy": 0.9175683632493019,
699
- "num_tokens": 123792.0,
700
- "step": 69
701
- },
702
- {
703
- "entropy": 1.015475258231163,
704
- "epoch": 2.1226053639846745,
705
- "grad_norm": 6.65625,
706
- "learning_rate": 8.491743913236629e-06,
707
- "loss": 0.2802,
708
- "mean_token_accuracy": 0.9311554208397865,
709
- "num_tokens": 125329.0,
710
- "step": 70
711
- },
712
- {
713
- "entropy": 0.9921716600656509,
714
- "epoch": 2.153256704980843,
715
- "grad_norm": 6.78125,
716
- "learning_rate": 7.99097057590407e-06,
717
- "loss": 0.2807,
718
- "mean_token_accuracy": 0.9192091822624207,
719
- "num_tokens": 126654.0,
720
- "step": 71
721
- },
722
- {
723
- "entropy": 0.8778632581233978,
724
- "epoch": 2.1839080459770113,
725
- "grad_norm": 6.09375,
726
- "learning_rate": 7.500000000000004e-06,
727
- "loss": 0.2776,
728
- "mean_token_accuracy": 0.9309542253613472,
729
- "num_tokens": 128629.0,
730
- "step": 72
731
- },
732
- {
733
- "entropy": 0.953188918530941,
734
- "epoch": 2.21455938697318,
735
- "grad_norm": 8.6875,
736
- "learning_rate": 7.019518852269953e-06,
737
- "loss": 0.4596,
738
- "mean_token_accuracy": 0.8634384647011757,
739
- "num_tokens": 130344.0,
740
- "step": 73
741
- },
742
- {
743
- "entropy": 0.8518025800585747,
744
- "epoch": 2.2452107279693485,
745
- "grad_norm": 7.46875,
746
- "learning_rate": 6.55019912904567e-06,
747
- "loss": 0.3006,
748
- "mean_token_accuracy": 0.9241785854101181,
749
- "num_tokens": 132152.0,
750
- "step": 74
751
- },
752
- {
753
- "entropy": 0.8467591479420662,
754
- "epoch": 2.2758620689655173,
755
- "grad_norm": 6.40625,
756
- "learning_rate": 6.092697216397478e-06,
757
- "loss": 0.2682,
758
- "mean_token_accuracy": 0.9179906323552132,
759
- "num_tokens": 134144.0,
760
- "step": 75
761
- },
762
- {
763
- "entropy": 0.7837551906704903,
764
- "epoch": 2.3065134099616857,
765
- "grad_norm": 7.25,
766
- "learning_rate": 5.647652972118998e-06,
767
- "loss": 0.3422,
768
- "mean_token_accuracy": 0.8964523077011108,
769
- "num_tokens": 136715.0,
770
- "step": 76
771
- },
772
- {
773
- "entropy": 0.7817510291934013,
774
- "epoch": 2.3371647509578546,
775
- "grad_norm": 7.25,
776
- "learning_rate": 5.2156888308281875e-06,
777
- "loss": 0.2678,
778
- "mean_token_accuracy": 0.9292137995362282,
779
- "num_tokens": 138907.0,
780
- "step": 77
781
- },
782
- {
783
- "entropy": 0.7645558379590511,
784
- "epoch": 2.367816091954023,
785
- "grad_norm": 7.6875,
786
- "learning_rate": 4.797408933436207e-06,
787
- "loss": 0.2069,
788
- "mean_token_accuracy": 0.9325998574495316,
789
- "num_tokens": 140536.0,
790
- "step": 78
791
- },
792
- {
793
- "entropy": 0.756471686065197,
794
- "epoch": 2.3984674329501914,
795
- "grad_norm": 8.6875,
796
- "learning_rate": 4.393398282201788e-06,
797
- "loss": 0.2288,
798
- "mean_token_accuracy": 0.924439363181591,
799
- "num_tokens": 142205.0,
800
- "step": 79
801
- },
802
- {
803
- "entropy": 0.7203860953450203,
804
- "epoch": 2.42911877394636,
805
- "grad_norm": 8.75,
806
- "learning_rate": 4.004221922552608e-06,
807
- "loss": 0.3023,
808
- "mean_token_accuracy": 0.9196523949503899,
809
- "num_tokens": 143937.0,
810
- "step": 80
811
- },
812
- {
813
- "entropy": 0.7062718719244003,
814
- "epoch": 2.4597701149425286,
815
- "grad_norm": 8.3125,
816
- "learning_rate": 3.630424152818203e-06,
817
- "loss": 0.242,
818
- "mean_token_accuracy": 0.9289174377918243,
819
- "num_tokens": 145867.0,
820
- "step": 81
821
- },
822
- {
823
- "entropy": 0.7174801900982857,
824
- "epoch": 2.4904214559386975,
825
- "grad_norm": 10.0625,
826
- "learning_rate": 3.272527762979553e-06,
827
- "loss": 0.3277,
828
- "mean_token_accuracy": 0.9081463739275932,
829
- "num_tokens": 147522.0,
830
- "step": 82
831
- },
832
- {
833
- "entropy": 0.7576407790184021,
834
- "epoch": 2.521072796934866,
835
- "grad_norm": 10.5,
836
- "learning_rate": 2.931033303499975e-06,
837
- "loss": 0.2869,
838
- "mean_token_accuracy": 0.9234072640538216,
839
- "num_tokens": 149154.0,
840
- "step": 83
841
- },
842
- {
843
- "entropy": 0.6603295132517815,
844
- "epoch": 2.5517241379310347,
845
- "grad_norm": 8.5,
846
- "learning_rate": 2.60641838526008e-06,
847
- "loss": 0.2954,
848
- "mean_token_accuracy": 0.9192768260836601,
849
- "num_tokens": 151443.0,
850
- "step": 84
851
- },
852
- {
853
- "entropy": 0.7209493666887283,
854
- "epoch": 2.582375478927203,
855
- "grad_norm": 7.625,
856
- "learning_rate": 2.2991370115757383e-06,
857
- "loss": 0.2553,
858
- "mean_token_accuracy": 0.9288515150547028,
859
- "num_tokens": 153346.0,
860
- "step": 85
861
- },
862
- {
863
- "entropy": 0.7502265051007271,
864
- "epoch": 2.6130268199233715,
865
- "grad_norm": 10.0625,
866
- "learning_rate": 2.0096189432334194e-06,
867
- "loss": 0.2759,
868
- "mean_token_accuracy": 0.9101333618164062,
869
- "num_tokens": 155041.0,
870
- "step": 86
871
- },
872
- {
873
- "entropy": 0.6479271687567234,
874
- "epoch": 2.6436781609195403,
875
- "grad_norm": 7.65625,
876
- "learning_rate": 1.7382690974308551e-06,
877
- "loss": 0.1765,
878
- "mean_token_accuracy": 0.9528548792004585,
879
- "num_tokens": 156508.0,
880
- "step": 87
881
- },
882
- {
883
- "entropy": 0.686508409678936,
884
- "epoch": 2.6743295019157087,
885
- "grad_norm": 6.5625,
886
- "learning_rate": 1.4854669814637145e-06,
887
- "loss": 0.1907,
888
- "mean_token_accuracy": 0.9471124485135078,
889
- "num_tokens": 158506.0,
890
- "step": 88
891
- },
892
- {
893
- "entropy": 0.6940162889659405,
894
- "epoch": 2.704980842911877,
895
- "grad_norm": 7.0625,
896
- "learning_rate": 1.2515661619503572e-06,
897
- "loss": 0.2139,
898
- "mean_token_accuracy": 0.9348281025886536,
899
- "num_tokens": 160511.0,
900
- "step": 89
901
- },
902
- {
903
- "entropy": 0.7100252062082291,
904
- "epoch": 2.735632183908046,
905
- "grad_norm": 9.0625,
906
- "learning_rate": 1.036893770336938e-06,
907
- "loss": 0.2846,
908
- "mean_token_accuracy": 0.9120082557201385,
909
- "num_tokens": 162548.0,
910
- "step": 90
911
- },
912
- {
913
- "entropy": 0.689895510673523,
914
- "epoch": 2.766283524904215,
915
- "grad_norm": 7.59375,
916
- "learning_rate": 8.417500453744864e-07,
917
- "loss": 0.2794,
918
- "mean_token_accuracy": 0.9187788665294647,
919
- "num_tokens": 164874.0,
920
- "step": 91
921
- },
922
- {
923
- "entropy": 0.6664801873266697,
924
- "epoch": 2.796934865900383,
925
- "grad_norm": 7.96875,
926
- "learning_rate": 6.664079132078881e-07,
927
- "loss": 0.199,
928
- "mean_token_accuracy": 0.94305020570755,
929
- "num_tokens": 166614.0,
930
- "step": 92
931
- },
932
- {
933
- "entropy": 0.7356143966317177,
934
- "epoch": 2.8275862068965516,
935
- "grad_norm": 29.25,
936
- "learning_rate": 5.11112605663977e-07,
937
- "loss": 0.3566,
938
- "mean_token_accuracy": 0.8869450762867928,
939
- "num_tokens": 168220.0,
940
- "step": 93
941
- },
942
- {
943
- "entropy": 0.7260653525590897,
944
- "epoch": 2.8582375478927204,
945
- "grad_norm": 12.0625,
946
- "learning_rate": 3.760813172726457e-07,
947
- "loss": 0.2395,
948
- "mean_token_accuracy": 0.9347701147198677,
949
- "num_tokens": 169540.0,
950
- "step": 94
951
- },
952
- {
953
- "entropy": 0.6620675958693027,
954
- "epoch": 2.888888888888889,
955
- "grad_norm": 7.3125,
956
- "learning_rate": 2.6150290150067593e-07,
957
- "loss": 0.2358,
958
- "mean_token_accuracy": 0.9333521574735641,
959
- "num_tokens": 171709.0,
960
- "step": 95
961
- },
962
- {
963
- "entropy": 0.6657432429492474,
964
- "epoch": 2.9195402298850572,
965
- "grad_norm": 9.375,
966
- "learning_rate": 1.6753760662307217e-07,
967
- "loss": 0.2499,
968
- "mean_token_accuracy": 0.9248412474989891,
969
- "num_tokens": 173432.0,
970
- "step": 96
971
- },
972
- {
973
- "entropy": 0.6610175892710686,
974
- "epoch": 2.950191570881226,
975
- "grad_norm": 10.3125,
976
- "learning_rate": 9.431685160136094e-08,
977
- "loss": 0.2274,
978
- "mean_token_accuracy": 0.9352346211671829,
979
- "num_tokens": 174962.0,
980
- "step": 97
981
- },
982
- {
983
- "entropy": 0.6855079308152199,
984
- "epoch": 2.9808429118773945,
985
- "grad_norm": 9.4375,
986
- "learning_rate": 4.194304228229806e-08,
987
- "loss": 0.2806,
988
- "mean_token_accuracy": 0.9201195910573006,
989
- "num_tokens": 176611.0,
990
- "step": 98
991
- },
992
- {
993
- "entropy": 0.6942157626152039,
994
- "epoch": 3.0,
995
- "grad_norm": 11.0625,
996
- "learning_rate": 1.0489428174020877e-08,
997
- "loss": 0.1556,
998
- "mean_token_accuracy": 0.9565272331237793,
999
- "num_tokens": 177426.0,
1000
- "step": 99
1001
  }
1002
  ],
1003
  "logging_steps": 1,
@@ -1012,12 +672,12 @@
1012
  "should_evaluate": false,
1013
  "should_log": false,
1014
  "should_save": true,
1015
- "should_training_stop": true
1016
  },
1017
  "attributes": {}
1018
  }
1019
  },
1020
- "total_flos": 4854658109841408.0,
1021
  "train_batch_size": 2,
1022
  "trial_name": null,
1023
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.9808429118773945,
6
  "eval_steps": 500,
7
+ "global_step": 65,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
658
  "mean_token_accuracy": 0.7686784416437149,
659
  "num_tokens": 116981.0,
660
  "step": 65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  }
662
  ],
663
  "logging_steps": 1,
 
672
  "should_evaluate": false,
673
  "should_log": false,
674
  "should_save": true,
675
+ "should_training_stop": false
676
  },
677
  "attributes": {}
678
  }
679
  },
680
+ "total_flos": 3202052021059584.0,
681
  "train_batch_size": 2,
682
  "trial_name": null,
683
  "trial_params": null