madeofajala commited on
Commit
6cc56e1
·
verified ·
1 Parent(s): 8ddf638

Training in progress, step 2425

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16867459c238ecc7a5d407fbde70d292d80c5c02caba34a26ed29ea260f4db5d
3
  size 108113968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab0efb69d2dd4ccd18add7c8d575a842a3d074ef54403c99c6db819af71c77a1
3
  size 108113968
last-checkpoint/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "gate_proj",
33
- "v_proj",
34
  "k_proj",
 
35
  "o_proj",
36
- "down_proj",
37
- "q_proj",
38
- "up_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "down_proj",
 
33
  "k_proj",
34
+ "up_proj",
35
  "o_proj",
36
+ "gate_proj",
37
+ "v_proj",
38
+ "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16867459c238ecc7a5d407fbde70d292d80c5c02caba34a26ed29ea260f4db5d
3
  size 108113968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23cc74daf6c70e4089aff09333b0706b30bba28a0cf6991c49bb172b7614c70a
3
  size 108113968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8efc06c38fa74258b408de16da02e778df592df2079aa509df3315fa14619e4
3
  size 57081771
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45e2021b79e47156a888c5a9b65619596c377b821c0a36373339d1a5a3dfdb5b
3
  size 57081771
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14dad153ace9f17b878a1326b68c8639a626f910f5e0ed1e9324c8e1af846b2d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb8c24123d0a6abb40712c04ec45e32a580173995d543bee32e57aefd8bd098c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6878761822871883,
6
  "eval_steps": 300,
7
- "global_step": 2400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -498,476 +498,6 @@
498
  "mean_token_accuracy": 0.9329368638992309,
499
  "num_tokens": 122190.0,
500
  "step": 1225
501
- },
502
- {
503
- "entropy": 0.2075369682908058,
504
- "epoch": 0.3582688449412439,
505
- "grad_norm": 0.39205244183540344,
506
- "learning_rate": 0.0002,
507
- "loss": 0.2030281639099121,
508
- "mean_token_accuracy": 0.9333784127235413,
509
- "num_tokens": 40591.0,
510
- "step": 1250
511
- },
512
- {
513
- "entropy": 0.21607059925794603,
514
- "epoch": 0.3654342218400688,
515
- "grad_norm": 0.24999791383743286,
516
- "learning_rate": 0.0002,
517
- "loss": 0.21144355773925783,
518
- "mean_token_accuracy": 0.93144282579422,
519
- "num_tokens": 81873.0,
520
- "step": 1275
521
- },
522
- {
523
- "entropy": 0.2062842446565628,
524
- "epoch": 0.37259959873889364,
525
- "grad_norm": 0.36865198612213135,
526
- "learning_rate": 0.0002,
527
- "loss": 0.20512981414794923,
528
- "mean_token_accuracy": 0.9319202589988709,
529
- "num_tokens": 122819.0,
530
- "step": 1300
531
- },
532
- {
533
- "entropy": 0.20617701470851899,
534
- "epoch": 0.37976497563771855,
535
- "grad_norm": 0.2900436222553253,
536
- "learning_rate": 0.0002,
537
- "loss": 0.20203329086303712,
538
- "mean_token_accuracy": 0.9338054943084717,
539
- "num_tokens": 164426.0,
540
- "step": 1325
541
- },
542
- {
543
- "entropy": 0.20276340633630752,
544
- "epoch": 0.3869303525365434,
545
- "grad_norm": 0.3424394130706787,
546
- "learning_rate": 0.0002,
547
- "loss": 0.19647628784179688,
548
- "mean_token_accuracy": 0.9352651834487915,
549
- "num_tokens": 204693.0,
550
- "step": 1350
551
- },
552
- {
553
- "entropy": 0.19353608846664427,
554
- "epoch": 0.3940957294353683,
555
- "grad_norm": 0.2800115644931793,
556
- "learning_rate": 0.0002,
557
- "loss": 0.1909392547607422,
558
- "mean_token_accuracy": 0.936229157447815,
559
- "num_tokens": 245202.0,
560
- "step": 1375
561
- },
562
- {
563
- "entropy": 0.20765207797288895,
564
- "epoch": 0.40126110633419315,
565
- "grad_norm": 0.29286009073257446,
566
- "learning_rate": 0.0002,
567
- "loss": 0.20011020660400392,
568
- "mean_token_accuracy": 0.9337928628921509,
569
- "num_tokens": 286944.0,
570
- "step": 1400
571
- },
572
- {
573
- "entropy": 0.2191137021780014,
574
- "epoch": 0.40842648323301806,
575
- "grad_norm": 0.26620274782180786,
576
- "learning_rate": 0.0002,
577
- "loss": 0.21411985397338867,
578
- "mean_token_accuracy": 0.9297138857841492,
579
- "num_tokens": 329201.0,
580
- "step": 1425
581
- },
582
- {
583
- "entropy": 0.21264588654041292,
584
- "epoch": 0.41559186013184296,
585
- "grad_norm": 0.38385578989982605,
586
- "learning_rate": 0.0002,
587
- "loss": 0.208143367767334,
588
- "mean_token_accuracy": 0.9316162085533142,
589
- "num_tokens": 371891.0,
590
- "step": 1450
591
- },
592
- {
593
- "entropy": 0.2007578819990158,
594
- "epoch": 0.4227572370306678,
595
- "grad_norm": 0.3052174746990204,
596
- "learning_rate": 0.0002,
597
- "loss": 0.19854948043823242,
598
- "mean_token_accuracy": 0.9345853734016418,
599
- "num_tokens": 412534.0,
600
- "step": 1475
601
- },
602
- {
603
- "entropy": 0.20256735682487487,
604
- "epoch": 0.4299226139294927,
605
- "grad_norm": 0.2761523723602295,
606
- "learning_rate": 0.0002,
607
- "loss": 0.19539085388183594,
608
- "mean_token_accuracy": 0.9367341923713685,
609
- "num_tokens": 452855.0,
610
- "step": 1500
611
- },
612
- {
613
- "entropy": 0.19164222806692124,
614
- "epoch": 0.43708799082831756,
615
- "grad_norm": 0.3495299220085144,
616
- "learning_rate": 0.0002,
617
- "loss": 0.1890553665161133,
618
- "mean_token_accuracy": 0.9373278784751892,
619
- "num_tokens": 493202.0,
620
- "step": 1525
621
- },
622
- {
623
- "entropy": 0.20341520249843598,
624
- "epoch": 0.44425336772714247,
625
- "grad_norm": 0.3206697702407837,
626
- "learning_rate": 0.0002,
627
- "loss": 0.20173826217651367,
628
- "mean_token_accuracy": 0.9332520008087158,
629
- "num_tokens": 534946.0,
630
- "step": 1550
631
- },
632
- {
633
- "entropy": 0.20512860178947448,
634
- "epoch": 0.4514187446259673,
635
- "grad_norm": 0.369289755821228,
636
- "learning_rate": 0.0002,
637
- "loss": 0.1998735237121582,
638
- "mean_token_accuracy": 0.9336103320121765,
639
- "num_tokens": 576115.0,
640
- "step": 1575
641
- },
642
- {
643
- "entropy": 0.19730552673339843,
644
- "epoch": 0.4585841215247922,
645
- "grad_norm": 0.1693185716867447,
646
- "learning_rate": 0.0002,
647
- "loss": 0.19725181579589843,
648
- "mean_token_accuracy": 0.9343607997894288,
649
- "num_tokens": 616923.0,
650
- "step": 1600
651
- },
652
- {
653
- "entropy": 0.20145605146884918,
654
- "epoch": 0.46574949842361707,
655
- "grad_norm": 0.34067076444625854,
656
- "learning_rate": 0.0002,
657
- "loss": 0.19863763809204102,
658
- "mean_token_accuracy": 0.9344722628593445,
659
- "num_tokens": 658021.0,
660
- "step": 1625
661
- },
662
- {
663
- "entropy": 0.19174030989408494,
664
- "epoch": 0.472914875322442,
665
- "grad_norm": 0.282787024974823,
666
- "learning_rate": 0.0002,
667
- "loss": 0.18856592178344728,
668
- "mean_token_accuracy": 0.9382144474983215,
669
- "num_tokens": 698701.0,
670
- "step": 1650
671
- },
672
- {
673
- "entropy": 0.19893687069416047,
674
- "epoch": 0.4800802522212668,
675
- "grad_norm": 0.21854329109191895,
676
- "learning_rate": 0.0002,
677
- "loss": 0.19327503204345703,
678
- "mean_token_accuracy": 0.9353450679779053,
679
- "num_tokens": 739913.0,
680
- "step": 1675
681
- },
682
- {
683
- "entropy": 0.19346537590026855,
684
- "epoch": 0.48724562912009173,
685
- "grad_norm": 0.19436436891555786,
686
- "learning_rate": 0.0002,
687
- "loss": 0.19321285247802733,
688
- "mean_token_accuracy": 0.9373372173309327,
689
- "num_tokens": 780719.0,
690
- "step": 1700
691
- },
692
- {
693
- "entropy": 0.20528113186359406,
694
- "epoch": 0.4944110060189166,
695
- "grad_norm": 0.31415456533432007,
696
- "learning_rate": 0.0002,
697
- "loss": 0.2044132423400879,
698
- "mean_token_accuracy": 0.9320711612701416,
699
- "num_tokens": 822130.0,
700
- "step": 1725
701
- },
702
- {
703
- "entropy": 0.20051146537065506,
704
- "epoch": 0.5015763829177414,
705
- "grad_norm": 0.36767083406448364,
706
- "learning_rate": 0.0002,
707
- "loss": 0.19968202590942383,
708
- "mean_token_accuracy": 0.9361233901977539,
709
- "num_tokens": 863055.0,
710
- "step": 1750
711
- },
712
- {
713
- "entropy": 0.19146274596452714,
714
- "epoch": 0.5087417598165663,
715
- "grad_norm": 0.36641210317611694,
716
- "learning_rate": 0.0002,
717
- "loss": 0.1849520492553711,
718
- "mean_token_accuracy": 0.9378811025619507,
719
- "num_tokens": 903979.0,
720
- "step": 1775
721
- },
722
- {
723
- "entropy": 0.20497863948345185,
724
- "epoch": 0.5159071367153912,
725
- "grad_norm": 0.41181716322898865,
726
- "learning_rate": 0.0002,
727
- "loss": 0.2043849754333496,
728
- "mean_token_accuracy": 0.9320770597457886,
729
- "num_tokens": 945010.0,
730
- "step": 1800
731
- },
732
- {
733
- "entropy": 0.19871506720781326,
734
- "epoch": 0.5230725136142161,
735
- "grad_norm": 0.34865760803222656,
736
- "learning_rate": 0.0002,
737
- "loss": 0.19058765411376954,
738
- "mean_token_accuracy": 0.936968915462494,
739
- "num_tokens": 985351.0,
740
- "step": 1825
741
- },
742
- {
743
- "entropy": 0.21031922459602356,
744
- "epoch": 0.5302378905130409,
745
- "grad_norm": 0.35983604192733765,
746
- "learning_rate": 0.0002,
747
- "loss": 0.20398990631103517,
748
- "mean_token_accuracy": 0.9338763618469238,
749
- "num_tokens": 1027146.0,
750
- "step": 1850
751
- },
752
- {
753
- "entropy": 0.20145108669996262,
754
- "epoch": 0.5374032674118658,
755
- "grad_norm": 0.2126716524362564,
756
- "learning_rate": 0.0002,
757
- "loss": 0.19558551788330078,
758
- "mean_token_accuracy": 0.9350816106796265,
759
- "num_tokens": 1068454.0,
760
- "step": 1875
761
- },
762
- {
763
- "entropy": 0.19600239813327788,
764
- "epoch": 0.5445686443106907,
765
- "grad_norm": 0.2547587752342224,
766
- "learning_rate": 0.0002,
767
- "loss": 0.18890924453735353,
768
- "mean_token_accuracy": 0.9360025477409363,
769
- "num_tokens": 1109230.0,
770
- "step": 1900
771
- },
772
- {
773
- "entropy": 0.17782112330198288,
774
- "epoch": 0.5517340212095156,
775
- "grad_norm": 0.28866520524024963,
776
- "learning_rate": 0.0002,
777
- "loss": 0.17644382476806642,
778
- "mean_token_accuracy": 0.9422430300712585,
779
- "num_tokens": 1148978.0,
780
- "step": 1925
781
- },
782
- {
783
- "entropy": 0.18634845435619354,
784
- "epoch": 0.5588993981083406,
785
- "grad_norm": 0.2348451316356659,
786
- "learning_rate": 0.0002,
787
- "loss": 0.1815641212463379,
788
- "mean_token_accuracy": 0.9392524695396424,
789
- "num_tokens": 1189196.0,
790
- "step": 1950
791
- },
792
- {
793
- "entropy": 0.18852397054433823,
794
- "epoch": 0.5660647750071653,
795
- "grad_norm": 0.25562164187431335,
796
- "learning_rate": 0.0002,
797
- "loss": 0.18350950241088868,
798
- "mean_token_accuracy": 0.9394415140151977,
799
- "num_tokens": 1229072.0,
800
- "step": 1975
801
- },
802
- {
803
- "entropy": 0.18256970256567,
804
- "epoch": 0.5732301519059902,
805
- "grad_norm": 0.36442917585372925,
806
- "learning_rate": 0.0002,
807
- "loss": 0.18093914031982422,
808
- "mean_token_accuracy": 0.9397966265678406,
809
- "num_tokens": 1269371.0,
810
- "step": 2000
811
- },
812
- {
813
- "entropy": 0.20554341971874238,
814
- "epoch": 0.5803955288048152,
815
- "grad_norm": 0.3102213442325592,
816
- "learning_rate": 0.0002,
817
- "loss": 0.2052627372741699,
818
- "mean_token_accuracy": 0.9325902485847473,
819
- "num_tokens": 1311354.0,
820
- "step": 2025
821
- },
822
- {
823
- "entropy": 0.2037496653199196,
824
- "epoch": 0.5875609057036401,
825
- "grad_norm": 0.24330857396125793,
826
- "learning_rate": 0.0002,
827
- "loss": 0.20022052764892578,
828
- "mean_token_accuracy": 0.9342735767364502,
829
- "num_tokens": 1353051.0,
830
- "step": 2050
831
- },
832
- {
833
- "entropy": 0.19858424603939057,
834
- "epoch": 0.5947262826024649,
835
- "grad_norm": 0.2955344021320343,
836
- "learning_rate": 0.0002,
837
- "loss": 0.19497306823730468,
838
- "mean_token_accuracy": 0.9353799057006836,
839
- "num_tokens": 1394712.0,
840
- "step": 2075
841
- },
842
- {
843
- "entropy": 0.20194011509418489,
844
- "epoch": 0.6018916595012898,
845
- "grad_norm": 0.20898522436618805,
846
- "learning_rate": 0.0002,
847
- "loss": 0.19739873886108397,
848
- "mean_token_accuracy": 0.9346665263175964,
849
- "num_tokens": 1436282.0,
850
- "step": 2100
851
- },
852
- {
853
- "entropy": 0.18827197730541229,
854
- "epoch": 0.6090570364001147,
855
- "grad_norm": 0.3064703643321991,
856
- "learning_rate": 0.0002,
857
- "loss": 0.1838802719116211,
858
- "mean_token_accuracy": 0.939849009513855,
859
- "num_tokens": 1476569.0,
860
- "step": 2125
861
- },
862
- {
863
- "entropy": 0.20322401821613312,
864
- "epoch": 0.6162224132989396,
865
- "grad_norm": 0.42201489210128784,
866
- "learning_rate": 0.0002,
867
- "loss": 0.20033023834228517,
868
- "mean_token_accuracy": 0.9345659923553467,
869
- "num_tokens": 1518315.0,
870
- "step": 2150
871
- },
872
- {
873
- "entropy": 0.1822732812166214,
874
- "epoch": 0.6233877901977644,
875
- "grad_norm": 0.2799566388130188,
876
- "learning_rate": 0.0002,
877
- "loss": 0.18143136978149413,
878
- "mean_token_accuracy": 0.9404231834411622,
879
- "num_tokens": 1558340.0,
880
- "step": 2175
881
- },
882
- {
883
- "entropy": 0.19505684196949005,
884
- "epoch": 0.6305531670965893,
885
- "grad_norm": 0.20578612387180328,
886
- "learning_rate": 0.0002,
887
- "loss": 0.18889547348022462,
888
- "mean_token_accuracy": 0.9381808185577393,
889
- "num_tokens": 1599592.0,
890
- "step": 2200
891
- },
892
- {
893
- "entropy": 0.19981920778751372,
894
- "epoch": 0.6377185439954142,
895
- "grad_norm": 0.28131991624832153,
896
- "learning_rate": 0.0002,
897
- "loss": 0.19793636322021485,
898
- "mean_token_accuracy": 0.935631537437439,
899
- "num_tokens": 1641401.0,
900
- "step": 2225
901
- },
902
- {
903
- "entropy": 0.19168403446674348,
904
- "epoch": 0.6448839208942391,
905
- "grad_norm": 0.25856539607048035,
906
- "learning_rate": 0.0002,
907
- "loss": 0.1897783088684082,
908
- "mean_token_accuracy": 0.9356019353866577,
909
- "num_tokens": 1682949.0,
910
- "step": 2250
911
- },
912
- {
913
- "entropy": 0.1931222453713417,
914
- "epoch": 0.6520492977930639,
915
- "grad_norm": 0.4090195596218109,
916
- "learning_rate": 0.0002,
917
- "loss": 0.1929492950439453,
918
- "mean_token_accuracy": 0.9369300937652588,
919
- "num_tokens": 1724557.0,
920
- "step": 2275
921
- },
922
- {
923
- "entropy": 0.19567115902900695,
924
- "epoch": 0.6592146746918888,
925
- "grad_norm": 0.19224579632282257,
926
- "learning_rate": 0.0002,
927
- "loss": 0.19031965255737304,
928
- "mean_token_accuracy": 0.9367053961753845,
929
- "num_tokens": 1765618.0,
930
- "step": 2300
931
- },
932
- {
933
- "entropy": 0.18622912466526031,
934
- "epoch": 0.6663800515907137,
935
- "grad_norm": 0.27013909816741943,
936
- "learning_rate": 0.0002,
937
- "loss": 0.18465063095092774,
938
- "mean_token_accuracy": 0.9383154940605164,
939
- "num_tokens": 1806491.0,
940
- "step": 2325
941
- },
942
- {
943
- "entropy": 0.19851551949977875,
944
- "epoch": 0.6735454284895386,
945
- "grad_norm": 0.3999996483325958,
946
- "learning_rate": 0.0002,
947
- "loss": 0.19640205383300782,
948
- "mean_token_accuracy": 0.9344918823242188,
949
- "num_tokens": 1848741.0,
950
- "step": 2350
951
- },
952
- {
953
- "entropy": 0.18972006916999817,
954
- "epoch": 0.6807108053883635,
955
- "grad_norm": 0.26580268144607544,
956
- "learning_rate": 0.0002,
957
- "loss": 0.1871095657348633,
958
- "mean_token_accuracy": 0.9390228629112244,
959
- "num_tokens": 1890071.0,
960
- "step": 2375
961
- },
962
- {
963
- "entropy": 0.19580536246299743,
964
- "epoch": 0.6878761822871883,
965
- "grad_norm": 0.2682396471500397,
966
- "learning_rate": 0.0002,
967
- "loss": 0.19406461715698242,
968
- "mean_token_accuracy": 0.9354706478118896,
969
- "num_tokens": 1931751.0,
970
- "step": 2400
971
  }
972
  ],
973
  "logging_steps": 25,
@@ -987,7 +517,7 @@
987
  "attributes": {}
988
  }
989
  },
990
- "total_flos": 2.329083936152494e+17,
991
  "train_batch_size": 4,
992
  "trial_name": null,
993
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.35110346804241904,
6
  "eval_steps": 300,
7
+ "global_step": 1225,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
498
  "mean_token_accuracy": 0.9329368638992309,
499
  "num_tokens": 122190.0,
500
  "step": 1225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  }
502
  ],
503
  "logging_steps": 25,
 
517
  "attributes": {}
518
  }
519
  },
520
+ "total_flos": 1.1890534404816077e+17,
521
  "train_batch_size": 4,
522
  "trial_name": null,
523
  "trial_params": null