madhuHuggingface committed on
Commit
5ca05ba
·
verified ·
1 Parent(s): 6554589

Training in progress, step 1400

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e3c47872211fbcf052ba7df0f0915e501b20e8abc3bca44c5104a2f4ba81046
3
  size 121537408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5548db897760d11eb42f797aede598c24ba8638657290f8d4ec41761003bdc5
3
  size 121537408
last-checkpoint/adapter_config.json CHANGED
@@ -33,13 +33,13 @@
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
- "o_proj",
37
- "down_proj",
38
  "up_proj",
39
- "v_proj",
40
  "q_proj",
 
 
 
41
  "gate_proj",
42
- "k_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
 
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
 
36
  "up_proj",
 
37
  "q_proj",
38
+ "v_proj",
39
+ "k_proj",
40
+ "o_proj",
41
  "gate_proj",
42
+ "down_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e3c47872211fbcf052ba7df0f0915e501b20e8abc3bca44c5104a2f4ba81046
3
  size 121537408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac86271e61f61c7e0e996c9a0b387781c0cf7e105d9e2809cf486578571e3692
3
  size 121537408
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ae996e1fc958f0c827e315fbb2ff4690cc75f8738d96a091461acafa611b7e6
3
- size 62655371
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6166d92c0eec7a743b35a9b0a6952a1ccf685a19c88ceec877f066ad8bcf660
3
+ size 62000725
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e226002dfde38ec81edb46535c412ea560a3a1d4ecaee989cfc82070757a4f85
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2f5818bcde61cb939645ced952eb7a6ec5c7bbec5f630a7156d7c2ae39b50d0
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.44616044616044614,
6
  "eval_steps": 500,
7
- "global_step": 1300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -428,496 +428,6 @@
428
  "learning_rate": 0.00019783319385748891,
429
  "loss": 0.005371841043233872,
430
  "step": 600
431
- },
432
- {
433
- "epoch": 0.20935220935220936,
434
- "grad_norm": 0.03531381115317345,
435
- "learning_rate": 0.00019775798436093438,
436
- "loss": 0.0034491006284952165,
437
- "step": 610
438
- },
439
- {
440
- "epoch": 0.21278421278421278,
441
- "grad_norm": 0.3441513478755951,
442
- "learning_rate": 0.00019768150657209797,
443
- "loss": 0.0036659084260463716,
444
- "step": 620
445
- },
446
- {
447
- "epoch": 0.21621621621621623,
448
- "grad_norm": 0.12480789422988892,
449
- "learning_rate": 0.00019760376148318697,
450
- "loss": 0.005842024832963944,
451
- "step": 630
452
- },
453
- {
454
- "epoch": 0.21964821964821965,
455
- "grad_norm": 0.18049751222133636,
456
- "learning_rate": 0.00019752475010285044,
457
- "loss": 0.004476210474967957,
458
- "step": 640
459
- },
460
- {
461
- "epoch": 0.22308022308022307,
462
- "grad_norm": 0.03993777185678482,
463
- "learning_rate": 0.00019744447345616603,
464
- "loss": 0.002967344969511032,
465
- "step": 650
466
- },
467
- {
468
- "epoch": 0.22651222651222652,
469
- "grad_norm": 0.014075578190386295,
470
- "learning_rate": 0.00019736293258462663,
471
- "loss": 0.0038539741188287737,
472
- "step": 660
473
- },
474
- {
475
- "epoch": 0.22994422994422994,
476
- "grad_norm": 0.10091689974069595,
477
- "learning_rate": 0.00019728012854612707,
478
- "loss": 0.0027353862300515176,
479
- "step": 670
480
- },
481
- {
482
- "epoch": 0.23337623337623337,
483
- "grad_norm": 0.07889121025800705,
484
- "learning_rate": 0.00019719606241495015,
485
- "loss": 0.002710958570241928,
486
- "step": 680
487
- },
488
- {
489
- "epoch": 0.23680823680823682,
490
- "grad_norm": 0.30752819776535034,
491
- "learning_rate": 0.00019711073528175276,
492
- "loss": 0.00333489403128624,
493
- "step": 690
494
- },
495
- {
496
- "epoch": 0.24024024024024024,
497
- "grad_norm": 0.044186487793922424,
498
- "learning_rate": 0.00019702414825355192,
499
- "loss": 0.0013431076891720294,
500
- "step": 700
501
- },
502
- {
503
- "epoch": 0.24367224367224366,
504
- "grad_norm": 0.002193969674408436,
505
- "learning_rate": 0.00019693630245371012,
506
- "loss": 0.0016361255198717116,
507
- "step": 710
508
- },
509
- {
510
- "epoch": 0.2471042471042471,
511
- "grad_norm": 0.0048366570845246315,
512
- "learning_rate": 0.00019684719902192098,
513
- "loss": 0.0031337443739175796,
514
- "step": 720
515
- },
516
- {
517
- "epoch": 0.25053625053625056,
518
- "grad_norm": 0.003241632366552949,
519
- "learning_rate": 0.0001967568391141944,
520
- "loss": 0.0018327673897147179,
521
- "step": 730
522
- },
523
- {
524
- "epoch": 0.25396825396825395,
525
- "grad_norm": 0.07791941612958908,
526
- "learning_rate": 0.00019666522390284155,
527
- "loss": 0.005795871838927269,
528
- "step": 740
529
- },
530
- {
531
- "epoch": 0.2574002574002574,
532
- "grad_norm": 0.04366520047187805,
533
- "learning_rate": 0.00019657235457645956,
534
- "loss": 0.0033348515629768372,
535
- "step": 750
536
- },
537
- {
538
- "epoch": 0.26083226083226085,
539
- "grad_norm": 0.013953814283013344,
540
- "learning_rate": 0.00019647823233991623,
541
- "loss": 0.00804762840270996,
542
- "step": 760
543
- },
544
- {
545
- "epoch": 0.26426426426426425,
546
- "grad_norm": 0.017218926921486855,
547
- "learning_rate": 0.00019638285841433442,
548
- "loss": 0.005346460640430451,
549
- "step": 770
550
- },
551
- {
552
- "epoch": 0.2676962676962677,
553
- "grad_norm": 1.2277870178222656,
554
- "learning_rate": 0.00019628623403707612,
555
- "loss": 0.005925276502966881,
556
- "step": 780
557
- },
558
- {
559
- "epoch": 0.27112827112827115,
560
- "grad_norm": 0.05051695927977562,
561
- "learning_rate": 0.00019618836046172647,
562
- "loss": 0.0050374619662761685,
563
- "step": 790
564
- },
565
- {
566
- "epoch": 0.27456027456027454,
567
- "grad_norm": 0.004715205170214176,
568
- "learning_rate": 0.00019608923895807732,
569
- "loss": 0.0031466834247112275,
570
- "step": 800
571
- },
572
- {
573
- "epoch": 0.277992277992278,
574
- "grad_norm": 0.15201859176158905,
575
- "learning_rate": 0.00019598887081211103,
576
- "loss": 0.00553952269256115,
577
- "step": 810
578
- },
579
- {
580
- "epoch": 0.28142428142428144,
581
- "grad_norm": 0.017294466495513916,
582
- "learning_rate": 0.00019588725732598358,
583
- "loss": 0.0026330363005399706,
584
- "step": 820
585
- },
586
- {
587
- "epoch": 0.28485628485628484,
588
- "grad_norm": 0.009572334587574005,
589
- "learning_rate": 0.0001957843998180077,
590
- "loss": 0.0053256206214427945,
591
- "step": 830
592
- },
593
- {
594
- "epoch": 0.2882882882882883,
595
- "grad_norm": 0.9419263005256653,
596
- "learning_rate": 0.00019568029962263592,
597
- "loss": 0.005014676600694656,
598
- "step": 840
599
- },
600
- {
601
- "epoch": 0.29172029172029174,
602
- "grad_norm": 0.007972943596541882,
603
- "learning_rate": 0.0001955749580904431,
604
- "loss": 0.002865707501769066,
605
- "step": 850
606
- },
607
- {
608
- "epoch": 0.29515229515229513,
609
- "grad_norm": 0.0037338004913181067,
610
- "learning_rate": 0.00019546837658810883,
611
- "loss": 0.002737715095281601,
612
- "step": 860
613
- },
614
- {
615
- "epoch": 0.2985842985842986,
616
- "grad_norm": 0.044745851308107376,
617
- "learning_rate": 0.00019536055649840007,
618
- "loss": 0.005683861300349235,
619
- "step": 870
620
- },
621
- {
622
- "epoch": 0.30201630201630203,
623
- "grad_norm": 0.3476060628890991,
624
- "learning_rate": 0.00019525149922015268,
625
- "loss": 0.007439766824245453,
626
- "step": 880
627
- },
628
- {
629
- "epoch": 0.3054483054483054,
630
- "grad_norm": 0.046753134578466415,
631
- "learning_rate": 0.00019514120616825377,
632
- "loss": 0.009560897201299667,
633
- "step": 890
634
- },
635
- {
636
- "epoch": 0.3088803088803089,
637
- "grad_norm": 0.6365974545478821,
638
- "learning_rate": 0.00019502967877362305,
639
- "loss": 0.006552433967590332,
640
- "step": 900
641
- },
642
- {
643
- "epoch": 0.3123123123123123,
644
- "grad_norm": 1.7795714139938354,
645
- "learning_rate": 0.00019491691848319432,
646
- "loss": 0.0097378209233284,
647
- "step": 910
648
- },
649
- {
650
- "epoch": 0.3157443157443157,
651
- "grad_norm": 0.030942745506763458,
652
- "learning_rate": 0.00019480292675989677,
653
- "loss": 0.011464773118495942,
654
- "step": 920
655
- },
656
- {
657
- "epoch": 0.31917631917631917,
658
- "grad_norm": 0.05824369192123413,
659
- "learning_rate": 0.00019468770508263586,
660
- "loss": 0.0077786631882190704,
661
- "step": 930
662
- },
663
- {
664
- "epoch": 0.3226083226083226,
665
- "grad_norm": 1.115598440170288,
666
- "learning_rate": 0.00019457125494627431,
667
- "loss": 0.005580966919660568,
668
- "step": 940
669
- },
670
- {
671
- "epoch": 0.32604032604032607,
672
- "grad_norm": 0.11986860632896423,
673
- "learning_rate": 0.00019445357786161265,
674
- "loss": 0.01756148934364319,
675
- "step": 950
676
- },
677
- {
678
- "epoch": 0.32947232947232946,
679
- "grad_norm": 0.026070566847920418,
680
- "learning_rate": 0.00019433467535536947,
681
- "loss": 0.01354750245809555,
682
- "step": 960
683
- },
684
- {
685
- "epoch": 0.3329043329043329,
686
- "grad_norm": 0.11441248655319214,
687
- "learning_rate": 0.0001942145489701618,
688
- "loss": 0.008156213909387589,
689
- "step": 970
690
- },
691
- {
692
- "epoch": 0.33633633633633636,
693
- "grad_norm": 0.3980163037776947,
694
- "learning_rate": 0.00019409320026448504,
695
- "loss": 0.0047673903405666355,
696
- "step": 980
697
- },
698
- {
699
- "epoch": 0.33976833976833976,
700
- "grad_norm": 1.6305179595947266,
701
- "learning_rate": 0.0001939706308126927,
702
- "loss": 0.012219312787055969,
703
- "step": 990
704
- },
705
- {
706
- "epoch": 0.3432003432003432,
707
- "grad_norm": 0.3982492983341217,
708
- "learning_rate": 0.00019384684220497605,
709
- "loss": 0.011943883448839187,
710
- "step": 1000
711
- },
712
- {
713
- "epoch": 0.34663234663234666,
714
- "grad_norm": 0.8722233772277832,
715
- "learning_rate": 0.00019372183604734336,
716
- "loss": 0.01119406521320343,
717
- "step": 1010
718
- },
719
- {
720
- "epoch": 0.35006435006435005,
721
- "grad_norm": 0.4889911413192749,
722
- "learning_rate": 0.00019359561396159922,
723
- "loss": 0.01964961290359497,
724
- "step": 1020
725
- },
726
- {
727
- "epoch": 0.3534963534963535,
728
- "grad_norm": 2.336963176727295,
729
- "learning_rate": 0.00019346817758532337,
730
- "loss": 0.013343000411987304,
731
- "step": 1030
732
- },
733
- {
734
- "epoch": 0.35692835692835695,
735
- "grad_norm": 5.095973014831543,
736
- "learning_rate": 0.0001933395285718495,
737
- "loss": 0.045030930638313295,
738
- "step": 1040
739
- },
740
- {
741
- "epoch": 0.36036036036036034,
742
- "grad_norm": 1.8921918869018555,
743
- "learning_rate": 0.00019320966859024397,
744
- "loss": 0.017123931646347047,
745
- "step": 1050
746
- },
747
- {
748
- "epoch": 0.3637923637923638,
749
- "grad_norm": 0.32683122158050537,
750
- "learning_rate": 0.00019307859932528375,
751
- "loss": 0.0226660281419754,
752
- "step": 1060
753
- },
754
- {
755
- "epoch": 0.36722436722436724,
756
- "grad_norm": 1.3853524923324585,
757
- "learning_rate": 0.000192946322477435,
758
- "loss": 0.009634046256542206,
759
- "step": 1070
760
- },
761
- {
762
- "epoch": 0.37065637065637064,
763
- "grad_norm": 0.34646913409233093,
764
- "learning_rate": 0.0001928128397628307,
765
- "loss": 0.03943045735359192,
766
- "step": 1080
767
- },
768
- {
769
- "epoch": 0.3740883740883741,
770
- "grad_norm": 0.8397905826568604,
771
- "learning_rate": 0.00019267815291324852,
772
- "loss": 0.017884735763072968,
773
- "step": 1090
774
- },
775
- {
776
- "epoch": 0.37752037752037754,
777
- "grad_norm": 0.9594695568084717,
778
- "learning_rate": 0.00019254226367608842,
779
- "loss": 0.01769815683364868,
780
- "step": 1100
781
- },
782
- {
783
- "epoch": 0.38095238095238093,
784
- "grad_norm": 0.7242799997329712,
785
- "learning_rate": 0.0001924051738143498,
786
- "loss": 0.04254389405250549,
787
- "step": 1110
788
- },
789
- {
790
- "epoch": 0.3843843843843844,
791
- "grad_norm": 0.1905749887228012,
792
- "learning_rate": 0.00019226688510660877,
793
- "loss": 0.011978869885206222,
794
- "step": 1120
795
- },
796
- {
797
- "epoch": 0.38781638781638783,
798
- "grad_norm": 0.12193301320075989,
799
- "learning_rate": 0.00019212739934699498,
800
- "loss": 0.010143650323152542,
801
- "step": 1130
802
- },
803
- {
804
- "epoch": 0.3912483912483912,
805
- "grad_norm": 0.20223842561244965,
806
- "learning_rate": 0.00019198671834516843,
807
- "loss": 0.012704399228096009,
808
- "step": 1140
809
- },
810
- {
811
- "epoch": 0.3946803946803947,
812
- "grad_norm": 0.16973139345645905,
813
- "learning_rate": 0.00019184484392629586,
814
- "loss": 0.009967386722564697,
815
- "step": 1150
816
- },
817
- {
818
- "epoch": 0.3981123981123981,
819
- "grad_norm": 0.3374408185482025,
820
- "learning_rate": 0.00019170177793102736,
821
- "loss": 0.013026086986064911,
822
- "step": 1160
823
- },
824
- {
825
- "epoch": 0.4015444015444015,
826
- "grad_norm": 0.6263651847839355,
827
- "learning_rate": 0.0001915575222154721,
828
- "loss": 0.014929966628551483,
829
- "step": 1170
830
- },
831
- {
832
- "epoch": 0.40497640497640497,
833
- "grad_norm": 0.5159519910812378,
834
- "learning_rate": 0.00019141207865117448,
835
- "loss": 0.022531284391880034,
836
- "step": 1180
837
- },
838
- {
839
- "epoch": 0.4084084084084084,
840
- "grad_norm": 0.27669215202331543,
841
- "learning_rate": 0.0001912654491250899,
842
- "loss": 0.013255235552787781,
843
- "step": 1190
844
- },
845
- {
846
- "epoch": 0.4118404118404118,
847
- "grad_norm": 0.3306339383125305,
848
- "learning_rate": 0.00019111763553956006,
849
- "loss": 0.0071789674460887905,
850
- "step": 1200
851
- },
852
- {
853
- "epoch": 0.41527241527241526,
854
- "grad_norm": 0.1400749832391739,
855
- "learning_rate": 0.0001909686398122885,
856
- "loss": 0.012304867804050445,
857
- "step": 1210
858
- },
859
- {
860
- "epoch": 0.4187044187044187,
861
- "grad_norm": 0.480392724275589,
862
- "learning_rate": 0.0001908184638763156,
863
- "loss": 0.013182352483272552,
864
- "step": 1220
865
- },
866
- {
867
- "epoch": 0.42213642213642216,
868
- "grad_norm": 0.07393156737089157,
869
- "learning_rate": 0.00019066710967999352,
870
- "loss": 0.01357671171426773,
871
- "step": 1230
872
- },
873
- {
874
- "epoch": 0.42556842556842556,
875
- "grad_norm": 0.5818315148353577,
876
- "learning_rate": 0.00019051457918696092,
877
- "loss": 0.01494317352771759,
878
- "step": 1240
879
- },
880
- {
881
- "epoch": 0.429000429000429,
882
- "grad_norm": 0.12537962198257446,
883
- "learning_rate": 0.0001903608743761175,
884
- "loss": 0.016142460703849792,
885
- "step": 1250
886
- },
887
- {
888
- "epoch": 0.43243243243243246,
889
- "grad_norm": 0.6750714778900146,
890
- "learning_rate": 0.00019020599724159842,
891
- "loss": 0.010620266944169999,
892
- "step": 1260
893
- },
894
- {
895
- "epoch": 0.43586443586443585,
896
- "grad_norm": 2.9211416244506836,
897
- "learning_rate": 0.00019004994979274816,
898
- "loss": 0.02269883006811142,
899
- "step": 1270
900
- },
901
- {
902
- "epoch": 0.4392964392964393,
903
- "grad_norm": 0.16892600059509277,
904
- "learning_rate": 0.0001898927340540947,
905
- "loss": 0.020440049469470978,
906
- "step": 1280
907
- },
908
- {
909
- "epoch": 0.44272844272844275,
910
- "grad_norm": 1.130327820777893,
911
- "learning_rate": 0.00018973435206532323,
912
- "loss": 0.012587438523769378,
913
- "step": 1290
914
- },
915
- {
916
- "epoch": 0.44616044616044614,
917
- "grad_norm": 1.3412842750549316,
918
- "learning_rate": 0.00018957480588124956,
919
- "loss": 0.009808909147977829,
920
- "step": 1300
921
  }
922
  ],
923
  "logging_steps": 10,
@@ -937,7 +447,7 @@
937
  "attributes": {}
938
  }
939
  },
940
- "total_flos": 4930831518299136.0,
941
  "train_batch_size": 2,
942
  "trial_name": null,
943
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2059202059202059,
6
  "eval_steps": 500,
7
+ "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
428
  "learning_rate": 0.00019783319385748891,
429
  "loss": 0.005371841043233872,
430
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  }
432
  ],
433
  "logging_steps": 10,
 
447
  "attributes": {}
448
  }
449
  },
450
+ "total_flos": 2276492884035072.0,
451
  "train_batch_size": 2,
452
  "trial_name": null,
453
  "trial_params": null