NBAmine commited on
Commit
c29f78f
·
verified ·
1 Parent(s): 62c9e44

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5128d9dcd70414a36e31024f4ad5ec042101281c4c972fc6e1627cf56599a4f6
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad737a62d5e6dd3601ece9ec89b866a23dce6e9660089db12fbc69ce938d925e
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b29aeaf32b580836c4ab6ce3bb3b6341694319fadb2c700d01bdfa699d0c67c
3
  size 116484839
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dfe45f7e8553ab326b74acce24981ad2310a19952e96422e10cdcd05d9f3261
3
  size 116484839
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e2a25360b265ca8d0b891411b6f03807107a036c84312fe5f9c527c82dffde4
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4ff3bd83efcc74d45f6dc982dfad42de943c268219c0ad0ee388295c41e8e02
3
  size 14709
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2784a2c99c69b1eeb46f85a93b50eab9ad7944681abfbfbe77fcff06d3d98c4
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cb6f523fe7cbe7ec261f5a7daf8f68472cbad6a063d529646d1f827a9ef9fd3
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:235ac77d5afb578b9d394edc166238b7f00aecfd5e424e6f4eb719fa59ee4941
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53f409af08acb24ba2f85422d6d830e93fdc97a01268b4582a53eec3cbfeb20a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 438,
3
  "best_metric": 1.2615772485733032,
4
  "best_model_checkpoint": "./adapter-phase2/checkpoint-438",
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 438,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -450,6 +450,458 @@
450
  "eval_samples_per_second": 3.459,
451
  "eval_steps_per_second": 0.866,
452
  "step": 438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  }
454
  ],
455
  "logging_steps": 10,
@@ -469,7 +921,7 @@
469
  "attributes": {}
470
  }
471
  },
472
- "total_flos": 2.940029330061312e+16,
473
  "train_batch_size": 1,
474
  "trial_name": null,
475
  "trial_params": null
 
2
  "best_global_step": 438,
3
  "best_metric": 1.2615772485733032,
4
  "best_model_checkpoint": "./adapter-phase2/checkpoint-438",
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 876,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
450
  "eval_samples_per_second": 3.459,
451
  "eval_steps_per_second": 0.866,
452
  "step": 438
453
+ },
454
+ {
455
+ "entropy": 0.9707203358411789,
456
+ "epoch": 1.0045714285714287,
457
+ "grad_norm": 1.046062707901001,
458
+ "learning_rate": 7.99543378995434e-06,
459
+ "loss": 0.8608,
460
+ "mean_token_accuracy": 0.7782945515293824,
461
+ "num_tokens": 425275.0,
462
+ "step": 440
463
+ },
464
+ {
465
+ "entropy": 0.7523622503504157,
466
+ "epoch": 1.0274285714285714,
467
+ "grad_norm": 1.2251920700073242,
468
+ "learning_rate": 7.949771689497718e-06,
469
+ "loss": 0.7473,
470
+ "mean_token_accuracy": 0.8092356324195862,
471
+ "num_tokens": 439850.0,
472
+ "step": 450
473
+ },
474
+ {
475
+ "entropy": 0.84888547193259,
476
+ "epoch": 1.0502857142857143,
477
+ "grad_norm": 1.325343370437622,
478
+ "learning_rate": 7.904109589041097e-06,
479
+ "loss": 0.7907,
480
+ "mean_token_accuracy": 0.7977306388318539,
481
+ "num_tokens": 451136.0,
482
+ "step": 460
483
+ },
484
+ {
485
+ "entropy": 0.9735806178301573,
486
+ "epoch": 1.0731428571428572,
487
+ "grad_norm": 1.6439032554626465,
488
+ "learning_rate": 7.858447488584475e-06,
489
+ "loss": 0.9513,
490
+ "mean_token_accuracy": 0.7617030199617147,
491
+ "num_tokens": 459486.0,
492
+ "step": 470
493
+ },
494
+ {
495
+ "entropy": 1.0277788739651441,
496
+ "epoch": 1.096,
497
+ "grad_norm": 1.8182581663131714,
498
+ "learning_rate": 7.812785388127855e-06,
499
+ "loss": 0.9526,
500
+ "mean_token_accuracy": 0.7630892738699913,
501
+ "num_tokens": 465975.0,
502
+ "step": 480
503
+ },
504
+ {
505
+ "entropy": 0.943339848332107,
506
+ "epoch": 1.1188571428571428,
507
+ "grad_norm": 1.1697943210601807,
508
+ "learning_rate": 7.767123287671234e-06,
509
+ "loss": 0.851,
510
+ "mean_token_accuracy": 0.7827658370137215,
511
+ "num_tokens": 473929.0,
512
+ "step": 490
513
+ },
514
+ {
515
+ "entropy": 0.7733788685873151,
516
+ "epoch": 1.1417142857142857,
517
+ "grad_norm": 1.2411632537841797,
518
+ "learning_rate": 7.721461187214612e-06,
519
+ "loss": 0.7691,
520
+ "mean_token_accuracy": 0.8111804500222206,
521
+ "num_tokens": 488306.0,
522
+ "step": 500
523
+ },
524
+ {
525
+ "entropy": 0.824394048191607,
526
+ "epoch": 1.1645714285714286,
527
+ "grad_norm": 1.3971821069717407,
528
+ "learning_rate": 7.675799086757991e-06,
529
+ "loss": 0.7429,
530
+ "mean_token_accuracy": 0.8088308341801167,
531
+ "num_tokens": 499208.0,
532
+ "step": 510
533
+ },
534
+ {
535
+ "entropy": 0.9718045836314559,
536
+ "epoch": 1.1874285714285715,
537
+ "grad_norm": 1.7808269262313843,
538
+ "learning_rate": 7.630136986301371e-06,
539
+ "loss": 0.9365,
540
+ "mean_token_accuracy": 0.762304800376296,
541
+ "num_tokens": 507299.0,
542
+ "step": 520
543
+ },
544
+ {
545
+ "entropy": 0.9984221205115318,
546
+ "epoch": 1.2102857142857144,
547
+ "grad_norm": 2.0445668697357178,
548
+ "learning_rate": 7.58447488584475e-06,
549
+ "loss": 0.9299,
550
+ "mean_token_accuracy": 0.7667763099074364,
551
+ "num_tokens": 513558.0,
552
+ "step": 530
553
+ },
554
+ {
555
+ "entropy": 0.9143548993393779,
556
+ "epoch": 1.233142857142857,
557
+ "grad_norm": 1.4540475606918335,
558
+ "learning_rate": 7.538812785388129e-06,
559
+ "loss": 0.7977,
560
+ "mean_token_accuracy": 0.7940818261355161,
561
+ "num_tokens": 521477.0,
562
+ "step": 540
563
+ },
564
+ {
565
+ "entropy": 0.7225218357518315,
566
+ "epoch": 1.256,
567
+ "grad_norm": 1.4530831575393677,
568
+ "learning_rate": 7.4931506849315075e-06,
569
+ "loss": 0.7282,
570
+ "mean_token_accuracy": 0.8143456902354955,
571
+ "num_tokens": 536156.0,
572
+ "step": 550
573
+ },
574
+ {
575
+ "entropy": 0.826616644486785,
576
+ "epoch": 1.278857142857143,
577
+ "grad_norm": 1.307198166847229,
578
+ "learning_rate": 7.447488584474887e-06,
579
+ "loss": 0.7509,
580
+ "mean_token_accuracy": 0.8082947298884392,
581
+ "num_tokens": 547250.0,
582
+ "step": 560
583
+ },
584
+ {
585
+ "entropy": 0.9597857438027859,
586
+ "epoch": 1.3017142857142856,
587
+ "grad_norm": 2.1991994380950928,
588
+ "learning_rate": 7.401826484018265e-06,
589
+ "loss": 0.9294,
590
+ "mean_token_accuracy": 0.7654231000691653,
591
+ "num_tokens": 555523.0,
592
+ "step": 570
593
+ },
594
+ {
595
+ "entropy": 1.0113731533288957,
596
+ "epoch": 1.3245714285714285,
597
+ "grad_norm": 2.2134881019592285,
598
+ "learning_rate": 7.356164383561645e-06,
599
+ "loss": 0.9149,
600
+ "mean_token_accuracy": 0.7716960549354553,
601
+ "num_tokens": 561790.0,
602
+ "step": 580
603
+ },
604
+ {
605
+ "entropy": 0.89712286721915,
606
+ "epoch": 1.3474285714285714,
607
+ "grad_norm": 1.2084845304489136,
608
+ "learning_rate": 7.310502283105023e-06,
609
+ "loss": 0.7891,
610
+ "mean_token_accuracy": 0.7911178763955832,
611
+ "num_tokens": 569844.0,
612
+ "step": 590
613
+ },
614
+ {
615
+ "entropy": 0.7331022916361689,
616
+ "epoch": 1.3702857142857143,
617
+ "grad_norm": 1.3023542165756226,
618
+ "learning_rate": 7.269406392694065e-06,
619
+ "loss": 0.7457,
620
+ "mean_token_accuracy": 0.8113049529492855,
621
+ "num_tokens": 584459.0,
622
+ "step": 600
623
+ },
624
+ {
625
+ "entropy": 0.7879349924623966,
626
+ "epoch": 1.3931428571428572,
627
+ "grad_norm": 1.555379867553711,
628
+ "learning_rate": 7.223744292237444e-06,
629
+ "loss": 0.7306,
630
+ "mean_token_accuracy": 0.8145640216767788,
631
+ "num_tokens": 595804.0,
632
+ "step": 610
633
+ },
634
+ {
635
+ "entropy": 0.9201723251491785,
636
+ "epoch": 1.416,
637
+ "grad_norm": 2.0131261348724365,
638
+ "learning_rate": 7.178082191780823e-06,
639
+ "loss": 0.881,
640
+ "mean_token_accuracy": 0.7761596899479628,
641
+ "num_tokens": 604098.0,
642
+ "step": 620
643
+ },
644
+ {
645
+ "entropy": 1.0043058268725873,
646
+ "epoch": 1.4388571428571428,
647
+ "grad_norm": 1.952837586402893,
648
+ "learning_rate": 7.132420091324202e-06,
649
+ "loss": 0.9229,
650
+ "mean_token_accuracy": 0.7723097205162048,
651
+ "num_tokens": 610481.0,
652
+ "step": 630
653
+ },
654
+ {
655
+ "entropy": 0.8940085913985968,
656
+ "epoch": 1.4617142857142857,
657
+ "grad_norm": 1.2801399230957031,
658
+ "learning_rate": 7.086757990867581e-06,
659
+ "loss": 0.8006,
660
+ "mean_token_accuracy": 0.7930479496717453,
661
+ "num_tokens": 618699.0,
662
+ "step": 640
663
+ },
664
+ {
665
+ "entropy": 0.6966889450326562,
666
+ "epoch": 1.4845714285714287,
667
+ "grad_norm": 1.557562232017517,
668
+ "learning_rate": 7.0410958904109596e-06,
669
+ "loss": 0.665,
670
+ "mean_token_accuracy": 0.8264754865318537,
671
+ "num_tokens": 632856.0,
672
+ "step": 650
673
+ },
674
+ {
675
+ "entropy": 0.8100471086800098,
676
+ "epoch": 1.5074285714285716,
677
+ "grad_norm": 1.7616751194000244,
678
+ "learning_rate": 6.995433789954339e-06,
679
+ "loss": 0.7669,
680
+ "mean_token_accuracy": 0.8096333492547274,
681
+ "num_tokens": 643712.0,
682
+ "step": 660
683
+ },
684
+ {
685
+ "entropy": 0.9476521443575621,
686
+ "epoch": 1.5302857142857142,
687
+ "grad_norm": 1.97320556640625,
688
+ "learning_rate": 6.9497716894977175e-06,
689
+ "loss": 0.8769,
690
+ "mean_token_accuracy": 0.7822451706975698,
691
+ "num_tokens": 651732.0,
692
+ "step": 670
693
+ },
694
+ {
695
+ "entropy": 0.9541807420551777,
696
+ "epoch": 1.5531428571428572,
697
+ "grad_norm": 2.2813711166381836,
698
+ "learning_rate": 6.904109589041097e-06,
699
+ "loss": 0.8731,
700
+ "mean_token_accuracy": 0.7764547783881426,
701
+ "num_tokens": 658104.0,
702
+ "step": 680
703
+ },
704
+ {
705
+ "entropy": 0.8891686601564288,
706
+ "epoch": 1.576,
707
+ "grad_norm": 1.2347137928009033,
708
+ "learning_rate": 6.858447488584475e-06,
709
+ "loss": 0.8099,
710
+ "mean_token_accuracy": 0.795854776352644,
711
+ "num_tokens": 666681.0,
712
+ "step": 690
713
+ },
714
+ {
715
+ "entropy": 0.7062053712084889,
716
+ "epoch": 1.5988571428571428,
717
+ "grad_norm": 1.505817174911499,
718
+ "learning_rate": 6.812785388127855e-06,
719
+ "loss": 0.6689,
720
+ "mean_token_accuracy": 0.8225430808961391,
721
+ "num_tokens": 681161.0,
722
+ "step": 700
723
+ },
724
+ {
725
+ "entropy": 0.7627910353243351,
726
+ "epoch": 1.6217142857142857,
727
+ "grad_norm": 1.7354750633239746,
728
+ "learning_rate": 6.767123287671233e-06,
729
+ "loss": 0.7217,
730
+ "mean_token_accuracy": 0.8088484812527895,
731
+ "num_tokens": 692262.0,
732
+ "step": 710
733
+ },
734
+ {
735
+ "entropy": 0.9181301448494196,
736
+ "epoch": 1.6445714285714286,
737
+ "grad_norm": 1.9427331686019897,
738
+ "learning_rate": 6.721461187214613e-06,
739
+ "loss": 0.8664,
740
+ "mean_token_accuracy": 0.7764203164726495,
741
+ "num_tokens": 700252.0,
742
+ "step": 720
743
+ },
744
+ {
745
+ "entropy": 0.970825233310461,
746
+ "epoch": 1.6674285714285715,
747
+ "grad_norm": 2.231489419937134,
748
+ "learning_rate": 6.675799086757991e-06,
749
+ "loss": 0.8727,
750
+ "mean_token_accuracy": 0.77991351634264,
751
+ "num_tokens": 706466.0,
752
+ "step": 730
753
+ },
754
+ {
755
+ "entropy": 0.8769128978252411,
756
+ "epoch": 1.6902857142857144,
757
+ "grad_norm": 1.3580577373504639,
758
+ "learning_rate": 6.630136986301371e-06,
759
+ "loss": 0.7826,
760
+ "mean_token_accuracy": 0.7997685220092535,
761
+ "num_tokens": 714701.0,
762
+ "step": 740
763
+ },
764
+ {
765
+ "entropy": 0.6923451218754053,
766
+ "epoch": 1.713142857142857,
767
+ "grad_norm": 1.4095361232757568,
768
+ "learning_rate": 6.584474885844749e-06,
769
+ "loss": 0.6984,
770
+ "mean_token_accuracy": 0.8204937841743231,
771
+ "num_tokens": 729622.0,
772
+ "step": 750
773
+ },
774
+ {
775
+ "entropy": 0.7426450593397022,
776
+ "epoch": 1.736,
777
+ "grad_norm": 1.5736570358276367,
778
+ "learning_rate": 6.538812785388129e-06,
779
+ "loss": 0.667,
780
+ "mean_token_accuracy": 0.8291565012186766,
781
+ "num_tokens": 740772.0,
782
+ "step": 760
783
+ },
784
+ {
785
+ "entropy": 0.910079357214272,
786
+ "epoch": 1.758857142857143,
787
+ "grad_norm": 2.1047656536102295,
788
+ "learning_rate": 6.493150684931508e-06,
789
+ "loss": 0.875,
790
+ "mean_token_accuracy": 0.7781037461012602,
791
+ "num_tokens": 748857.0,
792
+ "step": 770
793
+ },
794
+ {
795
+ "entropy": 0.9749910116195679,
796
+ "epoch": 1.7817142857142856,
797
+ "grad_norm": 2.2609705924987793,
798
+ "learning_rate": 6.447488584474887e-06,
799
+ "loss": 0.9058,
800
+ "mean_token_accuracy": 0.7749961122870446,
801
+ "num_tokens": 755273.0,
802
+ "step": 780
803
+ },
804
+ {
805
+ "entropy": 0.8688624935224653,
806
+ "epoch": 1.8045714285714287,
807
+ "grad_norm": 2.156954765319824,
808
+ "learning_rate": 6.401826484018266e-06,
809
+ "loss": 0.7568,
810
+ "mean_token_accuracy": 0.8001648161560297,
811
+ "num_tokens": 763404.0,
812
+ "step": 790
813
+ },
814
+ {
815
+ "entropy": 0.6553533479571343,
816
+ "epoch": 1.8274285714285714,
817
+ "grad_norm": 1.5286246538162231,
818
+ "learning_rate": 6.356164383561645e-06,
819
+ "loss": 0.6357,
820
+ "mean_token_accuracy": 0.8322514686733484,
821
+ "num_tokens": 777652.0,
822
+ "step": 800
823
+ },
824
+ {
825
+ "entropy": 0.7381465582177043,
826
+ "epoch": 1.8502857142857143,
827
+ "grad_norm": 1.889930248260498,
828
+ "learning_rate": 6.3105022831050235e-06,
829
+ "loss": 0.6995,
830
+ "mean_token_accuracy": 0.8194405883550644,
831
+ "num_tokens": 788541.0,
832
+ "step": 810
833
+ },
834
+ {
835
+ "entropy": 0.9207667458802462,
836
+ "epoch": 1.8731428571428572,
837
+ "grad_norm": 2.3677663803100586,
838
+ "learning_rate": 6.264840182648403e-06,
839
+ "loss": 0.876,
840
+ "mean_token_accuracy": 0.7714111492037773,
841
+ "num_tokens": 796574.0,
842
+ "step": 820
843
+ },
844
+ {
845
+ "entropy": 0.9494761880487204,
846
+ "epoch": 1.896,
847
+ "grad_norm": 2.424638032913208,
848
+ "learning_rate": 6.219178082191781e-06,
849
+ "loss": 0.8548,
850
+ "mean_token_accuracy": 0.7811690699309111,
851
+ "num_tokens": 802836.0,
852
+ "step": 830
853
+ },
854
+ {
855
+ "entropy": 0.8909835416823626,
856
+ "epoch": 1.9188571428571428,
857
+ "grad_norm": 1.3449039459228516,
858
+ "learning_rate": 6.173515981735161e-06,
859
+ "loss": 0.7825,
860
+ "mean_token_accuracy": 0.7954777158796787,
861
+ "num_tokens": 810726.0,
862
+ "step": 840
863
+ },
864
+ {
865
+ "entropy": 0.6921561988070607,
866
+ "epoch": 1.9417142857142857,
867
+ "grad_norm": 1.490689992904663,
868
+ "learning_rate": 6.127853881278539e-06,
869
+ "loss": 0.6554,
870
+ "mean_token_accuracy": 0.8262197155505419,
871
+ "num_tokens": 824145.0,
872
+ "step": 850
873
+ },
874
+ {
875
+ "entropy": 0.8137379666790366,
876
+ "epoch": 1.9645714285714284,
877
+ "grad_norm": 2.0120434761047363,
878
+ "learning_rate": 6.082191780821919e-06,
879
+ "loss": 0.8024,
880
+ "mean_token_accuracy": 0.7950452182441949,
881
+ "num_tokens": 833220.0,
882
+ "step": 860
883
+ },
884
+ {
885
+ "entropy": 0.9502449594438076,
886
+ "epoch": 1.9874285714285715,
887
+ "grad_norm": 2.679570198059082,
888
+ "learning_rate": 6.036529680365297e-06,
889
+ "loss": 0.8545,
890
+ "mean_token_accuracy": 0.781839894503355,
891
+ "num_tokens": 839758.0,
892
+ "step": 870
893
+ },
894
+ {
895
+ "epoch": 2.0,
896
+ "eval_accuracy": 0.00894328845369237,
897
+ "eval_entropy": 0.9275054344799528,
898
+ "eval_loss": 1.3659894466400146,
899
+ "eval_mean_token_accuracy": 0.7276899333626147,
900
+ "eval_num_tokens": 842388.0,
901
+ "eval_runtime": 299.6651,
902
+ "eval_samples_per_second": 3.451,
903
+ "eval_steps_per_second": 0.864,
904
+ "step": 876
905
  }
906
  ],
907
  "logging_steps": 10,
 
921
  "attributes": {}
922
  }
923
  },
924
+ "total_flos": 5.880058660122624e+16,
925
  "train_batch_size": 1,
926
  "trial_name": null,
927
  "trial_params": null