Plofski committed on
Commit
4573aba
·
verified ·
1 Parent(s): fe09261

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5868447b981ceff440b9a26d6ac08b1eb131c66c461c8f6cd15cd33c16c3425
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f35b737982e48d3830ee78a27c3784e950c9cb1cc8a81e9ff82bc0cbeca9a095
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02ceea59533679cf8e117ebd8d876b10849b1306fb9459e9ee4998596ecbdb03
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f734370fa1e43861a64bf46d2f3ddd2b2e741b3042916e97b9b0aa3948a2d4f5
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:157bd2aed929bf3aecd89cca519b674ca176680d01354e1f32ab94471cfeb630
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d673fab80548770f45e3c6b7ce6376b297de04f44a8ac658823035a1ec8497c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.10074551682450131,
6
  "eval_steps": 500,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -458,6 +458,456 @@
458
  "mean_token_accuracy": 0.7595715343952179,
459
  "num_tokens": 549174.0,
460
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  }
462
  ],
463
  "logging_steps": 10,
@@ -477,7 +927,7 @@
477
  "attributes": {}
478
  }
479
  },
480
- "total_flos": 668729881817088.0,
481
  "train_batch_size": 8,
482
  "trial_name": null,
483
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.20149103364900262,
6
  "eval_steps": 500,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
458
  "mean_token_accuracy": 0.7595715343952179,
459
  "num_tokens": 549174.0,
460
  "step": 500
461
+ },
462
+ {
463
+ "epoch": 0.10276042716099133,
464
+ "grad_norm": 12.625,
465
+ "learning_rate": 1.9316273759151052e-05,
466
+ "loss": 1.0164,
467
+ "mean_token_accuracy": 0.7571583390235901,
468
+ "num_tokens": 559988.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 0.10477533749748136,
473
+ "grad_norm": 14.3125,
474
+ "learning_rate": 1.930284102357445e-05,
475
+ "loss": 1.1148,
476
+ "mean_token_accuracy": 0.7423564851284027,
477
+ "num_tokens": 571510.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 0.10679024783397138,
482
+ "grad_norm": 14.6875,
483
+ "learning_rate": 1.9289408287997854e-05,
484
+ "loss": 1.053,
485
+ "mean_token_accuracy": 0.7485374748706818,
486
+ "num_tokens": 583020.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 0.10880515817046141,
491
+ "grad_norm": 11.4375,
492
+ "learning_rate": 1.9275975552421252e-05,
493
+ "loss": 0.9756,
494
+ "mean_token_accuracy": 0.7606720209121705,
495
+ "num_tokens": 594042.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 0.11082006850695145,
500
+ "grad_norm": 13.3125,
501
+ "learning_rate": 1.926254281684465e-05,
502
+ "loss": 0.9514,
503
+ "mean_token_accuracy": 0.7702824532985687,
504
+ "num_tokens": 605932.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 0.11283497884344147,
509
+ "grad_norm": 10.625,
510
+ "learning_rate": 1.9249110081268053e-05,
511
+ "loss": 1.0008,
512
+ "mean_token_accuracy": 0.7583375632762909,
513
+ "num_tokens": 617431.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 0.1148498891799315,
518
+ "grad_norm": 10.875,
519
+ "learning_rate": 1.9235677345691452e-05,
520
+ "loss": 0.998,
521
+ "mean_token_accuracy": 0.7597042858600617,
522
+ "num_tokens": 629827.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 0.11686479951642152,
527
+ "grad_norm": 12.5625,
528
+ "learning_rate": 1.922224461011485e-05,
529
+ "loss": 0.9512,
530
+ "mean_token_accuracy": 0.7806954503059387,
531
+ "num_tokens": 640144.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 0.11887970985291155,
536
+ "grad_norm": 10.5625,
537
+ "learning_rate": 1.920881187453825e-05,
538
+ "loss": 0.9292,
539
+ "mean_token_accuracy": 0.7761410176753998,
540
+ "num_tokens": 652386.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 0.12089462018940157,
545
+ "grad_norm": 11.0,
546
+ "learning_rate": 1.9195379138961652e-05,
547
+ "loss": 1.0768,
548
+ "mean_token_accuracy": 0.7544383645057678,
549
+ "num_tokens": 663460.0,
550
+ "step": 600
551
+ },
552
+ {
553
+ "epoch": 0.1229095305258916,
554
+ "grad_norm": 14.3125,
555
+ "learning_rate": 1.918194640338505e-05,
556
+ "loss": 0.8975,
557
+ "mean_token_accuracy": 0.7799494147300721,
558
+ "num_tokens": 673425.0,
559
+ "step": 610
560
+ },
561
+ {
562
+ "epoch": 0.12492444086238162,
563
+ "grad_norm": 10.375,
564
+ "learning_rate": 1.916851366780845e-05,
565
+ "loss": 0.899,
566
+ "mean_token_accuracy": 0.7885317802429199,
567
+ "num_tokens": 683817.0,
568
+ "step": 620
569
+ },
570
+ {
571
+ "epoch": 0.12693935119887165,
572
+ "grad_norm": 13.375,
573
+ "learning_rate": 1.9155080932231852e-05,
574
+ "loss": 0.998,
575
+ "mean_token_accuracy": 0.7671383440494537,
576
+ "num_tokens": 694196.0,
577
+ "step": 630
578
+ },
579
+ {
580
+ "epoch": 0.12895426153536169,
581
+ "grad_norm": 11.625,
582
+ "learning_rate": 1.914164819665525e-05,
583
+ "loss": 0.9808,
584
+ "mean_token_accuracy": 0.7700311303138733,
585
+ "num_tokens": 704564.0,
586
+ "step": 640
587
+ },
588
+ {
589
+ "epoch": 0.1309691718718517,
590
+ "grad_norm": 13.25,
591
+ "learning_rate": 1.912821546107865e-05,
592
+ "loss": 1.0077,
593
+ "mean_token_accuracy": 0.7643253684043885,
594
+ "num_tokens": 715775.0,
595
+ "step": 650
596
+ },
597
+ {
598
+ "epoch": 0.13298408220834174,
599
+ "grad_norm": 13.625,
600
+ "learning_rate": 1.911478272550205e-05,
601
+ "loss": 0.9457,
602
+ "mean_token_accuracy": 0.7678769171237946,
603
+ "num_tokens": 726005.0,
604
+ "step": 660
605
+ },
606
+ {
607
+ "epoch": 0.13499899254483175,
608
+ "grad_norm": 13.5,
609
+ "learning_rate": 1.910134998992545e-05,
610
+ "loss": 1.0155,
611
+ "mean_token_accuracy": 0.7607427120208741,
612
+ "num_tokens": 738053.0,
613
+ "step": 670
614
+ },
615
+ {
616
+ "epoch": 0.1370139028813218,
617
+ "grad_norm": 11.6875,
618
+ "learning_rate": 1.908791725434885e-05,
619
+ "loss": 0.9395,
620
+ "mean_token_accuracy": 0.7723658442497253,
621
+ "num_tokens": 748480.0,
622
+ "step": 680
623
+ },
624
+ {
625
+ "epoch": 0.1390288132178118,
626
+ "grad_norm": 15.6875,
627
+ "learning_rate": 1.907448451877225e-05,
628
+ "loss": 0.9639,
629
+ "mean_token_accuracy": 0.7676171123981476,
630
+ "num_tokens": 759972.0,
631
+ "step": 690
632
+ },
633
+ {
634
+ "epoch": 0.14104372355430184,
635
+ "grad_norm": 12.875,
636
+ "learning_rate": 1.906105178319565e-05,
637
+ "loss": 0.9557,
638
+ "mean_token_accuracy": 0.7719902992248535,
639
+ "num_tokens": 771123.0,
640
+ "step": 700
641
+ },
642
+ {
643
+ "epoch": 0.14305863389079185,
644
+ "grad_norm": 12.8125,
645
+ "learning_rate": 1.904761904761905e-05,
646
+ "loss": 1.0022,
647
+ "mean_token_accuracy": 0.7667870819568634,
648
+ "num_tokens": 782532.0,
649
+ "step": 710
650
+ },
651
+ {
652
+ "epoch": 0.1450735442272819,
653
+ "grad_norm": 11.0625,
654
+ "learning_rate": 1.903418631204245e-05,
655
+ "loss": 0.9519,
656
+ "mean_token_accuracy": 0.7708106875419617,
657
+ "num_tokens": 794067.0,
658
+ "step": 720
659
+ },
660
+ {
661
+ "epoch": 0.14708845456377193,
662
+ "grad_norm": 14.125,
663
+ "learning_rate": 1.902075357646585e-05,
664
+ "loss": 0.9718,
665
+ "mean_token_accuracy": 0.766555666923523,
666
+ "num_tokens": 804871.0,
667
+ "step": 730
668
+ },
669
+ {
670
+ "epoch": 0.14910336490026194,
671
+ "grad_norm": 12.8125,
672
+ "learning_rate": 1.900732084088925e-05,
673
+ "loss": 0.9852,
674
+ "mean_token_accuracy": 0.7678309619426728,
675
+ "num_tokens": 815050.0,
676
+ "step": 740
677
+ },
678
+ {
679
+ "epoch": 0.15111827523675198,
680
+ "grad_norm": 11.3125,
681
+ "learning_rate": 1.8993888105312648e-05,
682
+ "loss": 0.9951,
683
+ "mean_token_accuracy": 0.7627758264541626,
684
+ "num_tokens": 826248.0,
685
+ "step": 750
686
+ },
687
+ {
688
+ "epoch": 0.153133185573242,
689
+ "grad_norm": 17.25,
690
+ "learning_rate": 1.8980455369736047e-05,
691
+ "loss": 1.0433,
692
+ "mean_token_accuracy": 0.7571396887302398,
693
+ "num_tokens": 835706.0,
694
+ "step": 760
695
+ },
696
+ {
697
+ "epoch": 0.15514809590973203,
698
+ "grad_norm": 10.9375,
699
+ "learning_rate": 1.896702263415945e-05,
700
+ "loss": 1.0518,
701
+ "mean_token_accuracy": 0.7517435431480408,
702
+ "num_tokens": 847261.0,
703
+ "step": 770
704
+ },
705
+ {
706
+ "epoch": 0.15716300624622204,
707
+ "grad_norm": 10.4375,
708
+ "learning_rate": 1.8953589898582848e-05,
709
+ "loss": 0.9629,
710
+ "mean_token_accuracy": 0.7732720315456391,
711
+ "num_tokens": 858655.0,
712
+ "step": 780
713
+ },
714
+ {
715
+ "epoch": 0.15917791658271208,
716
+ "grad_norm": 12.5,
717
+ "learning_rate": 1.8940157163006247e-05,
718
+ "loss": 1.0231,
719
+ "mean_token_accuracy": 0.7555422127246857,
720
+ "num_tokens": 870002.0,
721
+ "step": 790
722
+ },
723
+ {
724
+ "epoch": 0.1611928269192021,
725
+ "grad_norm": 11.0,
726
+ "learning_rate": 1.892672442742965e-05,
727
+ "loss": 1.1283,
728
+ "mean_token_accuracy": 0.7441882312297821,
729
+ "num_tokens": 881131.0,
730
+ "step": 800
731
+ },
732
+ {
733
+ "epoch": 0.16320773725569213,
734
+ "grad_norm": 12.4375,
735
+ "learning_rate": 1.8913291691853048e-05,
736
+ "loss": 1.0252,
737
+ "mean_token_accuracy": 0.7630669414997101,
738
+ "num_tokens": 893437.0,
739
+ "step": 810
740
+ },
741
+ {
742
+ "epoch": 0.16522264759218214,
743
+ "grad_norm": 11.0,
744
+ "learning_rate": 1.8899858956276447e-05,
745
+ "loss": 1.0528,
746
+ "mean_token_accuracy": 0.7483877301216125,
747
+ "num_tokens": 904976.0,
748
+ "step": 820
749
+ },
750
+ {
751
+ "epoch": 0.16723755792867218,
752
+ "grad_norm": 12.375,
753
+ "learning_rate": 1.8886426220699846e-05,
754
+ "loss": 0.8715,
755
+ "mean_token_accuracy": 0.7899761021137237,
756
+ "num_tokens": 915631.0,
757
+ "step": 830
758
+ },
759
+ {
760
+ "epoch": 0.1692524682651622,
761
+ "grad_norm": 13.375,
762
+ "learning_rate": 1.8872993485123248e-05,
763
+ "loss": 1.0548,
764
+ "mean_token_accuracy": 0.7494987368583679,
765
+ "num_tokens": 927141.0,
766
+ "step": 840
767
+ },
768
+ {
769
+ "epoch": 0.17126737860165223,
770
+ "grad_norm": 11.0,
771
+ "learning_rate": 1.8859560749546647e-05,
772
+ "loss": 0.9579,
773
+ "mean_token_accuracy": 0.7668360054492951,
774
+ "num_tokens": 938792.0,
775
+ "step": 850
776
+ },
777
+ {
778
+ "epoch": 0.17328228893814226,
779
+ "grad_norm": 13.125,
780
+ "learning_rate": 1.8846128013970046e-05,
781
+ "loss": 0.8595,
782
+ "mean_token_accuracy": 0.7870603501796722,
783
+ "num_tokens": 949894.0,
784
+ "step": 860
785
+ },
786
+ {
787
+ "epoch": 0.17529719927463228,
788
+ "grad_norm": 12.625,
789
+ "learning_rate": 1.8832695278393448e-05,
790
+ "loss": 0.9216,
791
+ "mean_token_accuracy": 0.7846542239189148,
792
+ "num_tokens": 961003.0,
793
+ "step": 870
794
+ },
795
+ {
796
+ "epoch": 0.17731210961112231,
797
+ "grad_norm": 12.125,
798
+ "learning_rate": 1.8819262542816847e-05,
799
+ "loss": 1.0052,
800
+ "mean_token_accuracy": 0.7603223979473114,
801
+ "num_tokens": 971577.0,
802
+ "step": 880
803
+ },
804
+ {
805
+ "epoch": 0.17932701994761233,
806
+ "grad_norm": 12.4375,
807
+ "learning_rate": 1.8805829807240245e-05,
808
+ "loss": 0.9299,
809
+ "mean_token_accuracy": 0.7757908642292023,
810
+ "num_tokens": 982234.0,
811
+ "step": 890
812
+ },
813
+ {
814
+ "epoch": 0.18134193028410237,
815
+ "grad_norm": 11.0,
816
+ "learning_rate": 1.8792397071663648e-05,
817
+ "loss": 1.0312,
818
+ "mean_token_accuracy": 0.7591780245304107,
819
+ "num_tokens": 992997.0,
820
+ "step": 900
821
+ },
822
+ {
823
+ "epoch": 0.18335684062059238,
824
+ "grad_norm": 10.5625,
825
+ "learning_rate": 1.8778964336087047e-05,
826
+ "loss": 0.8999,
827
+ "mean_token_accuracy": 0.779550439119339,
828
+ "num_tokens": 1004102.0,
829
+ "step": 910
830
+ },
831
+ {
832
+ "epoch": 0.18537175095708242,
833
+ "grad_norm": 12.625,
834
+ "learning_rate": 1.8765531600510445e-05,
835
+ "loss": 0.8892,
836
+ "mean_token_accuracy": 0.7890210688114166,
837
+ "num_tokens": 1015447.0,
838
+ "step": 920
839
+ },
840
+ {
841
+ "epoch": 0.18738666129357243,
842
+ "grad_norm": 12.125,
843
+ "learning_rate": 1.8752098864933844e-05,
844
+ "loss": 1.0344,
845
+ "mean_token_accuracy": 0.7584980130195618,
846
+ "num_tokens": 1026939.0,
847
+ "step": 930
848
+ },
849
+ {
850
+ "epoch": 0.18940157163006247,
851
+ "grad_norm": 10.9375,
852
+ "learning_rate": 1.8738666129357246e-05,
853
+ "loss": 0.9686,
854
+ "mean_token_accuracy": 0.7649740993976593,
855
+ "num_tokens": 1037937.0,
856
+ "step": 940
857
+ },
858
+ {
859
+ "epoch": 0.19141648196655248,
860
+ "grad_norm": 8.75,
861
+ "learning_rate": 1.8725233393780645e-05,
862
+ "loss": 1.0364,
863
+ "mean_token_accuracy": 0.7554452955722809,
864
+ "num_tokens": 1049173.0,
865
+ "step": 950
866
+ },
867
+ {
868
+ "epoch": 0.19343139230304252,
869
+ "grad_norm": 13.625,
870
+ "learning_rate": 1.8711800658204044e-05,
871
+ "loss": 1.0173,
872
+ "mean_token_accuracy": 0.7559767007827759,
873
+ "num_tokens": 1060166.0,
874
+ "step": 960
875
+ },
876
+ {
877
+ "epoch": 0.19544630263953255,
878
+ "grad_norm": 11.1875,
879
+ "learning_rate": 1.8698367922627446e-05,
880
+ "loss": 0.9464,
881
+ "mean_token_accuracy": 0.7735530078411103,
882
+ "num_tokens": 1070458.0,
883
+ "step": 970
884
+ },
885
+ {
886
+ "epoch": 0.19746121297602257,
887
+ "grad_norm": 11.1875,
888
+ "learning_rate": 1.8684935187050845e-05,
889
+ "loss": 0.9397,
890
+ "mean_token_accuracy": 0.7724673867225647,
891
+ "num_tokens": 1081477.0,
892
+ "step": 980
893
+ },
894
+ {
895
+ "epoch": 0.1994761233125126,
896
+ "grad_norm": 15.1875,
897
+ "learning_rate": 1.8671502451474244e-05,
898
+ "loss": 1.0769,
899
+ "mean_token_accuracy": 0.7459556341171265,
900
+ "num_tokens": 1094205.0,
901
+ "step": 990
902
+ },
903
+ {
904
+ "epoch": 0.20149103364900262,
905
+ "grad_norm": 16.5,
906
+ "learning_rate": 1.8658069715897643e-05,
907
+ "loss": 0.9763,
908
+ "mean_token_accuracy": 0.7707934081554413,
909
+ "num_tokens": 1104929.0,
910
+ "step": 1000
911
  }
912
  ],
913
  "logging_steps": 10,
 
927
  "attributes": {}
928
  }
929
  },
930
+ "total_flos": 1337180456005632.0,
931
  "train_batch_size": 8,
932
  "trial_name": null,
933
  "trial_params": null