furmaniak committed on
Commit
7f4abb5
·
verified ·
1 Parent(s): 1956989

End of training

Browse files
Files changed (5) hide show
  1. README.md +1 -1
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +606 -11
  5. training_loss.png +0 -0
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # pretrain
18
 
19
- This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on an unknown dataset.
20
 
21
  ## Model description
22
 
 
16
 
17
  # pretrain
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on the openalex_references dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.9967213114754099,
3
- "total_flos": 534721415086080.0,
4
- "train_loss": 1.1572106621767346,
5
- "train_runtime": 17915.2053,
6
- "train_samples_per_second": 0.545,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
+ "epoch": 0.9949787562765546,
3
+ "total_flos": 1132817220108288.0,
4
+ "train_loss": 0.578794286858221,
5
+ "train_runtime": 20320.7913,
6
+ "train_samples_per_second": 1.019,
7
+ "train_steps_per_second": 0.008
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.9967213114754099,
3
- "total_flos": 534721415086080.0,
4
- "train_loss": 1.1572106621767346,
5
- "train_runtime": 17915.2053,
6
- "train_samples_per_second": 0.545,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
+ "epoch": 0.9949787562765546,
3
+ "total_flos": 1132817220108288.0,
4
+ "train_loss": 0.578794286858221,
5
+ "train_runtime": 20320.7913,
6
+ "train_samples_per_second": 1.019,
7
+ "train_steps_per_second": 0.008
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9967213114754099,
5
  "eval_steps": 500,
6
- "global_step": 76,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -541,17 +541,612 @@
541
  "step": 76
542
  },
543
  {
544
- "epoch": 0.9967213114754099,
545
- "step": 76,
546
- "total_flos": 534721415086080.0,
547
- "train_loss": 1.1572106621767346,
548
- "train_runtime": 17915.2053,
549
- "train_samples_per_second": 0.545,
550
- "train_steps_per_second": 0.004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  }
552
  ],
553
  "logging_steps": 1,
554
- "max_steps": 76,
555
  "num_input_tokens_seen": 0,
556
  "num_train_epochs": 1,
557
  "save_steps": 100,
@@ -567,7 +1162,7 @@
567
  "attributes": {}
568
  }
569
  },
570
- "total_flos": 534721415086080.0,
571
  "train_batch_size": 1,
572
  "trial_name": null,
573
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9949787562765546,
5
  "eval_steps": 500,
6
+ "global_step": 161,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
541
  "step": 76
542
  },
543
  {
544
+ "epoch": 0.47585940517574354,
545
+ "grad_norm": 0.00927089061588049,
546
+ "learning_rate": 6.294095225512603e-05,
547
+ "loss": 1.0998,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.4820393974507532,
552
+ "grad_norm": 0.009278366342186928,
553
+ "learning_rate": 6.188429461630866e-05,
554
+ "loss": 1.0809,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.48821938972576284,
559
+ "grad_norm": 0.009307453408837318,
560
+ "learning_rate": 6.0821980696905146e-05,
561
+ "loss": 1.1079,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.4943993820007725,
566
+ "grad_norm": 0.008874714374542236,
567
+ "learning_rate": 5.9754516100806423e-05,
568
+ "loss": 1.0846,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.5005793742757821,
573
+ "grad_norm": 0.00868895836174488,
574
+ "learning_rate": 5.868240888334653e-05,
575
+ "loss": 1.0991,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.5067593665507918,
580
+ "grad_norm": 0.008949129842221737,
581
+ "learning_rate": 5.7606169309495836e-05,
582
+ "loss": 1.107,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.5129393588258014,
587
+ "grad_norm": 0.009207559749484062,
588
+ "learning_rate": 5.6526309611002594e-05,
589
+ "loss": 1.1034,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.5191193511008111,
594
+ "grad_norm": 0.009371085092425346,
595
+ "learning_rate": 5.544334374259823e-05,
596
+ "loss": 1.0936,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.5252993433758207,
601
+ "grad_norm": 0.009222784079611301,
602
+ "learning_rate": 5.435778713738292e-05,
603
+ "loss": 1.0909,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.5314793356508304,
608
+ "grad_norm": 0.00895879790186882,
609
+ "learning_rate": 5.327015646150716e-05,
610
+ "loss": 1.0871,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.5376593279258401,
615
+ "grad_norm": 0.008927428163588047,
616
+ "learning_rate": 5.218096936826681e-05,
617
+ "loss": 1.0917,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.5438393202008498,
622
+ "grad_norm": 0.00859418697655201,
623
+ "learning_rate": 5.1090744251728064e-05,
624
+ "loss": 1.1013,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.5500193124758594,
629
+ "grad_norm": 0.009128894656896591,
630
+ "learning_rate": 5e-05,
631
+ "loss": 1.0948,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.5561993047508691,
636
+ "grad_norm": 0.008752775378525257,
637
+ "learning_rate": 4.890925574827195e-05,
638
+ "loss": 1.103,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.5623792970258787,
643
+ "grad_norm": 0.009119733236730099,
644
+ "learning_rate": 4.781903063173321e-05,
645
+ "loss": 1.0858,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.5685592893008884,
650
+ "grad_norm": 0.009288666769862175,
651
+ "learning_rate": 4.6729843538492847e-05,
652
+ "loss": 1.0867,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.574739281575898,
657
+ "grad_norm": 0.0089786471799016,
658
+ "learning_rate": 4.564221286261709e-05,
659
+ "loss": 1.0861,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.5809192738509077,
664
+ "grad_norm": 0.008815642446279526,
665
+ "learning_rate": 4.4556656257401786e-05,
666
+ "loss": 1.0981,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.5870992661259173,
671
+ "grad_norm": 0.00881979987025261,
672
+ "learning_rate": 4.347369038899744e-05,
673
+ "loss": 1.1144,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.593279258400927,
678
+ "grad_norm": 0.009116360917687416,
679
+ "learning_rate": 4.239383069050417e-05,
680
+ "loss": 1.1074,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.5994592506759366,
685
+ "grad_norm": 0.008931254036724567,
686
+ "learning_rate": 4.131759111665349e-05,
687
+ "loss": 1.1069,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.6056392429509463,
692
+ "grad_norm": 0.00889168307185173,
693
+ "learning_rate": 4.0245483899193595e-05,
694
+ "loss": 1.1113,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.6118192352259559,
699
+ "grad_norm": 0.008884157054126263,
700
+ "learning_rate": 3.917801930309486e-05,
701
+ "loss": 1.0798,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.6179992275009656,
706
+ "grad_norm": 0.008808060549199581,
707
+ "learning_rate": 3.8115705383691355e-05,
708
+ "loss": 1.0835,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.6241792197759752,
713
+ "grad_norm": 0.009600223042070866,
714
+ "learning_rate": 3.705904774487396e-05,
715
+ "loss": 1.0937,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.6303592120509849,
720
+ "grad_norm": 0.009098890237510204,
721
+ "learning_rate": 3.60085492984504e-05,
722
+ "loss": 1.1008,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.6365392043259946,
727
+ "grad_norm": 0.009177979081869125,
728
+ "learning_rate": 3.4964710024786354e-05,
729
+ "loss": 1.096,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.6427191966010043,
734
+ "grad_norm": 0.008857106789946556,
735
+ "learning_rate": 3.392802673484193e-05,
736
+ "loss": 1.0815,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.6488991888760139,
741
+ "grad_norm": 0.009007126092910767,
742
+ "learning_rate": 3.289899283371657e-05,
743
+ "loss": 1.091,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.6550791811510236,
748
+ "grad_norm": 0.009332729503512383,
749
+ "learning_rate": 3.1878098085814924e-05,
750
+ "loss": 1.0834,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.6612591734260332,
755
+ "grad_norm": 0.009046237915754318,
756
+ "learning_rate": 3.086582838174551e-05,
757
+ "loss": 1.0844,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.6674391657010429,
762
+ "grad_norm": 0.008926077745854855,
763
+ "learning_rate": 2.9862665507063147e-05,
764
+ "loss": 1.0649,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.6736191579760525,
769
+ "grad_norm": 0.00914798304438591,
770
+ "learning_rate": 2.886908691296504e-05,
771
+ "loss": 1.0912,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.6797991502510622,
776
+ "grad_norm": 0.008840657770633698,
777
+ "learning_rate": 2.7885565489049946e-05,
778
+ "loss": 1.085,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.6859791425260718,
783
+ "grad_norm": 0.009009969420731068,
784
+ "learning_rate": 2.6912569338248315e-05,
785
+ "loss": 1.0945,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.6921591348010815,
790
+ "grad_norm": 0.008585930801928043,
791
+ "learning_rate": 2.595056155403063e-05,
792
+ "loss": 1.0973,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.6983391270760911,
797
+ "grad_norm": 0.009383322671055794,
798
+ "learning_rate": 2.500000000000001e-05,
799
+ "loss": 1.0918,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.7045191193511008,
804
+ "grad_norm": 0.009045167826116085,
805
+ "learning_rate": 2.4061337091973918e-05,
806
+ "loss": 1.1037,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.7106991116261104,
811
+ "grad_norm": 0.009319834411144257,
812
+ "learning_rate": 2.3135019582658802e-05,
813
+ "loss": 1.1059,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.7168791039011201,
818
+ "grad_norm": 0.008737134747207165,
819
+ "learning_rate": 2.2221488349019903e-05,
820
+ "loss": 1.0922,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.7230590961761297,
825
+ "grad_norm": 0.009303976781666279,
826
+ "learning_rate": 2.132117818244771e-05,
827
+ "loss": 1.0925,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.7292390884511394,
832
+ "grad_norm": 0.009160283021628857,
833
+ "learning_rate": 2.0434517581820896e-05,
834
+ "loss": 1.1057,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.7354190807261491,
839
+ "grad_norm": 0.009896110743284225,
840
+ "learning_rate": 1.9561928549563968e-05,
841
+ "loss": 1.1058,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.7415990730011588,
846
+ "grad_norm": 0.008766653947532177,
847
+ "learning_rate": 1.8703826390797048e-05,
848
+ "loss": 1.0959,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.7477790652761684,
853
+ "grad_norm": 0.008967863395810127,
854
+ "learning_rate": 1.7860619515673033e-05,
855
+ "loss": 1.1026,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.7539590575511781,
860
+ "grad_norm": 0.009152066893875599,
861
+ "learning_rate": 1.703270924499656e-05,
862
+ "loss": 1.0926,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.7601390498261877,
867
+ "grad_norm": 0.008764652535319328,
868
+ "learning_rate": 1.622048961921699e-05,
869
+ "loss": 1.0756,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.7663190421011974,
874
+ "grad_norm": 0.009184801019728184,
875
+ "learning_rate": 1.5424347210886538e-05,
876
+ "loss": 1.1013,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.772499034376207,
881
+ "grad_norm": 0.009281960316002369,
882
+ "learning_rate": 1.4644660940672627e-05,
883
+ "loss": 1.1041,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.7786790266512167,
888
+ "grad_norm": 0.009053783491253853,
889
+ "learning_rate": 1.3881801897012225e-05,
890
+ "loss": 1.1018,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.7848590189262263,
895
+ "grad_norm": 0.008994681760668755,
896
+ "learning_rate": 1.3136133159493802e-05,
897
+ "loss": 1.0951,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.791039011201236,
902
+ "grad_norm": 0.008957086130976677,
903
+ "learning_rate": 1.2408009626051137e-05,
904
+ "loss": 1.0848,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.7972190034762456,
909
+ "grad_norm": 0.008901839144527912,
910
+ "learning_rate": 1.1697777844051105e-05,
911
+ "loss": 1.0876,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.8033989957512553,
916
+ "grad_norm": 0.009184077382087708,
917
+ "learning_rate": 1.100577584535592e-05,
918
+ "loss": 1.0919,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.8095789880262649,
923
+ "grad_norm": 0.008814208209514618,
924
+ "learning_rate": 1.0332332985438248e-05,
925
+ "loss": 1.092,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.8157589803012746,
930
+ "grad_norm": 0.009356915950775146,
931
+ "learning_rate": 9.677769786625867e-06,
932
+ "loss": 1.092,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.8219389725762842,
937
+ "grad_norm": 0.009066778235137463,
938
+ "learning_rate": 9.042397785550405e-06,
939
+ "loss": 1.1062,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.8281189648512939,
944
+ "grad_norm": 0.009054549038410187,
945
+ "learning_rate": 8.426519384872733e-06,
946
+ "loss": 1.0959,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.8342989571263036,
951
+ "grad_norm": 0.009238997474312782,
952
+ "learning_rate": 7.830427709355725e-06,
953
+ "loss": 1.1261,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.8404789494013133,
958
+ "grad_norm": 0.009531921707093716,
959
+ "learning_rate": 7.2544064663526815e-06,
960
+ "loss": 1.1119,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.8466589416763229,
965
+ "grad_norm": 0.008905571885406971,
966
+ "learning_rate": 6.698729810778065e-06,
967
+ "loss": 1.0965,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.8528389339513326,
972
+ "grad_norm": 0.008772294037044048,
973
+ "learning_rate": 6.163662214624616e-06,
974
+ "loss": 1.0972,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.8590189262263422,
979
+ "grad_norm": 0.008754718117415905,
980
+ "learning_rate": 5.649458341088915e-06,
981
+ "loss": 1.0918,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.8651989185013519,
986
+ "grad_norm": 0.008972358889877796,
987
+ "learning_rate": 5.156362923365588e-06,
988
+ "loss": 1.1049,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.8713789107763615,
993
+ "grad_norm": 0.00903693214058876,
994
+ "learning_rate": 4.684610648167503e-06,
995
+ "loss": 1.0926,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.8775589030513712,
1000
+ "grad_norm": 0.009000574238598347,
1001
+ "learning_rate": 4.234426044027645e-06,
1002
+ "loss": 1.1078,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.8837388953263808,
1007
+ "grad_norm": 0.009073416702449322,
1008
+ "learning_rate": 3.8060233744356633e-06,
1009
+ "loss": 1.111,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.8899188876013905,
1014
+ "grad_norm": 0.009199617430567741,
1015
+ "learning_rate": 3.3996065358600782e-06,
1016
+ "loss": 1.0996,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.8960988798764001,
1021
+ "grad_norm": 0.009446380659937859,
1022
+ "learning_rate": 3.0153689607045845e-06,
1023
+ "loss": 1.0956,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.9022788721514098,
1028
+ "grad_norm": 0.00881500355899334,
1029
+ "learning_rate": 2.653493525244721e-06,
1030
+ "loss": 1.0953,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.9084588644264194,
1035
+ "grad_norm": 0.009240192361176014,
1036
+ "learning_rate": 2.314152462588659e-06,
1037
+ "loss": 1.1046,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.9146388567014291,
1042
+ "grad_norm": 0.009173831902444363,
1043
+ "learning_rate": 1.99750728070357e-06,
1044
+ "loss": 1.097,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.9208188489764387,
1049
+ "grad_norm": 0.009656915441155434,
1050
+ "learning_rate": 1.70370868554659e-06,
1051
+ "loss": 1.0777,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.9269988412514485,
1056
+ "grad_norm": 0.008921938017010689,
1057
+ "learning_rate": 1.4328965093369283e-06,
1058
+ "loss": 1.0916,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.9331788335264581,
1063
+ "grad_norm": 0.009205098263919353,
1064
+ "learning_rate": 1.1851996440033319e-06,
1065
+ "loss": 1.1057,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 0.9393588258014678,
1070
+ "grad_norm": 0.008920296095311642,
1071
+ "learning_rate": 9.607359798384785e-07,
1072
+ "loss": 1.0906,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 0.9455388180764774,
1077
+ "grad_norm": 0.009275338612496853,
1078
+ "learning_rate": 7.596123493895991e-07,
1079
+ "loss": 1.1111,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 0.9517188103514871,
1084
+ "grad_norm": 0.008771958760917187,
1085
+ "learning_rate": 5.81924476611967e-07,
1086
+ "loss": 1.1001,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 0.9578988026264967,
1091
+ "grad_norm": 0.009109330363571644,
1092
+ "learning_rate": 4.277569313094809e-07,
1093
+ "loss": 1.0804,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 0.9640787949015064,
1098
+ "grad_norm": 0.009273674339056015,
1099
+ "learning_rate": 2.971830888840177e-07,
1100
+ "loss": 1.0919,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 0.970258787176516,
1105
+ "grad_norm": 0.008920193649828434,
1106
+ "learning_rate": 1.9026509541272275e-07,
1107
+ "loss": 1.0908,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 0.9764387794515257,
1112
+ "grad_norm": 0.008690367452800274,
1113
+ "learning_rate": 1.0705383806982606e-07,
1114
+ "loss": 1.1054,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 0.9826187717265353,
1119
+ "grad_norm": 0.009283354505896568,
1120
+ "learning_rate": 4.7588920907110094e-08,
1121
+ "loss": 1.1086,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 0.988798764001545,
1126
+ "grad_norm": 0.009044487960636616,
1127
+ "learning_rate": 1.189864600454338e-08,
1128
+ "loss": 1.117,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 0.9949787562765546,
1133
+ "grad_norm": 0.009743698872625828,
1134
+ "learning_rate": 0.0,
1135
+ "loss": 1.0973,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 0.9949787562765546,
1140
+ "step": 161,
1141
+ "total_flos": 1132817220108288.0,
1142
+ "train_loss": 0.578794286858221,
1143
+ "train_runtime": 20320.7913,
1144
+ "train_samples_per_second": 1.019,
1145
+ "train_steps_per_second": 0.008
1146
  }
1147
  ],
1148
  "logging_steps": 1,
1149
+ "max_steps": 161,
1150
  "num_input_tokens_seen": 0,
1151
  "num_train_epochs": 1,
1152
  "save_steps": 100,
 
1162
  "attributes": {}
1163
  }
1164
  },
1165
+ "total_flos": 1132817220108288.0,
1166
  "train_batch_size": 1,
1167
  "trial_name": null,
1168
  "trial_params": null
training_loss.png CHANGED