JohnMarble committed on
Commit
83216ff
·
verified ·
1 Parent(s): 6a888ad

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -20,9 +20,13 @@
20
  "pad_token_id": 3,
21
  "partial_rotary_factor": 0.5,
22
  "rms_norm_eps": 1.5625e-07,
23
- "rope_theta": 10000.0,
 
 
 
 
24
  "tie_word_embeddings": false,
25
- "transformers_version": "4.57.1",
26
  "use_cache": false,
27
  "vocab_size": 32000
28
  }
 
20
  "pad_token_id": 3,
21
  "partial_rotary_factor": 0.5,
22
  "rms_norm_eps": 1.5625e-07,
23
+ "rope_parameters": {
24
+ "partial_rotary_factor": 0.5,
25
+ "rope_theta": 10000.0,
26
+ "rope_type": "default"
27
+ },
28
  "tie_word_embeddings": false,
29
+ "transformers_version": "5.0.0",
30
  "use_cache": false,
31
  "vocab_size": 32000
32
  }
last-checkpoint/generation_config.json CHANGED
@@ -2,7 +2,9 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
 
 
5
  "pad_token_id": 3,
6
- "transformers_version": "4.57.1",
7
  "use_cache": false
8
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
  "pad_token_id": 3,
8
+ "transformers_version": "5.0.0",
9
  "use_cache": false
10
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a828e44ea590620dd6e370c9d40ba9bc1bbd4f4347064955ff8f505a434b8f7
3
  size 846294848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90da201742c3407f5b628001559e8cf49801a87f3a796b6cdea47d50c92b281f
3
  size 846294848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca08361236dad76d92438e8c7f7d4ff7f0211f5008cdfe13993144a24be3871d
3
  size 1692647947
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8513713413166c8f257a9925e268ea1a0d5430a79d9d6d7fa6952d27dbe62557
3
  size 1692647947
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97f621c40c37c796d75663f728f7490ddcd9db068eaa82d92691bb9a37ff256f
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9445552595536daf5bd8731be4eabb308bd26e76a3f4f0c20c4aa55fcf9ea202
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0033c7745b46bdca3ecab5787678834ca68f7f7e1288869dceeb38812abc253
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ef18b6bb7867a00caaf997560388e8adda0cd2d38d75f02294c699351b4d5ce
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c604541636f2426a1e67c6a83e8d4da5024a11c0f0ec5748139dfa5cdd27427
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc5ce31d63bcbfc2427ac8bcbb3ed9994536852fec0d908b961baa42c8630d30
3
  size 1465
last-checkpoint/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json CHANGED
@@ -1,73 +1,14 @@
1
  {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[UNK]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "[CLS]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "[SEP]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "[PAD]",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "4": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- },
43
- "5": {
44
- "content": "[gMASK]",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": true
50
- },
51
- "6": {
52
- "content": "[sMASK]",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- }
59
- },
60
- "additional_special_tokens": [
61
  "[gMASK]",
62
  "[sMASK]"
63
  ],
64
- "bos_token": "[CLS]",
65
- "clean_up_tokenization_spaces": false,
66
- "eos_token": "[SEP]",
67
- "extra_special_tokens": {},
68
  "mask_token": "[MASK]",
69
  "model_max_length": 1000000000000000019884624838656,
70
  "pad_token": "[PAD]",
71
- "tokenizer_class": "PreTrainedTokenizerFast",
72
  "unk_token": "[UNK]"
73
  }
 
1
  {
2
+ "backend": "tokenizers",
3
+ "bos_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "extra_special_tokens": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "[gMASK]",
7
  "[sMASK]"
8
  ],
 
 
 
 
9
  "mask_token": "[MASK]",
10
  "model_max_length": 1000000000000000019884624838656,
11
  "pad_token": "[PAD]",
12
+ "tokenizer_class": "TokenizersBackend",
13
  "unk_token": "[UNK]"
14
  }
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 3000,
3
  "best_metric": 3.3578011989593506,
4
  "best_model_checkpoint": "./vi-en-glm-model/checkpoint-3000",
5
- "epoch": 6.441579371474617,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -632,10 +632,946 @@
632
  "eval_samples_per_second": 51.891,
633
  "eval_steps_per_second": 3.244,
634
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  }
636
  ],
637
  "logging_steps": 50,
638
- "max_steps": 6210,
639
  "num_input_tokens_seen": 0,
640
  "num_train_epochs": 10,
641
  "save_steps": 500,
@@ -646,7 +1582,7 @@
646
  "early_stopping_threshold": 0.0
647
  },
648
  "attributes": {
649
- "early_stopping_patience_counter": 2
650
  }
651
  },
652
  "TrainerControl": {
@@ -655,12 +1591,12 @@
655
  "should_evaluate": false,
656
  "should_log": false,
657
  "should_save": true,
658
- "should_training_stop": false
659
  },
660
  "attributes": {}
661
  }
662
  },
663
- "total_flos": 6.135453553629594e+17,
664
  "train_batch_size": 16,
665
  "trial_name": null,
666
  "trial_params": null
 
2
  "best_global_step": 3000,
3
  "best_metric": 3.3578011989593506,
4
  "best_model_checkpoint": "./vi-en-glm-model/checkpoint-3000",
5
+ "epoch": 8.006406406406406,
6
  "eval_steps": 500,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
632
  "eval_samples_per_second": 51.891,
633
  "eval_steps_per_second": 3.244,
634
  "step": 4000
635
+ },
636
+ {
637
+ "epoch": 3.2426426426426427,
638
+ "grad_norm": 0.7968094944953918,
639
+ "learning_rate": 0.0004041065569523268,
640
+ "loss": 9.003502197265625,
641
+ "step": 4050
642
+ },
643
+ {
644
+ "epoch": 3.2826826826826827,
645
+ "grad_norm": 0.8398051261901855,
646
+ "learning_rate": 0.00040148700747181376,
647
+ "loss": 7.386565551757813,
648
+ "step": 4100
649
+ },
650
+ {
651
+ "epoch": 3.3227227227227227,
652
+ "grad_norm": 0.6804362535476685,
653
+ "learning_rate": 0.0003988409074541256,
654
+ "loss": 6.411687622070312,
655
+ "step": 4150
656
+ },
657
+ {
658
+ "epoch": 3.3627627627627628,
659
+ "grad_norm": 0.5826159119606018,
660
+ "learning_rate": 0.0003961687206708958,
661
+ "loss": 5.595321044921875,
662
+ "step": 4200
663
+ },
664
+ {
665
+ "epoch": 3.402802802802803,
666
+ "grad_norm": 0.5103576183319092,
667
+ "learning_rate": 0.0003934709154658834,
668
+ "loss": 5.196689453125,
669
+ "step": 4250
670
+ },
671
+ {
672
+ "epoch": 3.442842842842843,
673
+ "grad_norm": 0.5858429670333862,
674
+ "learning_rate": 0.00039074796467288833,
675
+ "loss": 4.971948852539063,
676
+ "step": 4300
677
+ },
678
+ {
679
+ "epoch": 3.482882882882883,
680
+ "grad_norm": 0.5835692286491394,
681
+ "learning_rate": 0.00038800034553287976,
682
+ "loss": 4.809457397460937,
683
+ "step": 4350
684
+ },
685
+ {
686
+ "epoch": 3.522922922922923,
687
+ "grad_norm": 0.5084450244903564,
688
+ "learning_rate": 0.0003852285396103518,
689
+ "loss": 4.6604766845703125,
690
+ "step": 4400
691
+ },
692
+ {
693
+ "epoch": 3.562962962962963,
694
+ "grad_norm": 0.5399057269096375,
695
+ "learning_rate": 0.0003824330327089215,
696
+ "loss": 4.551234436035156,
697
+ "step": 4450
698
+ },
699
+ {
700
+ "epoch": 3.603003003003003,
701
+ "grad_norm": 0.4749791920185089,
702
+ "learning_rate": 0.0003796143147861845,
703
+ "loss": 4.42959228515625,
704
+ "step": 4500
705
+ },
706
+ {
707
+ "epoch": 3.603003003003003,
708
+ "eval_loss": 4.379153728485107,
709
+ "eval_runtime": 236.0735,
710
+ "eval_samples_per_second": 83.385,
711
+ "eval_steps_per_second": 10.425,
712
+ "step": 4500
713
+ },
714
+ {
715
+ "epoch": 3.643043043043043,
716
+ "grad_norm": 0.4609665274620056,
717
+ "learning_rate": 0.000376772879867841,
718
+ "loss": 4.312644653320312,
719
+ "step": 4550
720
+ },
721
+ {
722
+ "epoch": 3.683083083083083,
723
+ "grad_norm": 0.48116734623908997,
724
+ "learning_rate": 0.0003739092259611112,
725
+ "loss": 4.297572326660156,
726
+ "step": 4600
727
+ },
728
+ {
729
+ "epoch": 3.723123123123123,
730
+ "grad_norm": 0.43770402669906616,
731
+ "learning_rate": 0.00037102385496745025,
732
+ "loss": 4.211837158203125,
733
+ "step": 4650
734
+ },
735
+ {
736
+ "epoch": 3.763163163163163,
737
+ "grad_norm": 0.44192102551460266,
738
+ "learning_rate": 0.00036811727259458243,
739
+ "loss": 4.156084594726562,
740
+ "step": 4700
741
+ },
742
+ {
743
+ "epoch": 3.803203203203203,
744
+ "grad_norm": 0.45893535017967224,
745
+ "learning_rate": 0.0003651899882678681,
746
+ "loss": 4.092442626953125,
747
+ "step": 4750
748
+ },
749
+ {
750
+ "epoch": 3.8432432432432435,
751
+ "grad_norm": 0.4549737870693207,
752
+ "learning_rate": 0.0003622425150410182,
753
+ "loss": 4.083527526855469,
754
+ "step": 4800
755
+ },
756
+ {
757
+ "epoch": 3.8832832832832835,
758
+ "grad_norm": 0.42553290724754333,
759
+ "learning_rate": 0.00035927536950617326,
760
+ "loss": 4.043920288085937,
761
+ "step": 4850
762
+ },
763
+ {
764
+ "epoch": 3.9233233233233236,
765
+ "grad_norm": 0.4284481108188629,
766
+ "learning_rate": 0.00035628907170336295,
767
+ "loss": 3.9837408447265625,
768
+ "step": 4900
769
+ },
770
+ {
771
+ "epoch": 3.9633633633633636,
772
+ "grad_norm": 0.438009649515152,
773
+ "learning_rate": 0.00035328414502935993,
774
+ "loss": 3.967952880859375,
775
+ "step": 4950
776
+ },
777
+ {
778
+ "epoch": 4.003203203203203,
779
+ "grad_norm": 0.4328106641769409,
780
+ "learning_rate": 0.0003502611161459465,
781
+ "loss": 3.903425598144531,
782
+ "step": 5000
783
+ },
784
+ {
785
+ "epoch": 4.003203203203203,
786
+ "eval_loss": 3.910961627960205,
787
+ "eval_runtime": 235.6942,
788
+ "eval_samples_per_second": 83.519,
789
+ "eval_steps_per_second": 10.441,
790
+ "step": 5000
791
+ },
792
+ {
793
+ "epoch": 4.043243243243243,
794
+ "grad_norm": 0.43771296739578247,
795
+ "learning_rate": 0.000347220514887609,
796
+ "loss": 3.749495849609375,
797
+ "step": 5050
798
+ },
799
+ {
800
+ "epoch": 4.083283283283284,
801
+ "grad_norm": 0.4155007600784302,
802
+ "learning_rate": 0.0003441628741686749,
803
+ "loss": 3.7494277954101562,
804
+ "step": 5100
805
+ },
806
+ {
807
+ "epoch": 4.123323323323324,
808
+ "grad_norm": 0.45008692145347595,
809
+ "learning_rate": 0.0003410887298899118,
810
+ "loss": 3.7609121704101565,
811
+ "step": 5150
812
+ },
813
+ {
814
+ "epoch": 4.163363363363364,
815
+ "grad_norm": 0.4209502935409546,
816
+ "learning_rate": 0.00033799862084460196,
817
+ "loss": 3.71232177734375,
818
+ "step": 5200
819
+ },
820
+ {
821
+ "epoch": 4.203403403403404,
822
+ "grad_norm": 0.4234021306037903,
823
+ "learning_rate": 0.00033489308862410974,
824
+ "loss": 3.7151113891601564,
825
+ "step": 5250
826
+ },
827
+ {
828
+ "epoch": 4.243443443443444,
829
+ "grad_norm": 0.4215286374092102,
830
+ "learning_rate": 0.00033177267752295943,
831
+ "loss": 3.697460021972656,
832
+ "step": 5300
833
+ },
834
+ {
835
+ "epoch": 4.283483483483484,
836
+ "grad_norm": 0.38700735569000244,
837
+ "learning_rate": 0.0003286379344434388,
838
+ "loss": 3.68086669921875,
839
+ "step": 5350
840
+ },
841
+ {
842
+ "epoch": 4.323523523523524,
843
+ "grad_norm": 0.44765904545783997,
844
+ "learning_rate": 0.00032548940879974545,
845
+ "loss": 3.6621414184570313,
846
+ "step": 5400
847
+ },
848
+ {
849
+ "epoch": 4.363563563563564,
850
+ "grad_norm": 0.4115334451198578,
851
+ "learning_rate": 0.00032232765242169346,
852
+ "loss": 3.6620281982421874,
853
+ "step": 5450
854
+ },
855
+ {
856
+ "epoch": 4.403603603603604,
857
+ "grad_norm": 0.40621140599250793,
858
+ "learning_rate": 0.0003191532194579959,
859
+ "loss": 3.614684753417969,
860
+ "step": 5500
861
+ },
862
+ {
863
+ "epoch": 4.403603603603604,
864
+ "eval_loss": 3.704507350921631,
865
+ "eval_runtime": 236.3805,
866
+ "eval_samples_per_second": 83.277,
867
+ "eval_steps_per_second": 10.411,
868
+ "step": 5500
869
+ },
870
+ {
871
+ "epoch": 4.443643643643644,
872
+ "grad_norm": 0.41125524044036865,
873
+ "learning_rate": 0.0003159666662791416,
874
+ "loss": 3.634700927734375,
875
+ "step": 5550
876
+ },
877
+ {
878
+ "epoch": 4.483683683683684,
879
+ "grad_norm": 0.4052564203739166,
880
+ "learning_rate": 0.00031276855137988256,
881
+ "loss": 3.620810241699219,
882
+ "step": 5600
883
+ },
884
+ {
885
+ "epoch": 4.523723723723724,
886
+ "grad_norm": 0.40402984619140625,
887
+ "learning_rate": 0.0003095594352813481,
888
+ "loss": 3.6102496337890626,
889
+ "step": 5650
890
+ },
891
+ {
892
+ "epoch": 4.563763763763764,
893
+ "grad_norm": 0.3992173969745636,
894
+ "learning_rate": 0.00030633988043280493,
895
+ "loss": 3.5947637939453125,
896
+ "step": 5700
897
+ },
898
+ {
899
+ "epoch": 4.603803803803804,
900
+ "grad_norm": 0.3955991566181183,
901
+ "learning_rate": 0.00030311045111307885,
902
+ "loss": 3.5669595336914064,
903
+ "step": 5750
904
+ },
905
+ {
906
+ "epoch": 4.643843843843844,
907
+ "grad_norm": 0.37418413162231445,
908
+ "learning_rate": 0.0002998717133316557,
909
+ "loss": 3.546644287109375,
910
+ "step": 5800
911
+ },
912
+ {
913
+ "epoch": 4.683883883883884,
914
+ "grad_norm": 0.3923473656177521,
915
+ "learning_rate": 0.00029662423472947896,
916
+ "loss": 3.5425369262695314,
917
+ "step": 5850
918
+ },
919
+ {
920
+ "epoch": 4.723923923923924,
921
+ "grad_norm": 0.3697595000267029,
922
+ "learning_rate": 0.00029336858447946197,
923
+ "loss": 3.53588623046875,
924
+ "step": 5900
925
+ },
926
+ {
927
+ "epoch": 4.763963963963964,
928
+ "grad_norm": 0.3823640048503876,
929
+ "learning_rate": 0.0002901053331867307,
930
+ "loss": 3.5406414794921877,
931
+ "step": 5950
932
+ },
933
+ {
934
+ "epoch": 4.804004004004004,
935
+ "grad_norm": 0.4052514433860779,
936
+ "learning_rate": 0.00028683505278861635,
937
+ "loss": 3.522510681152344,
938
+ "step": 6000
939
+ },
940
+ {
941
+ "epoch": 4.804004004004004,
942
+ "eval_loss": 3.566638231277466,
943
+ "eval_runtime": 236.1197,
944
+ "eval_samples_per_second": 83.369,
945
+ "eval_steps_per_second": 10.423,
946
+ "step": 6000
947
+ },
948
+ {
949
+ "epoch": 4.844044044044044,
950
+ "grad_norm": 0.4094746708869934,
951
+ "learning_rate": 0.0002835583164544139,
952
+ "loss": 3.519625549316406,
953
+ "step": 6050
954
+ },
955
+ {
956
+ "epoch": 4.884084084084084,
957
+ "grad_norm": 0.6515923738479614,
958
+ "learning_rate": 0.0002802756984849252,
959
+ "loss": 3.4766189575195314,
960
+ "step": 6100
961
+ },
962
+ {
963
+ "epoch": 4.924124124124124,
964
+ "grad_norm": 0.384624719619751,
965
+ "learning_rate": 0.00027698777421180336,
966
+ "loss": 3.4937869262695314,
967
+ "step": 6150
968
+ },
969
+ {
970
+ "epoch": 4.964164164164164,
971
+ "grad_norm": 0.3783823549747467,
972
+ "learning_rate": 0.00027369511989671665,
973
+ "loss": 3.4830267333984377,
974
+ "step": 6200
975
+ },
976
+ {
977
+ "epoch": 5.004004004004004,
978
+ "grad_norm": 0.4266754984855652,
979
+ "learning_rate": 0.00027039831263034916,
980
+ "loss": 3.453427734375,
981
+ "step": 6250
982
+ },
983
+ {
984
+ "epoch": 5.044044044044044,
985
+ "grad_norm": 0.39402705430984497,
986
+ "learning_rate": 0.0002670979302312569,
987
+ "loss": 3.2129632568359376,
988
+ "step": 6300
989
+ },
990
+ {
991
+ "epoch": 5.084084084084084,
992
+ "grad_norm": 0.4370722472667694,
993
+ "learning_rate": 0.00026379455114459527,
994
+ "loss": 3.241674499511719,
995
+ "step": 6350
996
+ },
997
+ {
998
+ "epoch": 5.124124124124124,
999
+ "grad_norm": 0.397320419549942,
1000
+ "learning_rate": 0.00026048875434073724,
1001
+ "loss": 3.1959967041015624,
1002
+ "step": 6400
1003
+ },
1004
+ {
1005
+ "epoch": 5.1641641641641645,
1006
+ "grad_norm": 0.4379047751426697,
1007
+ "learning_rate": 0.00025718111921380006,
1008
+ "loss": 3.222423400878906,
1009
+ "step": 6450
1010
+ },
1011
+ {
1012
+ "epoch": 5.2042042042042045,
1013
+ "grad_norm": 0.4231470227241516,
1014
+ "learning_rate": 0.00025387222548009633,
1015
+ "loss": 3.236834716796875,
1016
+ "step": 6500
1017
+ },
1018
+ {
1019
+ "epoch": 5.2042042042042045,
1020
+ "eval_loss": 3.5047855377197266,
1021
+ "eval_runtime": 237.1477,
1022
+ "eval_samples_per_second": 83.007,
1023
+ "eval_steps_per_second": 10.377,
1024
+ "step": 6500
1025
+ },
1026
+ {
1027
+ "epoch": 5.2442442442442445,
1028
+ "grad_norm": 0.40576690435409546,
1029
+ "learning_rate": 0.00025056265307652983,
1030
+ "loss": 3.2533297729492188,
1031
+ "step": 6550
1032
+ },
1033
+ {
1034
+ "epoch": 5.2842842842842845,
1035
+ "grad_norm": 0.40202781558036804,
1036
+ "learning_rate": 0.0002472529820589524,
1037
+ "loss": 3.2079669189453126,
1038
+ "step": 6600
1039
+ },
1040
+ {
1041
+ "epoch": 5.324324324324325,
1042
+ "grad_norm": 0.40235257148742676,
1043
+ "learning_rate": 0.00024394379250049927,
1044
+ "loss": 3.2289459228515627,
1045
+ "step": 6650
1046
+ },
1047
+ {
1048
+ "epoch": 5.364364364364365,
1049
+ "grad_norm": 0.39760297536849976,
1050
+ "learning_rate": 0.00024063566438992237,
1051
+ "loss": 3.222519836425781,
1052
+ "step": 6700
1053
+ },
1054
+ {
1055
+ "epoch": 5.404404404404405,
1056
+ "grad_norm": 0.3901459276676178,
1057
+ "learning_rate": 0.00023732917752993768,
1058
+ "loss": 3.235470886230469,
1059
+ "step": 6750
1060
+ },
1061
+ {
1062
+ "epoch": 5.444444444444445,
1063
+ "grad_norm": 0.4042525291442871,
1064
+ "learning_rate": 0.0002340249114356058,
1065
+ "loss": 3.236663513183594,
1066
+ "step": 6800
1067
+ },
1068
+ {
1069
+ "epoch": 5.484484484484485,
1070
+ "grad_norm": 0.3963824510574341,
1071
+ "learning_rate": 0.00023072344523276218,
1072
+ "loss": 3.2130169677734375,
1073
+ "step": 6850
1074
+ },
1075
+ {
1076
+ "epoch": 5.524524524524525,
1077
+ "grad_norm": 0.4119073450565338,
1078
+ "learning_rate": 0.00022742535755651623,
1079
+ "loss": 3.2359417724609374,
1080
+ "step": 6900
1081
+ },
1082
+ {
1083
+ "epoch": 5.564564564564565,
1084
+ "grad_norm": 0.41091352701187134,
1085
+ "learning_rate": 0.00022413122644983637,
1086
+ "loss": 3.218328552246094,
1087
+ "step": 6950
1088
+ },
1089
+ {
1090
+ "epoch": 5.604604604604605,
1091
+ "grad_norm": 0.37782832980155945,
1092
+ "learning_rate": 0.00022084162926223823,
1093
+ "loss": 3.2080789184570313,
1094
+ "step": 7000
1095
+ },
1096
+ {
1097
+ "epoch": 5.604604604604605,
1098
+ "eval_loss": 3.4369895458221436,
1099
+ "eval_runtime": 235.6375,
1100
+ "eval_samples_per_second": 83.539,
1101
+ "eval_steps_per_second": 10.444,
1102
+ "step": 7000
1103
+ },
1104
+ {
1105
+ "epoch": 5.644644644644645,
1106
+ "grad_norm": 0.4090370833873749,
1107
+ "learning_rate": 0.00021755714254859533,
1108
+ "loss": 3.2288442993164064,
1109
+ "step": 7050
1110
+ },
1111
+ {
1112
+ "epoch": 5.684684684684685,
1113
+ "grad_norm": 0.4288695752620697,
1114
+ "learning_rate": 0.00021427834196808816,
1115
+ "loss": 3.22770751953125,
1116
+ "step": 7100
1117
+ },
1118
+ {
1119
+ "epoch": 5.724724724724725,
1120
+ "grad_norm": 0.3806166648864746,
1121
+ "learning_rate": 0.00021100580218331094,
1122
+ "loss": 3.2031744384765624,
1123
+ "step": 7150
1124
+ },
1125
+ {
1126
+ "epoch": 5.764764764764765,
1127
+ "grad_norm": 0.41686615347862244,
1128
+ "learning_rate": 0.00020774009675955278,
1129
+ "loss": 3.198349609375,
1130
+ "step": 7200
1131
+ },
1132
+ {
1133
+ "epoch": 5.804804804804805,
1134
+ "grad_norm": 0.39290764927864075,
1135
+ "learning_rate": 0.00020448179806427068,
1136
+ "loss": 3.1860992431640627,
1137
+ "step": 7250
1138
+ },
1139
+ {
1140
+ "epoch": 5.844844844844845,
1141
+ "grad_norm": 0.40635761618614197,
1142
+ "learning_rate": 0.0002012314771667734,
1143
+ "loss": 3.2112152099609377,
1144
+ "step": 7300
1145
+ },
1146
+ {
1147
+ "epoch": 5.884884884884885,
1148
+ "grad_norm": 0.3944772779941559,
1149
+ "learning_rate": 0.00019798970373813214,
1150
+ "loss": 3.1969940185546877,
1151
+ "step": 7350
1152
+ },
1153
+ {
1154
+ "epoch": 5.924924924924925,
1155
+ "grad_norm": 0.39625492691993713,
1156
+ "learning_rate": 0.0001947570459513365,
1157
+ "loss": 3.2008709716796875,
1158
+ "step": 7400
1159
+ },
1160
+ {
1161
+ "epoch": 5.964964964964965,
1162
+ "grad_norm": 0.4190053343772888,
1163
+ "learning_rate": 0.0001915340703817131,
1164
+ "loss": 3.2011529541015626,
1165
+ "step": 7450
1166
+ },
1167
+ {
1168
+ "epoch": 6.004804804804805,
1169
+ "grad_norm": 0.48919281363487244,
1170
+ "learning_rate": 0.00018832134190762434,
1171
+ "loss": 3.156895751953125,
1172
+ "step": 7500
1173
+ },
1174
+ {
1175
+ "epoch": 6.004804804804805,
1176
+ "eval_loss": 3.4287805557250977,
1177
+ "eval_runtime": 237.2268,
1178
+ "eval_samples_per_second": 82.98,
1179
+ "eval_steps_per_second": 10.374,
1180
+ "step": 7500
1181
+ },
1182
+ {
1183
+ "epoch": 6.044844844844845,
1184
+ "grad_norm": 0.4277288615703583,
1185
+ "learning_rate": 0.0001851194236114638,
1186
+ "loss": 2.862073974609375,
1187
+ "step": 7550
1188
+ },
1189
+ {
1190
+ "epoch": 6.084884884884885,
1191
+ "grad_norm": 0.4406304955482483,
1192
+ "learning_rate": 0.00018192887668096752,
1193
+ "loss": 2.8927264404296875,
1194
+ "step": 7600
1195
+ },
1196
+ {
1197
+ "epoch": 6.124924924924925,
1198
+ "grad_norm": 0.4504638612270355,
1199
+ "learning_rate": 0.00017875026031085648,
1200
+ "loss": 2.8985806274414063,
1201
+ "step": 7650
1202
+ },
1203
+ {
1204
+ "epoch": 6.164964964964965,
1205
+ "grad_norm": 0.45850783586502075,
1206
+ "learning_rate": 0.0001755841316048289,
1207
+ "loss": 2.907267761230469,
1208
+ "step": 7700
1209
+ },
1210
+ {
1211
+ "epoch": 6.205005005005005,
1212
+ "grad_norm": 0.437261700630188,
1213
+ "learning_rate": 0.000172431045477919,
1214
+ "loss": 2.9213519287109375,
1215
+ "step": 7750
1216
+ },
1217
+ {
1218
+ "epoch": 6.245045045045045,
1219
+ "grad_norm": 0.46647876501083374,
1220
+ "learning_rate": 0.00016929155455923872,
1221
+ "loss": 2.9296710205078127,
1222
+ "step": 7800
1223
+ },
1224
+ {
1225
+ "epoch": 6.285085085085085,
1226
+ "grad_norm": 0.445600301027298,
1227
+ "learning_rate": 0.00016616620909512108,
1228
+ "loss": 2.9163543701171877,
1229
+ "step": 7850
1230
+ },
1231
+ {
1232
+ "epoch": 6.325125125125125,
1233
+ "grad_norm": 0.45171597599983215,
1234
+ "learning_rate": 0.00016305555685268026,
1235
+ "loss": 2.937907409667969,
1236
+ "step": 7900
1237
+ },
1238
+ {
1239
+ "epoch": 6.365165165165165,
1240
+ "grad_norm": 0.44579142332077026,
1241
+ "learning_rate": 0.0001599601430238068,
1242
+ "loss": 2.953569030761719,
1243
+ "step": 7950
1244
+ },
1245
+ {
1246
+ "epoch": 6.405205205205205,
1247
+ "grad_norm": 0.4332555830478668,
1248
+ "learning_rate": 0.00015688051012961395,
1249
+ "loss": 2.922398681640625,
1250
+ "step": 8000
1251
+ },
1252
+ {
1253
+ "epoch": 6.405205205205205,
1254
+ "eval_loss": 3.403954029083252,
1255
+ "eval_runtime": 236.6606,
1256
+ "eval_samples_per_second": 83.178,
1257
+ "eval_steps_per_second": 10.399,
1258
+ "step": 8000
1259
+ },
1260
+ {
1261
+ "epoch": 6.445245245245245,
1262
+ "grad_norm": 0.4661726951599121,
1263
+ "learning_rate": 0.0001538171979253522,
1264
+ "loss": 2.911597900390625,
1265
+ "step": 8050
1266
+ },
1267
+ {
1268
+ "epoch": 6.485285285285285,
1269
+ "grad_norm": 0.442968487739563,
1270
+ "learning_rate": 0.0001507707433058081,
1271
+ "loss": 2.927822265625,
1272
+ "step": 8100
1273
+ },
1274
+ {
1275
+ "epoch": 6.525325325325325,
1276
+ "grad_norm": 0.478756844997406,
1277
+ "learning_rate": 0.00014774168021120516,
1278
+ "loss": 2.9317803955078126,
1279
+ "step": 8150
1280
+ },
1281
+ {
1282
+ "epoch": 6.565365365365365,
1283
+ "grad_norm": 0.45225295424461365,
1284
+ "learning_rate": 0.00014473053953362208,
1285
+ "loss": 2.931801452636719,
1286
+ "step": 8200
1287
+ },
1288
+ {
1289
+ "epoch": 6.605405405405405,
1290
+ "grad_norm": 0.49030792713165283,
1291
+ "learning_rate": 0.0001417378490239455,
1292
+ "loss": 2.9215069580078126,
1293
+ "step": 8250
1294
+ },
1295
+ {
1296
+ "epoch": 6.6454454454454455,
1297
+ "grad_norm": 0.46768876910209656,
1298
+ "learning_rate": 0.00013876413319937315,
1299
+ "loss": 2.9123870849609377,
1300
+ "step": 8300
1301
+ },
1302
+ {
1303
+ "epoch": 6.6854854854854855,
1304
+ "grad_norm": 0.46513912081718445,
1305
+ "learning_rate": 0.00013580991325148323,
1306
+ "loss": 2.9105740356445313,
1307
+ "step": 8350
1308
+ },
1309
+ {
1310
+ "epoch": 6.7255255255255255,
1311
+ "grad_norm": 0.4525302052497864,
1312
+ "learning_rate": 0.00013287570695488826,
1313
+ "loss": 2.909984130859375,
1314
+ "step": 8400
1315
+ },
1316
+ {
1317
+ "epoch": 6.7655655655655655,
1318
+ "grad_norm": 0.4305197596549988,
1319
+ "learning_rate": 0.0001299620285764856,
1320
+ "loss": 2.899495849609375,
1321
+ "step": 8450
1322
+ },
1323
+ {
1324
+ "epoch": 6.805605605605606,
1325
+ "grad_norm": 0.4248438775539398,
1326
+ "learning_rate": 0.00012706938878532484,
1327
+ "loss": 2.9081976318359377,
1328
+ "step": 8500
1329
+ },
1330
+ {
1331
+ "epoch": 6.805605605605606,
1332
+ "eval_loss": 3.3699452877044678,
1333
+ "eval_runtime": 237.0014,
1334
+ "eval_samples_per_second": 83.059,
1335
+ "eval_steps_per_second": 10.384,
1336
+ "step": 8500
1337
+ },
1338
+ {
1339
+ "epoch": 6.845645645645646,
1340
+ "grad_norm": 0.5702593922615051,
1341
+ "learning_rate": 0.00012419829456310392,
1342
+ "loss": 2.9158230590820313,
1343
+ "step": 8550
1344
+ },
1345
+ {
1346
+ "epoch": 6.885685685685686,
1347
+ "grad_norm": 0.4605788290500641,
1348
+ "learning_rate": 0.00012134924911531359,
1349
+ "loss": 2.8952603149414062,
1350
+ "step": 8600
1351
+ },
1352
+ {
1353
+ "epoch": 6.925725725725726,
1354
+ "grad_norm": 0.4367736279964447,
1355
+ "learning_rate": 0.00011852275178304123,
1356
+ "loss": 2.9171136474609374,
1357
+ "step": 8650
1358
+ },
1359
+ {
1360
+ "epoch": 6.965765765765766,
1361
+ "grad_norm": 0.43834176659584045,
1362
+ "learning_rate": 0.00011571929795545438,
1363
+ "loss": 2.9009600830078126,
1364
+ "step": 8700
1365
+ },
1366
+ {
1367
+ "epoch": 7.005605605605606,
1368
+ "grad_norm": 0.5653858184814453,
1369
+ "learning_rate": 0.00011293937898297496,
1370
+ "loss": 2.8570040893554687,
1371
+ "step": 8750
1372
+ },
1373
+ {
1374
+ "epoch": 7.045645645645646,
1375
+ "grad_norm": 0.48288777470588684,
1376
+ "learning_rate": 0.00011018348209116297,
1377
+ "loss": 2.59486572265625,
1378
+ "step": 8800
1379
+ },
1380
+ {
1381
+ "epoch": 7.085685685685686,
1382
+ "grad_norm": 0.48602017760276794,
1383
+ "learning_rate": 0.00010745209029532161,
1384
+ "loss": 2.60861328125,
1385
+ "step": 8850
1386
+ },
1387
+ {
1388
+ "epoch": 7.125725725725726,
1389
+ "grad_norm": 0.5394498109817505,
1390
+ "learning_rate": 0.00010474568231584194,
1391
+ "loss": 2.6064111328125,
1392
+ "step": 8900
1393
+ },
1394
+ {
1395
+ "epoch": 7.165765765765766,
1396
+ "grad_norm": 0.5203927159309387,
1397
+ "learning_rate": 0.00010206473249429843,
1398
+ "loss": 2.6003097534179687,
1399
+ "step": 8950
1400
+ },
1401
+ {
1402
+ "epoch": 7.205805805805806,
1403
+ "grad_norm": 0.5133360624313354,
1404
+ "learning_rate": 9.940971071031388e-05,
1405
+ "loss": 2.5918447875976565,
1406
+ "step": 9000
1407
+ },
1408
+ {
1409
+ "epoch": 7.205805805805806,
1410
+ "eval_loss": 3.4392001628875732,
1411
+ "eval_runtime": 237.0678,
1412
+ "eval_samples_per_second": 83.035,
1413
+ "eval_steps_per_second": 10.381,
1414
+ "step": 9000
1415
+ },
1416
+ {
1417
+ "epoch": 7.245845845845846,
1418
+ "grad_norm": 0.5268146991729736,
1419
+ "learning_rate": 9.678108229920465e-05,
1420
+ "loss": 2.6212808227539064,
1421
+ "step": 9050
1422
+ },
1423
+ {
1424
+ "epoch": 7.285885885885886,
1425
+ "grad_norm": 0.5330535173416138,
1426
+ "learning_rate": 9.417930797042384e-05,
1427
+ "loss": 2.595316162109375,
1428
+ "step": 9100
1429
+ },
1430
+ {
1431
+ "epoch": 7.325925925925926,
1432
+ "grad_norm": 0.552038311958313,
1433
+ "learning_rate": 9.160484372681411e-05,
1434
+ "loss": 2.61090576171875,
1435
+ "step": 9150
1436
+ },
1437
+ {
1438
+ "epoch": 7.365965965965966,
1439
+ "grad_norm": 0.5219402313232422,
1440
+ "learning_rate": 8.90581407846861e-05,
1441
+ "loss": 2.6292263793945314,
1442
+ "step": 9200
1443
+ },
1444
+ {
1445
+ "epoch": 7.406006006006006,
1446
+ "grad_norm": 0.5238960385322571,
1447
+ "learning_rate": 8.653964549473512e-05,
1448
+ "loss": 2.6288876342773437,
1449
+ "step": 9250
1450
+ },
1451
+ {
1452
+ "epoch": 7.446046046046046,
1453
+ "grad_norm": 0.5215076208114624,
1454
+ "learning_rate": 8.404979926381154e-05,
1455
+ "loss": 2.629596862792969,
1456
+ "step": 9300
1457
+ },
1458
+ {
1459
+ "epoch": 7.486086086086086,
1460
+ "grad_norm": 0.5718568563461304,
1461
+ "learning_rate": 8.158903847755661e-05,
1462
+ "loss": 2.601263122558594,
1463
+ "step": 9350
1464
+ },
1465
+ {
1466
+ "epoch": 7.526126126126126,
1467
+ "grad_norm": 0.5244564414024353,
1468
+ "learning_rate": 7.915779442391924e-05,
1469
+ "loss": 2.6586846923828125,
1470
+ "step": 9400
1471
+ },
1472
+ {
1473
+ "epoch": 7.566166166166166,
1474
+ "grad_norm": 0.5422726273536682,
1475
+ "learning_rate": 7.67564932175657e-05,
1476
+ "loss": 2.623194580078125,
1477
+ "step": 9450
1478
+ },
1479
+ {
1480
+ "epoch": 7.606206206206206,
1481
+ "grad_norm": 0.5429977774620056,
1482
+ "learning_rate": 7.438555572519621e-05,
1483
+ "loss": 2.619925842285156,
1484
+ "step": 9500
1485
+ },
1486
+ {
1487
+ "epoch": 7.606206206206206,
1488
+ "eval_loss": 3.4265189170837402,
1489
+ "eval_runtime": 236.5041,
1490
+ "eval_samples_per_second": 83.233,
1491
+ "eval_steps_per_second": 10.406,
1492
+ "step": 9500
1493
+ },
1494
+ {
1495
+ "epoch": 7.646246246246246,
1496
+ "grad_norm": 0.5386999249458313,
1497
+ "learning_rate": 7.204539749178094e-05,
1498
+ "loss": 2.637367858886719,
1499
+ "step": 9550
1500
+ },
1501
+ {
1502
+ "epoch": 7.686286286286286,
1503
+ "grad_norm": 0.518723726272583,
1504
+ "learning_rate": 6.973642866772973e-05,
1505
+ "loss": 2.627269592285156,
1506
+ "step": 9600
1507
+ },
1508
+ {
1509
+ "epoch": 7.726326326326326,
1510
+ "grad_norm": 0.5138410329818726,
1511
+ "learning_rate": 6.74590539370058e-05,
1512
+ "loss": 2.6370574951171877,
1513
+ "step": 9650
1514
+ },
1515
+ {
1516
+ "epoch": 7.766366366366366,
1517
+ "grad_norm": 0.5250265002250671,
1518
+ "learning_rate": 6.521367244619942e-05,
1519
+ "loss": 2.63766845703125,
1520
+ "step": 9700
1521
+ },
1522
+ {
1523
+ "epoch": 7.806406406406406,
1524
+ "grad_norm": 0.5369194149971008,
1525
+ "learning_rate": 6.300067773456983e-05,
1526
+ "loss": 2.625033264160156,
1527
+ "step": 9750
1528
+ },
1529
+ {
1530
+ "epoch": 7.846446446446446,
1531
+ "grad_norm": 0.536523163318634,
1532
+ "learning_rate": 6.082045766507213e-05,
1533
+ "loss": 2.63074462890625,
1534
+ "step": 9800
1535
+ },
1536
+ {
1537
+ "epoch": 7.886486486486486,
1538
+ "grad_norm": 0.5119531154632568,
1539
+ "learning_rate": 5.8673394356377474e-05,
1540
+ "loss": 2.653492126464844,
1541
+ "step": 9850
1542
+ },
1543
+ {
1544
+ "epoch": 7.926526526526526,
1545
+ "grad_norm": 0.5774253606796265,
1546
+ "learning_rate": 5.6559864115901e-05,
1547
+ "loss": 2.6246636962890624,
1548
+ "step": 9900
1549
+ },
1550
+ {
1551
+ "epoch": 7.966566566566566,
1552
+ "grad_norm": 0.5321469306945801,
1553
+ "learning_rate": 5.448023737384744e-05,
1554
+ "loss": 2.6052349853515624,
1555
+ "step": 9950
1556
+ },
1557
+ {
1558
+ "epoch": 8.006406406406406,
1559
+ "grad_norm": 0.576283872127533,
1560
+ "learning_rate": 5.243487861828802e-05,
1561
+ "loss": 2.5904965209960937,
1562
+ "step": 10000
1563
+ },
1564
+ {
1565
+ "epoch": 8.006406406406406,
1566
+ "eval_loss": 3.4888463020324707,
1567
+ "eval_runtime": 236.9925,
1568
+ "eval_samples_per_second": 83.062,
1569
+ "eval_steps_per_second": 10.384,
1570
+ "step": 10000
1571
  }
1572
  ],
1573
  "logging_steps": 50,
1574
+ "max_steps": 12490,
1575
  "num_input_tokens_seen": 0,
1576
  "num_train_epochs": 10,
1577
  "save_steps": 500,
 
1582
  "early_stopping_threshold": 0.0
1583
  },
1584
  "attributes": {
1585
+ "early_stopping_patience_counter": 3
1586
  }
1587
  },
1588
  "TrainerControl": {
 
1591
  "should_evaluate": false,
1592
  "should_log": false,
1593
  "should_save": true,
1594
+ "should_training_stop": true
1595
  },
1596
  "attributes": {}
1597
  }
1598
  },
1599
+ "total_flos": 1.0738938356537754e+18,
1600
  "train_batch_size": 16,
1601
  "trial_name": null,
1602
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b06587b0bc69f524cde3061d77fd09bc473e0f46d4bfe76becdc179a84f8c0e4
3
- size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86ddcc39c2e432cced0213ba777fee5802b9462879abc1b1e88fa34ebf71af14
3
+ size 5265