Pushing checkpoint-750 (best) to main
Browse files- README.md +1 -1
- adapter_config.json +5 -5
- adapter_model.safetensors +1 -1
- last-checkpoint/adapter_config.json +5 -5
- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/optimizer.pt +2 -2
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scaler.pt +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +6 -2501
- last-checkpoint/training_args.bin +1 -1
- training_args.bin +1 -1
README.md
CHANGED
|
@@ -37,7 +37,7 @@ This model was trained with SFT.
|
|
| 37 |
- TRL: 0.27.0
|
| 38 |
- Transformers: 4.57.6
|
| 39 |
- Pytorch: 2.8.0+cu126
|
| 40 |
-
- Datasets: 4.4.
|
| 41 |
- Tokenizers: 0.22.1
|
| 42 |
|
| 43 |
## Citations
|
|
|
|
| 37 |
- TRL: 0.27.0
|
| 38 |
- Transformers: 4.57.6
|
| 39 |
- Pytorch: 2.8.0+cu126
|
| 40 |
+
- Datasets: 4.4.1
|
| 41 |
- Tokenizers: 0.22.1
|
| 42 |
|
| 43 |
## Citations
|
adapter_config.json
CHANGED
|
@@ -29,13 +29,13 @@
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
-
"
|
| 33 |
-
"k_proj",
|
| 34 |
-
"down_proj",
|
| 35 |
"q_proj",
|
|
|
|
|
|
|
|
|
|
| 36 |
"gate_proj",
|
| 37 |
-
"
|
| 38 |
-
"o_proj"
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
+
"up_proj",
|
|
|
|
|
|
|
| 33 |
"q_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"o_proj",
|
| 36 |
+
"v_proj",
|
| 37 |
"gate_proj",
|
| 38 |
+
"k_proj"
|
|
|
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 228140600
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd11b39803251198dcb7e030bb69c10b05cece6a9e45160afcc921794cb790cc
|
| 3 |
size 228140600
|
last-checkpoint/adapter_config.json
CHANGED
|
@@ -29,13 +29,13 @@
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
-
"
|
| 33 |
-
"k_proj",
|
| 34 |
-
"down_proj",
|
| 35 |
"q_proj",
|
|
|
|
|
|
|
|
|
|
| 36 |
"gate_proj",
|
| 37 |
-
"
|
| 38 |
-
"o_proj"
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
+
"up_proj",
|
|
|
|
|
|
|
| 33 |
"q_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"o_proj",
|
| 36 |
+
"v_proj",
|
| 37 |
"gate_proj",
|
| 38 |
+
"k_proj"
|
|
|
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 228140600
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd11b39803251198dcb7e030bb69c10b05cece6a9e45160afcc921794cb790cc
|
| 3 |
size 228140600
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2acc6b93233f66c6ddb8b195904fe7cd974047004ffcd02f1d993e85ebc0a677
|
| 3 |
+
size 116484839
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7883d803ebcafeb5684e5f2bcceb39f2a54258143c0c4972785bf0a17a36dc8
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e188a4cd7f588ff088ff68a7d9c18ed5ca570c5b11d6790654dcb4e3accb81e
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08f9e08af1aa8eb785ad1df11d9714b6c859fed11b125506168e50ec9ce7af28
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
-
"epoch":
|
| 6 |
-
"eval_steps":
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -813,2508 +813,13 @@
|
|
| 813 |
"eval_samples_per_second": 2.106,
|
| 814 |
"eval_steps_per_second": 0.526,
|
| 815 |
"step": 750
|
| 816 |
-
},
|
| 817 |
-
{
|
| 818 |
-
"entropy": 0.4685165178030729,
|
| 819 |
-
"epoch": 1.216,
|
| 820 |
-
"grad_norm": 0.4797472059726715,
|
| 821 |
-
"learning_rate": 7.5744e-05,
|
| 822 |
-
"loss": 0.4371,
|
| 823 |
-
"mean_token_accuracy": 0.872249535471201,
|
| 824 |
-
"num_tokens": 20779.0,
|
| 825 |
-
"step": 760
|
| 826 |
-
},
|
| 827 |
-
{
|
| 828 |
-
"entropy": 0.5129861503839492,
|
| 829 |
-
"epoch": 1.232,
|
| 830 |
-
"grad_norm": 0.5743088126182556,
|
| 831 |
-
"learning_rate": 7.5424e-05,
|
| 832 |
-
"loss": 0.4703,
|
| 833 |
-
"mean_token_accuracy": 0.8656402382999658,
|
| 834 |
-
"num_tokens": 37039.0,
|
| 835 |
-
"step": 770
|
| 836 |
-
},
|
| 837 |
-
{
|
| 838 |
-
"entropy": 0.47918802928179505,
|
| 839 |
-
"epoch": 1.248,
|
| 840 |
-
"grad_norm": 0.41004160046577454,
|
| 841 |
-
"learning_rate": 7.5104e-05,
|
| 842 |
-
"loss": 0.4631,
|
| 843 |
-
"mean_token_accuracy": 0.8624460745602847,
|
| 844 |
-
"num_tokens": 66230.0,
|
| 845 |
-
"step": 780
|
| 846 |
-
},
|
| 847 |
-
{
|
| 848 |
-
"entropy": 0.423713362775743,
|
| 849 |
-
"epoch": 1.264,
|
| 850 |
-
"grad_norm": 0.39121007919311523,
|
| 851 |
-
"learning_rate": 7.4784e-05,
|
| 852 |
-
"loss": 0.4005,
|
| 853 |
-
"mean_token_accuracy": 0.8780338373035192,
|
| 854 |
-
"num_tokens": 98315.0,
|
| 855 |
-
"step": 790
|
| 856 |
-
},
|
| 857 |
-
{
|
| 858 |
-
"entropy": 0.46349438820034267,
|
| 859 |
-
"epoch": 1.28,
|
| 860 |
-
"grad_norm": 0.4372813403606415,
|
| 861 |
-
"learning_rate": 7.4464e-05,
|
| 862 |
-
"loss": 0.4236,
|
| 863 |
-
"mean_token_accuracy": 0.8776358034461736,
|
| 864 |
-
"num_tokens": 123538.0,
|
| 865 |
-
"step": 800
|
| 866 |
-
},
|
| 867 |
-
{
|
| 868 |
-
"entropy": 0.46192670799791813,
|
| 869 |
-
"epoch": 1.296,
|
| 870 |
-
"grad_norm": 0.5512360334396362,
|
| 871 |
-
"learning_rate": 7.4144e-05,
|
| 872 |
-
"loss": 0.4276,
|
| 873 |
-
"mean_token_accuracy": 0.8758893702179193,
|
| 874 |
-
"num_tokens": 143855.0,
|
| 875 |
-
"step": 810
|
| 876 |
-
},
|
| 877 |
-
{
|
| 878 |
-
"entropy": 0.5323605043813586,
|
| 879 |
-
"epoch": 1.312,
|
| 880 |
-
"grad_norm": 0.6361510753631592,
|
| 881 |
-
"learning_rate": 7.3824e-05,
|
| 882 |
-
"loss": 0.491,
|
| 883 |
-
"mean_token_accuracy": 0.856274176388979,
|
| 884 |
-
"num_tokens": 159314.0,
|
| 885 |
-
"step": 820
|
| 886 |
-
},
|
| 887 |
-
{
|
| 888 |
-
"entropy": 0.4423897641710937,
|
| 889 |
-
"epoch": 1.328,
|
| 890 |
-
"grad_norm": 0.4728486239910126,
|
| 891 |
-
"learning_rate": 7.3504e-05,
|
| 892 |
-
"loss": 0.4335,
|
| 893 |
-
"mean_token_accuracy": 0.8686480693519115,
|
| 894 |
-
"num_tokens": 187464.0,
|
| 895 |
-
"step": 830
|
| 896 |
-
},
|
| 897 |
-
{
|
| 898 |
-
"entropy": 0.41830341406166555,
|
| 899 |
-
"epoch": 1.3439999999999999,
|
| 900 |
-
"grad_norm": 0.49457916617393494,
|
| 901 |
-
"learning_rate": 7.318400000000001e-05,
|
| 902 |
-
"loss": 0.3985,
|
| 903 |
-
"mean_token_accuracy": 0.8779879175126553,
|
| 904 |
-
"num_tokens": 219657.0,
|
| 905 |
-
"step": 840
|
| 906 |
-
},
|
| 907 |
-
{
|
| 908 |
-
"entropy": 0.44871921837329865,
|
| 909 |
-
"epoch": 1.3599999999999999,
|
| 910 |
-
"grad_norm": 0.46471357345581055,
|
| 911 |
-
"learning_rate": 7.2864e-05,
|
| 912 |
-
"loss": 0.4009,
|
| 913 |
-
"mean_token_accuracy": 0.8793935433030129,
|
| 914 |
-
"num_tokens": 245396.0,
|
| 915 |
-
"step": 850
|
| 916 |
-
},
|
| 917 |
-
{
|
| 918 |
-
"entropy": 0.4491863099858165,
|
| 919 |
-
"epoch": 1.376,
|
| 920 |
-
"grad_norm": 0.4910559356212616,
|
| 921 |
-
"learning_rate": 7.2544e-05,
|
| 922 |
-
"loss": 0.432,
|
| 923 |
-
"mean_token_accuracy": 0.8772312045097351,
|
| 924 |
-
"num_tokens": 266432.0,
|
| 925 |
-
"step": 860
|
| 926 |
-
},
|
| 927 |
-
{
|
| 928 |
-
"entropy": 0.5239890940487385,
|
| 929 |
-
"epoch": 1.392,
|
| 930 |
-
"grad_norm": 0.7272471785545349,
|
| 931 |
-
"learning_rate": 7.2224e-05,
|
| 932 |
-
"loss": 0.4652,
|
| 933 |
-
"mean_token_accuracy": 0.8644176237285137,
|
| 934 |
-
"num_tokens": 282655.0,
|
| 935 |
-
"step": 870
|
| 936 |
-
},
|
| 937 |
-
{
|
| 938 |
-
"entropy": 0.45916353976354,
|
| 939 |
-
"epoch": 1.408,
|
| 940 |
-
"grad_norm": 0.4625614583492279,
|
| 941 |
-
"learning_rate": 7.190400000000001e-05,
|
| 942 |
-
"loss": 0.4543,
|
| 943 |
-
"mean_token_accuracy": 0.8663836497813463,
|
| 944 |
-
"num_tokens": 310801.0,
|
| 945 |
-
"step": 880
|
| 946 |
-
},
|
| 947 |
-
{
|
| 948 |
-
"entropy": 0.4246408801525831,
|
| 949 |
-
"epoch": 1.424,
|
| 950 |
-
"grad_norm": 0.48823705315589905,
|
| 951 |
-
"learning_rate": 7.158400000000001e-05,
|
| 952 |
-
"loss": 0.395,
|
| 953 |
-
"mean_token_accuracy": 0.879553859308362,
|
| 954 |
-
"num_tokens": 343555.0,
|
| 955 |
-
"step": 890
|
| 956 |
-
},
|
| 957 |
-
{
|
| 958 |
-
"entropy": 0.44206046797335147,
|
| 959 |
-
"epoch": 1.44,
|
| 960 |
-
"grad_norm": 0.48411789536476135,
|
| 961 |
-
"learning_rate": 7.126400000000001e-05,
|
| 962 |
-
"loss": 0.4127,
|
| 963 |
-
"mean_token_accuracy": 0.8790250942111015,
|
| 964 |
-
"num_tokens": 369134.0,
|
| 965 |
-
"step": 900
|
| 966 |
-
},
|
| 967 |
-
{
|
| 968 |
-
"epoch": 1.44,
|
| 969 |
-
"eval_entropy": 0.4753110625743866,
|
| 970 |
-
"eval_loss": 0.5131832361221313,
|
| 971 |
-
"eval_mean_token_accuracy": 0.8568402478694915,
|
| 972 |
-
"eval_num_tokens": 369134.0,
|
| 973 |
-
"eval_runtime": 895.8989,
|
| 974 |
-
"eval_samples_per_second": 2.232,
|
| 975 |
-
"eval_steps_per_second": 0.558,
|
| 976 |
-
"step": 900
|
| 977 |
-
},
|
| 978 |
-
{
|
| 979 |
-
"entropy": 0.4554462408646941,
|
| 980 |
-
"epoch": 1.456,
|
| 981 |
-
"grad_norm": 0.537635087966919,
|
| 982 |
-
"learning_rate": 7.0944e-05,
|
| 983 |
-
"loss": 0.4188,
|
| 984 |
-
"mean_token_accuracy": 0.8769065048545599,
|
| 985 |
-
"num_tokens": 390152.0,
|
| 986 |
-
"step": 910
|
| 987 |
-
},
|
| 988 |
-
{
|
| 989 |
-
"entropy": 0.5131621342152357,
|
| 990 |
-
"epoch": 1.472,
|
| 991 |
-
"grad_norm": 0.6558974385261536,
|
| 992 |
-
"learning_rate": 7.062400000000001e-05,
|
| 993 |
-
"loss": 0.4663,
|
| 994 |
-
"mean_token_accuracy": 0.8643736276775599,
|
| 995 |
-
"num_tokens": 406222.0,
|
| 996 |
-
"step": 920
|
| 997 |
-
},
|
| 998 |
-
{
|
| 999 |
-
"entropy": 0.46728117018938065,
|
| 1000 |
-
"epoch": 1.488,
|
| 1001 |
-
"grad_norm": 0.4205915927886963,
|
| 1002 |
-
"learning_rate": 7.030400000000001e-05,
|
| 1003 |
-
"loss": 0.4539,
|
| 1004 |
-
"mean_token_accuracy": 0.8652867745608092,
|
| 1005 |
-
"num_tokens": 434767.0,
|
| 1006 |
-
"step": 930
|
| 1007 |
-
},
|
| 1008 |
-
{
|
| 1009 |
-
"entropy": 0.3991527833044529,
|
| 1010 |
-
"epoch": 1.504,
|
| 1011 |
-
"grad_norm": 0.4031739830970764,
|
| 1012 |
-
"learning_rate": 6.9984e-05,
|
| 1013 |
-
"loss": 0.3805,
|
| 1014 |
-
"mean_token_accuracy": 0.8837333973497152,
|
| 1015 |
-
"num_tokens": 467498.0,
|
| 1016 |
-
"step": 940
|
| 1017 |
-
},
|
| 1018 |
-
{
|
| 1019 |
-
"entropy": 0.43914526589214803,
|
| 1020 |
-
"epoch": 1.52,
|
| 1021 |
-
"grad_norm": 0.42591777443885803,
|
| 1022 |
-
"learning_rate": 6.9664e-05,
|
| 1023 |
-
"loss": 0.3989,
|
| 1024 |
-
"mean_token_accuracy": 0.8830435562878847,
|
| 1025 |
-
"num_tokens": 493146.0,
|
| 1026 |
-
"step": 950
|
| 1027 |
-
},
|
| 1028 |
-
{
|
| 1029 |
-
"entropy": 0.4609672848135233,
|
| 1030 |
-
"epoch": 1.536,
|
| 1031 |
-
"grad_norm": 0.5175366997718811,
|
| 1032 |
-
"learning_rate": 6.934399999999999e-05,
|
| 1033 |
-
"loss": 0.4377,
|
| 1034 |
-
"mean_token_accuracy": 0.8746946189552546,
|
| 1035 |
-
"num_tokens": 513926.0,
|
| 1036 |
-
"step": 960
|
| 1037 |
-
},
|
| 1038 |
-
{
|
| 1039 |
-
"entropy": 0.5191182948648929,
|
| 1040 |
-
"epoch": 1.552,
|
| 1041 |
-
"grad_norm": 0.6527137160301208,
|
| 1042 |
-
"learning_rate": 6.9024e-05,
|
| 1043 |
-
"loss": 0.465,
|
| 1044 |
-
"mean_token_accuracy": 0.86279138289392,
|
| 1045 |
-
"num_tokens": 530263.0,
|
| 1046 |
-
"step": 970
|
| 1047 |
-
},
|
| 1048 |
-
{
|
| 1049 |
-
"entropy": 0.4549953695386648,
|
| 1050 |
-
"epoch": 1.568,
|
| 1051 |
-
"grad_norm": 0.4532809257507324,
|
| 1052 |
-
"learning_rate": 6.8704e-05,
|
| 1053 |
-
"loss": 0.4345,
|
| 1054 |
-
"mean_token_accuracy": 0.8703642163425684,
|
| 1055 |
-
"num_tokens": 559791.0,
|
| 1056 |
-
"step": 980
|
| 1057 |
-
},
|
| 1058 |
-
{
|
| 1059 |
-
"entropy": 0.39457473438233137,
|
| 1060 |
-
"epoch": 1.584,
|
| 1061 |
-
"grad_norm": 0.4516853094100952,
|
| 1062 |
-
"learning_rate": 6.8384e-05,
|
| 1063 |
-
"loss": 0.3833,
|
| 1064 |
-
"mean_token_accuracy": 0.885482932254672,
|
| 1065 |
-
"num_tokens": 592398.0,
|
| 1066 |
-
"step": 990
|
| 1067 |
-
},
|
| 1068 |
-
{
|
| 1069 |
-
"entropy": 0.44856451768428085,
|
| 1070 |
-
"epoch": 1.6,
|
| 1071 |
-
"grad_norm": 0.4582580029964447,
|
| 1072 |
-
"learning_rate": 6.8064e-05,
|
| 1073 |
-
"loss": 0.4081,
|
| 1074 |
-
"mean_token_accuracy": 0.8799201253801584,
|
| 1075 |
-
"num_tokens": 617982.0,
|
| 1076 |
-
"step": 1000
|
| 1077 |
-
},
|
| 1078 |
-
{
|
| 1079 |
-
"entropy": 0.46642222460359334,
|
| 1080 |
-
"epoch": 1.616,
|
| 1081 |
-
"grad_norm": 0.45997655391693115,
|
| 1082 |
-
"learning_rate": 6.774400000000001e-05,
|
| 1083 |
-
"loss": 0.4375,
|
| 1084 |
-
"mean_token_accuracy": 0.8738056540489196,
|
| 1085 |
-
"num_tokens": 639115.0,
|
| 1086 |
-
"step": 1010
|
| 1087 |
-
},
|
| 1088 |
-
{
|
| 1089 |
-
"entropy": 0.5127991208806634,
|
| 1090 |
-
"epoch": 1.6320000000000001,
|
| 1091 |
-
"grad_norm": 0.6186177730560303,
|
| 1092 |
-
"learning_rate": 6.7424e-05,
|
| 1093 |
-
"loss": 0.4503,
|
| 1094 |
-
"mean_token_accuracy": 0.8713137298822403,
|
| 1095 |
-
"num_tokens": 655709.0,
|
| 1096 |
-
"step": 1020
|
| 1097 |
-
},
|
| 1098 |
-
{
|
| 1099 |
-
"entropy": 0.45265620658174155,
|
| 1100 |
-
"epoch": 1.6480000000000001,
|
| 1101 |
-
"grad_norm": 0.4363885819911957,
|
| 1102 |
-
"learning_rate": 6.7104e-05,
|
| 1103 |
-
"loss": 0.4347,
|
| 1104 |
-
"mean_token_accuracy": 0.866806122660637,
|
| 1105 |
-
"num_tokens": 684500.0,
|
| 1106 |
-
"step": 1030
|
| 1107 |
-
},
|
| 1108 |
-
{
|
| 1109 |
-
"entropy": 0.39329283433035017,
|
| 1110 |
-
"epoch": 1.6640000000000001,
|
| 1111 |
-
"grad_norm": 0.39802274107933044,
|
| 1112 |
-
"learning_rate": 6.6784e-05,
|
| 1113 |
-
"loss": 0.3699,
|
| 1114 |
-
"mean_token_accuracy": 0.8874391701072455,
|
| 1115 |
-
"num_tokens": 717271.0,
|
| 1116 |
-
"step": 1040
|
| 1117 |
-
},
|
| 1118 |
-
{
|
| 1119 |
-
"entropy": 0.4426466390490532,
|
| 1120 |
-
"epoch": 1.6800000000000002,
|
| 1121 |
-
"grad_norm": 0.4594961404800415,
|
| 1122 |
-
"learning_rate": 6.6464e-05,
|
| 1123 |
-
"loss": 0.4061,
|
| 1124 |
-
"mean_token_accuracy": 0.8797303918749094,
|
| 1125 |
-
"num_tokens": 743068.0,
|
| 1126 |
-
"step": 1050
|
| 1127 |
-
},
|
| 1128 |
-
{
|
| 1129 |
-
"epoch": 1.6800000000000002,
|
| 1130 |
-
"eval_entropy": 0.4718739038705826,
|
| 1131 |
-
"eval_loss": 0.511114239692688,
|
| 1132 |
-
"eval_mean_token_accuracy": 0.8577107313871384,
|
| 1133 |
-
"eval_num_tokens": 743068.0,
|
| 1134 |
-
"eval_runtime": 895.9979,
|
| 1135 |
-
"eval_samples_per_second": 2.232,
|
| 1136 |
-
"eval_steps_per_second": 0.558,
|
| 1137 |
-
"step": 1050
|
| 1138 |
-
},
|
| 1139 |
-
{
|
| 1140 |
-
"entropy": 0.45297340136021375,
|
| 1141 |
-
"epoch": 1.696,
|
| 1142 |
-
"grad_norm": 0.5545983910560608,
|
| 1143 |
-
"learning_rate": 6.614400000000001e-05,
|
| 1144 |
-
"loss": 0.4144,
|
| 1145 |
-
"mean_token_accuracy": 0.8779479678720236,
|
| 1146 |
-
"num_tokens": 763925.0,
|
| 1147 |
-
"step": 1060
|
| 1148 |
-
},
|
| 1149 |
-
{
|
| 1150 |
-
"entropy": 0.497313455119729,
|
| 1151 |
-
"epoch": 1.712,
|
| 1152 |
-
"grad_norm": 0.6375033259391785,
|
| 1153 |
-
"learning_rate": 6.582400000000001e-05,
|
| 1154 |
-
"loss": 0.4523,
|
| 1155 |
-
"mean_token_accuracy": 0.8679382588714362,
|
| 1156 |
-
"num_tokens": 780111.0,
|
| 1157 |
-
"step": 1070
|
| 1158 |
-
},
|
| 1159 |
-
{
|
| 1160 |
-
"entropy": 0.4517807062715292,
|
| 1161 |
-
"epoch": 1.728,
|
| 1162 |
-
"grad_norm": 0.42967426776885986,
|
| 1163 |
-
"learning_rate": 6.5504e-05,
|
| 1164 |
-
"loss": 0.4297,
|
| 1165 |
-
"mean_token_accuracy": 0.8690480105578899,
|
| 1166 |
-
"num_tokens": 808661.0,
|
| 1167 |
-
"step": 1080
|
| 1168 |
-
},
|
| 1169 |
-
{
|
| 1170 |
-
"entropy": 0.40057806484401226,
|
| 1171 |
-
"epoch": 1.744,
|
| 1172 |
-
"grad_norm": 0.4295614957809448,
|
| 1173 |
-
"learning_rate": 6.5184e-05,
|
| 1174 |
-
"loss": 0.3765,
|
| 1175 |
-
"mean_token_accuracy": 0.8872563410550356,
|
| 1176 |
-
"num_tokens": 840932.0,
|
| 1177 |
-
"step": 1090
|
| 1178 |
-
},
|
| 1179 |
-
{
|
| 1180 |
-
"entropy": 0.4463956480845809,
|
| 1181 |
-
"epoch": 1.76,
|
| 1182 |
-
"grad_norm": 0.49008527398109436,
|
| 1183 |
-
"learning_rate": 6.486400000000001e-05,
|
| 1184 |
-
"loss": 0.4064,
|
| 1185 |
-
"mean_token_accuracy": 0.8804606605321169,
|
| 1186 |
-
"num_tokens": 866564.0,
|
| 1187 |
-
"step": 1100
|
| 1188 |
-
},
|
| 1189 |
-
{
|
| 1190 |
-
"entropy": 0.45895243529230356,
|
| 1191 |
-
"epoch": 1.776,
|
| 1192 |
-
"grad_norm": 0.5231919288635254,
|
| 1193 |
-
"learning_rate": 6.454400000000001e-05,
|
| 1194 |
-
"loss": 0.4249,
|
| 1195 |
-
"mean_token_accuracy": 0.8787468057125807,
|
| 1196 |
-
"num_tokens": 887527.0,
|
| 1197 |
-
"step": 1110
|
| 1198 |
-
},
|
| 1199 |
-
{
|
| 1200 |
-
"entropy": 0.5028131037950516,
|
| 1201 |
-
"epoch": 1.792,
|
| 1202 |
-
"grad_norm": 0.6885866522789001,
|
| 1203 |
-
"learning_rate": 6.4224e-05,
|
| 1204 |
-
"loss": 0.4549,
|
| 1205 |
-
"mean_token_accuracy": 0.8681917265057564,
|
| 1206 |
-
"num_tokens": 903688.0,
|
| 1207 |
-
"step": 1120
|
| 1208 |
-
},
|
| 1209 |
-
{
|
| 1210 |
-
"entropy": 0.4443069422617555,
|
| 1211 |
-
"epoch": 1.808,
|
| 1212 |
-
"grad_norm": 0.4276801347732544,
|
| 1213 |
-
"learning_rate": 6.3904e-05,
|
| 1214 |
-
"loss": 0.419,
|
| 1215 |
-
"mean_token_accuracy": 0.8721311956644058,
|
| 1216 |
-
"num_tokens": 932975.0,
|
| 1217 |
-
"step": 1130
|
| 1218 |
-
},
|
| 1219 |
-
{
|
| 1220 |
-
"entropy": 0.38013150785118344,
|
| 1221 |
-
"epoch": 1.8239999999999998,
|
| 1222 |
-
"grad_norm": 0.4245995283126831,
|
| 1223 |
-
"learning_rate": 6.358399999999999e-05,
|
| 1224 |
-
"loss": 0.3752,
|
| 1225 |
-
"mean_token_accuracy": 0.8849109452217817,
|
| 1226 |
-
"num_tokens": 965221.0,
|
| 1227 |
-
"step": 1140
|
| 1228 |
-
},
|
| 1229 |
-
{
|
| 1230 |
-
"entropy": 0.44638209473341706,
|
| 1231 |
-
"epoch": 1.8399999999999999,
|
| 1232 |
-
"grad_norm": 0.47453537583351135,
|
| 1233 |
-
"learning_rate": 6.3264e-05,
|
| 1234 |
-
"loss": 0.4066,
|
| 1235 |
-
"mean_token_accuracy": 0.8791540212929249,
|
| 1236 |
-
"num_tokens": 990859.0,
|
| 1237 |
-
"step": 1150
|
| 1238 |
-
},
|
| 1239 |
-
{
|
| 1240 |
-
"entropy": 0.4516189154237509,
|
| 1241 |
-
"epoch": 1.8559999999999999,
|
| 1242 |
-
"grad_norm": 0.5056102871894836,
|
| 1243 |
-
"learning_rate": 6.2944e-05,
|
| 1244 |
-
"loss": 0.4127,
|
| 1245 |
-
"mean_token_accuracy": 0.8801082350313664,
|
| 1246 |
-
"num_tokens": 1011268.0,
|
| 1247 |
-
"step": 1160
|
| 1248 |
-
},
|
| 1249 |
-
{
|
| 1250 |
-
"entropy": 0.5173742642626167,
|
| 1251 |
-
"epoch": 1.8719999999999999,
|
| 1252 |
-
"grad_norm": 0.6503537893295288,
|
| 1253 |
-
"learning_rate": 6.2624e-05,
|
| 1254 |
-
"loss": 0.48,
|
| 1255 |
-
"mean_token_accuracy": 0.8607568740844727,
|
| 1256 |
-
"num_tokens": 1026942.0,
|
| 1257 |
-
"step": 1170
|
| 1258 |
-
},
|
| 1259 |
-
{
|
| 1260 |
-
"entropy": 0.4701320366002619,
|
| 1261 |
-
"epoch": 1.888,
|
| 1262 |
-
"grad_norm": 0.3796524405479431,
|
| 1263 |
-
"learning_rate": 6.2304e-05,
|
| 1264 |
-
"loss": 0.4387,
|
| 1265 |
-
"mean_token_accuracy": 0.8685618557035923,
|
| 1266 |
-
"num_tokens": 1053937.0,
|
| 1267 |
-
"step": 1180
|
| 1268 |
-
},
|
| 1269 |
-
{
|
| 1270 |
-
"entropy": 0.3984457287937403,
|
| 1271 |
-
"epoch": 1.904,
|
| 1272 |
-
"grad_norm": 0.4399532377719879,
|
| 1273 |
-
"learning_rate": 6.1984e-05,
|
| 1274 |
-
"loss": 0.3885,
|
| 1275 |
-
"mean_token_accuracy": 0.8824849870055914,
|
| 1276 |
-
"num_tokens": 1084820.0,
|
| 1277 |
-
"step": 1190
|
| 1278 |
-
},
|
| 1279 |
-
{
|
| 1280 |
-
"entropy": 0.44145693685859444,
|
| 1281 |
-
"epoch": 1.92,
|
| 1282 |
-
"grad_norm": 0.44594088196754456,
|
| 1283 |
-
"learning_rate": 6.1664e-05,
|
| 1284 |
-
"loss": 0.3917,
|
| 1285 |
-
"mean_token_accuracy": 0.884940878674388,
|
| 1286 |
-
"num_tokens": 1109943.0,
|
| 1287 |
-
"step": 1200
|
| 1288 |
-
},
|
| 1289 |
-
{
|
| 1290 |
-
"epoch": 1.92,
|
| 1291 |
-
"eval_entropy": 0.47450968527793885,
|
| 1292 |
-
"eval_loss": 0.5091220140457153,
|
| 1293 |
-
"eval_mean_token_accuracy": 0.8581691147089004,
|
| 1294 |
-
"eval_num_tokens": 1109943.0,
|
| 1295 |
-
"eval_runtime": 897.2539,
|
| 1296 |
-
"eval_samples_per_second": 2.229,
|
| 1297 |
-
"eval_steps_per_second": 0.557,
|
| 1298 |
-
"step": 1200
|
| 1299 |
-
},
|
| 1300 |
-
{
|
| 1301 |
-
"entropy": 0.4571360006928444,
|
| 1302 |
-
"epoch": 1.936,
|
| 1303 |
-
"grad_norm": 0.5302743315696716,
|
| 1304 |
-
"learning_rate": 6.1344e-05,
|
| 1305 |
-
"loss": 0.4167,
|
| 1306 |
-
"mean_token_accuracy": 0.8782664395868778,
|
| 1307 |
-
"num_tokens": 1130543.0,
|
| 1308 |
-
"step": 1210
|
| 1309 |
-
},
|
| 1310 |
-
{
|
| 1311 |
-
"entropy": 0.49114823453128337,
|
| 1312 |
-
"epoch": 1.952,
|
| 1313 |
-
"grad_norm": 0.6523593664169312,
|
| 1314 |
-
"learning_rate": 6.1024000000000004e-05,
|
| 1315 |
-
"loss": 0.4495,
|
| 1316 |
-
"mean_token_accuracy": 0.8662942215800286,
|
| 1317 |
-
"num_tokens": 1146676.0,
|
| 1318 |
-
"step": 1220
|
| 1319 |
-
},
|
| 1320 |
-
{
|
| 1321 |
-
"entropy": 0.46395022002980113,
|
| 1322 |
-
"epoch": 1.968,
|
| 1323 |
-
"grad_norm": 0.42906099557876587,
|
| 1324 |
-
"learning_rate": 6.070400000000001e-05,
|
| 1325 |
-
"loss": 0.4392,
|
| 1326 |
-
"mean_token_accuracy": 0.8659089788794517,
|
| 1327 |
-
"num_tokens": 1172078.0,
|
| 1328 |
-
"step": 1230
|
| 1329 |
-
},
|
| 1330 |
-
{
|
| 1331 |
-
"entropy": 0.4239502627402544,
|
| 1332 |
-
"epoch": 1.984,
|
| 1333 |
-
"grad_norm": 0.5165457129478455,
|
| 1334 |
-
"learning_rate": 6.038400000000001e-05,
|
| 1335 |
-
"loss": 0.4067,
|
| 1336 |
-
"mean_token_accuracy": 0.8755033057183027,
|
| 1337 |
-
"num_tokens": 1197464.0,
|
| 1338 |
-
"step": 1240
|
| 1339 |
-
},
|
| 1340 |
-
{
|
| 1341 |
-
"entropy": 0.5474816044792533,
|
| 1342 |
-
"epoch": 2.0,
|
| 1343 |
-
"grad_norm": 0.7312328219413757,
|
| 1344 |
-
"learning_rate": 6.0064e-05,
|
| 1345 |
-
"loss": 0.4816,
|
| 1346 |
-
"mean_token_accuracy": 0.8589978538453579,
|
| 1347 |
-
"num_tokens": 1212204.0,
|
| 1348 |
-
"step": 1250
|
| 1349 |
-
},
|
| 1350 |
-
{
|
| 1351 |
-
"entropy": 0.3549959819763899,
|
| 1352 |
-
"epoch": 2.016,
|
| 1353 |
-
"grad_norm": 0.44957467913627625,
|
| 1354 |
-
"learning_rate": 5.9744e-05,
|
| 1355 |
-
"loss": 0.329,
|
| 1356 |
-
"mean_token_accuracy": 0.8989395320415496,
|
| 1357 |
-
"num_tokens": 1253503.0,
|
| 1358 |
-
"step": 1260
|
| 1359 |
-
},
|
| 1360 |
-
{
|
| 1361 |
-
"entropy": 0.3744832394644618,
|
| 1362 |
-
"epoch": 2.032,
|
| 1363 |
-
"grad_norm": 0.48583275079727173,
|
| 1364 |
-
"learning_rate": 5.9424e-05,
|
| 1365 |
-
"loss": 0.3574,
|
| 1366 |
-
"mean_token_accuracy": 0.8903608873486519,
|
| 1367 |
-
"num_tokens": 1282285.0,
|
| 1368 |
-
"step": 1270
|
| 1369 |
-
},
|
| 1370 |
-
{
|
| 1371 |
-
"entropy": 0.4087462780997157,
|
| 1372 |
-
"epoch": 2.048,
|
| 1373 |
-
"grad_norm": 0.5598016381263733,
|
| 1374 |
-
"learning_rate": 5.9104e-05,
|
| 1375 |
-
"loss": 0.3668,
|
| 1376 |
-
"mean_token_accuracy": 0.8913519535213709,
|
| 1377 |
-
"num_tokens": 1305738.0,
|
| 1378 |
-
"step": 1280
|
| 1379 |
-
},
|
| 1380 |
-
{
|
| 1381 |
-
"entropy": 0.4285904698073864,
|
| 1382 |
-
"epoch": 2.064,
|
| 1383 |
-
"grad_norm": 0.637881338596344,
|
| 1384 |
-
"learning_rate": 5.8784000000000005e-05,
|
| 1385 |
-
"loss": 0.3693,
|
| 1386 |
-
"mean_token_accuracy": 0.8924054179340601,
|
| 1387 |
-
"num_tokens": 1324499.0,
|
| 1388 |
-
"step": 1290
|
| 1389 |
-
},
|
| 1390 |
-
{
|
| 1391 |
-
"entropy": 0.46786304665729406,
|
| 1392 |
-
"epoch": 2.08,
|
| 1393 |
-
"grad_norm": 0.7997815012931824,
|
| 1394 |
-
"learning_rate": 5.846400000000001e-05,
|
| 1395 |
-
"loss": 0.3988,
|
| 1396 |
-
"mean_token_accuracy": 0.8819302976131439,
|
| 1397 |
-
"num_tokens": 1337343.0,
|
| 1398 |
-
"step": 1300
|
| 1399 |
-
},
|
| 1400 |
-
{
|
| 1401 |
-
"entropy": 0.30622370541095734,
|
| 1402 |
-
"epoch": 2.096,
|
| 1403 |
-
"grad_norm": 0.46214577555656433,
|
| 1404 |
-
"learning_rate": 5.8144e-05,
|
| 1405 |
-
"loss": 0.3356,
|
| 1406 |
-
"mean_token_accuracy": 0.8964022137224674,
|
| 1407 |
-
"num_tokens": 1378091.0,
|
| 1408 |
-
"step": 1310
|
| 1409 |
-
},
|
| 1410 |
-
{
|
| 1411 |
-
"entropy": 0.38422031346708535,
|
| 1412 |
-
"epoch": 2.112,
|
| 1413 |
-
"grad_norm": 0.5669556856155396,
|
| 1414 |
-
"learning_rate": 5.7824e-05,
|
| 1415 |
-
"loss": 0.3431,
|
| 1416 |
-
"mean_token_accuracy": 0.8970716085284949,
|
| 1417 |
-
"num_tokens": 1406636.0,
|
| 1418 |
-
"step": 1320
|
| 1419 |
-
},
|
| 1420 |
-
{
|
| 1421 |
-
"entropy": 0.4110618421807885,
|
| 1422 |
-
"epoch": 2.128,
|
| 1423 |
-
"grad_norm": 0.5469350218772888,
|
| 1424 |
-
"learning_rate": 5.7504000000000004e-05,
|
| 1425 |
-
"loss": 0.3556,
|
| 1426 |
-
"mean_token_accuracy": 0.8946326076984406,
|
| 1427 |
-
"num_tokens": 1429756.0,
|
| 1428 |
-
"step": 1330
|
| 1429 |
-
},
|
| 1430 |
-
{
|
| 1431 |
-
"entropy": 0.41729052886366846,
|
| 1432 |
-
"epoch": 2.144,
|
| 1433 |
-
"grad_norm": 0.5956342220306396,
|
| 1434 |
-
"learning_rate": 5.718400000000001e-05,
|
| 1435 |
-
"loss": 0.3707,
|
| 1436 |
-
"mean_token_accuracy": 0.8897294741123914,
|
| 1437 |
-
"num_tokens": 1448487.0,
|
| 1438 |
-
"step": 1340
|
| 1439 |
-
},
|
| 1440 |
-
{
|
| 1441 |
-
"entropy": 0.4881801651790738,
|
| 1442 |
-
"epoch": 2.16,
|
| 1443 |
-
"grad_norm": 1.0672754049301147,
|
| 1444 |
-
"learning_rate": 5.6864e-05,
|
| 1445 |
-
"loss": 0.4108,
|
| 1446 |
-
"mean_token_accuracy": 0.8769895020872355,
|
| 1447 |
-
"num_tokens": 1461450.0,
|
| 1448 |
-
"step": 1350
|
| 1449 |
-
},
|
| 1450 |
-
{
|
| 1451 |
-
"epoch": 2.16,
|
| 1452 |
-
"eval_entropy": 0.3968213936388493,
|
| 1453 |
-
"eval_loss": 0.5419190526008606,
|
| 1454 |
-
"eval_mean_token_accuracy": 0.8568335684537888,
|
| 1455 |
-
"eval_num_tokens": 1461450.0,
|
| 1456 |
-
"eval_runtime": 896.689,
|
| 1457 |
-
"eval_samples_per_second": 2.23,
|
| 1458 |
-
"eval_steps_per_second": 0.558,
|
| 1459 |
-
"step": 1350
|
| 1460 |
-
},
|
| 1461 |
-
{
|
| 1462 |
-
"entropy": 0.3141488812863827,
|
| 1463 |
-
"epoch": 2.176,
|
| 1464 |
-
"grad_norm": 0.47934019565582275,
|
| 1465 |
-
"learning_rate": 5.6544000000000006e-05,
|
| 1466 |
-
"loss": 0.3396,
|
| 1467 |
-
"mean_token_accuracy": 0.8963223662227392,
|
| 1468 |
-
"num_tokens": 1501473.0,
|
| 1469 |
-
"step": 1360
|
| 1470 |
-
},
|
| 1471 |
-
{
|
| 1472 |
-
"entropy": 0.37752851136028764,
|
| 1473 |
-
"epoch": 2.192,
|
| 1474 |
-
"grad_norm": 0.580359160900116,
|
| 1475 |
-
"learning_rate": 5.6223999999999996e-05,
|
| 1476 |
-
"loss": 0.345,
|
| 1477 |
-
"mean_token_accuracy": 0.8945828888565301,
|
| 1478 |
-
"num_tokens": 1529468.0,
|
| 1479 |
-
"step": 1370
|
| 1480 |
-
},
|
| 1481 |
-
{
|
| 1482 |
-
"entropy": 0.4134438899345696,
|
| 1483 |
-
"epoch": 2.208,
|
| 1484 |
-
"grad_norm": 0.6379365921020508,
|
| 1485 |
-
"learning_rate": 5.5904e-05,
|
| 1486 |
-
"loss": 0.3692,
|
| 1487 |
-
"mean_token_accuracy": 0.8921185087412595,
|
| 1488 |
-
"num_tokens": 1552194.0,
|
| 1489 |
-
"step": 1380
|
| 1490 |
-
},
|
| 1491 |
-
{
|
| 1492 |
-
"entropy": 0.4285835810005665,
|
| 1493 |
-
"epoch": 2.224,
|
| 1494 |
-
"grad_norm": 0.7130568027496338,
|
| 1495 |
-
"learning_rate": 5.5584e-05,
|
| 1496 |
-
"loss": 0.3708,
|
| 1497 |
-
"mean_token_accuracy": 0.8914431348443032,
|
| 1498 |
-
"num_tokens": 1570340.0,
|
| 1499 |
-
"step": 1390
|
| 1500 |
-
},
|
| 1501 |
-
{
|
| 1502 |
-
"entropy": 0.4728871438652277,
|
| 1503 |
-
"epoch": 2.24,
|
| 1504 |
-
"grad_norm": 0.8636120557785034,
|
| 1505 |
-
"learning_rate": 5.5264000000000005e-05,
|
| 1506 |
-
"loss": 0.4016,
|
| 1507 |
-
"mean_token_accuracy": 0.8787129417061805,
|
| 1508 |
-
"num_tokens": 1583068.0,
|
| 1509 |
-
"step": 1400
|
| 1510 |
-
},
|
| 1511 |
-
{
|
| 1512 |
-
"entropy": 0.3159141786396503,
|
| 1513 |
-
"epoch": 2.2560000000000002,
|
| 1514 |
-
"grad_norm": 0.5734344720840454,
|
| 1515 |
-
"learning_rate": 5.494400000000001e-05,
|
| 1516 |
-
"loss": 0.3333,
|
| 1517 |
-
"mean_token_accuracy": 0.8989784453064203,
|
| 1518 |
-
"num_tokens": 1621780.0,
|
| 1519 |
-
"step": 1410
|
| 1520 |
-
},
|
| 1521 |
-
{
|
| 1522 |
-
"entropy": 0.3770693183876574,
|
| 1523 |
-
"epoch": 2.2720000000000002,
|
| 1524 |
-
"grad_norm": 0.5254765152931213,
|
| 1525 |
-
"learning_rate": 5.4624e-05,
|
| 1526 |
-
"loss": 0.3526,
|
| 1527 |
-
"mean_token_accuracy": 0.8930392079055309,
|
| 1528 |
-
"num_tokens": 1649762.0,
|
| 1529 |
-
"step": 1420
|
| 1530 |
-
},
|
| 1531 |
-
{
|
| 1532 |
-
"entropy": 0.42590463180094956,
|
| 1533 |
-
"epoch": 2.288,
|
| 1534 |
-
"grad_norm": 0.6342437267303467,
|
| 1535 |
-
"learning_rate": 5.4304e-05,
|
| 1536 |
-
"loss": 0.374,
|
| 1537 |
-
"mean_token_accuracy": 0.8877194058150053,
|
| 1538 |
-
"num_tokens": 1672433.0,
|
| 1539 |
-
"step": 1430
|
| 1540 |
-
},
|
| 1541 |
-
{
|
| 1542 |
-
"entropy": 0.4354470370337367,
|
| 1543 |
-
"epoch": 2.304,
|
| 1544 |
-
"grad_norm": 0.7154885530471802,
|
| 1545 |
-
"learning_rate": 5.3984000000000004e-05,
|
| 1546 |
-
"loss": 0.3778,
|
| 1547 |
-
"mean_token_accuracy": 0.8892953939735889,
|
| 1548 |
-
"num_tokens": 1690543.0,
|
| 1549 |
-
"step": 1440
|
| 1550 |
-
},
|
| 1551 |
-
{
|
| 1552 |
-
"entropy": 0.48633114621043205,
|
| 1553 |
-
"epoch": 2.32,
|
| 1554 |
-
"grad_norm": 1.0084096193313599,
|
| 1555 |
-
"learning_rate": 5.3664e-05,
|
| 1556 |
-
"loss": 0.4139,
|
| 1557 |
-
"mean_token_accuracy": 0.8807312864810228,
|
| 1558 |
-
"num_tokens": 1702841.0,
|
| 1559 |
-
"step": 1450
|
| 1560 |
-
},
|
| 1561 |
-
{
|
| 1562 |
-
"entropy": 0.3131198097020388,
|
| 1563 |
-
"epoch": 2.336,
|
| 1564 |
-
"grad_norm": 0.5311539769172668,
|
| 1565 |
-
"learning_rate": 5.3344e-05,
|
| 1566 |
-
"loss": 0.3289,
|
| 1567 |
-
"mean_token_accuracy": 0.8994152408093214,
|
| 1568 |
-
"num_tokens": 1742325.0,
|
| 1569 |
-
"step": 1460
|
| 1570 |
-
},
|
| 1571 |
-
{
|
| 1572 |
-
"entropy": 0.38374699037522075,
|
| 1573 |
-
"epoch": 2.352,
|
| 1574 |
-
"grad_norm": 0.4948159158229828,
|
| 1575 |
-
"learning_rate": 5.3024000000000006e-05,
|
| 1576 |
-
"loss": 0.3589,
|
| 1577 |
-
"mean_token_accuracy": 0.8915071442723275,
|
| 1578 |
-
"num_tokens": 1770764.0,
|
| 1579 |
-
"step": 1470
|
| 1580 |
-
},
|
| 1581 |
-
{
|
| 1582 |
-
"entropy": 0.42046497501432895,
|
| 1583 |
-
"epoch": 2.368,
|
| 1584 |
-
"grad_norm": 0.6284568309783936,
|
| 1585 |
-
"learning_rate": 5.2703999999999995e-05,
|
| 1586 |
-
"loss": 0.3595,
|
| 1587 |
-
"mean_token_accuracy": 0.8936832427978516,
|
| 1588 |
-
"num_tokens": 1793963.0,
|
| 1589 |
-
"step": 1480
|
| 1590 |
-
},
|
| 1591 |
-
{
|
| 1592 |
-
"entropy": 0.4265410235151649,
|
| 1593 |
-
"epoch": 2.384,
|
| 1594 |
-
"grad_norm": 0.6891266703605652,
|
| 1595 |
-
"learning_rate": 5.2384e-05,
|
| 1596 |
-
"loss": 0.39,
|
| 1597 |
-
"mean_token_accuracy": 0.8861893687397242,
|
| 1598 |
-
"num_tokens": 1812331.0,
|
| 1599 |
-
"step": 1490
|
| 1600 |
-
},
|
| 1601 |
-
{
|
| 1602 |
-
"entropy": 0.4833611447364092,
|
| 1603 |
-
"epoch": 2.4,
|
| 1604 |
-
"grad_norm": 0.91993248462677,
|
| 1605 |
-
"learning_rate": 5.2064e-05,
|
| 1606 |
-
"loss": 0.4097,
|
| 1607 |
-
"mean_token_accuracy": 0.8784359741955996,
|
| 1608 |
-
"num_tokens": 1824943.0,
|
| 1609 |
-
"step": 1500
|
| 1610 |
-
},
|
| 1611 |
-
{
|
| 1612 |
-
"epoch": 2.4,
|
| 1613 |
-
"eval_entropy": 0.4156067478954792,
|
| 1614 |
-
"eval_loss": 0.531775951385498,
|
| 1615 |
-
"eval_mean_token_accuracy": 0.8573460700511932,
|
| 1616 |
-
"eval_num_tokens": 1824943.0,
|
| 1617 |
-
"eval_runtime": 896.7745,
|
| 1618 |
-
"eval_samples_per_second": 2.23,
|
| 1619 |
-
"eval_steps_per_second": 0.558,
|
| 1620 |
-
"step": 1500
|
| 1621 |
-
},
|
| 1622 |
-
{
|
| 1623 |
-
"entropy": 0.2992474908940494,
|
| 1624 |
-
"epoch": 2.416,
|
| 1625 |
-
"grad_norm": 0.43484658002853394,
|
| 1626 |
-
"learning_rate": 5.1744000000000005e-05,
|
| 1627 |
-
"loss": 0.293,
|
| 1628 |
-
"mean_token_accuracy": 0.9087961092591286,
|
| 1629 |
-
"num_tokens": 40453.0,
|
| 1630 |
-
"step": 1510
|
| 1631 |
-
},
|
| 1632 |
-
{
|
| 1633 |
-
"entropy": 0.3289525999687612,
|
| 1634 |
-
"epoch": 2.432,
|
| 1635 |
-
"grad_norm": 0.5937761664390564,
|
| 1636 |
-
"learning_rate": 5.142400000000001e-05,
|
| 1637 |
-
"loss": 0.303,
|
| 1638 |
-
"mean_token_accuracy": 0.9087128143757581,
|
| 1639 |
-
"num_tokens": 68853.0,
|
| 1640 |
-
"step": 1520
|
| 1641 |
-
},
|
| 1642 |
-
{
|
| 1643 |
-
"entropy": 0.3652105055749416,
|
| 1644 |
-
"epoch": 2.448,
|
| 1645 |
-
"grad_norm": 0.5499975681304932,
|
| 1646 |
-
"learning_rate": 5.110400000000001e-05,
|
| 1647 |
-
"loss": 0.3153,
|
| 1648 |
-
"mean_token_accuracy": 0.9054864585399628,
|
| 1649 |
-
"num_tokens": 92132.0,
|
| 1650 |
-
"step": 1530
|
| 1651 |
-
},
|
| 1652 |
-
{
|
| 1653 |
-
"entropy": 0.37480679620057344,
|
| 1654 |
-
"epoch": 2.464,
|
| 1655 |
-
"grad_norm": 0.8807706236839294,
|
| 1656 |
-
"learning_rate": 5.0784e-05,
|
| 1657 |
-
"loss": 0.3132,
|
| 1658 |
-
"mean_token_accuracy": 0.9067570131272078,
|
| 1659 |
-
"num_tokens": 110798.0,
|
| 1660 |
-
"step": 1540
|
| 1661 |
-
},
|
| 1662 |
-
{
|
| 1663 |
-
"entropy": 0.408511808142066,
|
| 1664 |
-
"epoch": 2.48,
|
| 1665 |
-
"grad_norm": 1.0242410898208618,
|
| 1666 |
-
"learning_rate": 5.0464e-05,
|
| 1667 |
-
"loss": 0.3242,
|
| 1668 |
-
"mean_token_accuracy": 0.9066624633967877,
|
| 1669 |
-
"num_tokens": 123425.0,
|
| 1670 |
-
"step": 1550
|
| 1671 |
-
},
|
| 1672 |
-
{
|
| 1673 |
-
"entropy": 0.27335043689236044,
|
| 1674 |
-
"epoch": 2.496,
|
| 1675 |
-
"grad_norm": 0.5802608728408813,
|
| 1676 |
-
"learning_rate": 5.0144e-05,
|
| 1677 |
-
"loss": 0.305,
|
| 1678 |
-
"mean_token_accuracy": 0.9049082029610872,
|
| 1679 |
-
"num_tokens": 164151.0,
|
| 1680 |
-
"step": 1560
|
| 1681 |
-
},
|
| 1682 |
-
{
|
| 1683 |
-
"entropy": 0.3443534100428224,
|
| 1684 |
-
"epoch": 2.512,
|
| 1685 |
-
"grad_norm": 0.5400863289833069,
|
| 1686 |
-
"learning_rate": 4.9824e-05,
|
| 1687 |
-
"loss": 0.3053,
|
| 1688 |
-
"mean_token_accuracy": 0.9077682174742222,
|
| 1689 |
-
"num_tokens": 192904.0,
|
| 1690 |
-
"step": 1570
|
| 1691 |
-
},
|
| 1692 |
-
{
|
| 1693 |
-
"entropy": 0.36752058789134023,
|
| 1694 |
-
"epoch": 2.528,
|
| 1695 |
-
"grad_norm": 0.6786855459213257,
|
| 1696 |
-
"learning_rate": 4.9504e-05,
|
| 1697 |
-
"loss": 0.3092,
|
| 1698 |
-
"mean_token_accuracy": 0.9070601720362902,
|
| 1699 |
-
"num_tokens": 215988.0,
|
| 1700 |
-
"step": 1580
|
| 1701 |
-
},
|
| 1702 |
-
{
|
| 1703 |
-
"entropy": 0.3739521996118128,
|
| 1704 |
-
"epoch": 2.544,
|
| 1705 |
-
"grad_norm": 0.821361243724823,
|
| 1706 |
-
"learning_rate": 4.9184e-05,
|
| 1707 |
-
"loss": 0.3238,
|
| 1708 |
-
"mean_token_accuracy": 0.9040915958583355,
|
| 1709 |
-
"num_tokens": 234633.0,
|
| 1710 |
-
"step": 1590
|
| 1711 |
-
},
|
| 1712 |
-
{
|
| 1713 |
-
"entropy": 0.4124453643336892,
|
| 1714 |
-
"epoch": 2.56,
|
| 1715 |
-
"grad_norm": 1.0654460191726685,
|
| 1716 |
-
"learning_rate": 4.8864000000000005e-05,
|
| 1717 |
-
"loss": 0.3293,
|
| 1718 |
-
"mean_token_accuracy": 0.9035760186612606,
|
| 1719 |
-
"num_tokens": 247750.0,
|
| 1720 |
-
"step": 1600
|
| 1721 |
-
},
|
| 1722 |
-
{
|
| 1723 |
-
"epoch": 2.56,
|
| 1724 |
-
"eval_entropy": 0.3808738026022911,
|
| 1725 |
-
"eval_loss": 0.5724619626998901,
|
| 1726 |
-
"eval_mean_token_accuracy": 0.8541958237886429,
|
| 1727 |
-
"eval_num_tokens": 247750.0,
|
| 1728 |
-
"eval_runtime": 980.7136,
|
| 1729 |
-
"eval_samples_per_second": 2.039,
|
| 1730 |
-
"eval_steps_per_second": 0.51,
|
| 1731 |
-
"step": 1600
|
| 1732 |
-
},
|
| 1733 |
-
{
|
| 1734 |
-
"entropy": 0.27413347605615856,
|
| 1735 |
-
"epoch": 2.576,
|
| 1736 |
-
"grad_norm": 0.6262645125389099,
|
| 1737 |
-
"learning_rate": 4.8544e-05,
|
| 1738 |
-
"loss": 0.291,
|
| 1739 |
-
"mean_token_accuracy": 0.909802608937025,
|
| 1740 |
-
"num_tokens": 289137.0,
|
| 1741 |
-
"step": 1610
|
| 1742 |
-
},
|
| 1743 |
-
{
|
| 1744 |
-
"entropy": 0.3372902118600905,
|
| 1745 |
-
"epoch": 2.592,
|
| 1746 |
-
"grad_norm": 0.6019719243049622,
|
| 1747 |
-
"learning_rate": 4.8224000000000004e-05,
|
| 1748 |
-
"loss": 0.3089,
|
| 1749 |
-
"mean_token_accuracy": 0.9065854378044605,
|
| 1750 |
-
"num_tokens": 317789.0,
|
| 1751 |
-
"step": 1620
|
| 1752 |
-
},
|
| 1753 |
-
{
|
| 1754 |
-
"entropy": 0.37745234509930015,
|
| 1755 |
-
"epoch": 2.608,
|
| 1756 |
-
"grad_norm": 0.6852167248725891,
|
| 1757 |
-
"learning_rate": 4.790400000000001e-05,
|
| 1758 |
-
"loss": 0.3237,
|
| 1759 |
-
"mean_token_accuracy": 0.9017773322761059,
|
| 1760 |
-
"num_tokens": 340977.0,
|
| 1761 |
-
"step": 1630
|
| 1762 |
-
},
|
| 1763 |
-
{
|
| 1764 |
-
"entropy": 0.3725322958081961,
|
| 1765 |
-
"epoch": 2.624,
|
| 1766 |
-
"grad_norm": 0.7118895053863525,
|
| 1767 |
-
"learning_rate": 4.7584000000000004e-05,
|
| 1768 |
-
"loss": 0.3207,
|
| 1769 |
-
"mean_token_accuracy": 0.9077424634248018,
|
| 1770 |
-
"num_tokens": 360098.0,
|
| 1771 |
-
"step": 1640
|
| 1772 |
-
},
|
| 1773 |
-
{
|
| 1774 |
-
"entropy": 0.4033573804423213,
|
| 1775 |
-
"epoch": 2.64,
|
| 1776 |
-
"grad_norm": 1.0586738586425781,
|
| 1777 |
-
"learning_rate": 4.7264e-05,
|
| 1778 |
-
"loss": 0.3174,
|
| 1779 |
-
"mean_token_accuracy": 0.9044062152504921,
|
| 1780 |
-
"num_tokens": 373200.0,
|
| 1781 |
-
"step": 1650
|
| 1782 |
-
},
|
| 1783 |
-
{
|
| 1784 |
-
"entropy": 0.2776737127453089,
|
| 1785 |
-
"epoch": 2.656,
|
| 1786 |
-
"grad_norm": 0.6017902493476868,
|
| 1787 |
-
"learning_rate": 4.6944e-05,
|
| 1788 |
-
"loss": 0.2942,
|
| 1789 |
-
"mean_token_accuracy": 0.9093959752470255,
|
| 1790 |
-
"num_tokens": 413938.0,
|
| 1791 |
-
"step": 1660
|
| 1792 |
-
},
|
| 1793 |
-
{
|
| 1794 |
-
"entropy": 0.33967588590458037,
|
| 1795 |
-
"epoch": 2.672,
|
| 1796 |
-
"grad_norm": 0.6162438988685608,
|
| 1797 |
-
"learning_rate": 4.6624e-05,
|
| 1798 |
-
"loss": 0.3075,
|
| 1799 |
-
"mean_token_accuracy": 0.905268831551075,
|
| 1800 |
-
"num_tokens": 442794.0,
|
| 1801 |
-
"step": 1670
|
| 1802 |
-
},
|
| 1803 |
-
{
|
| 1804 |
-
"entropy": 0.37314077839255333,
|
| 1805 |
-
"epoch": 2.6879999999999997,
|
| 1806 |
-
"grad_norm": 0.6455461382865906,
|
| 1807 |
-
"learning_rate": 4.6304e-05,
|
| 1808 |
-
"loss": 0.312,
|
| 1809 |
-
"mean_token_accuracy": 0.9044175367802382,
|
| 1810 |
-
"num_tokens": 465992.0,
|
| 1811 |
-
"step": 1680
|
| 1812 |
-
},
|
| 1813 |
-
{
|
| 1814 |
-
"entropy": 0.3640971322543919,
|
| 1815 |
-
"epoch": 2.7039999999999997,
|
| 1816 |
-
"grad_norm": 0.7681553959846497,
|
| 1817 |
-
"learning_rate": 4.5984000000000006e-05,
|
| 1818 |
-
"loss": 0.3049,
|
| 1819 |
-
"mean_token_accuracy": 0.9096171893179417,
|
| 1820 |
-
"num_tokens": 484580.0,
|
| 1821 |
-
"step": 1690
|
| 1822 |
-
},
|
| 1823 |
-
{
|
| 1824 |
-
"entropy": 0.39063505809754134,
|
| 1825 |
-
"epoch": 2.7199999999999998,
|
| 1826 |
-
"grad_norm": 0.9511684775352478,
|
| 1827 |
-
"learning_rate": 4.5664e-05,
|
| 1828 |
-
"loss": 0.3225,
|
| 1829 |
-
"mean_token_accuracy": 0.9034549340605735,
|
| 1830 |
-
"num_tokens": 497612.0,
|
| 1831 |
-
"step": 1700
|
| 1832 |
-
},
|
| 1833 |
-
{
|
| 1834 |
-
"entropy": 0.2883146867156029,
|
| 1835 |
-
"epoch": 2.7359999999999998,
|
| 1836 |
-
"grad_norm": 0.6692296862602234,
|
| 1837 |
-
"learning_rate": 4.5344000000000005e-05,
|
| 1838 |
-
"loss": 0.2935,
|
| 1839 |
-
"mean_token_accuracy": 0.9078109141439199,
|
| 1840 |
-
"num_tokens": 537755.0,
|
| 1841 |
-
"step": 1710
|
| 1842 |
-
},
|
| 1843 |
-
{
|
| 1844 |
-
"entropy": 0.34244058514013886,
|
| 1845 |
-
"epoch": 2.752,
|
| 1846 |
-
"grad_norm": 0.5983220934867859,
|
| 1847 |
-
"learning_rate": 4.5024e-05,
|
| 1848 |
-
"loss": 0.3076,
|
| 1849 |
-
"mean_token_accuracy": 0.9057810723781585,
|
| 1850 |
-
"num_tokens": 566325.0,
|
| 1851 |
-
"step": 1720
|
| 1852 |
-
},
|
| 1853 |
-
{
|
| 1854 |
-
"entropy": 0.3659200777299702,
|
| 1855 |
-
"epoch": 2.768,
|
| 1856 |
-
"grad_norm": 0.7049655318260193,
|
| 1857 |
-
"learning_rate": 4.4704000000000004e-05,
|
| 1858 |
-
"loss": 0.3059,
|
| 1859 |
-
"mean_token_accuracy": 0.9072589132934809,
|
| 1860 |
-
"num_tokens": 589517.0,
|
| 1861 |
-
"step": 1730
|
| 1862 |
-
},
|
| 1863 |
-
{
|
| 1864 |
-
"entropy": 0.35552563723176717,
|
| 1865 |
-
"epoch": 2.784,
|
| 1866 |
-
"grad_norm": 0.7242270112037659,
|
| 1867 |
-
"learning_rate": 4.4384e-05,
|
| 1868 |
-
"loss": 0.3013,
|
| 1869 |
-
"mean_token_accuracy": 0.912841784581542,
|
| 1870 |
-
"num_tokens": 608224.0,
|
| 1871 |
-
"step": 1740
|
| 1872 |
-
},
|
| 1873 |
-
{
|
| 1874 |
-
"entropy": 0.4027377144433558,
|
| 1875 |
-
"epoch": 2.8,
|
| 1876 |
-
"grad_norm": 1.5430299043655396,
|
| 1877 |
-
"learning_rate": 4.4064e-05,
|
| 1878 |
-
"loss": 0.3223,
|
| 1879 |
-
"mean_token_accuracy": 0.9028574671596289,
|
| 1880 |
-
"num_tokens": 621051.0,
|
| 1881 |
-
"step": 1750
|
| 1882 |
-
},
|
| 1883 |
-
{
|
| 1884 |
-
"entropy": 0.2703737439122051,
|
| 1885 |
-
"epoch": 2.816,
|
| 1886 |
-
"grad_norm": 0.7151817083358765,
|
| 1887 |
-
"learning_rate": 4.3744e-05,
|
| 1888 |
-
"loss": 0.2894,
|
| 1889 |
-
"mean_token_accuracy": 0.9102732315659523,
|
| 1890 |
-
"num_tokens": 662133.0,
|
| 1891 |
-
"step": 1760
|
| 1892 |
-
},
|
| 1893 |
-
{
|
| 1894 |
-
"entropy": 0.32695954395458104,
|
| 1895 |
-
"epoch": 2.832,
|
| 1896 |
-
"grad_norm": 0.6097021698951721,
|
| 1897 |
-
"learning_rate": 4.3424e-05,
|
| 1898 |
-
"loss": 0.2967,
|
| 1899 |
-
"mean_token_accuracy": 0.9080837737768889,
|
| 1900 |
-
"num_tokens": 690682.0,
|
| 1901 |
-
"step": 1770
|
| 1902 |
-
},
|
| 1903 |
-
{
|
| 1904 |
-
"entropy": 0.36010922444984317,
|
| 1905 |
-
"epoch": 2.848,
|
| 1906 |
-
"grad_norm": 0.7698465585708618,
|
| 1907 |
-
"learning_rate": 4.3104e-05,
|
| 1908 |
-
"loss": 0.3064,
|
| 1909 |
-
"mean_token_accuracy": 0.9076121047139167,
|
| 1910 |
-
"num_tokens": 713519.0,
|
| 1911 |
-
"step": 1780
|
| 1912 |
-
},
|
| 1913 |
-
{
|
| 1914 |
-
"entropy": 0.369490017183125,
|
| 1915 |
-
"epoch": 2.864,
|
| 1916 |
-
"grad_norm": 0.997474730014801,
|
| 1917 |
-
"learning_rate": 4.2784e-05,
|
| 1918 |
-
"loss": 0.3153,
|
| 1919 |
-
"mean_token_accuracy": 0.9070124924182892,
|
| 1920 |
-
"num_tokens": 731712.0,
|
| 1921 |
-
"step": 1790
|
| 1922 |
-
},
|
| 1923 |
-
{
|
| 1924 |
-
"entropy": 0.41184745989739896,
|
| 1925 |
-
"epoch": 2.88,
|
| 1926 |
-
"grad_norm": 0.9906476736068726,
|
| 1927 |
-
"learning_rate": 4.2464000000000005e-05,
|
| 1928 |
-
"loss": 0.3325,
|
| 1929 |
-
"mean_token_accuracy": 0.9020481187850237,
|
| 1930 |
-
"num_tokens": 744149.0,
|
| 1931 |
-
"step": 1800
|
| 1932 |
-
},
|
| 1933 |
-
{
|
| 1934 |
-
"entropy": 0.28201086847111584,
|
| 1935 |
-
"epoch": 2.896,
|
| 1936 |
-
"grad_norm": 0.6134458184242249,
|
| 1937 |
-
"learning_rate": 4.2144e-05,
|
| 1938 |
-
"loss": 0.2988,
|
| 1939 |
-
"mean_token_accuracy": 0.9069436389952898,
|
| 1940 |
-
"num_tokens": 782193.0,
|
| 1941 |
-
"step": 1810
|
| 1942 |
-
},
|
| 1943 |
-
{
|
| 1944 |
-
"entropy": 0.33303718706592916,
|
| 1945 |
-
"epoch": 2.912,
|
| 1946 |
-
"grad_norm": 0.6062189936637878,
|
| 1947 |
-
"learning_rate": 4.1824000000000005e-05,
|
| 1948 |
-
"loss": 0.3086,
|
| 1949 |
-
"mean_token_accuracy": 0.9056244477629661,
|
| 1950 |
-
"num_tokens": 809927.0,
|
| 1951 |
-
"step": 1820
|
| 1952 |
-
},
|
| 1953 |
-
{
|
| 1954 |
-
"entropy": 0.3643056120723486,
|
| 1955 |
-
"epoch": 2.928,
|
| 1956 |
-
"grad_norm": 0.6338886618614197,
|
| 1957 |
-
"learning_rate": 4.1504e-05,
|
| 1958 |
-
"loss": 0.3035,
|
| 1959 |
-
"mean_token_accuracy": 0.911867779865861,
|
| 1960 |
-
"num_tokens": 832745.0,
|
| 1961 |
-
"step": 1830
|
| 1962 |
-
},
|
| 1963 |
-
{
|
| 1964 |
-
"entropy": 0.35973973935469983,
|
| 1965 |
-
"epoch": 2.944,
|
| 1966 |
-
"grad_norm": 0.8483228087425232,
|
| 1967 |
-
"learning_rate": 4.1184e-05,
|
| 1968 |
-
"loss": 0.3084,
|
| 1969 |
-
"mean_token_accuracy": 0.9093430683016777,
|
| 1970 |
-
"num_tokens": 851193.0,
|
| 1971 |
-
"step": 1840
|
| 1972 |
-
},
|
| 1973 |
-
{
|
| 1974 |
-
"entropy": 0.4053435407578945,
|
| 1975 |
-
"epoch": 2.96,
|
| 1976 |
-
"grad_norm": 0.9516308903694153,
|
| 1977 |
-
"learning_rate": 4.0864e-05,
|
| 1978 |
-
"loss": 0.332,
|
| 1979 |
-
"mean_token_accuracy": 0.8999160658568144,
|
| 1980 |
-
"num_tokens": 863867.0,
|
| 1981 |
-
"step": 1850
|
| 1982 |
-
},
|
| 1983 |
-
{
|
| 1984 |
-
"entropy": 0.2989065528847277,
|
| 1985 |
-
"epoch": 2.976,
|
| 1986 |
-
"grad_norm": 0.6929520964622498,
|
| 1987 |
-
"learning_rate": 4.0544000000000003e-05,
|
| 1988 |
-
"loss": 0.2943,
|
| 1989 |
-
"mean_token_accuracy": 0.9087879080325365,
|
| 1990 |
-
"num_tokens": 898118.0,
|
| 1991 |
-
"step": 1860
|
| 1992 |
-
},
|
| 1993 |
-
{
|
| 1994 |
-
"entropy": 0.3597102670930326,
|
| 1995 |
-
"epoch": 2.992,
|
| 1996 |
-
"grad_norm": 0.7972533106803894,
|
| 1997 |
-
"learning_rate": 4.0224e-05,
|
| 1998 |
-
"loss": 0.3215,
|
| 1999 |
-
"mean_token_accuracy": 0.902438759058714,
|
| 2000 |
-
"num_tokens": 918026.0,
|
| 2001 |
-
"step": 1870
|
| 2002 |
-
},
|
| 2003 |
-
{
|
| 2004 |
-
"entropy": 0.3693191984202713,
|
| 2005 |
-
"epoch": 3.008,
|
| 2006 |
-
"grad_norm": 0.4952141344547272,
|
| 2007 |
-
"learning_rate": 3.9904e-05,
|
| 2008 |
-
"loss": 0.3109,
|
| 2009 |
-
"mean_token_accuracy": 0.9047053713351488,
|
| 2010 |
-
"num_tokens": 946468.0,
|
| 2011 |
-
"step": 1880
|
| 2012 |
-
},
|
| 2013 |
-
{
|
| 2014 |
-
"entropy": 0.30884325662627815,
|
| 2015 |
-
"epoch": 3.024,
|
| 2016 |
-
"grad_norm": 0.6402750015258789,
|
| 2017 |
-
"learning_rate": 3.9584000000000006e-05,
|
| 2018 |
-
"loss": 0.287,
|
| 2019 |
-
"mean_token_accuracy": 0.9127614002674818,
|
| 2020 |
-
"num_tokens": 978498.0,
|
| 2021 |
-
"step": 1890
|
| 2022 |
-
},
|
| 2023 |
-
{
|
| 2024 |
-
"entropy": 0.3251019007526338,
|
| 2025 |
-
"epoch": 3.04,
|
| 2026 |
-
"grad_norm": 0.7701610326766968,
|
| 2027 |
-
"learning_rate": 3.9264e-05,
|
| 2028 |
-
"loss": 0.3012,
|
| 2029 |
-
"mean_token_accuracy": 0.9117080509662628,
|
| 2030 |
-
"num_tokens": 1004128.0,
|
| 2031 |
-
"step": 1900
|
| 2032 |
-
},
|
| 2033 |
-
{
|
| 2034 |
-
"entropy": 0.3512966329231858,
|
| 2035 |
-
"epoch": 3.056,
|
| 2036 |
-
"grad_norm": 0.934260368347168,
|
| 2037 |
-
"learning_rate": 3.8944000000000005e-05,
|
| 2038 |
-
"loss": 0.2996,
|
| 2039 |
-
"mean_token_accuracy": 0.9139776781201363,
|
| 2040 |
-
"num_tokens": 1025136.0,
|
| 2041 |
-
"step": 1910
|
| 2042 |
-
},
|
| 2043 |
-
{
|
| 2044 |
-
"entropy": 0.36649829614907503,
|
| 2045 |
-
"epoch": 3.072,
|
| 2046 |
-
"grad_norm": 1.147735357284546,
|
| 2047 |
-
"learning_rate": 3.8624e-05,
|
| 2048 |
-
"loss": 0.3172,
|
| 2049 |
-
"mean_token_accuracy": 0.90965236723423,
|
| 2050 |
-
"num_tokens": 1041157.0,
|
| 2051 |
-
"step": 1920
|
| 2052 |
-
},
|
| 2053 |
-
{
|
| 2054 |
-
"entropy": 0.33526935083791615,
|
| 2055 |
-
"epoch": 3.088,
|
| 2056 |
-
"grad_norm": 0.6278552412986755,
|
| 2057 |
-
"learning_rate": 3.8304e-05,
|
| 2058 |
-
"loss": 0.294,
|
| 2059 |
-
"mean_token_accuracy": 0.914416927471757,
|
| 2060 |
-
"num_tokens": 1069401.0,
|
| 2061 |
-
"step": 1930
|
| 2062 |
-
},
|
| 2063 |
-
{
|
| 2064 |
-
"entropy": 0.2916401638649404,
|
| 2065 |
-
"epoch": 3.104,
|
| 2066 |
-
"grad_norm": 0.7106419205665588,
|
| 2067 |
-
"learning_rate": 3.7984e-05,
|
| 2068 |
-
"loss": 0.2833,
|
| 2069 |
-
"mean_token_accuracy": 0.9128728475421667,
|
| 2070 |
-
"num_tokens": 1101705.0,
|
| 2071 |
-
"step": 1940
|
| 2072 |
-
},
|
| 2073 |
-
{
|
| 2074 |
-
"entropy": 0.31783650666475294,
|
| 2075 |
-
"epoch": 3.12,
|
| 2076 |
-
"grad_norm": 0.6372864246368408,
|
| 2077 |
-
"learning_rate": 3.7664e-05,
|
| 2078 |
-
"loss": 0.2808,
|
| 2079 |
-
"mean_token_accuracy": 0.9190873377025127,
|
| 2080 |
-
"num_tokens": 1127173.0,
|
| 2081 |
-
"step": 1950
|
| 2082 |
-
},
|
| 2083 |
-
{
|
| 2084 |
-
"entropy": 0.33883463945239783,
|
| 2085 |
-
"epoch": 3.136,
|
| 2086 |
-
"grad_norm": 0.7593994736671448,
|
| 2087 |
-
"learning_rate": 3.7344e-05,
|
| 2088 |
-
"loss": 0.2932,
|
| 2089 |
-
"mean_token_accuracy": 0.9133320480585099,
|
| 2090 |
-
"num_tokens": 1147878.0,
|
| 2091 |
-
"step": 1960
|
| 2092 |
-
},
|
| 2093 |
-
{
|
| 2094 |
-
"entropy": 0.36267717741429806,
|
| 2095 |
-
"epoch": 3.152,
|
| 2096 |
-
"grad_norm": 0.9578737616539001,
|
| 2097 |
-
"learning_rate": 3.7024e-05,
|
| 2098 |
-
"loss": 0.3018,
|
| 2099 |
-
"mean_token_accuracy": 0.9135202784091234,
|
| 2100 |
-
"num_tokens": 1164084.0,
|
| 2101 |
-
"step": 1970
|
| 2102 |
-
},
|
| 2103 |
-
{
|
| 2104 |
-
"entropy": 0.33903956757858394,
|
| 2105 |
-
"epoch": 3.168,
|
| 2106 |
-
"grad_norm": 0.5553727746009827,
|
| 2107 |
-
"learning_rate": 3.6704e-05,
|
| 2108 |
-
"loss": 0.2962,
|
| 2109 |
-
"mean_token_accuracy": 0.9128197953104973,
|
| 2110 |
-
"num_tokens": 1192486.0,
|
| 2111 |
-
"step": 1980
|
| 2112 |
-
},
|
| 2113 |
-
{
|
| 2114 |
-
"entropy": 0.2897605660371482,
|
| 2115 |
-
"epoch": 3.184,
|
| 2116 |
-
"grad_norm": 0.7067289352416992,
|
| 2117 |
-
"learning_rate": 3.6384e-05,
|
| 2118 |
-
"loss": 0.2867,
|
| 2119 |
-
"mean_token_accuracy": 0.9137052699923516,
|
| 2120 |
-
"num_tokens": 1224540.0,
|
| 2121 |
-
"step": 1990
|
| 2122 |
-
},
|
| 2123 |
-
{
|
| 2124 |
-
"entropy": 0.32448912151157855,
|
| 2125 |
-
"epoch": 3.2,
|
| 2126 |
-
"grad_norm": 0.7603920102119446,
|
| 2127 |
-
"learning_rate": 3.6064000000000006e-05,
|
| 2128 |
-
"loss": 0.2908,
|
| 2129 |
-
"mean_token_accuracy": 0.9150090869516134,
|
| 2130 |
-
"num_tokens": 1249827.0,
|
| 2131 |
-
"step": 2000
|
| 2132 |
-
},
|
| 2133 |
-
{
|
| 2134 |
-
"epoch": 3.2,
|
| 2135 |
-
"eval_entropy": 0.4150727687478066,
|
| 2136 |
-
"eval_loss": 0.5455561280250549,
|
| 2137 |
-
"eval_mean_token_accuracy": 0.857409807562828,
|
| 2138 |
-
"eval_num_tokens": 1249827.0,
|
| 2139 |
-
"eval_runtime": 982.2461,
|
| 2140 |
-
"eval_samples_per_second": 2.036,
|
| 2141 |
-
"eval_steps_per_second": 0.509,
|
| 2142 |
-
"step": 2000
|
| 2143 |
-
},
|
| 2144 |
-
{
|
| 2145 |
-
"entropy": 0.3617474281229079,
|
| 2146 |
-
"epoch": 3.216,
|
| 2147 |
-
"grad_norm": 0.7705036997795105,
|
| 2148 |
-
"learning_rate": 3.5744e-05,
|
| 2149 |
-
"loss": 0.3175,
|
| 2150 |
-
"mean_token_accuracy": 0.9062783475965261,
|
| 2151 |
-
"num_tokens": 20779.0,
|
| 2152 |
-
"step": 2010
|
| 2153 |
-
},
|
| 2154 |
-
{
|
| 2155 |
-
"entropy": 0.3887558562681079,
|
| 2156 |
-
"epoch": 3.232,
|
| 2157 |
-
"grad_norm": 0.9926668405532837,
|
| 2158 |
-
"learning_rate": 3.5424e-05,
|
| 2159 |
-
"loss": 0.3243,
|
| 2160 |
-
"mean_token_accuracy": 0.9048940639942884,
|
| 2161 |
-
"num_tokens": 37039.0,
|
| 2162 |
-
"step": 2020
|
| 2163 |
-
},
|
| 2164 |
-
{
|
| 2165 |
-
"entropy": 0.36308987056836484,
|
| 2166 |
-
"epoch": 3.248,
|
| 2167 |
-
"grad_norm": 0.5336251258850098,
|
| 2168 |
-
"learning_rate": 3.5104e-05,
|
| 2169 |
-
"loss": 0.3286,
|
| 2170 |
-
"mean_token_accuracy": 0.9028704173862934,
|
| 2171 |
-
"num_tokens": 66230.0,
|
| 2172 |
-
"step": 2030
|
| 2173 |
-
},
|
| 2174 |
-
{
|
| 2175 |
-
"entropy": 0.3100855226628482,
|
| 2176 |
-
"epoch": 3.2640000000000002,
|
| 2177 |
-
"grad_norm": 0.6235008239746094,
|
| 2178 |
-
"learning_rate": 3.4784e-05,
|
| 2179 |
-
"loss": 0.3026,
|
| 2180 |
-
"mean_token_accuracy": 0.9074051853269338,
|
| 2181 |
-
"num_tokens": 98315.0,
|
| 2182 |
-
"step": 2040
|
| 2183 |
-
},
|
| 2184 |
-
{
|
| 2185 |
-
"entropy": 0.33463340234011413,
|
| 2186 |
-
"epoch": 3.2800000000000002,
|
| 2187 |
-
"grad_norm": 0.6380220651626587,
|
| 2188 |
-
"learning_rate": 3.4464e-05,
|
| 2189 |
-
"loss": 0.3058,
|
| 2190 |
-
"mean_token_accuracy": 0.9115277793258428,
|
| 2191 |
-
"num_tokens": 123538.0,
|
| 2192 |
-
"step": 2050
|
| 2193 |
-
},
|
| 2194 |
-
{
|
| 2195 |
-
"entropy": 0.3619419479742646,
|
| 2196 |
-
"epoch": 3.296,
|
| 2197 |
-
"grad_norm": 0.7604582905769348,
|
| 2198 |
-
"learning_rate": 3.4144000000000004e-05,
|
| 2199 |
-
"loss": 0.3112,
|
| 2200 |
-
"mean_token_accuracy": 0.9084025923162699,
|
| 2201 |
-
"num_tokens": 143855.0,
|
| 2202 |
-
"step": 2060
|
| 2203 |
-
},
|
| 2204 |
-
{
|
| 2205 |
-
"entropy": 0.3980453579686582,
|
| 2206 |
-
"epoch": 3.312,
|
| 2207 |
-
"grad_norm": 0.8576037883758545,
|
| 2208 |
-
"learning_rate": 3.3824e-05,
|
| 2209 |
-
"loss": 0.3267,
|
| 2210 |
-
"mean_token_accuracy": 0.9037791218608617,
|
| 2211 |
-
"num_tokens": 159314.0,
|
| 2212 |
-
"step": 2070
|
| 2213 |
-
},
|
| 2214 |
-
{
|
| 2215 |
-
"entropy": 0.35077386572957037,
|
| 2216 |
-
"epoch": 3.328,
|
| 2217 |
-
"grad_norm": 0.5504621863365173,
|
| 2218 |
-
"learning_rate": 3.3504e-05,
|
| 2219 |
-
"loss": 0.3004,
|
| 2220 |
-
"mean_token_accuracy": 0.9084354028105736,
|
| 2221 |
-
"num_tokens": 187464.0,
|
| 2222 |
-
"step": 2080
|
| 2223 |
-
},
|
| 2224 |
-
{
|
| 2225 |
-
"entropy": 0.28209723997861147,
|
| 2226 |
-
"epoch": 3.344,
|
| 2227 |
-
"grad_norm": 0.8361979126930237,
|
| 2228 |
-
"learning_rate": 3.3184000000000006e-05,
|
| 2229 |
-
"loss": 0.2903,
|
| 2230 |
-
"mean_token_accuracy": 0.9112230580300092,
|
| 2231 |
-
"num_tokens": 219657.0,
|
| 2232 |
-
"step": 2090
|
| 2233 |
-
},
|
| 2234 |
-
{
|
| 2235 |
-
"entropy": 0.3153431011363864,
|
| 2236 |
-
"epoch": 3.36,
|
| 2237 |
-
"grad_norm": 0.6275749802589417,
|
| 2238 |
-
"learning_rate": 3.2864e-05,
|
| 2239 |
-
"loss": 0.2894,
|
| 2240 |
-
"mean_token_accuracy": 0.9114996068179607,
|
| 2241 |
-
"num_tokens": 245396.0,
|
| 2242 |
-
"step": 2100
|
| 2243 |
-
},
|
| 2244 |
-
{
|
| 2245 |
-
"epoch": 3.36,
|
| 2246 |
-
"eval_accuracy": 0.026501569905019107,
|
| 2247 |
-
"eval_entropy": 0.4113759865760803,
|
| 2248 |
-
"eval_loss": 0.541074275970459,
|
| 2249 |
-
"eval_mean_token_accuracy": 0.8583663606643677,
|
| 2250 |
-
"eval_num_tokens": 245396.0,
|
| 2251 |
-
"eval_runtime": 869.6626,
|
| 2252 |
-
"eval_samples_per_second": 2.3,
|
| 2253 |
-
"eval_steps_per_second": 0.575,
|
| 2254 |
-
"step": 2100
|
| 2255 |
-
},
|
| 2256 |
-
{
|
| 2257 |
-
"entropy": 0.3517730229534209,
|
| 2258 |
-
"epoch": 3.376,
|
| 2259 |
-
"grad_norm": 0.6908054947853088,
|
| 2260 |
-
"learning_rate": 3.2544000000000006e-05,
|
| 2261 |
-
"loss": 0.3057,
|
| 2262 |
-
"mean_token_accuracy": 0.9103573642671108,
|
| 2263 |
-
"num_tokens": 266432.0,
|
| 2264 |
-
"step": 2110
|
| 2265 |
-
},
|
| 2266 |
-
{
|
| 2267 |
-
"entropy": 0.38618900515139104,
|
| 2268 |
-
"epoch": 3.392,
|
| 2269 |
-
"grad_norm": 0.9056383967399597,
|
| 2270 |
-
"learning_rate": 3.2224e-05,
|
| 2271 |
-
"loss": 0.3188,
|
| 2272 |
-
"mean_token_accuracy": 0.9076898027211427,
|
| 2273 |
-
"num_tokens": 282655.0,
|
| 2274 |
-
"step": 2120
|
| 2275 |
-
},
|
| 2276 |
-
{
|
| 2277 |
-
"entropy": 0.3537537831813097,
|
| 2278 |
-
"epoch": 3.408,
|
| 2279 |
-
"grad_norm": 0.48644715547561646,
|
| 2280 |
-
"learning_rate": 3.1904e-05,
|
| 2281 |
-
"loss": 0.2886,
|
| 2282 |
-
"mean_token_accuracy": 0.9162093725055456,
|
| 2283 |
-
"num_tokens": 310801.0,
|
| 2284 |
-
"step": 2130
|
| 2285 |
-
},
|
| 2286 |
-
{
|
| 2287 |
-
"entropy": 0.26729877749457953,
|
| 2288 |
-
"epoch": 3.424,
|
| 2289 |
-
"grad_norm": 0.6074755787849426,
|
| 2290 |
-
"learning_rate": 3.1584e-05,
|
| 2291 |
-
"loss": 0.2371,
|
| 2292 |
-
"mean_token_accuracy": 0.9263024788349867,
|
| 2293 |
-
"num_tokens": 343555.0,
|
| 2294 |
-
"step": 2140
|
| 2295 |
-
},
|
| 2296 |
-
{
|
| 2297 |
-
"entropy": 0.25955253606662154,
|
| 2298 |
-
"epoch": 3.44,
|
| 2299 |
-
"grad_norm": 0.8773949146270752,
|
| 2300 |
-
"learning_rate": 3.1264e-05,
|
| 2301 |
-
"loss": 0.2227,
|
| 2302 |
-
"mean_token_accuracy": 0.9337353933602571,
|
| 2303 |
-
"num_tokens": 369134.0,
|
| 2304 |
-
"step": 2150
|
| 2305 |
-
},
|
| 2306 |
-
{
|
| 2307 |
-
"entropy": 0.27338800597935914,
|
| 2308 |
-
"epoch": 3.456,
|
| 2309 |
-
"grad_norm": 0.7504522204399109,
|
| 2310 |
-
"learning_rate": 3.0975999999999996e-05,
|
| 2311 |
-
"loss": 0.2261,
|
| 2312 |
-
"mean_token_accuracy": 0.9332862004637719,
|
| 2313 |
-
"num_tokens": 390152.0,
|
| 2314 |
-
"step": 2160
|
| 2315 |
-
},
|
| 2316 |
-
{
|
| 2317 |
-
"entropy": 0.30181694105267526,
|
| 2318 |
-
"epoch": 3.472,
|
| 2319 |
-
"grad_norm": 0.8649200201034546,
|
| 2320 |
-
"learning_rate": 3.0656e-05,
|
| 2321 |
-
"loss": 0.2289,
|
| 2322 |
-
"mean_token_accuracy": 0.9334215141832829,
|
| 2323 |
-
"num_tokens": 406222.0,
|
| 2324 |
-
"step": 2170
|
| 2325 |
-
},
|
| 2326 |
-
{
|
| 2327 |
-
"entropy": 0.28406244921498003,
|
| 2328 |
-
"epoch": 3.488,
|
| 2329 |
-
"grad_norm": 1.9269925355911255,
|
| 2330 |
-
"learning_rate": 3.0336000000000002e-05,
|
| 2331 |
-
"loss": 0.2353,
|
| 2332 |
-
"mean_token_accuracy": 0.9303826864808797,
|
| 2333 |
-
"num_tokens": 434767.0,
|
| 2334 |
-
"step": 2180
|
| 2335 |
-
},
|
| 2336 |
-
{
|
| 2337 |
-
"entropy": 0.2358154426328838,
|
| 2338 |
-
"epoch": 3.504,
|
| 2339 |
-
"grad_norm": 0.7775760293006897,
|
| 2340 |
-
"learning_rate": 3.0016e-05,
|
| 2341 |
-
"loss": 0.2277,
|
| 2342 |
-
"mean_token_accuracy": 0.9293628957122564,
|
| 2343 |
-
"num_tokens": 467498.0,
|
| 2344 |
-
"step": 2190
|
| 2345 |
-
},
|
| 2346 |
-
{
|
| 2347 |
-
"entropy": 0.2596265008673072,
|
| 2348 |
-
"epoch": 3.52,
|
| 2349 |
-
"grad_norm": 0.7286163568496704,
|
| 2350 |
-
"learning_rate": 2.9696e-05,
|
| 2351 |
-
"loss": 0.2266,
|
| 2352 |
-
"mean_token_accuracy": 0.9321592267602682,
|
| 2353 |
-
"num_tokens": 493146.0,
|
| 2354 |
-
"step": 2200
|
| 2355 |
-
},
|
| 2356 |
-
{
|
| 2357 |
-
"entropy": 0.28550293026492,
|
| 2358 |
-
"epoch": 3.536,
|
| 2359 |
-
"grad_norm": 0.7693914175033569,
|
| 2360 |
-
"learning_rate": 2.9376000000000005e-05,
|
| 2361 |
-
"loss": 0.2291,
|
| 2362 |
-
"mean_token_accuracy": 0.9351058643311262,
|
| 2363 |
-
"num_tokens": 513926.0,
|
| 2364 |
-
"step": 2210
|
| 2365 |
-
},
|
| 2366 |
-
{
|
| 2367 |
-
"entropy": 0.2885140863247216,
|
| 2368 |
-
"epoch": 3.552,
|
| 2369 |
-
"grad_norm": 1.1927505731582642,
|
| 2370 |
-
"learning_rate": 2.9056e-05,
|
| 2371 |
-
"loss": 0.219,
|
| 2372 |
-
"mean_token_accuracy": 0.9396381825208664,
|
| 2373 |
-
"num_tokens": 530263.0,
|
| 2374 |
-
"step": 2220
|
| 2375 |
-
},
|
| 2376 |
-
{
|
| 2377 |
-
"entropy": 0.283741835039109,
|
| 2378 |
-
"epoch": 3.568,
|
| 2379 |
-
"grad_norm": 0.6537899971008301,
|
| 2380 |
-
"learning_rate": 2.8736e-05,
|
| 2381 |
-
"loss": 0.2324,
|
| 2382 |
-
"mean_token_accuracy": 0.9302929677069187,
|
| 2383 |
-
"num_tokens": 559791.0,
|
| 2384 |
-
"step": 2230
|
| 2385 |
-
},
|
| 2386 |
-
{
|
| 2387 |
-
"entropy": 0.2369093818590045,
|
| 2388 |
-
"epoch": 3.584,
|
| 2389 |
-
"grad_norm": 0.793480396270752,
|
| 2390 |
-
"learning_rate": 2.8416000000000004e-05,
|
| 2391 |
-
"loss": 0.2165,
|
| 2392 |
-
"mean_token_accuracy": 0.9320364937186241,
|
| 2393 |
-
"num_tokens": 592398.0,
|
| 2394 |
-
"step": 2240
|
| 2395 |
-
},
|
| 2396 |
-
{
|
| 2397 |
-
"entropy": 0.264733817987144,
|
| 2398 |
-
"epoch": 3.6,
|
| 2399 |
-
"grad_norm": 0.7945203185081482,
|
| 2400 |
-
"learning_rate": 2.8096e-05,
|
| 2401 |
-
"loss": 0.2337,
|
| 2402 |
-
"mean_token_accuracy": 0.9294226188212633,
|
| 2403 |
-
"num_tokens": 617982.0,
|
| 2404 |
-
"step": 2250
|
| 2405 |
-
},
|
| 2406 |
-
{
|
| 2407 |
-
"entropy": 0.2889886857941747,
|
| 2408 |
-
"epoch": 3.616,
|
| 2409 |
-
"grad_norm": 0.7558261752128601,
|
| 2410 |
-
"learning_rate": 2.7776000000000003e-05,
|
| 2411 |
-
"loss": 0.2305,
|
| 2412 |
-
"mean_token_accuracy": 0.9317790925502777,
|
| 2413 |
-
"num_tokens": 639115.0,
|
| 2414 |
-
"step": 2260
|
| 2415 |
-
},
|
| 2416 |
-
{
|
| 2417 |
-
"entropy": 0.28708559228107333,
|
| 2418 |
-
"epoch": 3.632,
|
| 2419 |
-
"grad_norm": 0.6877163648605347,
|
| 2420 |
-
"learning_rate": 2.7456000000000003e-05,
|
| 2421 |
-
"loss": 0.2215,
|
| 2422 |
-
"mean_token_accuracy": 0.9357377961277962,
|
| 2423 |
-
"num_tokens": 655709.0,
|
| 2424 |
-
"step": 2270
|
| 2425 |
-
},
|
| 2426 |
-
{
|
| 2427 |
-
"entropy": 0.28660596534609795,
|
| 2428 |
-
"epoch": 3.648,
|
| 2429 |
-
"grad_norm": 0.6599491238594055,
|
| 2430 |
-
"learning_rate": 2.7136e-05,
|
| 2431 |
-
"loss": 0.2363,
|
| 2432 |
-
"mean_token_accuracy": 0.928611570596695,
|
| 2433 |
-
"num_tokens": 684500.0,
|
| 2434 |
-
"step": 2280
|
| 2435 |
-
},
|
| 2436 |
-
{
|
| 2437 |
-
"entropy": 0.23836621949449183,
|
| 2438 |
-
"epoch": 3.664,
|
| 2439 |
-
"grad_norm": 0.7436323165893555,
|
| 2440 |
-
"learning_rate": 2.6816000000000002e-05,
|
| 2441 |
-
"loss": 0.2194,
|
| 2442 |
-
"mean_token_accuracy": 0.9314162913709879,
|
| 2443 |
-
"num_tokens": 717271.0,
|
| 2444 |
-
"step": 2290
|
| 2445 |
-
},
|
| 2446 |
-
{
|
| 2447 |
-
"entropy": 0.27099227644503115,
|
| 2448 |
-
"epoch": 3.68,
|
| 2449 |
-
"grad_norm": 0.7519745826721191,
|
| 2450 |
-
"learning_rate": 2.6496e-05,
|
| 2451 |
-
"loss": 0.2369,
|
| 2452 |
-
"mean_token_accuracy": 0.9278060872107744,
|
| 2453 |
-
"num_tokens": 743068.0,
|
| 2454 |
-
"step": 2300
|
| 2455 |
-
},
|
| 2456 |
-
{
|
| 2457 |
-
"entropy": 0.282380092702806,
|
| 2458 |
-
"epoch": 3.6959999999999997,
|
| 2459 |
-
"grad_norm": 0.7645207643508911,
|
| 2460 |
-
"learning_rate": 2.6176e-05,
|
| 2461 |
-
"loss": 0.2175,
|
| 2462 |
-
"mean_token_accuracy": 0.9372334524989128,
|
| 2463 |
-
"num_tokens": 763925.0,
|
| 2464 |
-
"step": 2310
|
| 2465 |
-
},
|
| 2466 |
-
{
|
| 2467 |
-
"entropy": 0.2850790939293802,
|
| 2468 |
-
"epoch": 3.7119999999999997,
|
| 2469 |
-
"grad_norm": 0.9016556143760681,
|
| 2470 |
-
"learning_rate": 2.5856e-05,
|
| 2471 |
-
"loss": 0.217,
|
| 2472 |
-
"mean_token_accuracy": 0.9392455574125051,
|
| 2473 |
-
"num_tokens": 780111.0,
|
| 2474 |
-
"step": 2320
|
| 2475 |
-
},
|
| 2476 |
-
{
|
| 2477 |
-
"entropy": 0.2691464308649302,
|
| 2478 |
-
"epoch": 3.7279999999999998,
|
| 2479 |
-
"grad_norm": 0.77091383934021,
|
| 2480 |
-
"learning_rate": 2.5535999999999997e-05,
|
| 2481 |
-
"loss": 0.2334,
|
| 2482 |
-
"mean_token_accuracy": 0.929338139295578,
|
| 2483 |
-
"num_tokens": 808661.0,
|
| 2484 |
-
"step": 2330
|
| 2485 |
-
},
|
| 2486 |
-
{
|
| 2487 |
-
"entropy": 0.2395469973795116,
|
| 2488 |
-
"epoch": 3.7439999999999998,
|
| 2489 |
-
"grad_norm": 0.7632396221160889,
|
| 2490 |
-
"learning_rate": 2.5216e-05,
|
| 2491 |
-
"loss": 0.2148,
|
| 2492 |
-
"mean_token_accuracy": 0.9322273649275303,
|
| 2493 |
-
"num_tokens": 840932.0,
|
| 2494 |
-
"step": 2340
|
| 2495 |
-
},
|
| 2496 |
-
{
|
| 2497 |
-
"entropy": 0.2645680231973529,
|
| 2498 |
-
"epoch": 3.76,
|
| 2499 |
-
"grad_norm": 0.819273054599762,
|
| 2500 |
-
"learning_rate": 2.4896e-05,
|
| 2501 |
-
"loss": 0.226,
|
| 2502 |
-
"mean_token_accuracy": 0.930556321516633,
|
| 2503 |
-
"num_tokens": 866564.0,
|
| 2504 |
-
"step": 2350
|
| 2505 |
-
},
|
| 2506 |
-
{
|
| 2507 |
-
"entropy": 0.2808503101579845,
|
| 2508 |
-
"epoch": 3.776,
|
| 2509 |
-
"grad_norm": 0.8598120808601379,
|
| 2510 |
-
"learning_rate": 2.4576000000000003e-05,
|
| 2511 |
-
"loss": 0.2215,
|
| 2512 |
-
"mean_token_accuracy": 0.9356644533574581,
|
| 2513 |
-
"num_tokens": 887527.0,
|
| 2514 |
-
"step": 2360
|
| 2515 |
-
},
|
| 2516 |
-
{
|
| 2517 |
-
"entropy": 0.28694011168554423,
|
| 2518 |
-
"epoch": 3.792,
|
| 2519 |
-
"grad_norm": 1.0404748916625977,
|
| 2520 |
-
"learning_rate": 2.4256e-05,
|
| 2521 |
-
"loss": 0.214,
|
| 2522 |
-
"mean_token_accuracy": 0.9388030290603637,
|
| 2523 |
-
"num_tokens": 903688.0,
|
| 2524 |
-
"step": 2370
|
| 2525 |
-
},
|
| 2526 |
-
{
|
| 2527 |
-
"entropy": 0.2774578414391726,
|
| 2528 |
-
"epoch": 3.808,
|
| 2529 |
-
"grad_norm": 1.2308194637298584,
|
| 2530 |
-
"learning_rate": 2.3936e-05,
|
| 2531 |
-
"loss": 0.2328,
|
| 2532 |
-
"mean_token_accuracy": 0.929581755027175,
|
| 2533 |
-
"num_tokens": 932975.0,
|
| 2534 |
-
"step": 2380
|
| 2535 |
-
},
|
| 2536 |
-
{
|
| 2537 |
-
"entropy": 0.2381771973334253,
|
| 2538 |
-
"epoch": 3.824,
|
| 2539 |
-
"grad_norm": 0.7983541488647461,
|
| 2540 |
-
"learning_rate": 2.3616000000000002e-05,
|
| 2541 |
-
"loss": 0.2177,
|
| 2542 |
-
"mean_token_accuracy": 0.9316004611551761,
|
| 2543 |
-
"num_tokens": 965221.0,
|
| 2544 |
-
"step": 2390
|
| 2545 |
-
},
|
| 2546 |
-
{
|
| 2547 |
-
"entropy": 0.2579630766995251,
|
| 2548 |
-
"epoch": 3.84,
|
| 2549 |
-
"grad_norm": 0.8867554068565369,
|
| 2550 |
-
"learning_rate": 2.3296000000000002e-05,
|
| 2551 |
-
"loss": 0.2221,
|
| 2552 |
-
"mean_token_accuracy": 0.9320516049861908,
|
| 2553 |
-
"num_tokens": 990859.0,
|
| 2554 |
-
"step": 2400
|
| 2555 |
-
},
|
| 2556 |
-
{
|
| 2557 |
-
"epoch": 3.84,
|
| 2558 |
-
"eval_accuracy": 0.02676376698545462,
|
| 2559 |
-
"eval_entropy": 0.3534155045747757,
|
| 2560 |
-
"eval_loss": 0.6058897972106934,
|
| 2561 |
-
"eval_mean_token_accuracy": 0.8553497910499572,
|
| 2562 |
-
"eval_num_tokens": 990859.0,
|
| 2563 |
-
"eval_runtime": 869.2088,
|
| 2564 |
-
"eval_samples_per_second": 2.301,
|
| 2565 |
-
"eval_steps_per_second": 0.575,
|
| 2566 |
-
"step": 2400
|
| 2567 |
-
},
|
| 2568 |
-
{
|
| 2569 |
-
"entropy": 0.2655953477136791,
|
| 2570 |
-
"epoch": 3.856,
|
| 2571 |
-
"grad_norm": 0.8277497291564941,
|
| 2572 |
-
"learning_rate": 2.2976e-05,
|
| 2573 |
-
"loss": 0.2109,
|
| 2574 |
-
"mean_token_accuracy": 0.9393812574446201,
|
| 2575 |
-
"num_tokens": 1011268.0,
|
| 2576 |
-
"step": 2410
|
| 2577 |
-
},
|
| 2578 |
-
{
|
| 2579 |
-
"entropy": 0.2920296056661755,
|
| 2580 |
-
"epoch": 3.872,
|
| 2581 |
-
"grad_norm": 1.015434980392456,
|
| 2582 |
-
"learning_rate": 2.2656e-05,
|
| 2583 |
-
"loss": 0.2243,
|
| 2584 |
-
"mean_token_accuracy": 0.9357186656445264,
|
| 2585 |
-
"num_tokens": 1026942.0,
|
| 2586 |
-
"step": 2420
|
| 2587 |
-
},
|
| 2588 |
-
{
|
| 2589 |
-
"entropy": 0.2859017666429281,
|
| 2590 |
-
"epoch": 3.888,
|
| 2591 |
-
"grad_norm": 0.6656726002693176,
|
| 2592 |
-
"learning_rate": 2.2336e-05,
|
| 2593 |
-
"loss": 0.2389,
|
| 2594 |
-
"mean_token_accuracy": 0.9283736657351256,
|
| 2595 |
-
"num_tokens": 1053937.0,
|
| 2596 |
-
"step": 2430
|
| 2597 |
-
},
|
| 2598 |
-
{
|
| 2599 |
-
"entropy": 0.24961302392184734,
|
| 2600 |
-
"epoch": 3.904,
|
| 2601 |
-
"grad_norm": 0.8390278816223145,
|
| 2602 |
-
"learning_rate": 2.2016e-05,
|
| 2603 |
-
"loss": 0.2211,
|
| 2604 |
-
"mean_token_accuracy": 0.9312011521309614,
|
| 2605 |
-
"num_tokens": 1084820.0,
|
| 2606 |
-
"step": 2440
|
| 2607 |
-
},
|
| 2608 |
-
{
|
| 2609 |
-
"entropy": 0.2519187033176422,
|
| 2610 |
-
"epoch": 3.92,
|
| 2611 |
-
"grad_norm": 0.8542287349700928,
|
| 2612 |
-
"learning_rate": 2.1696e-05,
|
| 2613 |
-
"loss": 0.2126,
|
| 2614 |
-
"mean_token_accuracy": 0.9375488836318254,
|
| 2615 |
-
"num_tokens": 1109943.0,
|
| 2616 |
-
"step": 2450
|
| 2617 |
-
},
|
| 2618 |
-
{
|
| 2619 |
-
"entropy": 0.27277124775573613,
|
| 2620 |
-
"epoch": 3.936,
|
| 2621 |
-
"grad_norm": 0.9245595335960388,
|
| 2622 |
-
"learning_rate": 2.1376e-05,
|
| 2623 |
-
"loss": 0.2161,
|
| 2624 |
-
"mean_token_accuracy": 0.9364014331251382,
|
| 2625 |
-
"num_tokens": 1130543.0,
|
| 2626 |
-
"step": 2460
|
| 2627 |
-
},
|
| 2628 |
-
{
|
| 2629 |
-
"entropy": 0.28273853762075307,
|
| 2630 |
-
"epoch": 3.952,
|
| 2631 |
-
"grad_norm": 0.9764724969863892,
|
| 2632 |
-
"learning_rate": 2.1056e-05,
|
| 2633 |
-
"loss": 0.2217,
|
| 2634 |
-
"mean_token_accuracy": 0.9356040749698877,
|
| 2635 |
-
"num_tokens": 1146676.0,
|
| 2636 |
-
"step": 2470
|
| 2637 |
-
},
|
| 2638 |
-
{
|
| 2639 |
-
"entropy": 0.2879827093333006,
|
| 2640 |
-
"epoch": 3.968,
|
| 2641 |
-
"grad_norm": 0.7532303929328918,
|
| 2642 |
-
"learning_rate": 2.0736e-05,
|
| 2643 |
-
"loss": 0.2413,
|
| 2644 |
-
"mean_token_accuracy": 0.9290374431759119,
|
| 2645 |
-
"num_tokens": 1172078.0,
|
| 2646 |
-
"step": 2480
|
| 2647 |
-
},
|
| 2648 |
-
{
|
| 2649 |
-
"entropy": 0.2530561724677682,
|
| 2650 |
-
"epoch": 3.984,
|
| 2651 |
-
"grad_norm": 0.8568546175956726,
|
| 2652 |
-
"learning_rate": 2.0416000000000002e-05,
|
| 2653 |
-
"loss": 0.2177,
|
| 2654 |
-
"mean_token_accuracy": 0.9337470591068268,
|
| 2655 |
-
"num_tokens": 1197464.0,
|
| 2656 |
-
"step": 2490
|
| 2657 |
-
},
|
| 2658 |
-
{
|
| 2659 |
-
"entropy": 0.3038310568779707,
|
| 2660 |
-
"epoch": 4.0,
|
| 2661 |
-
"grad_norm": 0.9622617959976196,
|
| 2662 |
-
"learning_rate": 2.0096000000000002e-05,
|
| 2663 |
-
"loss": 0.2368,
|
| 2664 |
-
"mean_token_accuracy": 0.9296225290745497,
|
| 2665 |
-
"num_tokens": 1212204.0,
|
| 2666 |
-
"step": 2500
|
| 2667 |
-
},
|
| 2668 |
-
{
|
| 2669 |
-
"entropy": 0.24809251818805933,
|
| 2670 |
-
"epoch": 4.016,
|
| 2671 |
-
"grad_norm": 0.8197008371353149,
|
| 2672 |
-
"learning_rate": 1.9776000000000002e-05,
|
| 2673 |
-
"loss": 0.2395,
|
| 2674 |
-
"mean_token_accuracy": 0.928604032099247,
|
| 2675 |
-
"num_tokens": 1253458.0,
|
| 2676 |
-
"step": 2510
|
| 2677 |
-
},
|
| 2678 |
-
{
|
| 2679 |
-
"entropy": 0.24905966678634286,
|
| 2680 |
-
"epoch": 4.032,
|
| 2681 |
-
"grad_norm": 0.8056384921073914,
|
| 2682 |
-
"learning_rate": 1.9456e-05,
|
| 2683 |
-
"loss": 0.2301,
|
| 2684 |
-
"mean_token_accuracy": 0.9330911111086607,
|
| 2685 |
-
"num_tokens": 1282365.0,
|
| 2686 |
-
"step": 2520
|
| 2687 |
-
},
|
| 2688 |
-
{
|
| 2689 |
-
"entropy": 0.26601817598566413,
|
| 2690 |
-
"epoch": 4.048,
|
| 2691 |
-
"grad_norm": 0.9766417145729065,
|
| 2692 |
-
"learning_rate": 1.9136e-05,
|
| 2693 |
-
"loss": 0.2237,
|
| 2694 |
-
"mean_token_accuracy": 0.9384452097117901,
|
| 2695 |
-
"num_tokens": 1305420.0,
|
| 2696 |
-
"step": 2530
|
| 2697 |
-
},
|
| 2698 |
-
{
|
| 2699 |
-
"entropy": 0.28673125999048354,
|
| 2700 |
-
"epoch": 4.064,
|
| 2701 |
-
"grad_norm": 1.2241604328155518,
|
| 2702 |
-
"learning_rate": 1.8816e-05,
|
| 2703 |
-
"loss": 0.2615,
|
| 2704 |
-
"mean_token_accuracy": 0.9268214203417301,
|
| 2705 |
-
"num_tokens": 1323367.0,
|
| 2706 |
-
"step": 2540
|
| 2707 |
-
},
|
| 2708 |
-
{
|
| 2709 |
-
"entropy": 0.3297149523161352,
|
| 2710 |
-
"epoch": 4.08,
|
| 2711 |
-
"grad_norm": 1.2444630861282349,
|
| 2712 |
-
"learning_rate": 1.8496000000000004e-05,
|
| 2713 |
-
"loss": 0.266,
|
| 2714 |
-
"mean_token_accuracy": 0.9285014558583498,
|
| 2715 |
-
"num_tokens": 1335370.0,
|
| 2716 |
-
"step": 2550
|
| 2717 |
-
},
|
| 2718 |
-
{
|
| 2719 |
-
"entropy": 0.25180468857288363,
|
| 2720 |
-
"epoch": 4.096,
|
| 2721 |
-
"grad_norm": 0.6901214718818665,
|
| 2722 |
-
"learning_rate": 1.8176e-05,
|
| 2723 |
-
"loss": 0.2242,
|
| 2724 |
-
"mean_token_accuracy": 0.9317782554775477,
|
| 2725 |
-
"num_tokens": 1374567.0,
|
| 2726 |
-
"step": 2560
|
| 2727 |
-
},
|
| 2728 |
-
{
|
| 2729 |
-
"entropy": 0.25819407450035214,
|
| 2730 |
-
"epoch": 4.112,
|
| 2731 |
-
"grad_norm": 0.8702373504638672,
|
| 2732 |
-
"learning_rate": 1.7856e-05,
|
| 2733 |
-
"loss": 0.2344,
|
| 2734 |
-
"mean_token_accuracy": 0.9326971143484115,
|
| 2735 |
-
"num_tokens": 1402608.0,
|
| 2736 |
-
"step": 2570
|
| 2737 |
-
},
|
| 2738 |
-
{
|
| 2739 |
-
"entropy": 0.26549670435488226,
|
| 2740 |
-
"epoch": 4.128,
|
| 2741 |
-
"grad_norm": 0.7631207704544067,
|
| 2742 |
-
"learning_rate": 1.7536e-05,
|
| 2743 |
-
"loss": 0.2297,
|
| 2744 |
-
"mean_token_accuracy": 0.9365796335041523,
|
| 2745 |
-
"num_tokens": 1425524.0,
|
| 2746 |
-
"step": 2580
|
| 2747 |
-
},
|
| 2748 |
-
{
|
| 2749 |
-
"entropy": 0.26975566176697613,
|
| 2750 |
-
"epoch": 4.144,
|
| 2751 |
-
"grad_norm": 1.1718668937683105,
|
| 2752 |
-
"learning_rate": 1.7216000000000003e-05,
|
| 2753 |
-
"loss": 0.221,
|
| 2754 |
-
"mean_token_accuracy": 0.9397962510585784,
|
| 2755 |
-
"num_tokens": 1444092.0,
|
| 2756 |
-
"step": 2590
|
| 2757 |
-
},
|
| 2758 |
-
{
|
| 2759 |
-
"entropy": 0.3168819394893944,
|
| 2760 |
-
"epoch": 4.16,
|
| 2761 |
-
"grad_norm": 1.0534077882766724,
|
| 2762 |
-
"learning_rate": 1.6896000000000002e-05,
|
| 2763 |
-
"loss": 0.2544,
|
| 2764 |
-
"mean_token_accuracy": 0.9319371480494738,
|
| 2765 |
-
"num_tokens": 1456844.0,
|
| 2766 |
-
"step": 2600
|
| 2767 |
-
},
|
| 2768 |
-
{
|
| 2769 |
-
"entropy": 0.25265237540006635,
|
| 2770 |
-
"epoch": 4.176,
|
| 2771 |
-
"grad_norm": 0.7592364549636841,
|
| 2772 |
-
"learning_rate": 1.6576e-05,
|
| 2773 |
-
"loss": 0.2395,
|
| 2774 |
-
"mean_token_accuracy": 0.9289916418492794,
|
| 2775 |
-
"num_tokens": 1496545.0,
|
| 2776 |
-
"step": 2610
|
| 2777 |
-
},
|
| 2778 |
-
{
|
| 2779 |
-
"entropy": 0.2543726827017963,
|
| 2780 |
-
"epoch": 4.192,
|
| 2781 |
-
"grad_norm": 0.9639586210250854,
|
| 2782 |
-
"learning_rate": 1.6256e-05,
|
| 2783 |
-
"loss": 0.2351,
|
| 2784 |
-
"mean_token_accuracy": 0.9337568439543247,
|
| 2785 |
-
"num_tokens": 1525103.0,
|
| 2786 |
-
"step": 2620
|
| 2787 |
-
},
|
| 2788 |
-
{
|
| 2789 |
-
"entropy": 0.26547051025554536,
|
| 2790 |
-
"epoch": 4.208,
|
| 2791 |
-
"grad_norm": 0.9620559215545654,
|
| 2792 |
-
"learning_rate": 1.5936e-05,
|
| 2793 |
-
"loss": 0.2382,
|
| 2794 |
-
"mean_token_accuracy": 0.9348125293850899,
|
| 2795 |
-
"num_tokens": 1548306.0,
|
| 2796 |
-
"step": 2630
|
| 2797 |
-
},
|
| 2798 |
-
{
|
| 2799 |
-
"entropy": 0.27369030360132457,
|
| 2800 |
-
"epoch": 4.224,
|
| 2801 |
-
"grad_norm": 0.8373218774795532,
|
| 2802 |
-
"learning_rate": 1.5616e-05,
|
| 2803 |
-
"loss": 0.2254,
|
| 2804 |
-
"mean_token_accuracy": 0.9375662509351969,
|
| 2805 |
-
"num_tokens": 1566990.0,
|
| 2806 |
-
"step": 2640
|
| 2807 |
-
},
|
| 2808 |
-
{
|
| 2809 |
-
"entropy": 0.3024815677665174,
|
| 2810 |
-
"epoch": 4.24,
|
| 2811 |
-
"grad_norm": 1.3148176670074463,
|
| 2812 |
-
"learning_rate": 1.5296e-05,
|
| 2813 |
-
"loss": 0.2391,
|
| 2814 |
-
"mean_token_accuracy": 0.9351990919560194,
|
| 2815 |
-
"num_tokens": 1580065.0,
|
| 2816 |
-
"step": 2650
|
| 2817 |
-
},
|
| 2818 |
-
{
|
| 2819 |
-
"entropy": 0.2600595161318779,
|
| 2820 |
-
"epoch": 4.256,
|
| 2821 |
-
"grad_norm": 0.6774656176567078,
|
| 2822 |
-
"learning_rate": 1.4976000000000002e-05,
|
| 2823 |
-
"loss": 0.2377,
|
| 2824 |
-
"mean_token_accuracy": 0.9274554952979088,
|
| 2825 |
-
"num_tokens": 1619083.0,
|
| 2826 |
-
"step": 2660
|
| 2827 |
-
},
|
| 2828 |
-
{
|
| 2829 |
-
"entropy": 0.26013899641111493,
|
| 2830 |
-
"epoch": 4.272,
|
| 2831 |
-
"grad_norm": 0.9727310538291931,
|
| 2832 |
-
"learning_rate": 1.4656e-05,
|
| 2833 |
-
"loss": 0.2294,
|
| 2834 |
-
"mean_token_accuracy": 0.934112536534667,
|
| 2835 |
-
"num_tokens": 1646970.0,
|
| 2836 |
-
"step": 2670
|
| 2837 |
-
},
|
| 2838 |
-
{
|
| 2839 |
-
"entropy": 0.25867203902453184,
|
| 2840 |
-
"epoch": 4.288,
|
| 2841 |
-
"grad_norm": 0.9198706150054932,
|
| 2842 |
-
"learning_rate": 1.4336e-05,
|
| 2843 |
-
"loss": 0.2184,
|
| 2844 |
-
"mean_token_accuracy": 0.9373745564371347,
|
| 2845 |
-
"num_tokens": 1669364.0,
|
| 2846 |
-
"step": 2680
|
| 2847 |
-
},
|
| 2848 |
-
{
|
| 2849 |
-
"entropy": 0.26432402124628424,
|
| 2850 |
-
"epoch": 4.304,
|
| 2851 |
-
"grad_norm": 0.9908862709999084,
|
| 2852 |
-
"learning_rate": 1.4016000000000001e-05,
|
| 2853 |
-
"loss": 0.2195,
|
| 2854 |
-
"mean_token_accuracy": 0.9392576098442078,
|
| 2855 |
-
"num_tokens": 1687812.0,
|
| 2856 |
-
"step": 2690
|
| 2857 |
-
},
|
| 2858 |
-
{
|
| 2859 |
-
"entropy": 0.30741472546942533,
|
| 2860 |
-
"epoch": 4.32,
|
| 2861 |
-
"grad_norm": 1.0388495922088623,
|
| 2862 |
-
"learning_rate": 1.3696e-05,
|
| 2863 |
-
"loss": 0.2503,
|
| 2864 |
-
"mean_token_accuracy": 0.9325483400374651,
|
| 2865 |
-
"num_tokens": 1700598.0,
|
| 2866 |
-
"step": 2700
|
| 2867 |
-
},
|
| 2868 |
-
{
|
| 2869 |
-
"epoch": 4.32,
|
| 2870 |
-
"eval_accuracy": 0.02638358121882313,
|
| 2871 |
-
"eval_entropy": 0.3719751555919647,
|
| 2872 |
-
"eval_loss": 0.5846644043922424,
|
| 2873 |
-
"eval_mean_token_accuracy": 0.8568292667865753,
|
| 2874 |
-
"eval_num_tokens": 1700598.0,
|
| 2875 |
-
"eval_runtime": 869.8497,
|
| 2876 |
-
"eval_samples_per_second": 2.299,
|
| 2877 |
-
"eval_steps_per_second": 0.575,
|
| 2878 |
-
"step": 2700
|
| 2879 |
-
},
|
| 2880 |
-
{
|
| 2881 |
-
"entropy": 0.24316317560151218,
|
| 2882 |
-
"epoch": 4.336,
|
| 2883 |
-
"grad_norm": 0.757876455783844,
|
| 2884 |
-
"learning_rate": 1.3376e-05,
|
| 2885 |
-
"loss": 0.2118,
|
| 2886 |
-
"mean_token_accuracy": 0.9327260747551918,
|
| 2887 |
-
"num_tokens": 39749.0,
|
| 2888 |
-
"step": 2710
|
| 2889 |
-
},
|
| 2890 |
-
{
|
| 2891 |
-
"entropy": 0.2465177897363901,
|
| 2892 |
-
"epoch": 4.352,
|
| 2893 |
-
"grad_norm": 0.73354172706604,
|
| 2894 |
-
"learning_rate": 1.3056000000000002e-05,
|
| 2895 |
-
"loss": 0.21,
|
| 2896 |
-
"mean_token_accuracy": 0.9354286625981331,
|
| 2897 |
-
"num_tokens": 68464.0,
|
| 2898 |
-
"step": 2720
|
| 2899 |
-
},
|
| 2900 |
-
{
|
| 2901 |
-
"entropy": 0.24799817334860563,
|
| 2902 |
-
"epoch": 4.368,
|
| 2903 |
-
"grad_norm": 0.9990701675415039,
|
| 2904 |
-
"learning_rate": 1.2736000000000001e-05,
|
| 2905 |
-
"loss": 0.2039,
|
| 2906 |
-
"mean_token_accuracy": 0.940489636361599,
|
| 2907 |
-
"num_tokens": 91656.0,
|
| 2908 |
-
"step": 2730
|
| 2909 |
-
},
|
| 2910 |
-
{
|
| 2911 |
-
"entropy": 0.26067384518682957,
|
| 2912 |
-
"epoch": 4.384,
|
| 2913 |
-
"grad_norm": 0.9379425644874573,
|
| 2914 |
-
"learning_rate": 1.2416000000000001e-05,
|
| 2915 |
-
"loss": 0.2182,
|
| 2916 |
-
"mean_token_accuracy": 0.9411718167364598,
|
| 2917 |
-
"num_tokens": 110505.0,
|
| 2918 |
-
"step": 2740
|
| 2919 |
-
},
|
| 2920 |
-
{
|
| 2921 |
-
"entropy": 0.3018894817214459,
|
| 2922 |
-
"epoch": 4.4,
|
| 2923 |
-
"grad_norm": 1.0026336908340454,
|
| 2924 |
-
"learning_rate": 1.2096e-05,
|
| 2925 |
-
"loss": 0.2267,
|
| 2926 |
-
"mean_token_accuracy": 0.9386275008320808,
|
| 2927 |
-
"num_tokens": 123324.0,
|
| 2928 |
-
"step": 2750
|
| 2929 |
-
},
|
| 2930 |
-
{
|
| 2931 |
-
"entropy": 0.21805389355868102,
|
| 2932 |
-
"epoch": 4.416,
|
| 2933 |
-
"grad_norm": 0.6372848153114319,
|
| 2934 |
-
"learning_rate": 1.1776e-05,
|
| 2935 |
-
"loss": 0.1861,
|
| 2936 |
-
"mean_token_accuracy": 0.9427805945277214,
|
| 2937 |
-
"num_tokens": 163777.0,
|
| 2938 |
-
"step": 2760
|
| 2939 |
-
},
|
| 2940 |
-
{
|
| 2941 |
-
"entropy": 0.21196621540002525,
|
| 2942 |
-
"epoch": 4.432,
|
| 2943 |
-
"grad_norm": 0.5572025179862976,
|
| 2944 |
-
"learning_rate": 1.1456e-05,
|
| 2945 |
-
"loss": 0.1581,
|
| 2946 |
-
"mean_token_accuracy": 0.9551307797431946,
|
| 2947 |
-
"num_tokens": 192177.0,
|
| 2948 |
-
"step": 2770
|
| 2949 |
-
},
|
| 2950 |
-
{
|
| 2951 |
-
"entropy": 0.20902398317120968,
|
| 2952 |
-
"epoch": 4.448,
|
| 2953 |
-
"grad_norm": 0.7340620756149292,
|
| 2954 |
-
"learning_rate": 1.1136e-05,
|
| 2955 |
-
"loss": 0.1582,
|
| 2956 |
-
"mean_token_accuracy": 0.9570909071713686,
|
| 2957 |
-
"num_tokens": 215456.0,
|
| 2958 |
-
"step": 2780
|
| 2959 |
-
},
|
| 2960 |
-
{
|
| 2961 |
-
"entropy": 0.2131565590389073,
|
| 2962 |
-
"epoch": 4.464,
|
| 2963 |
-
"grad_norm": 1.0014139413833618,
|
| 2964 |
-
"learning_rate": 1.0816000000000001e-05,
|
| 2965 |
-
"loss": 0.1583,
|
| 2966 |
-
"mean_token_accuracy": 0.9551056247204542,
|
| 2967 |
-
"num_tokens": 234122.0,
|
| 2968 |
-
"step": 2790
|
| 2969 |
-
},
|
| 2970 |
-
{
|
| 2971 |
-
"entropy": 0.25133530045859515,
|
| 2972 |
-
"epoch": 4.48,
|
| 2973 |
-
"grad_norm": 0.8922705054283142,
|
| 2974 |
-
"learning_rate": 1.0496e-05,
|
| 2975 |
-
"loss": 0.1818,
|
| 2976 |
-
"mean_token_accuracy": 0.9524805508553982,
|
| 2977 |
-
"num_tokens": 246749.0,
|
| 2978 |
-
"step": 2800
|
| 2979 |
-
},
|
| 2980 |
-
{
|
| 2981 |
-
"entropy": 0.19833970288746058,
|
| 2982 |
-
"epoch": 4.496,
|
| 2983 |
-
"grad_norm": 0.8713212609291077,
|
| 2984 |
-
"learning_rate": 1.0176e-05,
|
| 2985 |
-
"loss": 0.1667,
|
| 2986 |
-
"mean_token_accuracy": 0.9479088947176934,
|
| 2987 |
-
"num_tokens": 287475.0,
|
| 2988 |
-
"step": 2810
|
| 2989 |
-
},
|
| 2990 |
-
{
|
| 2991 |
-
"entropy": 0.18820378091186285,
|
| 2992 |
-
"epoch": 4.5120000000000005,
|
| 2993 |
-
"grad_norm": 0.782958984375,
|
| 2994 |
-
"learning_rate": 9.856e-06,
|
| 2995 |
-
"loss": 0.1507,
|
| 2996 |
-
"mean_token_accuracy": 0.9564289052039385,
|
| 2997 |
-
"num_tokens": 316228.0,
|
| 2998 |
-
"step": 2820
|
| 2999 |
-
},
|
| 3000 |
-
{
|
| 3001 |
-
"entropy": 0.1986434136983007,
|
| 3002 |
-
"epoch": 4.5280000000000005,
|
| 3003 |
-
"grad_norm": 0.9405664801597595,
|
| 3004 |
-
"learning_rate": 9.536e-06,
|
| 3005 |
-
"loss": 0.1652,
|
| 3006 |
-
"mean_token_accuracy": 0.9527083396911621,
|
| 3007 |
-
"num_tokens": 339312.0,
|
| 3008 |
-
"step": 2830
|
| 3009 |
-
},
|
| 3010 |
-
{
|
| 3011 |
-
"entropy": 0.20359546076506377,
|
| 3012 |
-
"epoch": 4.5440000000000005,
|
| 3013 |
-
"grad_norm": 1.8294662237167358,
|
| 3014 |
-
"learning_rate": 9.216000000000001e-06,
|
| 3015 |
-
"loss": 0.1605,
|
| 3016 |
-
"mean_token_accuracy": 0.958249793574214,
|
| 3017 |
-
"num_tokens": 357957.0,
|
| 3018 |
-
"step": 2840
|
| 3019 |
-
},
|
| 3020 |
-
{
|
| 3021 |
-
"entropy": 0.2478945675306022,
|
| 3022 |
-
"epoch": 4.5600000000000005,
|
| 3023 |
-
"grad_norm": 1.8756585121154785,
|
| 3024 |
-
"learning_rate": 8.896000000000001e-06,
|
| 3025 |
-
"loss": 0.1791,
|
| 3026 |
-
"mean_token_accuracy": 0.9529225923120975,
|
| 3027 |
-
"num_tokens": 371074.0,
|
| 3028 |
-
"step": 2850
|
| 3029 |
-
},
|
| 3030 |
-
{
|
| 3031 |
-
"entropy": 0.19137877360917627,
|
| 3032 |
-
"epoch": 4.576,
|
| 3033 |
-
"grad_norm": 0.7811349034309387,
|
| 3034 |
-
"learning_rate": 8.576e-06,
|
| 3035 |
-
"loss": 0.1603,
|
| 3036 |
-
"mean_token_accuracy": 0.9505746208131314,
|
| 3037 |
-
"num_tokens": 412461.0,
|
| 3038 |
-
"step": 2860
|
| 3039 |
-
},
|
| 3040 |
-
{
|
| 3041 |
-
"entropy": 0.19941019406542182,
|
| 3042 |
-
"epoch": 4.592,
|
| 3043 |
-
"grad_norm": 0.8849194645881653,
|
| 3044 |
-
"learning_rate": 8.256e-06,
|
| 3045 |
-
"loss": 0.1559,
|
| 3046 |
-
"mean_token_accuracy": 0.9538026105612516,
|
| 3047 |
-
"num_tokens": 441113.0,
|
| 3048 |
-
"step": 2870
|
| 3049 |
-
},
|
| 3050 |
-
{
|
| 3051 |
-
"entropy": 0.20037598102353513,
|
| 3052 |
-
"epoch": 4.608,
|
| 3053 |
-
"grad_norm": 1.007367730140686,
|
| 3054 |
-
"learning_rate": 7.936e-06,
|
| 3055 |
-
"loss": 0.1577,
|
| 3056 |
-
"mean_token_accuracy": 0.9563030891120434,
|
| 3057 |
-
"num_tokens": 464301.0,
|
| 3058 |
-
"step": 2880
|
| 3059 |
-
},
|
| 3060 |
-
{
|
| 3061 |
-
"entropy": 0.21458538975566627,
|
| 3062 |
-
"epoch": 4.624,
|
| 3063 |
-
"grad_norm": 1.0605765581130981,
|
| 3064 |
-
"learning_rate": 7.616000000000001e-06,
|
| 3065 |
-
"loss": 0.1636,
|
| 3066 |
-
"mean_token_accuracy": 0.9558106277137994,
|
| 3067 |
-
"num_tokens": 483422.0,
|
| 3068 |
-
"step": 2890
|
| 3069 |
-
},
|
| 3070 |
-
{
|
| 3071 |
-
"entropy": 0.2460995698813349,
|
| 3072 |
-
"epoch": 4.64,
|
| 3073 |
-
"grad_norm": 1.1102747917175293,
|
| 3074 |
-
"learning_rate": 7.296e-06,
|
| 3075 |
-
"loss": 0.178,
|
| 3076 |
-
"mean_token_accuracy": 0.9527418158948422,
|
| 3077 |
-
"num_tokens": 496524.0,
|
| 3078 |
-
"step": 2900
|
| 3079 |
-
},
|
| 3080 |
-
{
|
| 3081 |
-
"entropy": 0.1917059404309839,
|
| 3082 |
-
"epoch": 4.656,
|
| 3083 |
-
"grad_norm": 0.7104383111000061,
|
| 3084 |
-
"learning_rate": 6.976000000000001e-06,
|
| 3085 |
-
"loss": 0.1692,
|
| 3086 |
-
"mean_token_accuracy": 0.9471572674810886,
|
| 3087 |
-
"num_tokens": 537262.0,
|
| 3088 |
-
"step": 2910
|
| 3089 |
-
},
|
| 3090 |
-
{
|
| 3091 |
-
"entropy": 0.19903061082586646,
|
| 3092 |
-
"epoch": 4.672,
|
| 3093 |
-
"grad_norm": 0.8522951006889343,
|
| 3094 |
-
"learning_rate": 6.688e-06,
|
| 3095 |
-
"loss": 0.1668,
|
| 3096 |
-
"mean_token_accuracy": 0.9495650254189968,
|
| 3097 |
-
"num_tokens": 566118.0,
|
| 3098 |
-
"step": 2920
|
| 3099 |
-
},
|
| 3100 |
-
{
|
| 3101 |
-
"entropy": 0.20533090075477958,
|
| 3102 |
-
"epoch": 4.688,
|
| 3103 |
-
"grad_norm": 0.7692112326622009,
|
| 3104 |
-
"learning_rate": 6.368000000000001e-06,
|
| 3105 |
-
"loss": 0.1597,
|
| 3106 |
-
"mean_token_accuracy": 0.9538190443068743,
|
| 3107 |
-
"num_tokens": 589316.0,
|
| 3108 |
-
"step": 2930
|
| 3109 |
-
},
|
| 3110 |
-
{
|
| 3111 |
-
"entropy": 0.20868746675550937,
|
| 3112 |
-
"epoch": 4.704,
|
| 3113 |
-
"grad_norm": 0.8645059466362,
|
| 3114 |
-
"learning_rate": 6.048e-06,
|
| 3115 |
-
"loss": 0.1496,
|
| 3116 |
-
"mean_token_accuracy": 0.9595503833144903,
|
| 3117 |
-
"num_tokens": 607904.0,
|
| 3118 |
-
"step": 2940
|
| 3119 |
-
},
|
| 3120 |
-
{
|
| 3121 |
-
"entropy": 0.23888139198534192,
|
| 3122 |
-
"epoch": 4.72,
|
| 3123 |
-
"grad_norm": 1.08635413646698,
|
| 3124 |
-
"learning_rate": 5.728e-06,
|
| 3125 |
-
"loss": 0.1706,
|
| 3126 |
-
"mean_token_accuracy": 0.9570875108242035,
|
| 3127 |
-
"num_tokens": 620936.0,
|
| 3128 |
-
"step": 2950
|
| 3129 |
-
},
|
| 3130 |
-
{
|
| 3131 |
-
"entropy": 0.18963255980052054,
|
| 3132 |
-
"epoch": 4.736,
|
| 3133 |
-
"grad_norm": 0.7276900410652161,
|
| 3134 |
-
"learning_rate": 5.4080000000000006e-06,
|
| 3135 |
-
"loss": 0.1633,
|
| 3136 |
-
"mean_token_accuracy": 0.9485368836671114,
|
| 3137 |
-
"num_tokens": 661079.0,
|
| 3138 |
-
"step": 2960
|
| 3139 |
-
},
|
| 3140 |
-
{
|
| 3141 |
-
"entropy": 0.19404892213642597,
|
| 3142 |
-
"epoch": 4.752,
|
| 3143 |
-
"grad_norm": 0.8436645269393921,
|
| 3144 |
-
"learning_rate": 5.088e-06,
|
| 3145 |
-
"loss": 0.1523,
|
| 3146 |
-
"mean_token_accuracy": 0.9547487128525972,
|
| 3147 |
-
"num_tokens": 689649.0,
|
| 3148 |
-
"step": 2970
|
| 3149 |
-
},
|
| 3150 |
-
{
|
| 3151 |
-
"entropy": 0.20046764588914812,
|
| 3152 |
-
"epoch": 4.768,
|
| 3153 |
-
"grad_norm": 1.0704182386398315,
|
| 3154 |
-
"learning_rate": 4.768e-06,
|
| 3155 |
-
"loss": 0.1574,
|
| 3156 |
-
"mean_token_accuracy": 0.9545170154422522,
|
| 3157 |
-
"num_tokens": 712841.0,
|
| 3158 |
-
"step": 2980
|
| 3159 |
-
},
|
| 3160 |
-
{
|
| 3161 |
-
"entropy": 0.2065018493682146,
|
| 3162 |
-
"epoch": 4.784,
|
| 3163 |
-
"grad_norm": 0.9045215249061584,
|
| 3164 |
-
"learning_rate": 4.4480000000000004e-06,
|
| 3165 |
-
"loss": 0.155,
|
| 3166 |
-
"mean_token_accuracy": 0.9589469760656357,
|
| 3167 |
-
"num_tokens": 731548.0,
|
| 3168 |
-
"step": 2990
|
| 3169 |
-
},
|
| 3170 |
-
{
|
| 3171 |
-
"entropy": 0.2458665339741856,
|
| 3172 |
-
"epoch": 4.8,
|
| 3173 |
-
"grad_norm": 1.7165741920471191,
|
| 3174 |
-
"learning_rate": 4.128e-06,
|
| 3175 |
-
"loss": 0.173,
|
| 3176 |
-
"mean_token_accuracy": 0.9542810652405024,
|
| 3177 |
-
"num_tokens": 744375.0,
|
| 3178 |
-
"step": 3000
|
| 3179 |
-
},
|
| 3180 |
-
{
|
| 3181 |
-
"epoch": 4.8,
|
| 3182 |
-
"eval_accuracy": 0.026236095361078154,
|
| 3183 |
-
"eval_entropy": 0.3239293715655804,
|
| 3184 |
-
"eval_loss": 0.6594926714897156,
|
| 3185 |
-
"eval_mean_token_accuracy": 0.8544400478601456,
|
| 3186 |
-
"eval_num_tokens": 744375.0,
|
| 3187 |
-
"eval_runtime": 966.0583,
|
| 3188 |
-
"eval_samples_per_second": 2.07,
|
| 3189 |
-
"eval_steps_per_second": 0.518,
|
| 3190 |
-
"step": 3000
|
| 3191 |
-
},
|
| 3192 |
-
{
|
| 3193 |
-
"entropy": 0.19047842593863606,
|
| 3194 |
-
"epoch": 4.816,
|
| 3195 |
-
"grad_norm": 0.8224709033966064,
|
| 3196 |
-
"learning_rate": 3.8080000000000006e-06,
|
| 3197 |
-
"loss": 0.1691,
|
| 3198 |
-
"mean_token_accuracy": 0.9483149264007806,
|
| 3199 |
-
"num_tokens": 785457.0,
|
| 3200 |
-
"step": 3010
|
| 3201 |
-
},
|
| 3202 |
-
{
|
| 3203 |
-
"entropy": 0.1947814745362848,
|
| 3204 |
-
"epoch": 4.832,
|
| 3205 |
-
"grad_norm": 0.8581233024597168,
|
| 3206 |
-
"learning_rate": 3.4880000000000003e-06,
|
| 3207 |
-
"loss": 0.1535,
|
| 3208 |
-
"mean_token_accuracy": 0.9543764512985945,
|
| 3209 |
-
"num_tokens": 814006.0,
|
| 3210 |
-
"step": 3020
|
| 3211 |
-
},
|
| 3212 |
-
{
|
| 3213 |
-
"entropy": 0.20228669252246617,
|
| 3214 |
-
"epoch": 4.848,
|
| 3215 |
-
"grad_norm": 0.7815537452697754,
|
| 3216 |
-
"learning_rate": 3.168e-06,
|
| 3217 |
-
"loss": 0.1539,
|
| 3218 |
-
"mean_token_accuracy": 0.9561178237199783,
|
| 3219 |
-
"num_tokens": 836843.0,
|
| 3220 |
-
"step": 3030
|
| 3221 |
-
},
|
| 3222 |
-
{
|
| 3223 |
-
"entropy": 0.2111768877133727,
|
| 3224 |
-
"epoch": 4.864,
|
| 3225 |
-
"grad_norm": 2.0849273204803467,
|
| 3226 |
-
"learning_rate": 2.848e-06,
|
| 3227 |
-
"loss": 0.1553,
|
| 3228 |
-
"mean_token_accuracy": 0.9579557087272406,
|
| 3229 |
-
"num_tokens": 855036.0,
|
| 3230 |
-
"step": 3040
|
| 3231 |
-
},
|
| 3232 |
-
{
|
| 3233 |
-
"entropy": 0.2543737689033151,
|
| 3234 |
-
"epoch": 4.88,
|
| 3235 |
-
"grad_norm": 0.9005395770072937,
|
| 3236 |
-
"learning_rate": 2.528e-06,
|
| 3237 |
-
"loss": 0.18,
|
| 3238 |
-
"mean_token_accuracy": 0.951928498968482,
|
| 3239 |
-
"num_tokens": 867473.0,
|
| 3240 |
-
"step": 3050
|
| 3241 |
-
},
|
| 3242 |
-
{
|
| 3243 |
-
"entropy": 0.19695296385325492,
|
| 3244 |
-
"epoch": 4.896,
|
| 3245 |
-
"grad_norm": 0.8913720846176147,
|
| 3246 |
-
"learning_rate": 2.208e-06,
|
| 3247 |
-
"loss": 0.1731,
|
| 3248 |
-
"mean_token_accuracy": 0.9454629000276327,
|
| 3249 |
-
"num_tokens": 905517.0,
|
| 3250 |
-
"step": 3060
|
| 3251 |
-
},
|
| 3252 |
-
{
|
| 3253 |
-
"entropy": 0.2020930268801749,
|
| 3254 |
-
"epoch": 4.912,
|
| 3255 |
-
"grad_norm": 1.0501484870910645,
|
| 3256 |
-
"learning_rate": 1.8880000000000002e-06,
|
| 3257 |
-
"loss": 0.1583,
|
| 3258 |
-
"mean_token_accuracy": 0.954399960488081,
|
| 3259 |
-
"num_tokens": 933251.0,
|
| 3260 |
-
"step": 3070
|
| 3261 |
-
},
|
| 3262 |
-
{
|
| 3263 |
-
"entropy": 0.20252155787311493,
|
| 3264 |
-
"epoch": 4.928,
|
| 3265 |
-
"grad_norm": 1.03731369972229,
|
| 3266 |
-
"learning_rate": 1.568e-06,
|
| 3267 |
-
"loss": 0.1531,
|
| 3268 |
-
"mean_token_accuracy": 0.9579384963959455,
|
| 3269 |
-
"num_tokens": 956069.0,
|
| 3270 |
-
"step": 3080
|
| 3271 |
-
},
|
| 3272 |
-
{
|
| 3273 |
-
"entropy": 0.2126692888326943,
|
| 3274 |
-
"epoch": 4.944,
|
| 3275 |
-
"grad_norm": 1.107572317123413,
|
| 3276 |
-
"learning_rate": 1.248e-06,
|
| 3277 |
-
"loss": 0.1568,
|
| 3278 |
-
"mean_token_accuracy": 0.9569063678383827,
|
| 3279 |
-
"num_tokens": 974517.0,
|
| 3280 |
-
"step": 3090
|
| 3281 |
-
},
|
| 3282 |
-
{
|
| 3283 |
-
"entropy": 0.24990466320887209,
|
| 3284 |
-
"epoch": 4.96,
|
| 3285 |
-
"grad_norm": 1.2767953872680664,
|
| 3286 |
-
"learning_rate": 9.28e-07,
|
| 3287 |
-
"loss": 0.1851,
|
| 3288 |
-
"mean_token_accuracy": 0.9518057998269797,
|
| 3289 |
-
"num_tokens": 987191.0,
|
| 3290 |
-
"step": 3100
|
| 3291 |
-
},
|
| 3292 |
-
{
|
| 3293 |
-
"entropy": 0.19635155922733247,
|
| 3294 |
-
"epoch": 4.976,
|
| 3295 |
-
"grad_norm": 0.838716447353363,
|
| 3296 |
-
"learning_rate": 6.08e-07,
|
| 3297 |
-
"loss": 0.1689,
|
| 3298 |
-
"mean_token_accuracy": 0.9492763552814723,
|
| 3299 |
-
"num_tokens": 1021442.0,
|
| 3300 |
-
"step": 3110
|
| 3301 |
-
},
|
| 3302 |
-
{
|
| 3303 |
-
"entropy": 0.21572725460864603,
|
| 3304 |
-
"epoch": 4.992,
|
| 3305 |
-
"grad_norm": 0.9043759107589722,
|
| 3306 |
-
"learning_rate": 2.8800000000000004e-07,
|
| 3307 |
-
"loss": 0.161,
|
| 3308 |
-
"mean_token_accuracy": 0.9549260966479778,
|
| 3309 |
-
"num_tokens": 1041350.0,
|
| 3310 |
-
"step": 3120
|
| 3311 |
}
|
| 3312 |
],
|
| 3313 |
"logging_steps": 10,
|
| 3314 |
"max_steps": 3125,
|
| 3315 |
"num_input_tokens_seen": 0,
|
| 3316 |
"num_train_epochs": 5,
|
| 3317 |
-
"save_steps":
|
| 3318 |
"stateful_callbacks": {
|
| 3319 |
"TrainerControl": {
|
| 3320 |
"args": {
|
|
@@ -3322,12 +827,12 @@
|
|
| 3322 |
"should_evaluate": false,
|
| 3323 |
"should_log": false,
|
| 3324 |
"should_save": true,
|
| 3325 |
-
"should_training_stop":
|
| 3326 |
},
|
| 3327 |
"attributes": {}
|
| 3328 |
}
|
| 3329 |
},
|
| 3330 |
-
"total_flos":
|
| 3331 |
"train_batch_size": 1,
|
| 3332 |
"trial_name": null,
|
| 3333 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
+
"epoch": 1.2,
|
| 6 |
+
"eval_steps": 150,
|
| 7 |
+
"global_step": 750,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 813 |
"eval_samples_per_second": 2.106,
|
| 814 |
"eval_steps_per_second": 0.526,
|
| 815 |
"step": 750
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 816 |
}
|
| 817 |
],
|
| 818 |
"logging_steps": 10,
|
| 819 |
"max_steps": 3125,
|
| 820 |
"num_input_tokens_seen": 0,
|
| 821 |
"num_train_epochs": 5,
|
| 822 |
+
"save_steps": 150,
|
| 823 |
"stateful_callbacks": {
|
| 824 |
"TrainerControl": {
|
| 825 |
"args": {
|
|
|
|
| 827 |
"should_evaluate": false,
|
| 828 |
"should_log": false,
|
| 829 |
"should_save": true,
|
| 830 |
+
"should_training_stop": false
|
| 831 |
},
|
| 832 |
"attributes": {}
|
| 833 |
}
|
| 834 |
},
|
| 835 |
+
"total_flos": 1.3058997783257088e+17,
|
| 836 |
"train_batch_size": 1,
|
| 837 |
"trial_name": null,
|
| 838 |
"trial_params": null
|
last-checkpoint/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6353
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d19add453be896fb8010267a01d849597b52aecb53969dce6ab3000e56f1b7d0
|
| 3 |
size 6353
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6353
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d19add453be896fb8010267a01d849597b52aecb53969dce6ab3000e56f1b7d0
|
| 3 |
size 6353
|