NBAmine commited on
Commit
c483f91
·
verified ·
1 Parent(s): dce6a88

Pushing checkpoint-750 (best) to main

Browse files
README.md CHANGED
@@ -37,7 +37,7 @@ This model was trained with SFT.
37
  - TRL: 0.27.0
38
  - Transformers: 4.57.6
39
  - Pytorch: 2.8.0+cu126
40
- - Datasets: 4.4.2
41
  - Tokenizers: 0.22.1
42
 
43
  ## Citations
 
37
  - TRL: 0.27.0
38
  - Transformers: 4.57.6
39
  - Pytorch: 2.8.0+cu126
40
+ - Datasets: 4.4.1
41
  - Tokenizers: 0.22.1
42
 
43
  ## Citations
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "k_proj",
34
- "down_proj",
35
  "q_proj",
 
 
 
36
  "gate_proj",
37
- "up_proj",
38
- "o_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "up_proj",
 
 
33
  "q_proj",
34
+ "down_proj",
35
+ "o_proj",
36
+ "v_proj",
37
  "gate_proj",
38
+ "k_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31e65c9ff039c74d59b4607524385f75a8ae083b148b3a163cece010a9774af0
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd11b39803251198dcb7e030bb69c10b05cece6a9e45160afcc921794cb790cc
3
  size 228140600
last-checkpoint/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "k_proj",
34
- "down_proj",
35
  "q_proj",
 
 
 
36
  "gate_proj",
37
- "up_proj",
38
- "o_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "up_proj",
 
 
33
  "q_proj",
34
+ "down_proj",
35
+ "o_proj",
36
+ "v_proj",
37
  "gate_proj",
38
+ "k_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31e65c9ff039c74d59b4607524385f75a8ae083b148b3a163cece010a9774af0
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd11b39803251198dcb7e030bb69c10b05cece6a9e45160afcc921794cb790cc
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b96216027c02e20a6ee8541060ecd0085b74fd0ea5669cf82258347c31d3baf
3
- size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2acc6b93233f66c6ddb8b195904fe7cd974047004ffcd02f1d993e85ebc0a677
3
+ size 116484839
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e788bee1c067926ef11645e418ec428402ec185fb9258e04df56296e42d2286b
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7883d803ebcafeb5684e5f2bcceb39f2a54258143c0c4972785bf0a17a36dc8
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e230928162c4463d462e64ab14b3906988dfebe47926d517a84f2e81ec7582c
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e188a4cd7f588ff088ff68a7d9c18ed5ca570c5b11d6790654dcb4e3accb81e
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b0095603c7ffc8d3152c5de9d397fd1beca2e9651bdba9b9da9fbad8a37e19c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08f9e08af1aa8eb785ad1df11d9714b6c859fed11b125506168e50ec9ce7af28
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
- "epoch": 5.0,
6
- "eval_steps": 300,
7
- "global_step": 3125,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -813,2508 +813,13 @@
813
  "eval_samples_per_second": 2.106,
814
  "eval_steps_per_second": 0.526,
815
  "step": 750
816
- },
817
- {
818
- "entropy": 0.4685165178030729,
819
- "epoch": 1.216,
820
- "grad_norm": 0.4797472059726715,
821
- "learning_rate": 7.5744e-05,
822
- "loss": 0.4371,
823
- "mean_token_accuracy": 0.872249535471201,
824
- "num_tokens": 20779.0,
825
- "step": 760
826
- },
827
- {
828
- "entropy": 0.5129861503839492,
829
- "epoch": 1.232,
830
- "grad_norm": 0.5743088126182556,
831
- "learning_rate": 7.5424e-05,
832
- "loss": 0.4703,
833
- "mean_token_accuracy": 0.8656402382999658,
834
- "num_tokens": 37039.0,
835
- "step": 770
836
- },
837
- {
838
- "entropy": 0.47918802928179505,
839
- "epoch": 1.248,
840
- "grad_norm": 0.41004160046577454,
841
- "learning_rate": 7.5104e-05,
842
- "loss": 0.4631,
843
- "mean_token_accuracy": 0.8624460745602847,
844
- "num_tokens": 66230.0,
845
- "step": 780
846
- },
847
- {
848
- "entropy": 0.423713362775743,
849
- "epoch": 1.264,
850
- "grad_norm": 0.39121007919311523,
851
- "learning_rate": 7.4784e-05,
852
- "loss": 0.4005,
853
- "mean_token_accuracy": 0.8780338373035192,
854
- "num_tokens": 98315.0,
855
- "step": 790
856
- },
857
- {
858
- "entropy": 0.46349438820034267,
859
- "epoch": 1.28,
860
- "grad_norm": 0.4372813403606415,
861
- "learning_rate": 7.4464e-05,
862
- "loss": 0.4236,
863
- "mean_token_accuracy": 0.8776358034461736,
864
- "num_tokens": 123538.0,
865
- "step": 800
866
- },
867
- {
868
- "entropy": 0.46192670799791813,
869
- "epoch": 1.296,
870
- "grad_norm": 0.5512360334396362,
871
- "learning_rate": 7.4144e-05,
872
- "loss": 0.4276,
873
- "mean_token_accuracy": 0.8758893702179193,
874
- "num_tokens": 143855.0,
875
- "step": 810
876
- },
877
- {
878
- "entropy": 0.5323605043813586,
879
- "epoch": 1.312,
880
- "grad_norm": 0.6361510753631592,
881
- "learning_rate": 7.3824e-05,
882
- "loss": 0.491,
883
- "mean_token_accuracy": 0.856274176388979,
884
- "num_tokens": 159314.0,
885
- "step": 820
886
- },
887
- {
888
- "entropy": 0.4423897641710937,
889
- "epoch": 1.328,
890
- "grad_norm": 0.4728486239910126,
891
- "learning_rate": 7.3504e-05,
892
- "loss": 0.4335,
893
- "mean_token_accuracy": 0.8686480693519115,
894
- "num_tokens": 187464.0,
895
- "step": 830
896
- },
897
- {
898
- "entropy": 0.41830341406166555,
899
- "epoch": 1.3439999999999999,
900
- "grad_norm": 0.49457916617393494,
901
- "learning_rate": 7.318400000000001e-05,
902
- "loss": 0.3985,
903
- "mean_token_accuracy": 0.8779879175126553,
904
- "num_tokens": 219657.0,
905
- "step": 840
906
- },
907
- {
908
- "entropy": 0.44871921837329865,
909
- "epoch": 1.3599999999999999,
910
- "grad_norm": 0.46471357345581055,
911
- "learning_rate": 7.2864e-05,
912
- "loss": 0.4009,
913
- "mean_token_accuracy": 0.8793935433030129,
914
- "num_tokens": 245396.0,
915
- "step": 850
916
- },
917
- {
918
- "entropy": 0.4491863099858165,
919
- "epoch": 1.376,
920
- "grad_norm": 0.4910559356212616,
921
- "learning_rate": 7.2544e-05,
922
- "loss": 0.432,
923
- "mean_token_accuracy": 0.8772312045097351,
924
- "num_tokens": 266432.0,
925
- "step": 860
926
- },
927
- {
928
- "entropy": 0.5239890940487385,
929
- "epoch": 1.392,
930
- "grad_norm": 0.7272471785545349,
931
- "learning_rate": 7.2224e-05,
932
- "loss": 0.4652,
933
- "mean_token_accuracy": 0.8644176237285137,
934
- "num_tokens": 282655.0,
935
- "step": 870
936
- },
937
- {
938
- "entropy": 0.45916353976354,
939
- "epoch": 1.408,
940
- "grad_norm": 0.4625614583492279,
941
- "learning_rate": 7.190400000000001e-05,
942
- "loss": 0.4543,
943
- "mean_token_accuracy": 0.8663836497813463,
944
- "num_tokens": 310801.0,
945
- "step": 880
946
- },
947
- {
948
- "entropy": 0.4246408801525831,
949
- "epoch": 1.424,
950
- "grad_norm": 0.48823705315589905,
951
- "learning_rate": 7.158400000000001e-05,
952
- "loss": 0.395,
953
- "mean_token_accuracy": 0.879553859308362,
954
- "num_tokens": 343555.0,
955
- "step": 890
956
- },
957
- {
958
- "entropy": 0.44206046797335147,
959
- "epoch": 1.44,
960
- "grad_norm": 0.48411789536476135,
961
- "learning_rate": 7.126400000000001e-05,
962
- "loss": 0.4127,
963
- "mean_token_accuracy": 0.8790250942111015,
964
- "num_tokens": 369134.0,
965
- "step": 900
966
- },
967
- {
968
- "epoch": 1.44,
969
- "eval_entropy": 0.4753110625743866,
970
- "eval_loss": 0.5131832361221313,
971
- "eval_mean_token_accuracy": 0.8568402478694915,
972
- "eval_num_tokens": 369134.0,
973
- "eval_runtime": 895.8989,
974
- "eval_samples_per_second": 2.232,
975
- "eval_steps_per_second": 0.558,
976
- "step": 900
977
- },
978
- {
979
- "entropy": 0.4554462408646941,
980
- "epoch": 1.456,
981
- "grad_norm": 0.537635087966919,
982
- "learning_rate": 7.0944e-05,
983
- "loss": 0.4188,
984
- "mean_token_accuracy": 0.8769065048545599,
985
- "num_tokens": 390152.0,
986
- "step": 910
987
- },
988
- {
989
- "entropy": 0.5131621342152357,
990
- "epoch": 1.472,
991
- "grad_norm": 0.6558974385261536,
992
- "learning_rate": 7.062400000000001e-05,
993
- "loss": 0.4663,
994
- "mean_token_accuracy": 0.8643736276775599,
995
- "num_tokens": 406222.0,
996
- "step": 920
997
- },
998
- {
999
- "entropy": 0.46728117018938065,
1000
- "epoch": 1.488,
1001
- "grad_norm": 0.4205915927886963,
1002
- "learning_rate": 7.030400000000001e-05,
1003
- "loss": 0.4539,
1004
- "mean_token_accuracy": 0.8652867745608092,
1005
- "num_tokens": 434767.0,
1006
- "step": 930
1007
- },
1008
- {
1009
- "entropy": 0.3991527833044529,
1010
- "epoch": 1.504,
1011
- "grad_norm": 0.4031739830970764,
1012
- "learning_rate": 6.9984e-05,
1013
- "loss": 0.3805,
1014
- "mean_token_accuracy": 0.8837333973497152,
1015
- "num_tokens": 467498.0,
1016
- "step": 940
1017
- },
1018
- {
1019
- "entropy": 0.43914526589214803,
1020
- "epoch": 1.52,
1021
- "grad_norm": 0.42591777443885803,
1022
- "learning_rate": 6.9664e-05,
1023
- "loss": 0.3989,
1024
- "mean_token_accuracy": 0.8830435562878847,
1025
- "num_tokens": 493146.0,
1026
- "step": 950
1027
- },
1028
- {
1029
- "entropy": 0.4609672848135233,
1030
- "epoch": 1.536,
1031
- "grad_norm": 0.5175366997718811,
1032
- "learning_rate": 6.934399999999999e-05,
1033
- "loss": 0.4377,
1034
- "mean_token_accuracy": 0.8746946189552546,
1035
- "num_tokens": 513926.0,
1036
- "step": 960
1037
- },
1038
- {
1039
- "entropy": 0.5191182948648929,
1040
- "epoch": 1.552,
1041
- "grad_norm": 0.6527137160301208,
1042
- "learning_rate": 6.9024e-05,
1043
- "loss": 0.465,
1044
- "mean_token_accuracy": 0.86279138289392,
1045
- "num_tokens": 530263.0,
1046
- "step": 970
1047
- },
1048
- {
1049
- "entropy": 0.4549953695386648,
1050
- "epoch": 1.568,
1051
- "grad_norm": 0.4532809257507324,
1052
- "learning_rate": 6.8704e-05,
1053
- "loss": 0.4345,
1054
- "mean_token_accuracy": 0.8703642163425684,
1055
- "num_tokens": 559791.0,
1056
- "step": 980
1057
- },
1058
- {
1059
- "entropy": 0.39457473438233137,
1060
- "epoch": 1.584,
1061
- "grad_norm": 0.4516853094100952,
1062
- "learning_rate": 6.8384e-05,
1063
- "loss": 0.3833,
1064
- "mean_token_accuracy": 0.885482932254672,
1065
- "num_tokens": 592398.0,
1066
- "step": 990
1067
- },
1068
- {
1069
- "entropy": 0.44856451768428085,
1070
- "epoch": 1.6,
1071
- "grad_norm": 0.4582580029964447,
1072
- "learning_rate": 6.8064e-05,
1073
- "loss": 0.4081,
1074
- "mean_token_accuracy": 0.8799201253801584,
1075
- "num_tokens": 617982.0,
1076
- "step": 1000
1077
- },
1078
- {
1079
- "entropy": 0.46642222460359334,
1080
- "epoch": 1.616,
1081
- "grad_norm": 0.45997655391693115,
1082
- "learning_rate": 6.774400000000001e-05,
1083
- "loss": 0.4375,
1084
- "mean_token_accuracy": 0.8738056540489196,
1085
- "num_tokens": 639115.0,
1086
- "step": 1010
1087
- },
1088
- {
1089
- "entropy": 0.5127991208806634,
1090
- "epoch": 1.6320000000000001,
1091
- "grad_norm": 0.6186177730560303,
1092
- "learning_rate": 6.7424e-05,
1093
- "loss": 0.4503,
1094
- "mean_token_accuracy": 0.8713137298822403,
1095
- "num_tokens": 655709.0,
1096
- "step": 1020
1097
- },
1098
- {
1099
- "entropy": 0.45265620658174155,
1100
- "epoch": 1.6480000000000001,
1101
- "grad_norm": 0.4363885819911957,
1102
- "learning_rate": 6.7104e-05,
1103
- "loss": 0.4347,
1104
- "mean_token_accuracy": 0.866806122660637,
1105
- "num_tokens": 684500.0,
1106
- "step": 1030
1107
- },
1108
- {
1109
- "entropy": 0.39329283433035017,
1110
- "epoch": 1.6640000000000001,
1111
- "grad_norm": 0.39802274107933044,
1112
- "learning_rate": 6.6784e-05,
1113
- "loss": 0.3699,
1114
- "mean_token_accuracy": 0.8874391701072455,
1115
- "num_tokens": 717271.0,
1116
- "step": 1040
1117
- },
1118
- {
1119
- "entropy": 0.4426466390490532,
1120
- "epoch": 1.6800000000000002,
1121
- "grad_norm": 0.4594961404800415,
1122
- "learning_rate": 6.6464e-05,
1123
- "loss": 0.4061,
1124
- "mean_token_accuracy": 0.8797303918749094,
1125
- "num_tokens": 743068.0,
1126
- "step": 1050
1127
- },
1128
- {
1129
- "epoch": 1.6800000000000002,
1130
- "eval_entropy": 0.4718739038705826,
1131
- "eval_loss": 0.511114239692688,
1132
- "eval_mean_token_accuracy": 0.8577107313871384,
1133
- "eval_num_tokens": 743068.0,
1134
- "eval_runtime": 895.9979,
1135
- "eval_samples_per_second": 2.232,
1136
- "eval_steps_per_second": 0.558,
1137
- "step": 1050
1138
- },
1139
- {
1140
- "entropy": 0.45297340136021375,
1141
- "epoch": 1.696,
1142
- "grad_norm": 0.5545983910560608,
1143
- "learning_rate": 6.614400000000001e-05,
1144
- "loss": 0.4144,
1145
- "mean_token_accuracy": 0.8779479678720236,
1146
- "num_tokens": 763925.0,
1147
- "step": 1060
1148
- },
1149
- {
1150
- "entropy": 0.497313455119729,
1151
- "epoch": 1.712,
1152
- "grad_norm": 0.6375033259391785,
1153
- "learning_rate": 6.582400000000001e-05,
1154
- "loss": 0.4523,
1155
- "mean_token_accuracy": 0.8679382588714362,
1156
- "num_tokens": 780111.0,
1157
- "step": 1070
1158
- },
1159
- {
1160
- "entropy": 0.4517807062715292,
1161
- "epoch": 1.728,
1162
- "grad_norm": 0.42967426776885986,
1163
- "learning_rate": 6.5504e-05,
1164
- "loss": 0.4297,
1165
- "mean_token_accuracy": 0.8690480105578899,
1166
- "num_tokens": 808661.0,
1167
- "step": 1080
1168
- },
1169
- {
1170
- "entropy": 0.40057806484401226,
1171
- "epoch": 1.744,
1172
- "grad_norm": 0.4295614957809448,
1173
- "learning_rate": 6.5184e-05,
1174
- "loss": 0.3765,
1175
- "mean_token_accuracy": 0.8872563410550356,
1176
- "num_tokens": 840932.0,
1177
- "step": 1090
1178
- },
1179
- {
1180
- "entropy": 0.4463956480845809,
1181
- "epoch": 1.76,
1182
- "grad_norm": 0.49008527398109436,
1183
- "learning_rate": 6.486400000000001e-05,
1184
- "loss": 0.4064,
1185
- "mean_token_accuracy": 0.8804606605321169,
1186
- "num_tokens": 866564.0,
1187
- "step": 1100
1188
- },
1189
- {
1190
- "entropy": 0.45895243529230356,
1191
- "epoch": 1.776,
1192
- "grad_norm": 0.5231919288635254,
1193
- "learning_rate": 6.454400000000001e-05,
1194
- "loss": 0.4249,
1195
- "mean_token_accuracy": 0.8787468057125807,
1196
- "num_tokens": 887527.0,
1197
- "step": 1110
1198
- },
1199
- {
1200
- "entropy": 0.5028131037950516,
1201
- "epoch": 1.792,
1202
- "grad_norm": 0.6885866522789001,
1203
- "learning_rate": 6.4224e-05,
1204
- "loss": 0.4549,
1205
- "mean_token_accuracy": 0.8681917265057564,
1206
- "num_tokens": 903688.0,
1207
- "step": 1120
1208
- },
1209
- {
1210
- "entropy": 0.4443069422617555,
1211
- "epoch": 1.808,
1212
- "grad_norm": 0.4276801347732544,
1213
- "learning_rate": 6.3904e-05,
1214
- "loss": 0.419,
1215
- "mean_token_accuracy": 0.8721311956644058,
1216
- "num_tokens": 932975.0,
1217
- "step": 1130
1218
- },
1219
- {
1220
- "entropy": 0.38013150785118344,
1221
- "epoch": 1.8239999999999998,
1222
- "grad_norm": 0.4245995283126831,
1223
- "learning_rate": 6.358399999999999e-05,
1224
- "loss": 0.3752,
1225
- "mean_token_accuracy": 0.8849109452217817,
1226
- "num_tokens": 965221.0,
1227
- "step": 1140
1228
- },
1229
- {
1230
- "entropy": 0.44638209473341706,
1231
- "epoch": 1.8399999999999999,
1232
- "grad_norm": 0.47453537583351135,
1233
- "learning_rate": 6.3264e-05,
1234
- "loss": 0.4066,
1235
- "mean_token_accuracy": 0.8791540212929249,
1236
- "num_tokens": 990859.0,
1237
- "step": 1150
1238
- },
1239
- {
1240
- "entropy": 0.4516189154237509,
1241
- "epoch": 1.8559999999999999,
1242
- "grad_norm": 0.5056102871894836,
1243
- "learning_rate": 6.2944e-05,
1244
- "loss": 0.4127,
1245
- "mean_token_accuracy": 0.8801082350313664,
1246
- "num_tokens": 1011268.0,
1247
- "step": 1160
1248
- },
1249
- {
1250
- "entropy": 0.5173742642626167,
1251
- "epoch": 1.8719999999999999,
1252
- "grad_norm": 0.6503537893295288,
1253
- "learning_rate": 6.2624e-05,
1254
- "loss": 0.48,
1255
- "mean_token_accuracy": 0.8607568740844727,
1256
- "num_tokens": 1026942.0,
1257
- "step": 1170
1258
- },
1259
- {
1260
- "entropy": 0.4701320366002619,
1261
- "epoch": 1.888,
1262
- "grad_norm": 0.3796524405479431,
1263
- "learning_rate": 6.2304e-05,
1264
- "loss": 0.4387,
1265
- "mean_token_accuracy": 0.8685618557035923,
1266
- "num_tokens": 1053937.0,
1267
- "step": 1180
1268
- },
1269
- {
1270
- "entropy": 0.3984457287937403,
1271
- "epoch": 1.904,
1272
- "grad_norm": 0.4399532377719879,
1273
- "learning_rate": 6.1984e-05,
1274
- "loss": 0.3885,
1275
- "mean_token_accuracy": 0.8824849870055914,
1276
- "num_tokens": 1084820.0,
1277
- "step": 1190
1278
- },
1279
- {
1280
- "entropy": 0.44145693685859444,
1281
- "epoch": 1.92,
1282
- "grad_norm": 0.44594088196754456,
1283
- "learning_rate": 6.1664e-05,
1284
- "loss": 0.3917,
1285
- "mean_token_accuracy": 0.884940878674388,
1286
- "num_tokens": 1109943.0,
1287
- "step": 1200
1288
- },
1289
- {
1290
- "epoch": 1.92,
1291
- "eval_entropy": 0.47450968527793885,
1292
- "eval_loss": 0.5091220140457153,
1293
- "eval_mean_token_accuracy": 0.8581691147089004,
1294
- "eval_num_tokens": 1109943.0,
1295
- "eval_runtime": 897.2539,
1296
- "eval_samples_per_second": 2.229,
1297
- "eval_steps_per_second": 0.557,
1298
- "step": 1200
1299
- },
1300
- {
1301
- "entropy": 0.4571360006928444,
1302
- "epoch": 1.936,
1303
- "grad_norm": 0.5302743315696716,
1304
- "learning_rate": 6.1344e-05,
1305
- "loss": 0.4167,
1306
- "mean_token_accuracy": 0.8782664395868778,
1307
- "num_tokens": 1130543.0,
1308
- "step": 1210
1309
- },
1310
- {
1311
- "entropy": 0.49114823453128337,
1312
- "epoch": 1.952,
1313
- "grad_norm": 0.6523593664169312,
1314
- "learning_rate": 6.1024000000000004e-05,
1315
- "loss": 0.4495,
1316
- "mean_token_accuracy": 0.8662942215800286,
1317
- "num_tokens": 1146676.0,
1318
- "step": 1220
1319
- },
1320
- {
1321
- "entropy": 0.46395022002980113,
1322
- "epoch": 1.968,
1323
- "grad_norm": 0.42906099557876587,
1324
- "learning_rate": 6.070400000000001e-05,
1325
- "loss": 0.4392,
1326
- "mean_token_accuracy": 0.8659089788794517,
1327
- "num_tokens": 1172078.0,
1328
- "step": 1230
1329
- },
1330
- {
1331
- "entropy": 0.4239502627402544,
1332
- "epoch": 1.984,
1333
- "grad_norm": 0.5165457129478455,
1334
- "learning_rate": 6.038400000000001e-05,
1335
- "loss": 0.4067,
1336
- "mean_token_accuracy": 0.8755033057183027,
1337
- "num_tokens": 1197464.0,
1338
- "step": 1240
1339
- },
1340
- {
1341
- "entropy": 0.5474816044792533,
1342
- "epoch": 2.0,
1343
- "grad_norm": 0.7312328219413757,
1344
- "learning_rate": 6.0064e-05,
1345
- "loss": 0.4816,
1346
- "mean_token_accuracy": 0.8589978538453579,
1347
- "num_tokens": 1212204.0,
1348
- "step": 1250
1349
- },
1350
- {
1351
- "entropy": 0.3549959819763899,
1352
- "epoch": 2.016,
1353
- "grad_norm": 0.44957467913627625,
1354
- "learning_rate": 5.9744e-05,
1355
- "loss": 0.329,
1356
- "mean_token_accuracy": 0.8989395320415496,
1357
- "num_tokens": 1253503.0,
1358
- "step": 1260
1359
- },
1360
- {
1361
- "entropy": 0.3744832394644618,
1362
- "epoch": 2.032,
1363
- "grad_norm": 0.48583275079727173,
1364
- "learning_rate": 5.9424e-05,
1365
- "loss": 0.3574,
1366
- "mean_token_accuracy": 0.8903608873486519,
1367
- "num_tokens": 1282285.0,
1368
- "step": 1270
1369
- },
1370
- {
1371
- "entropy": 0.4087462780997157,
1372
- "epoch": 2.048,
1373
- "grad_norm": 0.5598016381263733,
1374
- "learning_rate": 5.9104e-05,
1375
- "loss": 0.3668,
1376
- "mean_token_accuracy": 0.8913519535213709,
1377
- "num_tokens": 1305738.0,
1378
- "step": 1280
1379
- },
1380
- {
1381
- "entropy": 0.4285904698073864,
1382
- "epoch": 2.064,
1383
- "grad_norm": 0.637881338596344,
1384
- "learning_rate": 5.8784000000000005e-05,
1385
- "loss": 0.3693,
1386
- "mean_token_accuracy": 0.8924054179340601,
1387
- "num_tokens": 1324499.0,
1388
- "step": 1290
1389
- },
1390
- {
1391
- "entropy": 0.46786304665729406,
1392
- "epoch": 2.08,
1393
- "grad_norm": 0.7997815012931824,
1394
- "learning_rate": 5.846400000000001e-05,
1395
- "loss": 0.3988,
1396
- "mean_token_accuracy": 0.8819302976131439,
1397
- "num_tokens": 1337343.0,
1398
- "step": 1300
1399
- },
1400
- {
1401
- "entropy": 0.30622370541095734,
1402
- "epoch": 2.096,
1403
- "grad_norm": 0.46214577555656433,
1404
- "learning_rate": 5.8144e-05,
1405
- "loss": 0.3356,
1406
- "mean_token_accuracy": 0.8964022137224674,
1407
- "num_tokens": 1378091.0,
1408
- "step": 1310
1409
- },
1410
- {
1411
- "entropy": 0.38422031346708535,
1412
- "epoch": 2.112,
1413
- "grad_norm": 0.5669556856155396,
1414
- "learning_rate": 5.7824e-05,
1415
- "loss": 0.3431,
1416
- "mean_token_accuracy": 0.8970716085284949,
1417
- "num_tokens": 1406636.0,
1418
- "step": 1320
1419
- },
1420
- {
1421
- "entropy": 0.4110618421807885,
1422
- "epoch": 2.128,
1423
- "grad_norm": 0.5469350218772888,
1424
- "learning_rate": 5.7504000000000004e-05,
1425
- "loss": 0.3556,
1426
- "mean_token_accuracy": 0.8946326076984406,
1427
- "num_tokens": 1429756.0,
1428
- "step": 1330
1429
- },
1430
- {
1431
- "entropy": 0.41729052886366846,
1432
- "epoch": 2.144,
1433
- "grad_norm": 0.5956342220306396,
1434
- "learning_rate": 5.718400000000001e-05,
1435
- "loss": 0.3707,
1436
- "mean_token_accuracy": 0.8897294741123914,
1437
- "num_tokens": 1448487.0,
1438
- "step": 1340
1439
- },
1440
- {
1441
- "entropy": 0.4881801651790738,
1442
- "epoch": 2.16,
1443
- "grad_norm": 1.0672754049301147,
1444
- "learning_rate": 5.6864e-05,
1445
- "loss": 0.4108,
1446
- "mean_token_accuracy": 0.8769895020872355,
1447
- "num_tokens": 1461450.0,
1448
- "step": 1350
1449
- },
1450
- {
1451
- "epoch": 2.16,
1452
- "eval_entropy": 0.3968213936388493,
1453
- "eval_loss": 0.5419190526008606,
1454
- "eval_mean_token_accuracy": 0.8568335684537888,
1455
- "eval_num_tokens": 1461450.0,
1456
- "eval_runtime": 896.689,
1457
- "eval_samples_per_second": 2.23,
1458
- "eval_steps_per_second": 0.558,
1459
- "step": 1350
1460
- },
1461
- {
1462
- "entropy": 0.3141488812863827,
1463
- "epoch": 2.176,
1464
- "grad_norm": 0.47934019565582275,
1465
- "learning_rate": 5.6544000000000006e-05,
1466
- "loss": 0.3396,
1467
- "mean_token_accuracy": 0.8963223662227392,
1468
- "num_tokens": 1501473.0,
1469
- "step": 1360
1470
- },
1471
- {
1472
- "entropy": 0.37752851136028764,
1473
- "epoch": 2.192,
1474
- "grad_norm": 0.580359160900116,
1475
- "learning_rate": 5.6223999999999996e-05,
1476
- "loss": 0.345,
1477
- "mean_token_accuracy": 0.8945828888565301,
1478
- "num_tokens": 1529468.0,
1479
- "step": 1370
1480
- },
1481
- {
1482
- "entropy": 0.4134438899345696,
1483
- "epoch": 2.208,
1484
- "grad_norm": 0.6379365921020508,
1485
- "learning_rate": 5.5904e-05,
1486
- "loss": 0.3692,
1487
- "mean_token_accuracy": 0.8921185087412595,
1488
- "num_tokens": 1552194.0,
1489
- "step": 1380
1490
- },
1491
- {
1492
- "entropy": 0.4285835810005665,
1493
- "epoch": 2.224,
1494
- "grad_norm": 0.7130568027496338,
1495
- "learning_rate": 5.5584e-05,
1496
- "loss": 0.3708,
1497
- "mean_token_accuracy": 0.8914431348443032,
1498
- "num_tokens": 1570340.0,
1499
- "step": 1390
1500
- },
1501
- {
1502
- "entropy": 0.4728871438652277,
1503
- "epoch": 2.24,
1504
- "grad_norm": 0.8636120557785034,
1505
- "learning_rate": 5.5264000000000005e-05,
1506
- "loss": 0.4016,
1507
- "mean_token_accuracy": 0.8787129417061805,
1508
- "num_tokens": 1583068.0,
1509
- "step": 1400
1510
- },
1511
- {
1512
- "entropy": 0.3159141786396503,
1513
- "epoch": 2.2560000000000002,
1514
- "grad_norm": 0.5734344720840454,
1515
- "learning_rate": 5.494400000000001e-05,
1516
- "loss": 0.3333,
1517
- "mean_token_accuracy": 0.8989784453064203,
1518
- "num_tokens": 1621780.0,
1519
- "step": 1410
1520
- },
1521
- {
1522
- "entropy": 0.3770693183876574,
1523
- "epoch": 2.2720000000000002,
1524
- "grad_norm": 0.5254765152931213,
1525
- "learning_rate": 5.4624e-05,
1526
- "loss": 0.3526,
1527
- "mean_token_accuracy": 0.8930392079055309,
1528
- "num_tokens": 1649762.0,
1529
- "step": 1420
1530
- },
1531
- {
1532
- "entropy": 0.42590463180094956,
1533
- "epoch": 2.288,
1534
- "grad_norm": 0.6342437267303467,
1535
- "learning_rate": 5.4304e-05,
1536
- "loss": 0.374,
1537
- "mean_token_accuracy": 0.8877194058150053,
1538
- "num_tokens": 1672433.0,
1539
- "step": 1430
1540
- },
1541
- {
1542
- "entropy": 0.4354470370337367,
1543
- "epoch": 2.304,
1544
- "grad_norm": 0.7154885530471802,
1545
- "learning_rate": 5.3984000000000004e-05,
1546
- "loss": 0.3778,
1547
- "mean_token_accuracy": 0.8892953939735889,
1548
- "num_tokens": 1690543.0,
1549
- "step": 1440
1550
- },
1551
- {
1552
- "entropy": 0.48633114621043205,
1553
- "epoch": 2.32,
1554
- "grad_norm": 1.0084096193313599,
1555
- "learning_rate": 5.3664e-05,
1556
- "loss": 0.4139,
1557
- "mean_token_accuracy": 0.8807312864810228,
1558
- "num_tokens": 1702841.0,
1559
- "step": 1450
1560
- },
1561
- {
1562
- "entropy": 0.3131198097020388,
1563
- "epoch": 2.336,
1564
- "grad_norm": 0.5311539769172668,
1565
- "learning_rate": 5.3344e-05,
1566
- "loss": 0.3289,
1567
- "mean_token_accuracy": 0.8994152408093214,
1568
- "num_tokens": 1742325.0,
1569
- "step": 1460
1570
- },
1571
- {
1572
- "entropy": 0.38374699037522075,
1573
- "epoch": 2.352,
1574
- "grad_norm": 0.4948159158229828,
1575
- "learning_rate": 5.3024000000000006e-05,
1576
- "loss": 0.3589,
1577
- "mean_token_accuracy": 0.8915071442723275,
1578
- "num_tokens": 1770764.0,
1579
- "step": 1470
1580
- },
1581
- {
1582
- "entropy": 0.42046497501432895,
1583
- "epoch": 2.368,
1584
- "grad_norm": 0.6284568309783936,
1585
- "learning_rate": 5.2703999999999995e-05,
1586
- "loss": 0.3595,
1587
- "mean_token_accuracy": 0.8936832427978516,
1588
- "num_tokens": 1793963.0,
1589
- "step": 1480
1590
- },
1591
- {
1592
- "entropy": 0.4265410235151649,
1593
- "epoch": 2.384,
1594
- "grad_norm": 0.6891266703605652,
1595
- "learning_rate": 5.2384e-05,
1596
- "loss": 0.39,
1597
- "mean_token_accuracy": 0.8861893687397242,
1598
- "num_tokens": 1812331.0,
1599
- "step": 1490
1600
- },
1601
- {
1602
- "entropy": 0.4833611447364092,
1603
- "epoch": 2.4,
1604
- "grad_norm": 0.91993248462677,
1605
- "learning_rate": 5.2064e-05,
1606
- "loss": 0.4097,
1607
- "mean_token_accuracy": 0.8784359741955996,
1608
- "num_tokens": 1824943.0,
1609
- "step": 1500
1610
- },
1611
- {
1612
- "epoch": 2.4,
1613
- "eval_entropy": 0.4156067478954792,
1614
- "eval_loss": 0.531775951385498,
1615
- "eval_mean_token_accuracy": 0.8573460700511932,
1616
- "eval_num_tokens": 1824943.0,
1617
- "eval_runtime": 896.7745,
1618
- "eval_samples_per_second": 2.23,
1619
- "eval_steps_per_second": 0.558,
1620
- "step": 1500
1621
- },
1622
- {
1623
- "entropy": 0.2992474908940494,
1624
- "epoch": 2.416,
1625
- "grad_norm": 0.43484658002853394,
1626
- "learning_rate": 5.1744000000000005e-05,
1627
- "loss": 0.293,
1628
- "mean_token_accuracy": 0.9087961092591286,
1629
- "num_tokens": 40453.0,
1630
- "step": 1510
1631
- },
1632
- {
1633
- "entropy": 0.3289525999687612,
1634
- "epoch": 2.432,
1635
- "grad_norm": 0.5937761664390564,
1636
- "learning_rate": 5.142400000000001e-05,
1637
- "loss": 0.303,
1638
- "mean_token_accuracy": 0.9087128143757581,
1639
- "num_tokens": 68853.0,
1640
- "step": 1520
1641
- },
1642
- {
1643
- "entropy": 0.3652105055749416,
1644
- "epoch": 2.448,
1645
- "grad_norm": 0.5499975681304932,
1646
- "learning_rate": 5.110400000000001e-05,
1647
- "loss": 0.3153,
1648
- "mean_token_accuracy": 0.9054864585399628,
1649
- "num_tokens": 92132.0,
1650
- "step": 1530
1651
- },
1652
- {
1653
- "entropy": 0.37480679620057344,
1654
- "epoch": 2.464,
1655
- "grad_norm": 0.8807706236839294,
1656
- "learning_rate": 5.0784e-05,
1657
- "loss": 0.3132,
1658
- "mean_token_accuracy": 0.9067570131272078,
1659
- "num_tokens": 110798.0,
1660
- "step": 1540
1661
- },
1662
- {
1663
- "entropy": 0.408511808142066,
1664
- "epoch": 2.48,
1665
- "grad_norm": 1.0242410898208618,
1666
- "learning_rate": 5.0464e-05,
1667
- "loss": 0.3242,
1668
- "mean_token_accuracy": 0.9066624633967877,
1669
- "num_tokens": 123425.0,
1670
- "step": 1550
1671
- },
1672
- {
1673
- "entropy": 0.27335043689236044,
1674
- "epoch": 2.496,
1675
- "grad_norm": 0.5802608728408813,
1676
- "learning_rate": 5.0144e-05,
1677
- "loss": 0.305,
1678
- "mean_token_accuracy": 0.9049082029610872,
1679
- "num_tokens": 164151.0,
1680
- "step": 1560
1681
- },
1682
- {
1683
- "entropy": 0.3443534100428224,
1684
- "epoch": 2.512,
1685
- "grad_norm": 0.5400863289833069,
1686
- "learning_rate": 4.9824e-05,
1687
- "loss": 0.3053,
1688
- "mean_token_accuracy": 0.9077682174742222,
1689
- "num_tokens": 192904.0,
1690
- "step": 1570
1691
- },
1692
- {
1693
- "entropy": 0.36752058789134023,
1694
- "epoch": 2.528,
1695
- "grad_norm": 0.6786855459213257,
1696
- "learning_rate": 4.9504e-05,
1697
- "loss": 0.3092,
1698
- "mean_token_accuracy": 0.9070601720362902,
1699
- "num_tokens": 215988.0,
1700
- "step": 1580
1701
- },
1702
- {
1703
- "entropy": 0.3739521996118128,
1704
- "epoch": 2.544,
1705
- "grad_norm": 0.821361243724823,
1706
- "learning_rate": 4.9184e-05,
1707
- "loss": 0.3238,
1708
- "mean_token_accuracy": 0.9040915958583355,
1709
- "num_tokens": 234633.0,
1710
- "step": 1590
1711
- },
1712
- {
1713
- "entropy": 0.4124453643336892,
1714
- "epoch": 2.56,
1715
- "grad_norm": 1.0654460191726685,
1716
- "learning_rate": 4.8864000000000005e-05,
1717
- "loss": 0.3293,
1718
- "mean_token_accuracy": 0.9035760186612606,
1719
- "num_tokens": 247750.0,
1720
- "step": 1600
1721
- },
1722
- {
1723
- "epoch": 2.56,
1724
- "eval_entropy": 0.3808738026022911,
1725
- "eval_loss": 0.5724619626998901,
1726
- "eval_mean_token_accuracy": 0.8541958237886429,
1727
- "eval_num_tokens": 247750.0,
1728
- "eval_runtime": 980.7136,
1729
- "eval_samples_per_second": 2.039,
1730
- "eval_steps_per_second": 0.51,
1731
- "step": 1600
1732
- },
1733
- {
1734
- "entropy": 0.27413347605615856,
1735
- "epoch": 2.576,
1736
- "grad_norm": 0.6262645125389099,
1737
- "learning_rate": 4.8544e-05,
1738
- "loss": 0.291,
1739
- "mean_token_accuracy": 0.909802608937025,
1740
- "num_tokens": 289137.0,
1741
- "step": 1610
1742
- },
1743
- {
1744
- "entropy": 0.3372902118600905,
1745
- "epoch": 2.592,
1746
- "grad_norm": 0.6019719243049622,
1747
- "learning_rate": 4.8224000000000004e-05,
1748
- "loss": 0.3089,
1749
- "mean_token_accuracy": 0.9065854378044605,
1750
- "num_tokens": 317789.0,
1751
- "step": 1620
1752
- },
1753
- {
1754
- "entropy": 0.37745234509930015,
1755
- "epoch": 2.608,
1756
- "grad_norm": 0.6852167248725891,
1757
- "learning_rate": 4.790400000000001e-05,
1758
- "loss": 0.3237,
1759
- "mean_token_accuracy": 0.9017773322761059,
1760
- "num_tokens": 340977.0,
1761
- "step": 1630
1762
- },
1763
- {
1764
- "entropy": 0.3725322958081961,
1765
- "epoch": 2.624,
1766
- "grad_norm": 0.7118895053863525,
1767
- "learning_rate": 4.7584000000000004e-05,
1768
- "loss": 0.3207,
1769
- "mean_token_accuracy": 0.9077424634248018,
1770
- "num_tokens": 360098.0,
1771
- "step": 1640
1772
- },
1773
- {
1774
- "entropy": 0.4033573804423213,
1775
- "epoch": 2.64,
1776
- "grad_norm": 1.0586738586425781,
1777
- "learning_rate": 4.7264e-05,
1778
- "loss": 0.3174,
1779
- "mean_token_accuracy": 0.9044062152504921,
1780
- "num_tokens": 373200.0,
1781
- "step": 1650
1782
- },
1783
- {
1784
- "entropy": 0.2776737127453089,
1785
- "epoch": 2.656,
1786
- "grad_norm": 0.6017902493476868,
1787
- "learning_rate": 4.6944e-05,
1788
- "loss": 0.2942,
1789
- "mean_token_accuracy": 0.9093959752470255,
1790
- "num_tokens": 413938.0,
1791
- "step": 1660
1792
- },
1793
- {
1794
- "entropy": 0.33967588590458037,
1795
- "epoch": 2.672,
1796
- "grad_norm": 0.6162438988685608,
1797
- "learning_rate": 4.6624e-05,
1798
- "loss": 0.3075,
1799
- "mean_token_accuracy": 0.905268831551075,
1800
- "num_tokens": 442794.0,
1801
- "step": 1670
1802
- },
1803
- {
1804
- "entropy": 0.37314077839255333,
1805
- "epoch": 2.6879999999999997,
1806
- "grad_norm": 0.6455461382865906,
1807
- "learning_rate": 4.6304e-05,
1808
- "loss": 0.312,
1809
- "mean_token_accuracy": 0.9044175367802382,
1810
- "num_tokens": 465992.0,
1811
- "step": 1680
1812
- },
1813
- {
1814
- "entropy": 0.3640971322543919,
1815
- "epoch": 2.7039999999999997,
1816
- "grad_norm": 0.7681553959846497,
1817
- "learning_rate": 4.5984000000000006e-05,
1818
- "loss": 0.3049,
1819
- "mean_token_accuracy": 0.9096171893179417,
1820
- "num_tokens": 484580.0,
1821
- "step": 1690
1822
- },
1823
- {
1824
- "entropy": 0.39063505809754134,
1825
- "epoch": 2.7199999999999998,
1826
- "grad_norm": 0.9511684775352478,
1827
- "learning_rate": 4.5664e-05,
1828
- "loss": 0.3225,
1829
- "mean_token_accuracy": 0.9034549340605735,
1830
- "num_tokens": 497612.0,
1831
- "step": 1700
1832
- },
1833
- {
1834
- "entropy": 0.2883146867156029,
1835
- "epoch": 2.7359999999999998,
1836
- "grad_norm": 0.6692296862602234,
1837
- "learning_rate": 4.5344000000000005e-05,
1838
- "loss": 0.2935,
1839
- "mean_token_accuracy": 0.9078109141439199,
1840
- "num_tokens": 537755.0,
1841
- "step": 1710
1842
- },
1843
- {
1844
- "entropy": 0.34244058514013886,
1845
- "epoch": 2.752,
1846
- "grad_norm": 0.5983220934867859,
1847
- "learning_rate": 4.5024e-05,
1848
- "loss": 0.3076,
1849
- "mean_token_accuracy": 0.9057810723781585,
1850
- "num_tokens": 566325.0,
1851
- "step": 1720
1852
- },
1853
- {
1854
- "entropy": 0.3659200777299702,
1855
- "epoch": 2.768,
1856
- "grad_norm": 0.7049655318260193,
1857
- "learning_rate": 4.4704000000000004e-05,
1858
- "loss": 0.3059,
1859
- "mean_token_accuracy": 0.9072589132934809,
1860
- "num_tokens": 589517.0,
1861
- "step": 1730
1862
- },
1863
- {
1864
- "entropy": 0.35552563723176717,
1865
- "epoch": 2.784,
1866
- "grad_norm": 0.7242270112037659,
1867
- "learning_rate": 4.4384e-05,
1868
- "loss": 0.3013,
1869
- "mean_token_accuracy": 0.912841784581542,
1870
- "num_tokens": 608224.0,
1871
- "step": 1740
1872
- },
1873
- {
1874
- "entropy": 0.4027377144433558,
1875
- "epoch": 2.8,
1876
- "grad_norm": 1.5430299043655396,
1877
- "learning_rate": 4.4064e-05,
1878
- "loss": 0.3223,
1879
- "mean_token_accuracy": 0.9028574671596289,
1880
- "num_tokens": 621051.0,
1881
- "step": 1750
1882
- },
1883
- {
1884
- "entropy": 0.2703737439122051,
1885
- "epoch": 2.816,
1886
- "grad_norm": 0.7151817083358765,
1887
- "learning_rate": 4.3744e-05,
1888
- "loss": 0.2894,
1889
- "mean_token_accuracy": 0.9102732315659523,
1890
- "num_tokens": 662133.0,
1891
- "step": 1760
1892
- },
1893
- {
1894
- "entropy": 0.32695954395458104,
1895
- "epoch": 2.832,
1896
- "grad_norm": 0.6097021698951721,
1897
- "learning_rate": 4.3424e-05,
1898
- "loss": 0.2967,
1899
- "mean_token_accuracy": 0.9080837737768889,
1900
- "num_tokens": 690682.0,
1901
- "step": 1770
1902
- },
1903
- {
1904
- "entropy": 0.36010922444984317,
1905
- "epoch": 2.848,
1906
- "grad_norm": 0.7698465585708618,
1907
- "learning_rate": 4.3104e-05,
1908
- "loss": 0.3064,
1909
- "mean_token_accuracy": 0.9076121047139167,
1910
- "num_tokens": 713519.0,
1911
- "step": 1780
1912
- },
1913
- {
1914
- "entropy": 0.369490017183125,
1915
- "epoch": 2.864,
1916
- "grad_norm": 0.997474730014801,
1917
- "learning_rate": 4.2784e-05,
1918
- "loss": 0.3153,
1919
- "mean_token_accuracy": 0.9070124924182892,
1920
- "num_tokens": 731712.0,
1921
- "step": 1790
1922
- },
1923
- {
1924
- "entropy": 0.41184745989739896,
1925
- "epoch": 2.88,
1926
- "grad_norm": 0.9906476736068726,
1927
- "learning_rate": 4.2464000000000005e-05,
1928
- "loss": 0.3325,
1929
- "mean_token_accuracy": 0.9020481187850237,
1930
- "num_tokens": 744149.0,
1931
- "step": 1800
1932
- },
1933
- {
1934
- "entropy": 0.28201086847111584,
1935
- "epoch": 2.896,
1936
- "grad_norm": 0.6134458184242249,
1937
- "learning_rate": 4.2144e-05,
1938
- "loss": 0.2988,
1939
- "mean_token_accuracy": 0.9069436389952898,
1940
- "num_tokens": 782193.0,
1941
- "step": 1810
1942
- },
1943
- {
1944
- "entropy": 0.33303718706592916,
1945
- "epoch": 2.912,
1946
- "grad_norm": 0.6062189936637878,
1947
- "learning_rate": 4.1824000000000005e-05,
1948
- "loss": 0.3086,
1949
- "mean_token_accuracy": 0.9056244477629661,
1950
- "num_tokens": 809927.0,
1951
- "step": 1820
1952
- },
1953
- {
1954
- "entropy": 0.3643056120723486,
1955
- "epoch": 2.928,
1956
- "grad_norm": 0.6338886618614197,
1957
- "learning_rate": 4.1504e-05,
1958
- "loss": 0.3035,
1959
- "mean_token_accuracy": 0.911867779865861,
1960
- "num_tokens": 832745.0,
1961
- "step": 1830
1962
- },
1963
- {
1964
- "entropy": 0.35973973935469983,
1965
- "epoch": 2.944,
1966
- "grad_norm": 0.8483228087425232,
1967
- "learning_rate": 4.1184e-05,
1968
- "loss": 0.3084,
1969
- "mean_token_accuracy": 0.9093430683016777,
1970
- "num_tokens": 851193.0,
1971
- "step": 1840
1972
- },
1973
- {
1974
- "entropy": 0.4053435407578945,
1975
- "epoch": 2.96,
1976
- "grad_norm": 0.9516308903694153,
1977
- "learning_rate": 4.0864e-05,
1978
- "loss": 0.332,
1979
- "mean_token_accuracy": 0.8999160658568144,
1980
- "num_tokens": 863867.0,
1981
- "step": 1850
1982
- },
1983
- {
1984
- "entropy": 0.2989065528847277,
1985
- "epoch": 2.976,
1986
- "grad_norm": 0.6929520964622498,
1987
- "learning_rate": 4.0544000000000003e-05,
1988
- "loss": 0.2943,
1989
- "mean_token_accuracy": 0.9087879080325365,
1990
- "num_tokens": 898118.0,
1991
- "step": 1860
1992
- },
1993
- {
1994
- "entropy": 0.3597102670930326,
1995
- "epoch": 2.992,
1996
- "grad_norm": 0.7972533106803894,
1997
- "learning_rate": 4.0224e-05,
1998
- "loss": 0.3215,
1999
- "mean_token_accuracy": 0.902438759058714,
2000
- "num_tokens": 918026.0,
2001
- "step": 1870
2002
- },
2003
- {
2004
- "entropy": 0.3693191984202713,
2005
- "epoch": 3.008,
2006
- "grad_norm": 0.4952141344547272,
2007
- "learning_rate": 3.9904e-05,
2008
- "loss": 0.3109,
2009
- "mean_token_accuracy": 0.9047053713351488,
2010
- "num_tokens": 946468.0,
2011
- "step": 1880
2012
- },
2013
- {
2014
- "entropy": 0.30884325662627815,
2015
- "epoch": 3.024,
2016
- "grad_norm": 0.6402750015258789,
2017
- "learning_rate": 3.9584000000000006e-05,
2018
- "loss": 0.287,
2019
- "mean_token_accuracy": 0.9127614002674818,
2020
- "num_tokens": 978498.0,
2021
- "step": 1890
2022
- },
2023
- {
2024
- "entropy": 0.3251019007526338,
2025
- "epoch": 3.04,
2026
- "grad_norm": 0.7701610326766968,
2027
- "learning_rate": 3.9264e-05,
2028
- "loss": 0.3012,
2029
- "mean_token_accuracy": 0.9117080509662628,
2030
- "num_tokens": 1004128.0,
2031
- "step": 1900
2032
- },
2033
- {
2034
- "entropy": 0.3512966329231858,
2035
- "epoch": 3.056,
2036
- "grad_norm": 0.934260368347168,
2037
- "learning_rate": 3.8944000000000005e-05,
2038
- "loss": 0.2996,
2039
- "mean_token_accuracy": 0.9139776781201363,
2040
- "num_tokens": 1025136.0,
2041
- "step": 1910
2042
- },
2043
- {
2044
- "entropy": 0.36649829614907503,
2045
- "epoch": 3.072,
2046
- "grad_norm": 1.147735357284546,
2047
- "learning_rate": 3.8624e-05,
2048
- "loss": 0.3172,
2049
- "mean_token_accuracy": 0.90965236723423,
2050
- "num_tokens": 1041157.0,
2051
- "step": 1920
2052
- },
2053
- {
2054
- "entropy": 0.33526935083791615,
2055
- "epoch": 3.088,
2056
- "grad_norm": 0.6278552412986755,
2057
- "learning_rate": 3.8304e-05,
2058
- "loss": 0.294,
2059
- "mean_token_accuracy": 0.914416927471757,
2060
- "num_tokens": 1069401.0,
2061
- "step": 1930
2062
- },
2063
- {
2064
- "entropy": 0.2916401638649404,
2065
- "epoch": 3.104,
2066
- "grad_norm": 0.7106419205665588,
2067
- "learning_rate": 3.7984e-05,
2068
- "loss": 0.2833,
2069
- "mean_token_accuracy": 0.9128728475421667,
2070
- "num_tokens": 1101705.0,
2071
- "step": 1940
2072
- },
2073
- {
2074
- "entropy": 0.31783650666475294,
2075
- "epoch": 3.12,
2076
- "grad_norm": 0.6372864246368408,
2077
- "learning_rate": 3.7664e-05,
2078
- "loss": 0.2808,
2079
- "mean_token_accuracy": 0.9190873377025127,
2080
- "num_tokens": 1127173.0,
2081
- "step": 1950
2082
- },
2083
- {
2084
- "entropy": 0.33883463945239783,
2085
- "epoch": 3.136,
2086
- "grad_norm": 0.7593994736671448,
2087
- "learning_rate": 3.7344e-05,
2088
- "loss": 0.2932,
2089
- "mean_token_accuracy": 0.9133320480585099,
2090
- "num_tokens": 1147878.0,
2091
- "step": 1960
2092
- },
2093
- {
2094
- "entropy": 0.36267717741429806,
2095
- "epoch": 3.152,
2096
- "grad_norm": 0.9578737616539001,
2097
- "learning_rate": 3.7024e-05,
2098
- "loss": 0.3018,
2099
- "mean_token_accuracy": 0.9135202784091234,
2100
- "num_tokens": 1164084.0,
2101
- "step": 1970
2102
- },
2103
- {
2104
- "entropy": 0.33903956757858394,
2105
- "epoch": 3.168,
2106
- "grad_norm": 0.5553727746009827,
2107
- "learning_rate": 3.6704e-05,
2108
- "loss": 0.2962,
2109
- "mean_token_accuracy": 0.9128197953104973,
2110
- "num_tokens": 1192486.0,
2111
- "step": 1980
2112
- },
2113
- {
2114
- "entropy": 0.2897605660371482,
2115
- "epoch": 3.184,
2116
- "grad_norm": 0.7067289352416992,
2117
- "learning_rate": 3.6384e-05,
2118
- "loss": 0.2867,
2119
- "mean_token_accuracy": 0.9137052699923516,
2120
- "num_tokens": 1224540.0,
2121
- "step": 1990
2122
- },
2123
- {
2124
- "entropy": 0.32448912151157855,
2125
- "epoch": 3.2,
2126
- "grad_norm": 0.7603920102119446,
2127
- "learning_rate": 3.6064000000000006e-05,
2128
- "loss": 0.2908,
2129
- "mean_token_accuracy": 0.9150090869516134,
2130
- "num_tokens": 1249827.0,
2131
- "step": 2000
2132
- },
2133
- {
2134
- "epoch": 3.2,
2135
- "eval_entropy": 0.4150727687478066,
2136
- "eval_loss": 0.5455561280250549,
2137
- "eval_mean_token_accuracy": 0.857409807562828,
2138
- "eval_num_tokens": 1249827.0,
2139
- "eval_runtime": 982.2461,
2140
- "eval_samples_per_second": 2.036,
2141
- "eval_steps_per_second": 0.509,
2142
- "step": 2000
2143
- },
2144
- {
2145
- "entropy": 0.3617474281229079,
2146
- "epoch": 3.216,
2147
- "grad_norm": 0.7705036997795105,
2148
- "learning_rate": 3.5744e-05,
2149
- "loss": 0.3175,
2150
- "mean_token_accuracy": 0.9062783475965261,
2151
- "num_tokens": 20779.0,
2152
- "step": 2010
2153
- },
2154
- {
2155
- "entropy": 0.3887558562681079,
2156
- "epoch": 3.232,
2157
- "grad_norm": 0.9926668405532837,
2158
- "learning_rate": 3.5424e-05,
2159
- "loss": 0.3243,
2160
- "mean_token_accuracy": 0.9048940639942884,
2161
- "num_tokens": 37039.0,
2162
- "step": 2020
2163
- },
2164
- {
2165
- "entropy": 0.36308987056836484,
2166
- "epoch": 3.248,
2167
- "grad_norm": 0.5336251258850098,
2168
- "learning_rate": 3.5104e-05,
2169
- "loss": 0.3286,
2170
- "mean_token_accuracy": 0.9028704173862934,
2171
- "num_tokens": 66230.0,
2172
- "step": 2030
2173
- },
2174
- {
2175
- "entropy": 0.3100855226628482,
2176
- "epoch": 3.2640000000000002,
2177
- "grad_norm": 0.6235008239746094,
2178
- "learning_rate": 3.4784e-05,
2179
- "loss": 0.3026,
2180
- "mean_token_accuracy": 0.9074051853269338,
2181
- "num_tokens": 98315.0,
2182
- "step": 2040
2183
- },
2184
- {
2185
- "entropy": 0.33463340234011413,
2186
- "epoch": 3.2800000000000002,
2187
- "grad_norm": 0.6380220651626587,
2188
- "learning_rate": 3.4464e-05,
2189
- "loss": 0.3058,
2190
- "mean_token_accuracy": 0.9115277793258428,
2191
- "num_tokens": 123538.0,
2192
- "step": 2050
2193
- },
2194
- {
2195
- "entropy": 0.3619419479742646,
2196
- "epoch": 3.296,
2197
- "grad_norm": 0.7604582905769348,
2198
- "learning_rate": 3.4144000000000004e-05,
2199
- "loss": 0.3112,
2200
- "mean_token_accuracy": 0.9084025923162699,
2201
- "num_tokens": 143855.0,
2202
- "step": 2060
2203
- },
2204
- {
2205
- "entropy": 0.3980453579686582,
2206
- "epoch": 3.312,
2207
- "grad_norm": 0.8576037883758545,
2208
- "learning_rate": 3.3824e-05,
2209
- "loss": 0.3267,
2210
- "mean_token_accuracy": 0.9037791218608617,
2211
- "num_tokens": 159314.0,
2212
- "step": 2070
2213
- },
2214
- {
2215
- "entropy": 0.35077386572957037,
2216
- "epoch": 3.328,
2217
- "grad_norm": 0.5504621863365173,
2218
- "learning_rate": 3.3504e-05,
2219
- "loss": 0.3004,
2220
- "mean_token_accuracy": 0.9084354028105736,
2221
- "num_tokens": 187464.0,
2222
- "step": 2080
2223
- },
2224
- {
2225
- "entropy": 0.28209723997861147,
2226
- "epoch": 3.344,
2227
- "grad_norm": 0.8361979126930237,
2228
- "learning_rate": 3.3184000000000006e-05,
2229
- "loss": 0.2903,
2230
- "mean_token_accuracy": 0.9112230580300092,
2231
- "num_tokens": 219657.0,
2232
- "step": 2090
2233
- },
2234
- {
2235
- "entropy": 0.3153431011363864,
2236
- "epoch": 3.36,
2237
- "grad_norm": 0.6275749802589417,
2238
- "learning_rate": 3.2864e-05,
2239
- "loss": 0.2894,
2240
- "mean_token_accuracy": 0.9114996068179607,
2241
- "num_tokens": 245396.0,
2242
- "step": 2100
2243
- },
2244
- {
2245
- "epoch": 3.36,
2246
- "eval_accuracy": 0.026501569905019107,
2247
- "eval_entropy": 0.4113759865760803,
2248
- "eval_loss": 0.541074275970459,
2249
- "eval_mean_token_accuracy": 0.8583663606643677,
2250
- "eval_num_tokens": 245396.0,
2251
- "eval_runtime": 869.6626,
2252
- "eval_samples_per_second": 2.3,
2253
- "eval_steps_per_second": 0.575,
2254
- "step": 2100
2255
- },
2256
- {
2257
- "entropy": 0.3517730229534209,
2258
- "epoch": 3.376,
2259
- "grad_norm": 0.6908054947853088,
2260
- "learning_rate": 3.2544000000000006e-05,
2261
- "loss": 0.3057,
2262
- "mean_token_accuracy": 0.9103573642671108,
2263
- "num_tokens": 266432.0,
2264
- "step": 2110
2265
- },
2266
- {
2267
- "entropy": 0.38618900515139104,
2268
- "epoch": 3.392,
2269
- "grad_norm": 0.9056383967399597,
2270
- "learning_rate": 3.2224e-05,
2271
- "loss": 0.3188,
2272
- "mean_token_accuracy": 0.9076898027211427,
2273
- "num_tokens": 282655.0,
2274
- "step": 2120
2275
- },
2276
- {
2277
- "entropy": 0.3537537831813097,
2278
- "epoch": 3.408,
2279
- "grad_norm": 0.48644715547561646,
2280
- "learning_rate": 3.1904e-05,
2281
- "loss": 0.2886,
2282
- "mean_token_accuracy": 0.9162093725055456,
2283
- "num_tokens": 310801.0,
2284
- "step": 2130
2285
- },
2286
- {
2287
- "entropy": 0.26729877749457953,
2288
- "epoch": 3.424,
2289
- "grad_norm": 0.6074755787849426,
2290
- "learning_rate": 3.1584e-05,
2291
- "loss": 0.2371,
2292
- "mean_token_accuracy": 0.9263024788349867,
2293
- "num_tokens": 343555.0,
2294
- "step": 2140
2295
- },
2296
- {
2297
- "entropy": 0.25955253606662154,
2298
- "epoch": 3.44,
2299
- "grad_norm": 0.8773949146270752,
2300
- "learning_rate": 3.1264e-05,
2301
- "loss": 0.2227,
2302
- "mean_token_accuracy": 0.9337353933602571,
2303
- "num_tokens": 369134.0,
2304
- "step": 2150
2305
- },
2306
- {
2307
- "entropy": 0.27338800597935914,
2308
- "epoch": 3.456,
2309
- "grad_norm": 0.7504522204399109,
2310
- "learning_rate": 3.0975999999999996e-05,
2311
- "loss": 0.2261,
2312
- "mean_token_accuracy": 0.9332862004637719,
2313
- "num_tokens": 390152.0,
2314
- "step": 2160
2315
- },
2316
- {
2317
- "entropy": 0.30181694105267526,
2318
- "epoch": 3.472,
2319
- "grad_norm": 0.8649200201034546,
2320
- "learning_rate": 3.0656e-05,
2321
- "loss": 0.2289,
2322
- "mean_token_accuracy": 0.9334215141832829,
2323
- "num_tokens": 406222.0,
2324
- "step": 2170
2325
- },
2326
- {
2327
- "entropy": 0.28406244921498003,
2328
- "epoch": 3.488,
2329
- "grad_norm": 1.9269925355911255,
2330
- "learning_rate": 3.0336000000000002e-05,
2331
- "loss": 0.2353,
2332
- "mean_token_accuracy": 0.9303826864808797,
2333
- "num_tokens": 434767.0,
2334
- "step": 2180
2335
- },
2336
- {
2337
- "entropy": 0.2358154426328838,
2338
- "epoch": 3.504,
2339
- "grad_norm": 0.7775760293006897,
2340
- "learning_rate": 3.0016e-05,
2341
- "loss": 0.2277,
2342
- "mean_token_accuracy": 0.9293628957122564,
2343
- "num_tokens": 467498.0,
2344
- "step": 2190
2345
- },
2346
- {
2347
- "entropy": 0.2596265008673072,
2348
- "epoch": 3.52,
2349
- "grad_norm": 0.7286163568496704,
2350
- "learning_rate": 2.9696e-05,
2351
- "loss": 0.2266,
2352
- "mean_token_accuracy": 0.9321592267602682,
2353
- "num_tokens": 493146.0,
2354
- "step": 2200
2355
- },
2356
- {
2357
- "entropy": 0.28550293026492,
2358
- "epoch": 3.536,
2359
- "grad_norm": 0.7693914175033569,
2360
- "learning_rate": 2.9376000000000005e-05,
2361
- "loss": 0.2291,
2362
- "mean_token_accuracy": 0.9351058643311262,
2363
- "num_tokens": 513926.0,
2364
- "step": 2210
2365
- },
2366
- {
2367
- "entropy": 0.2885140863247216,
2368
- "epoch": 3.552,
2369
- "grad_norm": 1.1927505731582642,
2370
- "learning_rate": 2.9056e-05,
2371
- "loss": 0.219,
2372
- "mean_token_accuracy": 0.9396381825208664,
2373
- "num_tokens": 530263.0,
2374
- "step": 2220
2375
- },
2376
- {
2377
- "entropy": 0.283741835039109,
2378
- "epoch": 3.568,
2379
- "grad_norm": 0.6537899971008301,
2380
- "learning_rate": 2.8736e-05,
2381
- "loss": 0.2324,
2382
- "mean_token_accuracy": 0.9302929677069187,
2383
- "num_tokens": 559791.0,
2384
- "step": 2230
2385
- },
2386
- {
2387
- "entropy": 0.2369093818590045,
2388
- "epoch": 3.584,
2389
- "grad_norm": 0.793480396270752,
2390
- "learning_rate": 2.8416000000000004e-05,
2391
- "loss": 0.2165,
2392
- "mean_token_accuracy": 0.9320364937186241,
2393
- "num_tokens": 592398.0,
2394
- "step": 2240
2395
- },
2396
- {
2397
- "entropy": 0.264733817987144,
2398
- "epoch": 3.6,
2399
- "grad_norm": 0.7945203185081482,
2400
- "learning_rate": 2.8096e-05,
2401
- "loss": 0.2337,
2402
- "mean_token_accuracy": 0.9294226188212633,
2403
- "num_tokens": 617982.0,
2404
- "step": 2250
2405
- },
2406
- {
2407
- "entropy": 0.2889886857941747,
2408
- "epoch": 3.616,
2409
- "grad_norm": 0.7558261752128601,
2410
- "learning_rate": 2.7776000000000003e-05,
2411
- "loss": 0.2305,
2412
- "mean_token_accuracy": 0.9317790925502777,
2413
- "num_tokens": 639115.0,
2414
- "step": 2260
2415
- },
2416
- {
2417
- "entropy": 0.28708559228107333,
2418
- "epoch": 3.632,
2419
- "grad_norm": 0.6877163648605347,
2420
- "learning_rate": 2.7456000000000003e-05,
2421
- "loss": 0.2215,
2422
- "mean_token_accuracy": 0.9357377961277962,
2423
- "num_tokens": 655709.0,
2424
- "step": 2270
2425
- },
2426
- {
2427
- "entropy": 0.28660596534609795,
2428
- "epoch": 3.648,
2429
- "grad_norm": 0.6599491238594055,
2430
- "learning_rate": 2.7136e-05,
2431
- "loss": 0.2363,
2432
- "mean_token_accuracy": 0.928611570596695,
2433
- "num_tokens": 684500.0,
2434
- "step": 2280
2435
- },
2436
- {
2437
- "entropy": 0.23836621949449183,
2438
- "epoch": 3.664,
2439
- "grad_norm": 0.7436323165893555,
2440
- "learning_rate": 2.6816000000000002e-05,
2441
- "loss": 0.2194,
2442
- "mean_token_accuracy": 0.9314162913709879,
2443
- "num_tokens": 717271.0,
2444
- "step": 2290
2445
- },
2446
- {
2447
- "entropy": 0.27099227644503115,
2448
- "epoch": 3.68,
2449
- "grad_norm": 0.7519745826721191,
2450
- "learning_rate": 2.6496e-05,
2451
- "loss": 0.2369,
2452
- "mean_token_accuracy": 0.9278060872107744,
2453
- "num_tokens": 743068.0,
2454
- "step": 2300
2455
- },
2456
- {
2457
- "entropy": 0.282380092702806,
2458
- "epoch": 3.6959999999999997,
2459
- "grad_norm": 0.7645207643508911,
2460
- "learning_rate": 2.6176e-05,
2461
- "loss": 0.2175,
2462
- "mean_token_accuracy": 0.9372334524989128,
2463
- "num_tokens": 763925.0,
2464
- "step": 2310
2465
- },
2466
- {
2467
- "entropy": 0.2850790939293802,
2468
- "epoch": 3.7119999999999997,
2469
- "grad_norm": 0.9016556143760681,
2470
- "learning_rate": 2.5856e-05,
2471
- "loss": 0.217,
2472
- "mean_token_accuracy": 0.9392455574125051,
2473
- "num_tokens": 780111.0,
2474
- "step": 2320
2475
- },
2476
- {
2477
- "entropy": 0.2691464308649302,
2478
- "epoch": 3.7279999999999998,
2479
- "grad_norm": 0.77091383934021,
2480
- "learning_rate": 2.5535999999999997e-05,
2481
- "loss": 0.2334,
2482
- "mean_token_accuracy": 0.929338139295578,
2483
- "num_tokens": 808661.0,
2484
- "step": 2330
2485
- },
2486
- {
2487
- "entropy": 0.2395469973795116,
2488
- "epoch": 3.7439999999999998,
2489
- "grad_norm": 0.7632396221160889,
2490
- "learning_rate": 2.5216e-05,
2491
- "loss": 0.2148,
2492
- "mean_token_accuracy": 0.9322273649275303,
2493
- "num_tokens": 840932.0,
2494
- "step": 2340
2495
- },
2496
- {
2497
- "entropy": 0.2645680231973529,
2498
- "epoch": 3.76,
2499
- "grad_norm": 0.819273054599762,
2500
- "learning_rate": 2.4896e-05,
2501
- "loss": 0.226,
2502
- "mean_token_accuracy": 0.930556321516633,
2503
- "num_tokens": 866564.0,
2504
- "step": 2350
2505
- },
2506
- {
2507
- "entropy": 0.2808503101579845,
2508
- "epoch": 3.776,
2509
- "grad_norm": 0.8598120808601379,
2510
- "learning_rate": 2.4576000000000003e-05,
2511
- "loss": 0.2215,
2512
- "mean_token_accuracy": 0.9356644533574581,
2513
- "num_tokens": 887527.0,
2514
- "step": 2360
2515
- },
2516
- {
2517
- "entropy": 0.28694011168554423,
2518
- "epoch": 3.792,
2519
- "grad_norm": 1.0404748916625977,
2520
- "learning_rate": 2.4256e-05,
2521
- "loss": 0.214,
2522
- "mean_token_accuracy": 0.9388030290603637,
2523
- "num_tokens": 903688.0,
2524
- "step": 2370
2525
- },
2526
- {
2527
- "entropy": 0.2774578414391726,
2528
- "epoch": 3.808,
2529
- "grad_norm": 1.2308194637298584,
2530
- "learning_rate": 2.3936e-05,
2531
- "loss": 0.2328,
2532
- "mean_token_accuracy": 0.929581755027175,
2533
- "num_tokens": 932975.0,
2534
- "step": 2380
2535
- },
2536
- {
2537
- "entropy": 0.2381771973334253,
2538
- "epoch": 3.824,
2539
- "grad_norm": 0.7983541488647461,
2540
- "learning_rate": 2.3616000000000002e-05,
2541
- "loss": 0.2177,
2542
- "mean_token_accuracy": 0.9316004611551761,
2543
- "num_tokens": 965221.0,
2544
- "step": 2390
2545
- },
2546
- {
2547
- "entropy": 0.2579630766995251,
2548
- "epoch": 3.84,
2549
- "grad_norm": 0.8867554068565369,
2550
- "learning_rate": 2.3296000000000002e-05,
2551
- "loss": 0.2221,
2552
- "mean_token_accuracy": 0.9320516049861908,
2553
- "num_tokens": 990859.0,
2554
- "step": 2400
2555
- },
2556
- {
2557
- "epoch": 3.84,
2558
- "eval_accuracy": 0.02676376698545462,
2559
- "eval_entropy": 0.3534155045747757,
2560
- "eval_loss": 0.6058897972106934,
2561
- "eval_mean_token_accuracy": 0.8553497910499572,
2562
- "eval_num_tokens": 990859.0,
2563
- "eval_runtime": 869.2088,
2564
- "eval_samples_per_second": 2.301,
2565
- "eval_steps_per_second": 0.575,
2566
- "step": 2400
2567
- },
2568
- {
2569
- "entropy": 0.2655953477136791,
2570
- "epoch": 3.856,
2571
- "grad_norm": 0.8277497291564941,
2572
- "learning_rate": 2.2976e-05,
2573
- "loss": 0.2109,
2574
- "mean_token_accuracy": 0.9393812574446201,
2575
- "num_tokens": 1011268.0,
2576
- "step": 2410
2577
- },
2578
- {
2579
- "entropy": 0.2920296056661755,
2580
- "epoch": 3.872,
2581
- "grad_norm": 1.015434980392456,
2582
- "learning_rate": 2.2656e-05,
2583
- "loss": 0.2243,
2584
- "mean_token_accuracy": 0.9357186656445264,
2585
- "num_tokens": 1026942.0,
2586
- "step": 2420
2587
- },
2588
- {
2589
- "entropy": 0.2859017666429281,
2590
- "epoch": 3.888,
2591
- "grad_norm": 0.6656726002693176,
2592
- "learning_rate": 2.2336e-05,
2593
- "loss": 0.2389,
2594
- "mean_token_accuracy": 0.9283736657351256,
2595
- "num_tokens": 1053937.0,
2596
- "step": 2430
2597
- },
2598
- {
2599
- "entropy": 0.24961302392184734,
2600
- "epoch": 3.904,
2601
- "grad_norm": 0.8390278816223145,
2602
- "learning_rate": 2.2016e-05,
2603
- "loss": 0.2211,
2604
- "mean_token_accuracy": 0.9312011521309614,
2605
- "num_tokens": 1084820.0,
2606
- "step": 2440
2607
- },
2608
- {
2609
- "entropy": 0.2519187033176422,
2610
- "epoch": 3.92,
2611
- "grad_norm": 0.8542287349700928,
2612
- "learning_rate": 2.1696e-05,
2613
- "loss": 0.2126,
2614
- "mean_token_accuracy": 0.9375488836318254,
2615
- "num_tokens": 1109943.0,
2616
- "step": 2450
2617
- },
2618
- {
2619
- "entropy": 0.27277124775573613,
2620
- "epoch": 3.936,
2621
- "grad_norm": 0.9245595335960388,
2622
- "learning_rate": 2.1376e-05,
2623
- "loss": 0.2161,
2624
- "mean_token_accuracy": 0.9364014331251382,
2625
- "num_tokens": 1130543.0,
2626
- "step": 2460
2627
- },
2628
- {
2629
- "entropy": 0.28273853762075307,
2630
- "epoch": 3.952,
2631
- "grad_norm": 0.9764724969863892,
2632
- "learning_rate": 2.1056e-05,
2633
- "loss": 0.2217,
2634
- "mean_token_accuracy": 0.9356040749698877,
2635
- "num_tokens": 1146676.0,
2636
- "step": 2470
2637
- },
2638
- {
2639
- "entropy": 0.2879827093333006,
2640
- "epoch": 3.968,
2641
- "grad_norm": 0.7532303929328918,
2642
- "learning_rate": 2.0736e-05,
2643
- "loss": 0.2413,
2644
- "mean_token_accuracy": 0.9290374431759119,
2645
- "num_tokens": 1172078.0,
2646
- "step": 2480
2647
- },
2648
- {
2649
- "entropy": 0.2530561724677682,
2650
- "epoch": 3.984,
2651
- "grad_norm": 0.8568546175956726,
2652
- "learning_rate": 2.0416000000000002e-05,
2653
- "loss": 0.2177,
2654
- "mean_token_accuracy": 0.9337470591068268,
2655
- "num_tokens": 1197464.0,
2656
- "step": 2490
2657
- },
2658
- {
2659
- "entropy": 0.3038310568779707,
2660
- "epoch": 4.0,
2661
- "grad_norm": 0.9622617959976196,
2662
- "learning_rate": 2.0096000000000002e-05,
2663
- "loss": 0.2368,
2664
- "mean_token_accuracy": 0.9296225290745497,
2665
- "num_tokens": 1212204.0,
2666
- "step": 2500
2667
- },
2668
- {
2669
- "entropy": 0.24809251818805933,
2670
- "epoch": 4.016,
2671
- "grad_norm": 0.8197008371353149,
2672
- "learning_rate": 1.9776000000000002e-05,
2673
- "loss": 0.2395,
2674
- "mean_token_accuracy": 0.928604032099247,
2675
- "num_tokens": 1253458.0,
2676
- "step": 2510
2677
- },
2678
- {
2679
- "entropy": 0.24905966678634286,
2680
- "epoch": 4.032,
2681
- "grad_norm": 0.8056384921073914,
2682
- "learning_rate": 1.9456e-05,
2683
- "loss": 0.2301,
2684
- "mean_token_accuracy": 0.9330911111086607,
2685
- "num_tokens": 1282365.0,
2686
- "step": 2520
2687
- },
2688
- {
2689
- "entropy": 0.26601817598566413,
2690
- "epoch": 4.048,
2691
- "grad_norm": 0.9766417145729065,
2692
- "learning_rate": 1.9136e-05,
2693
- "loss": 0.2237,
2694
- "mean_token_accuracy": 0.9384452097117901,
2695
- "num_tokens": 1305420.0,
2696
- "step": 2530
2697
- },
2698
- {
2699
- "entropy": 0.28673125999048354,
2700
- "epoch": 4.064,
2701
- "grad_norm": 1.2241604328155518,
2702
- "learning_rate": 1.8816e-05,
2703
- "loss": 0.2615,
2704
- "mean_token_accuracy": 0.9268214203417301,
2705
- "num_tokens": 1323367.0,
2706
- "step": 2540
2707
- },
2708
- {
2709
- "entropy": 0.3297149523161352,
2710
- "epoch": 4.08,
2711
- "grad_norm": 1.2444630861282349,
2712
- "learning_rate": 1.8496000000000004e-05,
2713
- "loss": 0.266,
2714
- "mean_token_accuracy": 0.9285014558583498,
2715
- "num_tokens": 1335370.0,
2716
- "step": 2550
2717
- },
2718
- {
2719
- "entropy": 0.25180468857288363,
2720
- "epoch": 4.096,
2721
- "grad_norm": 0.6901214718818665,
2722
- "learning_rate": 1.8176e-05,
2723
- "loss": 0.2242,
2724
- "mean_token_accuracy": 0.9317782554775477,
2725
- "num_tokens": 1374567.0,
2726
- "step": 2560
2727
- },
2728
- {
2729
- "entropy": 0.25819407450035214,
2730
- "epoch": 4.112,
2731
- "grad_norm": 0.8702373504638672,
2732
- "learning_rate": 1.7856e-05,
2733
- "loss": 0.2344,
2734
- "mean_token_accuracy": 0.9326971143484115,
2735
- "num_tokens": 1402608.0,
2736
- "step": 2570
2737
- },
2738
- {
2739
- "entropy": 0.26549670435488226,
2740
- "epoch": 4.128,
2741
- "grad_norm": 0.7631207704544067,
2742
- "learning_rate": 1.7536e-05,
2743
- "loss": 0.2297,
2744
- "mean_token_accuracy": 0.9365796335041523,
2745
- "num_tokens": 1425524.0,
2746
- "step": 2580
2747
- },
2748
- {
2749
- "entropy": 0.26975566176697613,
2750
- "epoch": 4.144,
2751
- "grad_norm": 1.1718668937683105,
2752
- "learning_rate": 1.7216000000000003e-05,
2753
- "loss": 0.221,
2754
- "mean_token_accuracy": 0.9397962510585784,
2755
- "num_tokens": 1444092.0,
2756
- "step": 2590
2757
- },
2758
- {
2759
- "entropy": 0.3168819394893944,
2760
- "epoch": 4.16,
2761
- "grad_norm": 1.0534077882766724,
2762
- "learning_rate": 1.6896000000000002e-05,
2763
- "loss": 0.2544,
2764
- "mean_token_accuracy": 0.9319371480494738,
2765
- "num_tokens": 1456844.0,
2766
- "step": 2600
2767
- },
2768
- {
2769
- "entropy": 0.25265237540006635,
2770
- "epoch": 4.176,
2771
- "grad_norm": 0.7592364549636841,
2772
- "learning_rate": 1.6576e-05,
2773
- "loss": 0.2395,
2774
- "mean_token_accuracy": 0.9289916418492794,
2775
- "num_tokens": 1496545.0,
2776
- "step": 2610
2777
- },
2778
- {
2779
- "entropy": 0.2543726827017963,
2780
- "epoch": 4.192,
2781
- "grad_norm": 0.9639586210250854,
2782
- "learning_rate": 1.6256e-05,
2783
- "loss": 0.2351,
2784
- "mean_token_accuracy": 0.9337568439543247,
2785
- "num_tokens": 1525103.0,
2786
- "step": 2620
2787
- },
2788
- {
2789
- "entropy": 0.26547051025554536,
2790
- "epoch": 4.208,
2791
- "grad_norm": 0.9620559215545654,
2792
- "learning_rate": 1.5936e-05,
2793
- "loss": 0.2382,
2794
- "mean_token_accuracy": 0.9348125293850899,
2795
- "num_tokens": 1548306.0,
2796
- "step": 2630
2797
- },
2798
- {
2799
- "entropy": 0.27369030360132457,
2800
- "epoch": 4.224,
2801
- "grad_norm": 0.8373218774795532,
2802
- "learning_rate": 1.5616e-05,
2803
- "loss": 0.2254,
2804
- "mean_token_accuracy": 0.9375662509351969,
2805
- "num_tokens": 1566990.0,
2806
- "step": 2640
2807
- },
2808
- {
2809
- "entropy": 0.3024815677665174,
2810
- "epoch": 4.24,
2811
- "grad_norm": 1.3148176670074463,
2812
- "learning_rate": 1.5296e-05,
2813
- "loss": 0.2391,
2814
- "mean_token_accuracy": 0.9351990919560194,
2815
- "num_tokens": 1580065.0,
2816
- "step": 2650
2817
- },
2818
- {
2819
- "entropy": 0.2600595161318779,
2820
- "epoch": 4.256,
2821
- "grad_norm": 0.6774656176567078,
2822
- "learning_rate": 1.4976000000000002e-05,
2823
- "loss": 0.2377,
2824
- "mean_token_accuracy": 0.9274554952979088,
2825
- "num_tokens": 1619083.0,
2826
- "step": 2660
2827
- },
2828
- {
2829
- "entropy": 0.26013899641111493,
2830
- "epoch": 4.272,
2831
- "grad_norm": 0.9727310538291931,
2832
- "learning_rate": 1.4656e-05,
2833
- "loss": 0.2294,
2834
- "mean_token_accuracy": 0.934112536534667,
2835
- "num_tokens": 1646970.0,
2836
- "step": 2670
2837
- },
2838
- {
2839
- "entropy": 0.25867203902453184,
2840
- "epoch": 4.288,
2841
- "grad_norm": 0.9198706150054932,
2842
- "learning_rate": 1.4336e-05,
2843
- "loss": 0.2184,
2844
- "mean_token_accuracy": 0.9373745564371347,
2845
- "num_tokens": 1669364.0,
2846
- "step": 2680
2847
- },
2848
- {
2849
- "entropy": 0.26432402124628424,
2850
- "epoch": 4.304,
2851
- "grad_norm": 0.9908862709999084,
2852
- "learning_rate": 1.4016000000000001e-05,
2853
- "loss": 0.2195,
2854
- "mean_token_accuracy": 0.9392576098442078,
2855
- "num_tokens": 1687812.0,
2856
- "step": 2690
2857
- },
2858
- {
2859
- "entropy": 0.30741472546942533,
2860
- "epoch": 4.32,
2861
- "grad_norm": 1.0388495922088623,
2862
- "learning_rate": 1.3696e-05,
2863
- "loss": 0.2503,
2864
- "mean_token_accuracy": 0.9325483400374651,
2865
- "num_tokens": 1700598.0,
2866
- "step": 2700
2867
- },
2868
- {
2869
- "epoch": 4.32,
2870
- "eval_accuracy": 0.02638358121882313,
2871
- "eval_entropy": 0.3719751555919647,
2872
- "eval_loss": 0.5846644043922424,
2873
- "eval_mean_token_accuracy": 0.8568292667865753,
2874
- "eval_num_tokens": 1700598.0,
2875
- "eval_runtime": 869.8497,
2876
- "eval_samples_per_second": 2.299,
2877
- "eval_steps_per_second": 0.575,
2878
- "step": 2700
2879
- },
2880
- {
2881
- "entropy": 0.24316317560151218,
2882
- "epoch": 4.336,
2883
- "grad_norm": 0.757876455783844,
2884
- "learning_rate": 1.3376e-05,
2885
- "loss": 0.2118,
2886
- "mean_token_accuracy": 0.9327260747551918,
2887
- "num_tokens": 39749.0,
2888
- "step": 2710
2889
- },
2890
- {
2891
- "entropy": 0.2465177897363901,
2892
- "epoch": 4.352,
2893
- "grad_norm": 0.73354172706604,
2894
- "learning_rate": 1.3056000000000002e-05,
2895
- "loss": 0.21,
2896
- "mean_token_accuracy": 0.9354286625981331,
2897
- "num_tokens": 68464.0,
2898
- "step": 2720
2899
- },
2900
- {
2901
- "entropy": 0.24799817334860563,
2902
- "epoch": 4.368,
2903
- "grad_norm": 0.9990701675415039,
2904
- "learning_rate": 1.2736000000000001e-05,
2905
- "loss": 0.2039,
2906
- "mean_token_accuracy": 0.940489636361599,
2907
- "num_tokens": 91656.0,
2908
- "step": 2730
2909
- },
2910
- {
2911
- "entropy": 0.26067384518682957,
2912
- "epoch": 4.384,
2913
- "grad_norm": 0.9379425644874573,
2914
- "learning_rate": 1.2416000000000001e-05,
2915
- "loss": 0.2182,
2916
- "mean_token_accuracy": 0.9411718167364598,
2917
- "num_tokens": 110505.0,
2918
- "step": 2740
2919
- },
2920
- {
2921
- "entropy": 0.3018894817214459,
2922
- "epoch": 4.4,
2923
- "grad_norm": 1.0026336908340454,
2924
- "learning_rate": 1.2096e-05,
2925
- "loss": 0.2267,
2926
- "mean_token_accuracy": 0.9386275008320808,
2927
- "num_tokens": 123324.0,
2928
- "step": 2750
2929
- },
2930
- {
2931
- "entropy": 0.21805389355868102,
2932
- "epoch": 4.416,
2933
- "grad_norm": 0.6372848153114319,
2934
- "learning_rate": 1.1776e-05,
2935
- "loss": 0.1861,
2936
- "mean_token_accuracy": 0.9427805945277214,
2937
- "num_tokens": 163777.0,
2938
- "step": 2760
2939
- },
2940
- {
2941
- "entropy": 0.21196621540002525,
2942
- "epoch": 4.432,
2943
- "grad_norm": 0.5572025179862976,
2944
- "learning_rate": 1.1456e-05,
2945
- "loss": 0.1581,
2946
- "mean_token_accuracy": 0.9551307797431946,
2947
- "num_tokens": 192177.0,
2948
- "step": 2770
2949
- },
2950
- {
2951
- "entropy": 0.20902398317120968,
2952
- "epoch": 4.448,
2953
- "grad_norm": 0.7340620756149292,
2954
- "learning_rate": 1.1136e-05,
2955
- "loss": 0.1582,
2956
- "mean_token_accuracy": 0.9570909071713686,
2957
- "num_tokens": 215456.0,
2958
- "step": 2780
2959
- },
2960
- {
2961
- "entropy": 0.2131565590389073,
2962
- "epoch": 4.464,
2963
- "grad_norm": 1.0014139413833618,
2964
- "learning_rate": 1.0816000000000001e-05,
2965
- "loss": 0.1583,
2966
- "mean_token_accuracy": 0.9551056247204542,
2967
- "num_tokens": 234122.0,
2968
- "step": 2790
2969
- },
2970
- {
2971
- "entropy": 0.25133530045859515,
2972
- "epoch": 4.48,
2973
- "grad_norm": 0.8922705054283142,
2974
- "learning_rate": 1.0496e-05,
2975
- "loss": 0.1818,
2976
- "mean_token_accuracy": 0.9524805508553982,
2977
- "num_tokens": 246749.0,
2978
- "step": 2800
2979
- },
2980
- {
2981
- "entropy": 0.19833970288746058,
2982
- "epoch": 4.496,
2983
- "grad_norm": 0.8713212609291077,
2984
- "learning_rate": 1.0176e-05,
2985
- "loss": 0.1667,
2986
- "mean_token_accuracy": 0.9479088947176934,
2987
- "num_tokens": 287475.0,
2988
- "step": 2810
2989
- },
2990
- {
2991
- "entropy": 0.18820378091186285,
2992
- "epoch": 4.5120000000000005,
2993
- "grad_norm": 0.782958984375,
2994
- "learning_rate": 9.856e-06,
2995
- "loss": 0.1507,
2996
- "mean_token_accuracy": 0.9564289052039385,
2997
- "num_tokens": 316228.0,
2998
- "step": 2820
2999
- },
3000
- {
3001
- "entropy": 0.1986434136983007,
3002
- "epoch": 4.5280000000000005,
3003
- "grad_norm": 0.9405664801597595,
3004
- "learning_rate": 9.536e-06,
3005
- "loss": 0.1652,
3006
- "mean_token_accuracy": 0.9527083396911621,
3007
- "num_tokens": 339312.0,
3008
- "step": 2830
3009
- },
3010
- {
3011
- "entropy": 0.20359546076506377,
3012
- "epoch": 4.5440000000000005,
3013
- "grad_norm": 1.8294662237167358,
3014
- "learning_rate": 9.216000000000001e-06,
3015
- "loss": 0.1605,
3016
- "mean_token_accuracy": 0.958249793574214,
3017
- "num_tokens": 357957.0,
3018
- "step": 2840
3019
- },
3020
- {
3021
- "entropy": 0.2478945675306022,
3022
- "epoch": 4.5600000000000005,
3023
- "grad_norm": 1.8756585121154785,
3024
- "learning_rate": 8.896000000000001e-06,
3025
- "loss": 0.1791,
3026
- "mean_token_accuracy": 0.9529225923120975,
3027
- "num_tokens": 371074.0,
3028
- "step": 2850
3029
- },
3030
- {
3031
- "entropy": 0.19137877360917627,
3032
- "epoch": 4.576,
3033
- "grad_norm": 0.7811349034309387,
3034
- "learning_rate": 8.576e-06,
3035
- "loss": 0.1603,
3036
- "mean_token_accuracy": 0.9505746208131314,
3037
- "num_tokens": 412461.0,
3038
- "step": 2860
3039
- },
3040
- {
3041
- "entropy": 0.19941019406542182,
3042
- "epoch": 4.592,
3043
- "grad_norm": 0.8849194645881653,
3044
- "learning_rate": 8.256e-06,
3045
- "loss": 0.1559,
3046
- "mean_token_accuracy": 0.9538026105612516,
3047
- "num_tokens": 441113.0,
3048
- "step": 2870
3049
- },
3050
- {
3051
- "entropy": 0.20037598102353513,
3052
- "epoch": 4.608,
3053
- "grad_norm": 1.007367730140686,
3054
- "learning_rate": 7.936e-06,
3055
- "loss": 0.1577,
3056
- "mean_token_accuracy": 0.9563030891120434,
3057
- "num_tokens": 464301.0,
3058
- "step": 2880
3059
- },
3060
- {
3061
- "entropy": 0.21458538975566627,
3062
- "epoch": 4.624,
3063
- "grad_norm": 1.0605765581130981,
3064
- "learning_rate": 7.616000000000001e-06,
3065
- "loss": 0.1636,
3066
- "mean_token_accuracy": 0.9558106277137994,
3067
- "num_tokens": 483422.0,
3068
- "step": 2890
3069
- },
3070
- {
3071
- "entropy": 0.2460995698813349,
3072
- "epoch": 4.64,
3073
- "grad_norm": 1.1102747917175293,
3074
- "learning_rate": 7.296e-06,
3075
- "loss": 0.178,
3076
- "mean_token_accuracy": 0.9527418158948422,
3077
- "num_tokens": 496524.0,
3078
- "step": 2900
3079
- },
3080
- {
3081
- "entropy": 0.1917059404309839,
3082
- "epoch": 4.656,
3083
- "grad_norm": 0.7104383111000061,
3084
- "learning_rate": 6.976000000000001e-06,
3085
- "loss": 0.1692,
3086
- "mean_token_accuracy": 0.9471572674810886,
3087
- "num_tokens": 537262.0,
3088
- "step": 2910
3089
- },
3090
- {
3091
- "entropy": 0.19903061082586646,
3092
- "epoch": 4.672,
3093
- "grad_norm": 0.8522951006889343,
3094
- "learning_rate": 6.688e-06,
3095
- "loss": 0.1668,
3096
- "mean_token_accuracy": 0.9495650254189968,
3097
- "num_tokens": 566118.0,
3098
- "step": 2920
3099
- },
3100
- {
3101
- "entropy": 0.20533090075477958,
3102
- "epoch": 4.688,
3103
- "grad_norm": 0.7692112326622009,
3104
- "learning_rate": 6.368000000000001e-06,
3105
- "loss": 0.1597,
3106
- "mean_token_accuracy": 0.9538190443068743,
3107
- "num_tokens": 589316.0,
3108
- "step": 2930
3109
- },
3110
- {
3111
- "entropy": 0.20868746675550937,
3112
- "epoch": 4.704,
3113
- "grad_norm": 0.8645059466362,
3114
- "learning_rate": 6.048e-06,
3115
- "loss": 0.1496,
3116
- "mean_token_accuracy": 0.9595503833144903,
3117
- "num_tokens": 607904.0,
3118
- "step": 2940
3119
- },
3120
- {
3121
- "entropy": 0.23888139198534192,
3122
- "epoch": 4.72,
3123
- "grad_norm": 1.08635413646698,
3124
- "learning_rate": 5.728e-06,
3125
- "loss": 0.1706,
3126
- "mean_token_accuracy": 0.9570875108242035,
3127
- "num_tokens": 620936.0,
3128
- "step": 2950
3129
- },
3130
- {
3131
- "entropy": 0.18963255980052054,
3132
- "epoch": 4.736,
3133
- "grad_norm": 0.7276900410652161,
3134
- "learning_rate": 5.4080000000000006e-06,
3135
- "loss": 0.1633,
3136
- "mean_token_accuracy": 0.9485368836671114,
3137
- "num_tokens": 661079.0,
3138
- "step": 2960
3139
- },
3140
- {
3141
- "entropy": 0.19404892213642597,
3142
- "epoch": 4.752,
3143
- "grad_norm": 0.8436645269393921,
3144
- "learning_rate": 5.088e-06,
3145
- "loss": 0.1523,
3146
- "mean_token_accuracy": 0.9547487128525972,
3147
- "num_tokens": 689649.0,
3148
- "step": 2970
3149
- },
3150
- {
3151
- "entropy": 0.20046764588914812,
3152
- "epoch": 4.768,
3153
- "grad_norm": 1.0704182386398315,
3154
- "learning_rate": 4.768e-06,
3155
- "loss": 0.1574,
3156
- "mean_token_accuracy": 0.9545170154422522,
3157
- "num_tokens": 712841.0,
3158
- "step": 2980
3159
- },
3160
- {
3161
- "entropy": 0.2065018493682146,
3162
- "epoch": 4.784,
3163
- "grad_norm": 0.9045215249061584,
3164
- "learning_rate": 4.4480000000000004e-06,
3165
- "loss": 0.155,
3166
- "mean_token_accuracy": 0.9589469760656357,
3167
- "num_tokens": 731548.0,
3168
- "step": 2990
3169
- },
3170
- {
3171
- "entropy": 0.2458665339741856,
3172
- "epoch": 4.8,
3173
- "grad_norm": 1.7165741920471191,
3174
- "learning_rate": 4.128e-06,
3175
- "loss": 0.173,
3176
- "mean_token_accuracy": 0.9542810652405024,
3177
- "num_tokens": 744375.0,
3178
- "step": 3000
3179
- },
3180
- {
3181
- "epoch": 4.8,
3182
- "eval_accuracy": 0.026236095361078154,
3183
- "eval_entropy": 0.3239293715655804,
3184
- "eval_loss": 0.6594926714897156,
3185
- "eval_mean_token_accuracy": 0.8544400478601456,
3186
- "eval_num_tokens": 744375.0,
3187
- "eval_runtime": 966.0583,
3188
- "eval_samples_per_second": 2.07,
3189
- "eval_steps_per_second": 0.518,
3190
- "step": 3000
3191
- },
3192
- {
3193
- "entropy": 0.19047842593863606,
3194
- "epoch": 4.816,
3195
- "grad_norm": 0.8224709033966064,
3196
- "learning_rate": 3.8080000000000006e-06,
3197
- "loss": 0.1691,
3198
- "mean_token_accuracy": 0.9483149264007806,
3199
- "num_tokens": 785457.0,
3200
- "step": 3010
3201
- },
3202
- {
3203
- "entropy": 0.1947814745362848,
3204
- "epoch": 4.832,
3205
- "grad_norm": 0.8581233024597168,
3206
- "learning_rate": 3.4880000000000003e-06,
3207
- "loss": 0.1535,
3208
- "mean_token_accuracy": 0.9543764512985945,
3209
- "num_tokens": 814006.0,
3210
- "step": 3020
3211
- },
3212
- {
3213
- "entropy": 0.20228669252246617,
3214
- "epoch": 4.848,
3215
- "grad_norm": 0.7815537452697754,
3216
- "learning_rate": 3.168e-06,
3217
- "loss": 0.1539,
3218
- "mean_token_accuracy": 0.9561178237199783,
3219
- "num_tokens": 836843.0,
3220
- "step": 3030
3221
- },
3222
- {
3223
- "entropy": 0.2111768877133727,
3224
- "epoch": 4.864,
3225
- "grad_norm": 2.0849273204803467,
3226
- "learning_rate": 2.848e-06,
3227
- "loss": 0.1553,
3228
- "mean_token_accuracy": 0.9579557087272406,
3229
- "num_tokens": 855036.0,
3230
- "step": 3040
3231
- },
3232
- {
3233
- "entropy": 0.2543737689033151,
3234
- "epoch": 4.88,
3235
- "grad_norm": 0.9005395770072937,
3236
- "learning_rate": 2.528e-06,
3237
- "loss": 0.18,
3238
- "mean_token_accuracy": 0.951928498968482,
3239
- "num_tokens": 867473.0,
3240
- "step": 3050
3241
- },
3242
- {
3243
- "entropy": 0.19695296385325492,
3244
- "epoch": 4.896,
3245
- "grad_norm": 0.8913720846176147,
3246
- "learning_rate": 2.208e-06,
3247
- "loss": 0.1731,
3248
- "mean_token_accuracy": 0.9454629000276327,
3249
- "num_tokens": 905517.0,
3250
- "step": 3060
3251
- },
3252
- {
3253
- "entropy": 0.2020930268801749,
3254
- "epoch": 4.912,
3255
- "grad_norm": 1.0501484870910645,
3256
- "learning_rate": 1.8880000000000002e-06,
3257
- "loss": 0.1583,
3258
- "mean_token_accuracy": 0.954399960488081,
3259
- "num_tokens": 933251.0,
3260
- "step": 3070
3261
- },
3262
- {
3263
- "entropy": 0.20252155787311493,
3264
- "epoch": 4.928,
3265
- "grad_norm": 1.03731369972229,
3266
- "learning_rate": 1.568e-06,
3267
- "loss": 0.1531,
3268
- "mean_token_accuracy": 0.9579384963959455,
3269
- "num_tokens": 956069.0,
3270
- "step": 3080
3271
- },
3272
- {
3273
- "entropy": 0.2126692888326943,
3274
- "epoch": 4.944,
3275
- "grad_norm": 1.107572317123413,
3276
- "learning_rate": 1.248e-06,
3277
- "loss": 0.1568,
3278
- "mean_token_accuracy": 0.9569063678383827,
3279
- "num_tokens": 974517.0,
3280
- "step": 3090
3281
- },
3282
- {
3283
- "entropy": 0.24990466320887209,
3284
- "epoch": 4.96,
3285
- "grad_norm": 1.2767953872680664,
3286
- "learning_rate": 9.28e-07,
3287
- "loss": 0.1851,
3288
- "mean_token_accuracy": 0.9518057998269797,
3289
- "num_tokens": 987191.0,
3290
- "step": 3100
3291
- },
3292
- {
3293
- "entropy": 0.19635155922733247,
3294
- "epoch": 4.976,
3295
- "grad_norm": 0.838716447353363,
3296
- "learning_rate": 6.08e-07,
3297
- "loss": 0.1689,
3298
- "mean_token_accuracy": 0.9492763552814723,
3299
- "num_tokens": 1021442.0,
3300
- "step": 3110
3301
- },
3302
- {
3303
- "entropy": 0.21572725460864603,
3304
- "epoch": 4.992,
3305
- "grad_norm": 0.9043759107589722,
3306
- "learning_rate": 2.8800000000000004e-07,
3307
- "loss": 0.161,
3308
- "mean_token_accuracy": 0.9549260966479778,
3309
- "num_tokens": 1041350.0,
3310
- "step": 3120
3311
  }
3312
  ],
3313
  "logging_steps": 10,
3314
  "max_steps": 3125,
3315
  "num_input_tokens_seen": 0,
3316
  "num_train_epochs": 5,
3317
- "save_steps": 300,
3318
  "stateful_callbacks": {
3319
  "TrainerControl": {
3320
  "args": {
@@ -3322,12 +827,12 @@
3322
  "should_evaluate": false,
3323
  "should_log": false,
3324
  "should_save": true,
3325
- "should_training_stop": true
3326
  },
3327
  "attributes": {}
3328
  }
3329
  },
3330
- "total_flos": 5.37035906398464e+17,
3331
  "train_batch_size": 1,
3332
  "trial_name": null,
3333
  "trial_params": null
 
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
+ "epoch": 1.2,
6
+ "eval_steps": 150,
7
+ "global_step": 750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
813
  "eval_samples_per_second": 2.106,
814
  "eval_steps_per_second": 0.526,
815
  "step": 750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
  }
817
  ],
818
  "logging_steps": 10,
819
  "max_steps": 3125,
820
  "num_input_tokens_seen": 0,
821
  "num_train_epochs": 5,
822
+ "save_steps": 150,
823
  "stateful_callbacks": {
824
  "TrainerControl": {
825
  "args": {
 
827
  "should_evaluate": false,
828
  "should_log": false,
829
  "should_save": true,
830
+ "should_training_stop": false
831
  },
832
  "attributes": {}
833
  }
834
  },
835
+ "total_flos": 1.3058997783257088e+17,
836
  "train_batch_size": 1,
837
  "trial_name": null,
838
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab977af6525318ffc5b089ead4268f65e71f68e9d355f66185c43f4d771a6da2
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d19add453be896fb8010267a01d849597b52aecb53969dce6ab3000e56f1b7d0
3
  size 6353
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab977af6525318ffc5b089ead4268f65e71f68e9d355f66185c43f4d771a6da2
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d19add453be896fb8010267a01d849597b52aecb53969dce6ab3000e56f1b7d0
3
  size 6353