NBAmine commited on
Commit
5f9b702
·
verified ·
1 Parent(s): 9662908

Training in progress, epoch 3, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "o_proj",
33
- "k_proj",
34
- "up_proj",
35
  "v_proj",
36
  "q_proj",
 
 
37
  "down_proj",
38
- "gate_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
 
32
  "v_proj",
33
  "q_proj",
34
+ "k_proj",
35
+ "o_proj",
36
  "down_proj",
37
+ "gate_proj",
38
+ "up_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad737a62d5e6dd3601ece9ec89b866a23dce6e9660089db12fbc69ce938d925e
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f46c590a365bafa0b811513437da5e7d8483ece074035b6cb7866f4b8fa0ddbf
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0dfe45f7e8553ab326b74acce24981ad2310a19952e96422e10cdcd05d9f3261
3
- size 116484839
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbb191c56fba3604be8448d5e0ca5afff0e43237919a55146c2466867049b7a1
3
+ size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4ff3bd83efcc74d45f6dc982dfad42de943c268219c0ad0ee388295c41e8e02
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a47775446b1df475bf207a2642f3554e36380c6e365e872a57468d80ab3dc781
3
  size 14709
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4cb6f523fe7cbe7ec261f5a7daf8f68472cbad6a063d529646d1f827a9ef9fd3
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fca1b27ac585e87f33a001c7dfb0adbdfb56b4d914009fde4324a9b10df26171
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53f409af08acb24ba2f85422d6d830e93fdc97a01268b4582a53eec3cbfeb20a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57dbcaa4c36dfe8b1884cd38afdda1f50d97d5b0660c412d604e987f28a13d71
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 438,
3
  "best_metric": 1.2615772485733032,
4
  "best_model_checkpoint": "./adapter-phase2/checkpoint-438",
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 876,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -902,6 +902,458 @@
902
  "eval_samples_per_second": 3.451,
903
  "eval_steps_per_second": 0.864,
904
  "step": 876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
905
  }
906
  ],
907
  "logging_steps": 10,
@@ -921,7 +1373,7 @@
921
  "attributes": {}
922
  }
923
  },
924
- "total_flos": 5.880058660122624e+16,
925
  "train_batch_size": 1,
926
  "trial_name": null,
927
  "trial_params": null
 
2
  "best_global_step": 438,
3
  "best_metric": 1.2615772485733032,
4
  "best_model_checkpoint": "./adapter-phase2/checkpoint-438",
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 1314,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
902
  "eval_samples_per_second": 3.451,
903
  "eval_steps_per_second": 0.864,
904
  "step": 876
905
+ },
906
+ {
907
+ "entropy": 0.5898892944678664,
908
+ "epoch": 2.0091428571428573,
909
+ "grad_norm": 1.3116419315338135,
910
+ "learning_rate": 5.990867579908676e-06,
911
+ "loss": 0.5677,
912
+ "mean_token_accuracy": 0.8516398537904024,
913
+ "num_tokens": 7804.0,
914
+ "step": 880
915
+ },
916
+ {
917
+ "entropy": 0.6767458073794842,
918
+ "epoch": 2.032,
919
+ "grad_norm": 1.4093241691589355,
920
+ "learning_rate": 5.945205479452055e-06,
921
+ "loss": 0.6416,
922
+ "mean_token_accuracy": 0.8335339192301034,
923
+ "num_tokens": 21400.0,
924
+ "step": 890
925
+ },
926
+ {
927
+ "entropy": 0.7389240754768253,
928
+ "epoch": 2.0548571428571427,
929
+ "grad_norm": 1.8839222192764282,
930
+ "learning_rate": 5.8995433789954336e-06,
931
+ "loss": 0.6714,
932
+ "mean_token_accuracy": 0.8232478138059378,
933
+ "num_tokens": 31986.0,
934
+ "step": 900
935
+ },
936
+ {
937
+ "entropy": 0.8426970480009913,
938
+ "epoch": 2.077714285714286,
939
+ "grad_norm": 2.546990394592285,
940
+ "learning_rate": 5.853881278538813e-06,
941
+ "loss": 0.773,
942
+ "mean_token_accuracy": 0.7947055101394653,
943
+ "num_tokens": 39598.0,
944
+ "step": 910
945
+ },
946
+ {
947
+ "entropy": 0.9150473427027463,
948
+ "epoch": 2.1005714285714285,
949
+ "grad_norm": 2.7457187175750732,
950
+ "learning_rate": 5.8082191780821915e-06,
951
+ "loss": 0.8326,
952
+ "mean_token_accuracy": 0.7829385627061128,
953
+ "num_tokens": 45726.0,
954
+ "step": 920
955
+ },
956
+ {
957
+ "entropy": 0.7779074914753437,
958
+ "epoch": 2.123428571428571,
959
+ "grad_norm": 1.6818033456802368,
960
+ "learning_rate": 5.762557077625572e-06,
961
+ "loss": 0.6953,
962
+ "mean_token_accuracy": 0.8222391355782748,
963
+ "num_tokens": 55996.0,
964
+ "step": 930
965
+ },
966
+ {
967
+ "entropy": 0.6338619258254766,
968
+ "epoch": 2.1462857142857144,
969
+ "grad_norm": 1.8489398956298828,
970
+ "learning_rate": 5.716894977168949e-06,
971
+ "loss": 0.5875,
972
+ "mean_token_accuracy": 0.8410690952092409,
973
+ "num_tokens": 69720.0,
974
+ "step": 940
975
+ },
976
+ {
977
+ "entropy": 0.7125289073213935,
978
+ "epoch": 2.169142857142857,
979
+ "grad_norm": 1.8807828426361084,
980
+ "learning_rate": 5.6712328767123296e-06,
981
+ "loss": 0.6763,
982
+ "mean_token_accuracy": 0.8223242565989495,
983
+ "num_tokens": 80327.0,
984
+ "step": 950
985
+ },
986
+ {
987
+ "entropy": 0.8898364685475826,
988
+ "epoch": 2.192,
989
+ "grad_norm": 2.358139753341675,
990
+ "learning_rate": 5.625570776255708e-06,
991
+ "loss": 0.8491,
992
+ "mean_token_accuracy": 0.7844949930906295,
993
+ "num_tokens": 88242.0,
994
+ "step": 960
995
+ },
996
+ {
997
+ "entropy": 0.8897234506905078,
998
+ "epoch": 2.214857142857143,
999
+ "grad_norm": 2.5401251316070557,
1000
+ "learning_rate": 5.5799086757990874e-06,
1001
+ "loss": 0.786,
1002
+ "mean_token_accuracy": 0.795602411031723,
1003
+ "num_tokens": 94381.0,
1004
+ "step": 970
1005
+ },
1006
+ {
1007
+ "entropy": 0.7333379179239273,
1008
+ "epoch": 2.2377142857142855,
1009
+ "grad_norm": 1.7071613073349,
1010
+ "learning_rate": 5.534246575342466e-06,
1011
+ "loss": 0.665,
1012
+ "mean_token_accuracy": 0.8248766608536243,
1013
+ "num_tokens": 105014.0,
1014
+ "step": 980
1015
+ },
1016
+ {
1017
+ "entropy": 0.6419918712228536,
1018
+ "epoch": 2.2605714285714287,
1019
+ "grad_norm": 1.5552293062210083,
1020
+ "learning_rate": 5.488584474885845e-06,
1021
+ "loss": 0.6026,
1022
+ "mean_token_accuracy": 0.8414491657167673,
1023
+ "num_tokens": 118550.0,
1024
+ "step": 990
1025
+ },
1026
+ {
1027
+ "entropy": 0.7310339482501149,
1028
+ "epoch": 2.2834285714285714,
1029
+ "grad_norm": 2.278031587600708,
1030
+ "learning_rate": 5.442922374429224e-06,
1031
+ "loss": 0.6805,
1032
+ "mean_token_accuracy": 0.8224124182015657,
1033
+ "num_tokens": 128693.0,
1034
+ "step": 1000
1035
+ },
1036
+ {
1037
+ "entropy": 0.880455293878913,
1038
+ "epoch": 2.306285714285714,
1039
+ "grad_norm": 2.459608554840088,
1040
+ "learning_rate": 5.397260273972603e-06,
1041
+ "loss": 0.8336,
1042
+ "mean_token_accuracy": 0.7825992915779352,
1043
+ "num_tokens": 136122.0,
1044
+ "step": 1010
1045
+ },
1046
+ {
1047
+ "entropy": 0.8820295415818691,
1048
+ "epoch": 2.329142857142857,
1049
+ "grad_norm": 2.6355550289154053,
1050
+ "learning_rate": 5.351598173515982e-06,
1051
+ "loss": 0.7887,
1052
+ "mean_token_accuracy": 0.7964828334748745,
1053
+ "num_tokens": 142194.0,
1054
+ "step": 1020
1055
+ },
1056
+ {
1057
+ "entropy": 0.7468231266364456,
1058
+ "epoch": 2.352,
1059
+ "grad_norm": 1.3787378072738647,
1060
+ "learning_rate": 5.305936073059361e-06,
1061
+ "loss": 0.6603,
1062
+ "mean_token_accuracy": 0.8238665115088224,
1063
+ "num_tokens": 152715.0,
1064
+ "step": 1030
1065
+ },
1066
+ {
1067
+ "entropy": 0.6317665258422493,
1068
+ "epoch": 2.374857142857143,
1069
+ "grad_norm": 1.7379688024520874,
1070
+ "learning_rate": 5.26027397260274e-06,
1071
+ "loss": 0.6007,
1072
+ "mean_token_accuracy": 0.8413413379341363,
1073
+ "num_tokens": 166545.0,
1074
+ "step": 1040
1075
+ },
1076
+ {
1077
+ "entropy": 0.7211008200421929,
1078
+ "epoch": 2.3977142857142857,
1079
+ "grad_norm": 2.5596601963043213,
1080
+ "learning_rate": 5.214611872146119e-06,
1081
+ "loss": 0.656,
1082
+ "mean_token_accuracy": 0.8258481413125992,
1083
+ "num_tokens": 177411.0,
1084
+ "step": 1050
1085
+ },
1086
+ {
1087
+ "entropy": 0.8763142567127943,
1088
+ "epoch": 2.420571428571429,
1089
+ "grad_norm": 2.739737033843994,
1090
+ "learning_rate": 5.1689497716894975e-06,
1091
+ "loss": 0.8354,
1092
+ "mean_token_accuracy": 0.7820440270006657,
1093
+ "num_tokens": 185340.0,
1094
+ "step": 1060
1095
+ },
1096
+ {
1097
+ "entropy": 0.8908181961625814,
1098
+ "epoch": 2.4434285714285715,
1099
+ "grad_norm": 3.223233222961426,
1100
+ "learning_rate": 5.123287671232877e-06,
1101
+ "loss": 0.7972,
1102
+ "mean_token_accuracy": 0.79459448158741,
1103
+ "num_tokens": 191540.0,
1104
+ "step": 1070
1105
+ },
1106
+ {
1107
+ "entropy": 0.7510069858282804,
1108
+ "epoch": 2.466285714285714,
1109
+ "grad_norm": 1.703507661819458,
1110
+ "learning_rate": 5.077625570776255e-06,
1111
+ "loss": 0.6854,
1112
+ "mean_token_accuracy": 0.8139259118586779,
1113
+ "num_tokens": 201907.0,
1114
+ "step": 1080
1115
+ },
1116
+ {
1117
+ "entropy": 0.6569937597960234,
1118
+ "epoch": 2.4891428571428573,
1119
+ "grad_norm": 1.7559291124343872,
1120
+ "learning_rate": 5.031963470319635e-06,
1121
+ "loss": 0.6111,
1122
+ "mean_token_accuracy": 0.84332409016788,
1123
+ "num_tokens": 215231.0,
1124
+ "step": 1090
1125
+ },
1126
+ {
1127
+ "entropy": 0.7254189381375908,
1128
+ "epoch": 2.512,
1129
+ "grad_norm": 1.9119453430175781,
1130
+ "learning_rate": 4.986301369863014e-06,
1131
+ "loss": 0.6814,
1132
+ "mean_token_accuracy": 0.8200360022485256,
1133
+ "num_tokens": 225539.0,
1134
+ "step": 1100
1135
+ },
1136
+ {
1137
+ "entropy": 0.8703744746744633,
1138
+ "epoch": 2.5348571428571427,
1139
+ "grad_norm": 2.812936305999756,
1140
+ "learning_rate": 4.9406392694063935e-06,
1141
+ "loss": 0.8006,
1142
+ "mean_token_accuracy": 0.7889351420104503,
1143
+ "num_tokens": 233013.0,
1144
+ "step": 1110
1145
+ },
1146
+ {
1147
+ "entropy": 0.8793116342276335,
1148
+ "epoch": 2.557714285714286,
1149
+ "grad_norm": 3.227419137954712,
1150
+ "learning_rate": 4.894977168949772e-06,
1151
+ "loss": 0.7693,
1152
+ "mean_token_accuracy": 0.7998475536704064,
1153
+ "num_tokens": 238974.0,
1154
+ "step": 1120
1155
+ },
1156
+ {
1157
+ "entropy": 0.7303921280428767,
1158
+ "epoch": 2.5805714285714285,
1159
+ "grad_norm": 1.6300448179244995,
1160
+ "learning_rate": 4.849315068493151e-06,
1161
+ "loss": 0.6538,
1162
+ "mean_token_accuracy": 0.8278157886117696,
1163
+ "num_tokens": 249037.0,
1164
+ "step": 1130
1165
+ },
1166
+ {
1167
+ "entropy": 0.6417382193729282,
1168
+ "epoch": 2.603428571428571,
1169
+ "grad_norm": 1.6912339925765991,
1170
+ "learning_rate": 4.80365296803653e-06,
1171
+ "loss": 0.6108,
1172
+ "mean_token_accuracy": 0.8381971474736929,
1173
+ "num_tokens": 262395.0,
1174
+ "step": 1140
1175
+ },
1176
+ {
1177
+ "entropy": 0.7339965717867016,
1178
+ "epoch": 2.6262857142857143,
1179
+ "grad_norm": 2.330716371536255,
1180
+ "learning_rate": 4.757990867579909e-06,
1181
+ "loss": 0.6772,
1182
+ "mean_token_accuracy": 0.8194692388176918,
1183
+ "num_tokens": 272694.0,
1184
+ "step": 1150
1185
+ },
1186
+ {
1187
+ "entropy": 0.8351591594517231,
1188
+ "epoch": 2.649142857142857,
1189
+ "grad_norm": 2.8293557167053223,
1190
+ "learning_rate": 4.712328767123288e-06,
1191
+ "loss": 0.7642,
1192
+ "mean_token_accuracy": 0.7966304961591959,
1193
+ "num_tokens": 280231.0,
1194
+ "step": 1160
1195
+ },
1196
+ {
1197
+ "entropy": 0.8861342877149582,
1198
+ "epoch": 2.672,
1199
+ "grad_norm": 2.9575674533843994,
1200
+ "learning_rate": 4.666666666666667e-06,
1201
+ "loss": 0.7999,
1202
+ "mean_token_accuracy": 0.7918535027652979,
1203
+ "num_tokens": 286161.0,
1204
+ "step": 1170
1205
+ },
1206
+ {
1207
+ "entropy": 0.7180219950154424,
1208
+ "epoch": 2.694857142857143,
1209
+ "grad_norm": 1.5886666774749756,
1210
+ "learning_rate": 4.6210045662100465e-06,
1211
+ "loss": 0.6434,
1212
+ "mean_token_accuracy": 0.8283716265112162,
1213
+ "num_tokens": 296388.0,
1214
+ "step": 1180
1215
+ },
1216
+ {
1217
+ "entropy": 0.6133397184312344,
1218
+ "epoch": 2.717714285714286,
1219
+ "grad_norm": 1.8250149488449097,
1220
+ "learning_rate": 4.575342465753425e-06,
1221
+ "loss": 0.6059,
1222
+ "mean_token_accuracy": 0.8467012654989958,
1223
+ "num_tokens": 310217.0,
1224
+ "step": 1190
1225
+ },
1226
+ {
1227
+ "entropy": 0.6812281895428896,
1228
+ "epoch": 2.7405714285714287,
1229
+ "grad_norm": 2.336768627166748,
1230
+ "learning_rate": 4.529680365296804e-06,
1231
+ "loss": 0.6216,
1232
+ "mean_token_accuracy": 0.8333451252430677,
1233
+ "num_tokens": 320782.0,
1234
+ "step": 1200
1235
+ },
1236
+ {
1237
+ "entropy": 0.8352824920788408,
1238
+ "epoch": 2.7634285714285713,
1239
+ "grad_norm": 2.4751791954040527,
1240
+ "learning_rate": 4.484018264840183e-06,
1241
+ "loss": 0.7816,
1242
+ "mean_token_accuracy": 0.7953334752470255,
1243
+ "num_tokens": 328520.0,
1244
+ "step": 1210
1245
+ },
1246
+ {
1247
+ "entropy": 0.8553074564784765,
1248
+ "epoch": 2.7862857142857145,
1249
+ "grad_norm": 3.6519722938537598,
1250
+ "learning_rate": 4.438356164383562e-06,
1251
+ "loss": 0.7807,
1252
+ "mean_token_accuracy": 0.7970743294805288,
1253
+ "num_tokens": 334489.0,
1254
+ "step": 1220
1255
+ },
1256
+ {
1257
+ "entropy": 0.7267135815694928,
1258
+ "epoch": 2.809142857142857,
1259
+ "grad_norm": 1.6625852584838867,
1260
+ "learning_rate": 4.392694063926941e-06,
1261
+ "loss": 0.6521,
1262
+ "mean_token_accuracy": 0.8255741696804761,
1263
+ "num_tokens": 344543.0,
1264
+ "step": 1230
1265
+ },
1266
+ {
1267
+ "entropy": 0.6338536148890853,
1268
+ "epoch": 2.832,
1269
+ "grad_norm": 1.9026601314544678,
1270
+ "learning_rate": 4.34703196347032e-06,
1271
+ "loss": 0.5962,
1272
+ "mean_token_accuracy": 0.8427884597331285,
1273
+ "num_tokens": 357986.0,
1274
+ "step": 1240
1275
+ },
1276
+ {
1277
+ "entropy": 0.7158672915771603,
1278
+ "epoch": 2.854857142857143,
1279
+ "grad_norm": 2.288316488265991,
1280
+ "learning_rate": 4.301369863013699e-06,
1281
+ "loss": 0.6478,
1282
+ "mean_token_accuracy": 0.8221574258059263,
1283
+ "num_tokens": 368102.0,
1284
+ "step": 1250
1285
+ },
1286
+ {
1287
+ "entropy": 0.8342319210991264,
1288
+ "epoch": 2.8777142857142857,
1289
+ "grad_norm": 2.675821542739868,
1290
+ "learning_rate": 4.255707762557078e-06,
1291
+ "loss": 0.7634,
1292
+ "mean_token_accuracy": 0.8008246626704931,
1293
+ "num_tokens": 375682.0,
1294
+ "step": 1260
1295
+ },
1296
+ {
1297
+ "entropy": 0.8324337769299746,
1298
+ "epoch": 2.9005714285714284,
1299
+ "grad_norm": 3.794491767883301,
1300
+ "learning_rate": 4.2100456621004574e-06,
1301
+ "loss": 0.7409,
1302
+ "mean_token_accuracy": 0.8102138575166464,
1303
+ "num_tokens": 381707.0,
1304
+ "step": 1270
1305
+ },
1306
+ {
1307
+ "entropy": 0.7096458308398723,
1308
+ "epoch": 2.9234285714285715,
1309
+ "grad_norm": 1.945020318031311,
1310
+ "learning_rate": 4.164383561643836e-06,
1311
+ "loss": 0.6394,
1312
+ "mean_token_accuracy": 0.8287061709910631,
1313
+ "num_tokens": 391884.0,
1314
+ "step": 1280
1315
+ },
1316
+ {
1317
+ "entropy": 0.6416196620091796,
1318
+ "epoch": 2.946285714285714,
1319
+ "grad_norm": 2.1223883628845215,
1320
+ "learning_rate": 4.118721461187215e-06,
1321
+ "loss": 0.613,
1322
+ "mean_token_accuracy": 0.837811603397131,
1323
+ "num_tokens": 404730.0,
1324
+ "step": 1290
1325
+ },
1326
+ {
1327
+ "entropy": 0.7876641971990466,
1328
+ "epoch": 2.9691428571428573,
1329
+ "grad_norm": 3.030888795852661,
1330
+ "learning_rate": 4.073059360730594e-06,
1331
+ "loss": 0.7228,
1332
+ "mean_token_accuracy": 0.8122910633683205,
1333
+ "num_tokens": 413461.0,
1334
+ "step": 1300
1335
+ },
1336
+ {
1337
+ "entropy": 0.8694131746888161,
1338
+ "epoch": 2.992,
1339
+ "grad_norm": 2.9641623497009277,
1340
+ "learning_rate": 4.027397260273973e-06,
1341
+ "loss": 0.793,
1342
+ "mean_token_accuracy": 0.7905130475759506,
1343
+ "num_tokens": 419636.0,
1344
+ "step": 1310
1345
+ },
1346
+ {
1347
+ "epoch": 3.0,
1348
+ "eval_accuracy": 0.009547123623011015,
1349
+ "eval_entropy": 0.8601508936826787,
1350
+ "eval_loss": 1.4408637285232544,
1351
+ "eval_mean_token_accuracy": 0.7276363938931792,
1352
+ "eval_num_tokens": 421194.0,
1353
+ "eval_runtime": 323.9597,
1354
+ "eval_samples_per_second": 3.192,
1355
+ "eval_steps_per_second": 0.799,
1356
+ "step": 1314
1357
  }
1358
  ],
1359
  "logging_steps": 10,
 
1373
  "attributes": {}
1374
  }
1375
  },
1376
+ "total_flos": 8.820087990183936e+16,
1377
  "train_batch_size": 1,
1378
  "trial_name": null,
1379
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6397570b74b109fa363e6deebec3b410825df0b8fddd810637091c898cd86887
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af1f15f9c776b8fd13191f98c1f119768de73cea6cded240bc06132c6f7b3d65
3
  size 6353