update BASIC config
Browse files
- configs/BASIC.yaml +97 -97
configs/BASIC.yaml
CHANGED
|
@@ -8,7 +8,7 @@ model:
|
|
| 8 |
weight_format: SAME
|
| 9 |
weight_sparseness: DENSE
|
| 10 |
model.decoder.final_layer_norm:
|
| 11 |
-
approximation_function:
|
| 12 |
bias_format: SAME
|
| 13 |
input_format: SAME
|
| 14 |
instance: LayerNorm
|
|
@@ -43,7 +43,7 @@ model:
|
|
| 43 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 44 |
weight_sparseness: DENSE
|
| 45 |
model.decoder.layers.0.final_layer_norm:
|
| 46 |
-
approximation_function:
|
| 47 |
bias_format: SAME
|
| 48 |
input_format: SAME
|
| 49 |
instance: LayerNorm
|
|
@@ -82,7 +82,7 @@ model:
|
|
| 82 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 83 |
weight_sparseness: DENSE
|
| 84 |
model.decoder.layers.0.self_attn.softmax:
|
| 85 |
-
approximation_function:
|
| 86 |
input_format: SAME
|
| 87 |
instance: Softmax
|
| 88 |
output_format: SAME
|
|
@@ -96,7 +96,7 @@ model:
|
|
| 96 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 97 |
weight_sparseness: DENSE
|
| 98 |
model.decoder.layers.0.self_attn_layer_norm:
|
| 99 |
-
approximation_function:
|
| 100 |
bias_format: SAME
|
| 101 |
input_format: SAME
|
| 102 |
instance: LayerNorm
|
|
@@ -131,7 +131,7 @@ model:
|
|
| 131 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 132 |
weight_sparseness: DENSE
|
| 133 |
model.decoder.layers.1.final_layer_norm:
|
| 134 |
-
approximation_function:
|
| 135 |
bias_format: SAME
|
| 136 |
input_format: SAME
|
| 137 |
instance: LayerNorm
|
|
@@ -170,7 +170,7 @@ model:
|
|
| 170 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 171 |
weight_sparseness: DENSE
|
| 172 |
model.decoder.layers.1.self_attn.softmax:
|
| 173 |
-
approximation_function:
|
| 174 |
input_format: SAME
|
| 175 |
instance: Softmax
|
| 176 |
output_format: SAME
|
|
@@ -184,7 +184,7 @@ model:
|
|
| 184 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 185 |
weight_sparseness: DENSE
|
| 186 |
model.decoder.layers.1.self_attn_layer_norm:
|
| 187 |
-
approximation_function:
|
| 188 |
bias_format: SAME
|
| 189 |
input_format: SAME
|
| 190 |
instance: LayerNorm
|
|
@@ -219,7 +219,7 @@ model:
|
|
| 219 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 220 |
weight_sparseness: DENSE
|
| 221 |
model.decoder.layers.10.final_layer_norm:
|
| 222 |
-
approximation_function:
|
| 223 |
bias_format: SAME
|
| 224 |
input_format: SAME
|
| 225 |
instance: LayerNorm
|
|
@@ -258,7 +258,7 @@ model:
|
|
| 258 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 259 |
weight_sparseness: DENSE
|
| 260 |
model.decoder.layers.10.self_attn.softmax:
|
| 261 |
-
approximation_function:
|
| 262 |
input_format: SAME
|
| 263 |
instance: Softmax
|
| 264 |
output_format: SAME
|
|
@@ -272,7 +272,7 @@ model:
|
|
| 272 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 273 |
weight_sparseness: DENSE
|
| 274 |
model.decoder.layers.10.self_attn_layer_norm:
|
| 275 |
-
approximation_function:
|
| 276 |
bias_format: SAME
|
| 277 |
input_format: SAME
|
| 278 |
instance: LayerNorm
|
|
@@ -307,7 +307,7 @@ model:
|
|
| 307 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 308 |
weight_sparseness: DENSE
|
| 309 |
model.decoder.layers.11.final_layer_norm:
|
| 310 |
-
approximation_function:
|
| 311 |
bias_format: SAME
|
| 312 |
input_format: SAME
|
| 313 |
instance: LayerNorm
|
|
@@ -346,7 +346,7 @@ model:
|
|
| 346 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 347 |
weight_sparseness: DENSE
|
| 348 |
model.decoder.layers.11.self_attn.softmax:
|
| 349 |
-
approximation_function:
|
| 350 |
input_format: SAME
|
| 351 |
instance: Softmax
|
| 352 |
output_format: SAME
|
|
@@ -360,7 +360,7 @@ model:
|
|
| 360 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 361 |
weight_sparseness: DENSE
|
| 362 |
model.decoder.layers.11.self_attn_layer_norm:
|
| 363 |
-
approximation_function:
|
| 364 |
bias_format: SAME
|
| 365 |
input_format: SAME
|
| 366 |
instance: LayerNorm
|
|
@@ -395,7 +395,7 @@ model:
|
|
| 395 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 396 |
weight_sparseness: DENSE
|
| 397 |
model.decoder.layers.12.final_layer_norm:
|
| 398 |
-
approximation_function:
|
| 399 |
bias_format: SAME
|
| 400 |
input_format: SAME
|
| 401 |
instance: LayerNorm
|
|
@@ -434,7 +434,7 @@ model:
|
|
| 434 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 435 |
weight_sparseness: DENSE
|
| 436 |
model.decoder.layers.12.self_attn.softmax:
|
| 437 |
-
approximation_function:
|
| 438 |
input_format: SAME
|
| 439 |
instance: Softmax
|
| 440 |
output_format: SAME
|
|
@@ -448,7 +448,7 @@ model:
|
|
| 448 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 449 |
weight_sparseness: DENSE
|
| 450 |
model.decoder.layers.12.self_attn_layer_norm:
|
| 451 |
-
approximation_function:
|
| 452 |
bias_format: SAME
|
| 453 |
input_format: SAME
|
| 454 |
instance: LayerNorm
|
|
@@ -483,7 +483,7 @@ model:
|
|
| 483 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 484 |
weight_sparseness: DENSE
|
| 485 |
model.decoder.layers.13.final_layer_norm:
|
| 486 |
-
approximation_function:
|
| 487 |
bias_format: SAME
|
| 488 |
input_format: SAME
|
| 489 |
instance: LayerNorm
|
|
@@ -522,7 +522,7 @@ model:
|
|
| 522 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 523 |
weight_sparseness: DENSE
|
| 524 |
model.decoder.layers.13.self_attn.softmax:
|
| 525 |
-
approximation_function:
|
| 526 |
input_format: SAME
|
| 527 |
instance: Softmax
|
| 528 |
output_format: SAME
|
|
@@ -536,7 +536,7 @@ model:
|
|
| 536 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 537 |
weight_sparseness: DENSE
|
| 538 |
model.decoder.layers.13.self_attn_layer_norm:
|
| 539 |
-
approximation_function:
|
| 540 |
bias_format: SAME
|
| 541 |
input_format: SAME
|
| 542 |
instance: LayerNorm
|
|
@@ -571,7 +571,7 @@ model:
|
|
| 571 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 572 |
weight_sparseness: DENSE
|
| 573 |
model.decoder.layers.14.final_layer_norm:
|
| 574 |
-
approximation_function:
|
| 575 |
bias_format: SAME
|
| 576 |
input_format: SAME
|
| 577 |
instance: LayerNorm
|
|
@@ -610,7 +610,7 @@ model:
|
|
| 610 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 611 |
weight_sparseness: DENSE
|
| 612 |
model.decoder.layers.14.self_attn.softmax:
|
| 613 |
-
approximation_function:
|
| 614 |
input_format: SAME
|
| 615 |
instance: Softmax
|
| 616 |
output_format: SAME
|
|
@@ -624,7 +624,7 @@ model:
|
|
| 624 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 625 |
weight_sparseness: DENSE
|
| 626 |
model.decoder.layers.14.self_attn_layer_norm:
|
| 627 |
-
approximation_function:
|
| 628 |
bias_format: SAME
|
| 629 |
input_format: SAME
|
| 630 |
instance: LayerNorm
|
|
@@ -659,7 +659,7 @@ model:
|
|
| 659 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 660 |
weight_sparseness: DENSE
|
| 661 |
model.decoder.layers.15.final_layer_norm:
|
| 662 |
-
approximation_function:
|
| 663 |
bias_format: SAME
|
| 664 |
input_format: SAME
|
| 665 |
instance: LayerNorm
|
|
@@ -698,7 +698,7 @@ model:
|
|
| 698 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 699 |
weight_sparseness: DENSE
|
| 700 |
model.decoder.layers.15.self_attn.softmax:
|
| 701 |
-
approximation_function:
|
| 702 |
input_format: SAME
|
| 703 |
instance: Softmax
|
| 704 |
output_format: SAME
|
|
@@ -712,7 +712,7 @@ model:
|
|
| 712 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 713 |
weight_sparseness: DENSE
|
| 714 |
model.decoder.layers.15.self_attn_layer_norm:
|
| 715 |
-
approximation_function:
|
| 716 |
bias_format: SAME
|
| 717 |
input_format: SAME
|
| 718 |
instance: LayerNorm
|
|
@@ -747,7 +747,7 @@ model:
|
|
| 747 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 748 |
weight_sparseness: DENSE
|
| 749 |
model.decoder.layers.16.final_layer_norm:
|
| 750 |
-
approximation_function:
|
| 751 |
bias_format: SAME
|
| 752 |
input_format: SAME
|
| 753 |
instance: LayerNorm
|
|
@@ -786,7 +786,7 @@ model:
|
|
| 786 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 787 |
weight_sparseness: DENSE
|
| 788 |
model.decoder.layers.16.self_attn.softmax:
|
| 789 |
-
approximation_function:
|
| 790 |
input_format: SAME
|
| 791 |
instance: Softmax
|
| 792 |
output_format: SAME
|
|
@@ -800,7 +800,7 @@ model:
|
|
| 800 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 801 |
weight_sparseness: DENSE
|
| 802 |
model.decoder.layers.16.self_attn_layer_norm:
|
| 803 |
-
approximation_function:
|
| 804 |
bias_format: SAME
|
| 805 |
input_format: SAME
|
| 806 |
instance: LayerNorm
|
|
@@ -835,7 +835,7 @@ model:
|
|
| 835 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 836 |
weight_sparseness: DENSE
|
| 837 |
model.decoder.layers.17.final_layer_norm:
|
| 838 |
-
approximation_function:
|
| 839 |
bias_format: SAME
|
| 840 |
input_format: SAME
|
| 841 |
instance: LayerNorm
|
|
@@ -874,7 +874,7 @@ model:
|
|
| 874 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 875 |
weight_sparseness: DENSE
|
| 876 |
model.decoder.layers.17.self_attn.softmax:
|
| 877 |
-
approximation_function:
|
| 878 |
input_format: SAME
|
| 879 |
instance: Softmax
|
| 880 |
output_format: SAME
|
|
@@ -888,7 +888,7 @@ model:
|
|
| 888 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 889 |
weight_sparseness: DENSE
|
| 890 |
model.decoder.layers.17.self_attn_layer_norm:
|
| 891 |
-
approximation_function:
|
| 892 |
bias_format: SAME
|
| 893 |
input_format: SAME
|
| 894 |
instance: LayerNorm
|
|
@@ -923,7 +923,7 @@ model:
|
|
| 923 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 924 |
weight_sparseness: DENSE
|
| 925 |
model.decoder.layers.18.final_layer_norm:
|
| 926 |
-
approximation_function:
|
| 927 |
bias_format: SAME
|
| 928 |
input_format: SAME
|
| 929 |
instance: LayerNorm
|
|
@@ -962,7 +962,7 @@ model:
|
|
| 962 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 963 |
weight_sparseness: DENSE
|
| 964 |
model.decoder.layers.18.self_attn.softmax:
|
| 965 |
-
approximation_function:
|
| 966 |
input_format: SAME
|
| 967 |
instance: Softmax
|
| 968 |
output_format: SAME
|
|
@@ -976,7 +976,7 @@ model:
|
|
| 976 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 977 |
weight_sparseness: DENSE
|
| 978 |
model.decoder.layers.18.self_attn_layer_norm:
|
| 979 |
-
approximation_function:
|
| 980 |
bias_format: SAME
|
| 981 |
input_format: SAME
|
| 982 |
instance: LayerNorm
|
|
@@ -1011,7 +1011,7 @@ model:
|
|
| 1011 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1012 |
weight_sparseness: DENSE
|
| 1013 |
model.decoder.layers.19.final_layer_norm:
|
| 1014 |
-
approximation_function:
|
| 1015 |
bias_format: SAME
|
| 1016 |
input_format: SAME
|
| 1017 |
instance: LayerNorm
|
|
@@ -1050,7 +1050,7 @@ model:
|
|
| 1050 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1051 |
weight_sparseness: DENSE
|
| 1052 |
model.decoder.layers.19.self_attn.softmax:
|
| 1053 |
-
approximation_function:
|
| 1054 |
input_format: SAME
|
| 1055 |
instance: Softmax
|
| 1056 |
output_format: SAME
|
|
@@ -1064,7 +1064,7 @@ model:
|
|
| 1064 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1065 |
weight_sparseness: DENSE
|
| 1066 |
model.decoder.layers.19.self_attn_layer_norm:
|
| 1067 |
-
approximation_function:
|
| 1068 |
bias_format: SAME
|
| 1069 |
input_format: SAME
|
| 1070 |
instance: LayerNorm
|
|
@@ -1099,7 +1099,7 @@ model:
|
|
| 1099 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1100 |
weight_sparseness: DENSE
|
| 1101 |
model.decoder.layers.2.final_layer_norm:
|
| 1102 |
-
approximation_function:
|
| 1103 |
bias_format: SAME
|
| 1104 |
input_format: SAME
|
| 1105 |
instance: LayerNorm
|
|
@@ -1138,7 +1138,7 @@ model:
|
|
| 1138 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1139 |
weight_sparseness: DENSE
|
| 1140 |
model.decoder.layers.2.self_attn.softmax:
|
| 1141 |
-
approximation_function:
|
| 1142 |
input_format: SAME
|
| 1143 |
instance: Softmax
|
| 1144 |
output_format: SAME
|
|
@@ -1152,7 +1152,7 @@ model:
|
|
| 1152 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1153 |
weight_sparseness: DENSE
|
| 1154 |
model.decoder.layers.2.self_attn_layer_norm:
|
| 1155 |
-
approximation_function:
|
| 1156 |
bias_format: SAME
|
| 1157 |
input_format: SAME
|
| 1158 |
instance: LayerNorm
|
|
@@ -1187,7 +1187,7 @@ model:
|
|
| 1187 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1188 |
weight_sparseness: DENSE
|
| 1189 |
model.decoder.layers.20.final_layer_norm:
|
| 1190 |
-
approximation_function:
|
| 1191 |
bias_format: SAME
|
| 1192 |
input_format: SAME
|
| 1193 |
instance: LayerNorm
|
|
@@ -1226,7 +1226,7 @@ model:
|
|
| 1226 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1227 |
weight_sparseness: DENSE
|
| 1228 |
model.decoder.layers.20.self_attn.softmax:
|
| 1229 |
-
approximation_function:
|
| 1230 |
input_format: SAME
|
| 1231 |
instance: Softmax
|
| 1232 |
output_format: SAME
|
|
@@ -1240,7 +1240,7 @@ model:
|
|
| 1240 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1241 |
weight_sparseness: DENSE
|
| 1242 |
model.decoder.layers.20.self_attn_layer_norm:
|
| 1243 |
-
approximation_function:
|
| 1244 |
bias_format: SAME
|
| 1245 |
input_format: SAME
|
| 1246 |
instance: LayerNorm
|
|
@@ -1275,7 +1275,7 @@ model:
|
|
| 1275 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1276 |
weight_sparseness: DENSE
|
| 1277 |
model.decoder.layers.21.final_layer_norm:
|
| 1278 |
-
approximation_function:
|
| 1279 |
bias_format: SAME
|
| 1280 |
input_format: SAME
|
| 1281 |
instance: LayerNorm
|
|
@@ -1314,7 +1314,7 @@ model:
|
|
| 1314 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1315 |
weight_sparseness: DENSE
|
| 1316 |
model.decoder.layers.21.self_attn.softmax:
|
| 1317 |
-
approximation_function:
|
| 1318 |
input_format: SAME
|
| 1319 |
instance: Softmax
|
| 1320 |
output_format: SAME
|
|
@@ -1328,7 +1328,7 @@ model:
|
|
| 1328 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1329 |
weight_sparseness: DENSE
|
| 1330 |
model.decoder.layers.21.self_attn_layer_norm:
|
| 1331 |
-
approximation_function:
|
| 1332 |
bias_format: SAME
|
| 1333 |
input_format: SAME
|
| 1334 |
instance: LayerNorm
|
|
@@ -1363,7 +1363,7 @@ model:
|
|
| 1363 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1364 |
weight_sparseness: DENSE
|
| 1365 |
model.decoder.layers.22.final_layer_norm:
|
| 1366 |
-
approximation_function:
|
| 1367 |
bias_format: SAME
|
| 1368 |
input_format: SAME
|
| 1369 |
instance: LayerNorm
|
|
@@ -1402,7 +1402,7 @@ model:
|
|
| 1402 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1403 |
weight_sparseness: DENSE
|
| 1404 |
model.decoder.layers.22.self_attn.softmax:
|
| 1405 |
-
approximation_function:
|
| 1406 |
input_format: SAME
|
| 1407 |
instance: Softmax
|
| 1408 |
output_format: SAME
|
|
@@ -1416,7 +1416,7 @@ model:
|
|
| 1416 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1417 |
weight_sparseness: DENSE
|
| 1418 |
model.decoder.layers.22.self_attn_layer_norm:
|
| 1419 |
-
approximation_function:
|
| 1420 |
bias_format: SAME
|
| 1421 |
input_format: SAME
|
| 1422 |
instance: LayerNorm
|
|
@@ -1451,7 +1451,7 @@ model:
|
|
| 1451 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1452 |
weight_sparseness: DENSE
|
| 1453 |
model.decoder.layers.23.final_layer_norm:
|
| 1454 |
-
approximation_function:
|
| 1455 |
bias_format: SAME
|
| 1456 |
input_format: SAME
|
| 1457 |
instance: LayerNorm
|
|
@@ -1490,7 +1490,7 @@ model:
|
|
| 1490 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1491 |
weight_sparseness: DENSE
|
| 1492 |
model.decoder.layers.23.self_attn.softmax:
|
| 1493 |
-
approximation_function:
|
| 1494 |
input_format: SAME
|
| 1495 |
instance: Softmax
|
| 1496 |
output_format: SAME
|
|
@@ -1504,7 +1504,7 @@ model:
|
|
| 1504 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1505 |
weight_sparseness: DENSE
|
| 1506 |
model.decoder.layers.23.self_attn_layer_norm:
|
| 1507 |
-
approximation_function:
|
| 1508 |
bias_format: SAME
|
| 1509 |
input_format: SAME
|
| 1510 |
instance: LayerNorm
|
|
@@ -1539,7 +1539,7 @@ model:
|
|
| 1539 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1540 |
weight_sparseness: DENSE
|
| 1541 |
model.decoder.layers.24.final_layer_norm:
|
| 1542 |
-
approximation_function:
|
| 1543 |
bias_format: SAME
|
| 1544 |
input_format: SAME
|
| 1545 |
instance: LayerNorm
|
|
@@ -1578,7 +1578,7 @@ model:
|
|
| 1578 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1579 |
weight_sparseness: DENSE
|
| 1580 |
model.decoder.layers.24.self_attn.softmax:
|
| 1581 |
-
approximation_function:
|
| 1582 |
input_format: SAME
|
| 1583 |
instance: Softmax
|
| 1584 |
output_format: SAME
|
|
@@ -1592,7 +1592,7 @@ model:
|
|
| 1592 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1593 |
weight_sparseness: DENSE
|
| 1594 |
model.decoder.layers.24.self_attn_layer_norm:
|
| 1595 |
-
approximation_function:
|
| 1596 |
bias_format: SAME
|
| 1597 |
input_format: SAME
|
| 1598 |
instance: LayerNorm
|
|
@@ -1627,7 +1627,7 @@ model:
|
|
| 1627 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1628 |
weight_sparseness: DENSE
|
| 1629 |
model.decoder.layers.25.final_layer_norm:
|
| 1630 |
-
approximation_function:
|
| 1631 |
bias_format: SAME
|
| 1632 |
input_format: SAME
|
| 1633 |
instance: LayerNorm
|
|
@@ -1666,7 +1666,7 @@ model:
|
|
| 1666 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1667 |
weight_sparseness: DENSE
|
| 1668 |
model.decoder.layers.25.self_attn.softmax:
|
| 1669 |
-
approximation_function:
|
| 1670 |
input_format: SAME
|
| 1671 |
instance: Softmax
|
| 1672 |
output_format: SAME
|
|
@@ -1680,7 +1680,7 @@ model:
|
|
| 1680 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1681 |
weight_sparseness: DENSE
|
| 1682 |
model.decoder.layers.25.self_attn_layer_norm:
|
| 1683 |
-
approximation_function:
|
| 1684 |
bias_format: SAME
|
| 1685 |
input_format: SAME
|
| 1686 |
instance: LayerNorm
|
|
@@ -1715,7 +1715,7 @@ model:
|
|
| 1715 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1716 |
weight_sparseness: DENSE
|
| 1717 |
model.decoder.layers.26.final_layer_norm:
|
| 1718 |
-
approximation_function:
|
| 1719 |
bias_format: SAME
|
| 1720 |
input_format: SAME
|
| 1721 |
instance: LayerNorm
|
|
@@ -1754,7 +1754,7 @@ model:
|
|
| 1754 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1755 |
weight_sparseness: DENSE
|
| 1756 |
model.decoder.layers.26.self_attn.softmax:
|
| 1757 |
-
approximation_function:
|
| 1758 |
input_format: SAME
|
| 1759 |
instance: Softmax
|
| 1760 |
output_format: SAME
|
|
@@ -1768,7 +1768,7 @@ model:
|
|
| 1768 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1769 |
weight_sparseness: DENSE
|
| 1770 |
model.decoder.layers.26.self_attn_layer_norm:
|
| 1771 |
-
approximation_function:
|
| 1772 |
bias_format: SAME
|
| 1773 |
input_format: SAME
|
| 1774 |
instance: LayerNorm
|
|
@@ -1803,7 +1803,7 @@ model:
|
|
| 1803 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1804 |
weight_sparseness: DENSE
|
| 1805 |
model.decoder.layers.27.final_layer_norm:
|
| 1806 |
-
approximation_function:
|
| 1807 |
bias_format: SAME
|
| 1808 |
input_format: SAME
|
| 1809 |
instance: LayerNorm
|
|
@@ -1842,7 +1842,7 @@ model:
|
|
| 1842 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1843 |
weight_sparseness: DENSE
|
| 1844 |
model.decoder.layers.27.self_attn.softmax:
|
| 1845 |
-
approximation_function:
|
| 1846 |
input_format: SAME
|
| 1847 |
instance: Softmax
|
| 1848 |
output_format: SAME
|
|
@@ -1856,7 +1856,7 @@ model:
|
|
| 1856 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1857 |
weight_sparseness: DENSE
|
| 1858 |
model.decoder.layers.27.self_attn_layer_norm:
|
| 1859 |
-
approximation_function:
|
| 1860 |
bias_format: SAME
|
| 1861 |
input_format: SAME
|
| 1862 |
instance: LayerNorm
|
|
@@ -1891,7 +1891,7 @@ model:
|
|
| 1891 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1892 |
weight_sparseness: DENSE
|
| 1893 |
model.decoder.layers.28.final_layer_norm:
|
| 1894 |
-
approximation_function:
|
| 1895 |
bias_format: SAME
|
| 1896 |
input_format: SAME
|
| 1897 |
instance: LayerNorm
|
|
@@ -1930,7 +1930,7 @@ model:
|
|
| 1930 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1931 |
weight_sparseness: DENSE
|
| 1932 |
model.decoder.layers.28.self_attn.softmax:
|
| 1933 |
-
approximation_function:
|
| 1934 |
input_format: SAME
|
| 1935 |
instance: Softmax
|
| 1936 |
output_format: SAME
|
|
@@ -1944,7 +1944,7 @@ model:
|
|
| 1944 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1945 |
weight_sparseness: DENSE
|
| 1946 |
model.decoder.layers.28.self_attn_layer_norm:
|
| 1947 |
-
approximation_function:
|
| 1948 |
bias_format: SAME
|
| 1949 |
input_format: SAME
|
| 1950 |
instance: LayerNorm
|
|
@@ -1979,7 +1979,7 @@ model:
|
|
| 1979 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1980 |
weight_sparseness: DENSE
|
| 1981 |
model.decoder.layers.29.final_layer_norm:
|
| 1982 |
-
approximation_function:
|
| 1983 |
bias_format: SAME
|
| 1984 |
input_format: SAME
|
| 1985 |
instance: LayerNorm
|
|
@@ -2018,7 +2018,7 @@ model:
|
|
| 2018 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2019 |
weight_sparseness: DENSE
|
| 2020 |
model.decoder.layers.29.self_attn.softmax:
|
| 2021 |
-
approximation_function:
|
| 2022 |
input_format: SAME
|
| 2023 |
instance: Softmax
|
| 2024 |
output_format: SAME
|
|
@@ -2032,7 +2032,7 @@ model:
|
|
| 2032 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2033 |
weight_sparseness: DENSE
|
| 2034 |
model.decoder.layers.29.self_attn_layer_norm:
|
| 2035 |
-
approximation_function:
|
| 2036 |
bias_format: SAME
|
| 2037 |
input_format: SAME
|
| 2038 |
instance: LayerNorm
|
|
@@ -2067,7 +2067,7 @@ model:
|
|
| 2067 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2068 |
weight_sparseness: DENSE
|
| 2069 |
model.decoder.layers.3.final_layer_norm:
|
| 2070 |
-
approximation_function:
|
| 2071 |
bias_format: SAME
|
| 2072 |
input_format: SAME
|
| 2073 |
instance: LayerNorm
|
|
@@ -2106,7 +2106,7 @@ model:
|
|
| 2106 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2107 |
weight_sparseness: DENSE
|
| 2108 |
model.decoder.layers.3.self_attn.softmax:
|
| 2109 |
-
approximation_function:
|
| 2110 |
input_format: SAME
|
| 2111 |
instance: Softmax
|
| 2112 |
output_format: SAME
|
|
@@ -2120,7 +2120,7 @@ model:
|
|
| 2120 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2121 |
weight_sparseness: DENSE
|
| 2122 |
model.decoder.layers.3.self_attn_layer_norm:
|
| 2123 |
-
approximation_function:
|
| 2124 |
bias_format: SAME
|
| 2125 |
input_format: SAME
|
| 2126 |
instance: LayerNorm
|
|
@@ -2155,7 +2155,7 @@ model:
|
|
| 2155 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2156 |
weight_sparseness: DENSE
|
| 2157 |
model.decoder.layers.30.final_layer_norm:
|
| 2158 |
-
approximation_function:
|
| 2159 |
bias_format: SAME
|
| 2160 |
input_format: SAME
|
| 2161 |
instance: LayerNorm
|
|
@@ -2194,7 +2194,7 @@ model:
|
|
| 2194 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2195 |
weight_sparseness: DENSE
|
| 2196 |
model.decoder.layers.30.self_attn.softmax:
|
| 2197 |
-
approximation_function:
|
| 2198 |
input_format: SAME
|
| 2199 |
instance: Softmax
|
| 2200 |
output_format: SAME
|
|
@@ -2208,7 +2208,7 @@ model:
|
|
| 2208 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2209 |
weight_sparseness: DENSE
|
| 2210 |
model.decoder.layers.30.self_attn_layer_norm:
|
| 2211 |
-
approximation_function:
|
| 2212 |
bias_format: SAME
|
| 2213 |
input_format: SAME
|
| 2214 |
instance: LayerNorm
|
|
@@ -2243,7 +2243,7 @@ model:
|
|
| 2243 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2244 |
weight_sparseness: DENSE
|
| 2245 |
model.decoder.layers.31.final_layer_norm:
|
| 2246 |
-
approximation_function:
|
| 2247 |
bias_format: SAME
|
| 2248 |
input_format: SAME
|
| 2249 |
instance: LayerNorm
|
|
@@ -2282,7 +2282,7 @@ model:
|
|
| 2282 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2283 |
weight_sparseness: DENSE
|
| 2284 |
model.decoder.layers.31.self_attn.softmax:
|
| 2285 |
-
approximation_function:
|
| 2286 |
input_format: SAME
|
| 2287 |
instance: Softmax
|
| 2288 |
output_format: SAME
|
|
@@ -2296,7 +2296,7 @@ model:
|
|
| 2296 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2297 |
weight_sparseness: DENSE
|
| 2298 |
model.decoder.layers.31.self_attn_layer_norm:
|
| 2299 |
-
approximation_function:
|
| 2300 |
bias_format: SAME
|
| 2301 |
input_format: SAME
|
| 2302 |
instance: LayerNorm
|
|
@@ -2331,7 +2331,7 @@ model:
|
|
| 2331 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2332 |
weight_sparseness: DENSE
|
| 2333 |
model.decoder.layers.4.final_layer_norm:
|
| 2334 |
-
approximation_function:
|
| 2335 |
bias_format: SAME
|
| 2336 |
input_format: SAME
|
| 2337 |
instance: LayerNorm
|
|
@@ -2370,7 +2370,7 @@ model:
|
|
| 2370 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2371 |
weight_sparseness: DENSE
|
| 2372 |
model.decoder.layers.4.self_attn.softmax:
|
| 2373 |
-
approximation_function:
|
| 2374 |
input_format: SAME
|
| 2375 |
instance: Softmax
|
| 2376 |
output_format: SAME
|
|
@@ -2384,7 +2384,7 @@ model:
|
|
| 2384 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2385 |
weight_sparseness: DENSE
|
| 2386 |
model.decoder.layers.4.self_attn_layer_norm:
|
| 2387 |
-
approximation_function:
|
| 2388 |
bias_format: SAME
|
| 2389 |
input_format: SAME
|
| 2390 |
instance: LayerNorm
|
|
@@ -2419,7 +2419,7 @@ model:
|
|
| 2419 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2420 |
weight_sparseness: DENSE
|
| 2421 |
model.decoder.layers.5.final_layer_norm:
|
| 2422 |
-
approximation_function:
|
| 2423 |
bias_format: SAME
|
| 2424 |
input_format: SAME
|
| 2425 |
instance: LayerNorm
|
|
@@ -2458,7 +2458,7 @@ model:
|
|
| 2458 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2459 |
weight_sparseness: DENSE
|
| 2460 |
model.decoder.layers.5.self_attn.softmax:
|
| 2461 |
-
approximation_function:
|
| 2462 |
input_format: SAME
|
| 2463 |
instance: Softmax
|
| 2464 |
output_format: SAME
|
|
@@ -2472,7 +2472,7 @@ model:
|
|
| 2472 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2473 |
weight_sparseness: DENSE
|
| 2474 |
model.decoder.layers.5.self_attn_layer_norm:
|
| 2475 |
-
approximation_function:
|
| 2476 |
bias_format: SAME
|
| 2477 |
input_format: SAME
|
| 2478 |
instance: LayerNorm
|
|
@@ -2507,7 +2507,7 @@ model:
|
|
| 2507 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2508 |
weight_sparseness: DENSE
|
| 2509 |
model.decoder.layers.6.final_layer_norm:
|
| 2510 |
-
approximation_function:
|
| 2511 |
bias_format: SAME
|
| 2512 |
input_format: SAME
|
| 2513 |
instance: LayerNorm
|
|
@@ -2546,7 +2546,7 @@ model:
|
|
| 2546 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2547 |
weight_sparseness: DENSE
|
| 2548 |
model.decoder.layers.6.self_attn.softmax:
|
| 2549 |
-
approximation_function:
|
| 2550 |
input_format: SAME
|
| 2551 |
instance: Softmax
|
| 2552 |
output_format: SAME
|
|
@@ -2560,7 +2560,7 @@ model:
|
|
| 2560 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2561 |
weight_sparseness: DENSE
|
| 2562 |
model.decoder.layers.6.self_attn_layer_norm:
|
| 2563 |
-
approximation_function:
|
| 2564 |
bias_format: SAME
|
| 2565 |
input_format: SAME
|
| 2566 |
instance: LayerNorm
|
|
@@ -2595,7 +2595,7 @@ model:
|
|
| 2595 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2596 |
weight_sparseness: DENSE
|
| 2597 |
model.decoder.layers.7.final_layer_norm:
|
| 2598 |
-
approximation_function:
|
| 2599 |
bias_format: SAME
|
| 2600 |
input_format: SAME
|
| 2601 |
instance: LayerNorm
|
|
@@ -2634,7 +2634,7 @@ model:
|
|
| 2634 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2635 |
weight_sparseness: DENSE
|
| 2636 |
model.decoder.layers.7.self_attn.softmax:
|
| 2637 |
-
approximation_function:
|
| 2638 |
input_format: SAME
|
| 2639 |
instance: Softmax
|
| 2640 |
output_format: SAME
|
|
@@ -2648,7 +2648,7 @@ model:
|
|
| 2648 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2649 |
weight_sparseness: DENSE
|
| 2650 |
model.decoder.layers.7.self_attn_layer_norm:
|
| 2651 |
-
approximation_function:
|
| 2652 |
bias_format: SAME
|
| 2653 |
input_format: SAME
|
| 2654 |
instance: LayerNorm
|
|
@@ -2683,7 +2683,7 @@ model:
|
|
| 2683 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2684 |
weight_sparseness: DENSE
|
| 2685 |
model.decoder.layers.8.final_layer_norm:
|
| 2686 |
-
approximation_function:
|
| 2687 |
bias_format: SAME
|
| 2688 |
input_format: SAME
|
| 2689 |
instance: LayerNorm
|
|
@@ -2722,7 +2722,7 @@ model:
|
|
| 2722 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2723 |
weight_sparseness: DENSE
|
| 2724 |
model.decoder.layers.8.self_attn.softmax:
|
| 2725 |
-
approximation_function:
|
| 2726 |
input_format: SAME
|
| 2727 |
instance: Softmax
|
| 2728 |
output_format: SAME
|
|
@@ -2736,7 +2736,7 @@ model:
|
|
| 2736 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2737 |
weight_sparseness: DENSE
|
| 2738 |
model.decoder.layers.8.self_attn_layer_norm:
|
| 2739 |
-
approximation_function:
|
| 2740 |
bias_format: SAME
|
| 2741 |
input_format: SAME
|
| 2742 |
instance: LayerNorm
|
|
@@ -2771,7 +2771,7 @@ model:
|
|
| 2771 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2772 |
weight_sparseness: DENSE
|
| 2773 |
model.decoder.layers.9.final_layer_norm:
|
| 2774 |
-
approximation_function:
|
| 2775 |
bias_format: SAME
|
| 2776 |
input_format: SAME
|
| 2777 |
instance: LayerNorm
|
|
@@ -2810,7 +2810,7 @@ model:
|
|
| 2810 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2811 |
weight_sparseness: DENSE
|
| 2812 |
model.decoder.layers.9.self_attn.softmax:
|
| 2813 |
-
approximation_function:
|
| 2814 |
input_format: SAME
|
| 2815 |
instance: Softmax
|
| 2816 |
output_format: SAME
|
|
@@ -2824,7 +2824,7 @@ model:
|
|
| 2824 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2825 |
weight_sparseness: DENSE
|
| 2826 |
model.decoder.layers.9.self_attn_layer_norm:
|
| 2827 |
-
approximation_function:
|
| 2828 |
bias_format: SAME
|
| 2829 |
input_format: SAME
|
| 2830 |
instance: LayerNorm
|
|
|
|
| 8 |
weight_format: SAME
|
| 9 |
weight_sparseness: DENSE
|
| 10 |
model.decoder.final_layer_norm:
|
| 11 |
+
approximation_function: NONE
|
| 12 |
bias_format: SAME
|
| 13 |
input_format: SAME
|
| 14 |
instance: LayerNorm
|
|
|
|
| 43 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 44 |
weight_sparseness: DENSE
|
| 45 |
model.decoder.layers.0.final_layer_norm:
|
| 46 |
+
approximation_function: NONE
|
| 47 |
bias_format: SAME
|
| 48 |
input_format: SAME
|
| 49 |
instance: LayerNorm
|
|
|
|
| 82 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 83 |
weight_sparseness: DENSE
|
| 84 |
model.decoder.layers.0.self_attn.softmax:
|
| 85 |
+
approximation_function: NONE
|
| 86 |
input_format: SAME
|
| 87 |
instance: Softmax
|
| 88 |
output_format: SAME
|
|
|
|
| 96 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 97 |
weight_sparseness: DENSE
|
| 98 |
model.decoder.layers.0.self_attn_layer_norm:
|
| 99 |
+
approximation_function: NONE
|
| 100 |
bias_format: SAME
|
| 101 |
input_format: SAME
|
| 102 |
instance: LayerNorm
|
|
|
|
| 131 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 132 |
weight_sparseness: DENSE
|
| 133 |
model.decoder.layers.1.final_layer_norm:
|
| 134 |
+
approximation_function: NONE
|
| 135 |
bias_format: SAME
|
| 136 |
input_format: SAME
|
| 137 |
instance: LayerNorm
|
|
|
|
| 170 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 171 |
weight_sparseness: DENSE
|
| 172 |
model.decoder.layers.1.self_attn.softmax:
|
| 173 |
+
approximation_function: NONE
|
| 174 |
input_format: SAME
|
| 175 |
instance: Softmax
|
| 176 |
output_format: SAME
|
|
|
|
| 184 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 185 |
weight_sparseness: DENSE
|
| 186 |
model.decoder.layers.1.self_attn_layer_norm:
|
| 187 |
+
approximation_function: NONE
|
| 188 |
bias_format: SAME
|
| 189 |
input_format: SAME
|
| 190 |
instance: LayerNorm
|
|
|
|
| 219 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 220 |
weight_sparseness: DENSE
|
| 221 |
model.decoder.layers.10.final_layer_norm:
|
| 222 |
+
approximation_function: NONE
|
| 223 |
bias_format: SAME
|
| 224 |
input_format: SAME
|
| 225 |
instance: LayerNorm
|
|
|
|
| 258 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 259 |
weight_sparseness: DENSE
|
| 260 |
model.decoder.layers.10.self_attn.softmax:
|
| 261 |
+
approximation_function: NONE
|
| 262 |
input_format: SAME
|
| 263 |
instance: Softmax
|
| 264 |
output_format: SAME
|
|
|
|
| 272 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 273 |
weight_sparseness: DENSE
|
| 274 |
model.decoder.layers.10.self_attn_layer_norm:
|
| 275 |
+
approximation_function: NONE
|
| 276 |
bias_format: SAME
|
| 277 |
input_format: SAME
|
| 278 |
instance: LayerNorm
|
|
|
|
| 307 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 308 |
weight_sparseness: DENSE
|
| 309 |
model.decoder.layers.11.final_layer_norm:
|
| 310 |
+
approximation_function: NONE
|
| 311 |
bias_format: SAME
|
| 312 |
input_format: SAME
|
| 313 |
instance: LayerNorm
|
|
|
|
| 346 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 347 |
weight_sparseness: DENSE
|
| 348 |
model.decoder.layers.11.self_attn.softmax:
|
| 349 |
+
approximation_function: NONE
|
| 350 |
input_format: SAME
|
| 351 |
instance: Softmax
|
| 352 |
output_format: SAME
|
|
|
|
| 360 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 361 |
weight_sparseness: DENSE
|
| 362 |
model.decoder.layers.11.self_attn_layer_norm:
|
| 363 |
+
approximation_function: NONE
|
| 364 |
bias_format: SAME
|
| 365 |
input_format: SAME
|
| 366 |
instance: LayerNorm
|
|
|
|
| 395 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 396 |
weight_sparseness: DENSE
|
| 397 |
model.decoder.layers.12.final_layer_norm:
|
| 398 |
+
approximation_function: NONE
|
| 399 |
bias_format: SAME
|
| 400 |
input_format: SAME
|
| 401 |
instance: LayerNorm
|
|
|
|
| 434 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 435 |
weight_sparseness: DENSE
|
| 436 |
model.decoder.layers.12.self_attn.softmax:
|
| 437 |
+
approximation_function: NONE
|
| 438 |
input_format: SAME
|
| 439 |
instance: Softmax
|
| 440 |
output_format: SAME
|
|
|
|
| 448 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 449 |
weight_sparseness: DENSE
|
| 450 |
model.decoder.layers.12.self_attn_layer_norm:
|
| 451 |
+
approximation_function: NONE
|
| 452 |
bias_format: SAME
|
| 453 |
input_format: SAME
|
| 454 |
instance: LayerNorm
|
|
|
|
| 483 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 484 |
weight_sparseness: DENSE
|
| 485 |
model.decoder.layers.13.final_layer_norm:
|
| 486 |
+
approximation_function: NONE
|
| 487 |
bias_format: SAME
|
| 488 |
input_format: SAME
|
| 489 |
instance: LayerNorm
|
|
|
|
| 522 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 523 |
weight_sparseness: DENSE
|
| 524 |
model.decoder.layers.13.self_attn.softmax:
|
| 525 |
+
approximation_function: NONE
|
| 526 |
input_format: SAME
|
| 527 |
instance: Softmax
|
| 528 |
output_format: SAME
|
|
|
|
| 536 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 537 |
weight_sparseness: DENSE
|
| 538 |
model.decoder.layers.13.self_attn_layer_norm:
|
| 539 |
+
approximation_function: NONE
|
| 540 |
bias_format: SAME
|
| 541 |
input_format: SAME
|
| 542 |
instance: LayerNorm
|
|
|
|
| 571 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 572 |
weight_sparseness: DENSE
|
| 573 |
model.decoder.layers.14.final_layer_norm:
|
| 574 |
+
approximation_function: NONE
|
| 575 |
bias_format: SAME
|
| 576 |
input_format: SAME
|
| 577 |
instance: LayerNorm
|
|
|
|
| 610 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 611 |
weight_sparseness: DENSE
|
| 612 |
model.decoder.layers.14.self_attn.softmax:
|
| 613 |
+
approximation_function: NONE
|
| 614 |
input_format: SAME
|
| 615 |
instance: Softmax
|
| 616 |
output_format: SAME
|
|
|
|
| 624 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 625 |
weight_sparseness: DENSE
|
| 626 |
model.decoder.layers.14.self_attn_layer_norm:
|
| 627 |
+
approximation_function: NONE
|
| 628 |
bias_format: SAME
|
| 629 |
input_format: SAME
|
| 630 |
instance: LayerNorm
|
|
|
|
| 659 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 660 |
weight_sparseness: DENSE
|
| 661 |
model.decoder.layers.15.final_layer_norm:
|
| 662 |
+
approximation_function: NONE
|
| 663 |
bias_format: SAME
|
| 664 |
input_format: SAME
|
| 665 |
instance: LayerNorm
|
|
|
|
| 698 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 699 |
weight_sparseness: DENSE
|
| 700 |
model.decoder.layers.15.self_attn.softmax:
|
| 701 |
+
approximation_function: NONE
|
| 702 |
input_format: SAME
|
| 703 |
instance: Softmax
|
| 704 |
output_format: SAME
|
|
|
|
| 712 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 713 |
weight_sparseness: DENSE
|
| 714 |
model.decoder.layers.15.self_attn_layer_norm:
|
| 715 |
+
approximation_function: NONE
|
| 716 |
bias_format: SAME
|
| 717 |
input_format: SAME
|
| 718 |
instance: LayerNorm
|
|
|
|
| 747 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 748 |
weight_sparseness: DENSE
|
| 749 |
model.decoder.layers.16.final_layer_norm:
|
| 750 |
+
approximation_function: NONE
|
| 751 |
bias_format: SAME
|
| 752 |
input_format: SAME
|
| 753 |
instance: LayerNorm
|
|
|
|
| 786 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 787 |
weight_sparseness: DENSE
|
| 788 |
model.decoder.layers.16.self_attn.softmax:
|
| 789 |
+
approximation_function: NONE
|
| 790 |
input_format: SAME
|
| 791 |
instance: Softmax
|
| 792 |
output_format: SAME
|
|
|
|
| 800 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 801 |
weight_sparseness: DENSE
|
| 802 |
model.decoder.layers.16.self_attn_layer_norm:
|
| 803 |
+
approximation_function: NONE
|
| 804 |
bias_format: SAME
|
| 805 |
input_format: SAME
|
| 806 |
instance: LayerNorm
|
|
|
|
| 835 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 836 |
weight_sparseness: DENSE
|
| 837 |
model.decoder.layers.17.final_layer_norm:
|
| 838 |
+
approximation_function: NONE
|
| 839 |
bias_format: SAME
|
| 840 |
input_format: SAME
|
| 841 |
instance: LayerNorm
|
|
|
|
| 874 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 875 |
weight_sparseness: DENSE
|
| 876 |
model.decoder.layers.17.self_attn.softmax:
|
| 877 |
+
approximation_function: NONE
|
| 878 |
input_format: SAME
|
| 879 |
instance: Softmax
|
| 880 |
output_format: SAME
|
|
|
|
| 888 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 889 |
weight_sparseness: DENSE
|
| 890 |
model.decoder.layers.17.self_attn_layer_norm:
|
| 891 |
+
approximation_function: NONE
|
| 892 |
bias_format: SAME
|
| 893 |
input_format: SAME
|
| 894 |
instance: LayerNorm
|
|
|
|
| 923 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 924 |
weight_sparseness: DENSE
|
| 925 |
model.decoder.layers.18.final_layer_norm:
|
| 926 |
+
approximation_function: NONE
|
| 927 |
bias_format: SAME
|
| 928 |
input_format: SAME
|
| 929 |
instance: LayerNorm
|
|
|
|
| 962 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 963 |
weight_sparseness: DENSE
|
| 964 |
model.decoder.layers.18.self_attn.softmax:
|
| 965 |
+
approximation_function: NONE
|
| 966 |
input_format: SAME
|
| 967 |
instance: Softmax
|
| 968 |
output_format: SAME
|
|
|
|
| 976 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 977 |
weight_sparseness: DENSE
|
| 978 |
model.decoder.layers.18.self_attn_layer_norm:
|
| 979 |
+
approximation_function: NONE
|
| 980 |
bias_format: SAME
|
| 981 |
input_format: SAME
|
| 982 |
instance: LayerNorm
|
|
|
|
| 1011 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1012 |
weight_sparseness: DENSE
|
| 1013 |
model.decoder.layers.19.final_layer_norm:
|
| 1014 |
+
approximation_function: NONE
|
| 1015 |
bias_format: SAME
|
| 1016 |
input_format: SAME
|
| 1017 |
instance: LayerNorm
|
|
|
|
| 1050 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1051 |
weight_sparseness: DENSE
|
| 1052 |
model.decoder.layers.19.self_attn.softmax:
|
| 1053 |
+
approximation_function: NONE
|
| 1054 |
input_format: SAME
|
| 1055 |
instance: Softmax
|
| 1056 |
output_format: SAME
|
|
|
|
| 1064 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1065 |
weight_sparseness: DENSE
|
| 1066 |
model.decoder.layers.19.self_attn_layer_norm:
|
| 1067 |
+
approximation_function: NONE
|
| 1068 |
bias_format: SAME
|
| 1069 |
input_format: SAME
|
| 1070 |
instance: LayerNorm
|
|
|
|
| 1099 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1100 |
weight_sparseness: DENSE
|
| 1101 |
model.decoder.layers.2.final_layer_norm:
|
| 1102 |
+
approximation_function: NONE
|
| 1103 |
bias_format: SAME
|
| 1104 |
input_format: SAME
|
| 1105 |
instance: LayerNorm
|
|
|
|
| 1138 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1139 |
weight_sparseness: DENSE
|
| 1140 |
model.decoder.layers.2.self_attn.softmax:
|
| 1141 |
+
approximation_function: NONE
|
| 1142 |
input_format: SAME
|
| 1143 |
instance: Softmax
|
| 1144 |
output_format: SAME
|
|
|
|
| 1152 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1153 |
weight_sparseness: DENSE
|
| 1154 |
model.decoder.layers.2.self_attn_layer_norm:
|
| 1155 |
+
approximation_function: NONE
|
| 1156 |
bias_format: SAME
|
| 1157 |
input_format: SAME
|
| 1158 |
instance: LayerNorm
|
|
|
|
| 1187 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1188 |
weight_sparseness: DENSE
|
| 1189 |
model.decoder.layers.20.final_layer_norm:
|
| 1190 |
+
approximation_function: NONE
|
| 1191 |
bias_format: SAME
|
| 1192 |
input_format: SAME
|
| 1193 |
instance: LayerNorm
|
|
|
|
| 1226 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1227 |
weight_sparseness: DENSE
|
| 1228 |
model.decoder.layers.20.self_attn.softmax:
|
| 1229 |
+
approximation_function: NONE
|
| 1230 |
input_format: SAME
|
| 1231 |
instance: Softmax
|
| 1232 |
output_format: SAME
|
|
|
|
| 1240 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1241 |
weight_sparseness: DENSE
|
| 1242 |
model.decoder.layers.20.self_attn_layer_norm:
|
| 1243 |
+
approximation_function: NONE
|
| 1244 |
bias_format: SAME
|
| 1245 |
input_format: SAME
|
| 1246 |
instance: LayerNorm
|
|
|
|
| 1275 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1276 |
weight_sparseness: DENSE
|
| 1277 |
model.decoder.layers.21.final_layer_norm:
|
| 1278 |
+
approximation_function: NONE
|
| 1279 |
bias_format: SAME
|
| 1280 |
input_format: SAME
|
| 1281 |
instance: LayerNorm
|
|
|
|
| 1314 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1315 |
weight_sparseness: DENSE
|
| 1316 |
model.decoder.layers.21.self_attn.softmax:
|
| 1317 |
+
approximation_function: NONE
|
| 1318 |
input_format: SAME
|
| 1319 |
instance: Softmax
|
| 1320 |
output_format: SAME
|
|
|
|
| 1328 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1329 |
weight_sparseness: DENSE
|
| 1330 |
model.decoder.layers.21.self_attn_layer_norm:
|
| 1331 |
+
approximation_function: NONE
|
| 1332 |
bias_format: SAME
|
| 1333 |
input_format: SAME
|
| 1334 |
instance: LayerNorm
|
|
|
|
| 1363 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1364 |
weight_sparseness: DENSE
|
| 1365 |
model.decoder.layers.22.final_layer_norm:
|
| 1366 |
+
approximation_function: NONE
|
| 1367 |
bias_format: SAME
|
| 1368 |
input_format: SAME
|
| 1369 |
instance: LayerNorm
|
|
|
|
| 1402 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1403 |
weight_sparseness: DENSE
|
| 1404 |
model.decoder.layers.22.self_attn.softmax:
|
| 1405 |
+
approximation_function: NONE
|
| 1406 |
input_format: SAME
|
| 1407 |
instance: Softmax
|
| 1408 |
output_format: SAME
|
|
|
|
| 1416 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1417 |
weight_sparseness: DENSE
|
| 1418 |
model.decoder.layers.22.self_attn_layer_norm:
|
| 1419 |
+
approximation_function: NONE
|
| 1420 |
bias_format: SAME
|
| 1421 |
input_format: SAME
|
| 1422 |
instance: LayerNorm
|
|
|
|
| 1451 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1452 |
weight_sparseness: DENSE
|
| 1453 |
model.decoder.layers.23.final_layer_norm:
|
| 1454 |
+
approximation_function: NONE
|
| 1455 |
bias_format: SAME
|
| 1456 |
input_format: SAME
|
| 1457 |
instance: LayerNorm
|
|
|
|
| 1490 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1491 |
weight_sparseness: DENSE
|
| 1492 |
model.decoder.layers.23.self_attn.softmax:
|
| 1493 |
+
approximation_function: NONE
|
| 1494 |
input_format: SAME
|
| 1495 |
instance: Softmax
|
| 1496 |
output_format: SAME
|
|
|
|
| 1504 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1505 |
weight_sparseness: DENSE
|
| 1506 |
model.decoder.layers.23.self_attn_layer_norm:
|
| 1507 |
+
approximation_function: NONE
|
| 1508 |
bias_format: SAME
|
| 1509 |
input_format: SAME
|
| 1510 |
instance: LayerNorm
|
|
|
|
| 1539 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1540 |
weight_sparseness: DENSE
|
| 1541 |
model.decoder.layers.24.final_layer_norm:
|
| 1542 |
+
approximation_function: NONE
|
| 1543 |
bias_format: SAME
|
| 1544 |
input_format: SAME
|
| 1545 |
instance: LayerNorm
|
|
|
|
| 1578 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1579 |
weight_sparseness: DENSE
|
| 1580 |
model.decoder.layers.24.self_attn.softmax:
|
| 1581 |
+
approximation_function: NONE
|
| 1582 |
input_format: SAME
|
| 1583 |
instance: Softmax
|
| 1584 |
output_format: SAME
|
|
|
|
| 1592 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1593 |
weight_sparseness: DENSE
|
| 1594 |
model.decoder.layers.24.self_attn_layer_norm:
|
| 1595 |
+
approximation_function: NONE
|
| 1596 |
bias_format: SAME
|
| 1597 |
input_format: SAME
|
| 1598 |
instance: LayerNorm
|
|
|
|
| 1627 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1628 |
weight_sparseness: DENSE
|
| 1629 |
model.decoder.layers.25.final_layer_norm:
|
| 1630 |
+
approximation_function: NONE
|
| 1631 |
bias_format: SAME
|
| 1632 |
input_format: SAME
|
| 1633 |
instance: LayerNorm
|
|
|
|
| 1666 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1667 |
weight_sparseness: DENSE
|
| 1668 |
model.decoder.layers.25.self_attn.softmax:
|
| 1669 |
+
approximation_function: NONE
|
| 1670 |
input_format: SAME
|
| 1671 |
instance: Softmax
|
| 1672 |
output_format: SAME
|
|
|
|
| 1680 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1681 |
weight_sparseness: DENSE
|
| 1682 |
model.decoder.layers.25.self_attn_layer_norm:
|
| 1683 |
+
approximation_function: NONE
|
| 1684 |
bias_format: SAME
|
| 1685 |
input_format: SAME
|
| 1686 |
instance: LayerNorm
|
|
|
|
| 1715 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1716 |
weight_sparseness: DENSE
|
| 1717 |
model.decoder.layers.26.final_layer_norm:
|
| 1718 |
+
approximation_function: NONE
|
| 1719 |
bias_format: SAME
|
| 1720 |
input_format: SAME
|
| 1721 |
instance: LayerNorm
|
|
|
|
| 1754 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1755 |
weight_sparseness: DENSE
|
| 1756 |
model.decoder.layers.26.self_attn.softmax:
|
| 1757 |
+
approximation_function: NONE
|
| 1758 |
input_format: SAME
|
| 1759 |
instance: Softmax
|
| 1760 |
output_format: SAME
|
|
|
|
| 1768 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1769 |
weight_sparseness: DENSE
|
| 1770 |
model.decoder.layers.26.self_attn_layer_norm:
|
| 1771 |
+
approximation_function: NONE
|
| 1772 |
bias_format: SAME
|
| 1773 |
input_format: SAME
|
| 1774 |
instance: LayerNorm
|
|
|
|
| 1803 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1804 |
weight_sparseness: DENSE
|
| 1805 |
model.decoder.layers.27.final_layer_norm:
|
| 1806 |
+
approximation_function: NONE
|
| 1807 |
bias_format: SAME
|
| 1808 |
input_format: SAME
|
| 1809 |
instance: LayerNorm
|
|
|
|
| 1842 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1843 |
weight_sparseness: DENSE
|
| 1844 |
model.decoder.layers.27.self_attn.softmax:
|
| 1845 |
+
approximation_function: NONE
|
| 1846 |
input_format: SAME
|
| 1847 |
instance: Softmax
|
| 1848 |
output_format: SAME
|
|
|
|
| 1856 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1857 |
weight_sparseness: DENSE
|
| 1858 |
model.decoder.layers.27.self_attn_layer_norm:
|
| 1859 |
+
approximation_function: NONE
|
| 1860 |
bias_format: SAME
|
| 1861 |
input_format: SAME
|
| 1862 |
instance: LayerNorm
|
|
|
|
| 1891 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1892 |
weight_sparseness: DENSE
|
| 1893 |
model.decoder.layers.28.final_layer_norm:
|
| 1894 |
+
approximation_function: NONE
|
| 1895 |
bias_format: SAME
|
| 1896 |
input_format: SAME
|
| 1897 |
instance: LayerNorm
|
|
|
|
| 1930 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1931 |
weight_sparseness: DENSE
|
| 1932 |
model.decoder.layers.28.self_attn.softmax:
|
| 1933 |
+
approximation_function: NONE
|
| 1934 |
input_format: SAME
|
| 1935 |
instance: Softmax
|
| 1936 |
output_format: SAME
|
|
|
|
| 1944 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1945 |
weight_sparseness: DENSE
|
| 1946 |
model.decoder.layers.28.self_attn_layer_norm:
|
| 1947 |
+
approximation_function: NONE
|
| 1948 |
bias_format: SAME
|
| 1949 |
input_format: SAME
|
| 1950 |
instance: LayerNorm
|
|
|
|
| 1979 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 1980 |
weight_sparseness: DENSE
|
| 1981 |
model.decoder.layers.29.final_layer_norm:
|
| 1982 |
+
approximation_function: NONE
|
| 1983 |
bias_format: SAME
|
| 1984 |
input_format: SAME
|
| 1985 |
instance: LayerNorm
|
|
|
|
| 2018 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2019 |
weight_sparseness: DENSE
|
| 2020 |
model.decoder.layers.29.self_attn.softmax:
|
| 2021 |
+
approximation_function: NONE
|
| 2022 |
input_format: SAME
|
| 2023 |
instance: Softmax
|
| 2024 |
output_format: SAME
|
|
|
|
| 2032 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2033 |
weight_sparseness: DENSE
|
| 2034 |
model.decoder.layers.29.self_attn_layer_norm:
|
| 2035 |
+
approximation_function: NONE
|
| 2036 |
bias_format: SAME
|
| 2037 |
input_format: SAME
|
| 2038 |
instance: LayerNorm
|
|
|
|
| 2067 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2068 |
weight_sparseness: DENSE
|
| 2069 |
model.decoder.layers.3.final_layer_norm:
|
| 2070 |
+
approximation_function: NONE
|
| 2071 |
bias_format: SAME
|
| 2072 |
input_format: SAME
|
| 2073 |
instance: LayerNorm
|
|
|
|
| 2106 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2107 |
weight_sparseness: DENSE
|
| 2108 |
model.decoder.layers.3.self_attn.softmax:
|
| 2109 |
+
approximation_function: NONE
|
| 2110 |
input_format: SAME
|
| 2111 |
instance: Softmax
|
| 2112 |
output_format: SAME
|
|
|
|
| 2120 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2121 |
weight_sparseness: DENSE
|
| 2122 |
model.decoder.layers.3.self_attn_layer_norm:
|
| 2123 |
+
approximation_function: NONE
|
| 2124 |
bias_format: SAME
|
| 2125 |
input_format: SAME
|
| 2126 |
instance: LayerNorm
|
|
|
|
| 2155 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2156 |
weight_sparseness: DENSE
|
| 2157 |
model.decoder.layers.30.final_layer_norm:
|
| 2158 |
+
approximation_function: NONE
|
| 2159 |
bias_format: SAME
|
| 2160 |
input_format: SAME
|
| 2161 |
instance: LayerNorm
|
|
|
|
| 2194 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2195 |
weight_sparseness: DENSE
|
| 2196 |
model.decoder.layers.30.self_attn.softmax:
|
| 2197 |
+
approximation_function: NONE
|
| 2198 |
input_format: SAME
|
| 2199 |
instance: Softmax
|
| 2200 |
output_format: SAME
|
|
|
|
| 2208 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2209 |
weight_sparseness: DENSE
|
| 2210 |
model.decoder.layers.30.self_attn_layer_norm:
|
| 2211 |
+
approximation_function: NONE
|
| 2212 |
bias_format: SAME
|
| 2213 |
input_format: SAME
|
| 2214 |
instance: LayerNorm
|
|
|
|
| 2243 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2244 |
weight_sparseness: DENSE
|
| 2245 |
model.decoder.layers.31.final_layer_norm:
|
| 2246 |
+
approximation_function: NONE
|
| 2247 |
bias_format: SAME
|
| 2248 |
input_format: SAME
|
| 2249 |
instance: LayerNorm
|
|
|
|
| 2282 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2283 |
weight_sparseness: DENSE
|
| 2284 |
model.decoder.layers.31.self_attn.softmax:
|
| 2285 |
+
approximation_function: NONE
|
| 2286 |
input_format: SAME
|
| 2287 |
instance: Softmax
|
| 2288 |
output_format: SAME
|
|
|
|
| 2296 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2297 |
weight_sparseness: DENSE
|
| 2298 |
model.decoder.layers.31.self_attn_layer_norm:
|
| 2299 |
+
approximation_function: NONE
|
| 2300 |
bias_format: SAME
|
| 2301 |
input_format: SAME
|
| 2302 |
instance: LayerNorm
|
|
|
|
| 2331 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2332 |
weight_sparseness: DENSE
|
| 2333 |
model.decoder.layers.4.final_layer_norm:
|
| 2334 |
+
approximation_function: NONE
|
| 2335 |
bias_format: SAME
|
| 2336 |
input_format: SAME
|
| 2337 |
instance: LayerNorm
|
|
|
|
| 2370 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2371 |
weight_sparseness: DENSE
|
| 2372 |
model.decoder.layers.4.self_attn.softmax:
|
| 2373 |
+
approximation_function: NONE
|
| 2374 |
input_format: SAME
|
| 2375 |
instance: Softmax
|
| 2376 |
output_format: SAME
|
|
|
|
| 2384 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2385 |
weight_sparseness: DENSE
|
| 2386 |
model.decoder.layers.4.self_attn_layer_norm:
|
| 2387 |
+
approximation_function: NONE
|
| 2388 |
bias_format: SAME
|
| 2389 |
input_format: SAME
|
| 2390 |
instance: LayerNorm
|
|
|
|
| 2419 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2420 |
weight_sparseness: DENSE
|
| 2421 |
model.decoder.layers.5.final_layer_norm:
|
| 2422 |
+
approximation_function: NONE
|
| 2423 |
bias_format: SAME
|
| 2424 |
input_format: SAME
|
| 2425 |
instance: LayerNorm
|
|
|
|
| 2458 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2459 |
weight_sparseness: DENSE
|
| 2460 |
model.decoder.layers.5.self_attn.softmax:
|
| 2461 |
+
approximation_function: NONE
|
| 2462 |
input_format: SAME
|
| 2463 |
instance: Softmax
|
| 2464 |
output_format: SAME
|
|
|
|
| 2472 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2473 |
weight_sparseness: DENSE
|
| 2474 |
model.decoder.layers.5.self_attn_layer_norm:
|
| 2475 |
+
approximation_function: NONE
|
| 2476 |
bias_format: SAME
|
| 2477 |
input_format: SAME
|
| 2478 |
instance: LayerNorm
|
|
|
|
| 2507 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2508 |
weight_sparseness: DENSE
|
| 2509 |
model.decoder.layers.6.final_layer_norm:
|
| 2510 |
+
approximation_function: NONE
|
| 2511 |
bias_format: SAME
|
| 2512 |
input_format: SAME
|
| 2513 |
instance: LayerNorm
|
|
|
|
| 2546 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2547 |
weight_sparseness: DENSE
|
| 2548 |
model.decoder.layers.6.self_attn.softmax:
|
| 2549 |
+
approximation_function: NONE
|
| 2550 |
input_format: SAME
|
| 2551 |
instance: Softmax
|
| 2552 |
output_format: SAME
|
|
|
|
| 2560 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2561 |
weight_sparseness: DENSE
|
| 2562 |
model.decoder.layers.6.self_attn_layer_norm:
|
| 2563 |
+
approximation_function: NONE
|
| 2564 |
bias_format: SAME
|
| 2565 |
input_format: SAME
|
| 2566 |
instance: LayerNorm
|
|
|
|
| 2595 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2596 |
weight_sparseness: DENSE
|
| 2597 |
model.decoder.layers.7.final_layer_norm:
|
| 2598 |
+
approximation_function: NONE
|
| 2599 |
bias_format: SAME
|
| 2600 |
input_format: SAME
|
| 2601 |
instance: LayerNorm
|
|
|
|
| 2634 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2635 |
weight_sparseness: DENSE
|
| 2636 |
model.decoder.layers.7.self_attn.softmax:
|
| 2637 |
+
approximation_function: NONE
|
| 2638 |
input_format: SAME
|
| 2639 |
instance: Softmax
|
| 2640 |
output_format: SAME
|
|
|
|
| 2648 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2649 |
weight_sparseness: DENSE
|
| 2650 |
model.decoder.layers.7.self_attn_layer_norm:
|
| 2651 |
+
approximation_function: NONE
|
| 2652 |
bias_format: SAME
|
| 2653 |
input_format: SAME
|
| 2654 |
instance: LayerNorm
|
|
|
|
| 2683 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2684 |
weight_sparseness: DENSE
|
| 2685 |
model.decoder.layers.8.final_layer_norm:
|
| 2686 |
+
approximation_function: NONE
|
| 2687 |
bias_format: SAME
|
| 2688 |
input_format: SAME
|
| 2689 |
instance: LayerNorm
|
|
|
|
| 2722 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2723 |
weight_sparseness: DENSE
|
| 2724 |
model.decoder.layers.8.self_attn.softmax:
|
| 2725 |
+
approximation_function: NONE
|
| 2726 |
input_format: SAME
|
| 2727 |
instance: Softmax
|
| 2728 |
output_format: SAME
|
|
|
|
| 2736 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2737 |
weight_sparseness: DENSE
|
| 2738 |
model.decoder.layers.8.self_attn_layer_norm:
|
| 2739 |
+
approximation_function: NONE
|
| 2740 |
bias_format: SAME
|
| 2741 |
input_format: SAME
|
| 2742 |
instance: LayerNorm
|
|
|
|
| 2771 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2772 |
weight_sparseness: DENSE
|
| 2773 |
model.decoder.layers.9.final_layer_norm:
|
| 2774 |
+
approximation_function: NONE
|
| 2775 |
bias_format: SAME
|
| 2776 |
input_format: SAME
|
| 2777 |
instance: LayerNorm
|
|
|
|
| 2810 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2811 |
weight_sparseness: DENSE
|
| 2812 |
model.decoder.layers.9.self_attn.softmax:
|
| 2813 |
+
approximation_function: NONE
|
| 2814 |
input_format: SAME
|
| 2815 |
instance: Softmax
|
| 2816 |
output_format: SAME
|
|
|
|
| 2824 |
weight_format: BFP[8|8]{64,-1}(SN)
|
| 2825 |
weight_sparseness: DENSE
|
| 2826 |
model.decoder.layers.9.self_attn_layer_norm:
|
| 2827 |
+
approximation_function: NONE
|
| 2828 |
bias_format: SAME
|
| 2829 |
input_format: SAME
|
| 2830 |
instance: LayerNorm
|