bert-base-squadv1 / layer_wise_sparsity_global_rate_0.00.md
Chua, Vui Seng
Add collaterals
d96569c
|
raw
history blame
27.9 kB
layer_id layer_type param_type shape nparam nnz sparsity
0 bert.embeddings.word_embeddings Embedding weight [30522, 768] 23440896 23440896 0
1 bert.embeddings.position_embeddings Embedding weight [512, 768] 393216 393216 0
2 bert.embeddings.token_type_embeddings Embedding weight [2, 768] 1536 1536 0
3 bert.embeddings.LayerNorm LayerNorm weight [768] 768 768 0
4 bert.embeddings.LayerNorm LayerNorm bias [768] 768 768 0
5 bert.encoder.layer.0.attention.self.query Linear weight [768, 768] 589824 589824 0
6 bert.encoder.layer.0.attention.self.query Linear bias [768] 768 768 0
7 bert.encoder.layer.0.attention.self.key Linear weight [768, 768] 589824 589824 0
8 bert.encoder.layer.0.attention.self.key Linear bias [768] 768 768 0
9 bert.encoder.layer.0.attention.self.value Linear weight [768, 768] 589824 589824 0
10 bert.encoder.layer.0.attention.self.value Linear bias [768] 768 768 0
11 bert.encoder.layer.0.attention.output.dense Linear weight [768, 768] 589824 589824 0
12 bert.encoder.layer.0.attention.output.dense Linear bias [768] 768 768 0
13 bert.encoder.layer.0.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
14 bert.encoder.layer.0.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
15 bert.encoder.layer.0.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
16 bert.encoder.layer.0.intermediate.dense Linear bias [3072] 3072 3072 0
17 bert.encoder.layer.0.output.dense Linear weight [768, 3072] 2359296 2359296 0
18 bert.encoder.layer.0.output.dense Linear bias [768] 768 768 0
19 bert.encoder.layer.0.output.LayerNorm LayerNorm weight [768] 768 768 0
20 bert.encoder.layer.0.output.LayerNorm LayerNorm bias [768] 768 768 0
21 bert.encoder.layer.1.attention.self.query Linear weight [768, 768] 589824 589824 0
22 bert.encoder.layer.1.attention.self.query Linear bias [768] 768 768 0
23 bert.encoder.layer.1.attention.self.key Linear weight [768, 768] 589824 589824 0
24 bert.encoder.layer.1.attention.self.key Linear bias [768] 768 768 0
25 bert.encoder.layer.1.attention.self.value Linear weight [768, 768] 589824 589824 0
26 bert.encoder.layer.1.attention.self.value Linear bias [768] 768 768 0
27 bert.encoder.layer.1.attention.output.dense Linear weight [768, 768] 589824 589824 0
28 bert.encoder.layer.1.attention.output.dense Linear bias [768] 768 768 0
29 bert.encoder.layer.1.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
30 bert.encoder.layer.1.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
31 bert.encoder.layer.1.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
32 bert.encoder.layer.1.intermediate.dense Linear bias [3072] 3072 3072 0
33 bert.encoder.layer.1.output.dense Linear weight [768, 3072] 2359296 2359296 0
34 bert.encoder.layer.1.output.dense Linear bias [768] 768 768 0
35 bert.encoder.layer.1.output.LayerNorm LayerNorm weight [768] 768 768 0
36 bert.encoder.layer.1.output.LayerNorm LayerNorm bias [768] 768 768 0
37 bert.encoder.layer.2.attention.self.query Linear weight [768, 768] 589824 589824 0
38 bert.encoder.layer.2.attention.self.query Linear bias [768] 768 768 0
39 bert.encoder.layer.2.attention.self.key Linear weight [768, 768] 589824 589824 0
40 bert.encoder.layer.2.attention.self.key Linear bias [768] 768 768 0
41 bert.encoder.layer.2.attention.self.value Linear weight [768, 768] 589824 589824 0
42 bert.encoder.layer.2.attention.self.value Linear bias [768] 768 768 0
43 bert.encoder.layer.2.attention.output.dense Linear weight [768, 768] 589824 589824 0
44 bert.encoder.layer.2.attention.output.dense Linear bias [768] 768 768 0
45 bert.encoder.layer.2.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
46 bert.encoder.layer.2.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
47 bert.encoder.layer.2.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
48 bert.encoder.layer.2.intermediate.dense Linear bias [3072] 3072 3072 0
49 bert.encoder.layer.2.output.dense Linear weight [768, 3072] 2359296 2359296 0
50 bert.encoder.layer.2.output.dense Linear bias [768] 768 768 0
51 bert.encoder.layer.2.output.LayerNorm LayerNorm weight [768] 768 768 0
52 bert.encoder.layer.2.output.LayerNorm LayerNorm bias [768] 768 768 0
53 bert.encoder.layer.3.attention.self.query Linear weight [768, 768] 589824 589824 0
54 bert.encoder.layer.3.attention.self.query Linear bias [768] 768 768 0
55 bert.encoder.layer.3.attention.self.key Linear weight [768, 768] 589824 589824 0
56 bert.encoder.layer.3.attention.self.key Linear bias [768] 768 768 0
57 bert.encoder.layer.3.attention.self.value Linear weight [768, 768] 589824 589824 0
58 bert.encoder.layer.3.attention.self.value Linear bias [768] 768 768 0
59 bert.encoder.layer.3.attention.output.dense Linear weight [768, 768] 589824 589824 0
60 bert.encoder.layer.3.attention.output.dense Linear bias [768] 768 768 0
61 bert.encoder.layer.3.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
62 bert.encoder.layer.3.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
63 bert.encoder.layer.3.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
64 bert.encoder.layer.3.intermediate.dense Linear bias [3072] 3072 3072 0
65 bert.encoder.layer.3.output.dense Linear weight [768, 3072] 2359296 2359296 0
66 bert.encoder.layer.3.output.dense Linear bias [768] 768 768 0
67 bert.encoder.layer.3.output.LayerNorm LayerNorm weight [768] 768 768 0
68 bert.encoder.layer.3.output.LayerNorm LayerNorm bias [768] 768 768 0
69 bert.encoder.layer.4.attention.self.query Linear weight [768, 768] 589824 589824 0
70 bert.encoder.layer.4.attention.self.query Linear bias [768] 768 768 0
71 bert.encoder.layer.4.attention.self.key Linear weight [768, 768] 589824 589824 0
72 bert.encoder.layer.4.attention.self.key Linear bias [768] 768 768 0
73 bert.encoder.layer.4.attention.self.value Linear weight [768, 768] 589824 589824 0
74 bert.encoder.layer.4.attention.self.value Linear bias [768] 768 768 0
75 bert.encoder.layer.4.attention.output.dense Linear weight [768, 768] 589824 589824 0
76 bert.encoder.layer.4.attention.output.dense Linear bias [768] 768 768 0
77 bert.encoder.layer.4.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
78 bert.encoder.layer.4.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
79 bert.encoder.layer.4.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
80 bert.encoder.layer.4.intermediate.dense Linear bias [3072] 3072 3072 0
81 bert.encoder.layer.4.output.dense Linear weight [768, 3072] 2359296 2359296 0
82 bert.encoder.layer.4.output.dense Linear bias [768] 768 768 0
83 bert.encoder.layer.4.output.LayerNorm LayerNorm weight [768] 768 768 0
84 bert.encoder.layer.4.output.LayerNorm LayerNorm bias [768] 768 768 0
85 bert.encoder.layer.5.attention.self.query Linear weight [768, 768] 589824 589824 0
86 bert.encoder.layer.5.attention.self.query Linear bias [768] 768 768 0
87 bert.encoder.layer.5.attention.self.key Linear weight [768, 768] 589824 589824 0
88 bert.encoder.layer.5.attention.self.key Linear bias [768] 768 768 0
89 bert.encoder.layer.5.attention.self.value Linear weight [768, 768] 589824 589824 0
90 bert.encoder.layer.5.attention.self.value Linear bias [768] 768 768 0
91 bert.encoder.layer.5.attention.output.dense Linear weight [768, 768] 589824 589824 0
92 bert.encoder.layer.5.attention.output.dense Linear bias [768] 768 768 0
93 bert.encoder.layer.5.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
94 bert.encoder.layer.5.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
95 bert.encoder.layer.5.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
96 bert.encoder.layer.5.intermediate.dense Linear bias [3072] 3072 3072 0
97 bert.encoder.layer.5.output.dense Linear weight [768, 3072] 2359296 2359296 0
98 bert.encoder.layer.5.output.dense Linear bias [768] 768 768 0
99 bert.encoder.layer.5.output.LayerNorm LayerNorm weight [768] 768 768 0
100 bert.encoder.layer.5.output.LayerNorm LayerNorm bias [768] 768 768 0
101 bert.encoder.layer.6.attention.self.query Linear weight [768, 768] 589824 589824 0
102 bert.encoder.layer.6.attention.self.query Linear bias [768] 768 768 0
103 bert.encoder.layer.6.attention.self.key Linear weight [768, 768] 589824 589824 0
104 bert.encoder.layer.6.attention.self.key Linear bias [768] 768 768 0
105 bert.encoder.layer.6.attention.self.value Linear weight [768, 768] 589824 589824 0
106 bert.encoder.layer.6.attention.self.value Linear bias [768] 768 768 0
107 bert.encoder.layer.6.attention.output.dense Linear weight [768, 768] 589824 589824 0
108 bert.encoder.layer.6.attention.output.dense Linear bias [768] 768 768 0
109 bert.encoder.layer.6.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
110 bert.encoder.layer.6.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
111 bert.encoder.layer.6.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
112 bert.encoder.layer.6.intermediate.dense Linear bias [3072] 3072 3072 0
113 bert.encoder.layer.6.output.dense Linear weight [768, 3072] 2359296 2359296 0
114 bert.encoder.layer.6.output.dense Linear bias [768] 768 768 0
115 bert.encoder.layer.6.output.LayerNorm LayerNorm weight [768] 768 768 0
116 bert.encoder.layer.6.output.LayerNorm LayerNorm bias [768] 768 768 0
117 bert.encoder.layer.7.attention.self.query Linear weight [768, 768] 589824 589824 0
118 bert.encoder.layer.7.attention.self.query Linear bias [768] 768 768 0
119 bert.encoder.layer.7.attention.self.key Linear weight [768, 768] 589824 589824 0
120 bert.encoder.layer.7.attention.self.key Linear bias [768] 768 768 0
121 bert.encoder.layer.7.attention.self.value Linear weight [768, 768] 589824 589824 0
122 bert.encoder.layer.7.attention.self.value Linear bias [768] 768 768 0
123 bert.encoder.layer.7.attention.output.dense Linear weight [768, 768] 589824 589824 0
124 bert.encoder.layer.7.attention.output.dense Linear bias [768] 768 768 0
125 bert.encoder.layer.7.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
126 bert.encoder.layer.7.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
127 bert.encoder.layer.7.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
128 bert.encoder.layer.7.intermediate.dense Linear bias [3072] 3072 3072 0
129 bert.encoder.layer.7.output.dense Linear weight [768, 3072] 2359296 2359296 0
130 bert.encoder.layer.7.output.dense Linear bias [768] 768 768 0
131 bert.encoder.layer.7.output.LayerNorm LayerNorm weight [768] 768 768 0
132 bert.encoder.layer.7.output.LayerNorm LayerNorm bias [768] 768 768 0
133 bert.encoder.layer.8.attention.self.query Linear weight [768, 768] 589824 589824 0
134 bert.encoder.layer.8.attention.self.query Linear bias [768] 768 768 0
135 bert.encoder.layer.8.attention.self.key Linear weight [768, 768] 589824 589824 0
136 bert.encoder.layer.8.attention.self.key Linear bias [768] 768 768 0
137 bert.encoder.layer.8.attention.self.value Linear weight [768, 768] 589824 589824 0
138 bert.encoder.layer.8.attention.self.value Linear bias [768] 768 768 0
139 bert.encoder.layer.8.attention.output.dense Linear weight [768, 768] 589824 589824 0
140 bert.encoder.layer.8.attention.output.dense Linear bias [768] 768 768 0
141 bert.encoder.layer.8.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
142 bert.encoder.layer.8.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
143 bert.encoder.layer.8.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
144 bert.encoder.layer.8.intermediate.dense Linear bias [3072] 3072 3072 0
145 bert.encoder.layer.8.output.dense Linear weight [768, 3072] 2359296 2359296 0
146 bert.encoder.layer.8.output.dense Linear bias [768] 768 768 0
147 bert.encoder.layer.8.output.LayerNorm LayerNorm weight [768] 768 768 0
148 bert.encoder.layer.8.output.LayerNorm LayerNorm bias [768] 768 768 0
149 bert.encoder.layer.9.attention.self.query Linear weight [768, 768] 589824 589824 0
150 bert.encoder.layer.9.attention.self.query Linear bias [768] 768 768 0
151 bert.encoder.layer.9.attention.self.key Linear weight [768, 768] 589824 589824 0
152 bert.encoder.layer.9.attention.self.key Linear bias [768] 768 768 0
153 bert.encoder.layer.9.attention.self.value Linear weight [768, 768] 589824 589824 0
154 bert.encoder.layer.9.attention.self.value Linear bias [768] 768 768 0
155 bert.encoder.layer.9.attention.output.dense Linear weight [768, 768] 589824 589824 0
156 bert.encoder.layer.9.attention.output.dense Linear bias [768] 768 768 0
157 bert.encoder.layer.9.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
158 bert.encoder.layer.9.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
159 bert.encoder.layer.9.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
160 bert.encoder.layer.9.intermediate.dense Linear bias [3072] 3072 3072 0
161 bert.encoder.layer.9.output.dense Linear weight [768, 3072] 2359296 2359296 0
162 bert.encoder.layer.9.output.dense Linear bias [768] 768 768 0
163 bert.encoder.layer.9.output.LayerNorm LayerNorm weight [768] 768 768 0
164 bert.encoder.layer.9.output.LayerNorm LayerNorm bias [768] 768 768 0
165 bert.encoder.layer.10.attention.self.query Linear weight [768, 768] 589824 589824 0
166 bert.encoder.layer.10.attention.self.query Linear bias [768] 768 768 0
167 bert.encoder.layer.10.attention.self.key Linear weight [768, 768] 589824 589824 0
168 bert.encoder.layer.10.attention.self.key Linear bias [768] 768 768 0
169 bert.encoder.layer.10.attention.self.value Linear weight [768, 768] 589824 589824 0
170 bert.encoder.layer.10.attention.self.value Linear bias [768] 768 768 0
171 bert.encoder.layer.10.attention.output.dense Linear weight [768, 768] 589824 589824 0
172 bert.encoder.layer.10.attention.output.dense Linear bias [768] 768 768 0
173 bert.encoder.layer.10.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
174 bert.encoder.layer.10.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
175 bert.encoder.layer.10.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
176 bert.encoder.layer.10.intermediate.dense Linear bias [3072] 3072 3072 0
177 bert.encoder.layer.10.output.dense Linear weight [768, 3072] 2359296 2359296 0
178 bert.encoder.layer.10.output.dense Linear bias [768] 768 768 0
179 bert.encoder.layer.10.output.LayerNorm LayerNorm weight [768] 768 768 0
180 bert.encoder.layer.10.output.LayerNorm LayerNorm bias [768] 768 768 0
181 bert.encoder.layer.11.attention.self.query Linear weight [768, 768] 589824 589824 0
182 bert.encoder.layer.11.attention.self.query Linear bias [768] 768 768 0
183 bert.encoder.layer.11.attention.self.key Linear weight [768, 768] 589824 589824 0
184 bert.encoder.layer.11.attention.self.key Linear bias [768] 768 768 0
185 bert.encoder.layer.11.attention.self.value Linear weight [768, 768] 589824 589824 0
186 bert.encoder.layer.11.attention.self.value Linear bias [768] 768 768 0
187 bert.encoder.layer.11.attention.output.dense Linear weight [768, 768] 589824 589824 0
188 bert.encoder.layer.11.attention.output.dense Linear bias [768] 768 768 0
189 bert.encoder.layer.11.attention.output.LayerNorm LayerNorm weight [768] 768 768 0
190 bert.encoder.layer.11.attention.output.LayerNorm LayerNorm bias [768] 768 768 0
191 bert.encoder.layer.11.intermediate.dense Linear weight [3072, 768] 2359296 2359296 0
192 bert.encoder.layer.11.intermediate.dense Linear bias [3072] 3072 3072 0
193 bert.encoder.layer.11.output.dense Linear weight [768, 3072] 2359296 2359296 0
194 bert.encoder.layer.11.output.dense Linear bias [768] 768 768 0
195 bert.encoder.layer.11.output.LayerNorm LayerNorm weight [768] 768 768 0
196 bert.encoder.layer.11.output.LayerNorm LayerNorm bias [768] 768 768 0
197 qa_outputs Linear weight [2, 768] 1536 1536 0
198 qa_outputs Linear bias [2] 2 2 0