Spaces:
Running
Running
Adrian Gabriel
committed on
Commit
·
aa2ae84
1
Parent(s):
f393371
Bugfix
Browse files- models/current_code.py +66 -4
models/current_code.py
CHANGED
|
@@ -46,7 +46,7 @@ def group(X):
|
|
| 46 |
# group 2
|
| 47 |
else:
|
| 48 |
X_encoded[idx][1] = group_matmul.data + + E_feat.data[1]
|
| 49 |
-
col =
|
| 50 |
box(f"grouping: group {col}", [group_window, group_matmul])
|
| 51 |
idx += 1
|
| 52 |
X_encoded_tensor = Tensor(X_encoded)
|
|
@@ -69,7 +69,7 @@ def label_embeddings(y_train):
|
|
| 69 |
for (idx, row) in enumerate(y_train.data):
|
| 70 |
res = Tensor((row)).matmul(W_y)
|
| 71 |
lbl_embds[idx] = res.data
|
| 72 |
-
box("
|
| 73 |
|
| 74 |
return Tensor(lbl_embds)
|
| 75 |
|
|
@@ -118,6 +118,18 @@ scaling_factor = np.sqrt(4)
|
|
| 118 |
col_att_softmax = Softmax()
|
| 119 |
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
def column_attention_inplace(E: Tensor):
|
| 122 |
"""
|
| 123 |
In-place column attention:
|
|
@@ -140,15 +152,28 @@ def column_attention_inplace(E: Tensor):
|
|
| 140 |
A = softmax.forward(scores, dim=-1) # (3,3)
|
| 141 |
O = A.matmul(V) # (3,4)
|
| 142 |
|
| 143 |
-
box("
|
| 144 |
|
| 145 |
# In-place residual update of ALL tokens
|
| 146 |
E.data[s] = E.data[s] + O.data
|
| 147 |
|
| 148 |
|
| 149 |
column_attention_inplace(E)
|
|
|
|
| 150 |
box("Updated Logits", E + 0, "5")
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
def row_attention_inplace(E: Tensor, single_eval_pos: int):
|
| 154 |
"""
|
|
@@ -179,5 +204,42 @@ def row_attention_inplace(E: Tensor, single_eval_pos: int):
|
|
| 179 |
O = A.matmul(V) # (S, D)
|
| 180 |
|
| 181 |
# In-place residual update for this token slot
|
| 182 |
-
box("
|
| 183 |
E.data[:, t, :] = E.data[:, t, :] + O.data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
# group 2
|
| 47 |
else:
|
| 48 |
X_encoded[idx][1] = group_matmul.data + + E_feat.data[1]
|
| 49 |
+
col = 0
|
| 50 |
box(f"grouping: group {col}", [group_window, group_matmul])
|
| 51 |
idx += 1
|
| 52 |
X_encoded_tensor = Tensor(X_encoded)
|
|
|
|
| 69 |
for (idx, row) in enumerate(y_train.data):
|
| 70 |
res = Tensor((row)).matmul(W_y)
|
| 71 |
lbl_embds[idx] = res.data
|
| 72 |
+
box("Label Embeddings", [res], "5")
|
| 73 |
|
| 74 |
return Tensor(lbl_embds)
|
| 75 |
|
|
|
|
| 118 |
col_att_softmax = Softmax()
|
| 119 |
|
| 120 |
|
| 121 |
+
def layer_norm_inplace(E: Tensor, eps=1e-5):
    """
    In-place layer normalization over the last dimension D of E.

    For every vector along the last axis, subtract its mean and divide by
    sqrt(variance + eps). E.data is overwritten with the normalized values;
    nothing is returned.

    Args:
        E: Tensor whose .data is an ndarray; assumed (S, Ttok, D) — TODO confirm.
        eps: small constant added to the variance for numerical stability.
    """
    x = E.data
    mean = x.mean(axis=-1, keepdims=True)
    # Population (biased) variance over the last axis, kept broadcastable.
    var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
    x_norm = (x - mean) / np.sqrt(var + eps)
    # Bugfix: display label was misspelled "Layer norn".
    box("Layer norm", [Tensor(x), Tensor(mean), Tensor(var), Tensor(x_norm)], "7")
    E.data[:] = x_norm
|
| 132 |
+
|
| 133 |
def column_attention_inplace(E: Tensor):
|
| 134 |
"""
|
| 135 |
In-place column attention:
|
|
|
|
| 152 |
A = softmax.forward(scores, dim=-1) # (3,3)
|
| 153 |
O = A.matmul(V) # (3,4)
|
| 154 |
|
| 155 |
+
box("column_attention", [Q, K, V, scores, A, O], "5")
|
| 156 |
|
| 157 |
# In-place residual update of ALL tokens
|
| 158 |
E.data[s] = E.data[s] + O.data
|
| 159 |
|
| 160 |
|
| 161 |
column_attention_inplace(E)
|
| 162 |
+
layer_norm_inplace(E)
|
| 163 |
box("Updated Logits", E + 0, "5")
|
| 164 |
|
| 165 |
+
def mlp_inplace(E: Tensor):
    """
    Residual GELU update, applied in place:

        x <- x + GELU(x)

    A deliberately minimal, hand-checkable stand-in for a transformer MLP
    block. E.data is modified in place; nothing is returned.
    """
    activation = GELU()
    # Run the activation on a copy so it sees a snapshot of E's buffer.
    pre_act = Tensor(E.data.copy())
    residual = activation.forward(pre_act).data
    E.data[:] = E.data + residual
|
| 175 |
+
|
| 176 |
+
|
| 177 |
|
| 178 |
def row_attention_inplace(E: Tensor, single_eval_pos: int):
|
| 179 |
"""
|
|
|
|
| 204 |
O = A.matmul(V) # (S, D)
|
| 205 |
|
| 206 |
# In-place residual update for this token slot
|
| 207 |
+
box("row_attention", [Q, K, V, scores, A, O], "5")
|
| 208 |
E.data[:, t, :] = E.data[:, t, :] + O.data
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# Row attention followed by a LayerNorm over the updated embeddings.
row_attention_inplace(E, single_eval_pos=4)
layer_norm_inplace(E)

# 3) Feed-forward (residual GELU) block, again followed by LayerNorm.
mlp_inplace(E)
layer_norm_inplace(E)

# ------------------------------------------------------------
# Readout: pull the label token of the test row and map it to logits.
# Row layout here is [think1, think2, train1, train2, test1], so the
# test row sits at index T + N_train = 4 and its label token at slot 2.
# ------------------------------------------------------------
test_row_idx = 4
label_tok_idx = 2

# (1, 4) slice holding the test row's label-token embedding.
h_test = Tensor(E.data[test_row_idx, label_tok_idx, :].reshape(1, 4))

gelu = GELU()
z = gelu.forward(h_test)

# Identity-style head (4 -> 2): the first two embedding dims become logits.
W_out = Tensor([[1, 0], [0, 1], [0, 0], [0, 0]])
b_out = Tensor([0.0, 0.0])
logits = z.matmul(W_out) + b_out

print("h_test:", h_test.data)
print("z (GELU):", z.data)
print("logits:", logits.data)
|