Adrian Gabriel committed on
Commit
aa2ae84
·
1 Parent(s): f393371
Files changed (1) hide show
  1. models/current_code.py +66 -4
models/current_code.py CHANGED
@@ -46,7 +46,7 @@ def group(X):
46
  # group 2
47
  else:
48
  X_encoded[idx][1] = group_matmul.data + + E_feat.data[1]
49
- col = 0a
50
  box(f"grouping: group {col}", [group_window, group_matmul])
51
  idx += 1
52
  X_encoded_tensor = Tensor(X_encoded)
@@ -69,7 +69,7 @@ def label_embeddings(y_train):
69
  for (idx, row) in enumerate(y_train.data):
70
  res = Tensor((row)).matmul(W_y)
71
  lbl_embds[idx] = res.data
72
- box("test", [res], "5")
73
 
74
  return Tensor(lbl_embds)
75
 
@@ -118,6 +118,18 @@ scaling_factor = np.sqrt(4)
118
  col_att_softmax = Softmax()
119
 
120
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  def column_attention_inplace(E: Tensor):
122
  """
123
  In-place column attention:
@@ -140,15 +152,28 @@ def column_attention_inplace(E: Tensor):
140
  A = softmax.forward(scores, dim=-1) # (3,3)
141
  O = A.matmul(V) # (3,4)
142
 
143
- box("test", [Q, K, V, scores, A, O], "5")
144
 
145
  # In-place residual update of ALL tokens
146
  E.data[s] = E.data[s] + O.data
147
 
148
 
149
  column_attention_inplace(E)
 
150
  box("Updated Logits", E + 0, "5")
151
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  def row_attention_inplace(E: Tensor, single_eval_pos: int):
154
  """
@@ -179,5 +204,42 @@ def row_attention_inplace(E: Tensor, single_eval_pos: int):
179
  O = A.matmul(V) # (S, D)
180
 
181
  # In-place residual update for this token slot
182
- box("test", [Q, K, V, scores, A, O], "5")
183
  E.data[:, t, :] = E.data[:, t, :] + O.data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # group 2
47
  else:
48
  X_encoded[idx][1] = group_matmul.data + + E_feat.data[1]
49
+ col = 0
50
  box(f"grouping: group {col}", [group_window, group_matmul])
51
  idx += 1
52
  X_encoded_tensor = Tensor(X_encoded)
 
69
  for (idx, row) in enumerate(y_train.data):
70
  res = Tensor((row)).matmul(W_y)
71
  lbl_embds[idx] = res.data
72
+ box("Label Embeddings", [res], "5")
73
 
74
  return Tensor(lbl_embds)
75
 
 
118
  col_att_softmax = Softmax()
119
 
120
 
121
def layer_norm_inplace(E: Tensor, eps=1e-5):
    """
    In-place layer normalization over the last dimension D for every
    vector in E.

    E:   Tensor of shape (S, Ttok, D) — normalized in place.
    eps: small constant added to the variance for numerical stability.
    """
    x = E.data
    # Per-vector statistics over the feature dimension (keepdims so they
    # broadcast back against x).
    mean = x.mean(axis=-1, keepdims=True)
    var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
    x_norm = (x - mean) / np.sqrt(var + eps)
    # Fix: display label was misspelled "Layer norn".
    box("Layer norm", [Tensor(x), Tensor(mean), Tensor(var), Tensor(x_norm)], "7")
    # Write back through a slice assignment so E keeps its original buffer.
    E.data[:] = x_norm
133
  def column_attention_inplace(E: Tensor):
134
  """
135
  In-place column attention:
 
152
  A = softmax.forward(scores, dim=-1) # (3,3)
153
  O = A.matmul(V) # (3,4)
154
 
155
+ box("column_attention", [Q, K, V, scores, A, O], "5")
156
 
157
  # In-place residual update of ALL tokens
158
  E.data[s] = E.data[s] + O.data
159
 
160
 
161
# Column attention sub-layer: attend across token slots within each row,
# then layer-normalize, and display the updated embeddings.
column_attention_inplace(E)
layer_norm_inplace(E)
# `E + 0` makes a fresh Tensor so box() shows a snapshot, not the live buffer.
box("Updated Logits", E + 0, "5")
164
 
165
def mlp_inplace(E: Tensor):
    """
    Minimal hand-friendly MLP with a residual connection:

        x <- x + GELU(x)

    Mutates E in place; returns nothing.
    """
    activation = GELU()
    # Work on a copy so the activation cannot alias the residual read.
    pre_activation = Tensor(E.data.copy())
    activated = activation.forward(pre_activation).data
    # Residual add, written back through the existing buffer.
    E.data[:] = E.data + activated
176
+
177
 
178
  def row_attention_inplace(E: Tensor, single_eval_pos: int):
179
  """
 
204
  O = A.matmul(V) # (S, D)
205
 
206
  # In-place residual update for this token slot
207
+ box("row_attention", [Q, K, V, scores, A, O], "5")
208
  E.data[:, t, :] = E.data[:, t, :] + O.data
209
+
210
+
211
# Row attention sub-layer + LN.
row_attention_inplace(E, single_eval_pos=4)
layer_norm_inplace(E)


# 3) MLP + LN
mlp_inplace(E)  # x <- x + GELU(x)
layer_norm_inplace(E)

# ============================================================
# Readout: take test row label token -> logits
# In this layout: rows are [think1, think2, train1, train2, test1]
# test index = T + N_train = 4
# label token index = 2
# ============================================================

label_tok_idx = 2  # last token slot
test_row_idx = 4  # 4

# Pull the (D,) label-token vector for the test row and lift it to (1, 4).
h_test = Tensor(E.data[test_row_idx, label_tok_idx, :].reshape(1, 4))  # (1,4)

gelu = GELU()
z = gelu.forward(h_test)  # (1,4)

# Simple head D->C (pick first 2 dims as logits)
W_out = Tensor([[1, 0],
                [0, 1],
                [0, 0],
                [0, 0]])  # (4,2)
b_out = Tensor([0.0, 0.0])

logits = z.matmul(W_out) + b_out  # (1,2)

print("h_test:", h_test.data)
print("z (GELU):", z.data)
print("logits:", logits.data)