File size: 7,151 Bytes
30bbad7
 
b0b05a2
30bbad7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
 
30bbad7
b0b05a2
30bbad7
 
 
b0b05a2
30bbad7
 
b0b05a2
30bbad7
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
30bbad7
b0b05a2
30bbad7
b0b05a2
30bbad7
b0b05a2
aa2ae84
f393371
30bbad7
 
 
 
 
 
b0b05a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa2ae84
b0b05a2
 
 
 
 
 
30bbad7
 
b0b05a2
 
 
 
30bbad7
b0b05a2
 
 
 
 
30bbad7
 
 
 
 
 
 
 
b0b05a2
30bbad7
 
 
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
30bbad7
 
b0b05a2
 
 
 
aa2ae84
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa2ae84
f5a78dd
b0b05a2
 
 
 
 
aa2ae84
f5a78dd
b0b05a2
aa2ae84
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
f5a78dd
b0b05a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30bbad7
b0b05a2
 
 
30bbad7
b0b05a2
 
 
30bbad7
b0b05a2
aa2ae84
b0b05a2
aa2ae84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import math

import numpy
import numpy as np

# TabPFN
# Hand-traceable, toy-sized walkthrough of a TabPFN-style forward pass.

# training data: 2 train samples with 4 features each, 1 test sample
X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
Y_train = Tensor([1, 0])
X_test = Tensor([[9, 10, 11, 12]])

box("X_train", [X_train, Y_train, X_test], "1")

# Feature Encoder - Feature Embeddings
# W_enc (used transposed below) maps a 2-feature group to a 4-dim embedding.
W_enc = Tensor([[1, 0.5], [0.5, 1], [0.3, 0.7], [0.7, 0.3]])
W_enc_transpose = W_enc.transpose()
# One bias row per sample: train1, train2, test (indexed by sample in group()).
b_enc = Tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]])

box("Feature Encoder", W_enc_transpose, "2")

# Feature/group embeddings: one 4-dim additive offset per feature group.
E_feat = Tensor([[0.1, 0.0, 0.0, 0.0], [0.0, 0.1, 0.0, 0.0]])

box("Group embedding", E_feat, "6")

# Step 1: Combine Training and Test Samples
# (fixed accidental duplicated assignment: `X_combined = X_combined = ...`)
X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
box("Training and Test Samples grouped", X_combined, "4")
def group(X):
    """Encode each sample by splitting its 4 features into 2 groups of 2.

    Every consecutive 2-feature window of a row is projected through the
    shared feature encoder (W_enc_transpose plus the sample's b_enc bias row)
    and offset by its group embedding from E_feat.

    Parameters:
        X: Tensor of shape (n_samples, 4) -- samples to encode.

    Returns:
        Tensor of shape (n_samples, 2, 4): per-sample group encodings.
    """
    # Size the output from the input instead of hard-coding 3 samples.
    X_encoded = np.zeros((X.shape[0], 2, 4))  # (samples, groups, embed dim)
    for sample_idx, row in enumerate(X.data):
        col = 0
        for rt_ptr in range(0, len(row), 2):
            group_window = Tensor(row[rt_ptr:rt_ptr + 2])
            group_matmul = group_window.matmul(W_enc_transpose) + b_enc[sample_idx]
            if col == 0:
                # group 1: first feature pair
                X_encoded[sample_idx][0] = group_matmul.data + E_feat.data[0]
                label = 1
                col = 1
            else:
                # group 2: second feature pair
                X_encoded[sample_idx][1] = group_matmul.data + E_feat.data[1]
                label = 2
                col = 0
            # Label the groups 1/2 explicitly (the original printed the
            # already-toggled `col`, mislabelling the second group "group 0").
            box(f"grouping: group {label}", [group_window, group_matmul])
    return Tensor(X_encoded)


X_encoded = group(X_combined)
box("X_encoded", X_encoded, "4")

# Label Encoder - Label Embeddings
# W_y maps a 2-dim label vector to a 4-dim embedding; b_y is a zero bias.
W_y = Tensor([[1, -1, 0, 0], [0, 0, 1, 1]])
b_y = Tensor([0, 0, 0, 0])
y_padded = Tensor([1, 0, np.nan])  # we want to mask y_test with nan
# Reshape to one 2-dim label row per sample (train1, train2, test).
y_clean = Tensor([[1, 0, 0], [0, 0, 1]]).reshape(3, 2)
box("y_clean", y_clean, "4")


def label_embeddings(y_train):
    """Project each label row through the label encoder W_y.

    Parameters:
        y_train: Tensor of shape (n_samples, 2) -- per-sample label vectors.

    Returns:
        Tensor of shape (n_samples, W_y.shape[1]): one embedding per sample.
    """
    # Size the output from the input instead of hard-coding (3, 4).
    lbl_embds = np.zeros((len(y_train.data), W_y.shape[1]))
    for idx, row in enumerate(y_train.data):
        res = Tensor(row).matmul(W_y)
        lbl_embds[idx] = res.data
        box("Label Embeddings", [res], "5")

    return Tensor(lbl_embds)


label_embeds = label_embeddings(y_clean)
# print(label_embeds)

# Step 3: Add Thinking Tokens
# Two extra "thinking" items prepended to the sequence below; each repeats
# a single 4-dim vector across all 3 token slots.
Thinking_Tokens = Tensor([
    [[0.01, 0.02, 0.03, 0.04],
     [0.01, 0.02, 0.03, 0.04],
     [0.01, 0.02, 0.03, 0.04]],

    [[0.05, 0.06, 0.07, 0.08],
     [0.05, 0.06, 0.07, 0.08],
     [0.05, 0.06, 0.07, 0.08]]
])
box("Thinking Tokens", Thinking_Tokens, "4")

# Computing full model input
# Sequence layout along axis 0: [think1, think2, train1, train2, test1];
# each item carries 3 token slots: [group1, group2, label].

labels_reshaped = label_embeds.data.reshape(3, 1, 4)
data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
E = Tensor(E_numpy)

# we need to adapt positional embeddings!
# Create row positional embeddings: one 4-dim vector per token slot, shape
# (1, 3, 4) so it applies to every item (assumes NumPy-style broadcasting
# in Tensor addition -- confirm).
P_col_pos_embeds = Tensor([[[0.1, 0.1, 0.1, 0.1],
                            [0.2, 0.2, 0.2, 0.2],
                            [0.3, 0.3, 0.3, 0.3]]])

# Add positional embeddings
E = E + P_col_pos_embeds
box("Positional Embedding", E, "9")

# Attention
# Diagonal projection matrices keep the hand calculations easy to follow.
W_q = Tensor(np.diag([0.1, 0.2, 0.1, 0.2]))
W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
W_v = Tensor(np.diag([1, 1, 1, 1]))

box("Attention weights", [W_q, W_k, W_v], "9")
# NOTE(review): scaling_factor appears unused -- the attention helpers below
# compute math.sqrt(D) themselves; confirm before removing.
scaling_factor = np.sqrt(4)

# labels = [E[1][2], E[2][2], E[2][2]]
# NOTE(review): col_att_softmax also looks unused (each helper constructs its
# own Softmax); verify against any code outside this view.
col_att_softmax = Softmax()


def layer_norm_inplace(E: Tensor, eps=1e-5):
    """
    In-place LN over last dim D for every vector in E.
    E: (S, Ttok, D)
    eps: numerical-stability constant added to the variance before sqrt.
    """
    x = E.data
    mean = x.mean(axis=-1, keepdims=True)
    # Biased (population) variance over the last axis.
    var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
    x_norm = (x - mean) / np.sqrt(var + eps)
    # Fixed typo in the box label ("Layer norn" -> "Layer norm").
    box("Layer norm", [Tensor(x), Tensor(mean), Tensor(var), Tensor(x_norm)], "7")
    E.data[:] = x_norm

def column_attention_inplace(E: Tensor):
    """
    In-place column attention.

    For each item s, the slice E[s] of shape (Ttok=3, D=4) self-attends
    across its 3 tokens, and the result is added back residually:
        E[s] <- E[s] + Attn(E[s])
    """
    num_items, _, dim = E.shape
    attn_softmax = Softmax()

    for item in range(num_items):
        # Work on a snapshot so the residual write cannot feed back mid-compute.
        tokens = Tensor(E.data[item].copy())  # (3, 4)

        queries = tokens.matmul(W_q.transpose())  # (3, 4)
        keys = tokens.matmul(W_k.transpose())     # (3, 4)
        values = tokens.matmul(W_v.transpose())   # (3, 4)

        scores = queries.matmul(keys.transpose()) / math.sqrt(dim)  # (3, 3)
        weights = attn_softmax.forward(scores, dim=-1)              # (3, 3)
        attended = weights.matmul(values)                           # (3, 4)

        box("column_attention", [queries, keys, values, scores, weights, attended], "5")

        # Residual, in-place update of every token of this item.
        E.data[item] = E.data[item] + attended.data


# One column-attention layer followed by layer norm (both in place).
column_attention_inplace(E)
layer_norm_inplace(E)
box("Updated Logits", E + 0, "5")  # E + 0 presumably yields a fresh Tensor copy -- confirm Tensor.__add__

def mlp_inplace(E: Tensor):
    """
    Minimal hand-friendly MLP with a residual connection:
        x <- x + GELU(x)
    Updates E in place.
    """
    activation = GELU()
    # Snapshot first so the activation is computed on pre-update values.
    snapshot = Tensor(E.data.copy())
    E.data[:] = E.data + activation.forward(snapshot).data



def row_attention_inplace(E: Tensor, single_eval_pos: int):
    """
    In-place row attention.

    For each token slot t:
      Q from all S items:      E[:, t, :] -> (S, D)
      K,V from first Klen rows E[:single_eval_pos, t, :] -> (Klen, D)
    Writes:
      E[:, t, :] <- E[:, t, :] + Attn_row(E[:, t, :])

    Raises:
        ValueError: if single_eval_pos is not in (0, S].
    """
    S, Ttok, D = E.shape
    softmax = Softmax()

    Klen = single_eval_pos
    # Validate with a real exception: `assert` is stripped under `python -O`.
    if not 0 < Klen <= S:
        raise ValueError("single_eval_pos must be between 1 and S")

    for t in range(Ttok):
        # Snapshot streams (avoid in-place mixing)
        X_all = Tensor(E.data[:, t, :].copy())  # (S, D)
        X_kv = Tensor(E.data[:Klen, t, :].copy())  # (Klen, D)

        Q = X_all.matmul(W_q.transpose())  # (S, D)
        K = X_kv.matmul(W_k.transpose())  # (Klen, D)
        V = X_kv.matmul(W_v.transpose())  # (Klen, D)

        scores = Q.matmul(K.transpose()) / math.sqrt(D)  # (S, Klen)
        A = softmax.forward(scores, dim=-1)  # (S, Klen)
        O = A.matmul(V)  # (S, D)

        # In-place residual update for this token slot
        box("row_attention", [Q, K, V, scores, A, O], "5")
        E.data[:, t, :] = E.data[:, t, :] + O.data


# single_eval_pos=4: queries come from all 5 items, but keys/values only from
# the first 4 (thinking tokens + train rows), so the test row cannot attend
# to itself via K/V.
row_attention_inplace(E, single_eval_pos=4)
layer_norm_inplace(E)


# 3) MLP + LN
mlp_inplace(E)          # x <- x + GELU(x)
layer_norm_inplace(E)

# ============================================================
# Readout: take test row label token -> logits
# In this layout: rows are [think1, think2, train1, train2, test1]
# test index = T + N_train = 4
# label token index = 2
# ============================================================

test_row_idx = 4       # last item in the sequence (the test sample)
label_tok_idx = 2                 # last token slot

h_test = Tensor(E.data[test_row_idx, label_tok_idx, :].reshape(1, 4))  # (1,4)

gelu = GELU()
z = gelu.forward(h_test)  # (1,4)

# Simple head D->C (pick first 2 dims as logits)
W_out = Tensor([[1, 0],
                [0, 1],
                [0, 0],
                [0, 0]])  # (4,2)
b_out = Tensor([0.0, 0.0])

logits = z.matmul(W_out) + b_out  # (1,2)

print("h_test:", h_test.data)
print("z (GELU):", z.data)
print("logits:", logits.data)