Spaces:
Sleeping
Sleeping
Adrian Gabriel committed on
Commit ·
b0b05a2
1
Parent(s): 30bbad7
Latest additions to TabPFN
Browse files- README.md +2 -0
- models/current_code.py +103 -35
README.md
CHANGED
|
@@ -23,6 +23,8 @@ Based on [TinyTorch](https://mlsysbook.ai/tinytorch/intro.html) and the educatio
|
|
| 23 |
### Or run locally:
|
| 24 |
```bash
|
| 25 |
cd /path/to/TinyTorch
|
|
|
|
|
|
|
| 26 |
uv run uvicorn app:app --host 0.0.0.0 --port 8000
|
| 27 |
```
|
| 28 |
|
|
|
|
| 23 |
### Or run locally:
|
| 24 |
```bash
|
| 25 |
cd /path/to/TinyTorch
|
| 26 |
+
uv sync
|
| 27 |
+
source .venv/bin/activate
|
| 28 |
uv run uvicorn app:app --host 0.0.0.0 --port 8000
|
| 29 |
```
|
| 30 |
|
models/current_code.py
CHANGED
|
@@ -1,13 +1,11 @@
|
|
| 1 |
import numpy
|
| 2 |
|
| 3 |
-
# TabPFN
|
| 4 |
|
| 5 |
# training data
|
| 6 |
-
|
| 7 |
X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
|
| 8 |
Y_train = Tensor([1, 0])
|
| 9 |
X_test = Tensor([[9, 10, 11, 12]])
|
| 10 |
-
Y_test = Tensor([0])
|
| 11 |
|
| 12 |
box("X_train", [X_train, Y_train, X_test], "1")
|
| 13 |
|
|
@@ -18,31 +16,17 @@ b_enc = Tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]
|
|
| 18 |
|
| 19 |
box("Feature Encoder", W_enc_transpose, "2")
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
-
b_y = Tensor([0, 0, 0, 0])
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def label_embeddings(y_train):
|
| 27 |
-
lbl_embds = np.zeros((3, 4))
|
| 28 |
-
for (idx, row) in enumerate(y_train):
|
| 29 |
-
res = row.data * W_y.data
|
| 30 |
-
lbl_embds[idx] = res
|
| 31 |
-
|
| 32 |
-
return Tensor(lbl_embds)
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
y_stacked = Tensor(np.hstack((Y_train.data, Y_test.data)))
|
| 36 |
-
label_embeds = label_embeddings(y_stacked)
|
| 37 |
|
| 38 |
-
box("
|
| 39 |
|
| 40 |
# Step 1: Combine Training and Test Samples
|
| 41 |
X_combined = X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
|
| 42 |
-
box("Training and Test Samples", X_combined, "4")
|
| 43 |
|
| 44 |
|
| 45 |
-
# Step 1:
|
| 46 |
|
| 47 |
def group(X):
|
| 48 |
groups = X.shape[0] * W_enc.shape[1]
|
|
@@ -55,11 +39,13 @@ def group(X):
|
|
| 55 |
for rt_ptr in range(0, len(row), 2):
|
| 56 |
group_window = Tensor(row[rt_ptr:rt_ptr + 2])
|
| 57 |
group_matmul = group_window.matmul(W_enc_transpose) + b_enc[group_idx]
|
|
|
|
| 58 |
if col == 0:
|
| 59 |
-
X_encoded[idx][0] = group_matmul.data
|
| 60 |
col = 1
|
|
|
|
| 61 |
else:
|
| 62 |
-
X_encoded[idx][1] = group_matmul.data
|
| 63 |
col = 0
|
| 64 |
idx += 1
|
| 65 |
X_encoded_tensor = Tensor(X_encoded)
|
|
@@ -67,24 +53,48 @@ def group(X):
|
|
| 67 |
|
| 68 |
|
| 69 |
X_encoded = group(X_combined)
|
| 70 |
-
box("X_encoded",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# Step 3: Add Thinking Tokens
|
| 73 |
-
Thinking_Tokens = Tensor([
|
| 74 |
-
|
| 75 |
-
|
|
|
|
| 76 |
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# Computing full model input
|
| 80 |
|
| 81 |
-
|
| 82 |
labels_reshaped = label_embeds.data.reshape(3, 1, 4)
|
| 83 |
data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
|
| 84 |
E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
|
| 85 |
E = Tensor(E_numpy)
|
| 86 |
-
# print(E)
|
| 87 |
|
|
|
|
| 88 |
# Create row positional embeddings
|
| 89 |
P_col_pos_embeds = Tensor([[[0.1, 0.1, 0.1, 0.1],
|
| 90 |
[0.2, 0.2, 0.2, 0.2],
|
|
@@ -99,12 +109,70 @@ W_q = Tensor(np.diag([0.1, 0.2, 0.1, 0.2]))
|
|
| 99 |
W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
|
| 100 |
W_v = Tensor(np.diag([1, 1, 1, 1]))
|
| 101 |
|
|
|
|
| 102 |
scaling_factor = np.sqrt(4)
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
|
|
|
| 107 |
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
|
|
|
| 1 |
import numpy
|
| 2 |
|
| 3 |
+
# TabPFN
|
| 4 |
|
| 5 |
# training data
|
|
|
|
| 6 |
X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
|
| 7 |
Y_train = Tensor([1, 0])
|
| 8 |
X_test = Tensor([[9, 10, 11, 12]])
|
|
|
|
| 9 |
|
| 10 |
box("X_train", [X_train, Y_train, X_test], "1")
|
| 11 |
|
|
|
|
| 16 |
|
| 17 |
box("Feature Encoder", W_enc_transpose, "2")
|
| 18 |
|
| 19 |
+
# Feature/group embeddings
|
| 20 |
+
E_feat = Tensor([[0.1, 0.0, 0.0, 0.0], [0.0, 0.1, 0.0, 0.0]])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
box("Group embedding", E_feat, "6")
|
| 23 |
|
| 24 |
# Step 1: Combine Training and Test Samples
|
| 25 |
X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
|
| 26 |
+
box("Training and Test Samples grouped", X_combined, "4")
|
| 27 |
|
| 28 |
|
| 29 |
+
# Step 1: Group Features
|
| 30 |
|
| 31 |
def group(X):
|
| 32 |
groups = X.shape[0] * W_enc.shape[1]
|
|
|
|
| 39 |
for rt_ptr in range(0, len(row), 2):
|
| 40 |
group_window = Tensor(row[rt_ptr:rt_ptr + 2])
|
| 41 |
group_matmul = group_window.matmul(W_enc_transpose) + b_enc[group_idx]
|
| 42 |
+
# group 1
|
| 43 |
if col == 0:
|
| 44 |
+
X_encoded[idx][0] = group_matmul.data + E_feat.data[0]
|
| 45 |
col = 1
|
| 46 |
+
# group 2
|
| 47 |
else:
|
| 48 |
+
X_encoded[idx][1] = group_matmul.data + E_feat.data[1]
|
| 49 |
col = 0
|
| 50 |
idx += 1
|
| 51 |
X_encoded_tensor = Tensor(X_encoded)
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
X_encoded = group(X_combined)
|
| 56 |
+
box("X_encoded", X_encoded, "4")
|
| 57 |
+
|
| 58 |
+
# Label Encoder - Label Embeddings
|
| 59 |
+
W_y = Tensor([[1, -1, 0, 0], [0, 0, 1, 1]])
|
| 60 |
+
b_y = Tensor([0, 0, 0, 0])
|
| 61 |
+
y_padded = Tensor([1, 0, np.nan]) # we want to mask y_test with nan
|
| 62 |
+
y_clean = Tensor([[1, 0, 0], [0, 0, 1]]).reshape(3, 2)
|
| 63 |
+
box("y_clean", y_clean, "4")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def label_embeddings(y_train):
|
| 67 |
+
lbl_embds = np.zeros((3, 4))
|
| 68 |
+
for (idx, row) in enumerate(y_train.data):
|
| 69 |
+
res = Tensor((row)).matmul(W_y)
|
| 70 |
+
lbl_embds[idx] = res.data
|
| 71 |
+
|
| 72 |
+
return Tensor(lbl_embds)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
label_embeds = label_embeddings(y_clean)
|
| 76 |
+
# print(label_embeds)
|
| 77 |
|
| 78 |
# Step 3: Add Thinking Tokens
|
| 79 |
+
Thinking_Tokens = Tensor([
|
| 80 |
+
[[0.01, 0.02, 0.03, 0.04],
|
| 81 |
+
[0.01, 0.02, 0.03, 0.04],
|
| 82 |
+
[0.01, 0.02, 0.03, 0.04]],
|
| 83 |
|
| 84 |
+
[[0.05, 0.06, 0.07, 0.08],
|
| 85 |
+
[0.05, 0.06, 0.07, 0.08],
|
| 86 |
+
[0.05, 0.06, 0.07, 0.08]]
|
| 87 |
+
])
|
| 88 |
+
box("Thinking Tokens", Thinking_Tokens, "4")
|
| 89 |
|
| 90 |
# Computing full model input
|
| 91 |
|
|
|
|
| 92 |
labels_reshaped = label_embeds.data.reshape(3, 1, 4)
|
| 93 |
data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
|
| 94 |
E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
|
| 95 |
E = Tensor(E_numpy)
|
|
|
|
| 96 |
|
| 97 |
+
# we need to adapt positional embeddings!
|
| 98 |
# Create row positional embeddings
|
| 99 |
P_col_pos_embeds = Tensor([[[0.1, 0.1, 0.1, 0.1],
|
| 100 |
[0.2, 0.2, 0.2, 0.2],
|
|
|
|
| 109 |
W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
|
| 110 |
W_v = Tensor(np.diag([1, 1, 1, 1]))
|
| 111 |
|
| 112 |
+
box("Attention weights", [W_q, W_k, W_v], "9")
|
| 113 |
scaling_factor = np.sqrt(4)
|
| 114 |
|
| 115 |
+
# labels = [E[1][2], E[2][2], E[2][2]]
|
| 116 |
+
col_att_softmax = Softmax()
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def column_attention_inplace(E: Tensor):
|
| 120 |
+
"""
|
| 121 |
+
In-place column attention:
|
| 122 |
+
For each item s: X = E[s] has shape (Ttok=3, D=4)
|
| 123 |
+
Does self-attention across the 3 tokens and writes back:
|
| 124 |
+
E[s] <- E[s] + Attn(E[s])
|
| 125 |
+
"""
|
| 126 |
+
S, Ttok, D = E.shape
|
| 127 |
+
softmax = Softmax()
|
| 128 |
+
|
| 129 |
+
for s in range(S):
|
| 130 |
+
# Snapshot of current item (avoid in-place mixing during compute)
|
| 131 |
+
X = Tensor(E.data[s].copy()) # (3,4)
|
| 132 |
+
|
| 133 |
+
Q = X.matmul(W_q.transpose()) # (3,4)
|
| 134 |
+
K = X.matmul(W_k.transpose()) # (3,4)
|
| 135 |
+
V = X.matmul(W_v.transpose()) # (3,4)
|
| 136 |
+
|
| 137 |
+
scores = Q.matmul(K.transpose()) / math.sqrt(D) # (3,3)
|
| 138 |
+
A = softmax.forward(scores, dim=-1) # (3,3)
|
| 139 |
+
O = A.matmul(V) # (3,4)
|
| 140 |
+
|
| 141 |
+
# In-place residual update of ALL tokens
|
| 142 |
+
E.data[s] = E.data[s] + O.data
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
column_attention_inplace(E)
|
| 146 |
+
box("Updated Logits", E, "5")
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def row_attention_inplace(E: Tensor, W_q: Tensor, W_k: Tensor, W_v: Tensor, single_eval_pos: int):
|
| 150 |
+
"""
|
| 151 |
+
In-place row attention:
|
| 152 |
+
For each token slot t:
|
| 153 |
+
Q from all S items: E[:, t, :] -> (S, D)
|
| 154 |
+
K,V from first Klen rows E[:single_eval_pos, t, :] -> (Klen, D)
|
| 155 |
+
Writes:
|
| 156 |
+
E[:, t, :] <- E[:, t, :] + Attn_row(E[:, t, :])
|
| 157 |
+
"""
|
| 158 |
+
S, Ttok, D = E.shape
|
| 159 |
+
softmax = Softmax()
|
| 160 |
+
|
| 161 |
+
Klen = single_eval_pos
|
| 162 |
+
assert 0 < Klen <= S, "single_eval_pos must be between 1 and S"
|
| 163 |
+
|
| 164 |
+
for t in range(Ttok):
|
| 165 |
+
# Snapshot streams (avoid in-place mixing)
|
| 166 |
+
X_all = Tensor(E.data[:, t, :].copy()) # (S, D)
|
| 167 |
+
X_kv = Tensor(E.data[:Klen, t, :].copy()) # (Klen, D)
|
| 168 |
|
| 169 |
+
Q = X_all.matmul(W_q.transpose()) # (S, D)
|
| 170 |
+
K = X_kv.matmul(W_k.transpose()) # (Klen, D)
|
| 171 |
+
V = X_kv.matmul(W_v.transpose()) # (Klen, D)
|
| 172 |
|
| 173 |
+
scores = Q.matmul(K.transpose()) / math.sqrt(D) # (S, Klen)
|
| 174 |
+
A = softmax.forward(scores, dim=-1) # (S, Klen)
|
| 175 |
+
O = A.matmul(V) # (S, D)
|
| 176 |
|
| 177 |
+
# In-place residual update for this token slot
|
| 178 |
+
E.data[:, t, :] = E.data[:, t, :] + O.data
|