Adrian Gabriel committed on
Commit
b0b05a2
·
1 Parent(s): 30bbad7

Latest additions to TabPFN

Browse files
Files changed (2) hide show
  1. README.md +2 -0
  2. models/current_code.py +103 -35
README.md CHANGED
@@ -23,6 +23,8 @@ Based on [TinyTorch](https://mlsysbook.ai/tinytorch/intro.html) and the educatio
23
  ### Or run locally:
24
  ```bash
25
  cd /path/to/TinyTorch
 
 
26
  uv run uvicorn app:app --host 0.0.0.0 --port 8000
27
  ```
28
 
 
23
  ### Or run locally:
24
  ```bash
25
  cd /path/to/TinyTorch
26
+ uv sync
27
+ source .venv/bin/activate
28
  uv run uvicorn app:app --host 0.0.0.0 --port 8000
29
  ```
30
 
models/current_code.py CHANGED
@@ -1,13 +1,11 @@
1
  import numpy
2
 
3
- # TabPFN toy example
4
 
5
  # training data
6
-
7
  X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
8
  Y_train = Tensor([1, 0])
9
  X_test = Tensor([[9, 10, 11, 12]])
10
- Y_test = Tensor([0])
11
 
12
  box("X_train", [X_train, Y_train, X_test], "1")
13
 
@@ -18,31 +16,17 @@ b_enc = Tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]
18
 
19
  box("Feature Encoder", W_enc_transpose, "2")
20
 
21
- # Label Encoder - Label Embeddings
22
- W_y = Tensor([[1], [-1], [0], [0]]).reshape(1, 4)
23
- b_y = Tensor([0, 0, 0, 0])
24
-
25
-
26
- def label_embeddings(y_train):
27
- lbl_embds = np.zeros((3, 4))
28
- for (idx, row) in enumerate(y_train):
29
- res = row.data * W_y.data
30
- lbl_embds[idx] = res
31
-
32
- return Tensor(lbl_embds)
33
-
34
-
35
- y_stacked = Tensor(np.hstack((Y_train.data, Y_test.data)))
36
- label_embeds = label_embeddings(y_stacked)
37
 
38
- box("label Encoder", [W_y, label_embeds], "3")
39
 
40
  # Step 1: Combine Training and Test Samples
41
  X_combined = X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
42
- box("Training and Test Samples", X_combined, "4")
43
 
44
 
45
- # Step 1: Tokenization - Group Features
46
 
47
  def group(X):
48
  groups = X.shape[0] * W_enc.shape[1]
@@ -55,11 +39,13 @@ def group(X):
55
  for rt_ptr in range(0, len(row), 2):
56
  group_window = Tensor(row[rt_ptr:rt_ptr + 2])
57
  group_matmul = group_window.matmul(W_enc_transpose) + b_enc[group_idx]
 
58
  if col == 0:
59
- X_encoded[idx][0] = group_matmul.data
60
  col = 1
 
61
  else:
62
- X_encoded[idx][1] = group_matmul.data
63
  col = 0
64
  idx += 1
65
  X_encoded_tensor = Tensor(X_encoded)
@@ -67,24 +53,48 @@ def group(X):
67
 
68
 
69
  X_encoded = group(X_combined)
70
- box("X_encoded", [X_encoded], "4")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  # Step 3: Add Thinking Tokens
73
- Thinking_Tokens = Tensor([[[0, 0, 0, 0],
74
- [0, 0, 0, 0],
75
- [0, 0, 0, 0]]])
 
76
 
77
- # box("Thinking Tokens", Thinking_Tokens, "4")
 
 
 
 
78
 
79
  # Computing full model input
80
 
81
-
82
  labels_reshaped = label_embeds.data.reshape(3, 1, 4)
83
  data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
84
  E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
85
  E = Tensor(E_numpy)
86
- # print(E)
87
 
 
88
  # Create row positional embeddings
89
  P_col_pos_embeds = Tensor([[[0.1, 0.1, 0.1, 0.1],
90
  [0.2, 0.2, 0.2, 0.2],
@@ -99,12 +109,70 @@ W_q = Tensor(np.diag([0.1, 0.2, 0.1, 0.2]))
99
  W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
100
  W_v = Tensor(np.diag([1, 1, 1, 1]))
101
 
 
102
  scaling_factor = np.sqrt(4)
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- def compute_attn_labels(labels):
106
- q
 
107
 
 
 
 
108
 
109
- labels = [E[1][2], E[2][2], E[2][2]]
110
- compute_attn_labels(labels)
 
1
  import numpy
2
 
3
+ # TabPFN
4
 
5
  # training data
 
6
  X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
7
  Y_train = Tensor([1, 0])
8
  X_test = Tensor([[9, 10, 11, 12]])
 
9
 
10
  box("X_train", [X_train, Y_train, X_test], "1")
11
 
 
16
 
17
  box("Feature Encoder", W_enc_transpose, "2")
18
 
19
+ # Feature/group embeddings
20
+ E_feat = Tensor([[0.1, 0.0, 0.0, 0.0], [0.0, 0.1, 0.0, 0.0]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
+ box("Group embedding", E_feat, "6")
23
 
24
  # Step 1: Combine Training and Test Samples
25
  X_combined = X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
26
+ box("Training and Test Samples grouped", X_combined, "4")
27
 
28
 
29
+ # Step 1: Group Features
30
 
31
  def group(X):
32
  groups = X.shape[0] * W_enc.shape[1]
 
39
  for rt_ptr in range(0, len(row), 2):
40
  group_window = Tensor(row[rt_ptr:rt_ptr + 2])
41
  group_matmul = group_window.matmul(W_enc_transpose) + b_enc[group_idx]
42
+ # group 1
43
  if col == 0:
44
+ X_encoded[idx][0] = group_matmul.data + E_feat.data[0]
45
  col = 1
46
+ # group 2
47
  else:
48
+ X_encoded[idx][1] = group_matmul.data + E_feat.data[1]
49
  col = 0
50
  idx += 1
51
  X_encoded_tensor = Tensor(X_encoded)
 
53
 
54
 
55
  X_encoded = group(X_combined)
56
+ box("X_encoded", X_encoded, "4")
57
+
58
+ # Label Encoder - Label Embeddings
59
+ W_y = Tensor([[1, -1, 0, 0], [0, 0, 1, 1]])
60
+ b_y = Tensor([0, 0, 0, 0])
61
+ y_padded = Tensor([1, 0, np.nan]) # we want to mask y_test with nan
62
+ y_clean = Tensor([[1, 0, 0], [0, 0, 1]]).reshape(3, 2)
63
+ box("y_clean", y_clean, "4")
64
+
65
+
66
+ def label_embeddings(y_train):
67
+ lbl_embds = np.zeros((3, 4))
68
+ for (idx, row) in enumerate(y_train.data):
69
+ res = Tensor((row)).matmul(W_y)
70
+ lbl_embds[idx] = res.data
71
+
72
+ return Tensor(lbl_embds)
73
+
74
+
75
+ label_embeds = label_embeddings(y_clean)
76
+ # print(label_embeds)
77
 
78
  # Step 3: Add Thinking Tokens
79
+ Thinking_Tokens = Tensor([
80
+ [[0.01, 0.02, 0.03, 0.04],
81
+ [0.01, 0.02, 0.03, 0.04],
82
+ [0.01, 0.02, 0.03, 0.04]],
83
 
84
+ [[0.05, 0.06, 0.07, 0.08],
85
+ [0.05, 0.06, 0.07, 0.08],
86
+ [0.05, 0.06, 0.07, 0.08]]
87
+ ])
88
+ box("Thinking Tokens", Thinking_Tokens, "4")
89
 
90
  # Computing full model input
91
 
 
92
  labels_reshaped = label_embeds.data.reshape(3, 1, 4)
93
  data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
94
  E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
95
  E = Tensor(E_numpy)
 
96
 
97
+ # we need to adapt positional embeddings!
98
  # Create row positional embeddings
99
  P_col_pos_embeds = Tensor([[[0.1, 0.1, 0.1, 0.1],
100
  [0.2, 0.2, 0.2, 0.2],
 
109
  W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
110
  W_v = Tensor(np.diag([1, 1, 1, 1]))
111
 
112
+ box("Attention weights", [W_q, W_k, W_v], "9")
113
  scaling_factor = np.sqrt(4)
114
 
115
+ # labels = [E[1][2], E[2][2], E[2][2]]
116
+ col_att_softmax = Softmax()
117
+
118
+
119
+ def column_attention_inplace(E: Tensor):
120
+ """
121
+ In-place column attention:
122
+ For each item s: X = E[s] has shape (Ttok=3, D=4)
123
+ Does self-attention across the 3 tokens and writes back:
124
+ E[s] <- E[s] + Attn(E[s])
125
+ """
126
+ S, Ttok, D = E.shape
127
+ softmax = Softmax()
128
+
129
+ for s in range(S):
130
+ # Snapshot of current item (avoid in-place mixing during compute)
131
+ X = Tensor(E.data[s].copy()) # (3,4)
132
+
133
+ Q = X.matmul(W_q.transpose()) # (3,4)
134
+ K = X.matmul(W_k.transpose()) # (3,4)
135
+ V = X.matmul(W_v.transpose()) # (3,4)
136
+
137
+ scores = Q.matmul(K.transpose()) / math.sqrt(D) # (3,3)
138
+ A = softmax.forward(scores, dim=-1) # (3,3)
139
+ O = A.matmul(V) # (3,4)
140
+
141
+ # In-place residual update of ALL tokens
142
+ E.data[s] = E.data[s] + O.data
143
+
144
+
145
+ column_attention_inplace(E)
146
+ box("Updated Logits", E, "5")
147
+
148
+
149
+ def row_attention_inplace(E: Tensor, W_q: Tensor, W_k: Tensor, W_v: Tensor, single_eval_pos: int):
150
+ """
151
+ In-place row attention:
152
+ For each token slot t:
153
+ Q from all S items: E[:, t, :] -> (S, D)
154
+ K,V from first Klen rows E[:single_eval_pos, t, :] -> (Klen, D)
155
+ Writes:
156
+ E[:, t, :] <- E[:, t, :] + Attn_row(E[:, t, :])
157
+ """
158
+ S, Ttok, D = E.shape
159
+ softmax = Softmax()
160
+
161
+ Klen = single_eval_pos
162
+ assert 0 < Klen <= S, "single_eval_pos must be between 1 and S"
163
+
164
+ for t in range(Ttok):
165
+ # Snapshot streams (avoid in-place mixing)
166
+ X_all = Tensor(E.data[:, t, :].copy()) # (S, D)
167
+ X_kv = Tensor(E.data[:Klen, t, :].copy()) # (Klen, D)
168
 
169
+ Q = X_all.matmul(W_q.transpose()) # (S, D)
170
+ K = X_kv.matmul(W_k.transpose()) # (Klen, D)
171
+ V = X_kv.matmul(W_v.transpose()) # (Klen, D)
172
 
173
+ scores = Q.matmul(K.transpose()) / math.sqrt(D) # (S, Klen)
174
+ A = softmax.forward(scores, dim=-1) # (S, Klen)
175
+ O = A.matmul(V) # (S, D)
176
 
177
+ # In-place residual update for this token slot
178
+ E.data[:, t, :] = E.data[:, t, :] + O.data