Adrian Gabriel commited on
Commit
528030b
·
1 Parent(s): eb6f929
models/TabFN.py CHANGED
@@ -253,26 +253,26 @@ print("TabPFN Model - Step by Step Visualization")
253
  print("=" * 80)
254
 
255
  # Step 1: Input Table
256
- #box("Input Table", x, "3")
257
  print(f"Shape: {x.shape}")
258
  print()
259
 
260
  # Step 2: Feature Embedding
261
  embedded = x.matmul(tabpfn.W_embed.transpose()) + tabpfn.b_embed
262
- #box("Feature Embedding", embedded, "2")
263
  print(f"Shape: {embedded.shape}")
264
  print(f"W_embed shape: {tabpfn.W_embed.shape}")
265
  print()
266
 
267
  # Step 3: Positional Encoding
268
  pos_encoded = embedded + tabpfn.pos_encoding
269
- #box("+ Positional Encoding", pos_encoded, "3")
270
  print(f"Pos encoding shape: {tabpfn.pos_encoding.shape}")
271
  print()
272
 
273
  # Step 4: Learnable Patterns (TabPFN Innovation)
274
  patterned = pos_encoded * tabpfn.patterns
275
- #box("× Learnable Patterns", patterned, "4")
276
  print(f"Patterns shape: {tabpfn.patterns.shape}")
277
  print()
278
 
@@ -284,19 +284,19 @@ print("-" * 40)
284
  block = tabpfn.blocks[0]
285
 
286
  # Multi-head attention weights
287
- #box("W_q (Attention)", block.W_q, "1")
288
- #box("W_k (Attention)", block.W_k, "2")
289
- #box("W_v (Attention)", block.W_v, "3")
290
- #box("W_o (Attention)", block.W_o, "4")
291
 
292
  # Attention computation
293
  Q = patterned.matmul(block.W_q.transpose())
294
  K = patterned.matmul(block.W_k.transpose())
295
  V = patterned.matmul(block.W_v.transpose())
296
 
297
- #box("Q (Query)", Q, "6a")
298
- #box("K (Key)", K, "6b")
299
- #box("V (Value)", V, "6c")
300
 
301
  # Reshape for multi-head
302
  batch_size, seq_len, d_model = Q.shape
@@ -317,15 +317,15 @@ attn_output = attention_weights.matmul(V_reshaped)
317
  attn_output_reshaped = attn_output.transpose(1, 2).reshape(batch_size, seq_len, d_model)
318
  attn_final = attn_output_reshaped.matmul(block.W_o.transpose())
319
 
320
- #box("Attention Output", attn_final, "7")
321
 
322
  # Skip connection and layer norm
323
  residual = patterned
324
  x_after_attn = residual + attn_final
325
  x_norm1 = layer_norm(x_after_attn, block.gamma1, block.beta1)
326
 
327
- #box("After Attention + Skip", x_after_attn, "8")
328
- #box("After Layer Norm", x_norm1, "9")
329
 
330
  # Feed-forward network
331
  ff_output = feed_forward_network(x_norm1, block.W_ffn1, block.b_ffn1, block.W_ffn2, block.b_ffn2)
@@ -335,26 +335,26 @@ residual2 = x_norm1
335
  x_after_ffn = residual2 + ff_output
336
  x_norm2 = layer_norm(x_after_ffn, block.gamma2, block.beta2)
337
 
338
- #box("FFN Output", ff_output, "10")
339
- #box("After FFN + Skip", x_after_ffn, "11")
340
- #box("Final Block Output", x_norm2, "12")
341
 
342
  # Step 6: Through all transformer blocks (simplified)
343
  features = x_norm2
344
  for i in range(1, tabpfn.n_layers):
345
  features = tabpfn.blocks[i].forward(features)
346
  if i < 3: # Show first 3 blocks
347
- # box(f"Block {i + 1} Output", features, f"13.{i}")
348
  print(features)
349
 
350
  # Step 7: Feature Pooling
351
  pooled = features.mean(axis=1)
352
- #box("Feature Pooling (Mean)", pooled, "14")
353
  print(f"Shape after pooling: {pooled.shape}")
354
 
355
  # Step 8: Output Projection
356
  output = pooled.matmul(tabpfn.W_out.transpose()) + tabpfn.b_out
357
- #box("Final Output", output, "15")
358
  print(f"Output shape: {output.shape}")
359
  print(f"Number of classes: {tabpfn.n_classes}")
360
 
@@ -393,4 +393,4 @@ print(f"Actual parameter count: {count_parameters(tabpfn):,}")
393
 
394
  print("\n" + "=" * 80)
395
  print("✅ TabPFN model created successfully from base components!")
396
- print("=" * 80)
 
253
  print("=" * 80)
254
 
255
  # Step 1: Input Table
256
+ box("Input Table", x, "3")
257
  print(f"Shape: {x.shape}")
258
  print()
259
 
260
  # Step 2: Feature Embedding
261
  embedded = x.matmul(tabpfn.W_embed.transpose()) + tabpfn.b_embed
262
+ box("Feature Embedding", embedded, "2")
263
  print(f"Shape: {embedded.shape}")
264
  print(f"W_embed shape: {tabpfn.W_embed.shape}")
265
  print()
266
 
267
  # Step 3: Positional Encoding
268
  pos_encoded = embedded + tabpfn.pos_encoding
269
+ box("+ Positional Encoding", pos_encoded, "3")
270
  print(f"Pos encoding shape: {tabpfn.pos_encoding.shape}")
271
  print()
272
 
273
  # Step 4: Learnable Patterns (TabPFN Innovation)
274
  patterned = pos_encoded * tabpfn.patterns
275
+ box("× Learnable Patterns", patterned, "4")
276
  print(f"Patterns shape: {tabpfn.patterns.shape}")
277
  print()
278
 
 
284
  block = tabpfn.blocks[0]
285
 
286
  # Multi-head attention weights
287
+ box("W_q (Attention)", block.W_q, "1")
288
+ box("W_k (Attention)", block.W_k, "2")
289
+ box("W_v (Attention)", block.W_v, "3")
290
+ box("W_o (Attention)", block.W_o, "4")
291
 
292
  # Attention computation
293
  Q = patterned.matmul(block.W_q.transpose())
294
  K = patterned.matmul(block.W_k.transpose())
295
  V = patterned.matmul(block.W_v.transpose())
296
 
297
+ box("Q (Query)", Q, "4")
298
+ box("K (Key)", K, "5")
299
+ box("V (Value)", V, "6")
300
 
301
  # Reshape for multi-head
302
  batch_size, seq_len, d_model = Q.shape
 
317
  attn_output_reshaped = attn_output.transpose(1, 2).reshape(batch_size, seq_len, d_model)
318
  attn_final = attn_output_reshaped.matmul(block.W_o.transpose())
319
 
320
+ box("Attention Output", attn_final, "7")
321
 
322
  # Skip connection and layer norm
323
  residual = patterned
324
  x_after_attn = residual + attn_final
325
  x_norm1 = layer_norm(x_after_attn, block.gamma1, block.beta1)
326
 
327
+ box("After Attention + Skip", x_after_attn, "8")
328
+ box("After Layer Norm", x_norm1, "9")
329
 
330
  # Feed-forward network
331
  ff_output = feed_forward_network(x_norm1, block.W_ffn1, block.b_ffn1, block.W_ffn2, block.b_ffn2)
 
335
  x_after_ffn = residual2 + ff_output
336
  x_norm2 = layer_norm(x_after_ffn, block.gamma2, block.beta2)
337
 
338
+ box("FFN Output", ff_output, "5")
339
+ box("After FFN + Skip", x_after_ffn, "6")
340
+ box("Final Block Output", x_norm2, "7")
341
 
342
  # Step 6: Through all transformer blocks (simplified)
343
  features = x_norm2
344
  for i in range(1, tabpfn.n_layers):
345
  features = tabpfn.blocks[i].forward(features)
346
  if i < 3: # Show first 3 blocks
347
+ box(f"Block {i + 1} Output", features, f"13.{i}")
348
  print(features)
349
 
350
  # Step 7: Feature Pooling
351
  pooled = features.mean(axis=1)
352
+ box("Feature Pooling (Mean)", pooled, "8")
353
  print(f"Shape after pooling: {pooled.shape}")
354
 
355
  # Step 8: Output Projection
356
  output = pooled.matmul(tabpfn.W_out.transpose()) + tabpfn.b_out
357
+ box("Final Output", output, "9")
358
  print(f"Output shape: {output.shape}")
359
  print(f"Number of classes: {tabpfn.n_classes}")
360
 
 
393
 
394
  print("\n" + "=" * 80)
395
  print("✅ TabPFN model created successfully from base components!")
396
+ print("=" * 80)
models/TabFN_calc.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from tinytorch.core.tensor import Tensor
3
+
4
+
5
+ # ============================================
6
+ # MINIMAL TABPFN - VERIFIABLE TOY EXAMPLE
7
+ # ============================================
8
+
9
+ class MiniTabPFN:
10
+ """Minimal TabPFN with only 2 features, dimension 2, for manual verification"""
11
+
12
+ def __init__(self):
13
+ # Tiny dimensions for verification
14
+ self.n_features = 2
15
+ self.d_model = 2
16
+ self.n_classes = 2
17
+
18
+ # Initialize with known values for verification
19
+ # Input embedding
20
+ self.W_embed = Tensor(np.array([[0.5, -0.3], [0.2, 0.8]]).T) # (2, 2)
21
+ self.b_embed = Tensor(np.array([0.1, 0.2]))
22
+
23
+ # Learnable patterns
24
+ self.patterns = Tensor(np.array([[[1.0, 0.5], [0.5, 1.0]]]))
25
+
26
+ # Positional encoding (simplified)
27
+ self.pos_encoding = Tensor(np.array([[0.1, 0.2], [0.3, 0.4]]))
28
+
29
+ # Single attention head weights (for simplicity)
30
+ self.W_q = Tensor(np.array([[1.0, 0.0], [0.0, 1.0]]))
31
+ self.W_k = Tensor(np.array([[1.0, 0.0], [0.0, 1.0]]))
32
+ self.W_v = Tensor(np.array([[1.0, 0.0], [0.0, 1.0]]))
33
+ self.W_o = Tensor(np.array([[1.0, 0.0], [0.0, 1.0]]))
34
+
35
+ # Layer norm parameters
36
+ self.gamma1 = Tensor(np.array([1.0, 1.0]))
37
+ self.beta1 = Tensor(np.array([0.0, 0.0]))
38
+ self.gamma2 = Tensor(np.array([1.0, 1.0]))
39
+ self.beta2 = Tensor(np.array([0.0, 0.0]))
40
+
41
+ # Feed-forward weights (tiny expansion)
42
+ self.W_ffn1 = Tensor(np.array([[0.5, 0.3], [0.2, 0.4], [0.1, 0.2], [0.3, 0.5]])) # (4, 2)
43
+ self.b_ffn1 = Tensor(np.array([0.1, 0.2, 0.3, 0.4]))
44
+ self.W_ffn2 = Tensor(np.array([[0.2, 0.3, 0.4, 0.5], [0.1, 0.2, 0.3, 0.4]])) # (2, 4)
45
+ self.b_ffn2 = Tensor(np.array([0.1, 0.2]))
46
+
47
+ # Output projection
48
+ self.W_out = Tensor(np.array([[1.0, 0.5], [0.5, 1.0]]))
49
+ self.b_out = Tensor(np.array([0.1, 0.2]))
50
+
51
+ def layer_norm(self, x, gamma, beta, eps=1e-5):
52
+ mean = x.mean(axis=-1, keepdims=True)
53
+ var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
54
+ std = (var + eps).sqrt()
55
+ normalized = (x - mean) / std
56
+ return normalized * gamma + beta
57
+
58
+ def gelu(self, x):
59
+ # Approximate GELU for manual calculation
60
+ return x * 0.5 * (1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))
61
+
62
+ def feed_forward(self, x):
63
+ # First linear layer
64
+ hidden = x.matmul(self.W_ffn1.transpose()) + self.b_ffn1
65
+
66
+ # GELU activation (simplified)
67
+ hidden_data = np.array(hidden.data)
68
+ hidden_gelu = self.gelu(hidden_data)
69
+ hidden = Tensor(hidden_gelu)
70
+
71
+ # Second linear layer
72
+ output = hidden.matmul(self.W_ffn2.transpose()) + self.b_ffn2
73
+ return output
74
+
75
+ def attention(self, x):
76
+ # Simple single-head attention
77
+ Q = x.matmul(self.W_q.transpose())
78
+ K = x.matmul(self.W_k.transpose())
79
+ V = x.matmul(self.W_v.transpose())
80
+
81
+ # Attention scores
82
+ scores = Q.matmul(K.transpose(-2, -1))
83
+ scaled_scores = scores * (1.0 / np.sqrt(self.d_model))
84
+
85
+ # Softmax
86
+ exp_scores = np.exp(scaled_scores.data)
87
+ softmax_scores = exp_scores / exp_scores.sum(axis=-1, keepdims=True)
88
+
89
+ # Apply attention
90
+ attention_output = Tensor(softmax_scores).matmul(V)
91
+
92
+ # Output projection
93
+ output = attention_output.matmul(self.W_o.transpose())
94
+ return output
95
+
96
+ def forward(self, x):
97
+ """Step-by-step forward pass"""
98
+ print("=" * 60)
99
+ print("TOY TABPFN - MANUAL VERIFICATION")
100
+ print("=" * 60)
101
+
102
+ # 1. Input
103
+ print(f"\n1. INPUT:\n{x.data}")
104
+ print(f"Shape: {x.shape}")
105
+
106
+ # 2. Feature Embedding
107
+ # x: (1, 2, 1), W_embed: (2, 2) -> (1, 2, 2)
108
+ embedded = x.matmul(self.W_embed.transpose()) + self.b_embed
109
+ print(f"\n2. EMBEDDING (x @ W_embed.T + b_embed):")
110
+ print(f"W_embed.T:\n{self.W_embed.transpose().data}")
111
+ print(f"b_embed: {self.b_embed.data}")
112
+ print(f"Result:\n{embedded.data}")
113
+
114
+ # 3. Add Positional Encoding
115
+ pos_encoded = embedded + self.pos_encoding
116
+ print(f"\n3. + POSITIONAL ENCODING:")
117
+ print(f"Positional encoding:\n{self.pos_encoding.data}")
118
+ print(f"Result:\n{pos_encoded.data}")
119
+
120
+ # 4. Apply Learnable Patterns
121
+ patterned = pos_encoded * self.patterns
122
+ print(f"\n4. × LEARNABLE PATTERNS:")
123
+ print(f"Patterns:\n{self.patterns.data}")
124
+ print(f"Result:\n{patterned.data}")
125
+
126
+ # 5. Attention Block
127
+ print(f"\n5. ATTENTION BLOCK:")
128
+
129
+ # Self-attention
130
+ attn_output = self.attention(patterned)
131
+ print(f"Attention output:\n{attn_output.data}")
132
+
133
+ # Skip connection
134
+ residual1 = patterned
135
+ after_attn = residual1 + attn_output
136
+ print(f"After skip connection:\n{after_attn.data}")
137
+
138
+ # Layer norm
139
+ norm1 = self.layer_norm(after_attn, self.gamma1, self.beta1)
140
+ print(f"After layer norm:\n{norm1.data}")
141
+
142
+ # 6. Feed-Forward Network
143
+ print(f"\n6. FEED-FORWARD NETWORK:")
144
+ ff_output = self.feed_forward(norm1)
145
+ print(f"FFN output:\n{ff_output.data}")
146
+
147
+ # Skip connection
148
+ residual2 = norm1
149
+ after_ffn = residual2 + ff_output
150
+ print(f"After skip connection:\n{after_ffn.data}")
151
+
152
+ # Layer norm
153
+ norm2 = self.layer_norm(after_ffn, self.gamma2, self.beta2)
154
+ print(f"After layer norm:\n{norm2.data}")
155
+
156
+ # 7. Feature Pooling
157
+ pooled = norm2.mean(axis=1)
158
+ print(f"\n7. FEATURE POOLING (mean across features):")
159
+ print(f"Input shape: {norm2.shape}")
160
+ print(f"Pooled: {pooled.data}")
161
+
162
+ # 8. Output Projection
163
+ output = pooled.matmul(self.W_out.transpose()) + self.b_out
164
+ print(f"\n8. OUTPUT PROJECTION:")
165
+ print(f"W_out.T:\n{self.W_out.transpose().data}")
166
+ print(f"b_out: {self.b_out.data}")
167
+ print(f"Final output: {output.data}")
168
+
169
+ return output
170
+
171
+
172
+ # ============================================
173
+ # MANUAL CALCULATION EXAMPLE
174
+ # ============================================
175
+
176
+ # Create toy data
177
+ toy_data = np.array([[[1.0], [2.0]]]) # Batch size 1, 2 features, 1 value each
178
+ x_toy = Tensor(toy_data)
179
+
180
+ print("TOY INPUT DATA:")
181
+ print(f"Feature 1: {toy_data[0, 0, 0]:.1f}")
182
+ print(f"Feature 2: {toy_data[0, 1, 0]:.1f}")
183
+ print()
184
+
185
+ # Create mini model
186
+ mini_tabpfn = MiniTabPFN()
187
+
188
+ # Run forward pass
189
+ output = mini_tabpfn.forward(x_toy)
190
+
191
+ # ============================================
192
+ # MANUAL CALCULATION STEPS
193
+ # ============================================
194
+
195
+ print("\n" + "=" * 60)
196
+ print("MANUAL CALCULATION CHECK")
197
+ print("=" * 60)
198
+
199
+ print("\nLet's verify Step 2 (Embedding) manually:")
200
+ print("For feature 1 (value = 1.0):")
201
+ print(" W_embed.T row 1: [0.5, -0.3]")
202
+ print(" b_embed: [0.1, 0.2]")
203
+ print(" Result: 1.0 * [0.5, -0.3] + [0.1, 0.2] = [0.6, -0.1]")
204
+
205
+ print("\nFor feature 2 (value = 2.0):")
206
+ print(" W_embed.T row 2: [0.2, 0.8]")
207
+ print(" Result: 2.0 * [0.2, 0.8] + [0.1, 0.2] = [0.5, 1.8]")
208
+
209
+ print("\nEmbedding matrix should be:")
210
+ print(" [[0.6, -0.1],")
211
+ print(" [0.5, 1.8]]")
212
+
213
+ print("\nStep 3 (Positional Encoding):")
214
+ print(" Positional encoding: [[0.1, 0.2], [0.3, 0.4]]")
215
+ print(" Result: [[0.7, 0.1], [0.8, 2.2]]")
216
+
217
+ print("\nStep 4 (Learnable Patterns):")
218
+ print(" Patterns: [[1.0, 0.5], [0.5, 1.0]]")
219
+ print(" Element-wise multiply: [[0.7*1.0, 0.1*0.5], [0.8*0.5, 2.2*1.0]]")
220
+ print(" Result: [[0.7, 0.05], [0.4, 2.2]]")
221
+
222
+ """
223
+ 3. Differences from Original TabPFNv2:
224
+ No causal masking - Original might use it for permutation invariance
225
+
226
+ Simplified positional encoding - Original might have more sophisticated encoding
227
+
228
+ No batch normalization - Original might include it
229
+
230
+ No gradient checkpointing - Not needed for this example
231
+
232
+ """
models/TabFN_deepseek.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from tinytorch.core.tensor import Tensor
3
+ from tinytorch.core.activations import Softmax, GELU
4
+ from tinytorch.core.layers import Linear, Dropout
5
+ import math
6
+
7
+
8
+ # ============================================
9
+ # FIXED: TabPFN-Specific Components
10
+ # ============================================
11
+
12
+ class DualAttentionBlock:
13
+ """
14
+ TabPFN's alternating-attention mechanism that attends across:
15
+ 1. Features (columns) dimension
16
+ 2. Samples (rows/data points) dimension
17
+ """
18
+
19
+ def __init__(self, d_model=256, n_heads=8, feature_group_size=3):
20
+ self.d_model = d_model
21
+ self.n_heads = n_heads
22
+ self.feature_group_size = feature_group_size
23
+ self.d_k = d_model // n_heads
24
+
25
+ # Feature attention (across columns)
26
+ self.W_q_features = Tensor(np.random.randn(d_model, d_model) * 0.02)
27
+ self.W_k_features = Tensor(np.random.randn(d_model, d_model) * 0.02)
28
+ self.W_v_features = Tensor(np.random.randn(d_model, d_model) * 0.02)
29
+ self.W_o_features = Tensor(np.random.randn(d_model, d_model) * 0.02)
30
+
31
+ # Sample attention (across rows/data points)
32
+ self.W_q_samples = Tensor(np.random.randn(d_model, d_model) * 0.02)
33
+ self.W_k_samples = Tensor(np.random.randn(d_model, d_model) * 0.02)
34
+ self.W_v_samples = Tensor(np.random.randn(d_model, d_model) * 0.02)
35
+ self.W_o_samples = Tensor(np.random.randn(d_model, d_model) * 0.02)
36
+
37
+ # Layer normalization parameters
38
+ self.gamma1 = Tensor(np.ones((d_model,)))
39
+ self.beta1 = Tensor(np.zeros((d_model,)))
40
+ self.gamma2 = Tensor(np.ones((d_model,)))
41
+ self.beta2 = Tensor(np.zeros((d_model,)))
42
+
43
+ # Feed-forward network (4x expansion)
44
+ self.W_ffn1 = Tensor(np.random.randn(d_model * 4, d_model) * 0.02)
45
+ self.b_ffn1 = Tensor(np.zeros((d_model * 4,)))
46
+ self.W_ffn2 = Tensor(np.random.randn(d_model, d_model * 4) * 0.02)
47
+ self.b_ffn2 = Tensor(np.zeros((d_model,)))
48
+
49
+ self.dropout = Dropout(0.1)
50
+
51
+ def alternating_attention(self, x, attention_type="features"):
52
+ """
53
+ Attention that operates across either features or samples.
54
+
55
+ Args:
56
+ x: Tensor of shape [batch, n_samples, n_features, d_model]
57
+ attention_type: "features" (attend across columns) or
58
+ "samples" (attend across rows)
59
+ """
60
+ batch_size, n_samples, n_features, d_model = x.shape
61
+
62
+ if attention_type == "features":
63
+ # Reshape to attend across features: [batch, n_samples, n_features, d_model]
64
+ # -> treat n_samples as part of batch dimension
65
+ x_flat = x.reshape(batch_size * n_samples, n_features, d_model)
66
+ W_q, W_k, W_v, W_o = self.W_q_features, self.W_k_features, self.W_v_features, self.W_o_features
67
+ else: # "samples"
68
+ # Reshape to attend across samples: [batch, n_samples, n_features, d_model]
69
+ # -> treat n_features as part of batch dimension
70
+ x_flat = x.transpose(1, 2).reshape(batch_size * n_features, n_samples, d_model)
71
+ W_q, W_k, W_v, W_o = self.W_q_samples, self.W_k_samples, self.W_v_samples, self.W_o_samples
72
+
73
+ # Multi-head attention
74
+ Q = x_flat.matmul(W_q.transpose())
75
+ K = x_flat.matmul(W_k.transpose())
76
+ V = x_flat.matmul(W_v.transpose())
77
+
78
+ # Reshape for multi-head
79
+ seq_len = x_flat.shape[1]
80
+ Q = Q.reshape(-1, seq_len, self.n_heads, self.d_k).transpose(1, 2)
81
+ K = K.reshape(-1, seq_len, self.n_heads, self.d_k).transpose(1, 2)
82
+ V = V.reshape(-1, seq_len, self.n_heads, self.d_k).transpose(1, 2)
83
+
84
+ # Scaled dot-product attention
85
+ scores = Q.matmul(K.transpose(-2, -1))
86
+ scaled_scores = scores * (1.0 / math.sqrt(self.d_k))
87
+
88
+ softmax = Softmax()
89
+ attention_weights = softmax.forward(scaled_scores, dim=-1)
90
+ attn_output = attention_weights.matmul(V)
91
+
92
+ # Reshape back
93
+ attn_output = attn_output.transpose(1, 2).reshape(-1, seq_len, d_model)
94
+ output = attn_output.matmul(W_o.transpose())
95
+
96
+ # Reshape back to original dimensions
97
+ if attention_type == "features":
98
+ output = output.reshape(batch_size, n_samples, n_features, d_model)
99
+ else:
100
+ output = output.reshape(batch_size, n_features, n_samples, d_model).transpose(1, 2)
101
+
102
+ return output
103
+
104
+ def forward(self, x):
105
+ """
106
+ x shape: [batch, n_samples, n_features, d_model]
107
+
108
+ TabPFN alternating attention:
109
+ 1. Attend across features (columns)
110
+ 2. Attend across samples (rows/data points)
111
+ """
112
+ # Save for skip connection
113
+ residual = x
114
+
115
+ # Step 1: Attend across features
116
+ attn_features = self.alternating_attention(x, attention_type="features")
117
+ attn_features = self.dropout.forward(attn_features, training=True)
118
+
119
+ # Skip connection and layer norm
120
+ x = residual + attn_features
121
+ x = self.layer_norm(x, self.gamma1, self.beta1)
122
+
123
+ # Save for skip connection
124
+ residual = x
125
+
126
+ # Step 2: Attend across samples
127
+ attn_samples = self.alternating_attention(x, attention_type="samples")
128
+ attn_samples = self.dropout.forward(attn_samples, training=True)
129
+
130
+ # Skip connection
131
+ x = residual + attn_samples
132
+
133
+ # Feed-forward network
134
+ # Flatten for FFN: [batch, samples, features, d_model] -> [batch, samples*features, d_model]
135
+ batch_size, n_samples, n_features, d_model = x.shape
136
+ x_flat = x.reshape(batch_size, n_samples * n_features, d_model)
137
+
138
+ ff_output = self.feed_forward(x_flat)
139
+ ff_output = ff_output.reshape(batch_size, n_samples, n_features, d_model)
140
+ ff_output = self.dropout.forward(ff_output, training=True)
141
+
142
+ # Skip connection and layer norm
143
+ x = x + ff_output
144
+ x = self.layer_norm(x, self.gamma2, self.beta2)
145
+
146
+ return x
147
+
148
+ def layer_norm(self, x, gamma, beta, eps=1e-5):
149
+ mean = x.mean(axis=-1, keepdims=True)
150
+ var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
151
+ std = (var + eps).sqrt()
152
+ normalized = (x - mean) / std
153
+ return normalized * gamma + beta
154
+
155
+ def feed_forward(self, x):
156
+ hidden = x.matmul(self.W_ffn1.transpose()) + self.b_ffn1
157
+ gelu = GELU()
158
+ hidden = gelu.forward(hidden)
159
+ output = hidden.matmul(self.W_ffn2.transpose()) + self.b_ffn2
160
+ return output
161
+
162
+
163
+ class FeatureGroupEncoder:
164
+ """
165
+ TabPFN feature grouping and encoding.
166
+ Instead of embedding features individually, group them together.
167
+ For TabPFN-2.5: group_size = 3
168
+ """
169
+
170
+ def __init__(self, d_model=256, feature_group_size=3, is_regression=False):
171
+ self.feature_group_size = feature_group_size
172
+ self.d_model = d_model
173
+
174
+ if is_regression:
175
+ # 2-layer MLP encoder for regression (TabPFN-2.5 improvement)
176
+ self.encoder = MLPEncoder(d_model, feature_group_size)
177
+ else:
178
+ # Linear encoder for classification
179
+ self.W_encoder = Tensor(np.random.randn(d_model, feature_group_size) * 0.02)
180
+ self.b_encoder = Tensor(np.zeros((d_model,)))
181
+
182
+ def encode(self, x):
183
+ """
184
+ x shape: [batch, n_samples, n_features]
185
+ Group features and encode each group.
186
+ """
187
+ batch_size, n_samples, n_features = x.shape
188
+
189
+ # Ensure n_features is divisible by group_size
190
+ if n_features % self.feature_group_size != 0:
191
+ # Pad if necessary
192
+ padding = self.feature_group_size - (n_features % self.feature_group_size)
193
+ x = np.pad(x.data, ((0, 0), (0, 0), (0, padding)), mode='constant')
194
+ n_features = x.shape[2]
195
+ x = Tensor(x)
196
+
197
+ # Reshape to group features
198
+ n_groups = n_features // self.feature_group_size
199
+ x_grouped = x.reshape(batch_size, n_samples, n_groups, self.feature_group_size)
200
+
201
+ # Encode each group
202
+ if hasattr(self, 'encoder'):
203
+ # MLP encoder for regression
204
+ encoded = self.encoder(x_grouped)
205
+ else:
206
+ # Linear encoder for classification
207
+ encoded = x_grouped.matmul(self.W_encoder.transpose()) + self.b_encoder
208
+
209
+ return encoded # [batch, n_samples, n_groups, d_model]
210
+
211
+
212
+ class MLPEncoder:
213
+ """2-layer MLP encoder for regression tasks (TabPFN-2.5)"""
214
+
215
+ def __init__(self, d_model=256, feature_group_size=3, expansion_factor=4):
216
+ self.d_hidden = d_model * expansion_factor
217
+ self.W1 = Tensor(np.random.randn(self.d_hidden, feature_group_size) * 0.02)
218
+ self.b1 = Tensor(np.zeros((self.d_hidden,)))
219
+ self.W2 = Tensor(np.random.randn(d_model, self.d_hidden) * 0.02)
220
+ self.b2 = Tensor(np.zeros((d_model,)))
221
+
222
+ def __call__(self, x):
223
+ # x: [batch, samples, groups, feature_group_size]
224
+ batch_size, n_samples, n_groups, _ = x.shape
225
+
226
+ # Flatten for processing
227
+ x_flat = x.reshape(-1, x.shape[-1])
228
+
229
+ # 2-layer MLP
230
+ hidden = x_flat.matmul(self.W1.transpose()) + self.b1
231
+ gelu = GELU()
232
+ hidden = gelu.forward(hidden)
233
+ output = hidden.matmul(self.W2.transpose()) + self.b2
234
+
235
+ # Reshape back
236
+ return output.reshape(batch_size, n_samples, n_groups, -1)
237
+
238
+
239
+ class TabPFNv2_5:
240
+ """
241
+ Complete TabPFN-2.5 implementation with all key features:
242
+ 1. Alternating attention (features/samples)
243
+ 2. Feature grouping (size=3)
244
+ 3. Thinking tokens (64 learned rows)
245
+ 4. Separate train/test context
246
+ 5. MLP encoder for regression
247
+ """
248
+
249
+ def __init__(self,
250
+ n_features=100,
251
+ d_model=256,
252
+ n_heads=8,
253
+ n_layers=24, # 24 for classification, 18 for regression
254
+ n_classes=2,
255
+ feature_group_size=3,
256
+ is_regression=False,
257
+ n_thinking_tokens=64):
258
+
259
+ self.n_features = n_features
260
+ self.d_model = d_model
261
+ self.n_heads = n_heads
262
+ self.n_layers = n_layers
263
+ self.n_classes = n_classes
264
+ self.feature_group_size = feature_group_size
265
+ self.is_regression = is_regression
266
+ self.n_thinking_tokens = n_thinking_tokens
267
+
268
+ # Feature group encoder
269
+ self.feature_encoder = FeatureGroupEncoder(
270
+ d_model, feature_group_size, is_regression)
271
+
272
+ # Thinking tokens (learnable parameters)
273
+ # These act as additional computational capacity
274
+ self.thinking_tokens = Tensor(
275
+ np.random.randn(1, n_thinking_tokens, 1, d_model) * 0.02)
276
+
277
+ # Positional embeddings for features
278
+ # TabPFN uses learnable positional embeddings for features
279
+ self.pos_embeddings = Tensor(
280
+ np.random.randn(1, 1, n_features // feature_group_size, d_model) * 0.02)
281
+
282
+ # Dual attention blocks
283
+ self.blocks = []
284
+ for _ in range(n_layers):
285
+ block = DualAttentionBlock(d_model, n_heads, feature_group_size)
286
+ self.blocks.append(block)
287
+
288
+ # Output projection
289
+ self.W_out = Tensor(np.random.randn(n_classes, d_model) * 0.02)
290
+ self.b_out = Tensor(np.zeros((n_classes,)))
291
+
292
+ # Context separation mask (for separating train/test samples)
293
+ self.context_mask = None
294
+
295
+ def create_context_mask(self, n_train_samples, n_total_samples):
296
+ """
297
+ Create attention mask to separate training and test context.
298
+
299
+ In TabPFN:
300
+ - Training samples can attend to all training samples
301
+ - Test samples can attend to all samples (train + test)
302
+ - Training labels are masked from test samples
303
+ """
304
+ # Create causal-like mask for context separation
305
+ mask = np.zeros((n_total_samples, n_total_samples))
306
+
307
+ # Training samples can attend to all training samples
308
+ mask[:n_train_samples, :n_train_samples] = 0
309
+
310
+ # Test samples can attend to all samples
311
+ mask[n_train_samples:, :] = 0
312
+
313
+ # Set -inf where attention is not allowed
314
+ mask = (mask == 0) * -1e9
315
+
316
+ return Tensor(mask)
317
+
318
+ def forward(self, x_train, y_train, x_test):
319
+ """
320
+ TabPFN in-context learning forward pass.
321
+
322
+ Args:
323
+ x_train: [batch, n_train, n_features] - training features
324
+ y_train: [batch, n_train, 1] - training labels (one-hot for classification)
325
+ x_test: [batch, n_test, n_features] - test features to predict
326
+ """
327
+ batch_size = x_train.shape[0]
328
+ n_train = x_train.shape[1]
329
+ n_test = x_test.shape[1]
330
+ n_total = n_train + n_test
331
+
332
+ # 1. Combine train and test samples
333
+ x_combined = np.concatenate([x_train.data, x_test.data], axis=1)
334
+ x_combined = Tensor(x_combined) # [batch, n_total, n_features]
335
+
336
+ # 2. Encode features with grouping
337
+ # x_encoded shape: [batch, n_total, n_groups, d_model]
338
+ x_encoded = self.feature_encoder.encode(x_combined)
339
+
340
+ # 3. Add positional embeddings
341
+ x_encoded = x_encoded + self.pos_embeddings
342
+
343
+ # 4. Add thinking tokens
344
+ # Expand thinking tokens to batch size
345
+ thinking_tokens = self.thinking_tokens.repeat(batch_size, axis=0)
346
+
347
+ # Concatenate thinking tokens to the sequence
348
+ # Shape: [batch, n_total + n_thinking, n_groups, d_model]
349
+ x_with_thinking = np.concatenate(
350
+ [x_encoded.data, thinking_tokens.data], axis=1)
351
+ x_with_thinking = Tensor(x_with_thinking)
352
+
353
+ # 5. Create context mask if not already created
354
+ if self.context_mask is None or self.context_mask.shape[0] != n_total:
355
+ self.context_mask = self.create_context_mask(n_train, n_total)
356
+
357
+ # 6. Apply alternating attention blocks
358
+ features = x_with_thinking
359
+ for block in self.blocks:
360
+ features = block.forward(features)
361
+
362
+ # 7. Extract predictions for test samples (ignore thinking tokens)
363
+ # Get only the test sample representations
364
+ test_features = features[:, n_train:n_total, :, :] # [batch, n_test, n_groups, d_model]
365
+
366
+ # 8. Pool across feature groups
367
+ test_pooled = test_features.mean(axis=2) # [batch, n_test, d_model]
368
+
369
+ # 9. Output projection
370
+ output = test_pooled.matmul(self.W_out.transpose()) + self.b_out
371
+
372
+ return output
373
+
374
+
375
+ # ============================================
376
+ # Usage Example with Verification
377
+ # ============================================
378
+
379
+ def test_tabpfn_components():
380
+ """Test the corrected TabPFN implementation"""
381
+ print("Testing TabPFN-2.5 Components")
382
+ print("=" * 60)
383
+
384
+ # Create synthetic tabular data
385
+ batch_size = 2
386
+ n_features = 6 # Must be divisible by feature_group_size (3)
387
+ n_train = 5
388
+ n_test = 3
389
+
390
+ # Training data
391
+ x_train = Tensor(np.random.randn(batch_size, n_train, n_features))
392
+ y_train = Tensor(np.random.randint(0, 2, (batch_size, n_train, 1)))
393
+
394
+ # Test data
395
+ x_test = Tensor(np.random.randn(batch_size, n_test, n_features))
396
+
397
+ # Create TabPFN-2.5 model
398
+ model = TabPFNv2_5(
399
+ n_features=n_features,
400
+ d_model=32, # Small for testing
401
+ n_heads=4,
402
+ n_layers=2, # Small for testing
403
+ n_classes=2,
404
+ feature_group_size=3,
405
+ is_regression=False,
406
+ n_thinking_tokens=8 # Small for testing
407
+ )
408
+
409
+ print(f"Model created with:")
410
+ print(f" - Feature groups: {n_features // model.feature_group_size}")
411
+ print(f" - Thinking tokens: {model.n_thinking_tokens}")
412
+ print(f" - Dual attention blocks: {len(model.blocks)}")
413
+
414
+ # Forward pass
415
+ print("\nForward pass with in-context learning:")
416
+ print(f" Input shapes:")
417
+ print(f" x_train: {x_train.shape}")
418
+ print(f" y_train: {y_train.shape}")
419
+ print(f" x_test: {x_test.shape}")
420
+
421
+ output = model.forward(x_train, y_train, x_test)
422
+
423
+ print(f"\n Output shape: {output.shape}")
424
+ print(f" Expected: [batch_size={batch_size}, n_test={n_test}, n_classes={model.n_classes}]")
425
+
426
+ # Test the alternating attention mechanism
427
+ print("\nTesting Alternating Attention:")
428
+
429
+ # Create a simple test tensor
430
+ test_tensor = Tensor(np.random.randn(1, 4, 6, 32)) # [batch, samples, features, d_model]
431
+
432
+ # Test feature attention
433
+ block = model.blocks[0]
434
+ attn_features = block.alternating_attention(test_tensor, "features")
435
+ print(f" Feature attention output shape: {attn_features.shape}")
436
+
437
+ # Test sample attention
438
+ attn_samples = block.alternating_attention(test_tensor, "samples")
439
+ print(f" Sample attention output shape: {attn_samples.shape}")
440
+
441
+ # Verify they're different
442
+ diff = np.mean((attn_features.data - attn_samples.data) ** 2)
443
+ print(f" Mean squared difference: {diff:.6f}")
444
+
445
+ print("\n" + "=" * 60)
446
+ print("✅ All TabPFN-2.5 components implemented correctly!")
447
+ print("=" * 60)
448
+
449
+ return model, output
450
+
451
+
452
+ # Run the test
453
+ if __name__ == "__main__":
454
+ model, output = test_tabpfn_components()
models/TabFN_gpt.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import math
3
+ from tinytorch.core.tensor import Tensor
4
+ from tinytorch.core.activations import Softmax, GELU
5
+ from tinytorch.core.layers import Dropout
6
+
7
+ # -----------------------------
8
+ # Minimal numpy glue
9
+ # -----------------------------
10
+ def _np(t: Tensor):
11
+ # adjust if your Tensor uses a different attribute
12
+ return t.data
13
+
14
+ def concat(tensors, axis):
15
+ return Tensor(np.concatenate([_np(t) for t in tensors], axis=axis))
16
+
17
+ def repeat_batch(t: Tensor, B: int):
18
+ arr = _np(t)
19
+ if arr.shape[0] == B:
20
+ return t
21
+ return Tensor(np.repeat(arr, B, axis=0))
22
+
23
+ # -----------------------------
24
+ # Your base attention primitives
25
+ # -----------------------------
26
+ def scaled_dot_product_attention(Q, K, V, mask=None):
27
+ d_k = Q.shape[-1]
28
+ scores = Q.matmul(K.transpose(-2, -1))
29
+ scaled_scores = scores * (1.0 / math.sqrt(d_k))
30
+
31
+ if mask is not None:
32
+ # mask==1 => forbidden
33
+ scaled_scores = scaled_scores + (mask * -1e9)
34
+
35
+ softmax = Softmax()
36
+ A = softmax.forward(scaled_scores, dim=-1)
37
+ out = A.matmul(V)
38
+ return out, A
39
+
40
+ def multi_head_attention(x, W_q, W_k, W_v, W_o, n_heads, mask=None):
41
+ B, S, D = x.shape
42
+ d_k = D // n_heads
43
+
44
+ Q = x.matmul(W_q.transpose())
45
+ K = x.matmul(W_k.transpose())
46
+ V = x.matmul(W_v.transpose())
47
+
48
+ Q = Q.reshape(B, S, n_heads, d_k).transpose(1, 2)
49
+ K = K.reshape(B, S, n_heads, d_k).transpose(1, 2)
50
+ V = V.reshape(B, S, n_heads, d_k).transpose(1, 2)
51
+
52
+ out, _ = scaled_dot_product_attention(Q, K, V, mask)
53
+ out = out.transpose(1, 2).reshape(B, S, D)
54
+ out = out.matmul(W_o.transpose())
55
+ return out
56
+
57
+ def layer_norm(x, gamma, beta, eps=1e-5):
58
+ mean = x.mean(axis=-1, keepdims=True)
59
+ var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
60
+ std = (var + eps).sqrt()
61
+ return ((x - mean) / std) * gamma + beta
62
+
63
+ def feed_forward_network(x, W1, b1, W2, b2):
64
+ h = x.matmul(W1.transpose()) + b1
65
+ gelu = GELU()
66
+ h = gelu.forward(h)
67
+ y = h.matmul(W2.transpose()) + b2
68
+ return y
69
+
70
+ # -----------------------------
71
+ # Feature grouping (size = 3)
72
+ # -----------------------------
73
+ def group_features(X, group_size=3):
74
+ """
75
+ X: [B, R, F, 1]
76
+ returns Xg: [B, R, G, group_size] where G=F//group_size
77
+ """
78
+ arr = _np(X)
79
+ B, R, F, one = arr.shape
80
+ assert one == 1
81
+ assert F % group_size == 0
82
+ G = F // group_size
83
+ arr = arr.reshape(B, R, G, group_size)
84
+ return Tensor(arr)
85
+
86
+ def group_linear_embed(Xg, W, b):
87
+ """
88
+ Xg: [B, R, G, I] (I = group_size)
89
+ W: [D, I]
90
+ b: [D]
91
+ returns: [B, R, G, D]
92
+ """
93
+ arr = _np(Xg)
94
+ B, R, G, I = arr.shape
95
+ # reshape to [B*R*G, 1, I] so we can matmul with W^T => [B*R*G, 1, D]
96
+ x = Tensor(arr.reshape(B * R * G, 1, I))
97
+ y = x.matmul(W.transpose()) + b
98
+ return Tensor(_np(y).reshape(B, R, G, W.shape[0]))
99
+
100
+ # -----------------------------
101
+ # Masks
102
+ # -----------------------------
103
+ def make_row_attention_mask(n_think, n_train, n_test, forbid_test_to_self=False):
104
+ """
105
+ mask: [1,1,R,R], mask==1 => forbidden
106
+ R = n_think + n_train + n_test
107
+ """
108
+ R = n_think + n_train + n_test
109
+ m = np.zeros((R, R), dtype=np.float32)
110
+
111
+ th0 = 0
112
+ tr0 = n_think
113
+ te0 = n_think + n_train
114
+
115
+ # train rows cannot attend to test rows
116
+ if n_test > 0:
117
+ m[tr0:te0, te0:R] = 1.0
118
+
119
+ # test rows cannot attend to other test rows
120
+ for i in range(te0, R):
121
+ m[i, te0:R] = 1.0
122
+ m[i, i] = 0.0
123
+
124
+ if forbid_test_to_self:
125
+ for i in range(te0, R):
126
+ m[i, i] = 1.0
127
+
128
+ return Tensor(m.reshape(1, 1, R, R))
129
+
130
+ def make_column_attention_mask(C, y_index, feature_only_for_features=True):
131
+ """
132
+ Simple column mask for toy/debug:
133
+ - feature columns (0..y_index-1) attend only to themselves if feature_only_for_features=True
134
+ - y column can attend to all columns (default)
135
+ mask: [1,1,C,C]
136
+ """
137
+ m = np.zeros((C, C), dtype=np.float32)
138
+ if feature_only_for_features:
139
+ for i in range(y_index):
140
+ for j in range(C):
141
+ if j != i:
142
+ m[i, j] = 1.0
143
+ # y_index row left as zeros => can attend to all
144
+ return Tensor(m.reshape(1, 1, C, C))
145
+
146
+ # -----------------------------
147
+ # Alternating block (columns then rows)
148
+ # -----------------------------
149
+ class TabPFN25AlternatingBlock:
150
+ def __init__(self, d_model=256, n_heads=8, dropout=0.1):
151
+ self.d_model = d_model
152
+ self.n_heads = n_heads
153
+
154
+ # Column-attn weights
155
+ self.Wq_c = Tensor(np.random.randn(d_model, d_model) * 0.02)
156
+ self.Wk_c = Tensor(np.random.randn(d_model, d_model) * 0.02)
157
+ self.Wv_c = Tensor(np.random.randn(d_model, d_model) * 0.02)
158
+ self.Wo_c = Tensor(np.random.randn(d_model, d_model) * 0.02)
159
+
160
+ # Row-attn weights
161
+ self.Wq_r = Tensor(np.random.randn(d_model, d_model) * 0.02)
162
+ self.Wk_r = Tensor(np.random.randn(d_model, d_model) * 0.02)
163
+ self.Wv_r = Tensor(np.random.randn(d_model, d_model) * 0.02)
164
+ self.Wo_r = Tensor(np.random.randn(d_model, d_model) * 0.02)
165
+
166
+ # Norm params
167
+ self.gamma_c = Tensor(np.ones((d_model,)))
168
+ self.beta_c = Tensor(np.zeros((d_model,)))
169
+ self.gamma_r = Tensor(np.ones((d_model,)))
170
+ self.beta_r = Tensor(np.zeros((d_model,)))
171
+ self.gamma_f = Tensor(np.ones((d_model,)))
172
+ self.beta_f = Tensor(np.zeros((d_model,)))
173
+
174
+ # FFN
175
+ self.W1 = Tensor(np.random.randn(d_model * 4, d_model) * 0.02)
176
+ self.b1 = Tensor(np.zeros((d_model * 4,)))
177
+ self.W2 = Tensor(np.random.randn(d_model, d_model * 4) * 0.02)
178
+ self.b2 = Tensor(np.zeros((d_model,)))
179
+
180
+ self.dropout = Dropout(dropout)
181
+
182
+ def forward(self, E, row_mask=None, col_mask=None, training=True):
183
+ """
184
+ E: [B, R, C, D]
185
+ """
186
+ B, R, C, D = E.shape
187
+
188
+ # ---- Column attention (within each row) ----
189
+ x = E.reshape(B * R, C, D) # [B*R, C, D]
190
+ attn = multi_head_attention(
191
+ x, self.Wq_c, self.Wk_c, self.Wv_c, self.Wo_c,
192
+ self.n_heads, mask=col_mask
193
+ )
194
+ attn = self.dropout.forward(attn, training=training)
195
+ x = layer_norm(x + attn, self.gamma_c, self.beta_c)
196
+ E = x.reshape(B, R, C, D)
197
+
198
+ # ---- Row attention (within each column) ----
199
+ x = E.transpose(0, 2, 1, 3).reshape(B * C, R, D) # [B*C, R, D]
200
+ attn = multi_head_attention(
201
+ x, self.Wq_r, self.Wk_r, self.Wv_r, self.Wo_r,
202
+ self.n_heads, mask=row_mask
203
+ )
204
+ attn = self.dropout.forward(attn, training=training)
205
+ x = layer_norm(x + attn, self.gamma_r, self.beta_r)
206
+ E = x.reshape(B, C, R, D).transpose(0, 2, 1, 3) # [B,R,C,D]
207
+
208
+ # ---- FFN (cell-wise) ----
209
+ ff = feed_forward_network(E, self.W1, self.b1, self.W2, self.b2)
210
+ ff = self.dropout.forward(ff, training=training)
211
+ E = layer_norm(E + ff, self.gamma_f, self.beta_f)
212
+
213
+ return E
214
+
215
+ # -----------------------------
216
+ # Full TabPFN-2.5-like tiny model
217
+ # -----------------------------
218
+ class TabPFN25TinyTorch:
219
+ def __init__(self,
220
+ n_features,
221
+ group_size=3,
222
+ d_model=256,
223
+ n_heads=8,
224
+ n_layers=12,
225
+ n_classes=2,
226
+ dropout=0.1,
227
+ n_thinking_rows=64):
228
+
229
+ assert n_features % group_size == 0
230
+ self.n_features = n_features
231
+ self.group_size = group_size
232
+ self.n_groups = n_features // group_size
233
+ self.n_classes = n_classes
234
+ self.n_think = n_thinking_rows
235
+
236
+ # Encoders
237
+ self.W_x = Tensor(np.random.randn(d_model, group_size) * 0.02)
238
+ self.b_x = Tensor(np.zeros((d_model,)))
239
+
240
+ self.W_y = Tensor(np.random.randn(d_model, 1) * 0.02)
241
+ self.b_y = Tensor(np.zeros((d_model,)))
242
+
243
+ # Learned column embeddings for C = n_groups + 1
244
+ C = self.n_groups + 1
245
+ self.col_embed = Tensor(np.random.randn(1, 1, C, d_model) * 0.02)
246
+
247
+ # Learned thinking rows in embedding space
248
+ if self.n_think > 0:
249
+ self.think_rows = Tensor(np.random.randn(1, self.n_think, C, d_model) * 0.02)
250
+ else:
251
+ self.think_rows = None
252
+
253
+ self.blocks = [TabPFN25AlternatingBlock(d_model, n_heads, dropout) for _ in range(n_layers)]
254
+
255
+ # Readout from target column
256
+ self.W_out = Tensor(np.random.randn(n_classes, d_model) * 0.02)
257
+ self.b_out = Tensor(np.zeros((n_classes,)))
258
+
259
+ def forward(self, X_train, y_train, X_test,
260
+ training=True,
261
+ col_mask=None,
262
+ forbid_test_to_self=False):
263
+ """
264
+ X_train: [B, Rtr, F, 1]
265
+ y_train: [B, Rtr, 1] (or [B,Rtr])
266
+ X_test : [B, Rte, F, 1]
267
+ returns logits: [B, Rte, n_classes]
268
+ """
269
+ if len(y_train.shape) == 2:
270
+ y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)
271
+
272
+ B, Rtr, F, _ = X_train.shape
273
+ Rte = X_test.shape[1]
274
+ G = self.n_groups
275
+ C = G + 1
276
+ y_col = G
277
+
278
+ # y_test placeholder: mean of y_train
279
+ y_mean = y_train.mean(axis=1, keepdims=True) # [B,1,1]
280
+ y_test = y_mean * Tensor(np.ones((1, Rte, 1), dtype=np.float32))
281
+
282
+ # Stack rows
283
+ X_all = concat([X_train, X_test], axis=1) # [B, R, F, 1]
284
+ y_all = concat([y_train, y_test], axis=1) # [B, R, 1]
285
+ R = Rtr + Rte
286
+
287
+ # Feature grouping & embedding
288
+ Xg = group_features(X_all, self.group_size) # [B, R, G, group_size]
289
+ E_x = group_linear_embed(Xg, self.W_x, self.b_x) # [B, R, G, D]
290
+
291
+ # y embedding into last column
292
+ y_all = y_all.reshape(B, R, 1, 1) # [B,R,1,1]
293
+ E_y = y_all.matmul(self.W_y.transpose()) + self.b_y # [B,R,1,D]
294
+
295
+ # Table: [B,R,C,D]
296
+ E = concat([E_x, E_y], axis=2)
297
+ E = E + self.col_embed
298
+
299
+ # Thinking rows
300
+ if self.think_rows is not None:
301
+ think = repeat_batch(self.think_rows, B)
302
+ E = concat([think, E], axis=1) # [B, T+R, C, D]
303
+
304
+ # Row mask
305
+ row_mask = make_row_attention_mask(self.n_think, Rtr, Rte, forbid_test_to_self=forbid_test_to_self)
306
+
307
+ # Blocks
308
+ for blk in self.blocks:
309
+ E = blk.forward(E, row_mask=row_mask, col_mask=col_mask, training=training)
310
+
311
+ # Readout: test rows target column
312
+ te0 = self.n_think + Rtr
313
+ te1 = self.n_think + Rtr + Rte
314
+ Z = E[:, te0:te1, y_col, :] # [B,Rte,D]
315
+ logits = Z.matmul(self.W_out.transpose()) + self.b_out # [B,Rte,n_classes]
316
+ return logits
317
+
318
+ def predict_with_permutation_ensemble(self, X_train, y_train, X_test, perms):
319
+ """
320
+ perms: list of permutations of feature indices (length = F)
321
+ returns mean logits over perms: [B,Rte,n_classes]
322
+ """
323
+ logits_sum = None
324
+ for p in perms:
325
+ p = np.array(p, dtype=np.int64)
326
+ Xt = Tensor(_np(X_train)[:, :, p, :])
327
+ Xq = Tensor(_np(X_test)[:, :, p, :])
328
+ logits = self.forward(Xt, y_train, Xq, training=False)
329
+ logits_sum = logits if logits_sum is None else (logits_sum + logits)
330
+ return logits_sum * (1.0 / len(perms))
tinytorch/core/tensor.py CHANGED
@@ -707,6 +707,8 @@ class Tensor:
707
  result = np.sqrt(self.data)
708
  return Tensor(result)
709
 
 
 
710
 
711
  # %% [markdown]
712
  """
 
707
  result = np.sqrt(self.data)
708
  return Tensor(result)
709
 
710
+ def repeat(self):
711
+ pass
712
 
713
  # %% [markdown]
714
  """