import math

import numpy
import numpy as np

# TabPFN - miniature, hand-traceable forward pass.
#
# NOTE(review): `Tensor`, `box`, `Softmax` and `GELU` are not imported anywhere
# in this file; they are presumably injected by the environment that runs this
# script - confirm before running it stand-alone.

# ------------------------------------------------------------
# Training / test data
# ------------------------------------------------------------
X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
Y_train = Tensor([1, 0])
X_test = Tensor([[9, 10, 11, 12]])
box("X_train", [X_train, Y_train, X_test], "1")

# Feature Encoder - Feature Embeddings
W_enc = Tensor([[1, 0.5], [0.5, 1], [0.3, 0.7], [0.7, 0.3]])
W_enc_transpose = W_enc.transpose()
b_enc = Tensor([
    [0.1, 0.2, 0.3, 0.4],
    [0.1, 0.2, 0.3, 0.4],
    [0.1, 0.2, 0.3, 0.4],
])
box("Feature Encoder", W_enc_transpose, "2")

# Feature/group embeddings (one row per feature group)
E_feat = Tensor([[0.1, 0.0, 0.0, 0.0], [0.0, 0.1, 0.0, 0.0]])
box("Group embedding", E_feat, "6")

# Step 1: Combine Training and Test Samples
X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
box("Training and Test Samples grouped", X_combined, "4")


# Step 1: Group Features
def group(X):
    """Encode every row of ``X`` as two feature-group embeddings.

    Each row is cut into consecutive windows of two features. Every window is
    multiplied by ``W_enc_transpose``, shifted by the row's entry of ``b_enc``
    and by the group's entry of ``E_feat``, and stored in the output.

    Args:
        X: Tensor whose ``.data`` iterates over rows of four features. The
            output buffer is hard-coded to 3 rows x 2 groups x 4 dims, which
            matches ``X_combined`` above.

    Returns:
        Tensor wrapping a ``(3, 2, 4)`` array of encoded groups.
    """
    X_encoded = np.zeros((3, 2, 4))
    # print(X_encoded)
    for row_idx, row in enumerate(X.data):
        for group_no, start in enumerate(range(0, len(row), 2)):
            group_window = Tensor(row[start:start + 2])
            group_matmul = (
                group_window.matmul(W_enc_transpose) + b_enc[row_idx]
            )
            X_encoded[row_idx][group_no] = (
                group_matmul.data + E_feat.data[group_no]
            )
            # Label groups 1 and 2. The previous code printed the slot index
            # *after* toggling it, so the second group showed up as "group 0".
            box(
                f"grouping: group {group_no + 1}",
                [group_window, group_matmul],
            )
    return Tensor(X_encoded)


X_encoded = group(X_combined)
box("X_encoded", X_encoded, "4")

# Label Encoder - Label Embeddings
W_y = Tensor([[1, -1, 0, 0], [0, 0, 1, 1]])
b_y = Tensor([0, 0, 0, 0])  # defined but never added in label_embeddings()
y_padded = Tensor([1, 0, np.nan])  # we want to mask y_test with nan
# Reshaped to 3 rows of 2 values: [1, 0], [0, 0], [0, 1].
y_clean = Tensor([[1, 0, 0], [0, 0, 1]]).reshape(3, 2)
box("y_clean", y_clean, "4")


def label_embeddings(y_train):
    """Project every row of ``y_train`` through ``W_y``.

    Args:
        y_train: Tensor whose ``.data`` iterates over rows of two values.
            The output buffer is hard-coded to 3 rows x 4 dims.

    Returns:
        Tensor wrapping a ``(3, 4)`` array holding one embedding per row.
    """
    lbl_embds = np.zeros((3, 4))
    for idx, row in enumerate(y_train.data):
        res = Tensor(row).matmul(W_y)
        lbl_embds[idx] = res.data
        box("Label Embeddings", [res], "5")
    return Tensor(lbl_embds)


label_embeds = label_embeddings(y_clean)
# print(label_embeds)

# Step 3: Add Thinking Tokens
Thinking_Tokens = Tensor([
    [
        [0.01, 0.02, 0.03, 0.04],
        [0.01, 0.02, 0.03, 0.04],
        [0.01, 0.02, 0.03, 0.04],
    ],
    [
        [0.05, 0.06, 0.07, 0.08],
        [0.05, 0.06, 0.07, 0.08],
        [0.05, 0.06, 0.07, 0.08],
    ],
])
box("Thinking Tokens", Thinking_Tokens, "4")

# Computing full model input:
# every data row is [group 1, group 2, label]; thinking rows go first.
labels_reshaped = label_embeds.data.reshape(3, 1, 4)
data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
E = Tensor(E_numpy)

# we need to adapt positional embeddings!
# Create per-token-slot (column) positional embeddings; the leading axis of
# size 1 lets the same three vectors be added to every row.
P_col_pos_embeds = Tensor([[
    [0.1, 0.1, 0.1, 0.1],
    [0.2, 0.2, 0.2, 0.2],
    [0.3, 0.3, 0.3, 0.3],
]])
# Add positional embeddings
E = E + P_col_pos_embeds
box("Positional Embedding", E, "9")

# Attention
W_q = Tensor(np.diag([0.1, 0.2, 0.1, 0.2]))
W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
W_v = Tensor(np.diag([1, 1, 1, 1]))
box("Attention weights", [W_q, W_k, W_v], "9")
scaling_factor = np.sqrt(4)
# labels = [E[1][2], E[2][2], E[2][2]]
col_att_softmax = Softmax()


def layer_norm_inplace(E: Tensor, eps=1e-5):
    """Normalize every vector of ``E`` along its last axis, in place.

    Each vector is shifted to zero mean and divided by
    ``sqrt(variance + eps)``. The result is written back into ``E.data``.

    Args:
        E: Tensor with layout (S, Ttok, D).
        eps: Value added to the variance before the square root.
    """
    x = E.data
    mean = x.mean(axis=-1, keepdims=True)
    centered = x - mean
    var = (centered * centered).mean(axis=-1, keepdims=True)
    x_norm = centered / np.sqrt(var + eps)
    box(
        "Layer norn",
        [Tensor(x), Tensor(mean), Tensor(var), Tensor(x_norm)],
        "7",
    )
    E.data[:] = x_norm


def column_attention_inplace(E: Tensor):
    """Self-attention across the token slots of each row, in place.

    For each item ``s``, ``X = E[s]`` has shape (Ttok=3, D=4). Attention is
    computed across those tokens and added back as a residual:
    ``E[s] <- E[s] + Attn(E[s])``.

    Args:
        E: Tensor with layout (S, Ttok, D); ``E.data`` is updated in place.
    """
    S, Ttok, D = E.shape
    softmax = Softmax()
    # The projection matrices do not change between rows: transpose once.
    W_q_t = W_q.transpose()
    W_k_t = W_k.transpose()
    W_v_t = W_v.transpose()
    scale = math.sqrt(D)
    for s in range(S):
        # Snapshot of current item (avoid in-place mixing during compute)
        X = Tensor(E.data[s].copy())               # (3,4)
        Q = X.matmul(W_q_t)                        # (3,4)
        K = X.matmul(W_k_t)                        # (3,4)
        V = X.matmul(W_v_t)                        # (3,4)
        scores = Q.matmul(K.transpose()) / scale   # (3,3)
        A = softmax.forward(scores, dim=-1)        # (3,3)
        O = A.matmul(V)                            # (3,4)
        box("column_attention", [Q, K, V, scores, A, O], "5")
        # In-place residual update of ALL tokens
        E.data[s] = E.data[s] + O.data


column_attention_inplace(E)
layer_norm_inplace(E)
box("Updated Logits", E + 0, "5")


def mlp_inplace(E: Tensor):
    """Minimal hand-friendly MLP with residual: ``x <- x + GELU(x)``.

    Args:
        E: Tensor whose ``.data`` is updated in place.
    """
    gelu = GELU()
    x = Tensor(E.data.copy())  # GELU is evaluated on a copy of E.data
    gx = gelu.forward(x).data
    E.data[:] = E.data + gx


def row_attention_inplace(E: Tensor, single_eval_pos: int):
    """Attention across rows for each token slot, in place.

    For each token slot ``t``:
        Q comes from all S items:            E[:, t, :]                -> (S, D)
        K, V come from the first Klen rows:  E[:single_eval_pos, t, :] -> (Klen, D)
    and the result is added back as a residual:
        E[:, t, :] <- E[:, t, :] + Attn_row(E[:, t, :])

    Args:
        E: Tensor with layout (S, Ttok, D); ``E.data`` is updated in place.
        single_eval_pos: Number of leading rows that serve as keys/values.
            Must satisfy ``0 < single_eval_pos <= S``.

    Raises:
        AssertionError: If ``single_eval_pos`` is outside ``1..S``.
    """
    S, Ttok, D = E.shape
    softmax = Softmax()
    Klen = single_eval_pos
    assert 0 < Klen <= S, "single_eval_pos must be between 1 and S"
    # The projection matrices do not change between slots: transpose once.
    W_q_t = W_q.transpose()
    W_k_t = W_k.transpose()
    W_v_t = W_v.transpose()
    scale = math.sqrt(D)
    for t in range(Ttok):
        # Snapshot streams (avoid in-place mixing)
        X_all = Tensor(E.data[:, t, :].copy())     # (S, D)
        X_kv = Tensor(E.data[:Klen, t, :].copy())  # (Klen, D)
        Q = X_all.matmul(W_q_t)                    # (S, D)
        K = X_kv.matmul(W_k_t)                     # (Klen, D)
        V = X_kv.matmul(W_v_t)                     # (Klen, D)
        scores = Q.matmul(K.transpose()) / scale   # (S, Klen)
        A = softmax.forward(scores, dim=-1)        # (S, Klen)
        O = A.matmul(V)                            # (S, D)
        # In-place residual update for this token slot
        box("row_attention", [Q, K, V, scores, A, O], "5")
        E.data[:, t, :] = E.data[:, t, :] + O.data


# Keys/values come from the first 4 rows (2 thinking rows + 2 training rows).
row_attention_inplace(E, single_eval_pos=4)
layer_norm_inplace(E)

# 3) MLP + LN
mlp_inplace(E)  # x <- x + GELU(x)
layer_norm_inplace(E)

# ============================================================
# Readout: take test row label token -> logits
# In this layout: rows are [think1, think2, train1, train2, test1]
#   test index = T + N_train = 4
#   label token index = 2
# ============================================================
test_row_idx = 4
label_tok_idx = 2  # last token slot
h_test = Tensor(
    E.data[test_row_idx, label_tok_idx, :].reshape(1, 4)
)  # (1,4)
gelu = GELU()
z = gelu.forward(h_test)  # (1,4)

# Simple head D->C (pick first 2 dims as logits)
W_out = Tensor([[1, 0], [0, 1], [0, 0], [0, 0]])  # (4,2)
b_out = Tensor([0.0, 0.0])
logits = z.matmul(W_out) + b_out  # (1,2)

print("h_test:", h_test.data)
print("z (GELU):", z.data)
print("logits:", logits.data)