# tiny-torch-viz / models / current_code.py
import math
import numpy as np

# Tensor, Softmax, GELU, and box are this repo's tiny autograd / visualization
# helpers; the exact import path below is an assumption:
# from tiny_torch_viz import Tensor, Softmax, GELU, box
# TabPFN: a tiny, hand-traceable forward pass (feature encoding, label encoding,
# thinking tokens, column/row attention, MLP, readout)
# training data
X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
Y_train = Tensor([1, 0])
X_test = Tensor([[9, 10, 11, 12]])
box("X_train", [X_train, Y_train, X_test], "1")
# Feature Encoder - Feature Embeddings
W_enc = Tensor([[1, 0.5], [0.5, 1], [0.3, 0.7], [0.7, 0.3]])
W_enc_transpose = W_enc.transpose()
b_enc = Tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]])  # one bias row per sample
box("Feature Encoder", W_enc_transpose, "2")
# Feature/group embeddings
E_feat = Tensor([[0.1, 0.0, 0.0, 0.0], [0.0, 0.1, 0.0, 0.0]])
box("Group embedding", E_feat, "6")
# Step 1: Combine Training and Test Samples
X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
box("Training and Test Samples grouped", X_combined, "4")
# Step 2: Group Features
def group(X):
    # Encode each sample by slicing its 4 features into two 2-feature windows,
    # projecting each window to d_model=4, and adding bias + group embedding.
    X_encoded = np.zeros((3, 2, 4))  # (samples, groups, d_model)
    for (group_idx, row) in enumerate(X.data):
        for (col, rt_ptr) in enumerate(range(0, len(row), 2)):
            group_window = Tensor(row[rt_ptr:rt_ptr + 2])
            group_matmul = group_window.matmul(W_enc_transpose) + b_enc[group_idx]
            X_encoded[group_idx][col] = group_matmul.data + E_feat.data[col]
            box(f"grouping: group {col + 1}", [group_window, group_matmul])
    return Tensor(X_encoded)
X_encoded = group(X_combined)
box("X_encoded", X_encoded, "4")
# Label Encoder - Label Embeddings
W_y = Tensor([[1, -1, 0, 0], [0, 0, 1, 1]])
b_y = Tensor([0, 0, 0, 0])
y_padded = Tensor([1, 0, np.nan])  # we want to mask y_test with NaN
# Each label becomes a [value, NaN-flag] pair: train -> [1, 0] and [0, 0];
# the masked test label -> [0, 1].
y_clean = Tensor([[1, 0, 0], [0, 0, 1]]).reshape(3, 2)
box("y_clean", y_clean, "4")
def label_embeddings(y_train):
    # Project each [value, NaN-flag] label pair to a d_model=4 embedding.
    lbl_embds = np.zeros((3, 4))
    for (idx, row) in enumerate(y_train.data):
        res = Tensor(row).matmul(W_y) + b_y
        lbl_embds[idx] = res.data
        box("Label Embeddings", [res], "5")
    return Tensor(lbl_embds)
label_embeds = label_embeddings(y_clean)
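# Added cross-check: the loop above is just y_clean @ W_y (+ zero bias).
assert np.allclose(label_embeds.data, y_clean.data @ W_y.data + b_y.data)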
# Step 3: Add Thinking Tokens
Thinking_Tokens = Tensor([
[[0.01, 0.02, 0.03, 0.04],
[0.01, 0.02, 0.03, 0.04],
[0.01, 0.02, 0.03, 0.04]],
[[0.05, 0.06, 0.07, 0.08],
[0.05, 0.06, 0.07, 0.08],
[0.05, 0.06, 0.07, 0.08]]
])
box("Thinking Tokens", Thinking_Tokens, "4")
# Assemble the full model input: thinking rows stacked on top of the (features + label) data rows
labels_reshaped = label_embeds.data.reshape(3, 1, 4)
data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
E = Tensor(E_numpy)
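# Added check: 2 thinking rows + 3 data rows, each with [group1, group2, label] tokens.
assert E.data.shape == (5, 3, 4)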
# Positional embeddings distinguish the 3 token slots within each row;
# the same (1, 3, 4) offsets are broadcast across all rows.
P_col_pos_embeds = Tensor([[[0.1, 0.1, 0.1, 0.1],
[0.2, 0.2, 0.2, 0.2],
[0.3, 0.3, 0.3, 0.3]]])
# Add positional embeddings
E = E + P_col_pos_embeds
box("Positional Embedding", E, "9")
# Attention
W_q = Tensor(np.diag([0.1, 0.2, 0.1, 0.2]))
W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
W_v = Tensor(np.diag([1, 1, 1, 1]))
box("Attention weights", [W_q, W_k, W_v], "9")
# Attention scores are scaled by sqrt(D) inside the attention blocks below.
def layer_norm_inplace(E: Tensor, eps=1e-5):
"""
In-place LN over last dim D for every vector in E.
E: (S, Ttok, D)
"""
x = E.data
mean = x.mean(axis=-1, keepdims=True)
var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
x_norm = (x - mean) / np.sqrt(var + eps)
box("Layer norn", [Tensor(x), Tensor(mean), Tensor(var), Tensor(x_norm)], "7")
E.data[:] = x_norm
def column_attention_inplace(E: Tensor):
"""
In-place column attention:
For each item s: X = E[s] has shape (Ttok=3, D=4)
Does self-attention across the 3 tokens and writes back:
E[s] <- E[s] + Attn(E[s])
"""
S, Ttok, D = E.shape
softmax = Softmax()
for s in range(S):
# Snapshot of current item (avoid in-place mixing during compute)
X = Tensor(E.data[s].copy()) # (3,4)
Q = X.matmul(W_q.transpose()) # (3,4)
K = X.matmul(W_k.transpose()) # (3,4)
V = X.matmul(W_v.transpose()) # (3,4)
scores = Q.matmul(K.transpose()) / math.sqrt(D) # (3,3)
A = softmax.forward(scores, dim=-1) # (3,3)
O = A.matmul(V) # (3,4)
box("column_attention", [Q, K, V, scores, A, O], "5")
# In-place residual update of ALL tokens
E.data[s] = E.data[s] + O.data
# 1) Column attention + LN
column_attention_inplace(E)
layer_norm_inplace(E)
box("Updated embeddings", E + 0, "5")  # E + 0 snapshots E for display
def mlp_inplace(E: Tensor):
"""
Minimal hand-friendly MLP with residual:
x <- x + GELU(x)
In-place.
"""
gelu = GELU()
x = Tensor(E.data.copy())
gx = gelu.forward(x).data
E.data[:] = E.data + gx
def row_attention_inplace(E: Tensor, single_eval_pos: int):
"""
In-place row attention:
For each token slot t:
Q from all S items: E[:, t, :] -> (S, D)
K,V from first Klen rows E[:single_eval_pos, t, :] -> (Klen, D)
Writes:
E[:, t, :] <- E[:, t, :] + Attn_row(E[:, t, :])
"""
S, Ttok, D = E.shape
softmax = Softmax()
Klen = single_eval_pos
assert 0 < Klen <= S, "single_eval_pos must be between 1 and S"
for t in range(Ttok):
# Snapshot streams (avoid in-place mixing)
X_all = Tensor(E.data[:, t, :].copy()) # (S, D)
X_kv = Tensor(E.data[:Klen, t, :].copy()) # (Klen, D)
Q = X_all.matmul(W_q.transpose()) # (S, D)
K = X_kv.matmul(W_k.transpose()) # (Klen, D)
V = X_kv.matmul(W_v.transpose()) # (Klen, D)
scores = Q.matmul(K.transpose()) / math.sqrt(D) # (S, Klen)
A = softmax.forward(scores, dim=-1) # (S, Klen)
O = A.matmul(V) # (S, D)
# In-place residual update for this token slot
box("row_attention", [Q, K, V, scores, A, O], "5")
E.data[:, t, :] = E.data[:, t, :] + O.data
# 2) Row attention + LN
row_attention_inplace(E, single_eval_pos=4)  # K/V come from the first 4 rows (thinking + train)
layer_norm_inplace(E)
# 3) MLP + LN
mlp_inplace(E) # x <- x + GELU(x)
layer_norm_inplace(E)
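# Added guard: every in-place block above must preserve the activation shape.
assert E.data.shape == (5, 3, 4)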
# ============================================================
# Readout: take test row label token -> logits
# In this layout: rows are [think1, think2, train1, train2, test1]
# test index = T + N_train = 4
# label token index = 2
# ============================================================
test_row_idx = 4  # = T (2 thinking rows) + N_train (2)
label_tok_idx = 2 # last token slot
h_test = Tensor(E.data[test_row_idx, label_tok_idx, :].reshape(1, 4)) # (1,4)
gelu = GELU()
z = gelu.forward(h_test) # (1,4)
# Simple head D->C (pick first 2 dims as logits)
W_out = Tensor([[1, 0],
[0, 1],
[0, 0],
[0, 0]]) # (4,2)
b_out = Tensor([0.0, 0.0])
logits = z.matmul(W_out) + b_out # (1,2)
print("h_test:", h_test.data)
print("z (GELU):", z.data)
print("logits:", logits.data)