File size: 7,151 Bytes
30bbad7
 
b0b05a2
30bbad7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
 
30bbad7
b0b05a2
30bbad7
 
 
b0b05a2
30bbad7
 
b0b05a2
30bbad7
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
30bbad7
b0b05a2
30bbad7
b0b05a2
30bbad7
b0b05a2
aa2ae84
f393371
30bbad7
 
 
 
 
 
b0b05a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa2ae84
b0b05a2
 
 
 
 
 
30bbad7
 
b0b05a2
 
 
 
30bbad7
b0b05a2
 
 
 
 
30bbad7
 
 
 
 
 
 
 
b0b05a2
30bbad7
 
 
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
30bbad7
 
b0b05a2
 
 
 
aa2ae84
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa2ae84
f5a78dd
b0b05a2
 
 
 
 
aa2ae84
f5a78dd
b0b05a2
aa2ae84
 
 
 
 
 
 
 
 
 
 
 
b0b05a2
f5a78dd
b0b05a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30bbad7
b0b05a2
 
 
30bbad7
b0b05a2
 
 
30bbad7
b0b05a2
aa2ae84
b0b05a2
aa2ae84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import math

import numpy
import numpy as np

# TabPFN
# Hand-traceable, toy-sized walkthrough of a TabPFN-style forward pass.

# training data: 2 train samples with 4 features each, 1 test sample
X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
Y_train = Tensor([1, 0])
X_test = Tensor([[9, 10, 11, 12]])

box("X_train", [X_train, Y_train, X_test], "1")

# Feature Encoder - Feature Embeddings
# W_enc (used transposed below) maps a 2-feature group to a 4-dim embedding.
W_enc = Tensor([[1, 0.5], [0.5, 1], [0.3, 0.7], [0.7, 0.3]])
W_enc_transpose = W_enc.transpose()
# One bias row per sample: train1, train2, test (indexed by sample in group()).
b_enc = Tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]])

box("Feature Encoder", W_enc_transpose, "2")

# Feature/group embeddings: one 4-dim additive offset per feature group.
E_feat = Tensor([[0.1, 0.0, 0.0, 0.0], [0.0, 0.1, 0.0, 0.0]])

box("Group embedding", E_feat, "6")

# Step 1: Combine Training and Test Samples
# (fixed accidental duplicated assignment: `X_combined = X_combined = ...`)
X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
box("Training and Test Samples grouped", X_combined, "4")
def group(X):
    """Encode each sample by splitting its 4 features into 2 groups of 2.

    Every consecutive 2-feature window of a row is projected through the
    shared feature encoder (W_enc_transpose plus the sample's b_enc bias row)
    and offset by its group embedding from E_feat.

    Parameters:
        X: Tensor of shape (n_samples, 4) -- samples to encode.

    Returns:
        Tensor of shape (n_samples, 2, 4): per-sample group encodings.
    """
    # Size the output from the input instead of hard-coding 3 samples.
    X_encoded = np.zeros((X.shape[0], 2, 4))  # (samples, groups, embed dim)
    for sample_idx, row in enumerate(X.data):
        col = 0
        for rt_ptr in range(0, len(row), 2):
            group_window = Tensor(row[rt_ptr:rt_ptr + 2])
            group_matmul = group_window.matmul(W_enc_transpose) + b_enc[sample_idx]
            if col == 0:
                # group 1: first feature pair
                X_encoded[sample_idx][0] = group_matmul.data + E_feat.data[0]
                label = 1
                col = 1
            else:
                # group 2: second feature pair
                X_encoded[sample_idx][1] = group_matmul.data + E_feat.data[1]
                label = 2
                col = 0
            # Label the groups 1/2 explicitly (the original printed the
            # already-toggled `col`, mislabelling the second group "group 0").
            box(f"grouping: group {label}", [group_window, group_matmul])
    return Tensor(X_encoded)


X_encoded = group(X_combined)
box("X_encoded", X_encoded, "4")

# Label Encoder - Label Embeddings
# W_y maps a 2-dim label vector to a 4-dim embedding; b_y is a zero bias.
W_y = Tensor([[1, -1, 0, 0], [0, 0, 1, 1]])
b_y = Tensor([0, 0, 0, 0])
y_padded = Tensor([1, 0, np.nan])  # we want to mask y_test with nan
# Reshape to one 2-dim label row per sample (train1, train2, test).
y_clean = Tensor([[1, 0, 0], [0, 0, 1]]).reshape(3, 2)
box("y_clean", y_clean, "4")


def label_embeddings(y_train):
    """Project each label row through the label encoder W_y.

    Parameters:
        y_train: Tensor of shape (n_samples, 2) -- per-sample label vectors.

    Returns:
        Tensor of shape (n_samples, W_y.shape[1]): one embedding per sample.
    """
    # Size the output from the input instead of hard-coding (3, 4).
    lbl_embds = np.zeros((len(y_train.data), W_y.shape[1]))
    for idx, row in enumerate(y_train.data):
        res = Tensor(row).matmul(W_y)
        lbl_embds[idx] = res.data
        box("Label Embeddings", [res], "5")

    return Tensor(lbl_embds)


label_embeds = label_embeddings(y_clean)
# print(label_embeds)

# Step 3: Add Thinking Tokens
# Two extra "thinking" items prepended to the sequence below; each repeats
# a single 4-dim vector across all 3 token slots.
Thinking_Tokens = Tensor([
    [[0.01, 0.02, 0.03, 0.04],
     [0.01, 0.02, 0.03, 0.04],
     [0.01, 0.02, 0.03, 0.04]],

    [[0.05, 0.06, 0.07, 0.08],
     [0.05, 0.06, 0.07, 0.08],
     [0.05, 0.06, 0.07, 0.08]]
])
box("Thinking Tokens", Thinking_Tokens, "4")

# Computing full model input
# Sequence layout along axis 0: [think1, think2, train1, train2, test1];
# each item carries 3 token slots: [group1, group2, label].

labels_reshaped = label_embeds.data.reshape(3, 1, 4)
data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
E = Tensor(E_numpy)

# we need to adapt positional embeddings!
# Create row positional embeddings: one 4-dim vector per token slot, shape
# (1, 3, 4) so it applies to every item (assumes NumPy-style broadcasting
# in Tensor addition -- confirm).
P_col_pos_embeds = Tensor([[[0.1, 0.1, 0.1, 0.1],
                            [0.2, 0.2, 0.2, 0.2],
                            [0.3, 0.3, 0.3, 0.3]]])

# Add positional embeddings
E = E + P_col_pos_embeds
box("Positional Embedding", E, "9")

# Attention
# Diagonal projection matrices keep the hand calculations easy to follow.
W_q = Tensor(np.diag([0.1, 0.2, 0.1, 0.2]))
W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
W_v = Tensor(np.diag([1, 1, 1, 1]))

box("Attention weights", [W_q, W_k, W_v], "9")
# NOTE(review): scaling_factor appears unused -- the attention helpers below
# compute math.sqrt(D) themselves; confirm before removing.
scaling_factor = np.sqrt(4)

# labels = [E[1][2], E[2][2], E[2][2]]
# NOTE(review): col_att_softmax also looks unused (each helper constructs its
# own Softmax); verify against any code outside this view.
col_att_softmax = Softmax()


def layer_norm_inplace(E: Tensor, eps=1e-5):
    """
    In-place LN over last dim D for every vector in E.
    E: (S, Ttok, D)
    eps: numerical-stability constant added to the variance before sqrt.
    """
    x = E.data
    mean = x.mean(axis=-1, keepdims=True)
    # Biased (population) variance over the last axis.
    var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
    x_norm = (x - mean) / np.sqrt(var + eps)
    # Fixed typo in the box label ("Layer norn" -> "Layer norm").
    box("Layer norm", [Tensor(x), Tensor(mean), Tensor(var), Tensor(x_norm)], "7")
    E.data[:] = x_norm

def column_attention_inplace(E: Tensor):
    """
    In-place column attention.

    For each item s, the slice E[s] of shape (Ttok=3, D=4) self-attends
    across its 3 tokens, and the result is added back residually:
        E[s] <- E[s] + Attn(E[s])
    """
    num_items, _, dim = E.shape
    attn_softmax = Softmax()

    for item in range(num_items):
        # Work on a snapshot so the residual write cannot feed back mid-compute.
        tokens = Tensor(E.data[item].copy())  # (3, 4)

        queries = tokens.matmul(W_q.transpose())  # (3, 4)
        keys = tokens.matmul(W_k.transpose())     # (3, 4)
        values = tokens.matmul(W_v.transpose())   # (3, 4)

        scores = queries.matmul(keys.transpose()) / math.sqrt(dim)  # (3, 3)
        weights = attn_softmax.forward(scores, dim=-1)              # (3, 3)
        attended = weights.matmul(values)                           # (3, 4)

        box("column_attention", [queries, keys, values, scores, weights, attended], "5")

        # Residual, in-place update of every token of this item.
        E.data[item] = E.data[item] + attended.data


# One column-attention layer followed by layer norm (both in place).
column_attention_inplace(E)
layer_norm_inplace(E)
box("Updated Logits", E + 0, "5")  # E + 0 presumably yields a fresh Tensor copy -- confirm Tensor.__add__

def mlp_inplace(E: Tensor):
    """
    Minimal hand-friendly MLP with a residual connection:
        x <- x + GELU(x)
    Updates E in place.
    """
    activation = GELU()
    # Snapshot first so the activation is computed on pre-update values.
    snapshot = Tensor(E.data.copy())
    E.data[:] = E.data + activation.forward(snapshot).data



def row_attention_inplace(E: Tensor, single_eval_pos: int):
    """
    In-place row attention.

    For each token slot t:
      Q from all S items:      E[:, t, :] -> (S, D)
      K,V from first Klen rows E[:single_eval_pos, t, :] -> (Klen, D)
    Writes:
      E[:, t, :] <- E[:, t, :] + Attn_row(E[:, t, :])

    Raises:
        ValueError: if single_eval_pos is not in (0, S].
    """
    S, Ttok, D = E.shape
    softmax = Softmax()

    Klen = single_eval_pos
    # Validate with a real exception: `assert` is stripped under `python -O`.
    if not 0 < Klen <= S:
        raise ValueError("single_eval_pos must be between 1 and S")

    for t in range(Ttok):
        # Snapshot streams (avoid in-place mixing)
        X_all = Tensor(E.data[:, t, :].copy())  # (S, D)
        X_kv = Tensor(E.data[:Klen, t, :].copy())  # (Klen, D)

        Q = X_all.matmul(W_q.transpose())  # (S, D)
        K = X_kv.matmul(W_k.transpose())  # (Klen, D)
        V = X_kv.matmul(W_v.transpose())  # (Klen, D)

        scores = Q.matmul(K.transpose()) / math.sqrt(D)  # (S, Klen)
        A = softmax.forward(scores, dim=-1)  # (S, Klen)
        O = A.matmul(V)  # (S, D)

        # In-place residual update for this token slot
        box("row_attention", [Q, K, V, scores, A, O], "5")
        E.data[:, t, :] = E.data[:, t, :] + O.data


# single_eval_pos=4: queries come from all 5 items, but keys/values only from
# the first 4 (thinking tokens + train rows), so the test row cannot attend
# to itself via K/V.
row_attention_inplace(E, single_eval_pos=4)
layer_norm_inplace(E)


# 3) MLP + LN
mlp_inplace(E)          # x <- x + GELU(x)
layer_norm_inplace(E)

# ============================================================
# Readout: take test row label token -> logits
# In this layout: rows are [think1, think2, train1, train2, test1]
# test index = T + N_train = 4
# label token index = 2
# ============================================================

test_row_idx = 4       # last item in the sequence (the test sample)
label_tok_idx = 2                 # last token slot

h_test = Tensor(E.data[test_row_idx, label_tok_idx, :].reshape(1, 4))  # (1,4)

gelu = GELU()
z = gelu.forward(h_test)  # (1,4)

# Simple head D->C (pick first 2 dims as logits)
W_out = Tensor([[1, 0],
                [0, 1],
                [0, 0],
                [0, 0]])  # (4,2)
b_out = Tensor([0.0, 0.0])

logits = z.matmul(W_out) + b_out  # (1,2)

print("h_test:", h_test.data)
print("z (GELU):", z.data)
print("logits:", logits.data)