import math

import numpy
import numpy as np

# TabPFN

# Training data: two labelled samples with four features each, plus one
# unlabelled test sample.
X_train = Tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
Y_train = Tensor([1, 0])
X_test = Tensor([[9, 10, 11, 12]])

box("X_train", [X_train, Y_train, X_test], "1")

# Feature Encoder - Feature Embeddings
# W_enc is written as a 4x2 literal; group() multiplies each 2-feature window
# by its transpose to obtain a 4-dimensional embedding.
W_enc = Tensor([[1, 0.5], [0.5, 1], [0.3, 0.7], [0.7, 0.3]])
W_enc_transpose = W_enc.transpose()
# One bias row per sample (three identical rows); group() indexes it by the
# sample's row index.
b_enc = Tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]])

box("Feature Encoder", W_enc_transpose, "2")

# Feature/group embeddings: row 0 is added to the first feature group,
# row 1 to the second (see group()).
E_feat = Tensor([[0.1, 0.0, 0.0, 0.0], [0.0, 0.1, 0.0, 0.0]])

box("Group embedding", E_feat, "6")

# Step 1: Combine Training and Test Samples
# Stack the training rows on top of the test row so all samples are encoded
# in one pass.
X_combined = Tensor(np.vstack([X_train.data, X_test.data]))
box("Training and Test Samples grouped", X_combined, "4")


# Step 2: Group Features

def group(X):
    """Encode every sample of X as a sequence of feature-group embeddings.

    Each row of ``X.data`` is cut into consecutive windows of two features.
    A window is projected with ``W_enc_transpose``, shifted by the bias row
    ``b_enc[row_idx]`` of its sample, and finally offset by the group
    embedding ``E_feat.data[g]`` of its position within the row.

    Args:
        X: Tensor holding one sample per row; ``X.shape[0]`` is the number
            of samples.

    Returns:
        A Tensor wrapping an array of shape
        ``(X.shape[0], len(E_feat.data), W_enc.shape[0])``.

    Note:
        A row that yields more windows than ``E_feat`` has rows raises an
        IndexError instead of silently overwriting an earlier group.
    """
    group_size = 2  # features per group window
    n_rows = X.shape[0]
    n_groups = len(E_feat.data)  # one learned offset per feature group
    embed_dim = W_enc.shape[0]  # width of a projected window
    X_encoded = np.zeros((n_rows, n_groups, embed_dim))

    for row_idx, row in enumerate(X.data):
        for g, start in enumerate(range(0, len(row), group_size)):
            group_window = Tensor(row[start:start + group_size])
            group_matmul = group_window.matmul(W_enc_transpose) + b_enc[row_idx]
            X_encoded[row_idx][g] = group_matmul.data + E_feat.data[g]
            # Visualise the projection before the group offset is added;
            # groups are numbered from 1 in the title.
            box(f"grouping: group {g + 1}", [group_window, group_matmul])

    return Tensor(X_encoded)


# Encode all samples (train + test) into per-group feature embeddings.
X_encoded = group(X_combined)
box("X_encoded", X_encoded, "4")

# Label Encoder - Label Embeddings
# W_y is written as a 2x4 literal: it maps a 2-value label row to a
# 4-dimensional embedding in label_embeddings().
W_y = Tensor([[1, -1, 0, 0], [0, 0, 1, 1]])
# NOTE(review): b_y is defined but not referenced anywhere else in this file;
# label_embeddings() only multiplies by W_y.
b_y = Tensor([0, 0, 0, 0])
# NOTE(review): y_padded is not referenced anywhere else in this file.
y_padded = Tensor([1, 0, np.nan])  # we want to mask y_test with NaN
# Assuming a NumPy-style row-major reshape, this yields the rows
# [1, 0], [0, 0], [0, 1] -- presumably (label value, label-is-missing flag)
# per sample, with the test sample flagged as missing; confirm.
y_clean = Tensor([[1, 0, 0], [0, 0, 1]]).reshape(3, 2)
box("y_clean", y_clean, "4")


def label_embeddings(y_train):
    """Project every label row through ``W_y`` and stack the results.

    Args:
        y_train: Tensor whose ``data`` iterates over one label row per
            sample; each row must be multipliable with ``W_y``.

    Returns:
        A Tensor wrapping an array of shape
        ``(len(y_train.data), W_y.shape[1])`` with one embedding per sample.
    """
    # Size the output from the inputs instead of assuming 3 samples x 4 dims.
    lbl_embds = np.zeros((len(y_train.data), W_y.shape[1]))
    for idx, row in enumerate(y_train.data):
        res = Tensor(row).matmul(W_y)
        lbl_embds[idx] = res.data
        box("Label Embeddings", [res], "5")

    return Tensor(lbl_embds)


# One 4-dimensional label embedding per sample (train and test).
label_embeds = label_embeddings(y_clean)
# print(label_embeds)

# Step 3: Add Thinking Tokens
# Two extra rows, each with three token slots of width 4, that are placed in
# front of the data rows below.
Thinking_Tokens = Tensor([
    [[0.01, 0.02, 0.03, 0.04],
     [0.01, 0.02, 0.03, 0.04],
     [0.01, 0.02, 0.03, 0.04]],

    [[0.05, 0.06, 0.07, 0.08],
     [0.05, 0.06, 0.07, 0.08],
     [0.05, 0.06, 0.07, 0.08]]
])
box("Thinking Tokens", Thinking_Tokens, "4")

# Computing full model input

# Give each sample's label embedding its own token slot: (3, 4) -> (3, 1, 4).
labels_reshaped = label_embeds.data.reshape(3, 1, 4)
# Append the label slot after the two feature-group slots of every sample.
data_rows = np.concatenate([X_encoded.data, labels_reshaped], axis=1)
# Thinking rows first, then the data rows: [think1, think2, train1, train2, test1].
E_numpy = np.concatenate([Thinking_Tokens.data, data_rows], axis=0)
E = Tensor(E_numpy)

# we need to adapt positional embeddings!
# Create positional embeddings: one vector per token slot (axis 1), written
# with a leading axis of size 1.
# NOTE(review): the original comment said "row" positional embeddings, but the
# variable name and the shape indicate per-column (token slot) embeddings.
P_col_pos_embeds = Tensor([[[0.1, 0.1, 0.1, 0.1],
                            [0.2, 0.2, 0.2, 0.2],
                            [0.3, 0.3, 0.3, 0.3]]])

# Add positional embeddings
# Presumably Tensor addition broadcasts the leading size-1 axis over all rows
# of E -- confirm against the Tensor implementation.
E = E + P_col_pos_embeds
box("Positional Embedding", E, "9")

# Attention
# Diagonal projection matrices shared by column_attention_inplace() and
# row_attention_inplace().
W_q = Tensor(np.diag([0.1, 0.2, 0.1, 0.2]))
W_k = Tensor(np.diag([0.1, 0.1, 0.1, 0.1]))
W_v = Tensor(np.diag([1, 1, 1, 1]))

box("Attention weights", [W_q, W_k, W_v], "9")
# NOTE(review): scaling_factor is not referenced elsewhere in this file; the
# attention functions compute math.sqrt(D) themselves.
scaling_factor = np.sqrt(4)

# labels = [E[1][2], E[2][2], E[2][2]]
# NOTE(review): col_att_softmax is not referenced elsewhere in this file; the
# attention functions create their own Softmax().
col_att_softmax = Softmax()


def layer_norm_inplace(E: Tensor, eps=1e-5):
    """Normalise every vector of E over its last axis, in place.

    Each vector is shifted to zero mean and scaled by the square root of its
    (biased) variance plus ``eps``. No learned scale or shift is applied.

    Args:
        E: Tensor whose ``data`` is an array of shape (S, Ttok, D); it is
            overwritten with the normalised values.
        eps: Constant added to the variance before the square root to avoid
            division by zero.

    Returns:
        None. ``E.data`` is modified in place.
    """
    x = E.data
    mean = x.mean(axis=-1, keepdims=True)
    centered = x - mean
    var = (centered * centered).mean(axis=-1, keepdims=True)
    x_norm = centered / np.sqrt(var + eps)
    # x aliases E.data, which is overwritten just below; box a snapshot so the
    # displayed input cannot end up showing the normalised values.
    # NOTE(review): the title reads "norn"; kept verbatim.
    box("Layer norn", [Tensor(x.copy()), Tensor(mean), Tensor(var), Tensor(x_norm)], "7")
    E.data[:] = x_norm

def column_attention_inplace(E: Tensor):
    """Self-attention across the token slots of every item, with residual.

    For each item ``s`` the tokens ``E[s]`` (Ttok x D) attend to one another
    using the module-level projections ``W_q``, ``W_k`` and ``W_v``. The
    attention output is added back onto the item:

        E[s] <- E[s] + Attn(E[s])

    Args:
        E: Tensor of shape (S, Ttok, D); updated in place.

    Returns:
        None.
    """
    num_items, _, d_model = E.shape
    attn_softmax = Softmax()

    for item in range(num_items):
        # Work on a copy so the projections never see partially updated data.
        tokens = Tensor(E.data[item].copy())  # (Ttok, D)

        queries = tokens.matmul(W_q.transpose())  # (Ttok, D)
        keys = tokens.matmul(W_k.transpose())  # (Ttok, D)
        values = tokens.matmul(W_v.transpose())  # (Ttok, D)

        # Scaled dot-product scores between every pair of token slots.
        scaled_scores = queries.matmul(keys.transpose()) / math.sqrt(d_model)  # (Ttok, Ttok)
        weights = attn_softmax.forward(scaled_scores, dim=-1)  # (Ttok, Ttok)
        mixed = weights.matmul(values)  # (Ttok, D)

        box("column_attention", [queries, keys, values, scaled_scores, weights, mixed], "5")

        # Residual connection: every token slot of this item is updated.
        E.data[item] = E.data[item] + mixed.data


# 1) Column attention + LN: both calls mutate E in place.
column_attention_inplace(E)
layer_norm_inplace(E)
# `E + 0` presumably produces a fresh Tensor so the boxed values are not
# affected by the later in-place updates of E -- confirm against box().
box("Updated Logits", E + 0, "5")

def mlp_inplace(E: Tensor):
    """Apply a weight-free "MLP" with residual connection, in place.

    The update is ``x <- x + GELU(x)`` for every element of ``E``; there are
    no learned weights, which keeps the step easy to follow by hand.

    Args:
        E: Tensor whose ``data`` is overwritten with the result.

    Returns:
        None.
    """
    activation = GELU()
    # Feed the activation a copy so it cannot alias the buffer written below.
    snapshot = Tensor(E.data.copy())
    activated = activation.forward(snapshot)
    E.data[:] = E.data + activated.data


def row_attention_inplace(E: Tensor, single_eval_pos: int):
    """Attention across rows (items) for every token slot, with residual.

    For each token slot ``t``:
      * queries come from all S rows:            E[:, t, :]                 -> (S, D)
      * keys/values come from the leading rows:  E[:single_eval_pos, t, :]  -> (Klen, D)
    and the result is added back:

        E[:, t, :] <- E[:, t, :] + Attn_row(E[:, t, :])

    Args:
        E: Tensor of shape (S, Ttok, D); updated in place.
        single_eval_pos: Number of leading rows that may be attended to
            (Klen); must satisfy ``0 < single_eval_pos <= S``.

    Returns:
        None.
    """
    num_rows, num_slots, d_model = E.shape
    attn_softmax = Softmax()

    context_len = single_eval_pos
    assert 0 < context_len <= num_rows, "single_eval_pos must be between 1 and S"

    for slot in range(num_slots):
        # Copies keep the projections independent of the in-place write below.
        query_src = Tensor(E.data[:, slot, :].copy())  # (S, D)
        context_src = Tensor(E.data[:context_len, slot, :].copy())  # (Klen, D)

        queries = query_src.matmul(W_q.transpose())  # (S, D)
        keys = context_src.matmul(W_k.transpose())  # (Klen, D)
        values = context_src.matmul(W_v.transpose())  # (Klen, D)

        # Every row scores itself against the context rows only.
        scaled_scores = queries.matmul(keys.transpose()) / math.sqrt(d_model)  # (S, Klen)
        weights = attn_softmax.forward(scaled_scores, dim=-1)  # (S, Klen)
        mixed = weights.matmul(values)  # (S, D)

        box("row_attention", [queries, keys, values, scaled_scores, weights, mixed], "5")
        # Residual connection for this token slot across all rows.
        E.data[:, slot, :] = E.data[:, slot, :] + mixed.data


# 2) Row attention + LN
# Keys/values come from the first 4 rows only (think1, think2, train1, train2),
# so the test row is queried but never attended to.
row_attention_inplace(E, single_eval_pos=4)
layer_norm_inplace(E)


# 3) MLP + LN
mlp_inplace(E)          # x <- x + GELU(x)
layer_norm_inplace(E)

# ============================================================
# Readout: take test row label token -> logits
# In this layout: rows are [think1, think2, train1, train2, test1]
# test index = T + N_train = 4
# label token index = 2
# ============================================================

test_row_idx = 4       # 2 thinking rows + 2 training rows precede the test row
label_tok_idx = 2                 # last token slot

# Pull out the test row's label token and keep it 2-D for the matmul below.
h_test = Tensor(E.data[test_row_idx, label_tok_idx, :].reshape(1, 4))  # (1,4)

gelu = GELU()
z = gelu.forward(h_test)  # (1,4)

# Simple head D->C (pick first 2 dims as logits)
# W_out copies the first two entries of z and drops the last two.
W_out = Tensor([[1, 0],
                [0, 1],
                [0, 0],
                [0, 0]])  # (4,2)
b_out = Tensor([0.0, 0.0])

logits = z.matmul(W_out) + b_out  # (1,2)

# Print the intermediate and final readout values.
print("h_test:", h_test.data)
print("z (GELU):", z.data)
print("logits:", logits.data)