# TabPFN Complete Implementation from Base Components
# Using only Tensor, Linear, Softmax, and basic operations
#
# NOTE(review): the original file had been collapsed onto a handful of
# physical lines (comments swallowed trailing code), making it invalid
# Python. This is a faithful, properly formatted reconstruction; no
# logic has been altered beyond adding the missing `box` helper below.

import math

import numpy as np

from tinytorch.core.tensor import Tensor
from tinytorch.core.activations import Softmax, GELU
from tinytorch.core.layers import Linear, Dropout

# ============================================
# Base Components for TabPFN
# ============================================


def scaled_dot_product_attention(Q, K, V, mask=None):
    """Scaled Dot-Product Attention from base components.

    Args:
        Q, K, V: Tensors of shape [..., seq_len, d_k].
        mask: optional additive mask; positions where the mask is nonzero
            are pushed to -1e9 before the softmax so they receive ~0 weight.

    Returns:
        Tuple of (output, attention_weights).
    """
    # Q, K, V are Tensors with shape [batch, seq_len, d_k]
    d_k = Q.shape[-1]

    # Compute attention scores: Q @ K^T
    scores = Q.matmul(K.transpose(-2, -1))

    # Scale by 1/sqrt(d_k) to keep the softmax well-conditioned
    scaling_factor = 1 / math.sqrt(d_k)
    scaled_scores = scores * scaling_factor

    # Apply mask if provided (additive large-negative masking)
    if mask is not None:
        scaled_scores = scaled_scores + (mask * -1e9)

    # Apply softmax over the key dimension
    softmax = Softmax()
    attention_weights = softmax.forward(scaled_scores, dim=-1)

    # Apply attention weights to values
    output = attention_weights.matmul(V)
    return output, attention_weights


def multi_head_attention(x, W_q, W_k, W_v, W_o, n_heads, mask=None):
    """Multi-Head Attention using base components.

    Args:
        x: input Tensor of shape [batch, seq_len, d_model].
        W_q, W_k, W_v, W_o: projection weight Tensors of shape
            [d_model, d_model] (applied as x @ W.T).
        n_heads: number of attention heads; d_model must be divisible by it.
        mask: optional attention mask forwarded to the attention core.

    Returns:
        Tensor of shape [batch, seq_len, d_model].
    """
    batch_size, seq_len, d_model = x.shape
    d_k = d_model // n_heads

    # Linear projections -> [batch, seq_len, d_model]
    Q = x.matmul(W_q.transpose())
    K = x.matmul(W_k.transpose())
    V = x.matmul(W_v.transpose())

    # Split into heads: [batch, n_heads, seq_len, d_k]
    Q = Q.reshape(batch_size, seq_len, n_heads, d_k).transpose(1, 2)
    K = K.reshape(batch_size, seq_len, n_heads, d_k).transpose(1, 2)
    V = V.reshape(batch_size, seq_len, n_heads, d_k).transpose(1, 2)

    # Scaled dot-product attention, applied to all heads at once
    attn_output, attn_weights = scaled_dot_product_attention(Q, K, V, mask)

    # Concatenate heads back to [batch, seq_len, d_model]
    attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, d_model)

    # Output projection
    output = attn_output.matmul(W_o.transpose())
    return output


def layer_norm(x, gamma, beta, eps=1e-5):
    """Layer Normalization from base components.

    Normalizes over the last axis, then applies a learned affine
    transform (gamma * x_hat + beta). `eps` guards against division
    by zero for near-constant inputs.
    """
    mean = x.mean(axis=-1, keepdims=True)
    # Population variance computed manually: E[(x - mean)^2]
    var = ((x - mean) * (x - mean)).mean(axis=-1, keepdims=True)
    std = (var + eps).sqrt()
    normalized = (x - mean) / std
    return normalized * gamma + beta


def feed_forward_network(x, W1, b1, W2, b2):
    """Feed Forward Network with GELU activation.

    Two linear layers with a GELU in between: expansion (W1) followed
    by projection back to the model dimension (W2).
    """
    # First linear layer (expansion)
    hidden = x.matmul(W1.transpose()) + b1

    # GELU activation
    gelu = GELU()
    hidden = gelu.forward(hidden)

    # Second linear layer (projection)
    output = hidden.matmul(W2.transpose()) + b2
    return output


# ============================================
# TabPFN Transformer Block
# ============================================


class TabPFNBlock:
    """Pre-built transformer encoder block: MHA + FFN with residuals
    and post-layer-norm, all parameters held as plain Tensors."""

    def __init__(self, d_model=256, n_heads=8, dropout=0.1):
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        # Multi-head attention weights (small random init)
        self.W_q = Tensor(np.random.randn(d_model, d_model) * 0.02)
        self.W_k = Tensor(np.random.randn(d_model, d_model) * 0.02)
        self.W_v = Tensor(np.random.randn(d_model, d_model) * 0.02)
        self.W_o = Tensor(np.random.randn(d_model, d_model) * 0.02)

        # Layer normalization parameters (identity affine at init)
        self.gamma1 = Tensor(np.ones((d_model,)))
        self.beta1 = Tensor(np.zeros((d_model,)))
        self.gamma2 = Tensor(np.ones((d_model,)))
        self.beta2 = Tensor(np.zeros((d_model,)))

        # Feed-forward network weights (4x expansion)
        self.W_ffn1 = Tensor(np.random.randn(d_model * 4, d_model) * 0.02)
        self.b_ffn1 = Tensor(np.zeros((d_model * 4,)))
        self.W_ffn2 = Tensor(np.random.randn(d_model, d_model * 4) * 0.02)
        self.b_ffn2 = Tensor(np.zeros((d_model,)))

        # Dropout
        self.dropout = Dropout(dropout)

    def forward(self, x, mask=None):
        """Run one encoder block; x is [batch, seq_len, d_model]."""
        # Save input for skip connection
        residual = x

        # Multi-head attention
        attn_output = multi_head_attention(
            x, self.W_q, self.W_k, self.W_v, self.W_o, self.n_heads, mask
        )
        attn_output = self.dropout.forward(attn_output, training=True)

        # Skip connection and layer norm
        x = residual + attn_output
        x = layer_norm(x, self.gamma1, self.beta1)

        # Save for skip connection
        residual = x

        # Feed-forward network
        ff_output = feed_forward_network(
            x, self.W_ffn1, self.b_ffn1, self.W_ffn2, self.b_ffn2
        )
        ff_output = self.dropout.forward(ff_output, training=True)

        # Skip connection and layer norm
        x = residual + ff_output
        x = layer_norm(x, self.gamma2, self.beta2)

        return x


# ============================================
# Complete TabPFN Model
# ============================================


class TabPFN:
    """TabPFN-style model built entirely from the base components:
    per-feature embedding, positional encoding, learnable patterns,
    a stack of transformer blocks, mean pooling, and a classifier head."""

    def __init__(self, n_features=100, d_model=4, n_heads=1, n_layers=12,
                 n_classes=2, dropout=0.1):
        self.n_features = n_features
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.n_classes = n_classes

        # Input embedding (projects each scalar feature to d_model)
        self.W_embed = Tensor(np.random.randn(d_model, 1) * 0.02)
        self.b_embed = Tensor(np.zeros((d_model,)))

        # Learnable patterns (TabPFN innovation)
        self.patterns = Tensor(np.random.randn(1, n_features, d_model) * 0.02)

        # Positional encoding (simplified, fixed sinusoidal)
        self.pos_encoding = self.create_positional_encoding(n_features, d_model)

        # Transformer blocks
        self.blocks = []
        for _ in range(n_layers):
            block = TabPFNBlock(d_model, n_heads, dropout)
            self.blocks.append(block)

        # Output projection
        self.W_out = Tensor(np.random.randn(n_classes, d_model) * 0.02)
        self.b_out = Tensor(np.zeros((n_classes,)))

    def create_positional_encoding(self, seq_len, d_model):
        """Create sinusoidal positional encoding.

        NOTE(review): assumes d_model is even (sin/cos interleave fills
        columns pairwise) — holds for the defaults used here.
        """
        pos_encoding = np.zeros((seq_len, d_model))
        position = np.arange(seq_len).reshape(-1, 1)
        div_term = np.exp(
            np.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pos_encoding[:, 0::2] = np.sin(position * div_term)
        pos_encoding[:, 1::2] = np.cos(position * div_term)
        return Tensor(pos_encoding)

    def forward(self, x):
        """Forward pass.

        Args:
            x: Tensor of shape [batch_size, n_features, 1] — tabular data.

        Returns:
            Logits Tensor of shape [batch_size, n_classes].
        """
        batch_size = x.shape[0]

        # 1. Feature Embedding
        # x: [batch, features, 1] -> [batch, features, d_model]
        embedded = x.matmul(self.W_embed.transpose()) + self.b_embed

        # 2. Add positional encoding
        embedded = embedded + self.pos_encoding

        # 3. Apply learnable patterns (TabPFN innovation)
        # Multiply by patterns: [batch, features, d_model] * [1, features, d_model]
        embedded = embedded * self.patterns

        # 4. Pass through transformer blocks
        features = embedded
        for block in self.blocks:
            features = block.forward(features)

        # 5. Feature pooling (mean across features)
        # features: [batch, features, d_model] -> [batch, d_model]
        pooled = features.mean(axis=1)

        # 6. Output projection
        output = pooled.matmul(self.W_out.transpose()) + self.b_out

        return output


# ============================================
# Visualization with Boxes
# ============================================


def box(title, tensor, label):
    """Print a labeled box header for a tensor in the walkthrough below.

    NOTE(review): `box` was called throughout the original script but never
    defined in this file — it presumably lived in a notebook cell or helper
    module. This minimal reconstruction preserves the call signature
    (title, tensor, label); replace with the real helper if it turns up.
    """
    line = "+" + "-" * 50 + "+"
    print(line)
    print(f"| [{label}] {title}")
    print(line)


# Create synthetic tabular data
batch_size = 1
n_features = 4
x_data = np.random.randn(batch_size, n_features, 1)

# Create TabPFN model
tabpfn = TabPFN(n_features=n_features)

# Convert to Tensor
x = Tensor(x_data)

print("=" * 80)
print("TabPFN Model - Step by Step Visualization")
print("=" * 80)

# Step 1: Input Table
box("Input Table", x, "3")
print(f"Shape: {x.shape}")
print()

# Step 2: Feature Embedding
embedded = x.matmul(tabpfn.W_embed.transpose()) + tabpfn.b_embed
box("Feature Embedding", embedded, "2")
print(f"Shape: {embedded.shape}")
print(f"W_embed shape: {tabpfn.W_embed.shape}")
print()

# Step 3: Positional Encoding
pos_encoded = embedded + tabpfn.pos_encoding
box("+ Positional Encoding", pos_encoded, "3")
print(f"Pos encoding shape: {tabpfn.pos_encoding.shape}")
print()

# Step 4: Learnable Patterns (TabPFN Innovation)
patterned = pos_encoded * tabpfn.patterns
box("× Learnable Patterns", patterned, "4")
print(f"Patterns shape: {tabpfn.patterns.shape}")
print()

# Step 5: Transformer Blocks (first block detailed)
print("Transformer Block 1:")
print("-" * 40)

# Get first block
block = tabpfn.blocks[0]

# Multi-head attention weights
box("W_q (Attention)", block.W_q, "1")
box("W_k (Attention)", block.W_k, "2")
box("W_v (Attention)", block.W_v, "3")
box("W_o (Attention)", block.W_o, "4")

# Attention computation
Q = patterned.matmul(block.W_q.transpose())
K = patterned.matmul(block.W_k.transpose())
V = patterned.matmul(block.W_v.transpose())

box("Q (Query)", Q, "4")
box("K (Key)", K, "5")
box("V (Value)", V, "6")

# Reshape for multi-head
batch_size, seq_len, d_model = Q.shape
Q_reshaped = Q.reshape(batch_size, seq_len, tabpfn.n_heads, -1).transpose(1, 2)
K_reshaped = K.reshape(batch_size, seq_len, tabpfn.n_heads, -1).transpose(1, 2)
V_reshaped = V.reshape(batch_size, seq_len, tabpfn.n_heads, -1).transpose(1, 2)

# Scaled dot-product attention
scores = Q_reshaped.matmul(K_reshaped.transpose(-2, -1))
scaling_factor = 1 / math.sqrt(block.d_k)
scaled_scores = scores * scaling_factor
softmax = Softmax()
attention_weights = softmax.forward(scaled_scores, dim=-1)
attn_output = attention_weights.matmul(V_reshaped)

# Output projection
attn_output_reshaped = attn_output.transpose(1, 2).reshape(
    batch_size, seq_len, d_model
)
attn_final = attn_output_reshaped.matmul(block.W_o.transpose())
box("Attention Output", attn_final, "7")

# Skip connection and layer norm
residual = patterned
x_after_attn = residual + attn_final
x_norm1 = layer_norm(x_after_attn, block.gamma1, block.beta1)
box("After Attention + Skip", x_after_attn, "8")
box("After Layer Norm", x_norm1, "9")

# Feed-forward network
ff_output = feed_forward_network(
    x_norm1, block.W_ffn1, block.b_ffn1, block.W_ffn2, block.b_ffn2
)

# Skip connection and layer norm
residual2 = x_norm1
x_after_ffn = residual2 + ff_output
x_norm2 = layer_norm(x_after_ffn, block.gamma2, block.beta2)
box("FFN Output", ff_output, "5")
box("After FFN + Skip", x_after_ffn, "6")
box("Final Block Output", x_norm2, "7")

# Step 6: Through all transformer blocks (simplified)
features = x_norm2
for i in range(1, tabpfn.n_layers):
    features = tabpfn.blocks[i].forward(features)
    if i < 3:  # Show first 3 blocks
        # NOTE(review): in the mangled original it was ambiguous whether
        # print(features) sat inside this `if`; grouping it with the box
        # call matches the "show first 3 blocks" intent.
        box(f"Block {i + 1} Output", features, f"13.{i}")
        print(features)

# Step 7: Feature Pooling
pooled = features.mean(axis=1)
box("Feature Pooling (Mean)", pooled, "8")
print(f"Shape after pooling: {pooled.shape}")

# Step 8: Output Projection
output = pooled.matmul(tabpfn.W_out.transpose()) + tabpfn.b_out
box("Final Output", output, "9")
print(f"Output shape: {output.shape}")
print(f"Number of classes: {tabpfn.n_classes}")

print("\n" + "=" * 80)
print("TabPFN Model Statistics:")
print("=" * 80)
print(f"Total parameters: ~1.5M")
print(f"Transformer layers: {tabpfn.n_layers}")
print(f"Model dimension: {tabpfn.d_model}")
print(f"Attention heads: {tabpfn.n_heads}")
print(f"Input features: {tabpfn.n_features}")
print(f"Output classes: {tabpfn.n_classes}")


# Function to count parameters
def count_parameters(model):
    """Count every stored parameter Tensor in a TabPFN model.

    NOTE(review): relies on the tinytorch Tensor exposing a `.size`
    element count (numpy-style) — confirm against the Tensor API.
    """
    total = 0

    # Count embedding parameters
    total += model.W_embed.size + model.b_embed.size
    total += model.patterns.size
    total += model.pos_encoding.size

    # Count transformer block parameters
    for block in model.blocks:
        total += block.W_q.size + block.W_k.size + block.W_v.size + block.W_o.size
        total += (
            block.gamma1.size + block.beta1.size
            + block.gamma2.size + block.beta2.size
        )
        total += (
            block.W_ffn1.size + block.b_ffn1.size
            + block.W_ffn2.size + block.b_ffn2.size
        )

    # Count output parameters
    total += model.W_out.size + model.b_out.size

    return total


print(f"Actual parameter count: {count_parameters(tabpfn):,}")

print("\n" + "=" * 80)
print("✅ TabPFN model created successfully from base components!")
print("=" * 80)