import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from contextlib import nullcontext
import math
from dataclasses import dataclass
from typing import Tuple, Optional, Literal
import torch.nn.functional as F
import torch.distributed as dist

from kernel import act_quant, weight_dequant, fp8_gemm

#####################################
# CONFIGURATION
#####################################

@dataclass
class ModelArgs:
    max_batch_size: int = 8
    max_seq_len: int = 2048
    dtype: Literal["bf16", "fp8"] = "bf16"
    scale_fmt: Optional[str] = None
    vocab_size: int = 102400
    dim: int = 1024
    inter_dim: int = 4096
    moe_inter_dim: int = 1024
    n_layers: int = 20
    n_dense_layers: int = 3
    n_heads: int = 12
    # moe
    n_routed_experts: int = 6
    n_shared_experts: int = 1
    n_activated_experts: int = 2
    route_scale: float = 1.
    use_routing_bias: bool = True  # Enable routing bias for fine-tuning expert selection
    # mla
    q_lora_rank: int = 0
    kv_lora_rank: int = 512
    qk_nope_head_dim: int = 128
    qk_rope_head_dim: int = 64
    v_head_dim: int = 128
    # yarn
    original_seq_len: int = 4096
    rope_theta: float = 10000.0
    rope_factor: float = 40
    beta_fast: int = 32
    beta_slow: int = 1
    mscale: float = 1.
    tokenizer_name: str = "gpt2"


# Others: module-level runtime globals. The linear kernels and the parallel
# layers below reference these names directly, so they must live at module
# scope rather than inside ModelArgs.
world_size = 1
rank = 0
block_size = 128
gemm_impl: Literal["bf16", "fp8"] = "bf16"

#####################################
# RoPE
#####################################

def precompute_freqs_cis(args: ModelArgs) -> torch.Tensor:
    dim = args.qk_rope_head_dim
    seqlen = args.max_seq_len
    beta_fast = args.beta_fast
    beta_slow = args.beta_slow
    base = args.rope_theta
    factor = args.rope_factor

    def find_correction_dim(num_rotations, dim, base, max_seq_len):
        return dim * math.log(max_seq_len / (num_rotations * 2 * math.pi)) / (2 * math.log(base))

    def find_correction_range(low_rot, high_rot, dim, base, max_seq_len):
        low = math.floor(find_correction_dim(low_rot, dim, base, max_seq_len))
        high = math.ceil(find_correction_dim(high_rot, dim, base, max_seq_len))
        return max(low, 0), min(high, dim - 1)

    def linear_ramp_factor(min, max, dim):
        if min == max:
            max += 0.001
        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
        ramp_func = torch.clamp(linear_func, 0, 1)
        return ramp_func

    freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    if seqlen > args.original_seq_len:
        # YaRN frequency correction: interpolate low-rotation dims, keep
        # high-rotation dims intact, and blend smoothly in between.
        low, high = find_correction_range(beta_fast, beta_slow, dim, base, args.original_seq_len)
        smooth = 1 - linear_ramp_factor(low, high, dim // 2)
        freqs = freqs / factor * (1 - smooth) + freqs * smooth

    t = torch.arange(seqlen)
    freqs = torch.outer(t, freqs)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    return freqs_cis


def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    dtype = x.dtype
    x = torch.view_as_complex(x.float().view(*x.shape[:-1], -1, 2))
    freqs_cis = freqs_cis.view(1, x.size(1), 1, x.size(-1))
    y = torch.view_as_real(x * freqs_cis).flatten(3)
    return y.to(dtype)
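
# A minimal, optional sanity check for the RoPE helpers above (not called at
# import time). The function name `_rope_shape_check` is our own illustration,
# not part of the model: it just confirms that `precompute_freqs_cis` yields a
# (max_seq_len, qk_rope_head_dim // 2) complex table and that
# `apply_rotary_emb` preserves the input shape.
def _rope_shape_check():
    args = ModelArgs()
    freqs_cis = precompute_freqs_cis(args)
    assert freqs_cis.shape == (args.max_seq_len, args.qk_rope_head_dim // 2)
    # (batch, seqlen, heads, rope_dim) query slice, as used by the attention below
    q_pe = torch.randn(2, 16, args.n_heads, args.qk_rope_head_dim)
    out = apply_rotary_emb(q_pe, freqs_cis[:16])
    assert out.shape == q_pe.shape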
#####################################
# LINEAR LAYERS
#####################################

def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None,
           scale_fmt: Optional[str] = None) -> torch.Tensor:
    if weight.element_size() > 1:
        return F.linear(x, weight, bias)
    elif gemm_impl == "bf16":
        weight = weight_dequant(weight, weight.scale)
        return F.linear(x, weight, bias)
    else:
        x, scale = act_quant(x, block_size, scale_fmt)
        y = fp8_gemm(x, scale, weight, weight.scale)
        if bias is not None:
            y += bias
        return y


class Linear(nn.Module):
    dtype = torch.float32
    scale_fmt: Optional[str] = None

    def __init__(self, in_features: int, out_features: int, bias: bool = False, dtype=None):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        param_dtype = dtype or Linear.dtype
        self.weight = nn.Parameter(torch.empty(out_features, in_features, dtype=param_dtype))
        if self.weight.element_size() == 1:
            # FP8 weights cannot be sampled with nn.init directly; they carry a
            # per-block scale tensor instead (initialized to 1.0).
            scale_out_features = (out_features + block_size - 1) // block_size
            scale_in_features = (in_features + block_size - 1) // block_size
            self.weight.scale = self.scale = nn.Parameter(
                torch.empty(scale_out_features, scale_in_features, dtype=torch.float32))
            nn.init.ones_(self.scale)
        else:
            # CRITICAL: initialize the weights with a proper distribution!
            nn.init.normal_(self.weight, mean=0.0, std=0.02 / math.sqrt(in_features))
            self.register_parameter("scale", None)
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features, dtype=param_dtype))
            nn.init.zeros_(self.bias)
        else:
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return linear(x, self.weight, self.bias, self.scale_fmt)


class ColumnParallelLinear(Linear):
    def __init__(self, in_features: int, out_features: int, bias: bool = False, dtype=None):
        assert out_features % world_size == 0, \
            f"Output features must be divisible by world size (world_size={world_size})"
        self.part_out_features = out_features // world_size
        super().__init__(in_features, self.part_out_features, bias, dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = linear(x, self.weight, self.bias, self.scale_fmt)
        return y


class RowParallelLinear(Linear):
    def __init__(self, in_features: int, out_features: int, bias: bool = False, dtype=None):
        assert in_features % world_size == 0, \
            f"Input features must be divisible by world size (world_size={world_size})"
        self.part_in_features = in_features // world_size
        super().__init__(self.part_in_features, out_features, bias, dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = linear(x, self.weight, scale_fmt=self.scale_fmt)
        if world_size > 1:
            dist.all_reduce(y)
        if self.bias is not None:
            y += self.bias
        return y


#####################################
# NORMALIZATION
#####################################

class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        # Keep weight in float32 for stability
        self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32))

    def forward(self, x: torch.Tensor):
        # Compute in float32, then cast back to the input dtype
        output = F.rms_norm(x.float(), (self.dim,), self.weight, self.eps)
        return output.to(x.dtype)
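
# For reference, RMSNorm as used above computes (a restatement of
# torch.nn.functional.rms_norm, not a separate implementation):
#
#     y = x / sqrt(mean(x**2, dim=-1) + eps) * weight
#
# i.e. LayerNorm without mean-centering and without a bias term, which is why
# a single `weight` vector is the only learned parameter.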
#####################################
# ATTENTION
#####################################

class MultiHeadLatentAttention(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.dim = args.dim
        self.n_heads = args.n_heads
        self.n_local_heads = args.n_heads // world_size
        self.q_lora_rank = args.q_lora_rank
        self.kv_lora_rank = args.kv_lora_rank
        self.qk_nope_head_dim = args.qk_nope_head_dim
        self.qk_rope_head_dim = args.qk_rope_head_dim
        self.qk_head_dim = args.qk_nope_head_dim + args.qk_rope_head_dim
        self.v_head_dim = args.v_head_dim

        if self.q_lora_rank == 0:
            self.wq = ColumnParallelLinear(self.dim, self.n_heads * self.qk_head_dim)
        else:
            self.wq_a = Linear(self.dim, self.q_lora_rank)
            self.q_norm = RMSNorm(self.q_lora_rank)
            self.wq_b = ColumnParallelLinear(self.q_lora_rank, self.n_heads * self.qk_head_dim)
        self.wkv_a = Linear(self.dim, self.kv_lora_rank + self.qk_rope_head_dim)
        self.kv_norm = RMSNorm(self.kv_lora_rank)
        self.wkv_b = ColumnParallelLinear(self.kv_lora_rank,
                                          self.n_heads * (self.qk_nope_head_dim + self.v_head_dim))
        self.wo = RowParallelLinear(self.n_heads * self.v_head_dim, self.dim)
        self.softmax_scale = self.qk_head_dim ** -0.5
        if args.max_seq_len > args.original_seq_len:
            mscale = 0.1 * args.mscale * math.log(args.rope_factor) + 1.0
            self.softmax_scale = self.softmax_scale * mscale * mscale

        self.register_buffer("kv_cache",
                             torch.zeros(args.max_batch_size, args.max_seq_len, self.kv_lora_rank,
                                         dtype=Linear.dtype), persistent=False)
        self.register_buffer("pe_cache",
                             torch.zeros(args.max_batch_size, args.max_seq_len, self.qk_rope_head_dim,
                                         dtype=Linear.dtype), persistent=False)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor,
                mask: Optional[torch.Tensor]):
        bsz, seqlen, _ = x.size()
        end_pos = start_pos + seqlen
        if self.q_lora_rank == 0:
            q = self.wq(x)
        else:
            q = self.wq_b(self.q_norm(self.wq_a(x)))
        q = q.view(bsz, seqlen, self.n_local_heads, self.qk_head_dim)
        q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
        q_pe = apply_rotary_emb(q_pe, freqs_cis)
        kv = self.wkv_a(x)
        kv, k_pe = torch.split(kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
        k_pe = apply_rotary_emb(k_pe.unsqueeze(2), freqs_cis).squeeze(2)

        # Absorb the wkv_b up-projection into the query so attention can run
        # directly against the compressed latent cache.
        wkv_b = self.wkv_b.weight if self.wkv_b.scale is None else \
            weight_dequant(self.wkv_b.weight, self.wkv_b.scale, block_size)
        wkv_b = wkv_b.view(self.n_local_heads, -1, self.kv_lora_rank)
        q_nope = torch.einsum("bshd,hdc->bshc", q_nope, wkv_b[:, :self.qk_nope_head_dim])

        kv = self.kv_norm(kv)
        # The caches hold detached copies for later decoding steps; detaching
        # also keeps the in-place buffer write out of the autograd graph.
        self.kv_cache[:bsz, start_pos:end_pos] = kv.detach()
        self.pe_cache[:bsz, start_pos:end_pos] = k_pe.detach()
        if self.training:
            # Use the in-graph tensors for the current window so gradients can
            # reach wkv_a and kv_norm (the detached cache would block them).
            kv_keys = torch.cat([self.kv_cache[:bsz, :start_pos], kv], dim=1)
            pe_keys = torch.cat([self.pe_cache[:bsz, :start_pos], k_pe], dim=1)
        else:
            kv_keys = self.kv_cache[:bsz, :end_pos]
            pe_keys = self.pe_cache[:bsz, :end_pos]

        scores = (torch.einsum("bshc,btc->bsht", q_nope, kv_keys) +
                  torch.einsum("bshr,btr->bsht", q_pe, pe_keys)) * self.softmax_scale
        if mask is not None:
            scores += mask.unsqueeze(1)
        scores = scores.softmax(dim=-1, dtype=torch.float32).type_as(x)
        x = torch.einsum("bsht,btc->bshc", scores, kv_keys)
        x = torch.einsum("bshc,hdc->bshd", x, wkv_b[:, -self.v_head_dim:])
        x = self.wo(x.flatten(2))
        return x


#####################################
# MOE FEEDFORWARD
#####################################

class Gate(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.dim = args.dim
        self.n_routed_experts = args.n_routed_experts
        self.n_activated_experts = args.n_activated_experts
        self.route_scale = args.route_scale
        # Gate weight
        self.weight = nn.Parameter(torch.empty(args.n_routed_experts, args.dim, dtype=Linear.dtype))
        nn.init.normal_(self.weight, mean=0.0, std=0.02 / math.sqrt(args.dim))
        # Optional routing bias for fine-tuning expert selection
        if args.use_routing_bias:
            self.bias = nn.Parameter(torch.zeros(args.n_routed_experts, dtype=torch.float32))
        else:
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # Compute routing scores with a sigmoid scoring function
        scores = linear(x, self.weight).sigmoid()
        original_scores = scores
        # The bias only influences which experts are selected, not their weights
        if self.bias is not None:
            scores = scores + self.bias
        # Select top-k experts on the (possibly biased) scores
        indices = torch.topk(scores, self.n_activated_experts, dim=-1)[1]
        # Gather the unbiased scores as combination weights
        weights = original_scores.gather(1, indices)
        # Normalize weights (sigmoid scores always need normalization)
        weights = weights / weights.sum(dim=-1, keepdim=True)
        # Apply route scaling
        weights = weights * self.route_scale
        return weights.type_as(x), indices
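
# A small illustration of the gate semantics above (our own example, not part
# of the model): the routing bias shifts which experts win the top-k selection,
# while the combination weights are still gathered from the unbiased sigmoid
# scores and normalized to sum to route_scale.
def _gate_routing_example():
    torch.manual_seed(0)
    gate = Gate(ModelArgs())
    x = torch.randn(4, 1024)  # 4 tokens, model dim 1024
    with torch.no_grad():
        gate.bias[0] = 10.0  # strongly favor expert 0 during selection
        weights, indices = gate(x)
    assert (indices == 0).any(dim=-1).all()  # expert 0 is selected for every token
    assert torch.allclose(weights.sum(-1), torch.ones(4))  # normalized (route_scale=1)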
class Expert(nn.Module):
    def __init__(self, dim: int, inter_dim: int):
        super().__init__()
        self.w1 = Linear(dim, inter_dim, bias=False)
        self.w2 = Linear(inter_dim, dim, bias=False)
        self.w3 = Linear(dim, inter_dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # SwiGLU activation: w2(silu(w1(x)) * w3(x))
        return self.w2(F.silu(self.w1(x)) * self.w3(x))


class MoE(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.dim = args.dim
        self.n_routed_experts = args.n_routed_experts
        self.n_activated_experts = args.n_activated_experts
        self.active_expert_idx = None  # None = all experts active (inference mode)
        self.gate = Gate(args)
        self.experts = nn.ModuleList([
            Expert(args.dim, args.moe_inter_dim) for _ in range(args.n_routed_experts)
        ])
        self.shared_experts = MLP(args.dim, args.n_shared_experts * args.moe_inter_dim)
        # Load balance loss coefficient (defined here; not applied inside this module)
        self.lb_loss_coef = 0.01

    def set_active_expert(self, expert_idx: Optional[int]):
        """Freeze all but the active expert to save optimizer memory."""
        self.active_expert_idx = expert_idx
        for i, expert in enumerate(self.experts):
            requires_grad = (expert_idx is None) or (i == expert_idx)
            for param in expert.parameters():
                param.requires_grad = requires_grad

    def compute_load_balance_loss(self, router_probs, expert_indices):
        """Encourage uniform expert utilization."""
        # router_probs: [num_tokens, n_experts]
        # expert_indices: [num_tokens, top_k]
        # Fraction of top-k assignments routed to each expert
        tokens_per_expert = torch.zeros(self.n_routed_experts, device=router_probs.device)
        indices_flat = expert_indices.view(-1)
        ones = torch.ones_like(indices_flat, dtype=torch.float32)
        tokens_per_expert.scatter_add_(0, indices_flat, ones)
        tokens_per_expert = tokens_per_expert / (indices_flat.numel() + 1e-8)
        # Average routing probability per expert
        router_prob_per_expert = router_probs.mean(dim=0)
        # Load-balancing loss (mean * N equals the sum over experts of f_i * P_i)
        loss = torch.mean(tokens_per_expert * router_prob_per_expert) * self.n_routed_experts
        return loss

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        original_shape = x.size()
        x = x.view(-1, self.dim)

        # Routing is inlined (rather than calling self.gate) so the raw sigmoid
        # probabilities stay available for the load-balance loss. The semantics
        # match Gate.forward: the bias only affects selection, not the weights.
        router_probs = linear(x, self.gate.weight).sigmoid()
        selection_scores = router_probs + self.gate.bias if self.gate.bias is not None else router_probs
        indices = torch.topk(selection_scores, self.n_activated_experts, dim=-1)[1]
        weights = router_probs.gather(1, indices)
        # Normalize weights (epsilon for stability), then apply route scaling
        weights = weights / (weights.sum(dim=-1, keepdim=True) + 1e-8)
        weights = weights * self.gate.route_scale

        if self.training and self.active_expert_idx is not None:
            # Sequential training mode: only the active expert runs and learns
            y = torch.zeros_like(x)
            i = self.active_expert_idx
            # Find tokens where expert i is in the top-k
            mask = (indices == i)
            idx = torch.where(mask.any(dim=1))[0]
            if idx.numel() > 0:
                top_positions = torch.argmax(mask[idx].int(), dim=1)
                expert_weights = weights[idx, top_positions].unsqueeze(-1)
                expert_out = self.experts[i](x[idx])
                y[idx] = expert_out * expert_weights
            # Load balance loss
            lb_loss = self.compute_load_balance_loss(router_probs, indices)
            # Shared experts always run
            z = self.shared_experts(x)
            return (y + z).view(original_shape), lb_loss
        else:
            # Inference mode or all-experts training mode
            y = torch.zeros_like(x)
            for i in range(self.n_routed_experts):
                mask = (indices == i)
                idx = torch.where(mask.any(dim=1))[0]
                if idx.numel() == 0:
                    continue
                top_positions = torch.argmax(mask[idx].int(), dim=1)
                expert_weights = weights[idx, top_positions].unsqueeze(-1)
                expert_out = self.experts[i](x[idx])
                y[idx] += expert_out * expert_weights
            z = self.shared_experts(x)
            output = (y + z).view(original_shape)
            # Only compute the load balance loss during training
            if self.training:
                lb_loss = self.compute_load_balance_loss(router_probs, indices)
                return output, lb_loss
            else:
                return output, None
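
# A hedged sketch of how `set_active_expert` might be driven from a training
# loop (the loop variables and the `train_steps` helper are illustrative, not
# part of this file): experts are unfrozen one at a time, so the optimizer only
# holds state for a single routed expert per phase.
#
#     model = ismail(ModelArgs())
#     for expert_idx in range(ModelArgs().n_routed_experts):
#         model.set_active_expert(expert_idx)   # freeze all other routed experts
#         train_steps(model, steps_per_expert)  # hypothetical training helper
#     model.set_active_expert(None)             # re-enable every expert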
#####################################
# DENSE FEEDFORWARD (MLP)
#####################################

class MLP(nn.Module):
    def __init__(self, dim: int, inter_dim: int):
        super().__init__()
        self.fc1 = Linear(dim, inter_dim, bias=False)
        self.fc2 = Linear(dim, inter_dim, bias=False)
        self.fc3 = Linear(inter_dim, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # SwiGLU-style activation: fc3(silu(fc1(x)) * fc2(x))
        return self.fc3(F.silu(self.fc1(x)) * self.fc2(x))


#####################################
# TRANSFORMER BLOCKS
#####################################

class Block(nn.Module):
    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.attn = MultiHeadLatentAttention(args)
        # Dense MLP for the first n_dense_layers, MoE for the remaining layers
        self.ffn = MLP(args.dim, args.inter_dim) if layer_id < args.n_dense_layers else MoE(args)
        self.attn_norm = RMSNorm(args.dim)
        self.ffn_norm = RMSNorm(args.dim)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor,
                mask: Optional[torch.Tensor]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        x = x + self.attn(self.attn_norm(x), start_pos, freqs_cis, mask)
        # MLP returns a single tensor; MoE returns (output, load-balance loss)
        ffn_result = self.ffn(self.ffn_norm(x))
        if isinstance(ffn_result, tuple):
            ffn_out, lb_loss = ffn_result
        else:
            ffn_out = ffn_result
            lb_loss = None
        x = x + ffn_out
        return x, lb_loss
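
# Worked example of the attention mask built in ismail.forward below (our own
# illustration): for seqlen=3 new tokens appended after start_pos=2 cached
# positions, the (3, 5) additive mask lets every query attend to all cached
# keys and enforces causality only among the new tokens:
#
#     [[0, 0, 0, -inf, -inf],
#      [0, 0, 0,    0, -inf],
#      [0, 0, 0,    0,    0]]
#
# Columns 0-1 are the zero block prepended by torch.hstack for the cache;
# columns 2-4 are the torch.triu causal mask over the new tokens.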
#####################################
# TRANSFORMER MODEL
#####################################

class ismail(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args
        self.vocab_size = args.vocab_size
        self.n_layers = args.n_layers
        # Create embedding with the shared parameter dtype
        self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim, dtype=Linear.dtype)
        nn.init.normal_(self.tok_embeddings.weight, mean=0.0, std=0.02)
        self.layers = nn.ModuleList([Block(i, args) for i in range(args.n_layers)])
        self.norm = RMSNorm(args.dim)
        self.output = Linear(args.dim, args.vocab_size, bias=False, dtype=Linear.dtype)
        self.use_checkpointing = False
        self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)

    def set_active_expert(self, expert_idx: Optional[int]):
        """Set the active expert for all MoE layers (for sequential training)."""
        for layer in self.layers:
            if isinstance(layer.ffn, MoE):
                layer.ffn.set_active_expert(expert_idx)

    def forward(self, tokens: torch.Tensor, start_pos: int = 0):
        bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens).to(Linear.dtype)
        freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen]

        # CRITICAL: always clear the caches at start_pos=0, regardless of training mode
        if start_pos == 0:
            for layer in self.layers:
                if hasattr(layer.attn, 'kv_cache'):
                    layer.attn.kv_cache.zero_()
                if hasattr(layer.attn, 'pe_cache'):
                    layer.attn.pe_cache.zero_()

        mask = None
        if seqlen > 1:
            mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device, dtype=h.dtype)
            mask = torch.triu(mask, diagonal=1)
            # Cached positions are fully visible to every new token
            mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device, dtype=h.dtype), mask])

        total_lb_loss = 0.0
        for layer in self.layers:
            h, lb_loss = layer(h, start_pos, freqs_cis, mask)
            if lb_loss is not None:
                total_lb_loss += lb_loss

        h = self.norm(h)
        output = self.output(h)
        # Return the accumulated load-balance loss only during training, so the
        # return type is consistent within each mode
        if self.training:
            return output, total_lb_loss
        return output
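
# A minimal smoke test, as an assumption about intended usage (the tiny config
# below is ours, not the training configuration; it assumes the `kernel` module
# imported at the top of this file is available). It checks both return modes:
# (logits, lb_loss) in training, bare logits in eval.
if __name__ == "__main__":
    args = ModelArgs(max_batch_size=2, max_seq_len=64, vocab_size=512, dim=64,
                     inter_dim=128, moe_inter_dim=64, n_layers=2, n_dense_layers=1,
                     n_heads=4, kv_lora_rank=32, qk_nope_head_dim=16,
                     qk_rope_head_dim=8, v_head_dim=16)
    model = ismail(args)
    tokens = torch.randint(0, args.vocab_size, (2, 16))

    model.train()
    logits, lb_loss = model(tokens)
    print("train logits:", logits.shape, "lb loss:", float(lb_loss))

    model.eval()
    with torch.no_grad():
        logits = model(tokens)
    print("eval logits:", logits.shape)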