import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from dataclasses import dataclass
from contextlib import nullcontext
from typing import Literal 


class CausalSelfAttention(nn.Module):
    # A causal self-attention layer that supports both flash attention and standard attention.
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0 # Ensures the embedding dimension can be evenly split across attention heads.
        
        # This linear layer projects input x into query (q), key (k), and value (v) vectors — 
        # all at once (so the output is 3× the size).
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        
        # After attention is done, this layer projects the output back to the original embedding size.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        
        # Dropout applied to the attention weights (probabilities).
        self.attn_dropout = nn.Dropout(config.dropout)
        
        # Dropout applied after the final projection.
        self.resid_dropout = nn.Dropout(config.dropout)
        
        # Store values for easy access later.
        self.n_head = config.n_head 
        self.n_embd = config.n_embd
        
        # Checks whether the efficient Flash Attention API is available in torch.nn.functional.
        self.flash = hasattr(F, "scaled_dot_product_attention")
        
        # If Flash Attention is not available, we create a lower triangular mask to ensure causality.
        # This mask prevents the model from attending to future tokens in the sequence.
        if not self.flash:
            # register_buffer ensures this tensor is saved with the model but not updated by gradients.
            self.register_buffer(
                "bias",
                torch.tril(torch.ones(config.block_size, config.block_size)).view(
                    1, 1, config.block_size, config.block_size
                ),
            )
    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        if self.flash:
            y = F.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=None,
                dropout_p=self.attn_dropout.p if self.training else 0.0,
                is_causal=True,
            )
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

# --- User's Original LayerNorm ---
class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        """
        Initializes the LayerNorm module.
        Args:
            ndim (int): is the number of features in the last dimension (e.g., embedding size).
            bias (bool): Whether to include a bias term in the normalization.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
# --- End User's Original LayerNorm ---

# --- User's Original MLP ---
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
# --- End User's Original MLP ---

# --- User's Original Block ---
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config.n_embd, config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln2 = LayerNorm(config.n_embd, config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
# --- End User's Original Block ---


# --- User's Original GPTConfig ---
@dataclass
class GPTConfig:
    block_size: int
    vocab_size: int
    n_layer: int
    n_head: int
    n_embd: int
    dropout: float = 0.0
    bias: bool = True
# --- End User's Original GPTConfig ---

# --- User's Original TrainingConfig ---
@dataclass
class TrainingConfig:
    learning_rate: float = 1e-4  # more stable training, earlier 1e-4
    max_iters: int = 20000  # increase from 25000
    warmup_steps: int = 1000  # smoother initial train, earlier 100
    min_lr: float = 5e-4  # lower rate, earlier 5e-4
    eval_iters: int = 500  # increased from 100
    batch_size: int = 32  # changed from 16, better gradient estimate
    block_size: int = 128  # changed from 64, capture longer range dependencies
    gradient_accumulation_steps: int = 32  # reduced from 50
    device: Literal["cuda", "cpu"] = "cuda" if torch.cuda.is_available() else "cpu"
    device_type: Literal["cuda", "cpu"] = (
        "cuda" if "cuda" in device else "cpu"
    )  # for later use in torch.autocast
    dtype: Literal["bfloat16", "float16"] = (
        "bfloat16"
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else "float16"
    )
    ptdtype: torch.dtype = {
        "float32": torch.float32,
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
    }[dtype]
    ctx: nullcontext[None] | torch.autocast = (
        nullcontext()
        if device_type == "cpu"
        else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
    )
# --- End User's Original TrainingConfig ---


class GPT(nn.Module):
    """
    The main GPT model, now with an optional QA head for Question Answering tasks.
    The QA head will predict start and end token indices of the answer span.
    """
    def __init__(self, config, is_qa_model=False):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config
        self.is_qa_model = is_qa_model

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        
        # Language modeling head (for pre-training)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        
        # QA head (for fine-tuning)
        # This will predict start and end logits for the answer span
        if self.is_qa_model:
            self.qa_head = nn.Linear(config.n_embd, 2, bias=False) # 2 outputs: start_logit, end_logit
        else:
            self.qa_head = None # No QA head if not a QA model

        # tie weights
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/((2 * config.n_layer)**0.5))

        # report number of parameters
        # n_params calculation will differ slightly if QA head is present
        n_params = sum(p.numel() for p in self.parameters())
        # For non-embedding count it excludes token embeddings and positional embeddings.
        non_embedding_params = n_params - self.transformer.wpe.weight.numel()
        print(f"Number of parameters: {non_embedding_params/1e6:.2f}M (excluding positional embeddings)")


    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids, targets=None, attention_mask=None, token_type_ids=None):
        device = input_ids.device
        b, t = input_ids.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(input_ids) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if self.is_qa_model and self.qa_head is not None:
            # For QA, we typically use the pooled output or sequence output directly
            # For extractive QA, we need logits for each token for start/end prediction
            # The output 'x' is (batch_size, sequence_length, n_embd)
            logits = self.qa_head(x) # (batch_size, sequence_length, 2)
            start_logits, end_logits = logits.split(1, dim=-1)
            start_logits = start_logits.squeeze(-1).contiguous() # (batch_size, sequence_length)
            end_logits = end_logits.squeeze(-1).contiguous()     # (batch_size, sequence_length)

            if targets is not None:
                # targets for QA are start_positions and end_positions
                start_positions, end_positions = targets[:, 0], targets[:, 1]
                
                # Apply attention mask to logits for valid tokens
                if attention_mask is not None:
                    # Tokens that are part of the context (token_type_ids == 1) should be considered for answers
                    # and also non-padding tokens (attention_mask == 1)
                    valid_tokens_mask = (attention_mask == 1) & (token_type_ids == 1)
                    
                    start_logits = start_logits.masked_fill(~valid_tokens_mask, float('-inf'))
                    end_logits = end_logits.masked_fill(~valid_tokens_mask, float('-inf'))

                loss_fct = nn.CrossEntropyLoss(ignore_index=-100) # Use -100 as ignore_index for consistency
                start_loss = loss_fct(start_logits, start_positions)
                end_loss = loss_fct(end_logits, end_positions)
                total_loss = (start_loss + end_loss) / 2
                return start_logits, end_logits, total_loss
            
            return start_logits, end_logits, None # For inference
        else: # Standard language model for pre-training or text generation
            if targets is not None:
                # if we are given some targets (e.g. for training), calculate the loss
                logits = self.lm_head(x)
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100) # Use -100
            else:
                # inference-time mini-optimization: only forward the lm_head on the very last position
                logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
                loss = None

            return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Generate tokens given a conditioning sequence.
        idx: Tensor of shape (B, T)
        """
        if self.is_qa_model:
            print("Warning: generate method is not intended for QA models directly.")
            print("Please use the QA forward pass for inference and post-processing.")
            return idx # Or raise an error
            
        for _ in range(max_new_tokens):
            idx_cond = (
                idx
                if idx.size(1) <= self.config.block_size
                else idx[:, -self.config.block_size :]
            )
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float("Inf")
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# The 'config' object for pre-training is also kept here, if it's used by other scripts for its definition
config = GPTConfig(
    vocab_size=50257,  # use the tokenizer's vocab size
    block_size=1024,   # or whatever context size you're training with
    n_layer=8,
    n_head=8,
    n_embd=512,
    dropout=0.1,
    bias=True,
)