import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import PretrainedConfig
from transformers.modeling_outputs import CausalLMOutput
from transformers.modeling_utils import PreTrainedModel

# In a packaged HF repo the config usually lives in its own file:
#   from .configuration_tinygpt import TinyGPTConfig
# It is defined inline below instead, so the relative import is omitted
# (keeping both would have the inline class shadow the import anyway).

# -------------------------
# TinyGPTConfig (Required)
# -------------------------
class TinyGPTConfig(PretrainedConfig):
    model_type = "tinygpt"

    def __init__(self, vocab_size=30522, d_model=256, n_heads=4, n_layers=4,
                 d_ff=1024, max_seq_len=256, **kwargs):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.d_ff = d_ff
        self.max_seq_len = max_seq_len
        # PretrainedConfig stores the additional HF keys passed via kwargs
        super().__init__(**kwargs)


# -------------------------
# Your Original TinyGPT Core
# -------------------------
class TinyGPT(nn.Module):
    def __init__(self, vocab_size=30522, d_model=256, n_heads=4, n_layers=4,
                 d_ff=1024, max_seq_len=256):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)
        self.blocks = nn.ModuleList(
            TransformerBlock(d_model, n_heads, d_ff) for _ in range(n_layers)
        )
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, input_ids):
        # Token embeddings plus learned position embeddings
        seq_len = input_ids.size(1)
        positions = torch.arange(seq_len, device=input_ids.device)
        x = self.tok_emb(input_ids) + self.pos_emb(positions)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        return self.head(x)


class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ln1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_model),
        )
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Causal mask: position i may only attend to positions <= i
        seq_len = x.size(1)
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        attn_out, _ = self.attn(x, x, x, attn_mask=causal_mask, need_weights=False)
        x = self.ln1(x + attn_out)
        ff_out = self.ff(x)
        x = self.ln2(x + ff_out)
        return x


# -------------------------
# HF Wrapper: TinyGPTForCausalLM
# -------------------------
class TinyGPTForCausalLM(PreTrainedModel):
    config_class = TinyGPTConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = TinyGPT(
            vocab_size=config.vocab_size,
            d_model=config.d_model,
            n_heads=config.n_heads,
            n_layers=config.n_layers,
            d_ff=config.d_ff,
            max_seq_len=config.max_seq_len,
        )
        self.post_init()

    def forward(self, input_ids, labels=None, **kwargs):
        # **kwargs absorbs extra HF arguments (e.g. attention_mask),
        # which this minimal model does not use
        logits = self.model(input_ids)
        loss = None
        if labels is not None:
            # Shift so that tokens < n predict token n (standard causal LM loss)
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = CrossEntropyLoss()(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )
        return CausalLMOutput(loss=loss, logits=logits)
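

# -------------------------
# Usage sketch (not part of the original file)
# -------------------------
# A minimal smoke test, assuming a randomly initialized model; the
# hyperparameters below are illustrative, and tokenization/training
# are out of scope.
if __name__ == "__main__":
    config = TinyGPTConfig(vocab_size=100, d_model=64, n_heads=4,
                           n_layers=2, d_ff=128, max_seq_len=32)
    model = TinyGPTForCausalLM(config)
    input_ids = torch.randint(0, config.vocab_size, (2, 16))  # (batch, seq_len)
    out = model(input_ids, labels=input_ids)
    print(out.logits.shape)  # torch.Size([2, 16, 100])
    print(out.loss)          # scalar cross-entropy loss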