"""
Codsworth HuggingFace Model Wrapper
Allows loading Codsworth using HuggingFace's AutoModel
"""

import json
from typing import Optional, Tuple

import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast

from .config import CodsworthConfig


class CodsworthConfig(PretrainedConfig):
    """HuggingFace-compatible configuration for Codsworth.

    The hyper-parameters are stored as plain attributes under HuggingFace
    naming; the wrapper model in this module translates them to the native
    model's field names.

    Args:
        vocab_size: Vocabulary size.
        hidden_size: Passed to the native model as ``embedding_dim``.
        num_hidden_layers: Passed to the native model as ``num_layers``.
        num_attention_heads: Passed to the native model as ``num_heads``.
        head_dim: Per-head dimension.
        intermediate_size: Passed to the native model as ``ffn_hidden_dim``.
        max_position_embeddings: Passed to the native model as
            ``context_length``.
        rope_theta: RoPE base value handed to the native model.
        use_rope: Whether the native model should use RoPE.
        hidden_dropout: Passed to the native model as ``dropout``.
        attention_dropout: Attention dropout handed to the native model.
        pad_token_id: Padding token id (handled by ``PretrainedConfig``).
        bos_token_id: Beginning-of-sequence token id.
        eos_token_id: End-of-sequence token id.
        torch_dtype: Dtype name forwarded to ``PretrainedConfig``.
        **kwargs: Any further options understood by ``PretrainedConfig``.
    """

    model_type = "codsworth"

    def __init__(
        self,
        vocab_size: int = 5004,
        hidden_size: int = 256,
        num_hidden_layers: int = 2,
        num_attention_heads: int = 4,
        head_dim: int = 64,
        intermediate_size: int = 512,
        max_position_embeddings: int = 128,
        rope_theta: float = 10000.0,
        use_rope: bool = True,
        hidden_dropout: float = 0.1,
        attention_dropout: float = 0.0,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        torch_dtype: str = "float32",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.use_rope = use_rope
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        # ``torch_dtype`` is a named parameter, so it never reaches ``kwargs``;
        # it has to be forwarded explicitly or the value (including one read
        # back from a saved config.json) would be silently discarded.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            torch_dtype=torch_dtype,
            **kwargs,
        )


class CodsworthPreTrainedModel(PreTrainedModel):
    """Base class for Codsworth models.

    Binds the Codsworth config class to the HuggingFace loading machinery and
    supplies the weight-initialisation hook.
    """

    config_class = CodsworthConfig
    base_model_prefix = "codsworth"

    def _init_weights(self, module):
        """Initialise the weights of a single sub-module in place.

        ``nn.Linear`` and ``nn.Embedding`` weights are drawn from a normal
        distribution with mean 0 and std 0.02; linear biases are zeroed.
        Other module types are left untouched.

        Args:
            module: The sub-module to initialise.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            # Re-sampling overwrites the padding row as well; restore it to
            # zeros so an embedding built with ``padding_idx`` keeps mapping
            # the padding token to the zero vector.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class CodsworthModel(CodsworthPreTrainedModel):
    """HuggingFace wrapper around the native ``CodsworthTransformer``.

    The HuggingFace config is translated into the native model's config, the
    native transformer is built from it, and ``forward`` repackages the
    transformer's outputs as a ``CausalLMOutputWithPast``.
    """

    def __init__(self, config: CodsworthConfig):
        """Build the native transformer from a HuggingFace config.

        Args:
            config: HuggingFace-style Codsworth configuration.
        """
        super().__init__(config)

        self.config = config

        # Imported locally under a distinct name: this module defines its own
        # ``CodsworthConfig`` (the HuggingFace config), which shadows the
        # native config class imported at the top of the file. Using the
        # shadowing class here would build the wrong kind of config.
        from .config import CodsworthConfig as NativeCodsworthConfig
        from codsworth.model import CodsworthTransformer

        # Translate HuggingFace field names to the native model's names.
        internal_config = NativeCodsworthConfig(
            vocab_size=config.vocab_size,
            context_length=config.max_position_embeddings,
            embedding_dim=config.hidden_size,
            num_layers=config.num_hidden_layers,
            num_heads=config.num_attention_heads,
            head_dim=config.head_dim,
            ffn_hidden_dim=config.intermediate_size,
            use_rope=config.use_rope,
            rope_theta=config.rope_theta,
            dropout=config.hidden_dropout,
            attention_dropout=config.attention_dropout,
            use_flash_attention=False,
            use_gradient_checkpointing=False,
        )

        self.transformer = CodsworthTransformer(internal_config)

        # Expose the transformer's output head on the wrapper. This is the
        # same module object, not a copy, so both names share parameters.
        self.lm_head = self.transformer.lm_head

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> CausalLMOutputWithPast:
        """Run the native transformer and wrap its outputs.

        Args:
            input_ids: Token ids passed through to the transformer.
            attention_mask: Optional mask passed through to the transformer.
            position_ids: Optional position ids passed through to the
                transformer.
            labels: Optional target token ids. When given, and the transformer
                did not already return a loss, a next-token cross-entropy loss
                is computed here (labels are shifted internally; positions
                equal to ``-100`` are ignored).

        Returns:
            A ``CausalLMOutputWithPast`` carrying ``loss`` (or ``None``),
            ``logits`` and, if the transformer provided them,
            ``hidden_states``. ``past_key_values`` and ``attentions`` are
            always ``None``.
        """
        # The transformer is expected to return a mapping with a "logits"
        # entry and optional "loss" / "hidden_states" entries.
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        logits = outputs["logits"]
        loss = outputs.get("loss")

        # ``labels`` are not forwarded to the transformer, so without this
        # step they would be silently ignored and the loss would stay None.
        if loss is None and labels is not None:
            # Position t predicts token t + 1: drop the last logit and the
            # first label so the two line up.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
            loss = nn.functional.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)).float(),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=outputs.get("hidden_states"),
            attentions=None,
        )

    def generate(self, inputs, **kwargs):
        """Delegate generation to the native transformer's ``generate``.

        Args:
            inputs: Forwarded unchanged as the first positional argument.
            **kwargs: Forwarded unchanged.

        Returns:
            Whatever the native transformer's ``generate`` returns.
        """
        return self.transformer.generate(inputs, **kwargs)


# Auto mapping: references to transformers' Auto classes, populated lazily on
# the first call to ``from_pretrained``.
AutoConfig = None
AutoModel = None


def from_pretrained(model_path: str, **kwargs):
    """Load Codsworth from a pretrained checkpoint via ``AutoModel``.

    On the first call the Auto classes are imported and the Codsworth config
    and model classes are registered with them, so that the ``"codsworth"``
    model type can be resolved.

    Args:
        model_path: Forwarded to ``AutoModel.from_pretrained``.
        **kwargs: Forwarded to ``AutoModel.from_pretrained``.

    Returns:
        The model returned by ``AutoModel.from_pretrained``.
    """
    global AutoConfig, AutoModel

    if AutoConfig is None:
        from transformers import AutoConfig as _AutoConfig, AutoModel as _AutoModel

        # Without registration the Auto classes do not know the "codsworth"
        # model type and cannot map it to the classes in this module.
        registrations = (
            (_AutoConfig.register, (CodsworthConfig.model_type, CodsworthConfig)),
            (_AutoModel.register, (CodsworthConfig, CodsworthModel)),
        )
        for register, args in registrations:
            try:
                register(*args)
            except ValueError:
                # Raised when the entry is already registered (e.g. by other
                # code in this process); the existing registration is kept.
                pass

        AutoConfig = _AutoConfig
        AutoModel = _AutoModel

    return AutoModel.from_pretrained(model_path, **kwargs)