""" Codsworth HuggingFace Model Wrapper Allows loading Codsworth using HuggingFace's AutoModel """ import json from typing import Optional, Tuple import torch import torch.nn as nn from transformers import PreTrainedModel, PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast from .config import CodsworthConfig class CodsworthConfig(PretrainedConfig): """HuggingFace compatible config for Codsworth""" model_type = "codsworth" def __init__( self, vocab_size: int = 5004, hidden_size: int = 256, num_hidden_layers: int = 2, num_attention_heads: int = 4, head_dim: int = 64, intermediate_size: int = 512, max_position_embeddings: int = 128, rope_theta: float = 10000.0, use_rope: bool = True, hidden_dropout: float = 0.1, attention_dropout: float = 0.0, pad_token_id: int = 0, bos_token_id: int = 1, eos_token_id: int = 2, torch_dtype: str = "float32", **kwargs, ): self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.head_dim = head_dim self.intermediate_size = intermediate_size self.max_position_embeddings = max_position_embeddings self.rope_theta = rope_theta self.use_rope = use_rope self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs, ) class CodsworthPreTrainedModel(PreTrainedModel): """Base class for Codsworth""" config_class = CodsworthConfig base_model_prefix = "codsworth" def _init_weights(self, module): if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=0.02) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=0.02) class CodsworthModel(CodsworthPreTrainedModel): """Codsworth model for HuggingFace""" def __init__(self, config: CodsworthConfig): super().__init__(config) self.config = config # Create internal config for the original model internal_config = CodsworthConfig( vocab_size=config.vocab_size, context_length=config.max_position_embeddings, embedding_dim=config.hidden_size, num_layers=config.num_hidden_layers, num_heads=config.num_attention_heads, head_dim=config.head_dim, ffn_hidden_dim=config.intermediate_size, use_rope=config.use_rope, rope_theta=config.rope_theta, dropout=config.hidden_dropout, attention_dropout=config.attention_dropout, use_flash_attention=False, use_gradient_checkpointing=False, ) # Import the actual model from codsworth.model import CodsworthTransformer self.transformer = CodsworthTransformer(internal_config) # Tie weights self.lm_head = self.transformer.lm_head self.post_init() def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, labels: Optional[torch.LongTensor] = None, ) -> CausalLMOutputWithPast: outputs = self.transformer( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, ) logits = outputs["logits"] loss = outputs.get("loss") return CausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=None, hidden_states=outputs.get("hidden_states"), attentions=None, ) def generate(self, inputs, **kwargs): """Wrapper for generation""" return self.transformer.generate(inputs, **kwargs) # Auto mapping AutoConfig = None AutoModel = None def from_pretrained(model_path: str, **kwargs): """Load Codsworth from pretrained""" global AutoConfig, AutoModel if AutoConfig is None: from transformers import AutoConfig as _AutoConfig, AutoModel as _AutoModel AutoConfig = _AutoConfig AutoModel = _AutoModel return AutoModel.from_pretrained(model_path, **kwargs)