| """ |
| Codsworth HuggingFace Model Wrapper |
| Allows loading Codsworth using HuggingFace's AutoModel |
| """ |
|
|
| import json |
| from typing import Optional, Tuple |
|
|
| import torch |
| import torch.nn as nn |
| from transformers import PreTrainedModel, PretrainedConfig |
| from transformers.modeling_outputs import CausalLMOutputWithPast |
|
|
| from .config import CodsworthConfig |
|
|
|
|
class CodsworthConfig(PretrainedConfig):
    """HuggingFace-compatible configuration for Codsworth.

    The model hyper-parameters are stored as plain attributes under the same
    names they are passed in with. The special-token ids, ``torch_dtype`` and
    any additional keyword arguments are handed to ``PretrainedConfig``.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        hidden_size: Stored as ``self.hidden_size``.
        num_hidden_layers: Stored as ``self.num_hidden_layers``.
        num_attention_heads: Stored as ``self.num_attention_heads``.
        head_dim: Stored as ``self.head_dim``.
        intermediate_size: Stored as ``self.intermediate_size``.
        max_position_embeddings: Stored as ``self.max_position_embeddings``.
        rope_theta: Stored as ``self.rope_theta``.
        use_rope: Stored as ``self.use_rope``.
        hidden_dropout: Stored as ``self.hidden_dropout``.
        attention_dropout: Stored as ``self.attention_dropout``.
        pad_token_id: Forwarded to ``PretrainedConfig``.
        bos_token_id: Forwarded to ``PretrainedConfig``.
        eos_token_id: Forwarded to ``PretrainedConfig``.
        torch_dtype: Forwarded to ``PretrainedConfig``.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "codsworth"

    def __init__(
        self,
        vocab_size: int = 5004,
        hidden_size: int = 256,
        num_hidden_layers: int = 2,
        num_attention_heads: int = 4,
        head_dim: int = 64,
        intermediate_size: int = 512,
        max_position_embeddings: int = 128,
        rope_theta: float = 10000.0,
        use_rope: bool = True,
        hidden_dropout: float = 0.1,
        attention_dropout: float = 0.0,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        torch_dtype: str = "float32",
        **kwargs,
    ):
        # Model-specific attributes are set before delegating to the base
        # class, mirroring the original statement order.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.use_rope = use_rope
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        # ``torch_dtype`` is a named parameter, so it never reaches
        # ``**kwargs``; it has to be forwarded explicitly or the value
        # (e.g. one read from config.json) is silently discarded.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            torch_dtype=torch_dtype,
            **kwargs,
        )
|
|
|
|
class CodsworthPreTrainedModel(PreTrainedModel):
    """Base class for Codsworth models.

    Binds ``CodsworthConfig`` as the config class and supplies the weight
    initialisation applied to each sub-module.
    """

    config_class = CodsworthConfig
    base_model_prefix = "codsworth"

    def _init_weights(self, module):
        """Initialise the weights of a single sub-module in place.

        ``nn.Linear`` and ``nn.Embedding`` weights are drawn from a normal
        distribution with mean 0.0 and std 0.02. Linear biases, when present,
        are zeroed. Any other module type is left untouched.

        Args:
            module: The sub-module to initialise.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            # The random draw above also overwrites the padding row; restore
            # it to zero so the padding embedding stays a null vector.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
|
|
|
|
class CodsworthModel(CodsworthPreTrainedModel):
    """HuggingFace wrapper around the native ``CodsworthTransformer``.

    The HuggingFace config is translated into the native Codsworth config,
    the native transformer is built from it, and its outputs are repackaged
    as ``CausalLMOutputWithPast``.
    """

    def __init__(self, config: CodsworthConfig):
        """Build the wrapped transformer from a HuggingFace config.

        Args:
            config: HuggingFace-style Codsworth configuration.
        """
        # ``PreTrainedModel.__init__`` already stores ``config`` on ``self``.
        super().__init__(config)

        # The HuggingFace config class defined in this module shadows the
        # native config imported at the top of the file. Import the native
        # one under an explicit alias so the native field names below
        # (context_length, embedding_dim, ...) reach the right class instead
        # of vanishing into the HuggingFace config's ``**kwargs``.
        from .config import CodsworthConfig as NativeCodsworthConfig
        from codsworth.model import CodsworthTransformer

        native_config = NativeCodsworthConfig(
            vocab_size=config.vocab_size,
            context_length=config.max_position_embeddings,
            embedding_dim=config.hidden_size,
            num_layers=config.num_hidden_layers,
            num_heads=config.num_attention_heads,
            head_dim=config.head_dim,
            ffn_hidden_dim=config.intermediate_size,
            use_rope=config.use_rope,
            rope_theta=config.rope_theta,
            dropout=config.hidden_dropout,
            attention_dropout=config.attention_dropout,
            use_flash_attention=False,
            use_gradient_checkpointing=False,
        )

        self.transformer = CodsworthTransformer(native_config)

        # Expose the transformer's output head under the attribute name
        # ``lm_head`` on the wrapper as well (same module object).
        self.lm_head = self.transformer.lm_head

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> CausalLMOutputWithPast:
        """Run the wrapped transformer and package its outputs.

        Args:
            input_ids: Passed through to the transformer.
            attention_mask: Passed through to the transformer.
            position_ids: Passed through to the transformer.
            labels: Optional targets. When given and the transformer did not
                return a loss itself, a next-token cross-entropy loss is
                computed here; positions labelled ``-100`` are ignored.

        Returns:
            ``CausalLMOutputWithPast`` carrying ``loss`` (possibly ``None``),
            ``logits`` and the transformer's ``hidden_states`` entry if any.
            ``past_key_values`` and ``attentions`` are always ``None``.
        """
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        logits = outputs["logits"]
        loss = outputs.get("loss")

        if loss is None and labels is not None:
            # NOTE(review): assumes logits are (..., seq, vocab) and labels
            # are (..., seq), as for a causal LM head -- confirm against
            # CodsworthTransformer.
            # Position t predicts token t + 1, so drop the last logit and
            # the first label before comparing.
            shifted_logits = logits[..., :-1, :].contiguous()
            shifted_labels = labels[..., 1:].contiguous().to(shifted_logits.device)
            loss = nn.functional.cross_entropy(
                shifted_logits.view(-1, shifted_logits.size(-1)),
                shifted_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=outputs.get("hidden_states"),
            attentions=None,
        )

    def generate(self, inputs, **kwargs):
        """Delegate generation to the wrapped transformer's ``generate``."""
        return self.transformer.generate(inputs, **kwargs)
|
|
|
|
| |
# Placeholders for the transformers Auto classes. Both stay ``None`` until
# ``from_pretrained`` imports the real classes on its first call.
AutoConfig = AutoModel = None
|
|
|
|
def from_pretrained(model_path: str, **kwargs):
    """Load a Codsworth model through ``AutoModel.from_pretrained``.

    On the first call the transformers Auto classes are imported, cached in
    the module-level ``AutoConfig`` / ``AutoModel`` globals, and the Codsworth
    config and model classes are registered with them so that the
    ``"codsworth"`` model type can be resolved.

    Args:
        model_path: Forwarded to ``AutoModel.from_pretrained``.
        **kwargs: Forwarded unchanged to ``AutoModel.from_pretrained``.

    Returns:
        Whatever ``AutoModel.from_pretrained`` returns.
    """
    global AutoConfig, AutoModel

    if AutoConfig is None or AutoModel is None:
        from transformers import AutoConfig as _AutoConfig, AutoModel as _AutoModel

        # Without registration the Auto classes have no mapping for the
        # custom model type. Each call is guarded separately so that an
        # existing config registration does not skip the model registration.
        try:
            _AutoConfig.register(CodsworthConfig.model_type, CodsworthConfig)
        except ValueError:
            # Already registered elsewhere; the existing mapping is kept.
            pass
        try:
            _AutoModel.register(CodsworthConfig, CodsworthModel)
        except ValueError:
            # Already registered elsewhere; the existing mapping is kept.
            pass

        AutoConfig = _AutoConfig
        AutoModel = _AutoModel

    return AutoModel.from_pretrained(model_path, **kwargs)