codsworth-3.8m / codsworth /huggingface_wrapper.py
Jaqshanahan's picture
Initial upload of Codsworth model
b84d85a verified
"""
Codsworth HuggingFace Model Wrapper
Allows loading Codsworth using HuggingFace's AutoModel
"""
import json
from typing import Optional, Tuple
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from .config import CodsworthConfig
class CodsworthConfig(PretrainedConfig):
    """HuggingFace-compatible configuration for Codsworth.

    The architecture hyperparameters are stored as plain attributes; the
    special-token ids, ``torch_dtype`` and any extra keyword arguments are
    forwarded to ``PretrainedConfig.__init__``.

    NOTE: this class rebinds the module-level name ``CodsworthConfig`` that
    the file also imports from ``.config``; after this definition the name
    refers to this HuggingFace config.

    Args:
        vocab_size: Vocabulary size.
        hidden_size: Model width (passed on as ``embedding_dim`` by
            ``CodsworthModel``).
        num_hidden_layers: Layer count (passed on as ``num_layers``).
        num_attention_heads: Attention head count (passed on as
            ``num_heads``).
        head_dim: Per-head dimension.
        intermediate_size: Feed-forward width (passed on as
            ``ffn_hidden_dim``).
        max_position_embeddings: Maximum sequence length (passed on as
            ``context_length``).
        rope_theta: RoPE base value.
        use_rope: Whether rotary position embeddings are enabled.
        hidden_dropout: Dropout rate (passed on as ``dropout``).
        attention_dropout: Attention dropout rate.
        pad_token_id: Padding token id.
        bos_token_id: Beginning-of-sequence token id.
        eos_token_id: End-of-sequence token id.
        torch_dtype: Dtype name forwarded to ``PretrainedConfig``.
        **kwargs: Remaining options forwarded to ``PretrainedConfig``.
    """

    model_type = "codsworth"

    def __init__(
        self,
        vocab_size: int = 5004,
        hidden_size: int = 256,
        num_hidden_layers: int = 2,
        num_attention_heads: int = 4,
        head_dim: int = 64,
        intermediate_size: int = 512,
        max_position_embeddings: int = 128,
        rope_theta: float = 10000.0,
        use_rope: bool = True,
        hidden_dropout: float = 0.1,
        attention_dropout: float = 0.0,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        torch_dtype: str = "float32",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.use_rope = use_rope
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        # ``torch_dtype`` is a named parameter, so it never reaches ``kwargs``;
        # it has to be forwarded explicitly or the value (including one read
        # back from a saved config) is silently discarded.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            torch_dtype=torch_dtype,
            **kwargs,
        )
class CodsworthPreTrainedModel(PreTrainedModel):
    """Base class for Codsworth models.

    Binds ``CodsworthConfig`` as the config class and provides the weight
    initialisation hook used by ``PreTrainedModel``.
    """

    config_class = CodsworthConfig
    base_model_prefix = "codsworth"

    def _init_weights(self, module: nn.Module) -> None:
        """Initialise ``module`` in place.

        ``nn.Linear`` and ``nn.Embedding`` weights are drawn from
        N(0, 0.02); linear biases are zeroed. Any other module type is left
        untouched.

        Args:
            module: The sub-module to initialise.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            # ``normal_`` overwrites every row, including the padding row that
            # ``nn.Embedding`` keeps at zero; restore it.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
class CodsworthModel(CodsworthPreTrainedModel):
    """Codsworth model exposed through the HuggingFace model interface.

    Wraps ``codsworth.model.CodsworthTransformer`` and converts its
    dict-style output into a ``CausalLMOutputWithPast``.
    """

    def __init__(self, config: CodsworthConfig):
        """Build the wrapped transformer from a HuggingFace config.

        Args:
            config: HuggingFace-style Codsworth configuration.
        """
        super().__init__(config)
        self.config = config

        # The module-level name ``CodsworthConfig`` is rebound to the
        # HuggingFace config class defined in this file, so the native config
        # has to be imported under a distinct alias; otherwise the
        # "internal" config would just be another HuggingFace config.
        from .config import CodsworthConfig as InternalCodsworthConfig

        # Translate HuggingFace field names to the native model's names.
        internal_config = InternalCodsworthConfig(
            vocab_size=config.vocab_size,
            context_length=config.max_position_embeddings,
            embedding_dim=config.hidden_size,
            num_layers=config.num_hidden_layers,
            num_heads=config.num_attention_heads,
            head_dim=config.head_dim,
            ffn_hidden_dim=config.intermediate_size,
            use_rope=config.use_rope,
            rope_theta=config.rope_theta,
            dropout=config.hidden_dropout,
            attention_dropout=config.attention_dropout,
            use_flash_attention=False,
            use_gradient_checkpointing=False,
        )

        # Import the actual model
        from codsworth.model import CodsworthTransformer

        self.transformer = CodsworthTransformer(internal_config)

        # Expose the transformer's output head under the conventional
        # ``lm_head`` name; both attributes refer to the same module object.
        self.lm_head = self.transformer.lm_head

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> CausalLMOutputWithPast:
        """Run the wrapped transformer and package its output.

        Args:
            input_ids: Token ids passed to the transformer.
            attention_mask: Optional mask passed to the transformer.
            position_ids: Optional position ids passed to the transformer.
            labels: Optional target token ids. When given and the transformer
                does not return a loss itself, a next-token cross-entropy
                loss is computed here (labels are shifted by one position
                and entries equal to ``-100`` are ignored).

        Returns:
            ``CausalLMOutputWithPast`` carrying ``loss``, ``logits`` and the
            transformer's ``hidden_states`` entry (if any);
            ``past_key_values`` and ``attentions`` are always ``None``.
        """
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        logits = outputs["logits"]
        loss = outputs.get("loss")

        # ``labels`` are never handed to the transformer, so without this the
        # caller would always get ``loss=None`` even when labels are supplied.
        if loss is None and labels is not None:
            # assumes logits are (batch, seq, vocab) -- TODO confirm against
            # CodsworthTransformer. Position t predicts token t + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
            loss = nn.functional.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)).float(),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=outputs.get("hidden_states"),
            attentions=None,
        )

    def generate(self, inputs, **kwargs):
        """Delegate generation to the wrapped transformer's ``generate``.

        Args:
            inputs: Forwarded positionally to ``self.transformer.generate``.
            **kwargs: Forwarded unchanged.

        Returns:
            Whatever ``self.transformer.generate`` returns.
        """
        return self.transformer.generate(inputs, **kwargs)
# Auto mapping
# Both names start as ``None`` and are replaced by the ``transformers`` Auto
# classes on the first ``from_pretrained`` call.
AutoConfig = None
AutoModel = None


def from_pretrained(model_path: str, **kwargs):
    """Load Codsworth from a pretrained checkpoint via ``AutoModel``.

    On first use the ``transformers`` Auto classes are imported and the
    Codsworth config/model classes are registered with them, so that a
    checkpoint whose ``model_type`` is ``"codsworth"`` can be resolved.

    Args:
        model_path: Forwarded to ``AutoModel.from_pretrained``.
        **kwargs: Forwarded unchanged to ``AutoModel.from_pretrained``.

    Returns:
        The object returned by ``AutoModel.from_pretrained``.
    """
    global AutoConfig, AutoModel
    if AutoConfig is None:
        from transformers import AutoConfig as _AutoConfig, AutoModel as _AutoModel

        AutoConfig = _AutoConfig
        AutoModel = _AutoModel

        # Without registration the Auto classes do not know the "codsworth"
        # model type. A ValueError here means the mapping already holds an
        # entry for it (e.g. registered by another module), which is fine.
        try:
            AutoConfig.register(CodsworthConfig.model_type, CodsworthConfig)
        except ValueError:
            pass
        try:
            AutoModel.register(CodsworthConfig, CodsworthModel)
        except ValueError:
            pass
    return AutoModel.from_pretrained(model_path, **kwargs)