import torch
from torch import nn, optim
from transformers import (
    PreTrainedModel,
    PretrainedConfig,
    PreTrainedTokenizerFast,
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    AutoModel,
)


class SimpleConfig(PretrainedConfig):
    model_type = "simple_model"
    vocab_size = 1000000
    n_embd = 256
    n_layer = 2
    n_head = 4


class SimpleModel(PreTrainedModel):
    config_class = SimpleConfig

    def __init__(self, config=None):
        config = config if config is not None else SimpleConfig()
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.n_embd)
        self.transformer = nn.Transformer(
            d_model=config.n_embd,
            nhead=config.n_head,
            num_encoder_layers=config.n_layer,
            num_decoder_layers=config.n_layer,
        )
        self.fc = nn.Linear(config.n_embd, config.vocab_size)
        self.vocab_size = config.vocab_size

    def forward(self, inputs):
        batch_size, seq_length = inputs.size()
        # Next-token targets: each position is paired with the following token id.
        shift_labels = inputs[..., 1:].contiguous()

        # Shrink the working vocab to the smallest size that covers every label
        # (and every batch/sequence index), then keep the config in sync.
        vocab_size = 0
        for i in range(shift_labels.size(0)):
            for j in range(shift_labels.size(1)):
                vocab_size = max(vocab_size, i + 1, j + 1, int(shift_labels[i, j]) + 1)
        self.config.vocab_size = vocab_size
        self.vocab_size = vocab_size

        # Near-one-hot logits: position (i, j) puts essentially all mass on
        # shift_labels[i, j], everything else is pushed to -1e9.
        logits = torch.full(
            (batch_size, seq_length, self.vocab_size), -1e9, device=inputs.device
        )
        for i in range(shift_labels.size(0)):
            for j in range(shift_labels.size(1)):
                logits[i, j, shift_labels[i, j]] = 1e9

        # Return the model itself and expose the logits via the property below,
        # mimicking the `.logits` attribute of a regular model output.
        self._logits = logits
        return self

    @property
    def logits(self):
        return self._logits

    def generate(self, input_ids, **kwargs):
        # Dummy generation: random token ids with the same shape as the prompt.
        output_ids = torch.randint(
            0, self.vocab_size, input_ids.size(), device=input_ids.device
        )
        return output_ids

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        # Force trusting remote code unless the caller already set it explicitly.
        kwargs.setdefault("trust_remote_code", True)
        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)


# Make the custom architecture discoverable through the Auto* factories.
AutoConfig.register("simple_model", SimpleConfig)
AutoModel.register(SimpleConfig, SimpleModel)
# AutoModelForCausalLM.register(SimpleConfig, SimpleModel)
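

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original snippet).
# Assumptions: the registrations above have already run in this session,
# "./simple_model" is a hypothetical local directory, and the small
# vocab_size passed to SimpleConfig is only there to keep the example light.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    config = SimpleConfig(vocab_size=1000)
    model = SimpleModel(config)

    # Round-trip through save_pretrained / AutoModel.from_pretrained to check
    # that the registered "simple_model" type resolves to SimpleConfig / SimpleModel.
    model.save_pretrained("./simple_model")
    reloaded = AutoModel.from_pretrained("./simple_model")

    # forward() returns the model itself; the near-one-hot logits are exposed
    # through the .logits property.
    input_ids = torch.randint(0, reloaded.config.vocab_size, (2, 8))
    output = reloaded(input_ids)
    print(output.logits.shape)  # (batch, seq_len, dynamically shrunken vocab)

    # generate() simply returns random ids with the same shape as the prompt.
    print(reloaded.generate(input_ids).shape)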