from math import sqrt, log
import sys

import torch
import torch.nn as nn
from torch.nn.functional import softmax, relu, linear
from torch.cuda.amp import autocast

import yaml
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import MaskedLMOutput, BaseModelOutput

from common import PositionalEncoding
from hopfield import HopfieldLayer, HopfieldMHA, HopfieldReLU, HopfieldSoftmax


class BertEnergyConfig(PretrainedConfig):

    model_type = "bert_energy"

    def __init__(self, config=None, path=None, vocabulary_size=50, num_layers=12,
                 num_heads=12, forward_memories=2048, embedding_dim=768,
                 activation="relu", positional=True, bias=True, tie_weights=True,
                 alpha=1.0, beta=1.0, layer_norm=1e-05, dropout=0.0, block_size=512,
                 share_layers=False, compile=False, pad_idx=None, **kwargs):
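        """Configuration for the BERT-energy model.

        Hyperparameters may be given as keyword arguments, copied from an
        existing config object (``config``), or read from a YAML file
        (``path``); only keys matching an attribute defined below are kept.
        """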
        self.vocabulary_size = vocabulary_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.activation = activation
        self.positional = positional
        self.tie_weights = tie_weights
        self.bias = bias
        self.forward_memories = forward_memories
        self.embedding_dim = embedding_dim
        self.share_layers = share_layers
        self.alpha = alpha
        self.beta = beta
        self.layer_norm = float(layer_norm)
        self.dropout = dropout
        self.block_size = block_size
        self.compile = compile
        self.pad_idx = pad_idx

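        # Optional overrides: copy matching fields from an existing config
        # object (via to_dict) or from a YAML file; keys are lowercased and
        # must correspond to one of the attributes set above.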
        if config is not None:
            for key, value in config.to_dict().items():
                if key.lower() in self.__dict__:
                    print(key, file=sys.stderr)
                    setattr(self, key.lower(), value)
        elif path is not None:
            if path.endswith(".yaml"):
                with open(path) as istream:
                    config = yaml.safe_load(istream)
                for key, value in config.items():
                    print(key)
                    if key.lower() in self.__dict__:
                        setattr(self, key.lower(), value)
            else:
                raise NotImplementedError

        super().__init__(**kwargs)

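# A minimal usage sketch (not part of the original module; "bert_energy.yaml"
# is a hypothetical file name): the config can be built from keyword
# overrides or from a YAML file, then passed to a model like any other
# Hugging Face configuration object.
#
#     config = BertEnergyConfig(num_layers=6, embedding_dim=512)
#     print(config.num_layers, config.embedding_dim)   # 6 512
#
#     yaml_config = BertEnergyConfig(path="bert_energy.yaml")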