import torch
import torch.nn as nn
from transformers import BertForTokenClassification


class FourOClassifier(nn.Module):
    """Token-classification head: Linear -> ReLU -> Dropout -> BatchNorm -> Linear.

    Expects input of shape (batch, seq_len, clf_hidden_size) — the usual
    BERT sequence-output layout — and returns per-token logits of shape
    (batch, seq_len, num_labels).
    """

    def __init__(self, clf_hidden_size, num_labels):
        super().__init__()
        self.dense = nn.Linear(clf_hidden_size, clf_hidden_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batch_norm = nn.BatchNorm1d(clf_hidden_size)
        self.output_layer = nn.Linear(clf_hidden_size, num_labels)

    def forward(self, clf_input):
        """Compute per-token logits.

        Args:
            clf_input: Tensor of shape (batch, seq_len, clf_hidden_size).

        Returns:
            Tensor of shape (batch, seq_len, num_labels).
        """
        x = self.dense(clf_input)
        x = self.activation(x)
        x = self.dropout(x)
        # BatchNorm1d normalizes over dim 1, i.e. expects (N, C, L);
        # swap seq and feature dims, normalize, then swap back.
        x = self.batch_norm(x.permute(0, 2, 1)).permute(0, 2, 1)
        x = self.output_layer(x)
        return x


class BertForTokenClassificationWithFourO(BertForTokenClassification):
    """BertForTokenClassification with the default linear head replaced by
    the deeper :class:`FourOClassifier` head."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.classifier = FourOClassifier(config.hidden_size, config.num_labels)
        self.init_weights()

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """Load pretrained weights, then sanity-check that the custom
        classifier head was freshly initialized (not loaded)."""
        model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        model.check_classifier_initialization()
        return model

    def check_classifier_initialization(self):
        """Heuristically check whether the classifier head looks randomly
        initialized and warn if it does not.

        BUGFIX: the original version built the weight list and the helper
        predicate but never applied the check, so this method was a no-op.

        Returns:
            bool: True if every inspected parameter passes the heuristic.
        """
        def is_randomly_initialized(tensor):
            # Heuristic: fresh random init has near-zero mean and a small
            # but non-degenerate standard deviation. Thresholds are rough;
            # a loaded/trained head typically drifts outside this band.
            t = tensor.detach().float()
            mean_val = t.mean().abs().item()
            std_val = t.std().item()
            return mean_val < 1e-3 and 1e-3 < std_val < 1e-1

        classifier_weights = [
            self.classifier.dense.weight,
            self.classifier.dense.bias,
            self.classifier.output_layer.weight,
            self.classifier.output_layer.bias,
        ]
        all_random = all(is_randomly_initialized(t) for t in classifier_weights)
        if not all_random:
            print(
                "Warning: classifier head parameters do not look randomly "
                "initialized; they may have been loaded from the checkpoint."
            )
        return all_random

    def freeze_bert(self):
        """Freezes the BERT layers to prevent their parameters from being
        updated during training."""
        for param in self.bert.parameters():
            param.requires_grad = False
        print("BERT layers frozen.")

    def unfreeze_bert(self):
        """Unfreezes the BERT layers to allow their parameters to be
        updated during training."""
        for param in self.bert.parameters():
            param.requires_grad = True
        print("BERT layers unfrozen.")