yangwang825
/

bert-base-uncased-cls

@@ -1,8 +1,12 @@
 {
   "affine": true,
   "attention_probs_dropout_prob": 0.1,
   "auto_map": {
-    "AutoConfig": "configuration_bert.BertConfig"
   },
   "classifier_dropout": null,
   "hidden_act": "gelu",
@@ -17,6 +21,7 @@
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "transformers_version": "4.33.3",
   "type_vocab_size": 2,
   "use_cache": true,

 {
   "affine": true,
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
   "attention_probs_dropout_prob": 0.1,
   "auto_map": {
+    "AutoConfig": "configuration_bert.BertConfig",
+    "AutoModelForSequenceClassification": "modeling_bert.BertForSequenceClassification"
   },
   "classifier_dropout": null,
   "hidden_act": "gelu",
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
   "transformers_version": "4.33.3",
   "type_vocab_size": 2,
   "use_cache": true,

modeling_bert.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
-from collections import OrderedDict
 from typing import Optional, List, Union, Tuple
 from transformers import (
     PretrainedConfig,
@@ -48,163 +46,24 @@ class BertPreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
-class PFSA(nn.Module):
-    """
-    https://openreview.net/pdf?id=isodM5jTA7h
-    """
-    def __init__(self, input_dim, alpha=1):
-        super(PFSA, self).__init__()
-        self.input_dim = input_dim
-        self.alpha = alpha
-    def forward(self, x, mask=None):
-        """
-        x: [B, T, F]
-        """
-        x = x.transpose(1, 2)[..., None]
-        k = torch.mean(x, dim=[-1, -2], keepdim=True)
-        kd = torch.sqrt((k - k.mean(dim=1, keepdim=True)).pow(2).sum(dim=1, keepdim=True)) # [B, 1, 1, 1]
-        qd = torch.sqrt((x - x.mean(dim=1, keepdim=True)).pow(2).sum(dim=1, keepdim=True)) # [B, 1, T, 1]
-        C_qk = (((x - x.mean(dim=1, keepdim=True)) * (k - k.mean(dim=1, keepdim=True))).sum(dim=1, keepdim=True)) / (qd * kd)
-        A = (1 - torch.sigmoid(C_qk)) ** self.alpha
-        out = x * A
-        out = out.squeeze(dim=-1).transpose(1, 2)
-        return out
-class PURE(nn.Module):
-    def __init__(
-        self,
-        in_dim,
-        q=5,
-        r=1,
-        center=False,
-        num_iters=1,
-        return_mean=True,
-        return_std=True,
-        normalize=False,
-        do_pcr=True,
-        do_pfsa=True,
-        alpha=1,
-        *args, **kwargs
-    ):
-        super().__init__()
-        self.in_dim = in_dim
-        self.target_rank = q
-        self.num_pc_to_remove = r
-        self.center = center
-        self.num_iters = num_iters
-        self.return_mean = return_mean
-        self.return_std = return_std
-        self.normalize = normalize
-        self.do_pcr = do_pcr
-        self.do_pfsa = do_pfsa
-        # self.attention = SelfAttention(in_dim)
-        self.attention = PFSA(in_dim, alpha=alpha)
-        self.eps = 1e-5
-        if self.normalize:
-            self.norm = nn.Sequential(OrderedDict([
-                ('relu', nn.LeakyReLU(inplace=True)),
-                ('bn', nn.BatchNorm1d(in_dim)),
-            ]))
-    def get_out_dim(self):
-        if self.return_mean and self.return_std:
-            self.out_dim = self.in_dim * 2
-        else:
-            self.out_dim = self.in_dim
-        return self.out_dim
-    def _compute_pc(self, x):
-        """
-        x: (B, T, F)
-        """
-        _, _, V = torch.pca_lowrank(x, q=self.target_rank, center=self.center, niter=self.num_iters)
-        pc = V.transpose(1, 2)[:, :self.num_pc_to_remove, :] # pc: [B, K, F]
-        return pc
-    def forward(self, x, attention_mask=None, *args, **kwargs):
-        """
-        PCR -> Attention
-        x: (B, F, T)
-        """
-        if self.normalize:
-            x = self.norm(x)
-        xt = x.transpose(1, 2)
-        if self.do_pcr:
-            pc = self._compute_pc(xt) # pc: [B, K, F]
-            xx = xt - xt @ pc.transpose(1, 2) @ pc # [B, T, F] * [B, F, K] * [B, K, F] = [B, T, F]
-        else:
-            xx = xt
-        if self.do_pfsa:
-            xx = self.attention(xx, attention_mask)
-        if self.normalize:
-            xx = F.normalize(xx, p=2, dim=2)
-        return xx
 class BertPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.pure = PURE(
-            config.hidden_size,
-            q=config.q,
-            r=config.r,
-            center=config.center,
-            num_iters=config.num_iters,
-            return_mean=config.return_mean,
-            return_std=config.return_std,
-            normalize=config.normalize,
-            do_pcr=config.do_pcr,
-            do_pfsa=config.do_pfsa,
-            alpha=config.alpha
-        )
         if config.affine:
             self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         else:
             self.dense = nn.Identity()
         self.activation = nn.Tanh()
-        self.eps = 1e-5
-    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
-        hidden_states = self.pure(hidden_states.transpose(1, 2), attention_mask)
-        mean_tensor = self.mean_pooling(hidden_states, attention_mask)
-        pooled_output = self.dense(mean_tensor)
         pooled_output = self.activation(pooled_output)
         return pooled_output
-    def _get_gauss_noise(self, shape_of_tensor, device="cpu"):
-        """Returns a tensor of epsilon Gaussian noise.
-        Arguments
-        ---------
-        shape_of_tensor : tensor
-            It represents the size of tensor for generating Gaussian noise.
-        """
-        gnoise = torch.randn(shape_of_tensor, device=device)
-        gnoise -= torch.min(gnoise)
-        gnoise /= torch.max(gnoise)
-        gnoise = self.eps * ((1 - 9) * gnoise + 9)
-        return gnoise
-    def add_noise(self, tensor):
-        gnoise = self._get_gauss_noise(tensor.size(), device=tensor.device)
-        gnoise = gnoise
-        tensor += gnoise
-        return tensor
-    def mean_pooling(self, token_embeddings, attention_mask):
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        mean = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-        # mean = self.add_noise(mean)
-        return mean
 class BertModel(BertPreTrainedModel):
@@ -324,7 +183,7 @@ class BertModel(BertPreTrainedModel):
             return_dict=return_dict,
         )
         sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output, attention_mask) if self.pooler is not None else None
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]

 import torch
 import torch.nn as nn
 from typing import Optional, List, Union, Tuple
 from transformers import (
     PretrainedConfig,
             module.weight.data.fill_(1.0)
 class BertPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
         if config.affine:
             self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         else:
             self.dense = nn.Identity()
         self.activation = nn.Tanh()
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
         pooled_output = self.activation(pooled_output)
         return pooled_output
 class BertModel(BertPreTrainedModel):
             return_dict=return_dict,
         )
         sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:64dd3354da4b868afe78cc83d9e51ed4ca20cab88015a22a38257b205c9eadd4
 size 438000689

 version https://git-lfs.github.com/spec/v1
+oid sha256:ae9bcc7a4ae5b93f43cf78aa7dea754315ca54e073d4a6b4c780bc4be2dd2406
 size 438000689