Đinh Trác Đức Anh committed
Commit 8679365 · Parent(s): 50f5d14
create batch bias matrix

Files changed:
- bias_utils.py  +44 -66
- model.py  +31 -32
bias_utils.py
CHANGED

Old version (single-sample):

@@ -3,73 +3,51 @@ import numpy as np

def create_bias_matrix(bmes_tags, alpha=0.1, beta=-0.05, gamma=0.0, delta=0.0):
    """
    Args:
        bmes_tags: list or tensor of BMES tags
            - if a list: ['B', 'M', 'E', 'S', ...]
            - if a tensor: [0, 1, 2, 3, ...] (B=0, M=1, E=2, S=3)
        alpha: weight for a pair of tokens in the same word
        beta: weight for a pair of tokens in different words
        gamma: weight when an 'S' token is involved
        delta: weight for the diagonal (a token with itself)

    Returns:
        bias_matrix: the bias matrix, shape (seq_len, seq_len)
    """
    # Convert tensor -> list of 'B'/'M'/'E'/'S'
    if isinstance(bmes_tags, torch.Tensor):
        BMES_MAP_INV = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
        bmes_tags = [BMES_MAP_INV[t] for t in bmes_tags.tolist()]

    seq_len = len(bmes_tags)
    bias_matrix = np.zeros((seq_len, seq_len))

    # Group tokens into words
    word_groups = []
    current_group = [0]
    for i in range(1, seq_len):
        prev_tag = bmes_tags[i - 1]
        if prev_tag in ['E', 'S']:
            word_groups.append(current_group)
            current_group = [i]
        else:
            current_group.append(i)

    # Add the last group
    if current_group:
        word_groups.append(current_group)

    # Fill in the matrix values
    for i in range(seq_len):
        for j in range(seq_len):
            if i == j:
                # Diagonal
                bias_matrix[i, j] = delta
            elif bmes_tags[i] == 'S' or bmes_tags[j] == 'S':
                # At least one token is 'S'
                bias_matrix[i, j] = gamma
            else:
                same_word = False
                for group in word_groups:
                    if i in group and j in group:
                        same_word = True
                        break
                bias_matrix[i, j] = alpha if same_word else beta

    return bias_matrix
New version (batch-aware):

@@ -3,73 +3,51 @@ import numpy as np

def create_bias_matrix(bmes_tags, alpha=0.1, beta=-0.05, gamma=0.0, delta=0.0):
    """
    Supports:
        - bmes_tags: shape [seq_len] (single sample) or [B, seq_len] (batch)
    Returns bias_matrix:
        - single sample: [seq_len, seq_len]
        - batch: [B, seq_len, seq_len]
    """
    def single_bias(seq_tags):
        # Convert tensor -> list of 'B'/'M'/'E'/'S'
        if isinstance(seq_tags, torch.Tensor):
            BMES_MAP_INV = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
            seq_tags = [BMES_MAP_INV[t] for t in seq_tags.tolist()]

        seq_len = len(seq_tags)
        bias_matrix = np.zeros((seq_len, seq_len))

        # Group tokens into words
        word_groups = []
        current_group = [0]
        for i in range(1, seq_len):
            prev_tag = seq_tags[i - 1]
            if prev_tag in ['E', 'S']:
                word_groups.append(current_group)
                current_group = [i]
            else:
                current_group.append(i)
        if current_group:
            word_groups.append(current_group)

        # Fill in the bias values
        for i in range(seq_len):
            for j in range(seq_len):
                if i == j:
                    bias_matrix[i, j] = delta
                elif seq_tags[i] == 'S' or seq_tags[j] == 'S':
                    bias_matrix[i, j] = gamma
                else:
                    same_word = any(i in g and j in g for g in word_groups)
                    bias_matrix[i, j] = alpha if same_word else beta
        return bias_matrix

    if isinstance(bmes_tags, torch.Tensor) and bmes_tags.dim() == 2:
        # Batch input
        batch_bias = [single_bias(bmes_tags[i]) for i in range(bmes_tags.size(0))]
        return np.stack(batch_bias, axis=0)  # [B, seq_len, seq_len]
    else:
        # Single sample
        return single_bias(bmes_tags)  # [seq_len, seq_len]
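As a quick sanity check of the new behavior, a minimal usage sketch (the tag ids and values below are illustrative; they follow the B=0, M=1, E=2, S=3 mapping used inside the function):

import torch
from bias_utils import create_bias_matrix

# Two toy sequences of 5 sub-tokens each: [B, seq_len] = [2, 5]
tags = torch.tensor([[0, 2, 3, 0, 2],    # B E | S | B E  -> words {0,1}, {2}, {3,4}
                     [0, 1, 2, 3, 3]])   # B M E | S | S  -> words {0,1,2}, {3}, {4}
bias = create_bias_matrix(tags, alpha=0.1, beta=-0.05, gamma=0.0, delta=0.0)
print(bias.shape)   # (2, 5, 5): one [seq_len, seq_len] matrix per sample
print(bias[0])      # 0.1 for pairs inside the same word, -0.05 across words,
                    # 0.0 on the diagonal and wherever an 'S' token is involved

A 1-D tensor or a plain list such as ['B', 'E', 'S'] still yields a single (seq_len, seq_len) matrix, so existing single-sample callers keep working.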
model.py
CHANGED

Old version:

@@ -7,64 +7,63 @@ class MorphemeAwareRobertaModel(RobertaModel):

    """
    PhoBERT extended with:
    - BoundaryAwareEmbeddings (BMES + gate)
    - BMES bias hook on attention heads
    """

    def __init__(self, config, target_heads=None, alpha=0.1, beta=-0.05, gamma=0.0, delta=0.0, **kwargs):
        """
        config: RobertaConfig or an HF pretrained path
        target_heads: dict[layer_idx] = list[head_idx] to apply the bias to
        alpha, beta, gamma, delta: BMES bias weights
        kwargs: kept so HF from_pretrained can call this
        """
        super().__init__(config, **kwargs)

        self.embeddings = BoundaryAwareEmbeddings(config, **kwargs)

        self.target_heads = target_heads or {}
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.delta = delta

        # Tokenizer (set externally)
        self.tokenizer = None

        # Store hooks and the bias matrix
        self.bias_hooks = {}
        self.bias_matrix = None

    def set_tokenizer(self, tokenizer):
        assert tokenizer is not None
        self.tokenizer = tokenizer

    def set_bias_matrix(self, bmes_tags):
        num_heads = self.config.num_attention_heads
        self.bias_matrix =

    def _register_attention_hook(self, layer_idx, head_indices):
        def hook_fn(module, input, output):
                return output
            context_layer,
            if
                return output
            bias = self.bias_matrix
            bias = bias[:, :, :seq_len, :seq_len]
            for h in head_indices:
                if h <
            return (context_layer,

        attn = self.encoder.layer[layer_idx].attention.self
        hook = attn.register_forward_hook(hook_fn)

@@ -103,7 +102,7 @@ class MorphemeAwareRobertaModel(RobertaModel):

        if self.target_heads:
            self.prepare_bias_hooks()

        output_attentions = True

        outputs = super().forward(
            input_ids=input_ids,

@@ -118,4 +117,4 @@ class MorphemeAwareRobertaModel(RobertaModel):

        )

        self.remove_bias_hooks()
        return outputs
New version:

@@ -7,64 +7,63 @@ class MorphemeAwareRobertaModel(RobertaModel):

    """
    PhoBERT extended with:
    - BoundaryAwareEmbeddings (BMES + gate)
    - BMES bias hook on attention heads, with batch support
    """

    def __init__(self, config, target_heads=None, alpha=0.1, beta=-0.05, gamma=0.0, delta=0.0, **kwargs):
        super().__init__(config, **kwargs)

        # New embeddings
        self.embeddings = BoundaryAwareEmbeddings(config, **kwargs)

        # Bias parameters
        self.target_heads = target_heads or {}
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.delta = delta

        self.tokenizer = None
        self.bias_hooks = {}
        self.bias_matrix = None  # shape: [B, num_heads, seq_len, seq_len]

    def set_tokenizer(self, tokenizer):
        assert tokenizer is not None
        self.tokenizer = tokenizer

    def set_bias_matrix(self, bmes_tags):
        """
        bmes_tags: tensor of shape [B, seq_len] or [seq_len]
        Builds and stores self.bias_matrix with shape [B, num_heads, seq_len, seq_len]
        """
        if isinstance(bmes_tags, torch.Tensor) and bmes_tags.dim() == 1:
            # Single sample -> add a batch dimension
            bmes_tags = bmes_tags.unsqueeze(0)

        batch_size, seq_len = bmes_tags.shape
        bias_np = create_bias_matrix(bmes_tags, alpha=self.alpha, beta=self.beta, gamma=self.gamma, delta=self.delta)
        bias_tensor = torch.tensor(bias_np, dtype=torch.float32, device=next(self.parameters()).device)

        # Repeat across attention heads
        num_heads = self.config.num_attention_heads
        bias_tensor = bias_tensor.unsqueeze(1).repeat(1, num_heads, 1, 1)  # [B, num_heads, seq_len, seq_len]
        self.bias_matrix = bias_tensor
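To make the head expansion concrete, a small standalone shape walk-through (the sizes, including 12 heads as in phobert-base, are illustrative):

import numpy as np
import torch

B, L, num_heads = 2, 6, 12                        # illustrative batch, length, heads
bias_np = np.zeros((B, L, L), dtype=np.float32)   # what create_bias_matrix returns for a batch
bias = torch.tensor(bias_np)                      # [B, L, L]
bias = bias.unsqueeze(1).repeat(1, num_heads, 1, 1)
print(bias.shape)                                 # torch.Size([2, 12, 6, 6])

Because every head receives the same matrix, expand would avoid materializing num_heads copies; the commit uses repeat, which allocates them explicitly and keeps the tensor writable.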
    def _register_attention_hook(self, layer_idx, head_indices):
        def hook_fn(module, input, output):
            # output: (context_layer, attention_probs)
            if not isinstance(output, tuple) or len(output) < 2:
                return output
            context_layer, attn_probs = output
            if attn_probs is None or self.bias_matrix is None:
                return output
            B, H, L, _ = attn_probs.shape
            bias = self.bias_matrix
            if bias.size(-1) != L:
                bias = bias[:, :, :L, :L]
            for h in head_indices:
                if h < H:
                    attn_probs[:, h, :, :] += bias[:, h, :, :]
            return (context_layer, attn_probs)

        attn = self.encoder.layer[layer_idx].attention.self
        hook = attn.register_forward_hook(hook_fn)
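The injection relies on a documented PyTorch behavior: when a forward hook returns a value, that value replaces the module's output. A minimal standalone sketch (toy module, not from this repo):

import torch
import torch.nn as nn

layer = nn.Linear(4, 4)

def add_one_hook(module, inputs, output):
    # Returning a (modified) value from a forward hook overrides the module's output.
    return output + 1.0

handle = layer.register_forward_hook(add_one_hook)
x = torch.zeros(2, 4)
print(layer(x))    # hooked output: the layer's usual result shifted by +1.0
handle.remove()    # same cleanup idea as remove_bias_hooks()
print(layer(x))    # original output again

hook_fn above applies the same pattern to the self-attention module, whose output tuple is (context_layer, attention_probs) when output_attentions is enabled.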
@@ -103,7 +102,7 @@ class MorphemeAwareRobertaModel(RobertaModel):

        if self.target_heads:
            self.prepare_bias_hooks()

        output_attentions = True if output_attentions is None else output_attentions

        outputs = super().forward(
            input_ids=input_ids,

@@ -118,4 +117,4 @@ class MorphemeAwareRobertaModel(RobertaModel):

        )

        self.remove_bias_hooks()
        return outputs
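Putting the pieces together, a sketch of the intended calling sequence. The checkpoint name, the BMES ids, and the assumption that the overridden forward() accepts the usual input_ids/attention_mask arguments (which the super().forward call above suggests) are illustrative, not taken from this commit:

import torch
from transformers import AutoConfig, AutoTokenizer
from model import MorphemeAwareRobertaModel

config = AutoConfig.from_pretrained("vinai/phobert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

model = MorphemeAwareRobertaModel(config, target_heads={0: [0, 1]})  # bias heads 0 and 1 of layer 0
model.set_tokenizer(tokenizer)

enc = tokenizer("học sinh đi học", return_tensors="pt")
seq_len = enc["input_ids"].size(1)
bmes_tags = torch.full((1, seq_len), 3)   # all-'S' tags, just to exercise the shapes
model.set_bias_matrix(bmes_tags)          # stores a [1, num_heads, seq_len, seq_len] bias

outputs = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])

The weights here are randomly initialized; in practice the model would be loaded from a pretrained checkpoint before running this flow.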