haukurpj committed
Commit a2c9d48 · 1 Parent(s): 3bce6c8

add vectorized and more stable way to aggregate subwords to words
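
The gist of the change, as a minimal standalone sketch (toy shapes and variable names are illustrative, not the code in modeling.py): subword vectors are summed per word with scatter_add_ and divided by per-word token counts, rather than averaging each word span in a Python loop.

import torch

hidden = 4
# Six subword tokens belonging to three words: [w0, w0, w1, w2, w2, w2]
token_embeddings = torch.randn(6, hidden)    # (tokens x H)
word_ids = torch.tensor([0, 0, 1, 2, 2, 2])  # token -> word index

num_words = int(word_ids.max().item()) + 1
word_sums = torch.zeros(num_words, hidden)
word_sums.scatter_add_(0, word_ids.unsqueeze(1).expand(-1, hidden), token_embeddings)

word_counts = torch.zeros(num_words)
word_counts.scatter_add_(0, word_ids, torch.ones_like(word_ids, dtype=torch.float))

word_embeddings = word_sums / word_counts.clamp(min=1.0).unsqueeze(1)  # (words x H)
print(word_embeddings.shape)  # torch.Size([3, 4])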

Files changed (1): modeling.py (+98, -51)
modeling.py CHANGED
@@ -40,7 +40,7 @@ class MultiLabelTokenClassificationHead(nn.Module):
     def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         H = hidden_size, C = num_categories, A = num_attributes, Wt = total_words
-
+
         Args:
             features: Word-level features (Wt x H)
 
@@ -131,7 +131,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         B = batch_size, L = seq_len, H = hidden_size, C = num_categories, A = num_attributes, W = max_words
-
+
         Args:
             input_ids: Token indices (B x L)
             attention_mask: Attention mask (B x L)
@@ -157,13 +157,13 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         )
 
         hidden_states = outputs[0]  # (B x L x H)
-
+
         # (B x L x H) -> (Wt x H)
         word_embeddings = self._aggregate_subword_tokens(hidden_states, word_mask, attention_mask)
-
+
         # (Wt x H) -> (Wt x C), (Wt x A)
         cat_logits, attr_logits = self.classifier(word_embeddings)
-
+
         # (Wt x C) -> (B x W x C), (Wt x A) -> (B x W x A)
         nwords = word_mask.sum(dim=-1)  # (B,)
         cat_logits = self._reshape_to_batch_format(cat_logits, nwords)
@@ -175,9 +175,10 @@ class IceBertPosForTokenClassification(PreTrainedModel):
     ) -> torch.Tensor:
         """
         Average subword tokens within each word to get word-level representations.
-
+        Vectorized implementation using scatter operations for efficiency.
+
         B = batch_size, L = seq_len, H = hidden_size, Wt = total_words
-
+
         Args:
             sequence_output: Subword token representations (B x L x H)
             word_mask: Binary mask where 1 indicates start of word (B x L)
@@ -187,48 +188,94 @@ class IceBertPosForTokenClassification(PreTrainedModel):
             word_features: Concatenated word-level features (Wt x H)
         """
         batch_size, seq_len, hidden_size = sequence_output.shape
-        mean_words = []
-
-        for batch_idx in range(batch_size):
-            # Get valid (non-padding) tokens for this sequence
-            valid_mask = attention_mask[batch_idx].bool()  # (L,) -> (Lv,)
-            seq_output = sequence_output[batch_idx, valid_mask]  # (Lv x H)
-            seq_word_mask = word_mask[batch_idx, valid_mask]  # (Lv,)
-
-            # Find word start positions
-            word_starts = seq_word_mask.nonzero(as_tuple=True)[0]  # (Ws,)
-
+        device = sequence_output.device
+
+        # Create word indices mapping each token to its word
+        # Strategy: assign each token to a word ID, then use scatter operations to sum/average
+        # Only tokens that belong to actual words get valid indices
+        word_indices = torch.full_like(word_mask, -1, dtype=torch.long)  # (B x L)
+
+        # Build word indices by finding word boundaries
+        # Each token gets assigned to a word index (0, 1, 2, ...) within its sequence
+        for b in range(batch_size):
+            valid_mask = attention_mask[b].bool()  # (L,) - exclude padding tokens
+            if not valid_mask.any():
+                continue
+
+            # Get word starts for this sequence
+            seq_word_mask = word_mask[b, valid_mask]  # (Lv,) - only valid positions
+            word_starts = seq_word_mask.nonzero(as_tuple=True)[0]  # (Ws,) - positions where words start
+
             if len(word_starts) == 0:
                 continue
-
-            # For each word, find its token span and average
+
+            # Assign each token to its word within this sequence
+            seq_word_indices = torch.full((len(seq_word_mask),), -1, dtype=torch.long, device=device)
+
             for i, start_pos in enumerate(word_starts):
-                # Find end position (start of next word or end of valid sequence)
+                # Find end position (next word start or end of sequence)
                 if i + 1 < len(word_starts):
-                    end_pos = word_starts[i + 1]
+                    end_pos = word_starts[i + 1]  # Next word boundary
                 else:
-                    end_pos = len(seq_output)
-
-                # Average tokens within this word (excluding padding)
-                word_tokens = seq_output[start_pos:end_pos]  # (Tw x H)
-                word_repr = word_tokens.mean(dim=0)  # (H,)
-                mean_words.append(word_repr)
+                    end_pos = len(seq_word_mask)  # End of sequence
+
+                # All tokens from start_pos to end_pos belong to word i
+                seq_word_indices[start_pos:end_pos] = i
+
+            # Store the word indices for this sequence
+            word_indices[b, valid_mask] = seq_word_indices
+
+        # Create global word indices across the entire batch
+        # Convert local word indices (0,1,2... per sequence) to global indices (0,1,2...total_words-1)
+        # This allows us to use scatter operations across the entire batch
+        max_words_per_seq = word_mask.sum(dim=-1)  # (B,) - words per sequence
+        word_offset = torch.cat(
+            [torch.zeros(1, device=device, dtype=torch.long), max_words_per_seq.cumsum(dim=0)[:-1]]
+        )  # (B,) - cumulative word offsets
+
+        # Add batch offsets to make global unique indices
+        # E.g., if batch has [3,2] words: seq0=[0,1,2], seq1=[3,4]
+        global_word_indices = word_indices + word_offset.unsqueeze(1)  # (B x L)
+
+        # Flatten everything for scatter operations
+        flat_output = sequence_output.view(-1, hidden_size)  # (B*L x H)
+        flat_word_indices = global_word_indices.view(-1)  # (B*L,)
+        flat_attention = attention_mask.view(-1)  # (B*L,)
+
+        # Only use tokens that belong to words (not padding and not before first word)
+        valid_word_tokens = (flat_attention.bool()) & (flat_word_indices >= 0)  # (B*L,)
+        valid_output = flat_output[valid_word_tokens]  # (valid_word_tokens x H)
+        valid_word_indices = flat_word_indices[valid_word_tokens]  # (valid_word_tokens,)
+
+        total_words = max_words_per_seq.sum().item()
+        if total_words == 0:
+            return torch.empty(0, hidden_size, device=device)
+
+        # Vectorized aggregation using scatter operations
+        # Sum all token embeddings that belong to the same word
+        word_sums = torch.zeros(total_words, hidden_size, device=device)  # (Wt x H)
+        word_sums.scatter_add_(0, valid_word_indices.unsqueeze(1).expand(-1, hidden_size), valid_output)
+
+        # Count how many tokens belong to each word (for averaging)
+        word_counts = torch.zeros(total_words, device=device)  # (Wt,)
+        word_counts.scatter_add_(0, valid_word_indices, torch.ones_like(valid_word_indices, dtype=torch.float))
 
-        if len(mean_words) == 0:
-            return torch.empty(0, hidden_size, device=sequence_output.device)
+        # Compute average: word_embedding = sum_of_tokens / count_of_tokens
+        word_counts = torch.clamp(word_counts, min=1.0)  # Prevent division by zero
+        word_features = word_sums / word_counts.unsqueeze(1)  # (Wt x H)
 
-        return torch.stack(mean_words)  # (Wt x H)
+        return word_features
 
     def _reshape_to_batch_format(self, logits: torch.Tensor, nwords: torch.Tensor) -> torch.Tensor:
         """
         Reshape concatenated word predictions back to padded batch format.
-
+
         B = batch_size, W = max_words, Wt = total_words, K = num_classes
-
+
         Args:
             logits: Concatenated word predictions (Wt x K)
             nwords: Number of words per sequence (B,)
-
+
         Returns:
             batch_logits: Batched predictions (B x W x K)
         """
@@ -265,7 +312,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         Convert word_ids to binary mask indicating word boundaries.
 
         B = batch_size, L = seq_len
-
+
         Args:
             word_ids: List of word id sequences for each batch item
            input_shape: Shape of input_ids tensor (B x L)
@@ -296,7 +343,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         Predict POS labels from raw text.
 
         B = batch_size, L = seq_len
-
+
         Args:
             sentences: List of input sentences (B,)
             tokenizer: HuggingFace tokenizer
@@ -330,14 +377,14 @@ class IceBertPosForTokenClassification(PreTrainedModel):
     ) -> List[List[Tuple[str, List[str]]]]:
         """
         Convert logits to human-readable labels using schema-based logic.
-
+
         B = batch_size, W = max_words, C = num_categories, A = num_attributes, L = seq_len
-
+
         Args:
             cat_logits: Category logits (B x W x C)
             attr_logits: Attribute logits (B x W x A)
             word_mask: Binary mask for valid words (B x L)
-
+
         Returns:
             predictions: List of [(category, [attributes])] for each sequence
         """
@@ -350,20 +397,20 @@ class IceBertPosForTokenClassification(PreTrainedModel):
 
         predictions = []
         schema = self.config.label_schema
-
+
         for seq_idx in range(bsz):
             seq_nwords = nwords[seq_idx]
             # (W x C) -> (seq_nwords,)
             pred_cat_indices = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
-
+
             seq_predictions = []
             for word_idx in range(seq_nwords):
                 cat_idx = pred_cat_indices[word_idx].item()
                 cat_name = schema.label_categories[cat_idx]
-
+
                 # Get valid groups for this category
                 valid_groups = schema.category_to_group_names.get(cat_name, [])
-
+
                 # Collect attributes for this word
                 attributes = []
                 for group_name in valid_groups:
@@ -382,7 +429,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
                     best_idx = group_logits.max(dim=-1).indices.item()
                     attr_idx = group_indices[best_idx].item()
                     attributes.append(schema.labels[attr_idx])
-
+
                 # Apply specific rules from original model
                 if len(attributes) == 1 and attributes[0] == "pos":
                     # This label is used as a default for training but implied in mim format
@@ -390,9 +437,9 @@ class IceBertPosForTokenClassification(PreTrainedModel):
                 elif cat_name == "sl" and "act" in attributes:
                     # Number and tense are not shown for sl act in mim format
                     attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]
-
+
                 seq_predictions.append((cat_name, attributes))
-
+
             predictions.append(seq_predictions)
 
         return predictions
@@ -400,25 +447,25 @@ class IceBertPosForTokenClassification(PreTrainedModel):
     def predict_ifd_labels_from_text(self, sentences: List[str], tokenizer) -> List[List[str]]:
         """
         Predict IFD format labels from raw text.
-
+
         B = batch_size, Ws = seq_words
-
+
         Args:
             sentences: List of input sentences (B,)
             tokenizer: HuggingFace tokenizer
-
+
         Returns:
             ifd_predictions: List of IFD labels per sentence (B x Ws)
         """
         # Get model predictions in (category, [attributes]) format
         predictions = self.predict_labels_from_text(sentences, tokenizer)
-
+
         # Convert each sentence's predictions to IFD format
         ifd_predictions = []
         for sentence_predictions in predictions:
             ifd_labels = convert_predictions_to_ifd(sentence_predictions)  # (Ws,)
             ifd_predictions.append(ifd_labels)
-
+
         return ifd_predictions
 
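
Reviewer-style sanity check, added for context: a self-contained toy comparison of the old loop-based span averaging against the new scatter-based averaging. The two helpers below are re-implementations for illustration only (toy shapes, made-up names), not imports from modeling.py.

import torch

def aggregate_loop(seq_out, word_mask, attn_mask):
    # Old approach: Python loop over sequences and word spans, mean per span.
    words = []
    for b in range(seq_out.size(0)):
        valid = attn_mask[b].bool()
        out, wm = seq_out[b, valid], word_mask[b, valid]
        starts = wm.nonzero(as_tuple=True)[0]
        for i, s in enumerate(starts):
            e = starts[i + 1] if i + 1 < len(starts) else len(out)
            words.append(out[s:e].mean(dim=0))
    return torch.stack(words)  # (Wt x H)

def aggregate_scatter(seq_out, word_mask, attn_mask):
    # New approach: map every token to a global word id, then one scatter_add_ per batch.
    B, L, H = seq_out.shape
    word_idx = torch.full((B, L), -1, dtype=torch.long)
    for b in range(B):
        valid = attn_mask[b].bool()
        wm = word_mask[b, valid]
        starts = wm.nonzero(as_tuple=True)[0]
        idx = torch.full((len(wm),), -1, dtype=torch.long)
        for i, s in enumerate(starts):
            e = starts[i + 1] if i + 1 < len(starts) else len(wm)
            idx[s:e] = i
        word_idx[b, valid] = idx
    nwords = word_mask.sum(dim=-1)  # words per sequence, e.g. [3, 2]
    offsets = torch.cat([torch.zeros(1, dtype=torch.long), nwords.cumsum(0)[:-1]])
    flat_idx = (word_idx + offsets.unsqueeze(1)).view(-1)
    keep = attn_mask.view(-1).bool() & (flat_idx >= 0)
    total = int(nwords.sum())
    sums = torch.zeros(total, H).scatter_add_(
        0, flat_idx[keep].unsqueeze(1).expand(-1, H), seq_out.view(-1, H)[keep]
    )
    counts = torch.zeros(total).scatter_add_(
        0, flat_idx[keep], torch.ones_like(flat_idx[keep], dtype=torch.float)
    )
    return sums / counts.clamp(min=1.0).unsqueeze(1)  # (Wt x H)

torch.manual_seed(0)
seq_out = torch.randn(2, 6, 4)                                  # (B x L x H)
attn = torch.tensor([[1, 1, 1, 1, 1, 0], [1, 1, 1, 0, 0, 0]])   # padding mask
wmask = torch.tensor([[1, 0, 1, 1, 0, 0], [1, 1, 0, 0, 0, 0]])  # word-start mask
assert torch.allclose(aggregate_loop(seq_out, wmask, attn),
                      aggregate_scatter(seq_out, wmask, attn), atol=1e-5)
print("loop-based and scatter-based aggregation agree")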