improve comments
modeling.py +138 -140 CHANGED
@@ -25,37 +25,41 @@ class MultiLabelTokenClassificationHead(nn.Module):
      self.num_labels = config.num_labels
      self.hidden_size = config.hidden_size

      self.dense = nn.Linear(self.hidden_size, self.hidden_size)
      self.activation_fn = F.relu
      self.dropout = nn.Dropout(p=config.classifier_dropout)
      self.layer_norm = nn.LayerNorm(self.hidden_size)

-     #
      self.cat_proj = nn.Linear(self.hidden_size, self.num_categories)
-
-     # Attribute projection: (hidden_size + num_categories) -> num_labels
      self.out_proj = nn.Linear(self.hidden_size + self.num_categories, self.num_labels)

  def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
      """
      Args:
-         features: Word-level features

      Returns:
-         cat_logits: Category logits
-         attr_logits: Attribute logits
      """
-     x = self.dropout(features)
-     x = self.dense(x)
-     x = self.layer_norm(x)
-     x = self.activation_fn(x)

-     #
      cat_logits = self.cat_proj(x)
-     cat_probs = torch.softmax(cat_logits, dim=-1)

-     #
      attr_input = torch.cat((cat_probs, x), dim=-1)
      attr_logits = self.out_proj(attr_input)

      return cat_logits, attr_logits
@@ -94,22 +98,22 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      # Create tensors as regular attributes (not buffers to avoid init warnings)
      self.group_mask = schema.get_group_masks()
      self.group_name_to_group_attr_indices = schema.get_group_name_to_group_attr_indices()
-
      # Category name to index mapping (regular dict, no device movement needed)
      self.category_name_to_index = schema.get_category_name_to_index()
-
- def _apply(self, fn):
      """Override _apply to move our custom tensors with the model."""
      super()._apply(fn)
-
      # Move our custom tensors when model.to(device) is called
-     if hasattr(self, "group_mask"):
          self.group_mask = fn(self.group_mask)
-
-     if hasattr(self, "group_name_to_group_attr_indices"):
          for group_name, tensor in self.group_name_to_group_attr_indices.items():
              self.group_name_to_group_attr_indices[group_name] = fn(tensor)
-
      return self

  def forward(
@@ -126,14 +130,16 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      return_dict: Optional[bool] = None,
  ) -> Tuple[torch.Tensor, torch.Tensor]:
      """
      Args:
-         input_ids: Token indices
-         attention_mask: Attention mask
-         word_mask: Binary mask indicating word boundaries

      Returns:
-         cat_logits: Category logits
-         attr_logits: Attribute logits
      """
      return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -150,112 +156,87 @@ class IceBertPosForTokenClassification(PreTrainedModel):
          return_dict=return_dict,
      )

-     mean_words.append(x[seq_idx, token_idx:end, :].mean(dim=0))
-     mean_words = torch.stack(mean_words)
-     words = mean_words
-     # Innermost dimension is mask for tokens at head of word.
-     nwords = word_mask.sum(dim=-1)
-     (cat_logits, attr_logits) = self.classifier(words)
-
-     # (Batch * Time) x Depth -> Batch x Time x Depth
-     cat_logits = pad_sequence(cat_logits.split((nwords).tolist()), padding_value=0, batch_first=True)
-     attr_logits = pad_sequence(
-         attr_logits.split((nwords).tolist()),
-         padding_value=0,
-         batch_first=True,
-     )
      return cat_logits, attr_logits

  def _aggregate_subword_tokens(
-     self, sequence_output: torch.Tensor, word_mask: torch.Tensor
- ) ->
      """
      Args:
-         sequence_output:
-         word_mask: Binary mask where 1 indicates start of word (

      Returns:
-         word_features:
-         nwords: Number of words per sequence (batch_size,)
      """
-
-     # Remove BOS and EOS tokens (first and last positions)
-     x = sequence_output[:, 1:-1, :]  # (batch_size, seq_len-2, hidden_size)
-     starts = word_mask[:, 1:-1]  # (batch_size, seq_len-2)
-
-     # Count words per sequence
-     nwords = starts.sum(dim=-1)  # (batch_size,)
-
-     # Find word boundaries and average tokens within each word
      mean_words = []
-     batch_size, seq_len, hidden_size = x.shape

      for batch_idx in range(batch_size):
              continue
-
-         #
          mean_words.append(word_repr)

      if len(mean_words) == 0:
-         return torch.empty(0,

-     return torch.stack(mean_words)

- def _reshape_to_batch_format(
-     self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, nwords: torch.Tensor
- ) -> Tuple[torch.Tensor, torch.Tensor]:
      """
-     Reshape word
      Args:
      Returns:
-         attr_logits_batch: (batch_size, max_words, num_labels)
      """
-     # Pad to same length (matching original fairseq approach)
-     cat_logits_batch = pad_sequence(cat_logits_split, batch_first=True, padding_value=0)
-     attr_logits_batch = pad_sequence(attr_logits_split, batch_first=True, padding_value=0)
-
-     return cat_logits_batch, attr_logits_batch

  @torch.no_grad()
  def predict_labels(
@@ -281,24 +262,26 @@ class IceBertPosForTokenClassification(PreTrainedModel):
  def _word_ids_to_word_mask(self, word_ids: List[List[int]], input_shape: torch.Size) -> torch.Tensor:
      """
-     Convert word_ids to

      Args:
-         word_ids: List of word id sequences
-         input_shape: Shape of input_ids tensor (

      Returns:
-         word_mask: Binary tensor where 1 indicates start of word (
      """
      batch_size, seq_len = input_shape
-     word_mask = torch.zeros(batch_size, seq_len, dtype=torch.long)

      for batch_idx, seq_word_ids in enumerate(word_ids):
          prev_word_id = None
          for token_idx, word_id in enumerate(seq_word_ids):
              # Skip None values (special tokens and padding)
              if word_id is not None and word_id != prev_word_id:
-                 word_mask[batch_idx, token_idx] = 1
              # Only update prev_word_id for valid (non-None) word_ids
              if word_id is not None:
                  prev_word_id = word_id
@@ -310,10 +293,12 @@ class IceBertPosForTokenClassification(PreTrainedModel):
  def predict_labels_from_text(self, sentences: List[str], tokenizer) -> List[List[Tuple[str, List[str]]]]:
      """
-     Predict POS labels from raw text

      Args:
-         sentences: List of input sentences
          tokenizer: HuggingFace tokenizer

      Returns:
@@ -334,9 +319,9 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      # Debug logging to match fairseq model
      for i in range(len(sentences)):
-         logger.debug(f"Encoded tokens: {batch_input_ids[i]}")
          logger.debug(f"Decoded tokens: {tokenizer.convert_ids_to_tokens(batch_input_ids[i].tolist())}")
-         logger.debug(f"Word IDs: {word_ids_list[i]}")

      return self.predict_labels(batch_input_ids, batch_attention_mask, word_ids_list)
@@ -345,48 +330,59 @@ class IceBertPosForTokenClassification(PreTrainedModel):
  ) -> List[List[Tuple[str, List[str]]]]:
      """
      Convert logits to human-readable labels using schema-based logic.
      """
-     # logits: Batch x Time x Labels
      bsz, _, num_cats = cat_logits.shape
      _, _, num_attrs = attr_logits.shape
-     nwords = word_mask.sum(-1)

      assert num_attrs == len(self.config.label_schema.labels)
      assert num_cats == len(self.config.label_schema.label_categories)

      predictions = []
      schema = self.config.label_schema
-
      for seq_idx in range(bsz):
          seq_nwords = nwords[seq_idx]
          pred_cat_indices = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
-
          seq_predictions = []
          for word_idx in range(seq_nwords):
-             cat_idx =
              cat_name = schema.label_categories[cat_idx]
-
              # Get valid groups for this category
              valid_groups = schema.category_to_group_names.get(cat_name, [])
-
              # Collect attributes for this word
              attributes = []
              for group_name in valid_groups:
                  if group_name in self.group_name_to_group_attr_indices:
-                     group_indices = self.group_name_to_group_attr_indices[group_name]
                      if len(group_indices) > 0:
                          group_logits = attr_logits[seq_idx, word_idx, group_indices]
                          if len(group_indices) == 1:
-                             # Binary decision
                              if group_logits.sigmoid().item() > 0.5:
-                                 attr_idx =
                                  attributes.append(schema.labels[attr_idx])
                          else:
-                             # Multi-class decision
-                             best_idx =
-                             attr_idx =
                              attributes.append(schema.labels[attr_idx])
-
              # Apply specific rules from original model
              if len(attributes) == 1 and attributes[0] == "pos":
                  # This label is used as a default for training but implied in mim format
@@ -394,9 +390,9 @@ class IceBertPosForTokenClassification(PreTrainedModel):
              elif cat_name == "sl" and "act" in attributes:
                  # Number and tense are not shown for sl act in mim format
                  attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]
-
              seq_predictions.append((cat_name, attributes))
-
          predictions.append(seq_predictions)

      return predictions
@@ -405,12 +401,14 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      """
      Predict IFD format labels from raw text.

      Args:
-         sentences: List of input sentences
          tokenizer: HuggingFace tokenizer

      Returns:
-         List of
      """
      # Get model predictions in (category, [attributes]) format
      predictions = self.predict_labels_from_text(sentences, tokenizer)
@@ -418,7 +416,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      # Convert each sentence's predictions to IFD format
      ifd_predictions = []
      for sentence_predictions in predictions:
-         ifd_labels = convert_predictions_to_ifd(sentence_predictions)
          ifd_predictions.append(ifd_labels)

      return ifd_predictions
@@ -25,37 +25,41 @@ class MultiLabelTokenClassificationHead(nn.Module):
      self.num_labels = config.num_labels
      self.hidden_size = config.hidden_size

+     # (*, H) -> (*, H)
      self.dense = nn.Linear(self.hidden_size, self.hidden_size)
      self.activation_fn = F.relu
      self.dropout = nn.Dropout(p=config.classifier_dropout)
      self.layer_norm = nn.LayerNorm(self.hidden_size)

+     # Projection heads for multilabel classification
+     # (*, H) -> (*, C)
      self.cat_proj = nn.Linear(self.hidden_size, self.num_categories)
+     # (*, H + C) -> (*, A)
      self.out_proj = nn.Linear(self.hidden_size + self.num_categories, self.num_labels)

  def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
      """
+     H = hidden_size, C = num_categories, A = num_attributes, Wt = total_words
+
      Args:
+         features: Word-level features (Wt x H)

      Returns:
+         cat_logits: Category logits (Wt x C)
+         attr_logits: Attribute logits (Wt x A)
      """
+     x = self.dropout(features)  # (Wt x H)
+     x = self.dense(x)  # (Wt x H)
+     x = self.layer_norm(x)  # (Wt x H)
+     x = self.activation_fn(x)  # (Wt x H)

+     # (Wt x H) -> (Wt x C)
      cat_logits = self.cat_proj(x)
+     cat_probs = torch.softmax(cat_logits, dim=-1)  # (Wt x C)

+     # (Wt x H) + (Wt x C) -> (Wt x H+C)
      attr_input = torch.cat((cat_probs, x), dim=-1)
+     # (Wt x H+C) -> (Wt x A)
      attr_logits = self.out_proj(attr_input)

      return cat_logits, attr_logits
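The head is two-stage: it predicts a category distribution first and feeds it back in when scoring the attributes. A minimal sketch of that flow with toy sizes (all dimensions below are invented for illustration):

```python
import torch
import torch.nn as nn

# Toy sizes, chosen only for illustration.
H, C, A, Wt = 8, 3, 5, 10  # hidden, categories, attributes, total words

cat_proj = nn.Linear(H, C)
out_proj = nn.Linear(H + C, A)

x = torch.randn(Wt, H)                      # word-level features (Wt x H)
cat_logits = cat_proj(x)                    # (Wt x C)
cat_probs = torch.softmax(cat_logits, -1)   # (Wt x C)
attr_logits = out_proj(torch.cat((cat_probs, x), dim=-1))  # (Wt x A)

print(cat_logits.shape, attr_logits.shape)  # torch.Size([10, 3]) torch.Size([10, 5])
```

Feeding softmaxed category probabilities (rather than a hard argmax) into `out_proj` keeps the conditioning differentiable end to end.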
@@ -94,22 +98,22 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      # Create tensors as regular attributes (not buffers to avoid init warnings)
      self.group_mask = schema.get_group_masks()
      self.group_name_to_group_attr_indices = schema.get_group_name_to_group_attr_indices()
+
      # Category name to index mapping (regular dict, no device movement needed)
      self.category_name_to_index = schema.get_category_name_to_index()
+
+ def _apply(self, fn):  # type: ignore
      """Override _apply to move our custom tensors with the model."""
      super()._apply(fn)
+
      # Move our custom tensors when model.to(device) is called
+     if hasattr(self, "group_mask"):
          self.group_mask = fn(self.group_mask)
+
+     if hasattr(self, "group_name_to_group_attr_indices"):
          for group_name, tensor in self.group_name_to_group_attr_indices.items():
              self.group_name_to_group_attr_indices[group_name] = fn(tensor)
+
      return self

  def forward(
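`_apply` is the hook that `Module.to()`, `.cuda()` and `.half()` route through, so overriding it is a way to carry plain tensor attributes (which are neither parameters nor buffers) along with the model. A stripped-down sketch of the pattern; the `extra` attribute is made up:

```python
import torch
import torch.nn as nn

class WithExtraTensor(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)
        self.extra = torch.ones(4)   # plain attribute: .to() would not move it by itself

    def _apply(self, fn):
        super()._apply(fn)           # moves parameters and buffers
        self.extra = fn(self.extra)  # move the plain tensor the same way
        return self

m = WithExtraTensor().to("cpu")      # swap "cpu" for "cuda" if a GPU is available
print(m.extra.device)
```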
@@ -126,14 +130,16 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      return_dict: Optional[bool] = None,
  ) -> Tuple[torch.Tensor, torch.Tensor]:
      """
+     B = batch_size, L = seq_len, H = hidden_size, C = num_categories, A = num_attributes, W = max_words
+
      Args:
+         input_ids: Token indices (B x L)
+         attention_mask: Attention mask (B x L)
+         word_mask: Binary mask indicating word boundaries, 1 = word start (B x L)

      Returns:
+         cat_logits: Category logits (B x W x C)
+         attr_logits: Attribute logits (B x W x A)
      """
      return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -150,112 +156,87 @@ class IceBertPosForTokenClassification(PreTrainedModel):
          return_dict=return_dict,
      )

+     hidden_states = outputs[0]  # (B x L x H)
+
+     # (B x L x H) -> (Wt x H)
+     word_embeddings = self._aggregate_subword_tokens(hidden_states, word_mask, attention_mask)
+
+     # (Wt x H) -> (Wt x C), (Wt x A)
+     cat_logits, attr_logits = self.classifier(word_embeddings)
+
+     # (Wt x C) -> (B x W x C), (Wt x A) -> (B x W x A)
+     nwords = word_mask.sum(dim=-1)  # (B,)
+     cat_logits = self._reshape_to_batch_format(cat_logits, nwords)
+     attr_logits = self._reshape_to_batch_format(attr_logits, nwords)
      return cat_logits, attr_logits

  def _aggregate_subword_tokens(
+     self, sequence_output: torch.Tensor, word_mask: torch.Tensor, attention_mask: torch.Tensor
+ ) -> torch.Tensor:
      """
+     Average subword tokens within each word to get word-level representations.
+
+     B = batch_size, L = seq_len, H = hidden_size, Wt = total_words
+
      Args:
+         sequence_output: Subword token representations (B x L x H)
+         word_mask: Binary mask where 1 indicates start of word (B x L)
+         attention_mask: Attention mask to exclude padding tokens (B x L)

      Returns:
+         word_features: Concatenated word-level features (Wt x H)
      """
+     batch_size, seq_len, hidden_size = sequence_output.shape
      mean_words = []

      for batch_idx in range(batch_size):
+         # Get valid (non-padding) tokens for this sequence
+         valid_mask = attention_mask[batch_idx].bool()  # (L,) -> (Lv,)
+         seq_output = sequence_output[batch_idx, valid_mask]  # (Lv x H)
+         seq_word_mask = word_mask[batch_idx, valid_mask]  # (Lv,)
+
+         # Find word start positions
+         word_starts = seq_word_mask.nonzero(as_tuple=True)[0]  # (Ws,)
+
+         if len(word_starts) == 0:
              continue
+
+         # For each word, find its token span and average
+         for i, start_pos in enumerate(word_starts):
+             # Find end position (start of next word or end of valid sequence)
+             if i + 1 < len(word_starts):
+                 end_pos = word_starts[i + 1]
+             else:
+                 end_pos = len(seq_output)
+
+             # Average tokens within this word (excluding padding)
+             word_tokens = seq_output[start_pos:end_pos]  # (Tw x H)
+             word_repr = word_tokens.mean(dim=0)  # (H,)
              mean_words.append(word_repr)

      if len(mean_words) == 0:
+         return torch.empty(0, hidden_size, device=sequence_output.device)

+     return torch.stack(mean_words)  # (Wt x H)

+ def _reshape_to_batch_format(self, logits: torch.Tensor, nwords: torch.Tensor) -> torch.Tensor:
      """
+     Reshape concatenated word predictions back to padded batch format.
+
+     B = batch_size, W = max_words, Wt = total_words, K = num_classes
+
      Args:
+         logits: Concatenated word predictions (Wt x K)
+         nwords: Number of words per sequence (B,)
+
      Returns:
+         batch_logits: Batched predictions (B x W x K)
      """
+     return pad_sequence(
+         logits.split(nwords.tolist()),
+         padding_value=0,
+         batch_first=True,
+     )

  @torch.no_grad()
  def predict_labels(
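Taken together, the forward pass flattens every word in the batch into one `(Wt x H)` tensor, classifies it, and then splits the flat logits back into padded per-sequence rows. A self-contained sketch of the split-and-pad step with invented word counts:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Pretend the classifier produced logits for 5 words total: 3 in the first
# sequence and 2 in the second (counts are made up for illustration).
logits = torch.arange(5 * 4, dtype=torch.float).reshape(5, 4)  # (Wt x K)
nwords = torch.tensor([3, 2])                                   # (B,)

batched = pad_sequence(logits.split(nwords.tolist()), batch_first=True, padding_value=0)
print(batched.shape)  # torch.Size([2, 3, 4]) -> (B x W x K), second row zero-padded
```

`pad_sequence` fills the shorter rows with zeros, which is why the decoding step later slices each sequence to its own `nwords` before reading predictions.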
@@ -281,24 +262,26 @@ class IceBertPosForTokenClassification(PreTrainedModel):
  def _word_ids_to_word_mask(self, word_ids: List[List[int]], input_shape: torch.Size) -> torch.Tensor:
      """
+     Convert word_ids to binary mask indicating word boundaries.

+     B = batch_size, L = seq_len
+
      Args:
+         word_ids: List of word id sequences for each batch item
+         input_shape: Shape of input_ids tensor (B x L)

      Returns:
+         word_mask: Binary tensor where 1 indicates start of word (B x L)
      """
      batch_size, seq_len = input_shape
+     word_mask = torch.zeros(batch_size, seq_len, dtype=torch.long)  # (B x L)

      for batch_idx, seq_word_ids in enumerate(word_ids):
          prev_word_id = None
          for token_idx, word_id in enumerate(seq_word_ids):
              # Skip None values (special tokens and padding)
              if word_id is not None and word_id != prev_word_id:
+                 word_mask[batch_idx, token_idx] = 1  # Mark word start
              # Only update prev_word_id for valid (non-None) word_ids
              if word_id is not None:
                  prev_word_id = word_id
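With a fast tokenizer, `word_ids()` yields one entry per subword token (`None` for special tokens), so marking positions where the id changes recovers word starts. A hand-traced example of the mapping the helper builds; the tokens and ids are invented:

```python
import torch

# word_ids for one sequence of 5 tokens: [CLS] "hest" "##arnir" "hlupu" [SEP]
word_ids = [[None, 0, 0, 1, None]]
batch_size, seq_len = 1, 5

word_mask = torch.zeros(batch_size, seq_len, dtype=torch.long)
for b, seq in enumerate(word_ids):
    prev = None
    for t, wid in enumerate(seq):
        if wid is not None and wid != prev:
            word_mask[b, t] = 1   # first subword of a new word
        if wid is not None:
            prev = wid

print(word_mask.tolist())  # [[0, 1, 0, 1, 0]] -> two word starts
```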
@@ -310,10 +293,12 @@ class IceBertPosForTokenClassification(PreTrainedModel):
  def predict_labels_from_text(self, sentences: List[str], tokenizer) -> List[List[Tuple[str, List[str]]]]:
      """
+     Predict POS labels from raw text.

+     B = batch_size, L = seq_len
+
      Args:
+         sentences: List of input sentences (B,)
          tokenizer: HuggingFace tokenizer

      Returns:
@@ -334,9 +319,9 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      # Debug logging to match fairseq model
      for i in range(len(sentences)):
+         logger.debug(f"Encoded tokens: {batch_input_ids[i]}")  # (L,)
          logger.debug(f"Decoded tokens: {tokenizer.convert_ids_to_tokens(batch_input_ids[i].tolist())}")
+         logger.debug(f"Word IDs: {word_ids_list[i]}")  # (L,)

      return self.predict_labels(batch_input_ids, batch_attention_mask, word_ids_list)
@@ -345,48 +330,59 @@ class IceBertPosForTokenClassification(PreTrainedModel):
  ) -> List[List[Tuple[str, List[str]]]]:
      """
      Convert logits to human-readable labels using schema-based logic.
+
+     B = batch_size, W = max_words, C = num_categories, A = num_attributes, L = seq_len
+
+     Args:
+         cat_logits: Category logits (B x W x C)
+         attr_logits: Attribute logits (B x W x A)
+         word_mask: Binary mask for valid words (B x L)
+
+     Returns:
+         predictions: List of [(category, [attributes])] for each sequence
      """
      bsz, _, num_cats = cat_logits.shape
      _, _, num_attrs = attr_logits.shape
+     nwords = word_mask.sum(-1)  # (B,)

      assert num_attrs == len(self.config.label_schema.labels)
      assert num_cats == len(self.config.label_schema.label_categories)

      predictions = []
      schema = self.config.label_schema
+
      for seq_idx in range(bsz):
          seq_nwords = nwords[seq_idx]
+         # (W x C) -> (seq_nwords,)
          pred_cat_indices = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
+
          seq_predictions = []
          for word_idx in range(seq_nwords):
+             cat_idx = pred_cat_indices[word_idx].item()
              cat_name = schema.label_categories[cat_idx]
+
              # Get valid groups for this category
              valid_groups = schema.category_to_group_names.get(cat_name, [])
+
              # Collect attributes for this word
              attributes = []
              for group_name in valid_groups:
                  if group_name in self.group_name_to_group_attr_indices:
+                     group_indices = self.group_name_to_group_attr_indices[group_name]  # (Gs,)
                      if len(group_indices) > 0:
+                         # (A,) -> (Gs,)
                          group_logits = attr_logits[seq_idx, word_idx, group_indices]
                          if len(group_indices) == 1:
+                             # Binary decision for single-item groups
                              if group_logits.sigmoid().item() > 0.5:
+                                 attr_idx = group_indices[0].item()
                                  attributes.append(schema.labels[attr_idx])
                          else:
+                             # Multi-class decision for multi-item groups
+                             best_idx = group_logits.max(dim=-1).indices.item()
+                             attr_idx = group_indices[best_idx].item()
                              attributes.append(schema.labels[attr_idx])
+
              # Apply specific rules from original model
              if len(attributes) == 1 and attributes[0] == "pos":
                  # This label is used as a default for training but implied in mim format
@@ -394,9 +390,9 @@ class IceBertPosForTokenClassification(PreTrainedModel):
              elif cat_name == "sl" and "act" in attributes:
                  # Number and tense are not shown for sl act in mim format
                  attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]
+
              seq_predictions.append((cat_name, attributes))
+
          predictions.append(seq_predictions)

      return predictions
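The decoding rule per word is: within each attribute group that is valid for the predicted category, a single-member group is a sigmoid-thresholded yes/no and a multi-member group takes its argmax. A toy illustration with an invented label inventory and group indices:

```python
import torch

labels = ["pos", "sing", "plur", "def"]          # invented attribute inventory
groups = {"number": torch.tensor([1, 2]),        # multi-class group
          "definite": torch.tensor([3])}         # binary (single-item) group

attr_logits = torch.tensor([0.2, -1.0, 2.5, 0.8])  # one word's attribute logits (A,)

attributes = []
for name, idx in groups.items():
    group_logits = attr_logits[idx]
    if len(idx) == 1:
        if group_logits.sigmoid().item() > 0.5:          # binary decision
            attributes.append(labels[idx[0].item()])
    else:
        best = group_logits.max(dim=-1).indices.item()   # multi-class decision
        attributes.append(labels[idx[best].item()])

print(attributes)  # ['plur', 'def']
```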
@@ -405,12 +401,14 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      """
      Predict IFD format labels from raw text.

+     B = batch_size, Ws = seq_words
+
      Args:
+         sentences: List of input sentences (B,)
          tokenizer: HuggingFace tokenizer

      Returns:
+         ifd_predictions: List of IFD labels per sentence (B x Ws)
      """
      # Get model predictions in (category, [attributes]) format
      predictions = self.predict_labels_from_text(sentences, tokenizer)
@@ -418,7 +416,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
      # Convert each sentence's predictions to IFD format
      ifd_predictions = []
      for sentence_predictions in predictions:
+         ifd_labels = convert_predictions_to_ifd(sentence_predictions)  # (Ws,)
          ifd_predictions.append(ifd_labels)

      return ifd_predictions
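End to end, the model can be driven from raw text via the tokenizer-based helpers. A hedged usage sketch: the repo id is a placeholder and the exact loading call may differ (the class may need to be imported from this file directly rather than through `AutoModel`):

```python
from transformers import AutoModel, AutoTokenizer

repo = "some-org/icebert-pos"  # placeholder repo id, not the real checkpoint name
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModel.from_pretrained(repo, trust_remote_code=True).eval()

sentences = ["Hestarnir hlupu hratt."]
# One (category, [attributes]) pair per word, as returned by predict_labels_from_text
predictions = model.predict_labels_from_text(sentences, tokenizer)
print(predictions[0])
```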