mideind
/

IceBERT-PoS

@@ -97,61 +97,60 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         # Create tensors as regular attributes (not buffers to avoid init warnings)
         self.group_mask = schema.get_group_masks()  # (C x G)
         # Convert group mappings to tensor format for GPU operations
         self._create_tensor_group_mappings(schema)
         # Category name to index mapping (regular dict, no device movement needed)
         self.category_name_to_index = schema.get_category_name_to_index()
     def _create_tensor_group_mappings(self, schema):
         """
         Create tensor-based group mappings for efficient GPU operations.
         Converts Python dict-based schema to tensors to avoid CPU-GPU context switching.
         This optimization replaces dict lookups with tensor indexing for better performance.
         C = num_categories, G = num_groups, A = num_attributes
         """
         # Reads schema.group_names, schema.group_name_to_labels and schema.labels, plus
         # self.group_mask; writes self.group_attr_indices, self.group_sizes and
         # self.category_to_groups.
         num_groups = len(schema.group_names)
         # NOTE(review): num_labels is not read anywhere else in this method.
-        num_labels = len(schema.labels)
         device = torch.device("cpu")  # Will be moved with model
         # Create group attribute indices tensor: (G x max_group_size)
         # Instead of dict lookups, we can index directly: group_attr_indices[group_id, :]
         # NOTE: max() raises ValueError when schema.group_name_to_labels is empty.
         max_group_size = max(len(labels) for labels in schema.group_name_to_labels.values())
         # Every slot starts at -1; only slots whose label is found in schema.labels are overwritten.
         self.group_attr_indices = torch.full((num_groups, max_group_size), -1, dtype=torch.long, device=device)
         self.group_sizes = torch.zeros(num_groups, dtype=torch.long, device=device)  # (G,)
         for group_idx, group_name in enumerate(schema.group_names):
             group_labels = schema.group_name_to_labels[group_name]
             # group_size counts every label listed for the group, including labels that are
             # missing from schema.labels (their slot keeps the -1 filler).
             group_size = len(group_labels)
             self.group_sizes[group_idx] = group_size
             for label_idx, label in enumerate(group_labels):
                 # Membership test plus .index() each scan schema.labels; .index() yields the
                 # first matching position.
                 if label in schema.labels:
                     attr_idx = schema.labels.index(label)
                     self.group_attr_indices[group_idx, label_idx] = attr_idx
         # Create category to groups mapping: (C x G) - which groups are valid for each category
         # Replaces dict-based category_to_group_names with tensor indexing
         # Usage: category_to_groups[cat_idx, :] gives valid groups for category cat_idx
         # clone() gives category_to_groups its own storage, separate from group_mask.
         self.category_to_groups = self.group_mask.clone()  # (C x G)
-    def _apply(self, fn):
         """Override _apply to move our custom tensors with the model."""
         # Let the parent class apply fn to everything it tracks itself first.
         super()._apply(fn)
         # Move our custom tensors when model.to(device) is called
         # These four tensors are stored as plain attributes, so fn is applied to each one
         # explicitly and the result is re-assigned.
         # The hasattr guards skip any attribute that has not been assigned on this instance
         # (presumably when _apply runs before __init__ has created them - confirm).
-        if hasattr(self, 'group_mask'):
             self.group_mask = fn(self.group_mask)
-        if hasattr(self, 'group_attr_indices'):
             self.group_attr_indices = fn(self.group_attr_indices)
-        if hasattr(self, 'group_sizes'):
             self.group_sizes = fn(self.group_sizes)
-        if hasattr(self, 'category_to_groups'):
             self.category_to_groups = fn(self.category_to_groups)
         # Return self, so calls built on _apply can be chained.
         return self
     def forward(
@@ -285,7 +284,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         valid_output = flat_output[valid_word_tokens]  # (valid_word_tokens x H)
         valid_word_indices = flat_word_indices[valid_word_tokens]  # (valid_word_tokens,)
-        total_words = max_words_per_seq.sum().item()
         if total_words == 0:
             return torch.empty(0, hidden_size, device=device)
@@ -372,7 +371,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
                     prev_word_id = word_id
             # Debug logging to match fairseq model
-            logger.debug(f"Word mask: {word_mask[batch_idx].tolist()}")
         return word_mask
@@ -417,86 +416,105 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         Convert logits to human-readable labels using vectorized operations.
         Key optimizations:
-        1. Tensor-based schema lookups instead of Python dict access
-        2. Vectorized argmax for categories across entire batch
-        3. Reduced CPU-GPU context switching by batching operations
-        4. Pre-computed tensor mappings for group/attribute relationships
         B = batch_size, W = max_words, C = num_categories, A = num_attributes, G = num_groups
-        Args:
-            cat_logits: Category logits (B x W x C)
-            attr_logits: Attribute logits (B x W x A)
-            word_mask: Binary mask for valid words (B x L)
-        Returns:
-            predictions: List of [(category, [attributes])] for each sequence
         """
         device = cat_logits.device
         bsz, max_words = cat_logits.shape[:2]
         nwords = word_mask.sum(-1)  # (B,)
         schema = self.config.label_schema
-        # Vectorized category prediction: (B x W)
-        # Single GPU operation instead of nested loops
-        pred_cat_indices = cat_logits.argmax(dim=-1)  # (B x W)
-        # Vectorized attribute prediction for all groups
-        predictions = []
-        for seq_idx in range(bsz):
-            seq_nwords = int(nwords[seq_idx].item())
-            if seq_nwords == 0:
-                predictions.append([])
                 continue
-            # Get categories for this sequence: (seq_nwords,)
-            seq_cat_indices = pred_cat_indices[seq_idx, :seq_nwords]
-            # Get valid groups for each category: (seq_nwords x G)
-            # Tensor lookup replaces dict access: category_to_group_names[cat_name]
-            seq_valid_groups = self.category_to_groups[seq_cat_indices]  # (seq_nwords x G)
-            # Process attributes for all words in sequence
             seq_predictions = []
-            for word_idx in range(seq_nwords):
-                cat_idx = seq_cat_indices[word_idx].item()
                 cat_name = schema.label_categories[cat_idx]
-                # Get valid groups for this word
-                word_valid_groups = seq_valid_groups[word_idx]  # (G,)
-                valid_group_indices = word_valid_groups.nonzero(as_tuple=True)[0]  # (valid_groups,)
-                # Collect attributes using vectorized operations
                 attributes = []
-                for group_idx in valid_group_indices:
-                    group_idx = group_idx.item()
-                    group_size = self.group_sizes[group_idx].item()
-                    if group_size == 0:
-                        continue
-                    # Get attribute indices for this group: (group_size,)
-                    # Tensor lookup replaces dict: group_name_to_group_attr_indices[group_name]
-                    group_attr_indices = self.group_attr_indices[group_idx, :group_size]
-                    valid_indices = group_attr_indices[group_attr_indices >= 0]
-                    if len(valid_indices) == 0:
-                        continue
-                    # Get logits for this group: (group_size,)
-                    group_logits = attr_logits[seq_idx, word_idx, valid_indices]
-                    if len(valid_indices) == 1:
-                        # Binary decision
-                        if group_logits.sigmoid().item() > 0.5:
-                            attr_idx = valid_indices[0].item()
-                            attributes.append(schema.labels[attr_idx])
-                    else:
-                        # Multi-class decision
-                        best_local_idx = group_logits.argmax().item()
-                        attr_idx = valid_indices[best_local_idx].item()
-                        attributes.append(schema.labels[attr_idx])
                 # Apply post-processing rules
                 if len(attributes) == 1 and attributes[0] == "pos":
@@ -507,6 +525,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
                     attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]
                 seq_predictions.append((cat_name, attributes))
             predictions.append(seq_predictions)

         # Create tensors as regular attributes (not buffers to avoid init warnings)
         self.group_mask = schema.get_group_masks()  # (C x G)
         # Convert group mappings to tensor format for GPU operations
         self._create_tensor_group_mappings(schema)
         # Category name to index mapping (regular dict, no device movement needed)
         self.category_name_to_index = schema.get_category_name_to_index()
     def _create_tensor_group_mappings(self, schema):
         """
         Build the tensor lookup tables that mirror the label schema.

         Sets three attributes on ``self``:

         * ``group_attr_indices`` - (G x max_group_size) long tensor. Row ``g`` holds, for
           each label slot of group ``g``, the index of that label in ``schema.labels``.
           Padding slots and slots whose label is absent from ``schema.labels`` stay -1.
         * ``group_sizes`` - (G,) long tensor with the number of labels listed for each
           group (absent labels are still counted).
         * ``category_to_groups`` - an independent clone of ``self.group_mask`` (C x G).

         All tensors are created on the CPU; the class's ``_apply`` override re-applies
         later device moves to them.

         C = num_categories, G = num_groups, A = num_attributes

         Args:
             schema: Label schema exposing ``group_names``, ``group_name_to_labels`` and
                 ``labels``. ``self.group_mask`` must already be set.
         """
         group_names = schema.group_names
         group_name_to_labels = schema.group_name_to_labels
         cpu = torch.device("cpu")

         # Resolve every label to its first position once, rather than scanning
         # schema.labels (membership test + .index) for each label of each group.
         # setdefault keeps the first occurrence, matching list.index semantics.
         label_to_index = {}
         for position, label in enumerate(schema.labels):
             label_to_index.setdefault(label, position)

         num_groups = len(group_names)
         # default=0 keeps a schema without any groups from raising ValueError in max().
         max_group_size = max((len(labels) for labels in group_name_to_labels.values()), default=0)

         # -1 marks "no attribute here"; consumers are expected to filter on >= 0.
         self.group_attr_indices = torch.full((num_groups, max_group_size), -1, dtype=torch.long, device=cpu)
         self.group_sizes = torch.zeros(num_groups, dtype=torch.long, device=cpu)  # (G,)

         for group_idx, group_name in enumerate(group_names):
             group_labels = group_name_to_labels[group_name]
             self.group_sizes[group_idx] = len(group_labels)
             for slot, label in enumerate(group_labels):
                 attr_idx = label_to_index.get(label)
                 if attr_idx is not None:
                     self.group_attr_indices[group_idx, slot] = attr_idx

         # Which groups are valid for each category: category_to_groups[cat_idx, :].
         # Cloned so it does not share storage with group_mask.
         self.category_to_groups = self.group_mask.clone()  # (C x G)
+    def _apply(self, fn):  # type: ignore[override]
         """Override _apply to move our custom tensors with the model."""
         # NOTE(review): the type-ignore suggests this signature differs from the parent's;
         # newer torch.nn.Module._apply versions also accept a `recurse` argument - confirm
         # whether it needs to be accepted and forwarded here.
         # Let the parent class apply fn to everything it tracks itself first.
         super()._apply(fn)
         # Move our custom tensors when model.to(device) is called
         # These four tensors are stored as plain attributes, so fn is applied to each one
         # explicitly and the result is re-assigned.
         # The hasattr guards skip any attribute that has not been assigned on this instance
         # (presumably when _apply runs before __init__ has created them - confirm).
+        if hasattr(self, "group_mask"):
             self.group_mask = fn(self.group_mask)
+        if hasattr(self, "group_attr_indices"):
             self.group_attr_indices = fn(self.group_attr_indices)
+        if hasattr(self, "group_sizes"):
             self.group_sizes = fn(self.group_sizes)
+        if hasattr(self, "category_to_groups"):
             self.category_to_groups = fn(self.category_to_groups)
         # Return self, so calls built on _apply can be chained.
         return self
     def forward(
         valid_output = flat_output[valid_word_tokens]  # (valid_word_tokens x H)
         valid_word_indices = flat_word_indices[valid_word_tokens]  # (valid_word_tokens,)
+        total_words = max_words_per_seq.sum()
         if total_words == 0:
             return torch.empty(0, hidden_size, device=device)
                     prev_word_id = word_id
             # Debug logging to match fairseq model
+            logger.debug(f"Word mask: {word_mask[batch_idx]}")
         return word_mask
         Convert logits to human-readable labels using vectorized operations.
         Key optimizations:
+        1. Flatten batch dimension to process all words simultaneously
+        2. Vectorized group processing across all words
+        3. Defer string conversion to the very end
+        4. Minimize Python loops and tensor-CPU transfers
         B = batch_size, W = max_words, C = num_categories, A = num_attributes, G = num_groups
         """
         device = cat_logits.device
         bsz, max_words = cat_logits.shape[:2]
         nwords = word_mask.sum(-1)  # (B,)
         schema = self.config.label_schema
+        # Step 1: Create valid word mask and flatten batch dimension
+        # (B x W) -> (total_words,) to process all words simultaneously
+        batch_word_mask = torch.zeros(bsz, max_words, dtype=torch.bool, device=device)
+        for b in range(bsz):
+            if nwords[b] > 0:
+                batch_word_mask[b, :nwords[b]] = True
+        valid_positions = batch_word_mask.flatten().nonzero(as_tuple=True)[0]  # (total_words,)
+        total_words = len(valid_positions)
+        if total_words == 0:
+            return [[] for _ in range(bsz)]
+        # Step 2: Vectorized category prediction for all valid words
+        flat_cat_logits = cat_logits.view(-1, cat_logits.size(-1))  # (B*W x C)
+        flat_attr_logits = attr_logits.view(-1, attr_logits.size(-1))  # (B*W x A)
+        # Get categories for all valid words: (total_words,)
+        all_cat_indices = flat_cat_logits[valid_positions].argmax(dim=-1)
+        # Step 3: Vectorized group validity for all words: (total_words x G)
+        all_valid_groups = self.category_to_groups[all_cat_indices]
+        # Step 4: Collect attributes using vectorized group processing
+        word_to_attrs = {}  # word_idx -> list of attr_indices
+        # Process each group across all words simultaneously
+        for group_idx in range(self.group_sizes.size(0)):
+            group_size = self.group_sizes[group_idx].item()
+            if group_size == 0:
+                continue
+            # Find words that have this group valid: (words_with_group,)
+            words_with_group = all_valid_groups[:, group_idx].nonzero(as_tuple=True)[0]
+            if len(words_with_group) == 0:
                 continue
+            # Get attribute indices for this group
+            group_attr_indices = self.group_attr_indices[group_idx, :group_size]
+            valid_attr_indices = group_attr_indices[group_attr_indices >= 0]
+            if len(valid_attr_indices) == 0:
+                continue
+            # Get logits for all words that need this group: (words_with_group x group_size)
+            word_positions = valid_positions[words_with_group]
+            group_logits = flat_attr_logits[word_positions][:, valid_attr_indices]
+            if len(valid_attr_indices) == 1:
+                # Binary decision for all words simultaneously: (words_with_group,)
+                decisions = group_logits.sigmoid().squeeze(-1) > 0.5
+                selected_words = words_with_group[decisions]
+                attr_idx = valid_attr_indices[0].item()
+                for word_idx in selected_words:
+                    word_idx_item = word_idx.item()
+                    if word_idx_item not in word_to_attrs:
+                        word_to_attrs[word_idx_item] = []
+                    word_to_attrs[word_idx_item].append(attr_idx)
+            else:
+                # Multi-class decision for all words: (words_with_group,)
+                best_indices = group_logits.argmax(dim=-1)
+                for i, word_idx in enumerate(words_with_group):
+                    attr_idx = valid_attr_indices[best_indices[i]].item()
+                    word_idx_item = word_idx.item()
+                    if word_idx_item not in word_to_attrs:
+                        word_to_attrs[word_idx_item] = []
+                    word_to_attrs[word_idx_item].append(attr_idx)
+        # Step 5: Reconstruct batch structure and convert to strings (deferred)
+        predictions = []
+        word_counter = 0
+        for seq_idx in range(bsz):
+            seq_nwords = nwords[seq_idx].item()
             seq_predictions = []
+            for _ in range(seq_nwords):
+                # Get category (string conversion deferred)
+                cat_idx = all_cat_indices[word_counter].item()
                 cat_name = schema.label_categories[cat_idx]
+                # Get attributes (string conversion deferred)
                 attributes = []
+                if word_counter in word_to_attrs:
+                    attr_indices = word_to_attrs[word_counter]
+                    attributes = [schema.labels[idx] for idx in attr_indices]
                 # Apply post-processing rules
                 if len(attributes) == 1 and attributes[0] == "pos":
                     attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]
                 seq_predictions.append((cat_name, attributes))
+                word_counter += 1
             predictions.append(seq_predictions)