more vectorization of logits to labels
Browse files- modeling.py +122 -54
modeling.py
CHANGED
|
@@ -96,24 +96,62 @@ class IceBertPosForTokenClassification(PreTrainedModel):
|
|
| 96 |
schema = self.config.label_schema
|
| 97 |
|
| 98 |
# Create tensors as regular attributes (not buffers to avoid init warnings)
|
| 99 |
-
self.group_mask = schema.get_group_masks()
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
| 102 |
# Category name to index mapping (regular dict, no device movement needed)
|
| 103 |
self.category_name_to_index = schema.get_category_name_to_index()
|
| 104 |
-
|
| 105 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
"""Override _apply to move our custom tensors with the model."""
|
| 107 |
super()._apply(fn)
|
| 108 |
-
|
| 109 |
# Move our custom tensors when model.to(device) is called
|
| 110 |
-
if hasattr(self,
|
| 111 |
self.group_mask = fn(self.group_mask)
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
| 117 |
return self
|
| 118 |
|
| 119 |
def forward(
|
|
@@ -376,70 +414,100 @@ class IceBertPosForTokenClassification(PreTrainedModel):
|
|
| 376 |
self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_mask: torch.Tensor
|
| 377 |
) -> List[List[Tuple[str, List[str]]]]:
|
| 378 |
"""
|
| 379 |
-
Convert logits to human-readable labels using
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
Args:
|
| 384 |
cat_logits: Category logits (B x W x C)
|
| 385 |
attr_logits: Attribute logits (B x W x A)
|
| 386 |
word_mask: Binary mask for valid words (B x L)
|
| 387 |
-
|
| 388 |
Returns:
|
| 389 |
predictions: List of [(category, [attributes])] for each sequence
|
| 390 |
"""
|
| 391 |
-
|
| 392 |
-
|
| 393 |
nwords = word_mask.sum(-1) # (B,)
|
| 394 |
-
|
| 395 |
-
assert num_attrs == len(self.config.label_schema.labels)
|
| 396 |
-
assert num_cats == len(self.config.label_schema.label_categories)
|
| 397 |
-
|
| 398 |
-
predictions = []
|
| 399 |
schema = self.config.label_schema
|
| 400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
for seq_idx in range(bsz):
|
| 402 |
-
seq_nwords = nwords[seq_idx]
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
seq_predictions = []
|
| 407 |
for word_idx in range(seq_nwords):
|
| 408 |
-
cat_idx =
|
| 409 |
cat_name = schema.label_categories[cat_idx]
|
| 410 |
-
|
| 411 |
-
# Get valid groups for this
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
|
|
|
| 415 |
attributes = []
|
| 416 |
-
for
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
if len(attributes) == 1 and attributes[0] == "pos":
|
| 435 |
# This label is used as a default for training but implied in mim format
|
| 436 |
attributes = []
|
| 437 |
elif cat_name == "sl" and "act" in attributes:
|
| 438 |
# Number and tense are not shown for sl act in mim format
|
| 439 |
attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]
|
| 440 |
-
|
| 441 |
seq_predictions.append((cat_name, attributes))
|
| 442 |
-
|
| 443 |
predictions.append(seq_predictions)
|
| 444 |
|
| 445 |
return predictions
|
|
|
|
| 96 |
schema = self.config.label_schema
|
| 97 |
|
| 98 |
# Create tensors as regular attributes (not buffers to avoid init warnings)
|
| 99 |
+
self.group_mask = schema.get_group_masks() # (C x G)
|
| 100 |
+
|
| 101 |
+
# Convert group mappings to tensor format for GPU operations
|
| 102 |
+
self._create_tensor_group_mappings(schema)
|
| 103 |
+
|
| 104 |
# Category name to index mapping (regular dict, no device movement needed)
|
| 105 |
self.category_name_to_index = schema.get_category_name_to_index()
|
| 106 |
+
|
| 107 |
+
def _create_tensor_group_mappings(self, schema):
|
| 108 |
+
"""
|
| 109 |
+
Create tensor-based group mappings for efficient GPU operations.
|
| 110 |
+
|
| 111 |
+
Converts Python dict-based schema to tensors to avoid CPU-GPU context switching.
|
| 112 |
+
This optimization replaces dict lookups with tensor indexing for better performance.
|
| 113 |
+
|
| 114 |
+
C = num_categories, G = num_groups, A = num_attributes
|
| 115 |
+
"""
|
| 116 |
+
num_groups = len(schema.group_names)
|
| 117 |
+
num_labels = len(schema.labels)
|
| 118 |
+
device = torch.device("cpu") # Will be moved with model
|
| 119 |
+
|
| 120 |
+
# Create group attribute indices tensor: (G x max_group_size)
|
| 121 |
+
# Instead of dict lookups, we can index directly: group_attr_indices[group_id, :]
|
| 122 |
+
max_group_size = max(len(labels) for labels in schema.group_name_to_labels.values())
|
| 123 |
+
self.group_attr_indices = torch.full((num_groups, max_group_size), -1, dtype=torch.long, device=device)
|
| 124 |
+
self.group_sizes = torch.zeros(num_groups, dtype=torch.long, device=device) # (G,)
|
| 125 |
+
|
| 126 |
+
for group_idx, group_name in enumerate(schema.group_names):
|
| 127 |
+
group_labels = schema.group_name_to_labels[group_name]
|
| 128 |
+
group_size = len(group_labels)
|
| 129 |
+
self.group_sizes[group_idx] = group_size
|
| 130 |
+
|
| 131 |
+
for label_idx, label in enumerate(group_labels):
|
| 132 |
+
if label in schema.labels:
|
| 133 |
+
attr_idx = schema.labels.index(label)
|
| 134 |
+
self.group_attr_indices[group_idx, label_idx] = attr_idx
|
| 135 |
+
|
| 136 |
+
# Create category to groups mapping: (C x G) - which groups are valid for each category
|
| 137 |
+
# Replaces dict-based category_to_group_names with tensor indexing
|
| 138 |
+
# Usage: category_to_groups[cat_idx, :] gives valid groups for category cat_idx
|
| 139 |
+
self.category_to_groups = self.group_mask.clone() # (C x G)
|
| 140 |
+
|
| 141 |
+
def _apply(self, fn):
    """Override _apply to move our custom tensors with the model."""
    super()._apply(fn)

    # These schema tensors are plain attributes (not registered buffers), so
    # model.to(device) / .cuda() / .half() would otherwise leave them behind.
    # hasattr guards keep this safe if called before the mappings exist.
    custom_tensor_attrs = (
        "group_mask",
        "group_attr_indices",
        "group_sizes",
        "category_to_groups",
    )
    for attr_name in custom_tensor_attrs:
        if hasattr(self, attr_name):
            setattr(self, attr_name, fn(getattr(self, attr_name)))

    return self
|
| 156 |
|
| 157 |
def forward(
|
|
|
|
| 414 |
self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_mask: torch.Tensor
|
| 415 |
) -> List[List[Tuple[str, List[str]]]]:
|
| 416 |
"""
|
| 417 |
+
Convert logits to human-readable labels using vectorized operations.
|
| 418 |
+
|
| 419 |
+
Key optimizations:
|
| 420 |
+
1. Tensor-based schema lookups instead of Python dict access
|
| 421 |
+
2. Vectorized argmax for categories across entire batch
|
| 422 |
+
3. Reduced CPU-GPU context switching by batching operations
|
| 423 |
+
4. Pre-computed tensor mappings for group/attribute relationships
|
| 424 |
+
|
| 425 |
+
B = batch_size, W = max_words, C = num_categories, A = num_attributes, G = num_groups
|
| 426 |
+
|
| 427 |
Args:
|
| 428 |
cat_logits: Category logits (B x W x C)
|
| 429 |
attr_logits: Attribute logits (B x W x A)
|
| 430 |
word_mask: Binary mask for valid words (B x L)
|
| 431 |
+
|
| 432 |
Returns:
|
| 433 |
predictions: List of [(category, [attributes])] for each sequence
|
| 434 |
"""
|
| 435 |
+
device = cat_logits.device
|
| 436 |
+
bsz, max_words = cat_logits.shape[:2]
|
| 437 |
nwords = word_mask.sum(-1) # (B,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
schema = self.config.label_schema
|
| 439 |
|
| 440 |
+
# Vectorized category prediction: (B x W)
|
| 441 |
+
# Single GPU operation instead of nested loops
|
| 442 |
+
pred_cat_indices = cat_logits.argmax(dim=-1) # (B x W)
|
| 443 |
+
|
| 444 |
+
# Vectorized attribute prediction for all groups
|
| 445 |
+
predictions = []
|
| 446 |
+
|
| 447 |
for seq_idx in range(bsz):
|
| 448 |
+
seq_nwords = int(nwords[seq_idx].item())
|
| 449 |
+
if seq_nwords == 0:
|
| 450 |
+
predictions.append([])
|
| 451 |
+
continue
|
| 452 |
+
|
| 453 |
+
# Get categories for this sequence: (seq_nwords,)
|
| 454 |
+
seq_cat_indices = pred_cat_indices[seq_idx, :seq_nwords]
|
| 455 |
+
|
| 456 |
+
# Get valid groups for each category: (seq_nwords x G)
|
| 457 |
+
# Tensor lookup replaces dict access: category_to_group_names[cat_name]
|
| 458 |
+
seq_valid_groups = self.category_to_groups[seq_cat_indices] # (seq_nwords x G)
|
| 459 |
+
|
| 460 |
+
# Process attributes for all words in sequence
|
| 461 |
seq_predictions = []
|
| 462 |
for word_idx in range(seq_nwords):
|
| 463 |
+
cat_idx = seq_cat_indices[word_idx].item()
|
| 464 |
cat_name = schema.label_categories[cat_idx]
|
| 465 |
+
|
| 466 |
+
# Get valid groups for this word
|
| 467 |
+
word_valid_groups = seq_valid_groups[word_idx] # (G,)
|
| 468 |
+
valid_group_indices = word_valid_groups.nonzero(as_tuple=True)[0] # (valid_groups,)
|
| 469 |
+
|
| 470 |
+
# Collect attributes using vectorized operations
|
| 471 |
attributes = []
|
| 472 |
+
for group_idx in valid_group_indices:
|
| 473 |
+
group_idx = group_idx.item()
|
| 474 |
+
group_size = self.group_sizes[group_idx].item()
|
| 475 |
+
|
| 476 |
+
if group_size == 0:
|
| 477 |
+
continue
|
| 478 |
+
|
| 479 |
+
# Get attribute indices for this group: (group_size,)
|
| 480 |
+
# Tensor lookup replaces dict: group_name_to_group_attr_indices[group_name]
|
| 481 |
+
group_attr_indices = self.group_attr_indices[group_idx, :group_size]
|
| 482 |
+
valid_indices = group_attr_indices[group_attr_indices >= 0]
|
| 483 |
+
|
| 484 |
+
if len(valid_indices) == 0:
|
| 485 |
+
continue
|
| 486 |
+
|
| 487 |
+
# Get logits for this group: (group_size,)
|
| 488 |
+
group_logits = attr_logits[seq_idx, word_idx, valid_indices]
|
| 489 |
+
|
| 490 |
+
if len(valid_indices) == 1:
|
| 491 |
+
# Binary decision
|
| 492 |
+
if group_logits.sigmoid().item() > 0.5:
|
| 493 |
+
attr_idx = valid_indices[0].item()
|
| 494 |
+
attributes.append(schema.labels[attr_idx])
|
| 495 |
+
else:
|
| 496 |
+
# Multi-class decision
|
| 497 |
+
best_local_idx = group_logits.argmax().item()
|
| 498 |
+
attr_idx = valid_indices[best_local_idx].item()
|
| 499 |
+
attributes.append(schema.labels[attr_idx])
|
| 500 |
+
|
| 501 |
+
# Apply post-processing rules
|
| 502 |
if len(attributes) == 1 and attributes[0] == "pos":
|
| 503 |
# This label is used as a default for training but implied in mim format
|
| 504 |
attributes = []
|
| 505 |
elif cat_name == "sl" and "act" in attributes:
|
| 506 |
# Number and tense are not shown for sl act in mim format
|
| 507 |
attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]
|
| 508 |
+
|
| 509 |
seq_predictions.append((cat_name, attributes))
|
| 510 |
+
|
| 511 |
predictions.append(seq_predictions)
|
| 512 |
|
| 513 |
return predictions
|