haukurpj committed
Commit aaca62a · 1 Parent(s): d7ae5c8

Fix inconsistencies with the old model - now works equally

Files changed (4):
  1. config.json +0 -2
  2. configuration.py +67 -18
  3. modeling.py +178 -237
  4. old_label_utils.py +223 -0
config.json CHANGED
@@ -469,8 +469,6 @@
       "act",
       "mid"
     ],
-    "null": null,
-    "null_leaf": null,
     "separator": "<SEP>"
   },
   "layer_norm_eps": 1e-05,
configuration.py CHANGED
@@ -2,11 +2,36 @@
 # This file is part of IceBERT POS model conversion.
 
 import json
-from typing import Any, Dict, Optional
+from dataclasses import dataclass
+from typing import Dict, List, Optional
 
 from transformers import AutoConfig, RobertaConfig
 
 
+@dataclass
+class LabelSchema:
+    """
+    Dataclass representing the structure of a POS tagging label schema.
+
+    The schema defines a hierarchical structure where:
+    - Categories (e.g., 'n', 'v', 'l') are the main POS types
+    - Groups (e.g., 'gender', 'number', 'case') are grammatical attribute types
+    - Labels are the specific values for each group (e.g., 'masc', 'fem', 'sing', 'plur')
+
+    Each category maps to applicable groups, and each group maps to its possible labels.
+    This enables multilabel classification where tokens get both a category and
+    relevant grammatical attributes.
+    """
+
+    label_categories: List[str]
+    category_to_group_names: Dict[str, List[str]]
+    group_names: List[str]
+    group_name_to_labels: Dict[str, List[str]]
+    labels: List[str]
+    separator: str
+    ignore_categories: List[str]
+
+
 class IceBertPosConfig(RobertaConfig):
     """
     Configuration class for IceBERT POS (Part-of-Speech) tagging model.
@@ -18,7 +43,7 @@ class IceBertPosConfig(RobertaConfig):
     model_type = "icebert-pos"
 
     def __init__(
-        self, label_schema: Optional[Dict[str, Any]] = None, classifier_dropout: Optional[float] = None, **kwargs
+        self, label_schema: Optional[LabelSchema] = None, classifier_dropout: Optional[float] = None, **kwargs
     ):
         super().__init__(**kwargs)
 
@@ -26,12 +51,16 @@
         if label_schema is None:
             label_schema = self._get_default_label_schema()
 
+        # Convert dict to LabelSchema if needed (when loaded from JSON)
+        if isinstance(label_schema, dict):
+            label_schema = LabelSchema(**label_schema)
+
         self.label_schema = label_schema
 
         # Derive parameters from label schema
-        self.num_categories = len(label_schema["label_categories"])
-        self.num_labels = len(label_schema["labels"])
-        self.num_groups = len(label_schema["group_names"])
+        self.num_categories = len(label_schema.label_categories)
+        self.num_labels = len(label_schema.labels)
+        self.num_groups = len(label_schema.group_names)
 
         # Classification head parameters
         self.classifier_dropout = classifier_dropout if classifier_dropout is not None else 0.1
@@ -41,10 +70,10 @@
         self.attr_proj_input_size = self.num_categories + self.hidden_size
 
     @staticmethod
-    def _get_default_label_schema() -> Dict[str, Any]:
+    def _get_default_label_schema() -> LabelSchema:
         """Default label schema corresponding to terms2.json"""
-        return {
-            "label_categories": [
+        return LabelSchema(
+            label_categories=[
                 "n",
                 "g",
                 "x",
@@ -89,7 +118,7 @@ class IceBertPosConfig(RobertaConfig):
                 "ns",
                 "m",
             ],
-            "category_to_group_names": {
+            category_to_group_names={
                 "n": ["gender", "number", "case", "def", "proper"],
                 "g": ["gender", "number", "case"],
                 "l": ["gender", "number", "case", "adj_c", "deg"],
@@ -116,7 +145,7 @@
                 "ae": ["deg"],
                 "as": ["deg"],
             },
-            "group_names": [
+            group_names=[
                 "gender",
                 "gender_or_person",
                 "number",
@@ -129,7 +158,7 @@
                 "person",
                 "tense",
             ],
-            "group_name_to_labels": {
+            group_name_to_labels={
                 "gender": ["masc", "fem", "neut", "gender_x"],
                 "number": ["sing", "plur"],
                 "person": ["1", "2", "3"],
@@ -142,7 +171,7 @@
                 "proper": ["proper"],
                 "adj_c": ["strong", "weak", "equiinflected"],
             },
-            "labels": [
+            labels=[
                 "<SEP>",
                 "n",
                 "g",
@@ -214,17 +243,37 @@
                 "act",
                 "mid",
             ],
-            "null": None,
-            "null_leaf": None,
-            "separator": "<SEP>",
-            "ignore_categories": ["x", "e"],
-        }
+            separator="<SEP>",
+            ignore_categories=["x", "e"],
+        )
+
+    def to_dict(self):
+        """Convert config to dictionary, handling LabelSchema serialization."""
+        output = super().to_dict()
+
+        # Convert LabelSchema to dict for JSON serialization
+        if hasattr(self, 'label_schema') and self.label_schema is not None:
+            if isinstance(self.label_schema, LabelSchema):
+                output['label_schema'] = {
+                    'label_categories': self.label_schema.label_categories,
+                    'category_to_group_names': self.label_schema.category_to_group_names,
+                    'group_names': self.label_schema.group_names,
+                    'group_name_to_labels': self.label_schema.group_name_to_labels,
+                    'labels': self.label_schema.labels,
+                    'separator': self.label_schema.separator,
+                    'ignore_categories': self.label_schema.ignore_categories,
+                }
+            else:
+                output['label_schema'] = self.label_schema
+
+        return output
 
     @classmethod
     def from_label_schema_file(cls, schema_path: str, **kwargs) -> "IceBertPosConfig":
         """Create config from a label schema JSON file"""
        with open(schema_path, "r", encoding="utf-8") as f:
-            label_schema = json.load(f)
+            schema_dict = json.load(f)
+        label_schema = LabelSchema(**schema_dict)
         return cls(label_schema=label_schema, **kwargs)

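For reference, a minimal usage sketch of the new schema handling (not part of the commit; it assumes configuration.py is importable as a plain module): it exercises the default-schema fallback, the dataclass-to-dict conversion in to_dict(), and the dict-to-dataclass branch in __init__.

# Hedged sketch: round-trip the default LabelSchema through the config.
from configuration import IceBertPosConfig, LabelSchema

config = IceBertPosConfig()  # no schema given: falls back to _get_default_label_schema()
assert isinstance(config.label_schema, LabelSchema)
assert config.num_categories == len(config.label_schema.label_categories)

# to_dict() turns the dataclass into a plain dict so save_pretrained()
# can serialize it into config.json.
as_dict = config.to_dict()
assert isinstance(as_dict["label_schema"], dict)

# Passing the dict back in goes through the isinstance(label_schema, dict)
# branch of __init__ and reconstructs an equal LabelSchema.
config2 = IceBertPosConfig(label_schema=as_dict["label_schema"])
assert config2.label_schema == config.label_schema
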
modeling.py CHANGED
@@ -11,6 +11,15 @@ from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoConfig, AutoModel, PreTrainedModel, RobertaModel
 
 from .configuration import IceBertPosConfig
+from .old_label_utils import (
+    SimpleLabelDictionary,
+    clean_cats_attrs,
+    create_label_dictionary_from_schema,
+    make_dict_idx_to_vec_idx,
+    make_group_masks,
+    make_group_name_to_group_attr_vec_idxs,
+    make_vec_idx_to_dict_idx,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -38,11 +47,11 @@ class MultiLabelTokenClassificationHead(nn.Module):
     def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Args:
-            features: Word-level features of shape (total_words, hidden_size)
+            features: Word-level features of shape (batch_size, max_words, hidden_size)
 
         Returns:
-            cat_logits: Category logits of shape (total_words, num_categories)
-            attr_logits: Attribute logits of shape (total_words, num_labels)
+            cat_logits: Category logits of shape (batch_size, max_words, num_categories)
+            attr_logits: Attribute logits of shape (batch_size, max_words, num_labels)
         """
         x = self.dropout(features)
         x = self.dense(x)
@@ -81,9 +90,22 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         self.roberta = RobertaModel(config, add_pooling_layer=False)
         self.classifier = MultiLabelTokenClassificationHead(config)
 
+        # Create label dictionary and mappings (mimicking old fairseq model)
+        self.label_dictionary = create_label_dictionary_from_schema(config.label_schema)
+        self._setup_label_mappings()
+
         # Initialize weights and apply final processing
         self.post_init()
 
+    def _setup_label_mappings(self):
+        """Setup label mappings similar to the old fairseq model."""
+        schema = self.config.label_schema
+
+        self.group_name_to_group_attr_vec_idxs = make_group_name_to_group_attr_vec_idxs(self.label_dictionary, schema)
+        self.cat_dict_idx_to_vec_idx = make_dict_idx_to_vec_idx(self.label_dictionary, schema.label_categories)
+        self.cat_vec_idx_to_dict_idx = make_vec_idx_to_dict_idx(self.label_dictionary, schema.label_categories)
+        self.group_mask = make_group_masks(self.label_dictionary, schema)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -101,7 +123,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         Args:
             input_ids: Token indices of shape (batch_size, sequence_length)
             attention_mask: Attention mask of shape (batch_size, sequence_length)
-            word_mask: Binary mask indicating word boundaries (1 = word start)
+            word_mask: Binary mask indicating word boundaries (1 = word start) of shape (batch_size, sequence_length)
 
         Returns:
             cat_logits: Category logits of shape (batch_size, max_words, num_categories)
@@ -118,22 +140,37 @@ class IceBertPosForTokenClassification(PreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
+            output_hidden_states=True,
             return_dict=return_dict,
         )
 
-        sequence_output = outputs[0]  # (batch_size, seq_len, hidden_size)
-
-        # Aggregate subword tokens to word-level representations using word_mask
-        word_features, nwords = self._aggregate_subword_tokens(sequence_output, word_mask)
-
-        # Apply classification head
-        cat_logits, attr_logits = self.classifier(word_features)
-
-        # Reshape back to batch format using word counts
-        cat_logits_batch, attr_logits_batch = self._reshape_to_batch_format(cat_logits, attr_logits, nwords)
-
-        return cat_logits_batch, attr_logits_batch
+        x = outputs[0]  # (batch_size, seq_len, hidden)
+
+        # Copy exact logic from old model
+        _, _, inner_dim = x.shape
+
+        # use first bpe token of word as representation
+        x = x[:, 1:-1, :]
+        starts = word_mask[:, 1:-1]  # remove bos, eos
+        ends = starts.roll(-1, dims=[-1]).nonzero()[:, -1] + 1
+        starts = starts.nonzero().tolist()
+        mean_words = []
+        for (seq_idx, token_idx), end in zip(starts, ends):
+            mean_words.append(x[seq_idx, token_idx:end, :].mean(dim=0))
+        mean_words = torch.stack(mean_words)
+        words = mean_words
+        # Innermost dimension is mask for tokens at head of word.
+        nwords = word_mask.sum(dim=-1)
+        (cat_logits, attr_logits) = self.classifier(words)
+
+        # (Batch * Time) x Depth -> Batch x Time x Depth
+        cat_logits = pad_sequence(cat_logits.split((nwords).tolist()), padding_value=0, batch_first=True)
+        attr_logits = pad_sequence(
+            attr_logits.split((nwords).tolist()),
+            padding_value=0,
+            batch_first=True,
+        )
+        return cat_logits, attr_logits
 
     def _aggregate_subword_tokens(
         self, sequence_output: torch.Tensor, word_mask: torch.Tensor
@@ -147,7 +184,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
             word_mask: Binary mask where 1 indicates start of word (batch_size, seq_len)
 
         Returns:
-            word_features: Word-level features (total_words, hidden_size)
+            word_features: Word-level features (batch_size, max_words, hidden_size)
             nwords: Number of words per sequence (batch_size,)
         """
         # TODO: Verify that BOS and EOS are handled correctly - I'm worried that this does not correctly handle padding
@@ -234,7 +271,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
 
         cat_logits, attr_logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, word_mask=word_mask)
 
-        return self._logits_to_labels(cat_logits, attr_logits, word_ids)
+        return self._logits_to_labels(cat_logits, attr_logits, word_mask)
 
     def _word_ids_to_word_mask(self, word_ids: List[List[int]], input_shape: torch.Size) -> torch.Tensor:
         """
@@ -245,18 +282,23 @@ class IceBertPosForTokenClassification(PreTrainedModel):
             input_shape: Shape of input_ids tensor (batch_size, seq_len)
 
         Returns:
-            word_mask: Binary tensor where 1 indicates start of word
+            word_mask: Binary tensor where 1 indicates start of word (batch_size, seq_len)
         """
         batch_size, seq_len = input_shape
         word_mask = torch.zeros(batch_size, seq_len, dtype=torch.long)
 
         for batch_idx, seq_word_ids in enumerate(word_ids):
+            # Truncate to exclude BOS and EOS tokens (first and last)
+            truncated_word_ids = seq_word_ids[1:-1]
             prev_word_id = None
-            for token_idx, word_id in enumerate(seq_word_ids):
+            for token_idx, word_id in enumerate(truncated_word_ids):
                 if word_id != prev_word_id:
-                    word_mask[batch_idx, token_idx] = 1
+                    word_mask[batch_idx, token_idx + 1] = 1  # +1 to account for BOS
                 prev_word_id = word_id
 
+            # Debug logging to match fairseq model
+            logger.debug(f"Word mask: {word_mask[batch_idx].tolist()}")
+
         return word_mask
 
     def predict_labels_from_text(self, sentences: List[str], tokenizer) -> List[List[Tuple[str, List[str]]]]:
@@ -270,231 +312,130 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         Returns:
             List of sequences, each containing (category, [attributes]) per word
         """
-        # Tokenize with fairseq-style preprocessing
-        encodings = [tokenizer(sent, return_tensors="pt") for sent in sentences]
-        word_ids_list = [encoding.word_ids() for encoding in encodings]
-
-        # Batch the inputs
-        max_len = max(encoding["input_ids"].shape[1] for encoding in encodings)
-        batch_input_ids = []
-        batch_attention_mask = []
-
-        for encoding in encodings:
-            input_ids = encoding["input_ids"][0]
-            attention_mask = encoding["attention_mask"][0]
-
-            # Pad to max length
-            pad_len = max_len - len(input_ids)
-            if pad_len > 0:
-                input_ids = torch.cat([input_ids, torch.ones(pad_len, dtype=torch.long)])  # pad_token_id = 1
-                attention_mask = torch.cat([attention_mask, torch.zeros(pad_len, dtype=torch.long)])
-
-            batch_input_ids.append(input_ids)
-            batch_attention_mask.append(attention_mask)
-
-        batch_input_ids = torch.stack(batch_input_ids)
-        batch_attention_mask = torch.stack(batch_attention_mask)
+        # Split sentences by spaces to get proper word boundaries
+        # This fixes the issue where tokens like "Kl." get split incorrectly
+        sentences_split = [sentence.split() for sentence in sentences]
+
+        # Use batch_encode_plus with is_split_into_words=True to preserve word boundaries
+        encoding = tokenizer.batch_encode_plus(
+            sentences_split,
+            return_tensors="pt",
+            padding=True,
+            is_split_into_words=True,
+            add_special_tokens=True
+        )
+
+        batch_input_ids = encoding["input_ids"]
+        batch_attention_mask = encoding["attention_mask"]
+        word_ids_list = [encoding.word_ids(i) for i in range(len(sentences))]
+
+        # Debug logging to match fairseq model
+        for i in range(len(sentences)):
+            logger.debug(f"Encoded tokens: {batch_input_ids[i]}")
+            logger.debug(f"Decoded tokens: {tokenizer.convert_ids_to_tokens(batch_input_ids[i].tolist())}")
+            logger.debug(f"Word IDs: {word_ids_list[i]}")
 
         return self.predict_labels(batch_input_ids, batch_attention_mask, word_ids_list)
 
-    def _make_group_name_to_group_attr_vec_idxs(self):
-        """Create mapping from group names to their attribute vector indices"""
-        group_name_to_group_attr_vec_idxs = {}
-        labels = self.config.label_schema["labels"]
-        nspecial = 0  # Number of special tokens in label dictionary (like <SEP>)
-
-        for group_name, group_labels in self.config.label_schema["group_name_to_labels"].items():
-            vec_idxs = []
-            for label in group_labels:
-                if label in labels:
-                    # Find index in labels list, but subtract nspecial to get vector index
-                    label_dict_idx = labels.index(label)
-                    if label_dict_idx >= nspecial:  # Skip special tokens
-                        vec_idxs.append(label_dict_idx - nspecial)
-            group_name_to_group_attr_vec_idxs[group_name] = torch.tensor(vec_idxs)
-
-        return group_name_to_group_attr_vec_idxs
-
-    def _make_group_masks(self):
-        """Create group masks for each category"""
-        label_categories = self.config.label_schema["label_categories"]
-        group_names = self.config.label_schema["group_names"]
-        category_to_group_names = self.config.label_schema["category_to_group_names"]
-
-        num_cats = len(label_categories)
-        num_groups = len(group_names)
-
-        group_mask = torch.zeros(num_cats, num_groups, dtype=torch.bool)
-
-        for cat_idx, category in enumerate(label_categories):
-            if category in category_to_group_names:
-                for group_name in category_to_group_names[category]:
-                    if group_name in group_names:
-                        group_idx = group_names.index(group_name)
-                        group_mask[cat_idx, group_idx] = True
-
-        return group_mask
-
-    def _make_category_mappings(self):
-        """Create mappings between category vector indices and dictionary indices"""
-        labels = self.config.label_schema["labels"]
-        label_categories = self.config.label_schema["label_categories"]
-
-        # Create mapping from category names to vector indices (0-based)
-        cat_dict_idx_to_vec_idx = torch.zeros(len(labels), dtype=torch.long)
-        cat_vec_idx_to_dict_idx = torch.zeros(len(label_categories), dtype=torch.long)
-
-        for vec_idx, category in enumerate(label_categories):
-            if category in labels:
-                dict_idx = labels.index(category)
-                cat_dict_idx_to_vec_idx[dict_idx] = vec_idx
-                cat_vec_idx_to_dict_idx[vec_idx] = dict_idx
-
-        return cat_dict_idx_to_vec_idx, cat_vec_idx_to_dict_idx
-
-    def _count_words_per_sequence(self, word_ids: List[List[int]]) -> List[int]:
-        """Count the number of unique words in each sequence."""
-        words_per_seq = []
-        for seq_word_ids in word_ids:
-            unique_word_ids = set(word_id for word_id in seq_word_ids if word_id is not None)
-            words_per_seq.append(len(unique_word_ids))
-        return words_per_seq
-
-    def _predict_categories_for_sequence(
-        self, cat_logits: torch.Tensor, seq_idx: int, seq_nwords: int, cat_vec_idx_to_dict_idx: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Predict categories for a single sequence and return both vector and dictionary indices."""
-        pred_cat_vec_idxs = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
-        pred_cats = cat_vec_idx_to_dict_idx[pred_cat_vec_idxs]
-        return pred_cat_vec_idxs, pred_cats
-
-    def _predict_attributes_for_group(
-        self,
-        attr_logits: torch.Tensor,
-        seq_idx: int,
-        seq_nwords: int,
-        group_vec_idxs: torch.Tensor,
-        seq_group_mask: torch.Tensor,
-        group_idx: int,
-    ) -> torch.Tensor:
-        """Predict attributes for a single group."""
-        if len(group_vec_idxs) == 0:
-            return torch.zeros(seq_nwords, dtype=torch.long)
-
-        # Get logits for this group
-        group_logits = attr_logits[seq_idx, :seq_nwords, group_vec_idxs]
-
-        if len(group_vec_idxs) == 1:
-            # Single element group: use sigmoid > 0.5
-            group_pred = group_logits.sigmoid().ge(0.5).long()
-            group_pred_dict_idxs = (group_pred.squeeze() * group_vec_idxs.item()) * seq_group_mask[:, group_idx]
-        else:
-            # Multi element group: use argmax
-            group_pred_vec_idxs = group_logits.max(dim=-1).indices
-            group_pred_dict_idxs = group_vec_idxs[group_pred_vec_idxs] * seq_group_mask[:, group_idx]
-
-        return group_pred_dict_idxs
-
-    def _predict_all_attributes_for_sequence(
-        self,
-        attr_logits: torch.Tensor,
-        seq_idx: int,
-        seq_nwords: int,
-        pred_cat_vec_idxs: torch.Tensor,
-        group_name_to_group_attr_vec_idxs: dict,
-        group_mask: torch.Tensor,
-        group_names: List[str],
-    ) -> torch.Tensor:
-        """Predict all attributes for a single sequence."""
-        seq_group_mask = group_mask[pred_cat_vec_idxs]
-        pred_attrs = []
-
-        for group_idx, group_name in enumerate(group_names):
-            if group_name not in group_name_to_group_attr_vec_idxs:
-                pred_attrs.append(torch.zeros(seq_nwords, dtype=torch.long))
-                continue
-
-            group_vec_idxs = group_name_to_group_attr_vec_idxs[group_name]
-            group_pred_dict_idxs = self._predict_attributes_for_group(
-                attr_logits, seq_idx, seq_nwords, group_vec_idxs, seq_group_mask, group_idx
-            )
-            pred_attrs.append(group_pred_dict_idxs)
-
-        # Stack predictions
-        if pred_attrs:
-            return torch.stack([p.squeeze() if p.dim() > 1 else p for p in pred_attrs]).t()
-        else:
-            return torch.zeros(seq_nwords, len(group_names), dtype=torch.long)
-
-    def _convert_predictions_to_labels(
-        self, pred_cats: torch.Tensor, pred_attrs_tensor: torch.Tensor, labels: List[str], group_names: List[str]
-    ) -> List[Tuple[str, List[str]]]:
-        """Convert prediction tensors to human-readable labels."""
-        seq_nwords = pred_cats.size(0)
-        seq_predictions = []
-
-        for word_idx in range(seq_nwords):
-            # Category (convert from dictionary index to string)
-            cat_dict_idx = pred_cats[word_idx].item()
-            if cat_dict_idx < len(labels):
-                category = labels[cat_dict_idx]
-            else:
-                category = "UNK"
-
-            # Attributes (convert from dictionary indices to strings)
-            attributes = []
-            for group_idx in range(len(group_names)):
-                attr_dict_idx = pred_attrs_tensor[word_idx, group_idx].item()
-                if attr_dict_idx > 0 and attr_dict_idx < len(labels):  # Skip 0 (empty) and out of bounds
-                    attributes.append(labels[attr_dict_idx])
-
-            seq_predictions.append((category, attributes))
-
-        return seq_predictions
-
     def _logits_to_labels(
-        self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_ids: List[List[int]]
+        self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_mask: torch.Tensor
     ) -> List[List[Tuple[str, List[str]]]]:
         """
         Convert logits to human-readable labels using fairseq's group-based logic.
+        Copied from the old model's logits_to_labels method.
         """
-        # Create necessary mappings
-        group_name_to_group_attr_vec_idxs = self._make_group_name_to_group_attr_vec_idxs()
-        group_mask = self._make_group_masks()
-        cat_dict_idx_to_vec_idx, cat_vec_idx_to_dict_idx = self._make_category_mappings()
-
-        label_schema = self.config.label_schema
-        labels = label_schema["labels"]
-        group_names = label_schema["group_names"]
-
-        batch_size = cat_logits.size(0)
-        words_per_seq = self._count_words_per_sequence(word_ids)
-        batch_predictions = []
-
-        for seq_idx in range(batch_size):
-            seq_nwords = words_per_seq[seq_idx]
-
-            # Predict categories
-            pred_cat_vec_idxs, pred_cats = self._predict_categories_for_sequence(
-                cat_logits, seq_idx, seq_nwords, cat_vec_idx_to_dict_idx
-            )
-
-            # Predict attributes
-            pred_attrs_tensor = self._predict_all_attributes_for_sequence(
-                attr_logits,
-                seq_idx,
-                seq_nwords,
-                pred_cat_vec_idxs,
-                group_name_to_group_attr_vec_idxs,
-                group_mask,
-                group_names,
-            )
-
-            # Convert to labels
-            seq_predictions = self._convert_predictions_to_labels(pred_cats, pred_attrs_tensor, labels, group_names)
-            batch_predictions.append(seq_predictions)
-
-        return batch_predictions
+        # logits: Batch x Time x Labels
+        bsz, _, num_cats = cat_logits.shape
+        _, _, num_attrs = attr_logits.shape
+        nwords = word_mask.sum(-1)
+
+        assert num_attrs == len(self.config.label_schema.labels)
+        assert num_cats == len(self.config.label_schema.label_categories)
+
+        batch_cats = []
+        batch_attrs = []
+        for seq_idx in range(bsz):
+            seq_nwords = nwords[seq_idx]
+            pred_cat_vec_idxs = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
+            pred_cats = self.cat_vec_idx_to_dict_idx[pred_cat_vec_idxs]
+
+            group_mask = self.group_mask[pred_cat_vec_idxs]
+            offset = self.label_dictionary.nspecial
+            pred_attrs = []
+            for group_idx, group_name in enumerate(self.config.label_schema.group_names):
+                group_vec_idxs = self.group_name_to_group_attr_vec_idxs[group_name]
+                # logits: (bsz * nwords) x labels
+                group_logits = attr_logits[seq_idx, :seq_nwords, group_vec_idxs]
+                if len(group_vec_idxs) == 1:
+                    group_pred = group_logits.sigmoid().ge(0.5).long()
+                    group_pred_dict_idxs = (group_pred.squeeze() * (group_vec_idxs.item() + offset)).T.to(
+                        "cpu"
+                    ) * group_mask[:, group_idx]
+                else:
+                    group_pred_vec_idxs = group_logits.max(dim=-1).indices
+                    group_pred_dict_idxs = (group_vec_idxs[group_pred_vec_idxs] + offset) * group_mask[:, group_idx]
+                pred_attrs.append(group_pred_dict_idxs)
+
+            pred_attrs = torch.stack([p.squeeze() for p in pred_attrs]).t()
+
+            batch_cats.append(pred_cats)
+            batch_attrs.append(pred_attrs)
+
+        predictions = list(
+            [
+                clean_cats_attrs(
+                    self.label_dictionary,
+                    self.config.label_schema,
+                    seq_cats,
+                    seq_attrs,
+                )
+                for seq_cats, seq_attrs in zip(batch_cats, batch_attrs)
+            ]
+        )
+
+        return predictions
+
+
+def make_vec_idx_to_dict_idx(dictionary, labels, device="cpu", fill_value=-100):
+    vec_idx_to_dict_idx = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
+    for vec_idx, label in enumerate(labels):
+        vec_idx_to_dict_idx[vec_idx] = dictionary.index(label)
+    return vec_idx_to_dict_idx
+
+
+def make_group_masks(dictionary, schema, device="cpu"):
+    num_groups = len(schema.group_names)
+    offset = dictionary.nspecial
+    num_labels = len(dictionary) - offset
+    ret_mask = torch.zeros(num_labels, num_groups, dtype=torch.int64, device=device)
+    for cat, cat_group_names in schema.category_to_group_names.items():
+        cat_label_idx = dictionary.index(cat)
+        cat_vec_idx = schema.label_categories.index(cat)
+        for group_name in cat_group_names:
+            ret_mask[cat_vec_idx, schema.group_names.index(group_name)] = 1
+        assert cat_label_idx != dictionary.unk()
+    for cat in schema.label_categories:
+        cat_label_idx = dictionary.index(cat)
+        assert cat_label_idx != dictionary.unk()
+    return ret_mask
+
+
+def make_group_name_to_group_attr_vec_idxs(dict_, schema):
+    offset = dict_.nspecial
+    group_names = schema.group_name_to_labels.keys()
+    name_to_labels = schema.group_name_to_labels
+    group_name_to_group_attr_vec_idxs = {
+        name: torch.tensor([dict_.index(item) - offset for item in name_to_labels[name]]) for name in group_names
+    }
+    return group_name_to_group_attr_vec_idxs
+
+
+def make_dict_idx_to_vec_idx(dictionary, cats, device="cpu", fill_value=-100):
+    # NOTE: when target is not in label_categories, the error is silent
+    map_tgt = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
+    for vec_idx, label in enumerate(cats):
+        map_tgt[dictionary.index(label)] = vec_idx
+    return map_tgt
 
 
 AutoConfig.register("icebert-pos", IceBertPosConfig)
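
For context, an end-to-end sketch of how the rewritten entry point is meant to be called (not part of the commit; the checkpoint path is a placeholder, and loading via AutoModel with trust_remote_code assumes the repository wires up the usual auto_map). The point is that predict_labels_from_text now whitespace-splits the input itself and encodes with is_split_into_words=True, so punctuation must already be separated by spaces.

# Hypothetical usage sketch; "path/to/icebert-pos" is a placeholder checkpoint.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/icebert-pos")
model = AutoModel.from_pretrained("path/to/icebert-pos", trust_remote_code=True)
model.eval()

# Words come from sentence.split(), so "." is a separate word here.
sentences = ["Hún las bókina ."]
predictions = model.predict_labels_from_text(sentences, tokenizer)

# One (category, [attributes]) pair per whitespace-separated word.
for word, (category, attributes) in zip(sentences[0].split(), predictions[0]):
    print(word, category, attributes)
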
old_label_utils.py ADDED
@@ -0,0 +1,223 @@
+# Copyright (C) Miðeind ehf.
+# This file is part of IceBERT POS model conversion.
+
+"""
+Utility functions copied from the old fairseq-based model for label handling.
+These functions handle the conversion between vector indices and dictionary indices,
+accounting for the offset caused by special tokens in the label dictionary.
+"""
+
+from typing import Dict, List, Tuple
+import torch
+
+
+class SimpleLabelDictionary:
+    """
+    Simplified version of fairseq Dictionary to handle label mappings.
+    This replaces the fairseq Dictionary dependency while maintaining the same interface.
+    """
+
+    def __init__(self, labels: List[str], nspecial: int = 5):
+        """
+        Args:
+            labels: List of labels including special tokens at the beginning
+            nspecial: Number of special tokens (typically 5: <pad>, <s>, </s>, <unk>, <SEP>)
+        """
+        self.symbols = labels
+        self.nspecial = nspecial
+        self._indices = {label: idx for idx, label in enumerate(labels)}
+
+    def index(self, label: str) -> int:
+        """Get index of label in dictionary."""
+        return self._indices.get(label, self.unk())
+
+    def unk(self) -> int:
+        """Return index of unknown token (typically 3)."""
+        return 3
+
+    def string(self, indices: torch.Tensor) -> str:
+        """Convert tensor of indices to space-separated string of labels."""
+        if indices.dim() == 0:
+            indices = indices.unsqueeze(0)
+
+        # Filter out special tokens like fairseq Dictionary does
+        special_indices_to_ignore = {0, 1, 2, 3}  # BOS, PAD, EOS, UNK
+
+        labels = [
+            self.symbols[idx] for idx in indices.tolist()
+            if 0 <= idx < len(self.symbols) and idx not in special_indices_to_ignore
+        ]
+        return " ".join(labels)
+
+    def __len__(self) -> int:
+        return len(self.symbols)
+
+
+def make_vec_idx_to_dict_idx(dictionary: SimpleLabelDictionary, labels: List[str], device="cpu", fill_value=-100) -> torch.Tensor:
+    """
+    Create mapping from vector indices to dictionary indices.
+
+    Args:
+        dictionary: Label dictionary
+        labels: List of labels
+        device: Device for tensor
+        fill_value: Fill value for missing entries
+
+    Returns:
+        Tensor mapping vector indices to dictionary indices
+    """
+    vec_idx_to_dict_idx = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
+    for vec_idx, label in enumerate(labels):
+        vec_idx_to_dict_idx[vec_idx] = dictionary.index(label)
+    return vec_idx_to_dict_idx
+
+
+def make_group_masks(dictionary: SimpleLabelDictionary, schema, device="cpu") -> torch.Tensor:
+    """
+    Create group masks indicating which groups are valid for each category.
+
+    Args:
+        dictionary: Label dictionary
+        schema: Label schema object
+        device: Device for tensor
+
+    Returns:
+        Tensor of shape (num_categories, num_groups) with 1 for valid combinations
+    """
+    num_groups = len(schema.group_names)
+    offset = dictionary.nspecial
+    num_labels = len(dictionary) - offset
+    ret_mask = torch.zeros(num_labels, num_groups, dtype=torch.int64, device=device)
+
+    for cat, cat_group_names in schema.category_to_group_names.items():
+        cat_label_idx = dictionary.index(cat)
+        cat_vec_idx = schema.label_categories.index(cat)
+        for group_name in cat_group_names:
+            ret_mask[cat_vec_idx, schema.group_names.index(group_name)] = 1
+        assert cat_label_idx != dictionary.unk()
+
+    return ret_mask
+
+
+def make_group_name_to_group_attr_vec_idxs(dictionary: SimpleLabelDictionary, schema) -> Dict[str, torch.Tensor]:
+    """
+    Create mapping from group names to their attribute vector indices.
+
+    Args:
+        dictionary: Label dictionary
+        schema: Label schema object
+
+    Returns:
+        Dictionary mapping group names to tensor of vector indices
+    """
+    offset = dictionary.nspecial
+    group_names = schema.group_name_to_labels.keys()
+    name_to_labels = schema.group_name_to_labels
+    group_name_to_group_attr_vec_idxs = {
+        name: torch.tensor([dictionary.index(item) - offset for item in name_to_labels[name]])
+        for name in group_names
+    }
+    return group_name_to_group_attr_vec_idxs
+
+
+def make_dict_idx_to_vec_idx(dictionary: SimpleLabelDictionary, cats: List[str], device="cpu", fill_value=-100) -> torch.Tensor:
+    """
+    Create mapping from dictionary indices to vector indices.
+
+    Args:
+        dictionary: Label dictionary
+        cats: List of categories
+        device: Device for tensor
+        fill_value: Fill value for missing entries
+
+    Returns:
+        Tensor mapping dictionary indices to vector indices
+    """
+    # NOTE: when target is not in label_categories, the error is silent
+    map_tgt = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
+    for vec_idx, label in enumerate(cats):
+        map_tgt[dictionary.index(label)] = vec_idx
+    return map_tgt
+
+
+def clean_cats_attrs(ldict: SimpleLabelDictionary, schema, pred_cats: torch.Tensor, pred_attrs: torch.Tensor) -> List[Tuple[str, List[str]]]:
+    """
+    Convert predicted category and attribute indices to human-readable labels.
+
+    Args:
+        ldict: Label dictionary
+        schema: Label schema object
+        pred_cats: Predicted category indices
+        pred_attrs: Predicted attribute indices
+
+    Returns:
+        List of (category, [attributes]) tuples
+    """
+    cats = ldict.string(pred_cats).split(" ")
+    attrs = []
+
+    if len(pred_attrs.shape) == 1:
+        split_pred_attrs = [pred_attrs]
+    else:
+        split_pred_attrs = pred_attrs.split(1, dim=0)
+
+    for (_cat_idx, attr_idxs) in zip(pred_cats.tolist(), split_pred_attrs):
+        seq_attrs = [lbl for lbl in ldict.string((attr_idxs.squeeze())).split(" ")]
+        if not any(it for it in seq_attrs):
+            seq_attrs = []
+        attrs.append(seq_attrs)
+
+    return list(zip(cats, attrs))
+
+
+def create_label_dictionary_from_schema(schema) -> SimpleLabelDictionary:
+    """
+    Create a SimpleLabelDictionary from a label schema, mimicking the old fairseq setup.
+    Load the exact symbols from the original fairseq dictionary to ensure perfect compatibility.
+
+    Args:
+        schema: Label schema object (unused, kept for compatibility)
+
+    Returns:
+        SimpleLabelDictionary with exact same symbols as original fairseq dict
+    """
+    try:
+        # Load original fairseq dictionary to get exact symbol order and content
+        from fairseq.data import Dictionary
+        import os
+
+        # Try to find the original dict_term.txt file
+        possible_paths = [
+            'scripts/dict_term.txt',
+            'icebert-pos/scripts/dict_term.txt',
+            '../scripts/dict_term.txt'
+        ]
+
+        original_dict = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                original_dict = Dictionary.load(path)
+                break
+
+        if original_dict is not None:
+            # Use exact symbols from original dictionary
+            return SimpleLabelDictionary(original_dict.symbols, nspecial=original_dict.nspecial)
+
+    except ImportError:
+        # Fallback if fairseq is not available
+        pass
+    except Exception:
+        # Fallback if file loading fails
+        pass
+
+    # Fallback: reconstruct from schema (original logic)
+    # Use the correct special token order from original dictionary
+    special_symbols = ["<s>", "<pad>", "</s>", "<unk>", "<SEP>"]
+
+    # The schema labels start with <SEP>, so we need to skip it
+    schema_labels_without_sep = [label for label in schema.labels if label != "<SEP>"]
+
+    # Combine: special tokens + schema labels (without duplicate <SEP>)
+    all_symbols = special_symbols + schema_labels_without_sep
+
+    return SimpleLabelDictionary(all_symbols, nspecial=4)  # 4 special tokens before <SEP>
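
Finally, a toy sketch of the nspecial offset bookkeeping these helpers implement (not part of the commit; old_label_utils is assumed importable, and the symbol list is made up to mirror the fallback order above):

import torch
from types import SimpleNamespace

from old_label_utils import SimpleLabelDictionary, make_group_name_to_group_attr_vec_idxs

# Toy dictionary in the fallback order: 4 special tokens, then <SEP>, then labels.
symbols = ["<s>", "<pad>", "</s>", "<unk>", "<SEP>", "n", "g", "masc", "fem"]
ldict = SimpleLabelDictionary(symbols, nspecial=4)

# Dictionary indices count the special tokens; attribute vector indices do not.
assert ldict.index("masc") == 7
schema = SimpleNamespace(group_name_to_labels={"gender": ["masc", "fem"]})
idxs = make_group_name_to_group_attr_vec_idxs(ldict, schema)
assert idxs["gender"].tolist() == [3, 4]  # dictionary index minus nspecial

# string() drops fairseq-style special indices (0-3) when decoding predictions.
assert ldict.string(torch.tensor([7, 8])) == "masc fem"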