add support for IFD tags and some refactoring

Browse files

Files changed (5) hide show

README.md +21 -5
configuration.py +56 -0
ifd_utils.py +136 -0
modeling.py +86 -105
old_label_utils.py +0 -223

README.md CHANGED Viewed

@@ -13,24 +13,40 @@ from transformers import AutoModel, AutoTokenizer
 model = AutoModel.from_pretrained("mideind/IceBERT-PoS", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained("mideind/IceBERT-PoS")
-# Pre tokenized
 sentence = "Ég veit að þú kemur í kvöld til mín ."
 result = model.predict_labels_from_text([sentence], tokenizer)
 expected = [
     [
         ("fp", ["1", "sing", "nom"]),
-        ("sf", ["sing", "act", "1", "pres"]),
         ("c", []),
         ("fp", ["2", "sing", "nom"]),
-        ("sf", ["sing", "act", "2", "pres"]),
-        ("af", ["pos"]),
         ("n", ["neut", "sing", "acc"]),
-        ("af", ["pos"]),
         ("fp", ["1", "sing", "gen"]),
         ("pl", []),
     ]
 ]
 assert result == expected, f"Expected {expected}, but got {result}"
 print("Test passed successfully!")
 ```

 model = AutoModel.from_pretrained("mideind/IceBERT-PoS", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained("mideind/IceBERT-PoS")
+## Prediction Methods
+The model provides two prediction methods:
+- **`predict_labels_from_text()`**: Returns structured predictions as (category, [attributes]) tuples. Use this for downstream NLP tasks or when you need the semantic meaning of each prediction.
+- **`predict_ifd_labels_from_text()`**: Returns predictions in IFD (Icelandic Frequency Dictionary) format. Use this for evaluation against MIM-GOLD datasets or when you need compatibility with traditional Icelandic POS taggers.
+# Example sentence that is already tokenized: tokens (including punctuation) are separated by single spaces
 sentence = "Ég veit að þú kemur í kvöld til mín ."
+# Get predictions in (category, [attributes]) format
 result = model.predict_labels_from_text([sentence], tokenizer)
 expected = [
     [
         ("fp", ["1", "sing", "nom"]),
+        ("sf", ["act", "1", "sing", "pres"]),
         ("c", []),
         ("fp", ["2", "sing", "nom"]),
+        ("sf", ["act", "2", "sing", "pres"]),
+        ("af", []),
         ("n", ["neut", "sing", "acc"]),
+        ("af", []),
         ("fp", ["1", "sing", "gen"]),
         ("pl", []),
     ]
 ]
 assert result == expected, f"Expected {expected}, but got {result}"
 print("Test passed successfully!")
+# Get predictions in IFD format (for MIM-GOLD evaluation)
+ifd_result = model.predict_ifd_labels_from_text([sentence], tokenizer)
+ifd_expected = [
+    ["fp1en", "sfg1en", "c", "fp2en", "sfg2en", "af", "nheo", "af", "fp1ee", "pl"]
+]
+assert ifd_result == ifd_expected, f"Expected {ifd_expected}, but got {ifd_result}"
+print("IFD conversion test passed successfully!")
 ```

configuration.py CHANGED Viewed

@@ -5,6 +5,7 @@ import json
 from dataclasses import dataclass
 from typing import Dict, List, Optional
 from transformers import AutoConfig, RobertaConfig
@@ -31,6 +32,61 @@ class LabelSchema:
     separator: str
     ignore_categories: List[str]
 class IceBertPosConfig(RobertaConfig):
     """

 from dataclasses import dataclass
 from typing import Dict, List, Optional
+import torch
 from transformers import AutoConfig, RobertaConfig
     separator: str
     ignore_categories: List[str]
+    def get_group_name_to_group_attr_indices(self, device="cpu") -> Dict[str, torch.Tensor]:
+        """
+        Create mapping from group names to their attribute indices in the labels list.
+
+        Group labels that do not occur in ``self.labels`` are skipped.
+
+        Args:
+            device: Device on which the index tensors are created
+        Returns:
+            Dictionary mapping group names to an int64 tensor of label indices
+        """
+        # Resolve every label position once instead of scanning the list per
+        # label; setdefault keeps the first occurrence, like list.index().
+        label_to_index = {}
+        for idx, label in enumerate(self.labels):
+            label_to_index.setdefault(label, idx)
+        group_name_to_group_attr_indices = {}
+        for group_name, group_labels in self.group_name_to_labels.items():
+            indices = [label_to_index[label] for label in group_labels if label in label_to_index]
+            # Explicit dtype: an empty list would otherwise become a float
+            # tensor, which cannot be used to index the attribute logits.
+            group_name_to_group_attr_indices[group_name] = torch.tensor(
+                indices, dtype=torch.long, device=device
+            )
+        return group_name_to_group_attr_indices
+    def get_group_masks(self, device="cpu") -> torch.Tensor:
+        """
+        Create group masks indicating which groups are valid for each category.
+
+        Categories or group names listed in ``category_to_group_names`` that are
+        not present in ``label_categories`` / ``group_names`` are ignored.
+
+        Args:
+            device: Device on which the mask tensor is created
+        Returns:
+            Tensor of shape (num_categories, num_groups) with 1 for valid combinations
+        """
+        # Build both position lookups once rather than calling list.index()
+        # inside the nested loops; setdefault keeps the first occurrence.
+        category_to_row = {}
+        for row, category in enumerate(self.label_categories):
+            category_to_row.setdefault(category, row)
+        group_to_col = {}
+        for col, name in enumerate(self.group_names):
+            group_to_col.setdefault(name, col)
+        group_mask = torch.zeros(
+            len(self.label_categories), len(self.group_names), dtype=torch.int64, device=device
+        )
+        for cat, cat_group_names in self.category_to_group_names.items():
+            cat_idx = category_to_row.get(cat)
+            if cat_idx is None:
+                continue
+            for group_name in cat_group_names:
+                group_idx = group_to_col.get(group_name)
+                if group_idx is not None:
+                    group_mask[cat_idx, group_idx] = 1
+        return group_mask
+    def get_category_name_to_index(self) -> Dict[str, int]:
+        """
+        Build a lookup from each category name to its position in ``label_categories``.
+        Returns:
+            Dictionary mapping category names to their indices
+        """
+        name_to_index = {}
+        for position, category in enumerate(self.label_categories):
+            # Plain assignment: a repeated name keeps its last position
+            name_to_index[category] = position
+        return name_to_index
+    def get_label_name_to_index(self) -> Dict[str, int]:
+        """
+        Build a lookup from each label name to its position in ``labels``.
+        Returns:
+            Dictionary mapping label names to their indices
+        """
+        name_to_index = {}
+        for position, label in enumerate(self.labels):
+            # Plain assignment: a repeated label keeps its last position
+            name_to_index[label] = position
+        return name_to_index
 class IceBertPosConfig(RobertaConfig):
     """

ifd_utils.py ADDED Viewed

	@@ -0,0 +1,136 @@

+# Copyright (C) Miðeind ehf.
+# This file is part of IceBERT POS model conversion.
+"""
+IFD (Icelandic Frequency Dictionary) utilities for converting model predictions
+to IFD format labels used in MIM-GOLD evaluation.
+"""
+import numpy as np
+from typing import List, Tuple
+# Category and feature definitions
+# CATS holds the category tags. They come first in LABELS, so a label vector's
+# first len(CATS) positions are the category slots (vec2ifd relies on this).
+CATS = [
+    "n", "g", "x", "e", "v", "l", "fa", "fb", "fe", "fo", "fp", "fs", "ft", "tf",
+    "ta", "tp", "to", "sn", "sb", "sf", "sv", "ss", "sl", "sþ", "cn", "ct", "c",
+    "aa", "af", "au", "ao", "aþ", "ae", "as", "ks", "kt", "p", "pl", "pk", "pg",
+    "pa", "ns", "m"
+]
+# FEATS holds the feature (attribute) names; they follow the categories in LABELS.
+FEATS = [
+    "masc", "fem", "neut", "gender_x", "1", "2", "3", "sing", "plur", "nom",
+    "acc", "dat", "gen", "definite", "proper", "strong", "weak", "equiinflected",
+    "pos", "cmp", "superl", "past", "pres", "pass", "act", "mid"
+]
+# Full label vocabulary and the position of each label within it.
+LABELS = CATS + FEATS
+LABEL_TO_IDX = {label: idx for (idx, label) in enumerate(LABELS)}
+# IFD conversion mappings
+# Each table maps a single-character tag code (key) to a feature name (value).
+# vec2ifd appends the key to the tag when the value is among the active features.
+GENDER = {"k": "masc", "v": "fem", "h": "neut", "-": "gender_x"}
+NUMBER = {"e": "sing", "f": "plur"}
+PERSON = {"1": "1", "2": "2", "3": "3"}
+CASE = {"n": "nom", "o": "acc", "þ": "dat", "e": "gen"}
+DEGREE = {"f": "pos", "m": "cmp", "e": "superl"}
+# NOTE(review): FEATS contains "pass" but VOICE has no code for it, so a "pass"
+# feature contributes nothing to the tag - confirm this is intended.
+VOICE = {"g": "act", "m": "mid"}
+TENSE = {"n": "pres", "þ": "past"}
+ADJ_CLASS = {"s": "strong", "v": "weak", "o": "equiinflected"}
+# NOTE(review): DEFINITE is not referenced by TAGSET below, and "indefinite" is
+# not a member of FEATS - looks unused; confirm before relying on it.
+DEFINITE = {"g": "definite", " ": "indefinite"}
+# TAGSET is looked up by vec2ifd with the first character of the category
+# (or "sþ" for categories starting with "sþ"). Each entry is an ordered list of
+# code tables; codes are emitted in that order. Categories whose key is absent
+# here are emitted as the bare category tag.
+TAGSET = {
+    "n": [
+        GENDER,
+        NUMBER,
+        CASE,
+        # The empty-string values never match a feature name, so only
+        # "definite" (code "g") and "proper" (code "s") can emit a code here.
+        {"g": "definite", "-": "", " ": ""},
+        {"": "", "s": "proper"},
+    ],
+    "l": [GENDER, NUMBER, CASE, ADJ_CLASS, DEGREE],
+    # First slot accepts either a gender code or a person code.
+    "f": [{**GENDER, **PERSON}, NUMBER, CASE],
+    "g": [GENDER, NUMBER, CASE],
+    "t": [GENDER, NUMBER, CASE],
+    "sþ": [VOICE, GENDER, NUMBER, CASE],
+    "s": [VOICE, PERSON, NUMBER, TENSE],
+    "a": [DEGREE],
+}
+def vec2ifd(vec):
+    """Convert one-hot vector to IFD format tag.
+
+    Args:
+        vec: NumPy vector over LABELS; the first len(CATS) entries select the
+            category (argmax), entries equal to 1 after that mark active features.
+    Returns:
+        The category followed by the codes of its active features, in the
+        order given by TAGSET. The bare category is returned when no feature
+        is active or the category has no TAGSET entry.
+    """
+    num_cats = len(CATS)
+    cat = CATS[np.argmax(vec[:num_cats])]
+    # Names of the active features; category slots are excluded.
+    features = {LABELS[int(idx)] for idx in np.where(vec == 1)[0] if int(idx) >= num_cats}
+    if not features:
+        return cat
+    # Categories starting with "sþ" have their own layout; all others are
+    # looked up by their first character.
+    tagset_key = "sþ" if cat.startswith("sþ") else cat[0]
+    if tagset_key not in TAGSET:
+        return cat
+    codes = [
+        code
+        for slot in TAGSET[tagset_key]
+        for code, val in slot.items()
+        if val in features
+    ]
+    tag = "".join([cat] + codes)
+    if cat == "n" and "proper" in features and "definite" not in features:
+        # Proper noun without the definite marker: keep the article slot as
+        # "-" in front of the trailing proper-noun code.
+        tag = tag[:-1] + "-" + tag[-1]
+    return tag
+def convert_predictions_to_ifd(predictions: List[Tuple[str, List[str]]]) -> List[str]:
+    """
+    Convert model predictions to IFD format using logic from the original model.
+
+    A category that is not one of CATS cannot be encoded in the label vector,
+    so it is emitted through the naive-concatenation fallback instead of being
+    silently turned into the first category.
+
+    Args:
+        predictions: List of (category, [attributes]) tuples from model
+    Returns:
+        List of IFD format labels
+    """
+    ifd_labels = []
+    for cat, feats in predictions:
+        # Apply the same logic as the original predict_ifd_labels method
+        if len(feats) == 1 and feats[0] == "pos":
+            # This label is used as a default for training but implied in mim format
+            feats = []
+        elif cat == "sl" and "act" in feats:
+            # Number and tense are not shown for sl act in mim format
+            feats = [f for f in feats if f not in ("1", "sing", "pres")]
+        naive_label = cat + "".join(feats)
+        if cat not in CATS:
+            # Without a category slot set, argmax over an all-zero slice would
+            # pick index 0 and mislabel the word; fall back instead.
+            ifd_labels.append(naive_label)
+            continue
+        # Create one-hot vector from labels; unknown feature names are skipped
+        vec = np.zeros(len(LABELS))
+        for label in [cat, *feats]:
+            idx = LABEL_TO_IDX.get(label)
+            if idx is not None:
+                vec[idx] = 1
+        # Convert to IFD format
+        try:
+            ifd_label = vec2ifd(vec)
+        except (IndexError, KeyError, ValueError):
+            # Fallback to naive concatenation if conversion fails
+            ifd_label = naive_label
+        else:
+            if ifd_label == "ns":
+                # This is to comply with the format
+                ifd_label = "n----s"
+        ifd_labels.append(ifd_label)
+    return ifd_labels

modeling.py CHANGED Viewed

@@ -11,15 +11,7 @@ from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoConfig, AutoModel, PreTrainedModel, RobertaModel
 from .configuration import IceBertPosConfig
-from .old_label_utils import (
-    SimpleLabelDictionary,
-    clean_cats_attrs,
-    create_label_dictionary_from_schema,
-    make_dict_idx_to_vec_idx,
-    make_group_masks,
-    make_group_name_to_group_attr_vec_idxs,
-    make_vec_idx_to_dict_idx,
-)
 logger = logging.getLogger(__name__)
@@ -90,21 +82,31 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         self.roberta = RobertaModel(config, add_pooling_layer=False)
         self.classifier = MultiLabelTokenClassificationHead(config)
-        # Create label dictionary and mappings (mimicking old fairseq model)
-        self.label_dictionary = create_label_dictionary_from_schema(config.label_schema)
         self._setup_label_mappings()
         # Initialize weights and apply final processing
         self.post_init()
     def _setup_label_mappings(self):
-        """Setup label mappings similar to the old fairseq model."""
         schema = self.config.label_schema
-        self.group_name_to_group_attr_vec_idxs = make_group_name_to_group_attr_vec_idxs(self.label_dictionary, schema)
-        self.cat_dict_idx_to_vec_idx = make_dict_idx_to_vec_idx(self.label_dictionary, schema.label_categories)
-        self.cat_vec_idx_to_dict_idx = make_vec_idx_to_dict_idx(self.label_dictionary, schema.label_categories)
-        self.group_mask = make_group_masks(self.label_dictionary, schema)
     def forward(
         self,
@@ -316,20 +318,16 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         # Split sentences by spaces to get proper word boundaries
         # This fixes the issue where tokens like "Kl." get split incorrectly
         sentences_split = [sentence.split() for sentence in sentences]
         # Use batch_encode_plus with is_split_into_words=True to preserve word boundaries
         encoding = tokenizer.batch_encode_plus(
-            sentences_split,
-            return_tensors="pt",
-            padding=True,
-            is_split_into_words=True,
-            add_special_tokens=True
         )
         batch_input_ids = encoding["input_ids"]
         batch_attention_mask = encoding["attention_mask"]
         word_ids_list = [encoding.word_ids(i) for i in range(len(sentences))]
         # Debug logging to match fairseq model
         for i in range(len(sentences)):
             logger.debug(f"Encoded tokens: {batch_input_ids[i]}")
@@ -342,8 +340,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_mask: torch.Tensor
     ) -> List[List[Tuple[str, List[str]]]]:
         """
-        Convert logits to human-readable labels using fairseq's group-based logic.
-        Copied from the old model's logits_to_labels method.
         """
         # logits: Batch x Time x Labels
         bsz, _, num_cats = cat_logits.shape
@@ -353,90 +350,74 @@ class IceBertPosForTokenClassification(PreTrainedModel):
         assert num_attrs == len(self.config.label_schema.labels)
         assert num_cats == len(self.config.label_schema.label_categories)
-        batch_cats = []
-        batch_attrs = []
         for seq_idx in range(bsz):
             seq_nwords = nwords[seq_idx]
-            pred_cat_vec_idxs = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
-            pred_cats = self.cat_vec_idx_to_dict_idx[pred_cat_vec_idxs]
-            group_mask = self.group_mask[pred_cat_vec_idxs]
-            offset = self.label_dictionary.nspecial
-            pred_attrs = []
-            for group_idx, group_name in enumerate(self.config.label_schema.group_names):
-                group_vec_idxs = self.group_name_to_group_attr_vec_idxs[group_name]
-                # logits: (bsz * nwords) x labels
-                group_logits = attr_logits[seq_idx, :seq_nwords, group_vec_idxs]
-                if len(group_vec_idxs) == 1:
-                    group_pred = group_logits.sigmoid().ge(0.5).long()
-                    group_pred_dict_idxs = (group_pred.squeeze() * (group_vec_idxs.item() + offset)).T.to(
-                        "cpu"
-                    ) * group_mask[:, group_idx]
-                else:
-                    group_pred_vec_idxs = group_logits.max(dim=-1).indices
-                    group_pred_dict_idxs = (group_vec_idxs[group_pred_vec_idxs] + offset) * group_mask[:, group_idx]
-                pred_attrs.append(group_pred_dict_idxs)
-            pred_attrs = torch.stack([p.squeeze() for p in pred_attrs]).t()
-            batch_cats.append(pred_cats)
-            batch_attrs.append(pred_attrs)
-        predictions = list(
-            [
-                clean_cats_attrs(
-                    self.label_dictionary,
-                    self.config.label_schema,
-                    seq_cats,
-                    seq_attrs,
-                )
-                for seq_cats, seq_attrs in zip(batch_cats, batch_attrs)
-            ]
-        )
         return predictions
-def make_vec_idx_to_dict_idx(dictionary, labels, device="cpu", fill_value=-100):
-    vec_idx_to_dict_idx = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
-    for vec_idx, label in enumerate(labels):
-        vec_idx_to_dict_idx[vec_idx] = dictionary.index(label)
-    return vec_idx_to_dict_idx
-def make_group_masks(dictionary, schema, device="cpu"):
-    num_groups = len(schema.group_names)
-    offset = dictionary.nspecial
-    num_labels = len(dictionary) - offset
-    ret_mask = torch.zeros(num_labels, num_groups, dtype=torch.int64, device=device)
-    for cat, cat_group_names in schema.category_to_group_names.items():
-        cat_label_idx = dictionary.index(cat)
-        cat_vec_idx = schema.label_categories.index(cat)
-        for group_name in cat_group_names:
-            ret_mask[cat_vec_idx, schema.group_names.index(group_name)] = 1
-        assert cat_label_idx != dictionary.unk()
-    for cat in schema.label_categories:
-        cat_label_idx = dictionary.index(cat)
-        assert cat_label_idx != dictionary.unk()
-    return ret_mask
-def make_group_name_to_group_attr_vec_idxs(dict_, schema):
-    offset = dict_.nspecial
-    group_names = schema.group_name_to_labels.keys()
-    name_to_labels = schema.group_name_to_labels
-    group_name_to_group_attr_vec_idxs = {
-        name: torch.tensor([dict_.index(item) - offset for item in name_to_labels[name]]) for name in group_names
-    }
-    return group_name_to_group_attr_vec_idxs
-def make_dict_idx_to_vec_idx(dictionary, cats, device="cpu", fill_value=-100):
-    # NOTE: when target is not in label_categories, the error is silent
-    map_tgt = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
-    for vec_idx, label in enumerate(cats):
-        map_tgt[dictionary.index(label)] = vec_idx
-    return map_tgt
 AutoConfig.register("icebert-pos", IceBertPosConfig)

 from transformers import AutoConfig, AutoModel, PreTrainedModel, RobertaModel
 from .configuration import IceBertPosConfig
+from .ifd_utils import convert_predictions_to_ifd
 logger = logging.getLogger(__name__)
         self.roberta = RobertaModel(config, add_pooling_layer=False)
         self.classifier = MultiLabelTokenClassificationHead(config)
         self._setup_label_mappings()
         # Initialize weights and apply final processing
         self.post_init()
     def _setup_label_mappings(self):
+        """Setup label mappings using schema methods.
+
+        Registers the category/group mask and one index tensor per attribute
+        group as buffers, and stores the category name to index mapping.
+        """
         schema = self.config.label_schema
+        # Get model device for tensor creation; fall back to CPU when the model
+        # has no parameters. next() with a default avoids materializing the
+        # whole parameter list just to test whether it is empty.
+        first_param = next(self.parameters(), None)
+        device = first_param.device if first_param is not None else torch.device("cpu")
+        # Register group mask as buffer so it moves with model.to(device)
+        self.register_buffer("group_mask", schema.get_group_masks(device=device))
+        # Register group attribute indices as buffers
+        group_attr_indices = schema.get_group_name_to_group_attr_indices(device=device)
+        self.group_name_to_group_attr_indices = {}
+        for group_name, indices in group_attr_indices.items():
+            buffer_name = f"group_attr_indices_{group_name}"
+            self.register_buffer(buffer_name, indices)
+            # NOTE(review): the dict stores the tensor object that is registered
+            # right now; if a later model.to(device) replaces the buffer, this
+            # reference presumably keeps pointing at the old tensor - TODO confirm.
+            self.group_name_to_group_attr_indices[group_name] = getattr(self, buffer_name)
+        # Category name to index mapping (regular dict, no device movement needed)
+        self.category_name_to_index = schema.get_category_name_to_index()
     def forward(
         self,
         # Split sentences by spaces to get proper word boundaries
         # This fixes the issue where tokens like "Kl." get split incorrectly
         sentences_split = [sentence.split() for sentence in sentences]
         # Use batch_encode_plus with is_split_into_words=True to preserve word boundaries
         encoding = tokenizer.batch_encode_plus(
+            sentences_split, return_tensors="pt", padding=True, is_split_into_words=True, add_special_tokens=True
         )
         batch_input_ids = encoding["input_ids"]
         batch_attention_mask = encoding["attention_mask"]
         word_ids_list = [encoding.word_ids(i) for i in range(len(sentences))]
         # Debug logging to match fairseq model
         for i in range(len(sentences)):
             logger.debug(f"Encoded tokens: {batch_input_ids[i]}")
         self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_mask: torch.Tensor
     ) -> List[List[Tuple[str, List[str]]]]:
         """
+        Convert logits to human-readable labels using schema-based logic.
         """
         # logits: Batch x Time x Labels
         bsz, _, num_cats = cat_logits.shape
         assert num_attrs == len(self.config.label_schema.labels)
         assert num_cats == len(self.config.label_schema.label_categories)
+        predictions = []
+        schema = self.config.label_schema
         for seq_idx in range(bsz):
             seq_nwords = nwords[seq_idx]
+            pred_cat_indices = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
+            seq_predictions = []
+            for word_idx in range(seq_nwords):
+                cat_idx = int(pred_cat_indices[word_idx].item())
+                cat_name = schema.label_categories[cat_idx]
+                # Get valid groups for this category
+                valid_groups = schema.category_to_group_names.get(cat_name, [])
+                # Collect attributes for this word
+                attributes = []
+                for group_name in valid_groups:
+                    if group_name in self.group_name_to_group_attr_indices:
+                        group_indices = self.group_name_to_group_attr_indices[group_name]
+                        if len(group_indices) > 0:
+                            group_logits = attr_logits[seq_idx, word_idx, group_indices]
+                            if len(group_indices) == 1:
+                                # Binary decision
+                                if group_logits.sigmoid().item() > 0.5:
+                                    attr_idx = int(group_indices[0].item())
+                                    attributes.append(schema.labels[attr_idx])
+                            else:
+                                # Multi-class decision
+                                best_idx = int(group_logits.max(dim=-1).indices.item())
+                                attr_idx = int(group_indices[best_idx].item())
+                                attributes.append(schema.labels[attr_idx])
+                # Apply specific rules from original model
+                if len(attributes) == 1 and attributes[0] == "pos":
+                    # This label is used as a default for training but implied in mim format
+                    attributes = []
+                elif cat_name == "sl" and "act" in attributes:
+                    # Number and tense are not shown for sl act in mim format
+                    attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]
+                seq_predictions.append((cat_name, attributes))
+            predictions.append(seq_predictions)
         return predictions
+    def predict_ifd_labels_from_text(self, sentences: List[str], tokenizer) -> List[List[str]]:
+        """
+        Tag raw text and return the result as IFD format labels.
+        Args:
+            sentences: List of input sentences
+            tokenizer: HuggingFace tokenizer
+        Returns:
+            List of sequences, each containing IFD format labels per word
+        """
+        # First obtain the (category, [attributes]) predictions per sentence,
+        # then translate every sentence to IFD format on its own.
+        tagged_sentences = self.predict_labels_from_text(sentences, tokenizer)
+        return [convert_predictions_to_ifd(tagged) for tagged in tagged_sentences]
 AutoConfig.register("icebert-pos", IceBertPosConfig)

old_label_utils.py DELETED Viewed

@@ -1,223 +0,0 @@
-# Copyright (C) Miðeind ehf.
-# This file is part of IceBERT POS model conversion.
-"""
-Utility functions copied from the old fairseq-based model for label handling.
-These functions handle the conversion between vector indices and dictionary indices,
-accounting for the offset caused by special tokens in the label dictionary.
-"""
-from typing import Dict, List, Tuple
-import torch
-class SimpleLabelDictionary:
-    """
-    Simplified version of fairseq Dictionary to handle label mappings.
-    This replaces the fairseq Dictionary dependency while maintaining the same interface.
-    """
-    def __init__(self, labels: List[str], nspecial: int = 5):
-        """
-        Args:
-            labels: List of labels including special tokens at the beginning
-            nspecial: Number of special tokens (typically 5: <pad>, <s>, </s>, <unk>, <SEP>)
-        """
-        self.symbols = labels
-        self.nspecial = nspecial
-        self._indices = {label: idx for idx, label in enumerate(labels)}
-    def index(self, label: str) -> int:
-        """Get index of label in dictionary."""
-        return self._indices.get(label, self.unk())
-    def unk(self) -> int:
-        """Return index of unknown token (typically 3)."""
-        return 3
-    def string(self, indices: torch.Tensor) -> str:
-        """Convert tensor of indices to space-separated string of labels."""
-        if indices.dim() == 0:
-            indices = indices.unsqueeze(0)
-        # Filter out special tokens like fairseq Dictionary does
-        special_indices_to_ignore = {0, 1, 2, 3}  # BOS, PAD, EOS, UNK
-        labels = [
-            self.symbols[idx] for idx in indices.tolist()
-            if 0 <= idx < len(self.symbols) and idx not in special_indices_to_ignore
-        ]
-        return " ".join(labels)
-    def __len__(self) -> int:
-        return len(self.symbols)
-def make_vec_idx_to_dict_idx(dictionary: SimpleLabelDictionary, labels: List[str], device="cpu", fill_value=-100) -> torch.Tensor:
-    """
-    Create mapping from vector indices to dictionary indices.
-    Args:
-        dictionary: Label dictionary
-        labels: List of labels
-        device: Device for tensor
-        fill_value: Fill value for missing entries
-    Returns:
-        Tensor mapping vector indices to dictionary indices
-    """
-    vec_idx_to_dict_idx = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
-    for vec_idx, label in enumerate(labels):
-        vec_idx_to_dict_idx[vec_idx] = dictionary.index(label)
-    return vec_idx_to_dict_idx
-def make_group_masks(dictionary: SimpleLabelDictionary, schema, device="cpu") -> torch.Tensor:
-    """
-    Create group masks indicating which groups are valid for each category.
-    Args:
-        dictionary: Label dictionary
-        schema: Label schema object
-        device: Device for tensor
-    Returns:
-        Tensor of shape (num_categories, num_groups) with 1 for valid combinations
-    """
-    num_groups = len(schema.group_names)
-    offset = dictionary.nspecial
-    num_labels = len(dictionary) - offset
-    ret_mask = torch.zeros(num_labels, num_groups, dtype=torch.int64, device=device)
-    for cat, cat_group_names in schema.category_to_group_names.items():
-        cat_label_idx = dictionary.index(cat)
-        cat_vec_idx = schema.label_categories.index(cat)
-        for group_name in cat_group_names:
-            ret_mask[cat_vec_idx, schema.group_names.index(group_name)] = 1
-        assert cat_label_idx != dictionary.unk()
-    return ret_mask
-def make_group_name_to_group_attr_vec_idxs(dictionary: SimpleLabelDictionary, schema) -> Dict[str, torch.Tensor]:
-    """
-    Create mapping from group names to their attribute vector indices.
-    Args:
-        dictionary: Label dictionary
-        schema: Label schema object
-    Returns:
-        Dictionary mapping group names to tensor of vector indices
-    """
-    offset = dictionary.nspecial
-    group_names = schema.group_name_to_labels.keys()
-    name_to_labels = schema.group_name_to_labels
-    group_name_to_group_attr_vec_idxs = {
-        name: torch.tensor([dictionary.index(item) - offset for item in name_to_labels[name]])
-        for name in group_names
-    }
-    return group_name_to_group_attr_vec_idxs
-def make_dict_idx_to_vec_idx(dictionary: SimpleLabelDictionary, cats: List[str], device="cpu", fill_value=-100) -> torch.Tensor:
-    """
-    Create mapping from dictionary indices to vector indices.
-    Args:
-        dictionary: Label dictionary
-        cats: List of categories
-        device: Device for tensor
-        fill_value: Fill value for missing entries
-    Returns:
-        Tensor mapping dictionary indices to vector indices
-    """
-    # NOTE: when target is not in label_categories, the error is silent
-    map_tgt = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
-    for vec_idx, label in enumerate(cats):
-        map_tgt[dictionary.index(label)] = vec_idx
-    return map_tgt
-def clean_cats_attrs(ldict: SimpleLabelDictionary, schema, pred_cats: torch.Tensor, pred_attrs: torch.Tensor) -> List[Tuple[str, List[str]]]:
-    """
-    Convert predicted category and attribute indices to human-readable labels.
-    Args:
-        ldict: Label dictionary
-        schema: Label schema object
-        pred_cats: Predicted category indices
-        pred_attrs: Predicted attribute indices
-    Returns:
-        List of (category, [attributes]) tuples
-    """
-    cats = ldict.string(pred_cats).split(" ")
-    attrs = []
-    if len(pred_attrs.shape) == 1:
-        split_pred_attrs = [pred_attrs]
-    else:
-        split_pred_attrs = pred_attrs.split(1, dim=0)
-    for (_cat_idx, attr_idxs) in zip(pred_cats.tolist(), split_pred_attrs):
-        seq_attrs = [lbl for lbl in ldict.string((attr_idxs.squeeze())).split(" ")]
-        if not any(it for it in seq_attrs):
-            seq_attrs = []
-        attrs.append(seq_attrs)
-    return list(zip(cats, attrs))
-def create_label_dictionary_from_schema(schema) -> SimpleLabelDictionary:
-    """
-    Create a SimpleLabelDictionary from a label schema, mimicking the old fairseq setup.
-    Load the exact symbols from the original fairseq dictionary to ensure perfect compatibility.
-    Args:
-        schema: Label schema object (unused, kept for compatibility)
-    Returns:
-        SimpleLabelDictionary with exact same symbols as original fairseq dict
-    """
-    try:
-        # Load original fairseq dictionary to get exact symbol order and content
-        from fairseq.data import Dictionary
-        import os
-        # Try to find the original dict_term.txt file
-        possible_paths = [
-            'scripts/dict_term.txt',
-            'icebert-pos/scripts/dict_term.txt',
-            '../scripts/dict_term.txt'
-        ]
-        original_dict = None
-        for path in possible_paths:
-            if os.path.exists(path):
-                original_dict = Dictionary.load(path)
-                break
-        if original_dict is not None:
-            # Use exact symbols from original dictionary
-            return SimpleLabelDictionary(original_dict.symbols, nspecial=original_dict.nspecial)
-    except ImportError:
-        # Fallback if fairseq is not available
-        pass
-    except Exception:
-        # Fallback if file loading fails
-        pass
-    # Fallback: reconstruct from schema (original logic)
-    # Use the correct special token order from original dictionary
-    special_symbols = ["<s>", "<pad>", "</s>", "<unk>", "<SEP>"]
-    # The schema labels start with <SEP>, so we need to skip it
-    schema_labels_without_sep = [label for label in schema.labels if label != "<SEP>"]
-    # Combine: special tokens + schema labels (without duplicate <SEP>)
-    all_symbols = special_symbols + schema_labels_without_sep
-    return SimpleLabelDictionary(all_symbols, nspecial=4)  # 4 special tokens before <SEP>