haukurpj committed on
Commit
2d923bf
·
verified ·
1 Parent(s): f17941f

Upload folder using huggingface_hub

Browse files
__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (C) Miðeind ehf.
# This file is part of IceBERT POS model conversion.

# Package entry point: re-export the public configuration and model classes
# so users can import them directly from the package root.
from .configuration import IceBertPosConfig
from .modeling import IceBertPosForTokenClassification, MultiLabelTokenClassificationHead

# Package version string.
__version__ = "0.1.0"

# Explicit public API of the package.
__all__ = [
    "IceBertPosConfig",
    "IceBertPosForTokenClassification",
    "MultiLabelTokenClassificationHead",
]
__pycache__/__init__.cpython-38.pyc ADDED
Binary file (157 Bytes). View file
 
__pycache__/configuration.cpython-38.pyc ADDED
Binary file (3.52 kB). View file
 
__pycache__/modeling.cpython-38.pyc ADDED
Binary file (15.2 kB). View file
 
config.json ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "IceBertPosForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "attr_proj_input_size": 811,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration.IceBertPosConfig",
9
+ "AutoModel": "modeling.IceBertPosForTokenClassification"
10
+ },
11
+ "bos_token_id": 0,
12
+ "classifier_dropout": 0.0,
13
+ "eos_token_id": 2,
14
+ "hidden_act": "gelu",
15
+ "hidden_dropout_prob": 0.1,
16
+ "hidden_size": 768,
17
+ "id2label": {
18
+ "0": "LABEL_0",
19
+ "1": "LABEL_1",
20
+ "2": "LABEL_2",
21
+ "3": "LABEL_3",
22
+ "4": "LABEL_4",
23
+ "5": "LABEL_5",
24
+ "6": "LABEL_6",
25
+ "7": "LABEL_7",
26
+ "8": "LABEL_8",
27
+ "9": "LABEL_9",
28
+ "10": "LABEL_10",
29
+ "11": "LABEL_11",
30
+ "12": "LABEL_12",
31
+ "13": "LABEL_13",
32
+ "14": "LABEL_14",
33
+ "15": "LABEL_15",
34
+ "16": "LABEL_16",
35
+ "17": "LABEL_17",
36
+ "18": "LABEL_18",
37
+ "19": "LABEL_19",
38
+ "20": "LABEL_20",
39
+ "21": "LABEL_21",
40
+ "22": "LABEL_22",
41
+ "23": "LABEL_23",
42
+ "24": "LABEL_24",
43
+ "25": "LABEL_25",
44
+ "26": "LABEL_26",
45
+ "27": "LABEL_27",
46
+ "28": "LABEL_28",
47
+ "29": "LABEL_29",
48
+ "30": "LABEL_30",
49
+ "31": "LABEL_31",
50
+ "32": "LABEL_32",
51
+ "33": "LABEL_33",
52
+ "34": "LABEL_34",
53
+ "35": "LABEL_35",
54
+ "36": "LABEL_36",
55
+ "37": "LABEL_37",
56
+ "38": "LABEL_38",
57
+ "39": "LABEL_39",
58
+ "40": "LABEL_40",
59
+ "41": "LABEL_41",
60
+ "42": "LABEL_42",
61
+ "43": "LABEL_43",
62
+ "44": "LABEL_44",
63
+ "45": "LABEL_45",
64
+ "46": "LABEL_46",
65
+ "47": "LABEL_47",
66
+ "48": "LABEL_48",
67
+ "49": "LABEL_49",
68
+ "50": "LABEL_50",
69
+ "51": "LABEL_51",
70
+ "52": "LABEL_52",
71
+ "53": "LABEL_53",
72
+ "54": "LABEL_54",
73
+ "55": "LABEL_55",
74
+ "56": "LABEL_56",
75
+ "57": "LABEL_57",
76
+ "58": "LABEL_58",
77
+ "59": "LABEL_59",
78
+ "60": "LABEL_60",
79
+ "61": "LABEL_61",
80
+ "62": "LABEL_62",
81
+ "63": "LABEL_63",
82
+ "64": "LABEL_64",
83
+ "65": "LABEL_65",
84
+ "66": "LABEL_66",
85
+ "67": "LABEL_67",
86
+ "68": "LABEL_68",
87
+ "69": "LABEL_69"
88
+ },
89
+ "initializer_range": 0.02,
90
+ "intermediate_size": 3072,
91
+ "label2id": {
92
+ "LABEL_0": 0,
93
+ "LABEL_1": 1,
94
+ "LABEL_10": 10,
95
+ "LABEL_11": 11,
96
+ "LABEL_12": 12,
97
+ "LABEL_13": 13,
98
+ "LABEL_14": 14,
99
+ "LABEL_15": 15,
100
+ "LABEL_16": 16,
101
+ "LABEL_17": 17,
102
+ "LABEL_18": 18,
103
+ "LABEL_19": 19,
104
+ "LABEL_2": 2,
105
+ "LABEL_20": 20,
106
+ "LABEL_21": 21,
107
+ "LABEL_22": 22,
108
+ "LABEL_23": 23,
109
+ "LABEL_24": 24,
110
+ "LABEL_25": 25,
111
+ "LABEL_26": 26,
112
+ "LABEL_27": 27,
113
+ "LABEL_28": 28,
114
+ "LABEL_29": 29,
115
+ "LABEL_3": 3,
116
+ "LABEL_30": 30,
117
+ "LABEL_31": 31,
118
+ "LABEL_32": 32,
119
+ "LABEL_33": 33,
120
+ "LABEL_34": 34,
121
+ "LABEL_35": 35,
122
+ "LABEL_36": 36,
123
+ "LABEL_37": 37,
124
+ "LABEL_38": 38,
125
+ "LABEL_39": 39,
126
+ "LABEL_4": 4,
127
+ "LABEL_40": 40,
128
+ "LABEL_41": 41,
129
+ "LABEL_42": 42,
130
+ "LABEL_43": 43,
131
+ "LABEL_44": 44,
132
+ "LABEL_45": 45,
133
+ "LABEL_46": 46,
134
+ "LABEL_47": 47,
135
+ "LABEL_48": 48,
136
+ "LABEL_49": 49,
137
+ "LABEL_5": 5,
138
+ "LABEL_50": 50,
139
+ "LABEL_51": 51,
140
+ "LABEL_52": 52,
141
+ "LABEL_53": 53,
142
+ "LABEL_54": 54,
143
+ "LABEL_55": 55,
144
+ "LABEL_56": 56,
145
+ "LABEL_57": 57,
146
+ "LABEL_58": 58,
147
+ "LABEL_59": 59,
148
+ "LABEL_6": 6,
149
+ "LABEL_60": 60,
150
+ "LABEL_61": 61,
151
+ "LABEL_62": 62,
152
+ "LABEL_63": 63,
153
+ "LABEL_64": 64,
154
+ "LABEL_65": 65,
155
+ "LABEL_66": 66,
156
+ "LABEL_67": 67,
157
+ "LABEL_68": 68,
158
+ "LABEL_69": 69,
159
+ "LABEL_7": 7,
160
+ "LABEL_8": 8,
161
+ "LABEL_9": 9
162
+ },
163
+ "label_schema": {
164
+ "category_to_group_names": {
165
+ "aa": [
166
+ "deg"
167
+ ],
168
+ "ae": [
169
+ "deg"
170
+ ],
171
+ "af": [
172
+ "deg"
173
+ ],
174
+ "ao": [
175
+ "deg"
176
+ ],
177
+ "as": [
178
+ "deg"
179
+ ],
180
+ "au": [
181
+ "deg"
182
+ ],
183
+ "a\u00fe": [
184
+ "deg"
185
+ ],
186
+ "fa": [
187
+ "gender",
188
+ "number",
189
+ "case"
190
+ ],
191
+ "fb": [
192
+ "gender",
193
+ "number",
194
+ "case"
195
+ ],
196
+ "fe": [
197
+ "gender",
198
+ "number",
199
+ "case"
200
+ ],
201
+ "fo": [
202
+ "gender_or_person",
203
+ "number",
204
+ "case"
205
+ ],
206
+ "fp": [
207
+ "gender_or_person",
208
+ "number",
209
+ "case"
210
+ ],
211
+ "fs": [
212
+ "gender",
213
+ "number",
214
+ "case"
215
+ ],
216
+ "ft": [
217
+ "gender",
218
+ "number",
219
+ "case"
220
+ ],
221
+ "g": [
222
+ "gender",
223
+ "number",
224
+ "case"
225
+ ],
226
+ "l": [
227
+ "gender",
228
+ "number",
229
+ "case",
230
+ "adj_c",
231
+ "deg"
232
+ ],
233
+ "n": [
234
+ "gender",
235
+ "number",
236
+ "case",
237
+ "def",
238
+ "proper"
239
+ ],
240
+ "sb": [
241
+ "voice",
242
+ "person",
243
+ "number",
244
+ "tense"
245
+ ],
246
+ "sf": [
247
+ "voice",
248
+ "person",
249
+ "number",
250
+ "tense"
251
+ ],
252
+ "sl": [
253
+ "voice",
254
+ "person",
255
+ "number",
256
+ "tense"
257
+ ],
258
+ "sn": [
259
+ "voice"
260
+ ],
261
+ "ss": [
262
+ "voice"
263
+ ],
264
+ "sv": [
265
+ "voice",
266
+ "person",
267
+ "number",
268
+ "tense"
269
+ ],
270
+ "s\u00fe": [
271
+ "voice",
272
+ "gender",
273
+ "number",
274
+ "case"
275
+ ],
276
+ "tf": [
277
+ "gender",
278
+ "number",
279
+ "case"
280
+ ]
281
+ },
282
+ "group_name_to_labels": {
283
+ "adj_c": [
284
+ "strong",
285
+ "weak",
286
+ "equiinflected"
287
+ ],
288
+ "case": [
289
+ "nom",
290
+ "acc",
291
+ "dat",
292
+ "gen"
293
+ ],
294
+ "def": [
295
+ "definite"
296
+ ],
297
+ "deg": [
298
+ "pos",
299
+ "cmp",
300
+ "superl"
301
+ ],
302
+ "gender": [
303
+ "masc",
304
+ "fem",
305
+ "neut",
306
+ "gender_x"
307
+ ],
308
+ "gender_or_person": [
309
+ "masc",
310
+ "fem",
311
+ "neut",
312
+ "gender_x",
313
+ "1",
314
+ "2",
315
+ "3"
316
+ ],
317
+ "number": [
318
+ "sing",
319
+ "plur"
320
+ ],
321
+ "person": [
322
+ "1",
323
+ "2",
324
+ "3"
325
+ ],
326
+ "proper": [
327
+ "proper"
328
+ ],
329
+ "tense": [
330
+ "pres",
331
+ "past"
332
+ ],
333
+ "voice": [
334
+ "act",
335
+ "mid"
336
+ ]
337
+ },
338
+ "group_names": [
339
+ "gender",
340
+ "gender_or_person",
341
+ "number",
342
+ "case",
343
+ "def",
344
+ "proper",
345
+ "adj_c",
346
+ "deg",
347
+ "voice",
348
+ "person",
349
+ "tense"
350
+ ],
351
+ "ignore_categories": [
352
+ "x",
353
+ "e"
354
+ ],
355
+ "label_categories": [
356
+ "n",
357
+ "g",
358
+ "x",
359
+ "e",
360
+ "v",
361
+ "l",
362
+ "fa",
363
+ "fb",
364
+ "fe",
365
+ "fo",
366
+ "fp",
367
+ "fs",
368
+ "ft",
369
+ "tf",
370
+ "ta",
371
+ "tp",
372
+ "to",
373
+ "sn",
374
+ "sb",
375
+ "sf",
376
+ "sv",
377
+ "ss",
378
+ "sl",
379
+ "s\u00fe",
380
+ "cn",
381
+ "ct",
382
+ "c",
383
+ "aa",
384
+ "af",
385
+ "au",
386
+ "ao",
387
+ "a\u00fe",
388
+ "ae",
389
+ "as",
390
+ "ks",
391
+ "kt",
392
+ "p",
393
+ "pl",
394
+ "pk",
395
+ "pg",
396
+ "pa",
397
+ "ns",
398
+ "m"
399
+ ],
400
+ "labels": [
401
+ "<SEP>",
402
+ "n",
403
+ "g",
404
+ "x",
405
+ "e",
406
+ "v",
407
+ "l",
408
+ "fa",
409
+ "fb",
410
+ "fe",
411
+ "fo",
412
+ "fp",
413
+ "fs",
414
+ "ft",
415
+ "tf",
416
+ "ta",
417
+ "tp",
418
+ "to",
419
+ "sn",
420
+ "sb",
421
+ "sf",
422
+ "sv",
423
+ "ss",
424
+ "sl",
425
+ "s\u00fe",
426
+ "cn",
427
+ "ct",
428
+ "c",
429
+ "aa",
430
+ "af",
431
+ "au",
432
+ "ao",
433
+ "a\u00fe",
434
+ "ae",
435
+ "as",
436
+ "ks",
437
+ "kt",
438
+ "p",
439
+ "pl",
440
+ "pk",
441
+ "pg",
442
+ "pa",
443
+ "ns",
444
+ "m",
445
+ "masc",
446
+ "fem",
447
+ "neut",
448
+ "gender_x",
449
+ "1",
450
+ "2",
451
+ "3",
452
+ "sing",
453
+ "plur",
454
+ "nom",
455
+ "acc",
456
+ "dat",
457
+ "gen",
458
+ "definite",
459
+ "proper",
460
+ "strong",
461
+ "weak",
462
+ "equiinflected",
463
+ "pos",
464
+ "cmp",
465
+ "superl",
466
+ "past",
467
+ "pres",
468
+ "pass",
469
+ "act",
470
+ "mid"
471
+ ],
472
+ "null": null,
473
+ "null_leaf": null,
474
+ "separator": "<SEP>"
475
+ },
476
+ "layer_norm_eps": 1e-05,
477
+ "max_position_embeddings": 514,
478
+ "model_type": "icebert-pos",
479
+ "num_attention_heads": 12,
480
+ "num_categories": 43,
481
+ "num_groups": 12,
482
+ "num_hidden_layers": 12,
483
+ "pad_token_id": 1,
484
+ "position_embedding_type": "absolute",
485
+ "torch_dtype": "float32",
486
+ "transformers_version": "4.46.3",
487
+ "type_vocab_size": 1,
488
+ "use_cache": true,
489
+ "vocab_size": 49937
490
+ }
configuration.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) Miðeind ehf.
2
+ # This file is part of IceBERT POS model conversion.
3
+
4
+ import json
5
+ from typing import Any, Dict, Optional
6
+
7
+ from transformers import AutoConfig, RobertaConfig
8
+
9
+
10
class IceBertPosConfig(RobertaConfig):
    """
    Configuration class for IceBERT POS (Part-of-Speech) tagging model.

    This configuration inherits from RobertaConfig and adds POS-specific parameters
    derived from the label schema used for multilabel token classification.

    Args:
        label_schema: Dictionary describing POS categories, attribute groups and
            labels (the structure of terms2.json). When None, a built-in default
            schema is used.
        classifier_dropout: Dropout probability for the classification head.
            Defaults to 0.1 when not given (the released checkpoint's
            config.json sets it to 0.0).
        **kwargs: Forwarded unchanged to RobertaConfig.
    """

    model_type = "icebert-pos"

    def __init__(
        self, label_schema: Optional[Dict[str, Any]] = None, classifier_dropout: Optional[float] = None, **kwargs
    ):
        super().__init__(**kwargs)

        # Default label schema (terms2.json content)
        if label_schema is None:
            label_schema = self._get_default_label_schema()

        self.label_schema = label_schema

        # Derive parameters from label schema.
        self.num_categories = len(label_schema["label_categories"])
        # NOTE(review): num_labels is a property on the transformers base config;
        # assigning it also regenerates id2label/label2id to this size — confirm
        # this is the intended interaction with the saved id2label mapping.
        self.num_labels = len(label_schema["labels"])
        self.num_groups = len(label_schema["group_names"])

        # Classification head parameters
        self.classifier_dropout = classifier_dropout if classifier_dropout is not None else 0.1

        # Computed input size for attribute projection
        # (category_probs + hidden_size) -> num_labels
        self.attr_proj_input_size = self.num_categories + self.hidden_size

    @staticmethod
    def _get_default_label_schema() -> Dict[str, Any]:
        """Default label schema corresponding to terms2.json"""
        return {
            # All POS category tags the model can assign to a word.
            "label_categories": [
                "n",
                "g",
                "x",
                "e",
                "v",
                "l",
                "fa",
                "fb",
                "fe",
                "fo",
                "fp",
                "fs",
                "ft",
                "tf",
                "ta",
                "tp",
                "to",
                "sn",
                "sb",
                "sf",
                "sv",
                "ss",
                "sl",
                "sþ",
                "cn",
                "ct",
                "c",
                "aa",
                "af",
                "au",
                "ao",
                "aþ",
                "ae",
                "as",
                "ks",
                "kt",
                "p",
                "pl",
                "pk",
                "pg",
                "pa",
                "ns",
                "m",
            ],
            # Which attribute groups apply to each category; categories absent
            # from this mapping carry no attributes.
            "category_to_group_names": {
                "n": ["gender", "number", "case", "def", "proper"],
                "g": ["gender", "number", "case"],
                "l": ["gender", "number", "case", "adj_c", "deg"],
                "fa": ["gender", "number", "case"],
                "fb": ["gender", "number", "case"],
                "fe": ["gender", "number", "case"],
                "fs": ["gender", "number", "case"],
                "ft": ["gender", "number", "case"],
                "fo": ["gender_or_person", "number", "case"],
                "fp": ["gender_or_person", "number", "case"],
                "tf": ["gender", "number", "case"],
                "sn": ["voice"],
                "sb": ["voice", "person", "number", "tense"],
                "sf": ["voice", "person", "number", "tense"],
                "sv": ["voice", "person", "number", "tense"],
                "ss": ["voice"],
                "sl": ["voice", "person", "number", "tense"],
                "sþ": ["voice", "gender", "number", "case"],
                "aa": ["deg"],
                "af": ["deg"],
                "au": ["deg"],
                "ao": ["deg"],
                "aþ": ["deg"],
                "ae": ["deg"],
                "as": ["deg"],
            },
            # Canonical ordering of the attribute groups.
            "group_names": [
                "gender",
                "gender_or_person",
                "number",
                "case",
                "def",
                "proper",
                "adj_c",
                "deg",
                "voice",
                "person",
                "tense",
            ],
            # Labels belonging to each attribute group.
            "group_name_to_labels": {
                "gender": ["masc", "fem", "neut", "gender_x"],
                "number": ["sing", "plur"],
                "person": ["1", "2", "3"],
                "gender_or_person": ["masc", "fem", "neut", "gender_x", "1", "2", "3"],
                "case": ["nom", "acc", "dat", "gen"],
                "deg": ["pos", "cmp", "superl"],
                "voice": ["act", "mid"],
                "tense": ["pres", "past"],
                "def": ["definite"],
                "proper": ["proper"],
                "adj_c": ["strong", "weak", "equiinflected"],
            },
            # Flat label dictionary: separator, then categories, then attributes.
            "labels": [
                "<SEP>",
                "n",
                "g",
                "x",
                "e",
                "v",
                "l",
                "fa",
                "fb",
                "fe",
                "fo",
                "fp",
                "fs",
                "ft",
                "tf",
                "ta",
                "tp",
                "to",
                "sn",
                "sb",
                "sf",
                "sv",
                "ss",
                "sl",
                "sþ",
                "cn",
                "ct",
                "c",
                "aa",
                "af",
                "au",
                "ao",
                "aþ",
                "ae",
                "as",
                "ks",
                "kt",
                "p",
                "pl",
                "pk",
                "pg",
                "pa",
                "ns",
                "m",
                "masc",
                "fem",
                "neut",
                "gender_x",
                "1",
                "2",
                "3",
                "sing",
                "plur",
                "nom",
                "acc",
                "dat",
                "gen",
                "definite",
                "proper",
                "strong",
                "weak",
                "equiinflected",
                "pos",
                "cmp",
                "superl",
                "past",
                "pres",
                "pass",
                "act",
                "mid",
            ],
            "null": None,
            "null_leaf": None,
            "separator": "<SEP>",
            # Categories that carry no trainable attributes and are skipped.
            "ignore_categories": ["x", "e"],
        }

    @classmethod
    def from_label_schema_file(cls, schema_path: str, **kwargs) -> "IceBertPosConfig":
        """Create config from a label schema JSON file"""
        with open(schema_path, "r", encoding="utf-8") as f:
            label_schema = json.load(f)
        return cls(label_schema=label_schema, **kwargs)
229
+
230
+
231
+ AutoConfig.register("icebert-pos", IceBertPosConfig)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc24ca46b3b1024c92be719a8964c1336185c3d188674f2f4b96c1064fdaab7f
3
+ size 497965196
modeling.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) Miðeind ehf.
2
+ # This file is part of IceBERT POS model conversion.
3
+
4
+ import logging
5
+ from typing import List, Optional, Tuple
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch.nn.utils.rnn import pad_sequence
11
+ from transformers import AutoConfig, AutoModel, PreTrainedModel, RobertaModel
12
+
13
+ from .configuration import IceBertPosConfig
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class MultiLabelTokenClassificationHead(nn.Module):
    """Head for multilabel word-level classification tasks.

    Predicts a POS-category distribution per word and, conditioned on it, an
    attribute logit vector: the softmaxed category probabilities are
    concatenated with the transformed word features before the attribute
    projection.
    """

    def __init__(self, config: "IceBertPosConfig"):
        """
        Args:
            config: Configuration providing ``num_categories``, ``num_labels``,
                ``hidden_size`` and ``classifier_dropout``. Annotated as a
                string forward reference so this class does not require the
                sibling configuration module at annotation-evaluation time.
        """
        super().__init__()
        self.num_categories = config.num_categories
        self.num_labels = config.num_labels
        self.hidden_size = config.hidden_size

        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.activation_fn = F.relu
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.layer_norm = nn.LayerNorm(self.hidden_size)

        # Category projection: hidden_size -> num_categories
        self.cat_proj = nn.Linear(self.hidden_size, self.num_categories)

        # Attribute projection: (hidden_size + num_categories) -> num_labels
        self.out_proj = nn.Linear(self.hidden_size + self.num_categories, self.num_labels)

    def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            features: Word-level features of shape (total_words, hidden_size)

        Returns:
            cat_logits: Category logits of shape (total_words, num_categories)
            attr_logits: Attribute logits of shape (total_words, num_labels)
        """
        x = self.dropout(features)
        x = self.dense(x)
        # Layer norm applied before the non-linearity (order kept from the
        # original converted fairseq head).
        x = self.layer_norm(x)
        x = self.activation_fn(x)

        # Predict categories
        cat_logits = self.cat_proj(x)
        cat_probs = torch.softmax(cat_logits, dim=-1)

        # Predict attributes from the category distribution + features
        attr_input = torch.cat((cat_probs, x), dim=-1)
        attr_logits = self.out_proj(attr_input)

        return cat_logits, attr_logits
61
+
62
+
63
+ class IceBertPosForTokenClassification(PreTrainedModel):
64
+ """
65
+ IceBERT model for multilabel token classification (POS tagging).
66
+
67
+ This model performs word-level POS tagging by:
68
+ 1. Encoding input with RoBERTa
69
+ 2. Aggregating subword tokens to word-level representations
70
+ 3. Predicting both categories and attributes for each word
71
+ """
72
+
73
+ config_class = IceBertPosConfig
74
+
75
+ def __init__(self, config: IceBertPosConfig):
76
+ super().__init__(config)
77
+ self.config = config
78
+ self.num_categories = config.num_categories
79
+ self.num_labels = config.num_labels
80
+
81
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
82
+ self.classifier = MultiLabelTokenClassificationHead(config)
83
+
84
+ # Initialize weights and apply final processing
85
+ self.post_init()
86
+
87
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        word_mask: torch.Tensor,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Encode the input and produce word-level POS logits.

        Args:
            input_ids: Token indices of shape (batch_size, sequence_length)
            attention_mask: Attention mask of shape (batch_size, sequence_length)
            word_mask: Binary mask indicating word boundaries (1 = word start)

        The remaining keyword arguments are passed through unchanged to the
        underlying RoBERTa encoder.

        Returns:
            cat_logits: Category logits of shape (batch_size, max_words, num_categories)
            attr_logits: Attribute logits of shape (batch_size, max_words, num_labels)

        Note:
            This method always returns a plain tuple of the two word-level
            logit tensors; ``return_dict`` only affects the inner encoder call.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get RoBERTa outputs
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]  # (batch_size, seq_len, hidden_size)

        # Collapse subword tokens into one averaged vector per word, using
        # word_mask to find word-initial positions.
        word_features, nwords = self._aggregate_subword_tokens(sequence_output, word_mask)

        # Joint category + attribute prediction on the flat word features.
        cat_logits, attr_logits = self.classifier(word_features)

        # Re-pack flat per-word logits into padded (batch, max_words, ...) tensors.
        cat_logits_batch, attr_logits_batch = self._reshape_to_batch_format(cat_logits, attr_logits, nwords)

        return cat_logits_batch, attr_logits_batch
137
+
138
+ def _aggregate_subword_tokens(
139
+ self, sequence_output: torch.Tensor, word_mask: torch.Tensor
140
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
141
+ """
142
+ Aggregate subword token representations to word-level representations.
143
+ Following the original fairseq approach by averaging subword tokens within each word.
144
+
145
+ Args:
146
+ sequence_output: subword token representations (batch_size, seq_len, hidden_size)
147
+ word_mask: Binary mask where 1 indicates start of word (batch_size, seq_len)
148
+
149
+ Returns:
150
+ word_features: Word-level features (total_words, hidden_size)
151
+ nwords: Number of words per sequence (batch_size,)
152
+ """
153
+ # TODO: Verify that BOS and EOS are handled correctly - I'm worried that this does not correctly handle padding
154
+ # Remove BOS and EOS tokens (first and last positions)
155
+ x = sequence_output[:, 1:-1, :] # (batch_size, seq_len-2, hidden_size)
156
+ starts = word_mask[:, 1:-1] # (batch_size, seq_len-2)
157
+
158
+ # Count words per sequence
159
+ nwords = starts.sum(dim=-1) # (batch_size,)
160
+
161
+ # Find word boundaries and average tokens within each word
162
+ mean_words = []
163
+ batch_size, seq_len, hidden_size = x.shape
164
+
165
+ for batch_idx in range(batch_size):
166
+ seq_starts = starts[batch_idx] # (seq_len-2,)
167
+ seq_x = x[batch_idx] # (seq_len-2, hidden_size)
168
+
169
+ # Find start positions of words
170
+ start_positions = seq_starts.nonzero(as_tuple=True)[0] # positions where words start
171
+
172
+ if len(start_positions) == 0:
173
+ continue
174
+
175
+ # Calculate end positions (start of next word or end of sequence)
176
+ end_positions = torch.cat([start_positions[1:], torch.tensor([seq_len], device=start_positions.device)])
177
+
178
+ # Average tokens within each word
179
+ for start_pos, end_pos in zip(start_positions, end_positions):
180
+ word_tokens = seq_x[start_pos:end_pos] # tokens in this word
181
+ word_repr = word_tokens.mean(dim=0) # average representation
182
+ mean_words.append(word_repr)
183
+
184
+ if len(mean_words) == 0:
185
+ return torch.empty(0, sequence_output.size(-1), device=sequence_output.device), nwords
186
+
187
+ return torch.stack(mean_words), nwords
188
+
189
+ def _reshape_to_batch_format(
190
+ self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, nwords: torch.Tensor
191
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
192
+ """
193
+ Reshape word-level predictions back to batch format.
194
+ Following the original fairseq approach with pad_sequence.
195
+
196
+ Args:
197
+ cat_logits: Category logits (total_words, num_categories)
198
+ attr_logits: Attribute logits (total_words, num_labels)
199
+ nwords: Number of words per sequence (batch_size,)
200
+
201
+ Returns:
202
+ cat_logits_batch: (batch_size, max_words, num_categories)
203
+ attr_logits_batch: (batch_size, max_words, num_labels)
204
+ """
205
+
206
+ # Split logits by sequence using word counts
207
+ words_per_seq = nwords.tolist()
208
+ cat_logits_split = cat_logits.split(words_per_seq)
209
+ attr_logits_split = attr_logits.split(words_per_seq)
210
+
211
+ # Pad to same length (matching original fairseq approach)
212
+ cat_logits_batch = pad_sequence(cat_logits_split, batch_first=True, padding_value=0)
213
+ attr_logits_batch = pad_sequence(attr_logits_split, batch_first=True, padding_value=0)
214
+
215
+ return cat_logits_batch, attr_logits_batch
216
+
217
    @torch.no_grad()
    def predict_labels(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, word_ids: List[List[int]]
    ) -> List[List[Tuple[str, List[str]]]]:
        """
        Predict POS labels for input sequences (inference only, gradients disabled).

        Args:
            input_ids: Token indices
            attention_mask: Attention mask
            word_ids: Word boundaries (HF-tokenizer-style per-token word ids,
                ``None`` for special tokens)

        Returns:
            List of sequences, each containing (category, [attributes]) per word
        """
        # Convert word_ids to word_mask (1 at each position starting a new word)
        word_mask = self._word_ids_to_word_mask(word_ids, input_ids.shape)

        cat_logits, attr_logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, word_mask=word_mask)

        # NOTE(review): _logits_to_labels is defined elsewhere in this module
        # (not visible in this chunk); it decodes the logit tensors into
        # (category, attribute-list) string pairs.
        return self._logits_to_labels(cat_logits, attr_logits, word_ids)
238
+
239
+ def _word_ids_to_word_mask(self, word_ids: List[List[int]], input_shape: torch.Size) -> torch.Tensor:
240
+ """
241
+ Convert word_ids to word_mask (binary mask indicating word boundaries).
242
+
243
+ Args:
244
+ word_ids: List of word id sequences
245
+ input_shape: Shape of input_ids tensor (batch_size, seq_len)
246
+
247
+ Returns:
248
+ word_mask: Binary tensor where 1 indicates start of word
249
+ """
250
+ batch_size, seq_len = input_shape
251
+ word_mask = torch.zeros(batch_size, seq_len, dtype=torch.long)
252
+
253
+ for batch_idx, seq_word_ids in enumerate(word_ids):
254
+ prev_word_id = None
255
+ for token_idx, word_id in enumerate(seq_word_ids):
256
+ if word_id != prev_word_id:
257
+ word_mask[batch_idx, token_idx] = 1
258
+ prev_word_id = word_id
259
+
260
+ return word_mask
261
+
262
+ def predict_labels_from_text(self, sentences: List[str], tokenizer) -> List[List[Tuple[str, List[str]]]]:
263
+ """
264
+ Predict POS labels from raw text using fairseq-style preprocessing.
265
+
266
+ Args:
267
+ sentences: List of input sentences
268
+ tokenizer: HuggingFace tokenizer
269
+
270
+ Returns:
271
+ List of sequences, each containing (category, [attributes]) per word
272
+ """
273
+ # Tokenize with fairseq-style preprocessing
274
+ encodings = [tokenizer(sent, return_tensors="pt") for sent in sentences]
275
+ word_ids_list = [encoding.word_ids() for encoding in encodings]
276
+
277
+ # Batch the inputs
278
+ max_len = max(encoding["input_ids"].shape[1] for encoding in encodings)
279
+ batch_input_ids = []
280
+ batch_attention_mask = []
281
+
282
+ for encoding in encodings:
283
+ input_ids = encoding["input_ids"][0]
284
+ attention_mask = encoding["attention_mask"][0]
285
+
286
+ # Pad to max length
287
+ pad_len = max_len - len(input_ids)
288
+ if pad_len > 0:
289
+ input_ids = torch.cat([input_ids, torch.ones(pad_len, dtype=torch.long)]) # pad_token_id = 1
290
+ attention_mask = torch.cat([attention_mask, torch.zeros(pad_len, dtype=torch.long)])
291
+
292
+ batch_input_ids.append(input_ids)
293
+ batch_attention_mask.append(attention_mask)
294
+
295
+ batch_input_ids = torch.stack(batch_input_ids)
296
+ batch_attention_mask = torch.stack(batch_attention_mask)
297
+
298
+ return self.predict_labels(batch_input_ids, batch_attention_mask, word_ids_list)
299
+
300
+ def _make_group_name_to_group_attr_vec_idxs(self):
301
+ """Create mapping from group names to their attribute vector indices"""
302
+ group_name_to_group_attr_vec_idxs = {}
303
+ labels = self.config.label_schema["labels"]
304
+ nspecial = 0 # Number of special tokens in label dictionary (like <SEP>)
305
+
306
+ for group_name, group_labels in self.config.label_schema["group_name_to_labels"].items():
307
+ vec_idxs = []
308
+ for label in group_labels:
309
+ if label in labels:
310
+ # Find index in labels list, but subtract nspecial to get vector index
311
+ label_dict_idx = labels.index(label)
312
+ if label_dict_idx >= nspecial: # Skip special tokens
313
+ vec_idxs.append(label_dict_idx - nspecial)
314
+ group_name_to_group_attr_vec_idxs[group_name] = torch.tensor(vec_idxs)
315
+
316
+ return group_name_to_group_attr_vec_idxs
317
+
318
+ def _make_group_masks(self):
319
+ """Create group masks for each category"""
320
+ label_categories = self.config.label_schema["label_categories"]
321
+ group_names = self.config.label_schema["group_names"]
322
+ category_to_group_names = self.config.label_schema["category_to_group_names"]
323
+
324
+ num_cats = len(label_categories)
325
+ num_groups = len(group_names)
326
+
327
+ group_mask = torch.zeros(num_cats, num_groups, dtype=torch.bool)
328
+
329
+ for cat_idx, category in enumerate(label_categories):
330
+ if category in category_to_group_names:
331
+ for group_name in category_to_group_names[category]:
332
+ if group_name in group_names:
333
+ group_idx = group_names.index(group_name)
334
+ group_mask[cat_idx, group_idx] = True
335
+
336
+ return group_mask
337
+
338
+ def _make_category_mappings(self):
339
+ """Create mappings between category vector indices and dictionary indices"""
340
+ labels = self.config.label_schema["labels"]
341
+ label_categories = self.config.label_schema["label_categories"]
342
+
343
+ # Create mapping from category names to vector indices (0-based)
344
+ cat_dict_idx_to_vec_idx = torch.zeros(len(labels), dtype=torch.long)
345
+ cat_vec_idx_to_dict_idx = torch.zeros(len(label_categories), dtype=torch.long)
346
+
347
+ for vec_idx, category in enumerate(label_categories):
348
+ if category in labels:
349
+ dict_idx = labels.index(category)
350
+ cat_dict_idx_to_vec_idx[dict_idx] = vec_idx
351
+ cat_vec_idx_to_dict_idx[vec_idx] = dict_idx
352
+
353
+ return cat_dict_idx_to_vec_idx, cat_vec_idx_to_dict_idx
354
+
355
+ def _count_words_per_sequence(self, word_ids: List[List[int]]) -> List[int]:
356
+ """Count the number of unique words in each sequence."""
357
+ words_per_seq = []
358
+ for seq_word_ids in word_ids:
359
+ unique_word_ids = set(word_id for word_id in seq_word_ids if word_id is not None)
360
+ words_per_seq.append(len(unique_word_ids))
361
+ return words_per_seq
362
+
363
+ def _predict_categories_for_sequence(
364
+ self, cat_logits: torch.Tensor, seq_idx: int, seq_nwords: int, cat_vec_idx_to_dict_idx: torch.Tensor
365
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
366
+ """Predict categories for a single sequence and return both vector and dictionary indices."""
367
+ pred_cat_vec_idxs = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
368
+ pred_cats = cat_vec_idx_to_dict_idx[pred_cat_vec_idxs]
369
+ return pred_cat_vec_idxs, pred_cats
370
+
371
+ def _predict_attributes_for_group(
372
+ self,
373
+ attr_logits: torch.Tensor,
374
+ seq_idx: int,
375
+ seq_nwords: int,
376
+ group_vec_idxs: torch.Tensor,
377
+ seq_group_mask: torch.Tensor,
378
+ group_idx: int,
379
+ ) -> torch.Tensor:
380
+ """Predict attributes for a single group."""
381
+ if len(group_vec_idxs) == 0:
382
+ return torch.zeros(seq_nwords, dtype=torch.long)
383
+
384
+ # Get logits for this group
385
+ group_logits = attr_logits[seq_idx, :seq_nwords, group_vec_idxs]
386
+
387
+ if len(group_vec_idxs) == 1:
388
+ # Single element group: use sigmoid > 0.5
389
+ group_pred = group_logits.sigmoid().ge(0.5).long()
390
+ group_pred_dict_idxs = (group_pred.squeeze() * group_vec_idxs.item()) * seq_group_mask[:, group_idx]
391
+ else:
392
+ # Multi element group: use argmax
393
+ group_pred_vec_idxs = group_logits.max(dim=-1).indices
394
+ group_pred_dict_idxs = group_vec_idxs[group_pred_vec_idxs] * seq_group_mask[:, group_idx]
395
+
396
+ return group_pred_dict_idxs
397
+
398
def _predict_all_attributes_for_sequence(
    self,
    attr_logits: torch.Tensor,
    seq_idx: int,
    seq_nwords: int,
    pred_cat_vec_idxs: torch.Tensor,
    group_name_to_group_attr_vec_idxs: dict,
    group_mask: torch.Tensor,
    group_names: List[str],
) -> torch.Tensor:
    """Predict the attributes of every group for one sequence.

    Returns a ``(seq_nwords, num_groups)`` long tensor of label-dictionary
    indices (0 means "no attribute from this group").
    """
    # Rows of the category->group mask for this sequence's predicted categories.
    seq_group_mask = group_mask[pred_cat_vec_idxs]

    per_group = []
    for group_idx, group_name in enumerate(group_names):
        if group_name not in group_name_to_group_attr_vec_idxs:
            # Unknown group: contributes no attribute (dictionary index 0).
            per_group.append(torch.zeros(seq_nwords, dtype=torch.long))
        else:
            vec_idxs = group_name_to_group_attr_vec_idxs[group_name]
            per_group.append(
                self._predict_attributes_for_group(
                    attr_logits, seq_idx, seq_nwords, vec_idxs, seq_group_mask, group_idx
                )
            )

    if not per_group:
        return torch.zeros(seq_nwords, len(group_names), dtype=torch.long)

    # (num_groups, seq_nwords) -> (seq_nwords, num_groups)
    flattened = [p.squeeze() if p.dim() > 1 else p for p in per_group]
    return torch.stack(flattened).t()
429
+ def _convert_predictions_to_labels(
430
+ self, pred_cats: torch.Tensor, pred_attrs_tensor: torch.Tensor, labels: List[str], group_names: List[str]
431
+ ) -> List[Tuple[str, List[str]]]:
432
+ """Convert prediction tensors to human-readable labels."""
433
+ seq_nwords = pred_cats.size(0)
434
+ seq_predictions = []
435
+
436
+ for word_idx in range(seq_nwords):
437
+ # Category (convert from dictionary index to string)
438
+ cat_dict_idx = pred_cats[word_idx].item()
439
+ if cat_dict_idx < len(labels):
440
+ category = labels[cat_dict_idx]
441
+ else:
442
+ category = "UNK"
443
+
444
+ # Attributes (convert from dictionary indices to strings)
445
+ attributes = []
446
+ for group_idx in range(len(group_names)):
447
+ attr_dict_idx = pred_attrs_tensor[word_idx, group_idx].item()
448
+ if attr_dict_idx > 0 and attr_dict_idx < len(labels): # Skip 0 (empty) and out of bounds
449
+ attributes.append(labels[attr_dict_idx])
450
+
451
+ seq_predictions.append((category, attributes))
452
+
453
+ return seq_predictions
454
+
455
def _logits_to_labels(
    self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_ids: List[List[int]]
) -> List[List[Tuple[str, List[str]]]]:
    """Decode category and attribute logits into per-word
    ``(category, attributes)`` string pairs for every sequence in the batch,
    following fairseq's group-based decoding logic.
    """
    # Build the lookup structures the per-sequence decoders need.
    group_name_to_group_attr_vec_idxs = self._make_group_name_to_group_attr_vec_idxs()
    group_mask = self._make_group_masks()
    _, cat_vec_idx_to_dict_idx = self._make_category_mappings()

    schema = self.config.label_schema
    labels = schema["labels"]
    group_names = schema["group_names"]

    words_per_seq = self._count_words_per_sequence(word_ids)

    batch_predictions: List[List[Tuple[str, List[str]]]] = []
    for seq_idx in range(cat_logits.size(0)):
        seq_nwords = words_per_seq[seq_idx]

        # Category decisions first — they determine which attribute groups
        # are active for each word.
        pred_cat_vec_idxs, pred_cats = self._predict_categories_for_sequence(
            cat_logits, seq_idx, seq_nwords, cat_vec_idx_to_dict_idx
        )

        pred_attrs_tensor = self._predict_all_attributes_for_sequence(
            attr_logits,
            seq_idx,
            seq_nwords,
            pred_cat_vec_idxs,
            group_name_to_group_attr_vec_idxs,
            group_mask,
            group_names,
        )

        batch_predictions.append(
            self._convert_predictions_to_labels(pred_cats, pred_attrs_tensor, labels, group_names)
        )

    return batch_predictions
499
# Make the custom config/model discoverable through the transformers Auto*
# API and serializable with trust_remote_code (auto_map entries in config.json).
IceBertPosConfig.register_for_auto_class()
IceBertPosForTokenClassification.register_for_auto_class("AutoModel")
AutoConfig.register("icebert-pos", IceBertPosConfig)
AutoModel.register(IceBertPosConfig, IceBertPosForTokenClassification)
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "49936": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff