File size: 20,640 Bytes

2d923bf

# Copyright (C) Miðeind ehf.
# This file is part of IceBERT POS model conversion.

import logging
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoConfig, AutoModel, PreTrainedModel, RobertaModel

from .configuration import IceBertPosConfig

logger = logging.getLogger(__name__)


class MultiLabelTokenClassificationHead(nn.Module):
    """Head for multilabel word-level classification tasks."""

    def __init__(self, config: IceBertPosConfig):
        super().__init__()
        self.num_categories = config.num_categories
        self.num_labels = config.num_labels
        self.hidden_size = config.hidden_size

        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.activation_fn = F.relu
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.layer_norm = nn.LayerNorm(self.hidden_size)

        # Category projection: hidden_size -> num_categories
        self.cat_proj = nn.Linear(self.hidden_size, self.num_categories)

        # Attribute projection: (hidden_size + num_categories) -> num_labels
        self.out_proj = nn.Linear(self.hidden_size + self.num_categories, self.num_labels)

    def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            features: Word-level features of shape (total_words, hidden_size)

        Returns:
            cat_logits: Category logits of shape (total_words, num_categories)
            attr_logits: Attribute logits of shape (total_words, num_labels)
        """
        x = self.dropout(features)
        x = self.dense(x)
        x = self.layer_norm(x)
        x = self.activation_fn(x)

        # Predict categories
        cat_logits = self.cat_proj(x)
        cat_probs = torch.softmax(cat_logits, dim=-1)

        # Predict attributes using concatenated features
        attr_input = torch.cat((cat_probs, x), dim=-1)
        attr_logits = self.out_proj(attr_input)

        return cat_logits, attr_logits


class IceBertPosForTokenClassification(PreTrainedModel):
    """
    IceBERT model for multilabel token classification (POS tagging).

    This model performs word-level POS tagging by:
    1. Encoding input with RoBERTa
    2. Aggregating subword tokens to word-level representations
    3. Predicting both categories and attributes for each word
    """

    config_class = IceBertPosConfig

    def __init__(self, config: IceBertPosConfig):
        super().__init__(config)
        self.config = config
        self.num_categories = config.num_categories
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = MultiLabelTokenClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        word_mask: torch.Tensor,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            input_ids: Token indices of shape (batch_size, sequence_length)
            attention_mask: Attention mask of shape (batch_size, sequence_length)
            word_mask: Binary mask indicating word boundaries (1 = word start)

        Returns:
            cat_logits: Category logits of shape (batch_size, max_words, num_categories)
            attr_logits: Attribute logits of shape (batch_size, max_words, num_labels)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get RoBERTa outputs
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]  # (batch_size, seq_len, hidden_size)

        # Aggregate subword tokens to word-level representations using word_mask
        word_features, nwords = self._aggregate_subword_tokens(sequence_output, word_mask)

        # Apply classification head
        cat_logits, attr_logits = self.classifier(word_features)

        # Reshape back to batch format using word counts
        cat_logits_batch, attr_logits_batch = self._reshape_to_batch_format(cat_logits, attr_logits, nwords)

        return cat_logits_batch, attr_logits_batch

    def _aggregate_subword_tokens(
        self, sequence_output: torch.Tensor, word_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Aggregate subword token representations to word-level representations.
        Following the original fairseq approach by averaging subword tokens within each word.

        Args:
            sequence_output: subword token representations (batch_size, seq_len, hidden_size)
            word_mask: Binary mask where 1 indicates start of word (batch_size, seq_len)

        Returns:
            word_features: Word-level features (total_words, hidden_size)
            nwords: Number of words per sequence (batch_size,)
        """
        # TODO: Verify that BOS and EOS are handled correctly - I'm worried that this does not correctly handle padding
        # Remove BOS and EOS tokens (first and last positions)
        x = sequence_output[:, 1:-1, :]  # (batch_size, seq_len-2, hidden_size)
        starts = word_mask[:, 1:-1]  # (batch_size, seq_len-2)

        # Count words per sequence
        nwords = starts.sum(dim=-1)  # (batch_size,)

        # Find word boundaries and average tokens within each word
        mean_words = []
        batch_size, seq_len, hidden_size = x.shape

        for batch_idx in range(batch_size):
            seq_starts = starts[batch_idx]  # (seq_len-2,)
            seq_x = x[batch_idx]  # (seq_len-2, hidden_size)

            # Find start positions of words
            start_positions = seq_starts.nonzero(as_tuple=True)[0]  # positions where words start

            if len(start_positions) == 0:
                continue

            # Calculate end positions (start of next word or end of sequence)
            end_positions = torch.cat([start_positions[1:], torch.tensor([seq_len], device=start_positions.device)])

            # Average tokens within each word
            for start_pos, end_pos in zip(start_positions, end_positions):
                word_tokens = seq_x[start_pos:end_pos]  # tokens in this word
                word_repr = word_tokens.mean(dim=0)  # average representation
                mean_words.append(word_repr)

        if len(mean_words) == 0:
            return torch.empty(0, sequence_output.size(-1), device=sequence_output.device), nwords

        return torch.stack(mean_words), nwords

    def _reshape_to_batch_format(
        self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, nwords: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Reshape word-level predictions back to batch format.
        Following the original fairseq approach with pad_sequence.

        Args:
            cat_logits: Category logits (total_words, num_categories)
            attr_logits: Attribute logits (total_words, num_labels)
            nwords: Number of words per sequence (batch_size,)

        Returns:
            cat_logits_batch: (batch_size, max_words, num_categories)
            attr_logits_batch: (batch_size, max_words, num_labels)
        """

        # Split logits by sequence using word counts
        words_per_seq = nwords.tolist()
        cat_logits_split = cat_logits.split(words_per_seq)
        attr_logits_split = attr_logits.split(words_per_seq)

        # Pad to same length (matching original fairseq approach)
        cat_logits_batch = pad_sequence(cat_logits_split, batch_first=True, padding_value=0)
        attr_logits_batch = pad_sequence(attr_logits_split, batch_first=True, padding_value=0)

        return cat_logits_batch, attr_logits_batch

    @torch.no_grad()
    def predict_labels(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, word_ids: List[List[int]]
    ) -> List[List[Tuple[str, List[str]]]]:
        """
        Predict POS labels for input sequences.

        Args:
            input_ids: Token indices
            attention_mask: Attention mask
            word_ids: Word boundaries

        Returns:
            List of sequences, each containing (category, [attributes]) per word
        """
        # Convert word_ids to word_mask
        word_mask = self._word_ids_to_word_mask(word_ids, input_ids.shape)

        cat_logits, attr_logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, word_mask=word_mask)

        return self._logits_to_labels(cat_logits, attr_logits, word_ids)

    def _word_ids_to_word_mask(self, word_ids: List[List[int]], input_shape: torch.Size) -> torch.Tensor:
        """
        Convert word_ids to word_mask (binary mask indicating word boundaries).

        Args:
            word_ids: List of word id sequences
            input_shape: Shape of input_ids tensor (batch_size, seq_len)

        Returns:
            word_mask: Binary tensor where 1 indicates start of word
        """
        batch_size, seq_len = input_shape
        word_mask = torch.zeros(batch_size, seq_len, dtype=torch.long)

        for batch_idx, seq_word_ids in enumerate(word_ids):
            prev_word_id = None
            for token_idx, word_id in enumerate(seq_word_ids):
                if word_id != prev_word_id:
                    word_mask[batch_idx, token_idx] = 1
                prev_word_id = word_id

        return word_mask

    def predict_labels_from_text(self, sentences: List[str], tokenizer) -> List[List[Tuple[str, List[str]]]]:
        """
        Predict POS labels from raw text using fairseq-style preprocessing.

        Args:
            sentences: List of input sentences
            tokenizer: HuggingFace tokenizer

        Returns:
            List of sequences, each containing (category, [attributes]) per word
        """
        # Tokenize with fairseq-style preprocessing
        encodings = [tokenizer(sent, return_tensors="pt") for sent in sentences]
        word_ids_list = [encoding.word_ids() for encoding in encodings]

        # Batch the inputs
        max_len = max(encoding["input_ids"].shape[1] for encoding in encodings)
        batch_input_ids = []
        batch_attention_mask = []

        for encoding in encodings:
            input_ids = encoding["input_ids"][0]
            attention_mask = encoding["attention_mask"][0]

            # Pad to max length
            pad_len = max_len - len(input_ids)
            if pad_len > 0:
                input_ids = torch.cat([input_ids, torch.ones(pad_len, dtype=torch.long)])  # pad_token_id = 1
                attention_mask = torch.cat([attention_mask, torch.zeros(pad_len, dtype=torch.long)])

            batch_input_ids.append(input_ids)
            batch_attention_mask.append(attention_mask)

        batch_input_ids = torch.stack(batch_input_ids)
        batch_attention_mask = torch.stack(batch_attention_mask)

        return self.predict_labels(batch_input_ids, batch_attention_mask, word_ids_list)

    def _make_group_name_to_group_attr_vec_idxs(self):
        """Create mapping from group names to their attribute vector indices"""
        group_name_to_group_attr_vec_idxs = {}
        labels = self.config.label_schema["labels"]
        nspecial = 0  # Number of special tokens in label dictionary (like <SEP>)

        for group_name, group_labels in self.config.label_schema["group_name_to_labels"].items():
            vec_idxs = []
            for label in group_labels:
                if label in labels:
                    # Find index in labels list, but subtract nspecial to get vector index
                    label_dict_idx = labels.index(label)
                    if label_dict_idx >= nspecial:  # Skip special tokens
                        vec_idxs.append(label_dict_idx - nspecial)
            group_name_to_group_attr_vec_idxs[group_name] = torch.tensor(vec_idxs)

        return group_name_to_group_attr_vec_idxs

    def _make_group_masks(self):
        """Create group masks for each category"""
        label_categories = self.config.label_schema["label_categories"]
        group_names = self.config.label_schema["group_names"]
        category_to_group_names = self.config.label_schema["category_to_group_names"]

        num_cats = len(label_categories)
        num_groups = len(group_names)

        group_mask = torch.zeros(num_cats, num_groups, dtype=torch.bool)

        for cat_idx, category in enumerate(label_categories):
            if category in category_to_group_names:
                for group_name in category_to_group_names[category]:
                    if group_name in group_names:
                        group_idx = group_names.index(group_name)
                        group_mask[cat_idx, group_idx] = True

        return group_mask

    def _make_category_mappings(self):
        """Create mappings between category vector indices and dictionary indices"""
        labels = self.config.label_schema["labels"]
        label_categories = self.config.label_schema["label_categories"]

        # Create mapping from category names to vector indices (0-based)
        cat_dict_idx_to_vec_idx = torch.zeros(len(labels), dtype=torch.long)
        cat_vec_idx_to_dict_idx = torch.zeros(len(label_categories), dtype=torch.long)

        for vec_idx, category in enumerate(label_categories):
            if category in labels:
                dict_idx = labels.index(category)
                cat_dict_idx_to_vec_idx[dict_idx] = vec_idx
                cat_vec_idx_to_dict_idx[vec_idx] = dict_idx

        return cat_dict_idx_to_vec_idx, cat_vec_idx_to_dict_idx

    def _count_words_per_sequence(self, word_ids: List[List[int]]) -> List[int]:
        """Count the number of unique words in each sequence."""
        words_per_seq = []
        for seq_word_ids in word_ids:
            unique_word_ids = set(word_id for word_id in seq_word_ids if word_id is not None)
            words_per_seq.append(len(unique_word_ids))
        return words_per_seq

    def _predict_categories_for_sequence(
        self, cat_logits: torch.Tensor, seq_idx: int, seq_nwords: int, cat_vec_idx_to_dict_idx: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predict categories for a single sequence and return both vector and dictionary indices."""
        pred_cat_vec_idxs = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
        pred_cats = cat_vec_idx_to_dict_idx[pred_cat_vec_idxs]
        return pred_cat_vec_idxs, pred_cats

    def _predict_attributes_for_group(
        self,
        attr_logits: torch.Tensor,
        seq_idx: int,
        seq_nwords: int,
        group_vec_idxs: torch.Tensor,
        seq_group_mask: torch.Tensor,
        group_idx: int,
    ) -> torch.Tensor:
        """Predict attributes for a single group."""
        if len(group_vec_idxs) == 0:
            return torch.zeros(seq_nwords, dtype=torch.long)

        # Get logits for this group
        group_logits = attr_logits[seq_idx, :seq_nwords, group_vec_idxs]

        if len(group_vec_idxs) == 1:
            # Single element group: use sigmoid > 0.5
            group_pred = group_logits.sigmoid().ge(0.5).long()
            group_pred_dict_idxs = (group_pred.squeeze() * group_vec_idxs.item()) * seq_group_mask[:, group_idx]
        else:
            # Multi element group: use argmax
            group_pred_vec_idxs = group_logits.max(dim=-1).indices
            group_pred_dict_idxs = group_vec_idxs[group_pred_vec_idxs] * seq_group_mask[:, group_idx]

        return group_pred_dict_idxs

    def _predict_all_attributes_for_sequence(
        self,
        attr_logits: torch.Tensor,
        seq_idx: int,
        seq_nwords: int,
        pred_cat_vec_idxs: torch.Tensor,
        group_name_to_group_attr_vec_idxs: dict,
        group_mask: torch.Tensor,
        group_names: List[str],
    ) -> torch.Tensor:
        """Predict all attributes for a single sequence."""
        seq_group_mask = group_mask[pred_cat_vec_idxs]
        pred_attrs = []

        for group_idx, group_name in enumerate(group_names):
            if group_name not in group_name_to_group_attr_vec_idxs:
                pred_attrs.append(torch.zeros(seq_nwords, dtype=torch.long))
                continue

            group_vec_idxs = group_name_to_group_attr_vec_idxs[group_name]
            group_pred_dict_idxs = self._predict_attributes_for_group(
                attr_logits, seq_idx, seq_nwords, group_vec_idxs, seq_group_mask, group_idx
            )
            pred_attrs.append(group_pred_dict_idxs)

        # Stack predictions
        if pred_attrs:
            return torch.stack([p.squeeze() if p.dim() > 1 else p for p in pred_attrs]).t()
        else:
            return torch.zeros(seq_nwords, len(group_names), dtype=torch.long)

    def _convert_predictions_to_labels(
        self, pred_cats: torch.Tensor, pred_attrs_tensor: torch.Tensor, labels: List[str], group_names: List[str]
    ) -> List[Tuple[str, List[str]]]:
        """Convert prediction tensors to human-readable labels."""
        seq_nwords = pred_cats.size(0)
        seq_predictions = []

        for word_idx in range(seq_nwords):
            # Category (convert from dictionary index to string)
            cat_dict_idx = pred_cats[word_idx].item()
            if cat_dict_idx < len(labels):
                category = labels[cat_dict_idx]
            else:
                category = "UNK"

            # Attributes (convert from dictionary indices to strings)
            attributes = []
            for group_idx in range(len(group_names)):
                attr_dict_idx = pred_attrs_tensor[word_idx, group_idx].item()
                if attr_dict_idx > 0 and attr_dict_idx < len(labels):  # Skip 0 (empty) and out of bounds
                    attributes.append(labels[attr_dict_idx])

            seq_predictions.append((category, attributes))

        return seq_predictions

    def _logits_to_labels(
        self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_ids: List[List[int]]
    ) -> List[List[Tuple[str, List[str]]]]:
        """
        Convert logits to human-readable labels using fairseq's group-based logic.
        """
        # Create necessary mappings
        group_name_to_group_attr_vec_idxs = self._make_group_name_to_group_attr_vec_idxs()
        group_mask = self._make_group_masks()
        cat_dict_idx_to_vec_idx, cat_vec_idx_to_dict_idx = self._make_category_mappings()

        label_schema = self.config.label_schema
        labels = label_schema["labels"]
        group_names = label_schema["group_names"]

        batch_size = cat_logits.size(0)
        words_per_seq = self._count_words_per_sequence(word_ids)
        batch_predictions = []

        for seq_idx in range(batch_size):
            seq_nwords = words_per_seq[seq_idx]

            # Predict categories
            pred_cat_vec_idxs, pred_cats = self._predict_categories_for_sequence(
                cat_logits, seq_idx, seq_nwords, cat_vec_idx_to_dict_idx
            )

            # Predict attributes
            pred_attrs_tensor = self._predict_all_attributes_for_sequence(
                attr_logits,
                seq_idx,
                seq_nwords,
                pred_cat_vec_idxs,
                group_name_to_group_attr_vec_idxs,
                group_mask,
                group_names,
            )

            # Convert to labels
            seq_predictions = self._convert_predictions_to_labels(pred_cats, pred_attrs_tensor, labels, group_names)
            batch_predictions.append(seq_predictions)

        return batch_predictions


AutoConfig.register("icebert-pos", IceBertPosConfig)
AutoModel.register(IceBertPosConfig, IceBertPosForTokenClassification)
IceBertPosConfig.register_for_auto_class()
IceBertPosForTokenClassification.register_for_auto_class("AutoModel")