# Copyright (C) Miðeind ehf.
# This file is part of IceBERT POS model conversion.

import logging
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoConfig, AutoModel, PreTrainedModel, RobertaModel

from .configuration import IceBertPosConfig
from .old_label_utils import (
    clean_cats_attrs,
    create_label_dictionary_from_schema,
)

# NOTE: this module defines its own make_dict_idx_to_vec_idx, make_group_masks,
# make_group_name_to_group_attr_vec_idxs and make_vec_idx_to_dict_idx below;
# importing the old_label_utils versions as well would be misleading, since
# the local definitions shadow them at runtime.

logger = logging.getLogger(__name__)


class MultiLabelTokenClassificationHead(nn.Module):
    """Head for multilabel word-level classification tasks."""

    def __init__(self, config: IceBertPosConfig):
        super().__init__()
        self.num_categories = config.num_categories
        self.num_labels = config.num_labels
        self.hidden_size = config.hidden_size

        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.activation_fn = F.relu
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.layer_norm = nn.LayerNorm(self.hidden_size)

        # Category projection: hidden_size -> num_categories
        self.cat_proj = nn.Linear(self.hidden_size, self.num_categories)

        # Attribute projection: (hidden_size + num_categories) -> num_labels.
        # Conditioning on the category distribution lets the attribute
        # predictions depend on the predicted category.
        self.out_proj = nn.Linear(self.hidden_size + self.num_categories, self.num_labels)

    def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            features: Word-level features of shape (batch_size, max_words, hidden_size)

        Returns:
            cat_logits: Category logits of shape (batch_size, max_words, num_categories)
            attr_logits: Attribute logits of shape (batch_size, max_words, num_labels)
        """
        x = self.dropout(features)
        x = self.dense(x)
        x = self.layer_norm(x)
        x = self.activation_fn(x)

        # Predict categories
        cat_logits = self.cat_proj(x)
        cat_probs = torch.softmax(cat_logits, dim=-1)

        # Predict attributes using concatenated features
        attr_input = torch.cat((cat_probs, x), dim=-1)
        attr_logits = self.out_proj(attr_input)

        return cat_logits, attr_logits
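
# A minimal shape sketch of the head (config values are hypothetical):
#
#     config = IceBertPosConfig(hidden_size=768, num_categories=34, num_labels=100, classifier_dropout=0.1)
#     head = MultiLabelTokenClassificationHead(config)
#     features = torch.randn(2, 10, 768)        # (batch, max_words, hidden)
#     cat_logits, attr_logits = head(features)  # (2, 10, 34) and (2, 10, 100)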


class IceBertPosForTokenClassification(PreTrainedModel):
    """
    IceBERT model for multilabel token classification (POS tagging).

    This model performs word-level POS tagging by:
    1. Encoding input with RoBERTa
    2. Aggregating subword tokens to word-level representations
    3. Predicting both categories and attributes for each word
    """

    config_class = IceBertPosConfig

    def __init__(self, config: IceBertPosConfig):
        super().__init__(config)
        self.config = config
        self.num_categories = config.num_categories
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = MultiLabelTokenClassificationHead(config)

        # Create label dictionary and mappings (mimicking old fairseq model)
        self.label_dictionary = create_label_dictionary_from_schema(config.label_schema)
        self._setup_label_mappings()

        # Initialize weights and apply final processing
        self.post_init()

    def _setup_label_mappings(self):
        """Setup label mappings similar to the old fairseq model."""
        schema = self.config.label_schema

        self.group_name_to_group_attr_vec_idxs = make_group_name_to_group_attr_vec_idxs(self.label_dictionary, schema)
        self.cat_dict_idx_to_vec_idx = make_dict_idx_to_vec_idx(self.label_dictionary, schema.label_categories)
        self.cat_vec_idx_to_dict_idx = make_vec_idx_to_dict_idx(self.label_dictionary, schema.label_categories)
        self.group_mask = make_group_masks(self.label_dictionary, schema)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        word_mask: torch.Tensor,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            input_ids: Token indices of shape (batch_size, sequence_length)
            attention_mask: Attention mask of shape (batch_size, sequence_length)
            word_mask: Binary mask indicating word boundaries (1 = word start) of shape (batch_size, sequence_length)

        Returns:
            cat_logits: Category logits of shape (batch_size, max_words, num_categories)
            attr_logits: Attribute logits of shape (batch_size, max_words, num_labels)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get RoBERTa outputs
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        x = outputs[0]  # (batch_size, seq_len, hidden)

        # Mirror the aggregation logic of the original fairseq model:
        # average the subword (BPE) vectors of each word into one word vector.
        x = x[:, 1:-1, :]
        starts = word_mask[:, 1:-1]  # remove bos, eos
        ends = starts.roll(-1, dims=[-1]).nonzero()[:, -1] + 1
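        # Worked example: starts = [1, 0, 0, 1, 0] marks two words of lengths
        # 3 and 2; starts.roll(-1) = [0, 0, 1, 0, 1], and its nonzero
        # positions + 1 give the exclusive word ends [3, 5].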
        starts = starts.nonzero().tolist()
        mean_words = []
        for (seq_idx, token_idx), end in zip(starts, ends):
            mean_words.append(x[seq_idx, token_idx:end, :].mean(dim=0))
        words = torch.stack(mean_words)  # (total_words, hidden_size)
        nwords = word_mask.sum(dim=-1)  # number of words per sequence
        (cat_logits, attr_logits) = self.classifier(words)

        # (total_words) x depth -> batch x max_words x depth
        cat_logits = pad_sequence(cat_logits.split(nwords.tolist()), padding_value=0, batch_first=True)
        attr_logits = pad_sequence(attr_logits.split(nwords.tolist()), padding_value=0, batch_first=True)
        return cat_logits, attr_logits
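
    # Shape sketch (hypothetical numbers): for a batch of 2 sequences whose
    # word_mask marks 5 and 7 word starts, cat_logits comes back as
    # (2, 7, num_categories) and attr_logits as (2, 7, num_labels), zero-padded
    # past each sequence's word count.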

    def _aggregate_subword_tokens(
        self, sequence_output: torch.Tensor, word_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Aggregate subword token representations to word-level representations.
        Following the original fairseq approach by averaging subword tokens within each word.

        Args:
            sequence_output: subword token representations (batch_size, seq_len, hidden_size)
            word_mask: Binary mask where 1 indicates start of word (batch_size, seq_len)

        Returns:
            word_features: Word-level features (batch_size, max_words, hidden_size)
            nwords: Number of words per sequence (batch_size,)
        """
        # TODO: Verify that BOS and EOS are handled correctly - I'm worried that this does not correctly handle padding
        # Remove BOS and EOS tokens (first and last positions)
        x = sequence_output[:, 1:-1, :]  # (batch_size, seq_len-2, hidden_size)
        starts = word_mask[:, 1:-1]  # (batch_size, seq_len-2)

        # Count words per sequence
        nwords = starts.sum(dim=-1)  # (batch_size,)

        # Find word boundaries and average tokens within each word
        mean_words = []
        batch_size, seq_len, hidden_size = x.shape

        for batch_idx in range(batch_size):
            seq_starts = starts[batch_idx]  # (seq_len-2,)
            seq_x = x[batch_idx]  # (seq_len-2, hidden_size)

            # Find start positions of words
            start_positions = seq_starts.nonzero(as_tuple=True)[0]  # positions where words start

            if len(start_positions) == 0:
                continue

            # Calculate end positions (start of next word or end of sequence)
            end_positions = torch.cat([start_positions[1:], torch.tensor([seq_len], device=start_positions.device)])

            # Average tokens within each word
            for start_pos, end_pos in zip(start_positions, end_positions):
                word_tokens = seq_x[start_pos:end_pos]  # tokens in this word
                word_repr = word_tokens.mean(dim=0)  # average representation
                mean_words.append(word_repr)

        if len(mean_words) == 0:
            return torch.empty(0, sequence_output.size(-1), device=sequence_output.device), nwords

        return torch.stack(mean_words), nwords

    def _reshape_to_batch_format(
        self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, nwords: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Reshape word-level predictions back to batch format.
        Following the original fairseq approach with pad_sequence.

        Args:
            cat_logits: Category logits (total_words, num_categories)
            attr_logits: Attribute logits (total_words, num_labels)
            nwords: Number of words per sequence (batch_size,)

        Returns:
            cat_logits_batch: (batch_size, max_words, num_categories)
            attr_logits_batch: (batch_size, max_words, num_labels)
        """

        # Split logits by sequence using word counts
        words_per_seq = nwords.tolist()
        cat_logits_split = cat_logits.split(words_per_seq)
        attr_logits_split = attr_logits.split(words_per_seq)

        # Pad to same length (matching original fairseq approach)
        cat_logits_batch = pad_sequence(cat_logits_split, batch_first=True, padding_value=0)
        attr_logits_batch = pad_sequence(attr_logits_split, batch_first=True, padding_value=0)

        return cat_logits_batch, attr_logits_batch

    @torch.no_grad()
    def predict_labels(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, word_ids: List[List[int]]
    ) -> List[List[Tuple[str, List[str]]]]:
        """
        Predict POS labels for input sequences.

        Args:
            input_ids: Token indices
            attention_mask: Attention mask
            word_ids: Word boundaries

        Returns:
            List of sequences, each containing (category, [attributes]) per word
        """
        # Convert word_ids to word_mask
        word_mask = self._word_ids_to_word_mask(word_ids, input_ids.shape)

        cat_logits, attr_logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, word_mask=word_mask)

        return self._logits_to_labels(cat_logits, attr_logits, word_mask)

    def _word_ids_to_word_mask(self, word_ids: List[List[int]], input_shape: torch.Size) -> torch.Tensor:
        """
        Convert word_ids to word_mask (binary mask indicating word boundaries).

        Args:
            word_ids: List of word id sequences
            input_shape: Shape of input_ids tensor (batch_size, seq_len)

        Returns:
            word_mask: Binary tensor where 1 indicates start of word (batch_size, seq_len)
        """
        batch_size, seq_len = input_shape
        word_mask = torch.zeros(batch_size, seq_len, dtype=torch.long)

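        # Worked example (hypothetical): word_ids [None, 0, 0, 1, None] for an
        # encoded sequence "<s> w1a w1b w2 </s>" yields word_mask
        # [0, 1, 0, 1, 0]: the BOS/EOS Nones are skipped and only the first
        # subword of each word is marked.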
        for batch_idx, seq_word_ids in enumerate(word_ids):
            # Truncate to exclude BOS and EOS tokens (first and last)
            truncated_word_ids = seq_word_ids[1:-1]
            prev_word_id = None
            for token_idx, word_id in enumerate(truncated_word_ids):
                # Skip special/padding positions (word_id is None) so a trailing
                # EOS/PAD run is not miscounted as a word start.
                if word_id is not None and word_id != prev_word_id:
                    word_mask[batch_idx, token_idx + 1] = 1  # +1 to account for BOS
                prev_word_id = word_id

            # Debug logging to match fairseq model
            logger.debug(f"Word mask: {word_mask[batch_idx].tolist()}")

        return word_mask

    def predict_labels_from_text(self, sentences: List[str], tokenizer) -> List[List[Tuple[str, List[str]]]]:
        """
        Predict POS labels from raw text using fairseq-style preprocessing.

        Args:
            sentences: List of input sentences
            tokenizer: HuggingFace tokenizer

        Returns:
            List of sequences, each containing (category, [attributes]) per word
        """
        # Split sentences by spaces to get proper word boundaries
        # This fixes the issue where tokens like "Kl." get split incorrectly
        sentences_split = [sentence.split() for sentence in sentences]
        
        # Use batch_encode_plus with is_split_into_words=True to preserve word boundaries
        encoding = tokenizer.batch_encode_plus(
            sentences_split,
            return_tensors="pt",
            padding=True,
            is_split_into_words=True,
            add_special_tokens=True
        )
        
        batch_input_ids = encoding["input_ids"]
        batch_attention_mask = encoding["attention_mask"]
        word_ids_list = [encoding.word_ids(i) for i in range(len(sentences))]  # requires a fast tokenizer
        
        # Debug logging to match fairseq model
        for i in range(len(sentences)):
            logger.debug(f"Encoded tokens: {batch_input_ids[i]}")
            logger.debug(f"Decoded tokens: {tokenizer.convert_ids_to_tokens(batch_input_ids[i].tolist())}")
            logger.debug(f"Word IDs: {word_ids_list[i]}")

        return self.predict_labels(batch_input_ids, batch_attention_mask, word_ids_list)
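
    # Usage sketch (the checkpoint path is hypothetical; the returned tags
    # depend on the label schema shipped with the checkpoint):
    #
    #     from transformers import AutoTokenizer
    #     tokenizer = AutoTokenizer.from_pretrained("path/to/icebert-pos")
    #     model = IceBertPosForTokenClassification.from_pretrained("path/to/icebert-pos")
    #     tags = model.predict_labels_from_text(["Hún las bókina ."], tokenizer)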

    def _logits_to_labels(
        self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_mask: torch.Tensor
    ) -> List[List[Tuple[str, List[str]]]]:
        """
        Convert logits to human-readable labels using fairseq's group-based logic.
        Copied from the old model's logits_to_labels method.
        """
        # logits: Batch x Time x Labels
        bsz, _, num_cats = cat_logits.shape
        _, _, num_attrs = attr_logits.shape
        nwords = word_mask.sum(-1)

        assert num_attrs == len(self.config.label_schema.labels)
        assert num_cats == len(self.config.label_schema.label_categories)

        batch_cats = []
        batch_attrs = []
        for seq_idx in range(bsz):
            seq_nwords = nwords[seq_idx]
            pred_cat_vec_idxs = cat_logits[seq_idx, :seq_nwords].max(dim=-1).indices
            pred_cats = self.cat_vec_idx_to_dict_idx[pred_cat_vec_idxs]

            group_mask = self.group_mask[pred_cat_vec_idxs]
            offset = self.label_dictionary.nspecial
            pred_attrs = []
            for group_idx, group_name in enumerate(self.config.label_schema.group_names):
                group_vec_idxs = self.group_name_to_group_attr_vec_idxs[group_name]
                # group_logits: seq_nwords x (number of attributes in this group)
                group_logits = attr_logits[seq_idx, :seq_nwords, group_vec_idxs]
                if len(group_vec_idxs) == 1:
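                    # Binary group: a single attribute that is either present
                    # or absent, decoded by thresholding its sigmoid at 0.5.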
                    group_pred = group_logits.sigmoid().ge(0.5).long()
                    group_pred_dict_idxs = (group_pred.squeeze() * (group_vec_idxs.item() + offset)).T.to(
                        "cpu"
                    ) * group_mask[:, group_idx]
                else:
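                    # Multi-valued group: pick the highest-scoring attribute.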
                    group_pred_vec_idxs = group_logits.max(dim=-1).indices
                    group_pred_dict_idxs = (group_vec_idxs[group_pred_vec_idxs] + offset) * group_mask[:, group_idx]
                pred_attrs.append(group_pred_dict_idxs)

            pred_attrs = torch.stack([p.squeeze() for p in pred_attrs]).t()

            batch_cats.append(pred_cats)
            batch_attrs.append(pred_attrs)

        predictions = [
            clean_cats_attrs(
                self.label_dictionary,
                self.config.label_schema,
                seq_cats,
                seq_attrs,
            )
            for seq_cats, seq_attrs in zip(batch_cats, batch_attrs)
        ]

        return predictions


def make_vec_idx_to_dict_idx(dictionary, labels, device="cpu", fill_value=-100):
    vec_idx_to_dict_idx = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
    for vec_idx, label in enumerate(labels):
        vec_idx_to_dict_idx[vec_idx] = dictionary.index(label)
    return vec_idx_to_dict_idx
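
# Example (hypothetical values): if a category label sits at position 0 of
# `labels` and dictionary.index(label) == 7, then
# make_vec_idx_to_dict_idx(dictionary, labels)[0] == 7 while
# make_dict_idx_to_vec_idx(dictionary, labels)[7] == 0; the two tables are
# inverse lookups between model output positions and dictionary indices.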


def make_group_masks(dictionary, schema, device="cpu"):
    num_groups = len(schema.group_names)
    offset = dictionary.nspecial
    num_labels = len(dictionary) - offset
    ret_mask = torch.zeros(num_labels, num_groups, dtype=torch.int64, device=device)
    for cat, cat_group_names in schema.category_to_group_names.items():
        cat_label_idx = dictionary.index(cat)
        cat_vec_idx = schema.label_categories.index(cat)
        for group_name in cat_group_names:
            ret_mask[cat_vec_idx, schema.group_names.index(group_name)] = 1
        assert cat_label_idx != dictionary.unk()
    for cat in schema.label_categories:
        cat_label_idx = dictionary.index(cat)
        assert cat_label_idx != dictionary.unk()
    return ret_mask
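
# Example (hypothetical schema): if a category licenses the groups
# ["gender", "number", "case"], the mask row at that category's vector index
# has ones in exactly those three group columns and zeros elsewhere.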


def make_group_name_to_group_attr_vec_idxs(dict_, schema):
    offset = dict_.nspecial
    group_names = schema.group_name_to_labels.keys()
    name_to_labels = schema.group_name_to_labels
    group_name_to_group_attr_vec_idxs = {
        name: torch.tensor([dict_.index(item) - offset for item in name_to_labels[name]]) for name in group_names
    }
    return group_name_to_group_attr_vec_idxs


def make_dict_idx_to_vec_idx(dictionary, cats, device="cpu", fill_value=-100):
    # NOTE: dictionary entries that are not in `cats` silently keep fill_value
    # (-100) rather than raising an error.
    map_tgt = torch.full((len(dictionary),), device=device, fill_value=fill_value, dtype=torch.long)
    for vec_idx, label in enumerate(cats):
        map_tgt[dictionary.index(label)] = vec_idx
    return map_tgt


AutoConfig.register("icebert-pos", IceBertPosConfig)
AutoModel.register(IceBertPosConfig, IceBertPosForTokenClassification)
IceBertPosConfig.register_for_auto_class()
IceBertPosForTokenClassification.register_for_auto_class("AutoModel")
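
# With the registrations above, a converted checkpoint can be loaded through
# the Auto API (the path is hypothetical):
#
#     model = AutoModel.from_pretrained("path/to/converted-checkpoint", trust_remote_code=True)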