changing interface to accept words instead of a sentence string. Documenting the truncation flag

Files changed:
- README.md (+50, -7)
- modeling.py (+134, -96)

README.md
CHANGED

@@ -9,10 +9,14 @@ paper: https://arxiv.org/abs/2201.05601

---
## Prediction Methods

The model provides several prediction methods:

- **`prepare_inputs(words, tokenizer, truncate=False)`**: Prepares inputs for a single list of words, returning tensors without a batch dimension.
- **`predict_labels(input_ids, attention_mask, word_mask)`**: Low-level prediction from prepared tensors with a batch dimension.
- **`predict_labels_from_text(sentences, tokenizer, truncate=False)`**: Returns structured predictions as (category, [attributes]) tuples from word lists. These are more readable than raw tag strings and better suited to further processing.
- **`predict_ifd_labels_from_text(sentences, tokenizer, truncate=False)`**: Returns predictions in IFD (Icelandic Frequency Dictionary) format from word lists. Use this for evaluation against MIM-GOLD datasets or when you need compatibility with traditional Icelandic POS taggers.

All methods accept pre-tokenized word lists rather than raw sentences, which gives you full control over tokenization.

```python
from transformers import AutoModel, AutoTokenizer

@@ -22,16 +26,17 @@ tokenizer = AutoTokenizer.from_pretrained("mideind/IceBERT-PoS")

# Example sentence
sentence = "Ég veit að þú kemur í kvöld til mín ."
sentence_words = sentence.split()

# Get predictions in (category, [attributes]) format
result = model.predict_labels_from_text([sentence_words], tokenizer)
expected = [
    [
        ("fp", ["1", "sing", "nom"]),
        ("sf", ["sing", "act", "1", "pres"]),
        ("c", []),
        ("fp", ["2", "sing", "nom"]),
        ("sf", ["sing", "act", "2", "pres"]),
        ("af", []),
        ("n", ["neut", "sing", "acc"]),
        ("af", []),

@@ -43,10 +48,48 @@ assert result == expected, f"Expected {expected}, but got {result}"
print("Test passed successfully!")

# Get predictions in IFD format (for MIM-GOLD evaluation)
ifd_result = model.predict_ifd_labels_from_text([sentence_words], tokenizer)
ifd_expected = [
    ["fp1en", "sfg1en", "c", "fp2en", "sfg2en", "af", "nheo", "af", "fp1ee", "pl"]
]
assert ifd_result == ifd_expected, f"Expected {ifd_expected}, but got {ifd_result}"
print("IFD conversion test passed successfully!")

# Alternative: use prepare_inputs for single sentence prediction
input_ids, attention_mask, word_mask = model.prepare_inputs(sentence_words, tokenizer)
single_result = model.predict_labels(input_ids.unsqueeze(0), attention_mask.unsqueeze(0), word_mask.unsqueeze(0))
assert single_result == expected, f"Expected {expected}, but got {single_result}"
print("Single sentence prediction test passed successfully!")
```
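
Since `predict_labels_from_text` takes a list of word lists, several sentences can also be tagged in one batched call. A minimal sketch follows; the second sentence is only an illustrative example and its expected tags are not shown here.

```python
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("mideind/IceBERT-PoS", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("mideind/IceBERT-PoS")

# Batch of pre-tokenized sentences; the second one is a made-up example.
batch = [
    "Ég veit að þú kemur í kvöld til mín .".split(),
    "Hún las bókina í gær .".split(),
]
batch_result = model.predict_labels_from_text(batch, tokenizer)
assert len(batch_result) == len(batch)

# One (category, [attributes]) tuple per word, per sentence
for words, labels in zip(batch, batch_result):
    for word, (category, attributes) in zip(words, labels):
        print(word, category, attributes)
```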

## Handling Long Sequences with Truncation

By default `truncate=False`, so input is never silently cut off; this avoids hard-to-debug issues. However, very long sequences will then raise an error:

```python
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("mideind/IceBERT-PoS", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("mideind/IceBERT-PoS")

# Create a very long sentence that exceeds the model's limits
words = ["Þetta", "er", "mjög", "löng", "setning"] * 200  # Very long sentence
print(f"Input length: {len(words)} words")

# This will crash because the sequence length exceeds the model's limits
try:
    result = model.predict_labels_from_text([words], tokenizer, truncate=False)
    print("This shouldn't print - sequence was too long!")
except Exception as e:
    print(f"Error as expected: {type(e).__name__}")

# Use truncate=True for long sequences
result_truncated = model.predict_labels_from_text([words], tokenizer, truncate=True)
print(f"Truncated result length: {len(result_truncated[0])} predictions")
print("Warning: Output length differs from input length due to truncation!")

# When using truncation, you must handle the length mismatch carefully:
# the output will have fewer predictions than there are input words.
assert len(result_truncated[0]) < len(words), "Truncation should reduce length"
print("Truncation example completed successfully!")
```
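
When a full set of predictions is needed for a long text, an alternative to `truncate=True` is to split the word list into chunks that each stay under the model's input limit. A minimal sketch: the chunk size of 200 words is a heuristic, not a documented constant (each word can expand to several subword tokens, so chunks should stay comfortably below the `max_position_embeddings - 2` token limit used by `prepare_inputs`), and cutting at arbitrary word boundaries discards cross-chunk context, so splitting on sentence boundaries is preferable when possible.

```python
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("mideind/IceBERT-PoS", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("mideind/IceBERT-PoS")

def predict_long(words, chunk_size=200):
    """Tag a long word list chunk by chunk so that no predictions are dropped."""
    predictions = []
    for start in range(0, len(words), chunk_size):
        chunk = words[start:start + chunk_size]
        predictions.extend(model.predict_labels_from_text([chunk], tokenizer)[0])
    return predictions

long_words = ["Þetta", "er", "mjög", "löng", "setning"] * 200
long_result = predict_long(long_words)
assert len(long_result) == len(long_words)
print(f"Tagged {len(long_result)} words without truncation.")
```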

modeling.py
CHANGED

@@ -322,105 +322,167 @@ class IceBertPosForTokenClassification(PreTrainedModel):

            batch_first=True,
        )

    def prepare_inputs(
        self, words: List[str], tokenizer, truncate: bool = False
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Prepare inputs for a single list of words.

        Args:
            words: List of words
            tokenizer: HuggingFace tokenizer
            truncate: Whether to truncate if too long

        Returns:
            Tuple of (input_ids, attention_mask, word_mask) without a batch dimension.
        """
        # Encode with word boundary preservation
        encoding = tokenizer.encode_plus(
            words,
            return_tensors="pt",
            is_split_into_words=True,
            add_special_tokens=True,
            truncation=truncate,
            # The model was probably trained on much shorter sequences
            max_length=self.config.max_position_embeddings - 2,
        )

        input_ids = encoding["input_ids"].squeeze(0)  # (L,)
        attention_mask = torch.ones_like(input_ids)

        # Get word_ids and convert to word_mask
        word_ids = encoding.word_ids()
        word_mask = self._word_ids_to_word_mask(word_ids)

        # Debug logging to match fairseq model
        logger.debug(f"Encoded tokens: {input_ids}")  # (L,)
        logger.debug(f"Decoded tokens: {tokenizer.convert_ids_to_tokens(input_ids.tolist())}")
        logger.debug(f"Word IDs: {word_ids}")  # (L,)
        logger.debug(f"Word mask: {word_mask}")

        return input_ids, attention_mask, word_mask

    @torch.no_grad()
    def predict_labels(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, word_mask: torch.Tensor
    ) -> List[List[Tuple[str, List[str]]]]:
        """
        Predict POS labels for input sequences.

        B = batch_size, L = seq_len

        Args:
            input_ids: Token indices (B x L)
            attention_mask: Attention mask (B x L)
            word_mask: Binary mask indicating word boundaries (B x L)

        Returns:
            List of sequences, each containing (category, [attributes]) per word
        """
        cat_logits, attr_logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, word_mask=word_mask)

        return self._logits_to_labels(cat_logits, attr_logits, word_mask)

    def predict_labels_from_text(
        self, sentences: List[List[str]], tokenizer, truncate: bool = False
    ) -> List[List[Tuple[str, List[str]]]]:
        """
        Predict POS labels from a list of word lists.

        Args:
            sentences: List of sentences, each a list of words
            tokenizer: HuggingFace tokenizer
            truncate: Whether to truncate if too long

        Returns:
            List of sequences, each containing (category, [attributes]) per word
        """
        # Use prepare_inputs for each sentence and batch them
        all_input_ids = []
        all_attention_masks = []
        all_word_masks = []

        for words in sentences:
            input_ids, attention_mask, word_mask = self.prepare_inputs(words, tokenizer, truncate)
            all_input_ids.append(input_ids)
            all_attention_masks.append(attention_mask)
            all_word_masks.append(word_mask)

        # Pad sequences to the same length
        batch_input_ids = pad_sequence(all_input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        batch_attention_mask = pad_sequence(all_attention_masks, batch_first=True, padding_value=0)
        batch_word_mask = pad_sequence(all_word_masks, batch_first=True, padding_value=0)

        return self.predict_labels(batch_input_ids, batch_attention_mask, batch_word_mask)

    def predict_ifd_labels_from_text(
        self, sentences: List[List[str]], tokenizer, truncate: bool = False
    ) -> List[List[str]]:
        """
        Predict IFD format labels from a list of word lists.

        B = batch_size, Ws = seq_words

        Args:
            sentences: List of sentences, each a list of words
            tokenizer: HuggingFace tokenizer
            truncate: Whether to truncate if too long

        Returns:
            ifd_predictions: List of IFD labels per sentence (B x Ws)
        """
        # Get model predictions in (category, [attributes]) format
        predictions = self.predict_labels_from_text(sentences, tokenizer, truncate)

        # Convert each sentence's predictions to IFD format
        ifd_predictions = []
        for sentence_predictions in predictions:
            ifd_labels = convert_predictions_to_ifd(sentence_predictions)  # (Ws,)
            ifd_predictions.append(ifd_labels)

        return ifd_predictions

    def _word_ids_to_word_mask(self, word_ids: List[int]) -> torch.Tensor:
        """
        Convert word_ids to a binary mask indicating word boundaries.

        L = seq_len

        Args:
            word_ids: Word id sequence for a single sequence (None for special tokens)

        Returns:
            word_mask: Binary tensor where 1 indicates the start of a word (L,)
        """
        word_mask = torch.zeros(len(word_ids), dtype=torch.long)  # (L,)

        prev_word_id = None
        for token_idx, word_id in enumerate(word_ids):
            # Skip None values (special tokens and padding)
            if word_id is not None and word_id != prev_word_id:
                word_mask[token_idx] = 1  # Mark word start
            # Only update prev_word_id for valid (non-None) word_ids
            if word_id is not None:
                prev_word_id = word_id

        # Debug logging to match fairseq model
        logger.debug(f"Word mask: {word_mask}")

        return word_mask

    def _logits_to_labels(
        self, cat_logits: torch.Tensor, attr_logits: torch.Tensor, word_mask: torch.Tensor
    ) -> List[List[Tuple[str, List[str]]]]:
        """
        Convert logits to human-readable labels using vectorized operations.

        Key optimizations:
        1. Flatten batch dimension to process all words simultaneously
        2. Vectorized group processing across all words
        3. Defer string conversion to the very end
        4. Minimize Python loops and tensor-CPU transfers

        B = batch_size, W = max_words, C = num_categories, A = num_attributes, G = num_groups
        """
        device = cat_logits.device

@@ -433,54 +495,54 @@ class IceBertPosForTokenClassification(PreTrainedModel):

        batch_word_mask = torch.zeros(bsz, max_words, dtype=torch.bool, device=device)
        for b in range(bsz):
            if nwords[b] > 0:
                batch_word_mask[b, : nwords[b]] = True

        valid_positions = batch_word_mask.flatten().nonzero(as_tuple=True)[0]  # (total_words,)
        total_words = len(valid_positions)

        if total_words == 0:
            return [[] for _ in range(bsz)]

        # Step 2: Vectorized category prediction for all valid words
        flat_cat_logits = cat_logits.view(-1, cat_logits.size(-1))  # (B*W x C)
        flat_attr_logits = attr_logits.view(-1, attr_logits.size(-1))  # (B*W x A)

        # Get categories for all valid words: (total_words,)
        all_cat_indices = flat_cat_logits[valid_positions].argmax(dim=-1)

        # Step 3: Vectorized group validity for all words: (total_words x G)
        all_valid_groups = self.category_to_groups[all_cat_indices]

        # Step 4: Collect attributes using vectorized group processing
        word_to_attrs = {}  # word_idx -> list of attr_indices

        # Process each group across all words simultaneously
        for group_idx in range(self.group_sizes.size(0)):
            group_size = self.group_sizes[group_idx].item()
            if group_size == 0:
                continue

            # Find words that have this group valid: (words_with_group,)
            words_with_group = all_valid_groups[:, group_idx].nonzero(as_tuple=True)[0]
            if len(words_with_group) == 0:
                continue

            # Get attribute indices for this group
            group_attr_indices = self.group_attr_indices[group_idx, :group_size]
            valid_attr_indices = group_attr_indices[group_attr_indices >= 0]
            if len(valid_attr_indices) == 0:
                continue

            # Get logits for all words that need this group: (words_with_group x group_size)
            word_positions = valid_positions[words_with_group]
            group_logits = flat_attr_logits[word_positions][:, valid_attr_indices]

            if len(valid_attr_indices) == 1:
                # Binary decision for all words simultaneously: (words_with_group,)
                decisions = group_logits.sigmoid().squeeze(-1) > 0.5
                selected_words = words_with_group[decisions]
                attr_idx = valid_attr_indices[0].item()

                for word_idx in selected_words:
                    word_idx_item = word_idx.item()
                    if word_idx_item not in word_to_attrs:

@@ -489,7 +551,7 @@ class IceBertPosForTokenClassification(PreTrainedModel):

            else:
                # Multi-class decision for all words: (words_with_group,)
                best_indices = group_logits.argmax(dim=-1)

                for i, word_idx in enumerate(words_with_group):
                    attr_idx = valid_attr_indices[best_indices[i]].item()
                    word_idx_item = word_idx.item()

@@ -500,22 +562,22 @@ class IceBertPosForTokenClassification(PreTrainedModel):

        # Step 5: Reconstruct batch structure and convert to strings (deferred)
        predictions = []
        word_counter = 0

        for seq_idx in range(bsz):
            seq_nwords = nwords[seq_idx].item()
            seq_predictions = []

            for _ in range(seq_nwords):
                # Get category (string conversion deferred)
                cat_idx = all_cat_indices[word_counter].item()
                cat_name = schema.label_categories[cat_idx]

                # Get attributes (string conversion deferred)
                attributes = []
                if word_counter in word_to_attrs:
                    attr_indices = word_to_attrs[word_counter]
                    attributes = [schema.labels[idx] for idx in attr_indices]

                # Apply post-processing rules
                if len(attributes) == 1 and attributes[0] == "pos":
                    # This label is used as a default for training but implied in mim format

@@ -523,38 +585,14 @@ class IceBertPosForTokenClassification(PreTrainedModel):

                elif cat_name == "sl" and "act" in attributes:
                    # Number and tense are not shown for sl act in mim format
                    attributes = [attr for attr in attributes if attr not in ["1", "sing", "pres"]]

                seq_predictions.append((cat_name, attributes))
                word_counter += 1

            predictions.append(seq_predictions)

        return predictions


AutoConfig.register("icebert-pos", IceBertPosConfig)
AutoModel.register(IceBertPosConfig, IceBertPosForTokenClassification)
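
For intuition about the word masks that `prepare_inputs` builds, a standalone sketch of the `_word_ids_to_word_mask` logic follows; the subword split in the example is hypothetical, not the actual IceBERT tokenization.

```python
from typing import List, Optional

import torch


def word_ids_to_word_mask(word_ids: List[Optional[int]]) -> torch.Tensor:
    """Mark the first subword token of each word; special tokens (None) stay 0."""
    word_mask = torch.zeros(len(word_ids), dtype=torch.long)
    prev_word_id = None
    for token_idx, word_id in enumerate(word_ids):
        if word_id is not None and word_id != prev_word_id:
            word_mask[token_idx] = 1
        if word_id is not None:
            prev_word_id = word_id
    return word_mask


# Hypothetical word_ids for ["Ég", "kemur", "."], with "kemur" split into two
# subword pieces and <s>/</s> special tokens at either end:
print(word_ids_to_word_mask([None, 0, 1, 1, 2, None]))  # tensor([0, 1, 1, 0, 1, 0])
```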
|