mrapacz committed (verified)
Commit fa00f57 · Parent(s): b17b1ce

Upload MorphT5ForConditionalGeneration

Files changed (1):
  1. modeling_morph_t5_auto.py +216 -0
modeling_morph_t5_auto.py CHANGED
@@ -1990,3 +1990,219 @@ class MorphT5EncoderModel(MorphT5PreTrainedModel):
         )
 
         return encoder_outputs
+
+
+########## Tokenizer Code ##########
+
+import json
+from pathlib import Path
+from typing import List, Optional, Union
+
+import numpy as np
+from datasets import Dataset
+from transformers import PreTrainedTokenizer, T5TokenizerFast
+from transformers.utils import PaddingStrategy
+
+
+class MorphTokenizer:
+    """Handles morphological tokenization with special-token support."""
+
+    def __init__(self):
+        self.morph_encodings = {}
+        self.unique_tags = set()
+        self.special_tokens_map = {
+            "pad_token": "<pad>",
+            "eos_token": "<eos>",
+            "unk_token": "<unk>",
+            "block_separator_token": "<extra_id_0>",
+        }
+        self._special_token_ids = {"<pad>": 0, "<eos>": 1, "<unk>": 2, "<extra_id_0>": 3}
+
+    @property
+    def pad_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["pad_token"]]
+
+    @property
+    def eos_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["eos_token"]]
+
+    @property
+    def unk_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["unk_token"]]
+
+    @property
+    def block_separator_token(self) -> str:
+        return self.special_tokens_map["block_separator_token"]
+
+    @property
+    def block_separator_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["block_separator_token"]]
+
+    @property
+    def vocabulary_size(self) -> int:
+        return len(self.morph_encodings)
+
+    def initialize_vocab(self, dset: Dataset, tags_col: str) -> None:
+        """Initialize the vocabulary from a dataset."""
+        all_tags = set()
+        for split in dset:
+            all_tags.update(tag for tags in dset[split][tags_col] for tag in tags)
+
+        self.unique_tags = all_tags
+        # Special tokens keep their fixed ids; morphological tags are numbered after them.
+        self.morph_encodings = {
+            token: idx
+            for idx, token in enumerate(list(self._special_token_ids.keys()) + list(all_tags))
+        }
+
+    def encode(self, tags: List[str]) -> List[int]:
+        """Convert tags to token ids, mapping unknown tags to the unk id."""
+        return [self.morph_encodings.get(tag, self.unk_token_id) for tag in tags]
+
+    def decode(self, ids: List[int]) -> List[str]:
+        """Convert token ids back to tags."""
+        id_to_token = {v: k for k, v in self.morph_encodings.items()}
+        return [id_to_token[token_id] for token_id in ids]
+
+
+class MorphologicallyAwareTokenizer(PreTrainedTokenizer):
+    """T5 tokenizer with additional morphological tokenization capabilities."""
+
+    model_input_names = ["input_ids", "attention_mask", "input_morphs"]
+
+    def __init__(self, base_tokenizer_path: str, **kwargs):
+        """Initialize the tokenizer with both text and morphological capabilities."""
+        super().__init__(**kwargs)
+
+        self.text_tokenizer = T5TokenizerFast.from_pretrained(base_tokenizer_path, subfolder="text_tokenizer")
+        self.morph_tokenizer = MorphTokenizer()
+
+        # Copy attributes from the text tokenizer
+        self.pad_token = self.text_tokenizer.pad_token
+        self.eos_token = self.text_tokenizer.eos_token
+
+    def initialize_morph_vocab(self, dset: Dataset, tags_col: str) -> None:
+        self.morph_tokenizer.initialize_vocab(dset, tags_col)
+
+    def save_pretrained(self, save_directory: Union[str, Path], **kwargs):
+        """Save both the text and morphological tokenizers."""
+        save_directory = Path(save_directory)
+        self.text_tokenizer.save_pretrained(save_directory / "text_tokenizer")
+
+        morph_config = {
+            "morph_encodings": self.morph_tokenizer.morph_encodings,
+            "special_tokens_map": self.morph_tokenizer.special_tokens_map,
+            "unique_tags": list(self.morph_tokenizer.unique_tags),
+        }
+
+        morph_config_file = save_directory / "morph_tokenizer_config.json"
+        morph_config_file.write_text(json.dumps(morph_config))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, Path], **kwargs):
+        """Load both the text and morphological tokenizers."""
+        instance = cls(base_tokenizer_path=pretrained_model_name_or_path, **kwargs)
+
+        morph_config_path = Path(pretrained_model_name_or_path) / "morph_tokenizer_config.json"
+        if morph_config_path.exists():
+            morph_config = json.loads(morph_config_path.read_text())
+            instance.morph_tokenizer.morph_encodings = morph_config["morph_encodings"]
+            instance.morph_tokenizer.special_tokens_map = morph_config["special_tokens_map"]
+            instance.morph_tokenizer.unique_tags = set(morph_config["unique_tags"])
+
+        return instance
+
+    def __call__(
+        self,
+        text: Union[List[str], List[List[str]]],
+        text_target: Optional[Union[str, List[str]]] = None,
+        morph_tags: Optional[List[List[str]]] = None,
+        padding: Union[bool, str, PaddingStrategy] = True,
+        truncation: bool = True,
+        max_length: Optional[int] = 512,
+        return_tensors: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Process text and morphological tags.
+
+        Args:
+            text: List of text blocks for a single example, or a list of such lists for batched input
+            text_target: Optional target text
+            morph_tags: List of morphological tags corresponding to the text blocks
+            padding: Padding strategy
+            truncation: Whether to truncate sequences
+            max_length: Maximum sequence length
+            return_tensors: Return format for tensors
+            **kwargs: Additional arguments
+        """
+        # Get the block separator token
+        block_sep = self.morph_tokenizer.block_separator_token
+
+        # Join each example's text blocks with the block separator
+        if text and isinstance(text[0], str):
+            formatted_text = [f" {block_sep} ".join(text)]
+        else:
+            formatted_text = [f" {block_sep} ".join(example) for example in text]
+
+        encoding = self.text_tokenizer(
+            formatted_text,
+            text_target=text_target,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        if morph_tags is not None:
+            # Ensure morph_tags is a list of lists for batch processing
+            if morph_tags and isinstance(morph_tags[0], str):
+                morph_tags = [morph_tags]
+
+            morph_ids = [self.morph_tokenizer.encode(tags) for tags in morph_tags]
+            block_sep_id = self.text_tokenizer.convert_tokens_to_ids(block_sep)
+
+            all_morph_arrays = []
+            for tag_ids, input_ids in zip(morph_ids, encoding["input_ids"]):
+                # Split the token ids at each block separator and repeat one tag id
+                # over every subword token of its block
+                text_ids = np.array(input_ids)
+                text_blocks = np.split(text_ids, np.where(text_ids == block_sep_id)[0])
+
+                morph_array = []
+                for tag_id, text_block in zip(tag_ids, text_blocks):
+                    morph_array.extend([tag_id] * len(text_block))
+
+                # Mirror the text tokenizer's special positions in the morph ids
+                morph_array = np.array(morph_array)
+                morph_array[text_ids == block_sep_id] = self.morph_tokenizer.block_separator_token_id
+                morph_array[text_ids == self.text_tokenizer.eos_token_id] = self.morph_tokenizer.eos_token_id
+                morph_array[text_ids == self.text_tokenizer.pad_token_id] = self.morph_tokenizer.pad_token_id
+                morph_array[text_ids == self.text_tokenizer.unk_token_id] = self.morph_tokenizer.unk_token_id
+
+                all_morph_arrays.append(morph_array)
+
+            encoding["input_morphs"] = all_morph_arrays
+
+            if return_tensors == "pt":
+                import torch
+
+                encoding["input_morphs"] = torch.tensor(np.array(encoding["input_morphs"]))
+
+        return encoding
+
+    def decode(self, input_ids: List[int], skip_special_tokens: bool = True, keep_block_separator: bool = False) -> str:
+        """Decode input ids back to text."""
+        if skip_special_tokens and keep_block_separator:
+            # Decode with special tokens intact, then strip everything except the block separator
+            decoded = self.text_tokenizer.decode(input_ids, skip_special_tokens=False)
+            special_tokens = {
+                self.text_tokenizer.eos_token,
+                self.text_tokenizer.pad_token,
+                self.text_tokenizer.unk_token,
+            }
+            for token in special_tokens:
+                decoded = decoded.replace(token, "")
+            return decoded.strip()
+
+        return self.text_tokenizer.decode(input_ids, skip_special_tokens=skip_special_tokens)
+
+    @property
+    def target_block_separator_token(self) -> str:
+        return "<extra_id_2>"