mrapacz committed (verified)
Commit fa00f57 · Parent(s): b17b1ce

Upload MorphT5ForConditionalGeneration

Files changed (1):
  1. modeling_morph_t5_auto.py +216 -0
modeling_morph_t5_auto.py CHANGED
@@ -1990,3 +1990,219 @@ class MorphT5EncoderModel(MorphT5PreTrainedModel):
         )
 
         return encoder_outputs
+
+
+########## Tokenizer Code ##########
+
+import json
+from pathlib import Path
+from typing import List, Optional, Union
+
+import numpy as np
+from datasets import Dataset
+from transformers import PreTrainedTokenizer, T5TokenizerFast
+from transformers.utils import PaddingStrategy
+
+
+class MorphTokenizer:
+    """Handles morphological tokenization with special-token support."""
+
+    def __init__(self):
+        self.morph_encodings = {}
+        self.unique_tags = set()
+        self.special_tokens_map = {
+            "pad_token": "<pad>",
+            "eos_token": "<eos>",
+            "unk_token": "<unk>",
+            "block_separator_token": "<extra_id_0>",
+        }
+        self._special_token_ids = {"<pad>": 0, "<eos>": 1, "<unk>": 2, "<extra_id_0>": 3}
+
+    @property
+    def pad_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["pad_token"]]
+
+    @property
+    def eos_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["eos_token"]]
+
+    @property
+    def unk_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["unk_token"]]
+
+    @property
+    def block_separator_token(self) -> str:
+        return self.special_tokens_map["block_separator_token"]
+
+    @property
+    def block_separator_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["block_separator_token"]]
+
+    @property
+    def vocabulary_size(self) -> int:
+        return len(self.morph_encodings)
+
+    def initialize_vocab(self, dset: Dataset, tags_col: str) -> None:
+        """Initialize the vocabulary from a dataset."""
+        all_tags = set()
+        for split in dset:
+            all_tags.update(tag for tags in dset[split][tags_col] for tag in tags)
+
+        self.unique_tags = all_tags
+        # Special tokens keep their fixed ids; morphological tags are numbered after them.
+        self.morph_encodings = {
+            token: idx
+            for idx, token in enumerate(list(self._special_token_ids.keys()) + list(all_tags))
+        }
+
+    def encode(self, tags: List[str]) -> List[int]:
+        """Convert tags to token ids, mapping unknown tags to the unk id."""
+        return [self.morph_encodings.get(tag, self.unk_token_id) for tag in tags]
+
+    def decode(self, ids: List[int]) -> List[str]:
+        """Convert token ids back to tags."""
+        id_to_token = {v: k for k, v in self.morph_encodings.items()}
+        return [id_to_token[token_id] for token_id in ids]
+
+
+class MorphologicallyAwareTokenizer(PreTrainedTokenizer):
+    """T5 tokenizer with additional morphological tokenization capabilities."""
+
+    model_input_names = ["input_ids", "attention_mask", "input_morphs"]
+
+    def __init__(self, base_tokenizer_path: str, **kwargs):
+        """Initialize the tokenizer with both text and morphological capabilities."""
+        super().__init__(**kwargs)
+
+        self.text_tokenizer = T5TokenizerFast.from_pretrained(base_tokenizer_path, subfolder="text_tokenizer")
+        self.morph_tokenizer = MorphTokenizer()
+
+        # Copy attributes from the text tokenizer
+        self.pad_token = self.text_tokenizer.pad_token
+        self.eos_token = self.text_tokenizer.eos_token
+
+    def initialize_morph_vocab(self, dset: Dataset, tags_col: str) -> None:
+        self.morph_tokenizer.initialize_vocab(dset, tags_col)
+
+    def save_pretrained(self, save_directory: Union[str, Path], **kwargs):
+        """Save both the text and morphological tokenizers."""
+        save_directory = Path(save_directory)
+        self.text_tokenizer.save_pretrained(save_directory / "text_tokenizer")
+
+        morph_config = {
+            "morph_encodings": self.morph_tokenizer.morph_encodings,
+            "special_tokens_map": self.morph_tokenizer.special_tokens_map,
+            "unique_tags": list(self.morph_tokenizer.unique_tags),
+        }
+
+        morph_config_file = save_directory / "morph_tokenizer_config.json"
+        morph_config_file.write_text(json.dumps(morph_config))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, Path], **kwargs):
+        """Load both the text and morphological tokenizers."""
+        instance = cls(base_tokenizer_path=pretrained_model_name_or_path, **kwargs)
+
+        morph_config_path = Path(pretrained_model_name_or_path) / "morph_tokenizer_config.json"
+        if morph_config_path.exists():
+            morph_config = json.loads(morph_config_path.read_text())
+            instance.morph_tokenizer.morph_encodings = morph_config["morph_encodings"]
+            instance.morph_tokenizer.special_tokens_map = morph_config["special_tokens_map"]
+            instance.morph_tokenizer.unique_tags = set(morph_config["unique_tags"])
+
+        return instance
+
+    def __call__(
+        self,
+        text: Union[List[str], List[List[str]]],
+        text_target: Optional[Union[str, List[str]]] = None,
+        morph_tags: Optional[List[List[str]]] = None,
+        padding: Union[bool, str, PaddingStrategy] = True,
+        truncation: bool = True,
+        max_length: Optional[int] = 512,
+        return_tensors: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Process text and morphological tags.
+
+        Args:
+            text: List of text blocks for a single example, or a list of such lists for batched input
+            text_target: Optional target text
+            morph_tags: List of morphological tags corresponding to the text blocks
+            padding: Padding strategy
+            truncation: Whether to truncate sequences
+            max_length: Maximum sequence length
+            return_tensors: Return format for tensors
+            **kwargs: Additional arguments
+        """
+        # Get the block separator token
+        block_sep = self.morph_tokenizer.block_separator_token
+
+        # Join each example's text blocks with the block separator
+        if text and isinstance(text[0], str):
+            formatted_text = [f" {block_sep} ".join(text)]
+        else:
+            formatted_text = [f" {block_sep} ".join(example) for example in text]
+
+        encoding = self.text_tokenizer(
+            formatted_text,
+            text_target=text_target,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        if morph_tags is not None:
+            # Ensure morph_tags is a list of lists for batch processing
+            if morph_tags and isinstance(morph_tags[0], str):
+                morph_tags = [morph_tags]
+
+            morph_ids = [self.morph_tokenizer.encode(tags) for tags in morph_tags]
+            block_sep_id = self.text_tokenizer.convert_tokens_to_ids(block_sep)
+
+            all_morph_arrays = []
+            for tag_ids, input_ids in zip(morph_ids, encoding["input_ids"]):
+                # Split the token ids at each block separator and repeat one tag id
+                # over every subword token of its block
+                text_ids = np.array(input_ids)
+                text_blocks = np.split(text_ids, np.where(text_ids == block_sep_id)[0])
+
+                morph_array = []
+                for tag_id, text_block in zip(tag_ids, text_blocks):
+                    morph_array.extend([tag_id] * len(text_block))
+
+                # Mirror the text tokenizer's special positions in the morph ids
+                morph_array = np.array(morph_array)
+                morph_array[text_ids == block_sep_id] = self.morph_tokenizer.block_separator_token_id
+                morph_array[text_ids == self.text_tokenizer.eos_token_id] = self.morph_tokenizer.eos_token_id
+                morph_array[text_ids == self.text_tokenizer.pad_token_id] = self.morph_tokenizer.pad_token_id
+                morph_array[text_ids == self.text_tokenizer.unk_token_id] = self.morph_tokenizer.unk_token_id
+
+                all_morph_arrays.append(morph_array)
+
+            encoding["input_morphs"] = all_morph_arrays
+
+            if return_tensors == "pt":
+                import torch
+
+                encoding["input_morphs"] = torch.tensor(np.array(encoding["input_morphs"]))
+
+        return encoding
+
+    def decode(self, input_ids: List[int], skip_special_tokens: bool = True, keep_block_separator: bool = False) -> str:
+        """Decode input ids back to text."""
+        if skip_special_tokens and keep_block_separator:
+            # Decode with special tokens intact, then strip everything except the block separator
+            decoded = self.text_tokenizer.decode(input_ids, skip_special_tokens=False)
+            special_tokens = {
+                self.text_tokenizer.eos_token,
+                self.text_tokenizer.pad_token,
+                self.text_tokenizer.unk_token,
+            }
+            for token in special_tokens:
+                decoded = decoded.replace(token, "")
+            return decoded.strip()
+
+        return self.text_tokenizer.decode(input_ids, skip_special_tokens=skip_special_tokens)
+
+    @property
+    def target_block_separator_token(self) -> str:
+        return "<extra_id_2>"