mrapacz committed
Commit a596334 · verified · 1 Parent(s): fa00f57

Upload MorphT5ForConditionalGeneration

Files changed (1):
modeling_morph_t5_auto.py  +216 -216
modeling_morph_t5_auto.py CHANGED
@@ -1,3 +1,219 @@
+########## Tokenizer Code ##########
+
+import json
+from pathlib import Path
+from typing import List, Optional, Union
+
+import numpy as np
+from datasets import Dataset
+from transformers import PreTrainedTokenizer, T5TokenizerFast
+from transformers.utils import PaddingStrategy
+
+
+class MorphTokenizer:
+    """Handles morphological tokenization with special tokens support."""
+
+    def __init__(self):
+        self.morph_encodings = {}
+        self.unique_tags = set()
+        self.special_tokens_map = {
+            "pad_token": "<pad>",
+            "eos_token": "<eos>",
+            "unk_token": "<unk>",
+            "block_separator_token": "<extra_id_0>",
+        }
+        self._special_token_ids = {"<pad>": 0, "<eos>": 1, "<unk>": 2, "<extra_id_0>": 3}
+
+    @property
+    def pad_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["pad_token"]]
+
+    @property
+    def eos_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["eos_token"]]
+
+    @property
+    def unk_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["unk_token"]]
+
+    @property
+    def block_separator_token(self) -> str:
+        return self.special_tokens_map["block_separator_token"]
+
+    @property
+    def block_separator_token_id(self) -> int:
+        return self._special_token_ids[self.special_tokens_map["block_separator_token"]]
+
+    @property
+    def vocabulary_size(self) -> int:
+        return len(self.morph_encodings)
+
+    def initialize_vocab(self, dset: Dataset, tags_col: str) -> None:
+        """Initialize vocabulary from all splits of a dataset."""
+        all_tags = set()
+        for split in dset:
+            all_tags.update(tag for tags in dset[split][tags_col] for tag in tags)
+
+        self.unique_tags = all_tags
+        self.morph_encodings = {token: idx for idx, token in enumerate(list(self._special_token_ids.keys()) + list(all_tags))}
+
+    def encode(self, tags: List[str]) -> List[int]:
+        """Convert tags to token ids, mapping unknown tags to <unk>."""
+        return [self.morph_encodings.get(tag, self.unk_token_id) for tag in tags]
+
+    def decode(self, ids: List[int]) -> List[str]:
+        """Convert token ids back to tags."""
+        id_to_token = {v: k for k, v in self.morph_encodings.items()}
+        return [id_to_token[id] for id in ids]
+
+
+class MorphologicallyAwareTokenizer(PreTrainedTokenizer):
+    """T5Tokenizer with additional morphological tokenization capabilities."""
+
+    model_input_names = ["input_ids", "attention_mask", "input_morphs"]
+
+    def __init__(self, base_tokenizer_path: str, **kwargs):
+        """Initialize tokenizer with both text and morphological capabilities."""
+        super().__init__(**kwargs)
+
+        self.text_tokenizer = T5TokenizerFast.from_pretrained(base_tokenizer_path, subfolder="text_tokenizer")
+        self.morph_tokenizer = MorphTokenizer()
+
+        # Copy attributes from the text tokenizer
+        self.pad_token = self.text_tokenizer.pad_token
+        self.eos_token = self.text_tokenizer.eos_token
+
+    def initialize_morph_vocab(self, dset: Dataset, tags_col: str) -> None:
+        self.morph_tokenizer.initialize_vocab(dset, tags_col)
+
+    def save_pretrained(self, save_directory: Union[str, Path], **kwargs):
+        """Save both text and morphological tokenizers."""
+        save_directory = Path(save_directory)
+        self.text_tokenizer.save_pretrained(save_directory / "text_tokenizer")
+
+        morph_config = {
+            "morph_encodings": self.morph_tokenizer.morph_encodings,
+            "special_tokens_map": self.morph_tokenizer.special_tokens_map,
+            "unique_tags": list(self.morph_tokenizer.unique_tags),
+        }
+
+        morph_config_file = save_directory / "morph_tokenizer_config.json"
+        morph_config_file.write_text(json.dumps(morph_config))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, Path], **kwargs):
+        """Load both text and morphological tokenizers."""
+        instance = cls(base_tokenizer_path=pretrained_model_name_or_path, **kwargs)
+
+        morph_config_path = Path(pretrained_model_name_or_path) / "morph_tokenizer_config.json"
+        if morph_config_path.exists():
+            morph_config = json.loads(morph_config_path.read_text())
+            instance.morph_tokenizer.morph_encodings = morph_config["morph_encodings"]
+            instance.morph_tokenizer.special_tokens_map = morph_config["special_tokens_map"]
+            instance.morph_tokenizer.unique_tags = set(morph_config["unique_tags"])
+
+        return instance
+
+    def __call__(
+        self,
+        text: Union[List[str], List[List[str]]],
+        text_target: Optional[Union[str, List[str]]] = None,
+        morph_tags: Optional[List[List[str]]] = None,
+        padding: Union[bool, str, PaddingStrategy] = True,
+        truncation: bool = True,
+        max_length: Optional[int] = 512,
+        return_tensors: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Process text and morphological tags.
+
+        Args:
+            text: List of text blocks for one example, or a list of such lists for batched input
+            text_target: Optional target text
+            morph_tags: List of morphological tags corresponding to the text blocks
+            padding: Padding strategy
+            truncation: Whether to truncate sequences
+            max_length: Maximum sequence length
+            return_tensors: Return format for tensors
+            **kwargs: Additional arguments
+        """
+        # Get the block separator token
+        block_sep = self.morph_tokenizer.block_separator_token
+
+        # Format text with block separators
+        if text and isinstance(text[0], str):
+            formatted_text = [f" {block_sep} ".join(text)]
+        else:
+            formatted_text = [f" {block_sep} ".join(example) for example in text]
+
+        encoding = self.text_tokenizer(
+            formatted_text,
+            text_target=text_target,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        if morph_tags is not None:
+            # Ensure morph_tags is a list of lists for batch processing
+            if morph_tags and isinstance(morph_tags[0], str):
+                morph_tags = [morph_tags]
+
+            morph_ids = [self.morph_tokenizer.encode(tags) for tags in morph_tags]
+            block_sep_id = self.text_tokenizer.convert_tokens_to_ids("<extra_id_0>")
+
+            all_morph_arrays = []
+            for tag_ids, input_ids in zip(morph_ids, encoding["input_ids"]):
+                text_ids = np.array(input_ids)
+                # Split on the separator; each separator stays at the head of the block it opens
+                text_blocks = np.split(text_ids, np.where(text_ids == block_sep_id)[0])
+
+                morph_array = []
+                for tag_id, text_block in zip(tag_ids, text_blocks):
+                    morph_array.extend([tag_id] * len(text_block))
+
+                morph_array = np.array(morph_array)
+                morph_array[text_ids == block_sep_id] = self.morph_tokenizer.block_separator_token_id
+                morph_array[text_ids == self.text_tokenizer.eos_token_id] = self.morph_tokenizer.eos_token_id
+                morph_array[text_ids == self.text_tokenizer.pad_token_id] = self.morph_tokenizer.pad_token_id
+                morph_array[text_ids == self.text_tokenizer.unk_token_id] = self.morph_tokenizer.unk_token_id
+
+                all_morph_arrays.append(morph_array)
+
+            encoding["input_morphs"] = all_morph_arrays
+
+            if return_tensors == "pt":
+                import torch
+
+                encoding["input_morphs"] = torch.tensor(encoding["input_morphs"])
+
+        return encoding
+
+    def decode(self, input_ids: List[int], skip_special_tokens: bool = True, keep_block_separator: bool = False) -> str:
+        """Decode input IDs back to text."""
+        if skip_special_tokens and keep_block_separator:
+            # Decode with specials kept, then strip everything except the block separator
+            special_tokens = {
+                self.text_tokenizer.eos_token,
+                self.text_tokenizer.pad_token,
+                self.text_tokenizer.unk_token,
+            }
+            decoded = self.text_tokenizer.decode(input_ids, skip_special_tokens=False)
+            for token in special_tokens:
+                decoded = decoded.replace(token, "")
+            return decoded.strip()

+        return self.text_tokenizer.decode(input_ids, skip_special_tokens=skip_special_tokens)
+
+    @property
+    def target_block_separator_token(self) -> str:
+        return "<extra_id_2>"
+
+
 # Copyright 2020 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -1990,219 +2206,3 @@ class MorphT5EncoderModel(MorphT5PreTrainedModel):
         )

         return encoder_outputs
-
-
-########## Tokenizer Code ##########
-
-import json
-from pathlib import Path
-from typing import List, Optional, Union
-
-import numpy as np
-from datasets import Dataset
-from transformers import PreTrainedTokenizer, T5TokenizerFast
-from transformers.utils import PaddingStrategy
-
-
-class MorphTokenizer:
-    """Handles morphological tokenization with special tokens support."""
-
-    def __init__(self):
-        self.morph_encodings = {}
-        self.unique_tags = set()
-        self.special_tokens_map = {
-            "pad_token": "<pad>",
-            "eos_token": "<eos>",
-            "unk_token": "<unk>",
-            "block_separator_token": "<extra_id_0>",
-        }
-        self._special_token_ids = {"<pad>": 0, "<eos>": 1, "<unk>": 2, "<extra_id_0>": 3}
-
-    @property
-    def pad_token_id(self) -> int:
-        return self._special_token_ids[self.special_tokens_map["pad_token"]]
-
-    @property
-    def eos_token_id(self) -> int:
-        return self._special_token_ids[self.special_tokens_map["eos_token"]]
-
-    @property
-    def unk_token_id(self) -> int:
-        return self._special_token_ids[self.special_tokens_map["unk_token"]]
-
-    @property
-    def block_separator_token(self) -> str:
-        return self.special_tokens_map["block_separator_token"]
-
-    @property
-    def block_separator_token_id(self) -> int:
-        return self._special_token_ids[self.special_tokens_map["block_separator_token"]]
-
-    @property
-    def vocabulary_size(self) -> int:
-        return len(self.morph_encodings)
-
-    def initialize_vocab(self, dset: Dataset, tags_col: str) -> None:
-        """Initialize vocabulary from all splits of a dataset."""
-        all_tags = set()
-        for split in dset:
-            all_tags.update(tag for tags in dset[split][tags_col] for tag in tags)
-
-        self.unique_tags = all_tags
-        self.morph_encodings = {token: idx for idx, token in enumerate(list(self._special_token_ids.keys()) + list(all_tags))}
-
-    def encode(self, tags: List[str]) -> List[int]:
-        """Convert tags to token ids, mapping unknown tags to <unk>."""
-        return [self.morph_encodings.get(tag, self.unk_token_id) for tag in tags]
-
-    def decode(self, ids: List[int]) -> List[str]:
-        """Convert token ids back to tags."""
-        id_to_token = {v: k for k, v in self.morph_encodings.items()}
-        return [id_to_token[id] for id in ids]
-
-
-class MorphologicallyAwareTokenizer(PreTrainedTokenizer):
-    """T5Tokenizer with additional morphological tokenization capabilities."""
-
-    model_input_names = ["input_ids", "attention_mask", "input_morphs"]
-
-    def __init__(self, base_tokenizer_path: str, **kwargs):
-        """Initialize tokenizer with both text and morphological capabilities."""
-        super().__init__(**kwargs)
-
-        self.text_tokenizer = T5TokenizerFast.from_pretrained(base_tokenizer_path, subfolder="text_tokenizer")
-        self.morph_tokenizer = MorphTokenizer()
-
-        # Copy attributes from the text tokenizer
-        self.pad_token = self.text_tokenizer.pad_token
-        self.eos_token = self.text_tokenizer.eos_token
-
-    def initialize_morph_vocab(self, dset: Dataset, tags_col: str) -> None:
-        self.morph_tokenizer.initialize_vocab(dset, tags_col)
-
-    def save_pretrained(self, save_directory: Union[str, Path], **kwargs):
-        """Save both text and morphological tokenizers."""
-        save_directory = Path(save_directory)
-        self.text_tokenizer.save_pretrained(save_directory / "text_tokenizer")
-
-        morph_config = {
-            "morph_encodings": self.morph_tokenizer.morph_encodings,
-            "special_tokens_map": self.morph_tokenizer.special_tokens_map,
-            "unique_tags": list(self.morph_tokenizer.unique_tags),
-        }
-
-        morph_config_file = save_directory / "morph_tokenizer_config.json"
-        morph_config_file.write_text(json.dumps(morph_config))
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, Path], **kwargs):
-        """Load both text and morphological tokenizers."""
-        instance = cls(base_tokenizer_path=pretrained_model_name_or_path, **kwargs)
-
-        morph_config_path = Path(pretrained_model_name_or_path) / "morph_tokenizer_config.json"
-        if morph_config_path.exists():
-            morph_config = json.loads(morph_config_path.read_text())
-            instance.morph_tokenizer.morph_encodings = morph_config["morph_encodings"]
-            instance.morph_tokenizer.special_tokens_map = morph_config["special_tokens_map"]
-            instance.morph_tokenizer.unique_tags = set(morph_config["unique_tags"])
-
-        return instance
-
-    def __call__(
-        self,
-        text: Union[List[str], List[List[str]]],
-        text_target: Optional[Union[str, List[str]]] = None,
-        morph_tags: Optional[List[List[str]]] = None,
-        padding: Union[bool, str, PaddingStrategy] = True,
-        truncation: bool = True,
-        max_length: Optional[int] = 512,
-        return_tensors: Optional[str] = None,
-        **kwargs,
-    ):
-        """
-        Process text and morphological tags.
-
-        Args:
-            text: List of text blocks for one example, or a list of such lists for batched input
-            text_target: Optional target text
-            morph_tags: List of morphological tags corresponding to the text blocks
-            padding: Padding strategy
-            truncation: Whether to truncate sequences
-            max_length: Maximum sequence length
-            return_tensors: Return format for tensors
-            **kwargs: Additional arguments
-        """
-        # Get the block separator token
-        block_sep = self.morph_tokenizer.block_separator_token
-
-        # Format text with block separators
-        if text and isinstance(text[0], str):
-            formatted_text = [f" {block_sep} ".join(text)]
-        else:
-            formatted_text = [f" {block_sep} ".join(example) for example in text]
-
-        encoding = self.text_tokenizer(
-            formatted_text,
-            text_target=text_target,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            return_tensors=return_tensors,
-            **kwargs,
-        )
-
-        if morph_tags is not None:
-            # Ensure morph_tags is a list of lists for batch processing
-            if morph_tags and isinstance(morph_tags[0], str):
-                morph_tags = [morph_tags]
-
-            morph_ids = [self.morph_tokenizer.encode(tags) for tags in morph_tags]
-            block_sep_id = self.text_tokenizer.convert_tokens_to_ids("<extra_id_0>")
-
-            all_morph_arrays = []
-            for tag_ids, input_ids in zip(morph_ids, encoding["input_ids"]):
-                text_ids = np.array(input_ids)
-                # Split on the separator; each separator stays at the head of the block it opens
-                text_blocks = np.split(text_ids, np.where(text_ids == block_sep_id)[0])
-
-                morph_array = []
-                for tag_id, text_block in zip(tag_ids, text_blocks):
-                    morph_array.extend([tag_id] * len(text_block))
-
-                morph_array = np.array(morph_array)
-                morph_array[text_ids == block_sep_id] = self.morph_tokenizer.block_separator_token_id
-                morph_array[text_ids == self.text_tokenizer.eos_token_id] = self.morph_tokenizer.eos_token_id
-                morph_array[text_ids == self.text_tokenizer.pad_token_id] = self.morph_tokenizer.pad_token_id
-                morph_array[text_ids == self.text_tokenizer.unk_token_id] = self.morph_tokenizer.unk_token_id
-
-                all_morph_arrays.append(morph_array)
-
-            encoding["input_morphs"] = all_morph_arrays
-
-            if return_tensors == "pt":
-                import torch
-
-                encoding["input_morphs"] = torch.tensor(encoding["input_morphs"])
-
-        return encoding
-
-    def decode(self, input_ids: List[int], skip_special_tokens: bool = True, keep_block_separator: bool = False) -> str:
-        """Decode input IDs back to text."""
-        if skip_special_tokens and keep_block_separator:
-            # Decode with specials kept, then strip everything except the block separator
-            special_tokens = {
-                self.text_tokenizer.eos_token,
-                self.text_tokenizer.pad_token,
-                self.text_tokenizer.unk_token,
-            }
-            decoded = self.text_tokenizer.decode(input_ids, skip_special_tokens=False)
-            for token in special_tokens:
-                decoded = decoded.replace(token, "")
-            return decoded.strip()
-
-        return self.text_tokenizer.decode(input_ids, skip_special_tokens=skip_special_tokens)
-
-    @property
-    def target_block_separator_token(self) -> str:
-        return "<extra_id_2>"
 
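A few usage sketches for the tokenizer code in this diff; nothing below is part of the commit, and the paths, tags, and ids are illustrative assumptions.

MorphTokenizer assigns each distinct morphological tag an integer id, reserving 0-3 for the special tokens, and maps unseen tags to <unk>:

from datasets import Dataset, DatasetDict

# Hypothetical two-split dataset; the tag strings are invented examples.
dset = DatasetDict({
    "train": Dataset.from_dict({"tags": [["N-NSM", "V-PAI-3S"], ["N-NSM"]]}),
    "test": Dataset.from_dict({"tags": [["V-PAI-3S"]]}),
})

morph_tok = MorphTokenizer()
morph_tok.initialize_vocab(dset, tags_col="tags")

ids = morph_tok.encode(["N-NSM", "V-PAI-3S", "NEVER-SEEN"])
assert ids[-1] == morph_tok.unk_token_id           # unknown tag falls back to <unk>
assert morph_tok.decode(ids[:2]) == ["N-NSM", "V-PAI-3S"]
assert morph_tok.vocabulary_size == 4 + 2          # 4 special tokens + 2 unique tags

Because the tags are collected into a set, their ids are not stable across runs; the morph_tokenizer_config.json written by save_pretrained is what pins a particular assignment down.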
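The heart of __call__ is the alignment step: the text blocks are joined with <extra_id_0>, and after subword tokenization every subword inherits the morph id of its block. A standalone numpy sketch of that logic, with invented ids (32099 standing in for the text-side <extra_id_0> id, 1 for the eos id on both sides, 3 for the morph-side separator id):

import numpy as np

block_sep_id = 32099                     # assumed text-side id of <extra_id_0>
text_ids = np.array([101, 102, block_sep_id, 201, 202, 203, 1])
tag_ids = [7, 8]                         # one morph id per text block

# np.split leaves each separator at the head of the block that follows it,
# so separator positions are covered by the block lengths and then
# overwritten with the morph-side special ids.
blocks = np.split(text_ids, np.where(text_ids == block_sep_id)[0])
morph = np.array([tag_id for tag_id, block in zip(tag_ids, blocks) for _ in range(len(block))])

morph[text_ids == block_sep_id] = 3      # morph-side <extra_id_0>
morph[text_ids == 1] = 1                 # morph-side <eos>
print(morph.tolist())                    # [7, 7, 3, 8, 8, 8, 1]

The sketch, like the real code, assumes one tag per block: if the tag list is shorter than the number of blocks, zip() silently drops the trailing blocks and the boolean remapping then fails on the length mismatch.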
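End to end, the tokenizer takes parallel lists of text blocks and per-block tags and returns input_morphs next to the usual fields; listing it in model_input_names keeps collators from dropping the extra tensor. A sketch assuming a checkpoint directory laid out the way save_pretrained writes it (the path and tags are placeholders):

tokenizer = MorphologicallyAwareTokenizer.from_pretrained("./morph-t5-demo")  # placeholder path

batch = tokenizer(
    text=[["λόγος", "ἦν"]],               # one example made of two blocks
    morph_tags=[["N-NSM", "V-IAI-3S"]],   # one invented tag per block
    return_tensors="pt",
)

# input_morphs mirrors input_ids position for position: every subword of a
# block carries that block's morph id, with specials remapped as in the sketch above.
assert batch["input_morphs"].shape == batch["input_ids"].shape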
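For inspecting such inputs, the custom decode keeps the block boundaries visible when keep_block_separator=True: it decodes with specials retained and strips only pad, eos, and unk, whereas plain skip_special_tokens=True would also erase <extra_id_0>:

text = tokenizer.decode(
    batch["input_ids"][0],
    skip_special_tokens=True,
    keep_block_separator=True,
)
# e.g. "λόγος <extra_id_0> ἦν" - the separators survive decoding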
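Persistence is split in two: the wrapped T5TokenizerFast is saved to a text_tokenizer/ subfolder and the morph vocabulary to morph_tokenizer_config.json, which from_pretrained reads back when it exists. A round-trip sketch over a local directory (the path is illustrative):

from pathlib import Path

save_dir = Path("./morph-t5-demo")    # hypothetical local directory
tokenizer.save_pretrained(save_dir)   # writes text_tokenizer/ and morph_tokenizer_config.json

reloaded = MorphologicallyAwareTokenizer.from_pretrained(save_dir)
assert reloaded.morph_tokenizer.morph_encodings == tokenizer.morph_tokenizer.morph_encodings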