Sohan2004 committed
Commit 027eff0 · verified · 1 Parent(s): b16259e

Upload tokenizer

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.TGT filter=lfs diff=lfs merge=lfs -text
+model.SRC filter=lfs diff=lfs merge=lfs -text
dict.SRC.json ADDED
The diff for this file is too large to render. See raw diff
 
dict.TGT.json ADDED
The diff for this file is too large to render. See raw diff
 
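Both dictionaries are too large for the diff viewer, but tokenization_indictrans.py below consumes them as flat piece-to-id mappings (see _load_json / src_encoder), so a quick local peek is straightforward. A hedged sketch, assuming the files have been downloaded into the working directory:

import json

# dict.SRC.json maps subword piece -> integer id; the tokenizer loads it
# verbatim as the source-side encoder.
with open("dict.SRC.json", encoding="utf-8") as f:
    src_vocab = json.load(f)

print(len(src_vocab))          # source vocabulary size
print(src_vocab.get("<unk>"))  # special tokens are ordinary entries
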
model.SRC ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cedc5cbcc740369b76201942a0f096fec7287fee039b55bdb956f301235b914
+size 759425
model.TGT ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac9257c8e76b8b607705b959cc3d075656ea33032f7a974e467b8941df6e98d4
+size 3256903
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenization_indictrans.py ADDED
@@ -0,0 +1,251 @@
+import os
+import json
+
+from transformers.utils import logging
+from typing import Dict, List, Optional, Union, Tuple
+
+from sentencepiece import SentencePieceProcessor
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+# Convert LANGUAGE_TAGS to a frozen set for faster lookups
+LANGUAGE_TAGS = frozenset(
+    {
+        "asm_Beng",
+        "awa_Deva",
+        "ben_Beng",
+        "bho_Deva",
+        "brx_Deva",
+        "doi_Deva",
+        "eng_Latn",
+        "gom_Deva",
+        "gon_Deva",
+        "guj_Gujr",
+        "hin_Deva",
+        "hne_Deva",
+        "kan_Knda",
+        "kas_Arab",
+        "kas_Deva",
+        "kha_Latn",
+        "lus_Latn",
+        "mag_Deva",
+        "mai_Deva",
+        "mal_Mlym",
+        "mar_Deva",
+        "mni_Beng",
+        "mni_Mtei",
+        "npi_Deva",
+        "ory_Orya",
+        "pan_Guru",
+        "san_Deva",
+        "sat_Olck",
+        "snd_Arab",
+        "snd_Deva",
+        "tam_Taml",
+        "tel_Telu",
+        "urd_Arab",
+        "unr_Deva",
+    }
+)
+
+VOCAB_FILES_NAMES = {
+    "src_vocab_fp": "dict.SRC.json",
+    "tgt_vocab_fp": "dict.TGT.json",
+    "src_spm_fp": "model.SRC",
+    "tgt_spm_fp": "model.TGT",
+}
+
+
+class IndicTransTokenizer(PreTrainedTokenizer):
+    _added_tokens_encoder: Dict[str, int] = {}
+    _added_tokens_decoder: Dict[str, int] = {}
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        src_vocab_fp=None,
+        tgt_vocab_fp=None,
+        src_spm_fp=None,
+        tgt_spm_fp=None,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        do_lower_case=False,
+        **kwargs,
+    ):
+        self.src_vocab_fp = src_vocab_fp
+        self.tgt_vocab_fp = tgt_vocab_fp
+        self.src_spm_fp = src_spm_fp
+        self.tgt_spm_fp = tgt_spm_fp
+
+        # Store the plain string content, whether given a str or an AddedToken
+        self.unk_token = (
+            unk_token.content if hasattr(unk_token, "content") else unk_token
+        )
+        self.pad_token = (
+            pad_token.content if hasattr(pad_token, "content") else pad_token
+        )
+        self.eos_token = (
+            eos_token.content if hasattr(eos_token, "content") else eos_token
+        )
+        self.bos_token = (
+            bos_token.content if hasattr(bos_token, "content") else bos_token
+        )
+
+        # Load vocabularies
+        self.src_encoder = self._load_json(self.src_vocab_fp)
+        self.tgt_encoder = self._load_json(self.tgt_vocab_fp)
+
+        # Validate tokens
+        if self.unk_token not in self.src_encoder:
+            raise KeyError("<unk> token must be in vocab")
+        if self.pad_token not in self.src_encoder:
+            raise KeyError("<pad> token must be in vocab")
+
+        # Pre-compute reverse mappings
+        self.src_decoder = {v: k for k, v in self.src_encoder.items()}
+        self.tgt_decoder = {v: k for k, v in self.tgt_encoder.items()}
+
+        # Load SPM models
+        self.src_spm = self._load_spm(self.src_spm_fp)
+        self.tgt_spm = self._load_spm(self.tgt_spm_fp)
+
+        # Initialize current settings
+        self._switch_to_input_mode()
+
+        # Cache token IDs
+        self.unk_token_id = self.src_encoder[self.unk_token]
+        self.pad_token_id = self.src_encoder[self.pad_token]
+        self.eos_token_id = self.src_encoder[self.eos_token]
+        self.bos_token_id = self.src_encoder[self.bos_token]
+
+        super().__init__(
+            src_vocab_file=self.src_vocab_fp,
+            tgt_vocab_file=self.tgt_vocab_fp,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+    def add_new_language_tags(self, new_tags: List[str]) -> None:
+        global LANGUAGE_TAGS
+        LANGUAGE_TAGS = frozenset(LANGUAGE_TAGS | set(new_tags))
+
+    def _switch_to_input_mode(self) -> None:
+        self.spm = self.src_spm
+        self.padding_side = "left"
+        self.encoder = self.src_encoder
+        self.decoder = self.src_decoder
+        self._tokenize = self._src_tokenize
+
+    def _switch_to_target_mode(self) -> None:
+        self.spm = self.tgt_spm
+        self.padding_side = "right"
+        self.encoder = self.tgt_encoder
+        self.decoder = self.tgt_decoder
+        self._tokenize = self._tgt_tokenize
+
+    @staticmethod
+    def _load_spm(path: str) -> SentencePieceProcessor:
+        return SentencePieceProcessor(model_file=path)
+
+    @staticmethod
+    def _save_json(data: Union[Dict, List], path: str) -> None:
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+
+    @staticmethod
+    def _load_json(path: str) -> Union[Dict, List]:
+        with open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+
+    @property
+    def src_vocab_size(self) -> int:
+        return len(self.src_encoder)
+
+    @property
+    def tgt_vocab_size(self) -> int:
+        return len(self.tgt_encoder)
+
+    def get_src_vocab(self) -> Dict[str, int]:
+        return dict(self.src_encoder, **self.added_tokens_encoder)
+
+    def get_tgt_vocab(self) -> Dict[str, int]:
+        return dict(self.tgt_encoder, **self.added_tokens_decoder)
+
+    def get_vocab(self) -> Dict[str, int]:
+        return self.get_src_vocab()
+
+    @property
+    def vocab_size(self) -> int:
+        return self.src_vocab_size
+
+    def _convert_token_to_id(self, token: str) -> int:
+        return self.encoder.get(token, self.unk_token_id)
+
+    def _convert_id_to_token(self, index: int) -> str:
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return "".join(tokens).replace("▁", " ").strip()
+
+    def _src_tokenize(self, text: str) -> List[str]:
+        src_lang, tgt_lang, text = text.split(" ", 2)
+        assert src_lang in LANGUAGE_TAGS, f"Invalid source language tag: {src_lang}"
+        assert tgt_lang in LANGUAGE_TAGS, f"Invalid target language tag: {tgt_lang}"
+        return [src_lang, tgt_lang] + self.spm.EncodeAsPieces(text)
+
+    def _tgt_tokenize(self, text: str) -> List[str]:
+        return self.spm.EncodeAsPieces(text)
+
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        spaces_between_special_tokens: bool = True,
+        **kwargs,
+    ) -> str:
+        self._switch_to_target_mode()
+        decoded_token_ids = super()._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+        self._switch_to_input_mode()
+        return decoded_token_ids
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        return token_ids_0 + [self.eos_token_id]
+
+    def save_vocabulary(
+        self, save_directory: str, filename_prefix: Optional[str] = None
+    ) -> Tuple[str, ...]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return ()
+
+        src_spm_fp = os.path.join(save_directory, "model.SRC")
+        tgt_spm_fp = os.path.join(save_directory, "model.TGT")
+        src_vocab_fp = os.path.join(save_directory, "dict.SRC.json")
+        tgt_vocab_fp = os.path.join(save_directory, "dict.TGT.json")
+
+        self._save_json(self.src_encoder, src_vocab_fp)
+        self._save_json(self.tgt_encoder, tgt_vocab_fp)
+
+        for fp, spm in [(src_spm_fp, self.src_spm), (tgt_spm_fp, self.tgt_spm)]:
+            with open(fp, "wb") as f:
+                f.write(spm.serialized_model_proto())
+
+        return src_vocab_fp, tgt_vocab_fp, src_spm_fp, tgt_spm_fp
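Because _src_tokenize expects the raw text to be prefixed with source and target language tags, encoding looks like this in practice. A usage sketch, assuming the four vocab/SPM files from this commit sit in the working directory; the sample sentence is illustrative:

from tokenization_indictrans import IndicTransTokenizer

tokenizer = IndicTransTokenizer(
    src_vocab_fp="dict.SRC.json",
    tgt_vocab_fp="dict.TGT.json",
    src_spm_fp="model.SRC",
    tgt_spm_fp="model.TGT",
)

# Source text carries "<src_tag> <tgt_tag> " before the sentence proper;
# the two tags become the first two tokens, and </s> is appended by
# build_inputs_with_special_tokens.
enc = tokenizer("eng_Latn hin_Deva This is a test sentence.")
print(enc["input_ids"])

# _decode switches to target mode internally, so it is meant for ids the
# model generates on the target side, e.g.:
# tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
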
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_indictrans.IndicTransTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 256,
+  "pad_token": "<pad>",
+  "src_vocab_file": "/root/.cache/huggingface/hub/models--ai4bharat--indictrans2-en-indic-1B/snapshots/10e65a9951a1e922cd109a95e8aba9357b62144b/dict.SRC.json",
+  "tgt_vocab_file": "/root/.cache/huggingface/hub/models--ai4bharat--indictrans2-en-indic-1B/snapshots/10e65a9951a1e922cd109a95e8aba9357b62144b/dict.TGT.json",
+  "tokenizer_class": "IndicTransTokenizer",
+  "unk_token": "<unk>"
+}
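The auto_map entry registers the custom class with AutoTokenizer, so the standard remote-code loading path also works. A minimal sketch; REPO_ID is a placeholder for whichever Hub repository this commit was pushed to:

from transformers import AutoTokenizer

REPO_ID = "user/repo"  # hypothetical placeholder, not a real repository id

# trust_remote_code is required because auto_map points at the custom
# tokenization_indictrans.IndicTransTokenizer shipped in this repo.
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
print(type(tokenizer).__name__)  # IndicTransTokenizer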