SVECTOR-OFFICIAL committed
Commit a949a72 · verified · 1 Parent(s): ffdc618

Create tessar_tokenizer.py

Files changed (1)
  1. tessar_tokenizer.py +133 -0
tessar_tokenizer.py ADDED
import json
import os
from typing import List, Optional

from transformers import AutoTokenizer, PreTrainedTokenizerFast


class TessarTokenizer(PreTrainedTokenizerFast):
    """
    Tessar tokenizer implementation for Hugging Face Transformers.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="<unk>",
        sep_token="</s>",
        pad_token="<pad>",
        cls_token="<s>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
        max_cell_length=15,
        **kwargs
    ):
        """
        Initialize the Tessar tokenizer with its special-token configuration.
        """
        # Collect the special tokens, dropping any explicitly set to None
        special_tokens = {
            "unk_token": unk_token,
            "sep_token": sep_token,
            "pad_token": pad_token,
            "cls_token": cls_token,
            "mask_token": mask_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
        }
        special_tokens = {k: v for k, v in special_tokens.items() if v is not None}

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            **special_tokens,
            **kwargs
        )

        # Tessar-specific attributes
        self.do_lower_case = do_lower_case
        self.max_cell_length = max_cell_length

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """
        Save the tokenizer vocabulary and the special-tokens configuration.
        """
        os.makedirs(save_directory, exist_ok=True)
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, f"{prefix}vocab.json")
        special_tokens_file = os.path.join(save_directory, f"{prefix}special_tokens.json")

        # Fast tokenizers expose the vocabulary via get_vocab(), not a .vocab attribute
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.get_vocab(), f, ensure_ascii=False, indent=2)

        # Persist the special tokens alongside the Tessar-specific settings
        special_tokens_config = {
            "unk_token": self.unk_token,
            "sep_token": self.sep_token,
            "pad_token": self.pad_token,
            "cls_token": self.cls_token,
            "mask_token": self.mask_token,
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
            "do_lower_case": self.do_lower_case,
            "max_cell_length": self.max_cell_length,
        }
        with open(special_tokens_file, "w", encoding="utf-8") as f:
            json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)

        return (vocab_file, special_tokens_file)

    def tokenize(self, text: str, **kwargs) -> List[str]:
        """
        Custom tokenization: lowercase if configured, then truncate to
        max_cell_length tokens.

        Note: fast tokenizers do not implement the slow-tokenizer hook
        _tokenize(), so this overrides tokenize() instead.
        """
        if self.do_lower_case:
            text = text.lower()

        tokens = super().tokenize(text, **kwargs)

        # Cell-length truncation
        return tokens[: self.max_cell_length]

    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        **kwargs
    ) -> dict:
        """
        Prepare tokenized inputs for the model. Currently defers to the parent
        implementation; kept as an extension point for Tessar-specific logic.
        """
        return super().prepare_for_model(ids, pair_ids=pair_ids, **kwargs)

def load_tessar_tokenizer(pretrained_model_name_or_path: str) -> TessarTokenizer:
    """
    Load a pretrained Tessar tokenizer.
    """
    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)


# Register the tokenizer with AutoTokenizer. AutoTokenizer.register takes the
# model's config class (not a string) plus the tokenizer class, so this line
# assumes a TessarConfig class exists and is importable:
# AutoTokenizer.register(TessarConfig, fast_tokenizer_class=TessarTokenizer)
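
For context, a minimal usage sketch of the class above. The checkpoint path `./tessar-tokenizer` is a placeholder and not part of this commit; the directory must contain a `tokenizer.json` for the fast-tokenizer backend to load.

from tessar_tokenizer import TessarTokenizer

# Load from a local checkpoint directory (placeholder path; needs tokenizer.json)
tokenizer = TessarTokenizer.from_pretrained("./tessar-tokenizer")

# Lowercasing and max_cell_length truncation are applied inside tokenize()
print(tokenizer.tokenize("Hello, Tessar!"))

# Standard encoding path; returns input_ids and attention_mask
encoded = tokenizer("Hello, Tessar!")
print(encoded["input_ids"])

# Persist the vocabulary and special-token configuration
vocab_path, special_tokens_path = tokenizer.save_vocabulary("./tessar-export")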