NeTS-lab committed on
Commit 9fc4d25 · verified · 1 Parent(s): 9880de2

Upload folder using huggingface_hub

__init__.py ADDED
File without changes
added_tokens.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "<unk>": 0,
+   "<pad>": 1,
+   "<s>": 2,
+   "</s>": 3,
+   "<mask>": 4,
+   "<sep>": 5,
+   "<cls>": 6
+ }
morpiece_data.json ADDED
The diff for this file is too large to render. See raw diff
 
morpiece_processor.py ADDED
@@ -0,0 +1,200 @@
+ """MorPiece Processor for Hugging Face Transformers with AutoProcessor support"""
+
+ import json
+ import os
+ from typing import List, Optional, Union
+
+ from transformers import ProcessorMixin, WhisperFeatureExtractor, CLIPImageProcessor
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ try:
+     from .morpiece_tokenizer import MorPieceTokenizer
+ except ImportError:
+     from morpiece_tokenizer import MorPieceTokenizer
+
+
+ class MorPieceProcessor(ProcessorMixin):
+     """MorPiece processor that combines the tokenizer with optional image/audio processors.
+
+     This processor is compatible with AutoProcessor.from_pretrained().
+     """
+
+     attributes = ["tokenizer"]
+     tokenizer_class = "MorPieceTokenizer"
+
+     def __init__(
+         self,
+         tokenizer=None,
+         image_processor=None,
+         feature_extractor=None,
+         processor_type="text_only",
+         **kwargs,
+     ):
+         if tokenizer is None:
+             raise ValueError("MorPieceProcessor requires a tokenizer")
+
+         self.tokenizer = tokenizer
+         self.processor_type = processor_type
+
+         # Copy the class-level attribute list so that appending modality-specific
+         # entries below does not mutate it for every other instance.
+         self.attributes = list(self.attributes)
+
+         # Initialize additional processors based on the processor type
+         if processor_type == "vision_text":
+             self.image_processor = image_processor
+             if self.image_processor is not None:
+                 self.attributes.append("image_processor")
+         elif processor_type == "audio_text":
+             self.feature_extractor = feature_extractor
+             if self.feature_extractor is not None:
+                 self.attributes.append("feature_extractor")
+
+         super().__init__(**kwargs)
+
+     def __call__(
+         self,
+         text: Union[str, List[str]] = None,
+         images=None,
+         audio=None,
+         return_tensors: Optional[str] = None,
+         **kwargs,
+     ):
+         """
+         Process inputs based on the processor type.
+
+         Parameters
+         ----------
+         text : str or List[str], optional
+             Text input(s) to tokenize.
+         images : PIL.Image or List[PIL.Image], optional
+             Image input(s) to process (for the vision_text processor type).
+         audio : np.ndarray or List[np.ndarray], optional
+             Audio input(s) to process (for the audio_text processor type).
+         return_tensors : str, optional
+             Type of tensors to return ("pt", "tf", "np").
+         **kwargs
+             Additional arguments forwarded to the respective processors.
+         """
+         # Process text if provided, forwarding only the kwargs the tokenizer accepts
+         if text is not None:
+             text_inputs = self.tokenizer(
+                 text,
+                 return_tensors=return_tensors,
+                 **{k: v for k, v in kwargs.items() if k in self.tokenizer.__call__.__code__.co_varnames},
+             )
+         else:
+             text_inputs = {}
+
+         # Process images if provided (vision_text processor type)
+         if images is not None and self.processor_type == "vision_text":
+             if getattr(self, "image_processor", None) is not None:
+                 image_inputs = self.image_processor(
+                     images,
+                     return_tensors=return_tensors,
+                     **{k: v for k, v in kwargs.items() if k in self.image_processor.__call__.__code__.co_varnames},
+                 )
+                 text_inputs.update(image_inputs)
+             else:
+                 raise ValueError("Image processor not initialized for the vision_text processor type")
+
+         # Process audio if provided (audio_text processor type)
+         if audio is not None and self.processor_type == "audio_text":
+             if getattr(self, "feature_extractor", None) is not None:
+                 audio_inputs = self.feature_extractor(
+                     audio,
+                     return_tensors=return_tensors,
+                     **{k: v for k, v in kwargs.items() if k in self.feature_extractor.__call__.__code__.co_varnames},
+                 )
+                 text_inputs.update(audio_inputs)
+             else:
+                 raise ValueError("Feature extractor not initialized for the audio_text processor type")
+
+         return text_inputs
+
+     def batch_decode(self, *args, **kwargs):
+         """Forward all arguments to the tokenizer's batch_decode."""
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     def decode(self, *args, **kwargs):
+         """Forward all arguments to the tokenizer's decode."""
+         return self.tokenizer.decode(*args, **kwargs)
+
+     @classmethod
+     def from_pretrained(
+         cls,
+         pretrained_model_name_or_path: Union[str, os.PathLike],
+         cache_dir: Optional[Union[str, os.PathLike]] = None,
+         force_download: bool = False,
+         local_files_only: bool = False,
+         token: Optional[Union[str, bool]] = None,
+         revision: str = "main",
+         **kwargs,
+     ):
+         """Load a processor from a pretrained model."""
+         # Load the processor config if present; the nested lookup below
+         # falls back to a text-only processor otherwise.
+         processor_config_file = os.path.join(pretrained_model_name_or_path, "processor_config.json")
+         if os.path.exists(processor_config_file):
+             with open(processor_config_file, "r") as f:
+                 config = json.load(f)
+         else:
+             config = {}
+
+         processor_type = config.get("morpiece_config", {}).get("processor_type", "text_only")
+
+         # Load the tokenizer
+         tokenizer = MorPieceTokenizer.from_pretrained(
+             pretrained_model_name_or_path,
+             **kwargs,
+         )
+
+         # Load additional processors based on the processor type
+         image_processor = None
+         feature_extractor = None
+
+         if processor_type == "vision_text":
+             try:
+                 image_processor = CLIPImageProcessor.from_pretrained(
+                     pretrained_model_name_or_path,
+                     **kwargs,
+                 )
+             except Exception:
+                 logger.warning("Could not load image processor, using default CLIPImageProcessor")
+                 image_processor = CLIPImageProcessor()
+
+         elif processor_type == "audio_text":
+             try:
+                 feature_extractor = WhisperFeatureExtractor.from_pretrained(
+                     pretrained_model_name_or_path,
+                     **kwargs,
+                 )
+             except Exception:
+                 logger.warning("Could not load feature extractor, using default WhisperFeatureExtractor")
+                 feature_extractor = WhisperFeatureExtractor()
+
+         return cls(
+             tokenizer=tokenizer,
+             image_processor=image_processor,
+             feature_extractor=feature_extractor,
+             processor_type=processor_type,
+             **kwargs,
+         )
+
+     @property
+     def model_input_names(self):
+         """List of input names expected by the model."""
+         input_names = ["input_ids", "attention_mask"]
+
+         if self.processor_type == "vision_text":
+             input_names.append("pixel_values")
+         elif self.processor_type == "audio_text":
+             input_names.append("input_features")
+
+         return input_names
+
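For reference, a minimal usage sketch for the processor above. The repository path is a placeholder for a local clone or the actual Hub repo id, and trust_remote_code=True is needed because AutoProcessor resolves the custom class through the auto_map entry in processor_config.json.

from transformers import AutoProcessor

# Placeholder path: substitute the actual repo id or a local checkout.
processor = AutoProcessor.from_pretrained(
    "path/to/morpiece-repo",
    trust_remote_code=True,
)

# Text-only processing (the default processor_type per processor_config.json).
batch = processor(text=["MorPiece segments words morphologically."], return_tensors="pt")
print(batch["input_ids"].shape)

# decode/batch_decode forward to the underlying MorPieceTokenizer.
print(processor.batch_decode(batch["input_ids"]))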
morpiece_tokenizer.py ADDED
@@ -0,0 +1,169 @@
+ """MorPiece Tokenizer for Hugging Face Transformers"""
+
+ import json
+ import os
+ from typing import Dict, List, Optional, Tuple
+
+ from transformers import PreTrainedTokenizer
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class MorPieceTokenizer(PreTrainedTokenizer):
+     """MorPiece tokenizer for Hugging Face transformers.
+
+     This tokenizer uses morphological segmentation based on tries and the sufficiency principle.
+     """
+
+     vocab_files_names = {
+         "vocab_file": "vocab.json",
+         "tokenizer_file": "tokenizer.json",
+     }
+
+     def __init__(
+         self,
+         vocab_file=None,
+         tokenizer_file=None,
+         unk_token="<unk>",
+         pad_token="<pad>",
+         bos_token="<s>",
+         eos_token="</s>",
+         mask_token="<mask>",
+         sep_token="<sep>",
+         cls_token="<cls>",
+         add_prefix_space=True,
+         vocab_size=60000,  # accepted for config compatibility; actual size derives from the loaded vocab
+         min_frequency=10,
+         cutoff=100,
+         bf=4,
+         use_tokenizers_lib=True,
+         **kwargs,
+     ):
+         # Load the vocabulary before calling the parent constructor, which
+         # needs _convert_token_to_id to resolve the special tokens.
+         if vocab_file and os.path.exists(vocab_file):
+             with open(vocab_file, "r", encoding="utf-8") as f:
+                 self.vocab_to_id = json.load(f)
+         else:
+             self.vocab_to_id = {}
+
+         self.id_to_vocab = {v: k for k, v in self.vocab_to_id.items()}
+
+         # Load the tokenizer configuration (the MorPiece trie roots)
+         if tokenizer_file and os.path.exists(tokenizer_file):
+             with open(tokenizer_file, "r", encoding="utf-8") as f:
+                 tokenizer_config = json.load(f)
+             self.roots = tokenizer_config.get("model", {}).get("roots", {})
+         else:
+             self.roots = {}
+
+         # Store MorPiece-specific parameters
+         self.min_frequency = min_frequency
+         self.cutoff = cutoff
+         self.bf = bf
+         self.use_tokenizers_lib = use_tokenizers_lib
+
+         # Initialize the parent class
+         super().__init__(
+             unk_token=unk_token,
+             pad_token=pad_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             mask_token=mask_token,
+             sep_token=sep_token,
+             cls_token=cls_token,
+             add_prefix_space=add_prefix_space,
+             **kwargs,
+         )
+
+         # Set special token IDs, falling back to the conventional positions
+         self.unk_token_id = self.vocab_to_id.get(unk_token, 0)
+         self.pad_token_id = self.vocab_to_id.get(pad_token, 1)
+         self.bos_token_id = self.vocab_to_id.get(bos_token, 2)
+         self.eos_token_id = self.vocab_to_id.get(eos_token, 3)
+         self.mask_token_id = self.vocab_to_id.get(mask_token, 4)
+         self.sep_token_id = self.vocab_to_id.get(sep_token, 5)
+         self.cls_token_id = self.vocab_to_id.get(cls_token, 6)
+
+     @property
+     def vocab_size(self) -> int:
+         return len(self.vocab_to_id)
+
+     def get_vocab(self) -> Dict[str, int]:
+         return self.vocab_to_id.copy()
+
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+         """Tokenize a string using the MorPiece algorithm."""
+         # This is a simplified version - the full MorPiece logic can be integrated here
+         words = text.strip().split()
+         tokens = []
+
+         for word in words:
+             if word in self.roots.get("[RSX]", {}):
+                 tokens.append(word)
+             else:
+                 # Fall back to simplified subword tokenization
+                 tokens.extend(self._tokenize_word(word))
+
+         return tokens
+
+     def _tokenize_word(self, word: str) -> List[str]:
+         """Tokenize a single word by greedy longest-match against the vocabulary."""
+         tokens = []
+         i = 0
+         while i < len(word):
+             found = False
+             # Try to find the longest match in the vocabulary
+             for j in range(len(word), i, -1):
+                 subword = word[i:j]
+                 if subword in self.vocab_to_id:
+                     tokens.append(subword)
+                     i = j
+                     found = True
+                     break
+             if not found:
+                 # No prefix matched: emit <unk> and advance one character
+                 tokens.append(self.unk_token)
+                 i += 1
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> int:
+         """Convert a token to its ID."""
+         return self.vocab_to_id.get(token, self.unk_token_id)
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Convert an ID to its token."""
+         return self.id_to_vocab.get(index, self.unk_token)
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """Convert a list of tokens back to a string."""
+         # Tokens prefixed with '++' are continuation pieces: strip the marker
+         # and attach them to the preceding token. Other tokens start a new
+         # word, separated by a space (words were whitespace-split in _tokenize).
+         words: List[str] = []
+         for token in tokens:
+             if token.startswith("++"):
+                 piece = token[2:]  # remove the ++ prefix
+                 if words:
+                     words[-1] += piece
+                 else:
+                     words.append(piece)
+             else:
+                 words.append(token)
+         return " ".join(words)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """Save the vocabulary to a file."""
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
+         )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self.vocab_to_id, f, indent=2, sort_keys=True, ensure_ascii=False)
+
+         return (vocab_file,)
+
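The fallback segmentation in _tokenize_word above is a greedy longest-match scan. A toy standalone illustration of the same logic, using a hand-picked vocabulary rather than the trained vocab.json:

# Mirrors the greedy longest-match loop in _tokenize_word with a toy vocab.
toy_vocab = {"un": 0, "break": 1, "able": 2}

def longest_match(word, vocab, unk="<unk>"):
    tokens, i = [], 0
    while i < len(word):
        for j in range(len(word), i, -1):  # longest candidate first
            if word[i:j] in vocab:
                tokens.append(word[i:j])
                i = j
                break
        else:
            tokens.append(unk)  # no prefix matched: emit <unk>, advance one char
            i += 1
    return tokens

print(longest_match("unbreakable", toy_vocab))  # ['un', 'break', 'able']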
processor_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "processor_class": "MorPieceProcessor",
+   "auto_map": {
+     "AutoProcessor": "morpiece_processor.MorPieceProcessor"
+   },
+   "tokenizer_class": "MorPieceTokenizer",
+   "feature_extractor_class": null,
+   "image_processor_class": null,
+   "audio_processor_class": null,
+   "morpiece_config": {
+     "vocab_size": 50684,
+     "min_frequency": 10,
+     "cutoff": 100,
+     "bf": 10,
+     "use_tokenizers_lib": true,
+     "processor_type": "text_only"
+   }
+ }
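For reference, the config lookup that MorPieceProcessor.from_pretrained performs against this file reduces to the sketch below; processor_type is read from the nested morpiece_config block, with "text_only" as the fallback.

import json

with open("processor_config.json", "r") as f:
    config = json.load(f)

# Nested lookup with a text-only default, as in MorPieceProcessor.from_pretrained.
processor_type = config.get("morpiece_config", {}).get("processor_type", "text_only")
print(processor_type)  # -> "text_only" for this repository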
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "<sep>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<cls>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "tokenizer_class": "MorPieceTokenizer",
+   "auto_map": {
+     "AutoTokenizer": [
+       "morpiece_tokenizer.MorPieceTokenizer",
+       null
+     ]
+   },
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "unk_token": "<unk>",
+   "pad_token": "<pad>",
+   "mask_token": "<mask>",
+   "sep_token": "<sep>",
+   "cls_token": "<cls>",
+   "model_max_length": 512,
+   "padding_side": "left",
+   "truncation_side": "right",
+   "chat_template": null,
+   "clean_up_tokenization_spaces": false,
+   "split_special_tokens": false,
+   "strip_accents": null,
+   "add_prefix_space": true,
+   "vocab_size": 50684,
+   "min_frequency": 10,
+   "cutoff": 100,
+   "bf": 10,
+   "use_tokenizers_lib": true
+ }
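Because the auto_map above points AutoTokenizer at morpiece_tokenizer.MorPieceTokenizer, the tokenizer can also be loaded on its own. A minimal sketch, assuming a placeholder repo path; trust_remote_code=True is required for custom tokenizer code.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "path/to/morpiece-repo",  # placeholder for the actual repo id or local path
    trust_remote_code=True,
)

# The settings above take effect here: left-side padding, right-side
# truncation, and a model_max_length of 512.
enc = tokenizer(
    ["a short example", "a somewhat longer example sentence"],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(enc["input_ids"].shape)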
vocab.json ADDED
The diff for this file is too large to render. See raw diff