NeTS-lab committed on
Commit 9fc4d25 · verified · 1 Parent(s): 9880de2

Upload folder using huggingface_hub

__init__.py ADDED
File without changes
added_tokens.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "<unk>": 0,
+   "<pad>": 1,
+   "<s>": 2,
+   "</s>": 3,
+   "<mask>": 4,
+   "<sep>": 5,
+   "<cls>": 6
+ }
morpiece_data.json ADDED
The diff for this file is too large to render. See raw diff
 
morpiece_processor.py ADDED
@@ -0,0 +1,200 @@
+ """MorPiece Processor for Hugging Face Transformers with AutoProcessor support"""
+
+ import json
+ import os
+ from typing import List, Optional, Union
+
+ from transformers import ProcessorMixin, WhisperFeatureExtractor, CLIPImageProcessor
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ try:
+     from .morpiece_tokenizer import MorPieceTokenizer
+ except ImportError:
+     from morpiece_tokenizer import MorPieceTokenizer
+
+
+ class MorPieceProcessor(ProcessorMixin):
+     """MorPiece processor that combines the tokenizer with optional image/audio processors.
+
+     This processor is compatible with AutoProcessor.from_pretrained().
+     """
+
+     attributes = ["tokenizer"]
+     tokenizer_class = "MorPieceTokenizer"
+
+     def __init__(
+         self,
+         tokenizer=None,
+         image_processor=None,
+         feature_extractor=None,
+         processor_type="text_only",
+         **kwargs,
+     ):
+         if tokenizer is None:
+             raise ValueError("MorPieceProcessor requires a tokenizer")
+
+         self.tokenizer = tokenizer
+         self.processor_type = processor_type
+
+         # Copy the class-level attribute list so that appending modality-specific
+         # entries below does not mutate it for every other instance.
+         self.attributes = list(self.attributes)
+
+         # Initialize additional processors based on the processor type
+         if processor_type == "vision_text":
+             self.image_processor = image_processor
+             if self.image_processor is not None:
+                 self.attributes.append("image_processor")
+         elif processor_type == "audio_text":
+             self.feature_extractor = feature_extractor
+             if self.feature_extractor is not None:
+                 self.attributes.append("feature_extractor")
+
+         super().__init__(**kwargs)
+
+     def __call__(
+         self,
+         text: Union[str, List[str]] = None,
+         images=None,
+         audio=None,
+         return_tensors: Optional[str] = None,
+         **kwargs,
+     ):
+         """
+         Process inputs based on the processor type.
+
+         Parameters
+         ----------
+         text : str or List[str], optional
+             Text input(s) to tokenize.
+         images : PIL.Image or List[PIL.Image], optional
+             Image input(s) to process (for the vision_text processor type).
+         audio : np.ndarray or List[np.ndarray], optional
+             Audio input(s) to process (for the audio_text processor type).
+         return_tensors : str, optional
+             Type of tensors to return ("pt", "tf", "np").
+         **kwargs
+             Additional arguments forwarded to the respective processors.
+         """
+         # Process text if provided, forwarding only the kwargs the tokenizer accepts
+         if text is not None:
+             text_inputs = self.tokenizer(
+                 text,
+                 return_tensors=return_tensors,
+                 **{k: v for k, v in kwargs.items() if k in self.tokenizer.__call__.__code__.co_varnames},
+             )
+         else:
+             text_inputs = {}
+
+         # Process images if provided (vision_text processor type)
+         if images is not None and self.processor_type == "vision_text":
+             if getattr(self, "image_processor", None) is not None:
+                 image_inputs = self.image_processor(
+                     images,
+                     return_tensors=return_tensors,
+                     **{k: v for k, v in kwargs.items() if k in self.image_processor.__call__.__code__.co_varnames},
+                 )
+                 text_inputs.update(image_inputs)
+             else:
+                 raise ValueError("Image processor not initialized for the vision_text processor type")
+
+         # Process audio if provided (audio_text processor type)
+         if audio is not None and self.processor_type == "audio_text":
+             if getattr(self, "feature_extractor", None) is not None:
+                 audio_inputs = self.feature_extractor(
+                     audio,
+                     return_tensors=return_tensors,
+                     **{k: v for k, v in kwargs.items() if k in self.feature_extractor.__call__.__code__.co_varnames},
+                 )
+                 text_inputs.update(audio_inputs)
+             else:
+                 raise ValueError("Feature extractor not initialized for the audio_text processor type")
+
+         return text_inputs
+
+     def batch_decode(self, *args, **kwargs):
+         """Forward all arguments to the tokenizer's batch_decode."""
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     def decode(self, *args, **kwargs):
+         """Forward all arguments to the tokenizer's decode."""
+         return self.tokenizer.decode(*args, **kwargs)
+
+     @classmethod
+     def from_pretrained(
+         cls,
+         pretrained_model_name_or_path: Union[str, os.PathLike],
+         cache_dir: Optional[Union[str, os.PathLike]] = None,
+         force_download: bool = False,
+         local_files_only: bool = False,
+         token: Optional[Union[str, bool]] = None,
+         revision: str = "main",
+         **kwargs,
+     ):
+         """Load a processor from a pretrained model."""
+         # Load the processor config if present; the nested lookup below
+         # falls back to a text-only processor otherwise.
+         processor_config_file = os.path.join(pretrained_model_name_or_path, "processor_config.json")
+         if os.path.exists(processor_config_file):
+             with open(processor_config_file, "r") as f:
+                 config = json.load(f)
+         else:
+             config = {}
+
+         processor_type = config.get("morpiece_config", {}).get("processor_type", "text_only")
+
+         # Load the tokenizer
+         tokenizer = MorPieceTokenizer.from_pretrained(
+             pretrained_model_name_or_path,
+             **kwargs,
+         )
+
+         # Load additional processors based on the processor type
+         image_processor = None
+         feature_extractor = None
+
+         if processor_type == "vision_text":
+             try:
+                 image_processor = CLIPImageProcessor.from_pretrained(
+                     pretrained_model_name_or_path,
+                     **kwargs,
+                 )
+             except Exception:
+                 logger.warning("Could not load image processor, using default CLIPImageProcessor")
+                 image_processor = CLIPImageProcessor()
+
+         elif processor_type == "audio_text":
+             try:
+                 feature_extractor = WhisperFeatureExtractor.from_pretrained(
+                     pretrained_model_name_or_path,
+                     **kwargs,
+                 )
+             except Exception:
+                 logger.warning("Could not load feature extractor, using default WhisperFeatureExtractor")
+                 feature_extractor = WhisperFeatureExtractor()
+
+         return cls(
+             tokenizer=tokenizer,
+             image_processor=image_processor,
+             feature_extractor=feature_extractor,
+             processor_type=processor_type,
+             **kwargs,
+         )
+
+     @property
+     def model_input_names(self):
+         """List of input names expected by the model."""
+         input_names = ["input_ids", "attention_mask"]
+
+         if self.processor_type == "vision_text":
+             input_names.append("pixel_values")
+         elif self.processor_type == "audio_text":
+             input_names.append("input_features")
+
+         return input_names
+
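For reference, a minimal usage sketch for the processor above. The repository path is a placeholder for a local clone or the actual Hub repo id, and trust_remote_code=True is needed because AutoProcessor resolves the custom class through the auto_map entry in processor_config.json.

from transformers import AutoProcessor

# Placeholder path: substitute the actual repo id or a local checkout.
processor = AutoProcessor.from_pretrained(
    "path/to/morpiece-repo",
    trust_remote_code=True,
)

# Text-only processing (the default processor_type per processor_config.json).
batch = processor(text=["MorPiece segments words morphologically."], return_tensors="pt")
print(batch["input_ids"].shape)

# decode/batch_decode forward to the underlying MorPieceTokenizer.
print(processor.batch_decode(batch["input_ids"]))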
morpiece_tokenizer.py ADDED
@@ -0,0 +1,169 @@
+ """MorPiece Tokenizer for Hugging Face Transformers"""
+
+ import json
+ import os
+ from typing import Dict, List, Optional, Tuple
+
+ from transformers import PreTrainedTokenizer
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class MorPieceTokenizer(PreTrainedTokenizer):
+     """MorPiece tokenizer for Hugging Face transformers.
+
+     This tokenizer uses morphological segmentation based on tries and the sufficiency principle.
+     """
+
+     vocab_files_names = {
+         "vocab_file": "vocab.json",
+         "tokenizer_file": "tokenizer.json",
+     }
+
+     def __init__(
+         self,
+         vocab_file=None,
+         tokenizer_file=None,
+         unk_token="<unk>",
+         pad_token="<pad>",
+         bos_token="<s>",
+         eos_token="</s>",
+         mask_token="<mask>",
+         sep_token="<sep>",
+         cls_token="<cls>",
+         add_prefix_space=True,
+         vocab_size=60000,  # accepted for config compatibility; actual size derives from the loaded vocab
+         min_frequency=10,
+         cutoff=100,
+         bf=4,
+         use_tokenizers_lib=True,
+         **kwargs,
+     ):
+         # Load the vocabulary before calling the parent constructor, which
+         # needs _convert_token_to_id to resolve the special tokens.
+         if vocab_file and os.path.exists(vocab_file):
+             with open(vocab_file, "r", encoding="utf-8") as f:
+                 self.vocab_to_id = json.load(f)
+         else:
+             self.vocab_to_id = {}
+
+         self.id_to_vocab = {v: k for k, v in self.vocab_to_id.items()}
+
+         # Load the tokenizer configuration (the MorPiece trie roots)
+         if tokenizer_file and os.path.exists(tokenizer_file):
+             with open(tokenizer_file, "r", encoding="utf-8") as f:
+                 tokenizer_config = json.load(f)
+             self.roots = tokenizer_config.get("model", {}).get("roots", {})
+         else:
+             self.roots = {}
+
+         # Store MorPiece-specific parameters
+         self.min_frequency = min_frequency
+         self.cutoff = cutoff
+         self.bf = bf
+         self.use_tokenizers_lib = use_tokenizers_lib
+
+         # Initialize the parent class
+         super().__init__(
+             unk_token=unk_token,
+             pad_token=pad_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             mask_token=mask_token,
+             sep_token=sep_token,
+             cls_token=cls_token,
+             add_prefix_space=add_prefix_space,
+             **kwargs,
+         )
+
+         # Set special token IDs, falling back to the conventional positions
+         self.unk_token_id = self.vocab_to_id.get(unk_token, 0)
+         self.pad_token_id = self.vocab_to_id.get(pad_token, 1)
+         self.bos_token_id = self.vocab_to_id.get(bos_token, 2)
+         self.eos_token_id = self.vocab_to_id.get(eos_token, 3)
+         self.mask_token_id = self.vocab_to_id.get(mask_token, 4)
+         self.sep_token_id = self.vocab_to_id.get(sep_token, 5)
+         self.cls_token_id = self.vocab_to_id.get(cls_token, 6)
+
+     @property
+     def vocab_size(self) -> int:
+         return len(self.vocab_to_id)
+
+     def get_vocab(self) -> Dict[str, int]:
+         return self.vocab_to_id.copy()
+
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+         """Tokenize a string using the MorPiece algorithm."""
+         # This is a simplified version - the full MorPiece logic can be integrated here
+         words = text.strip().split()
+         tokens = []
+
+         for word in words:
+             if word in self.roots.get("[RSX]", {}):
+                 tokens.append(word)
+             else:
+                 # Fall back to simplified subword tokenization
+                 tokens.extend(self._tokenize_word(word))
+
+         return tokens
+
+     def _tokenize_word(self, word: str) -> List[str]:
+         """Tokenize a single word by greedy longest-match against the vocabulary."""
+         tokens = []
+         i = 0
+         while i < len(word):
+             found = False
+             # Try to find the longest match in the vocabulary
+             for j in range(len(word), i, -1):
+                 subword = word[i:j]
+                 if subword in self.vocab_to_id:
+                     tokens.append(subword)
+                     i = j
+                     found = True
+                     break
+             if not found:
+                 # No prefix matched: emit <unk> and advance one character
+                 tokens.append(self.unk_token)
+                 i += 1
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> int:
+         """Convert a token to its ID."""
+         return self.vocab_to_id.get(token, self.unk_token_id)
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Convert an ID to its token."""
+         return self.id_to_vocab.get(index, self.unk_token)
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """Convert a list of tokens back to a string."""
+         # Tokens prefixed with '++' are continuation pieces: strip the marker
+         # and attach them to the preceding token. Other tokens start a new
+         # word, separated by a space (words were whitespace-split in _tokenize).
+         words: List[str] = []
+         for token in tokens:
+             if token.startswith("++"):
+                 piece = token[2:]  # remove the ++ prefix
+                 if words:
+                     words[-1] += piece
+                 else:
+                     words.append(piece)
+             else:
+                 words.append(token)
+         return " ".join(words)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """Save the vocabulary to a file."""
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
+         )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self.vocab_to_id, f, indent=2, sort_keys=True, ensure_ascii=False)
+
+         return (vocab_file,)
+
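The fallback segmentation in _tokenize_word above is a greedy longest-match scan. A toy standalone illustration of the same logic, using a hand-picked vocabulary rather than the trained vocab.json:

# Mirrors the greedy longest-match loop in _tokenize_word with a toy vocab.
toy_vocab = {"un": 0, "break": 1, "able": 2}

def longest_match(word, vocab, unk="<unk>"):
    tokens, i = [], 0
    while i < len(word):
        for j in range(len(word), i, -1):  # longest candidate first
            if word[i:j] in vocab:
                tokens.append(word[i:j])
                i = j
                break
        else:
            tokens.append(unk)  # no prefix matched: emit <unk>, advance one char
            i += 1
    return tokens

print(longest_match("unbreakable", toy_vocab))  # ['un', 'break', 'able']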
processor_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "processor_class": "MorPieceProcessor",
+   "auto_map": {
+     "AutoProcessor": "morpiece_processor.MorPieceProcessor"
+   },
+   "tokenizer_class": "MorPieceTokenizer",
+   "feature_extractor_class": null,
+   "image_processor_class": null,
+   "audio_processor_class": null,
+   "morpiece_config": {
+     "vocab_size": 50684,
+     "min_frequency": 10,
+     "cutoff": 100,
+     "bf": 10,
+     "use_tokenizers_lib": true,
+     "processor_type": "text_only"
+   }
+ }
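For reference, the config lookup that MorPieceProcessor.from_pretrained performs against this file reduces to the sketch below; processor_type is read from the nested morpiece_config block, with "text_only" as the fallback.

import json

with open("processor_config.json", "r") as f:
    config = json.load(f)

# Nested lookup with a text-only default, as in MorPieceProcessor.from_pretrained.
processor_type = config.get("morpiece_config", {}).get("processor_type", "text_only")
print(processor_type)  # -> "text_only" for this repository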
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "<sep>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<cls>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "tokenizer_class": "MorPieceTokenizer",
+   "auto_map": {
+     "AutoTokenizer": [
+       "morpiece_tokenizer.MorPieceTokenizer",
+       null
+     ]
+   },
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "unk_token": "<unk>",
+   "pad_token": "<pad>",
+   "mask_token": "<mask>",
+   "sep_token": "<sep>",
+   "cls_token": "<cls>",
+   "model_max_length": 512,
+   "padding_side": "left",
+   "truncation_side": "right",
+   "chat_template": null,
+   "clean_up_tokenization_spaces": false,
+   "split_special_tokens": false,
+   "strip_accents": null,
+   "add_prefix_space": true,
+   "vocab_size": 50684,
+   "min_frequency": 10,
+   "cutoff": 100,
+   "bf": 10,
+   "use_tokenizers_lib": true
+ }
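Because the auto_map above points AutoTokenizer at morpiece_tokenizer.MorPieceTokenizer, the tokenizer can also be loaded on its own. A minimal sketch, assuming a placeholder repo path; trust_remote_code=True is required for custom tokenizer code.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "path/to/morpiece-repo",  # placeholder for the actual repo id or local path
    trust_remote_code=True,
)

# The settings above take effect here: left-side padding, right-side
# truncation, and a model_max_length of 512.
enc = tokenizer(
    ["a short example", "a somewhat longer example sentence"],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(enc["input_ids"].shape)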
vocab.json ADDED
The diff for this file is too large to render. See raw diff