Spaces:
Sleeping
Sleeping
| from typing import Dict, List, Tuple | |
| from tiktoken import get_encoding | |
| from transformers.tokenization_utils_base import PreTrainedTokenizerBase | |
# Create a wrapper class to make OpenAI's tokenizer compatible with the HybridChunker interface
class OpenAITokenizerWrapper(PreTrainedTokenizerBase):
    """Minimal wrapper exposing OpenAI's tiktoken encoder through the
    HuggingFace ``PreTrainedTokenizerBase`` interface.

    Token "strings" are the stringified integer token ids produced by
    tiktoken, so token<->id conversion is a plain ``str``/``int`` cast.
    """

    def __init__(
        self, model_name: str = "cl100k_base", max_length: int = 8191, **kwargs
    ):
        """Initialize the tokenizer.

        Args:
            model_name: The name of the OpenAI encoding to use
            max_length: Maximum sequence length
        """
        super().__init__(model_max_length=max_length, **kwargs)
        self.tokenizer = get_encoding(model_name)
        # Highest token id in the encoding; reported as the vocab size.
        self._vocab_size = self.tokenizer.max_token_value

    def tokenize(self, text: str, **kwargs) -> List[str]:
        """Main method used by HybridChunker: encode text and stringify ids."""
        return [str(t) for t in self.tokenizer.encode(text)]

    def _tokenize(self, text: str) -> List[str]:
        return self.tokenize(text)

    def _convert_token_to_id(self, token: str) -> int:
        # Tokens are stringified ids, so the id is simply the int value.
        return int(token)

    def _convert_id_to_token(self, index: int) -> str:
        return str(index)

    def get_vocab(self) -> Dict[str, int]:
        # BUG FIX: the original returned dict(enumerate(range(self.vocab_size))),
        # which (a) maps int -> int instead of token-string -> id as the
        # annotation promises, and (b) raised TypeError because vocab_size
        # was an undecorated method, not a property.
        return {str(i): i for i in range(self.vocab_size)}

    @property
    def vocab_size(self) -> int:
        # @property added: get_vocab above accesses self.vocab_size as an
        # attribute, and the HuggingFace tokenizer interface defines
        # vocab_size as a property.
        return self._vocab_size

    def save_vocabulary(self, *args) -> Tuple[str, ...]:
        # Nothing to persist — the vocabulary is fully defined by the
        # tiktoken encoding, so return an empty tuple of saved-file paths.
        return ()

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Class method to match HuggingFace's interface."""
        # @classmethod added: the signature takes `cls` and the docstring
        # declares a class method, but the decorator was missing.
        return cls()