Upload 6 files
Browse files- llmjp4_tokenizer.py +16 -0
llmjp4_tokenizer.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
# llm-jp-4 tokenizer
|
| 2 |
|
| 3 |
from collections.abc import Sequence
|
|
|
|
| 4 |
|
| 5 |
from transformers import LlamaTokenizerFast
|
|
|
|
| 6 |
|
| 7 |
from .llmjp4_harmony import HarmonyMessageParser, HarmonyMessage
|
| 8 |
|
|
@@ -53,6 +55,20 @@ class Llmjp4Tokenizer(LlamaTokenizerFast):
|
|
| 53 |
},
|
| 54 |
}
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
def __init__(self, *args, **kwargs):
|
| 57 |
super().__init__(*args, **kwargs)
|
| 58 |
|
|
|
|
| 1 |
# llm-jp-4 tokenizer
|
| 2 |
|
| 3 |
from collections.abc import Sequence
|
| 4 |
+
import os
|
| 5 |
|
| 6 |
from transformers import LlamaTokenizerFast
|
| 7 |
+
from tokenizers import Tokenizer
|
| 8 |
|
| 9 |
from .llmjp4_harmony import HarmonyMessageParser, HarmonyMessage
|
| 10 |
|
|
|
|
| 55 |
},
|
| 56 |
}
|
| 57 |
|
| 58 |
+
@classmethod
def convert_to_native_format(cls, **kwargs):
    """Swap the ``tokenizer_file`` kwarg for a loaded ``tokenizer_object``.

    NOTE(odashi): workaround for transformers 5.x, intended to guarantee the
    same inner behavior as TokenizersBackend:
    https://github.com/huggingface/transformers/blob/7d9754a05193eb79b1d86aa744b622b8068008cd/src/transformers/tokenization_utils_tokenizers.py#L110-L116

    Args:
        **kwargs: Keyword arguments; ``tokenizer_file`` must be the path of
            an existing file.

    Returns:
        A new dict holding every keyword argument except ``tokenizer_file``,
        plus ``tokenizer_object`` built by ``Tokenizer.from_file`` from that
        path.

    Raises:
        ValueError: If ``tokenizer_file`` is absent, ``None``, or does not
            point to an existing file.
    """
    tokenizer_path = kwargs.get("tokenizer_file")
    path_is_usable = tokenizer_path is not None and os.path.isfile(tokenizer_path)
    if not path_is_usable:
        raise ValueError("Tokenizer file must exist.")

    # Build a fresh mapping so the incoming kwargs are left as they were.
    native_kwargs = {
        key: value for key, value in kwargs.items() if key != "tokenizer_file"
    }
    native_kwargs["tokenizer_object"] = Tokenizer.from_file(tokenizer_path)
    return native_kwargs
|
| 71 |
+
|
| 72 |
def __init__(self, *args, **kwargs):
|
| 73 |
super().__init__(*args, **kwargs)
|
| 74 |
|