Taka008 committed on
Commit
832ee2e
·
verified ·
1 Parent(s): b68a2a0

Upload 6 files

Browse files
Files changed (1) hide show
  1. llmjp4_tokenizer.py +16 -0
llmjp4_tokenizer.py CHANGED
@@ -1,8 +1,10 @@
1
  # llm-jp-4 tokenizer
2
 
3
  from collections.abc import Sequence
 
4
 
5
  from transformers import LlamaTokenizerFast
 
6
 
7
  from .llmjp4_harmony import HarmonyMessageParser, HarmonyMessage
8
 
@@ -53,6 +55,20 @@ class Llmjp4Tokenizer(LlamaTokenizerFast):
53
  },
54
  }
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def __init__(self, *args, **kwargs):
57
  super().__init__(*args, **kwargs)
58
 
 
1
  # llm-jp-4 tokenizer
2
 
3
  from collections.abc import Sequence
4
+ import os
5
 
6
  from transformers import LlamaTokenizerFast
7
+ from tokenizers import Tokenizer
8
 
9
  from .llmjp4_harmony import HarmonyMessageParser, HarmonyMessage
10
 
 
55
  },
56
  }
57
 
58
@classmethod
def convert_to_native_format(cls, **kwargs):
    """Replace the ``tokenizer_file`` path in ``kwargs`` with a loaded tokenizer.

    A shallow copy of ``kwargs`` is made, the ``"tokenizer_file"`` entry is
    removed from the copy, the file it points at is loaded with
    ``Tokenizer.from_file`` and the result is stored under
    ``"tokenizer_object"``. All other entries are passed through unchanged.

    Args:
        **kwargs: Keyword arguments; must contain ``"tokenizer_file"``
            pointing at an existing regular file.

    Returns:
        A new dict with ``"tokenizer_file"`` removed and
        ``"tokenizer_object"`` added.

    Raises:
        ValueError: If ``"tokenizer_file"`` is missing, ``None``, or does not
            name an existing regular file.
    """
    # NOTE(odashi):
    # Workaround for transformers 5.x.
    # Guarantees the same inner behavior as TokenizersBackend.
    # https://github.com/huggingface/transformers/blob/7d9754a05193eb79b1d86aa744b622b8068008cd/src/transformers/tokenization_utils_tokenizers.py#L110-L116

    # Work on a copy so the mapping received from the caller is left alone.
    local_kwargs = dict(kwargs)
    fast_tokenizer_file = local_kwargs.pop("tokenizer_file", None)

    # Guard clauses: a missing entry and a path that is not a regular file
    # are both reported with the same error.
    if fast_tokenizer_file is None:
        raise ValueError("Tokenizer file must exist.")
    if not os.path.isfile(fast_tokenizer_file):
        raise ValueError("Tokenizer file must exist.")

    local_kwargs["tokenizer_object"] = Tokenizer.from_file(fast_tokenizer_file)
    return local_kwargs
71
+
72
  def __init__(self, *args, **kwargs):
73
  super().__init__(*args, **kwargs)
74