experimental_gqa_1_5b / tokenizer /load_tokenizer.py
Norapom's picture
Add Cl100kChatTokenizer (chat/think/tool reserved tokens)
37700a8 verified
"""Standalone shim: import Cl100kChatTokenizer without needing Megatron.
The tokenizer class only depends on `tiktoken`; the abstract base classes
it inherits from Megatron are replaced with empty stand-ins at import time
here for portability.
"""
import sys, types
# The tokenizer file inherits from two Megatron base classes. Their behaviour
# is not needed at runtime, only their presence in the class hierarchy, so
# each one is replaced by an empty placeholder class.
class MegatronTokenizerTextAbstract:
    pass


class MegatronTokenizerChatTemplate:
    pass


# Synthetic modules named after the Megatron modules the tokenizer file
# imports from; each exposes exactly one placeholder class.
_abstract = types.ModuleType(
    "megatron.core.tokenizers.text.libraries.abstract_tokenizer"
)
setattr(_abstract, "MegatronTokenizerTextAbstract", MegatronTokenizerTextAbstract)

_chat = types.ModuleType("megatron.core.tokenizers.text.libraries.chat_template")
setattr(_chat, "MegatronTokenizerChatTemplate", MegatronTokenizerChatTemplate)

# Register both stubs under their dotted names. setdefault leaves any entry
# already present in sys.modules under the same name untouched.
for _stub in (_abstract, _chat):
    sys.modules.setdefault(_stub.__name__, _stub)
del _stub  # keep the module namespace identical to the hand-written version
from cl100k_chat_tokenizer import Cl100kChatTokenizer, CL100K_CHAT_SPECIAL_TOKENS # noqa