|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Utils for tokenization."""
|
|
|
|
|
|
import warnings
|
|
|
|
|
|
__all__ = ["hf_tokenizer", "hf_processor"]
|
|
|
|
|
|
|
|
|
def set_pad_token_id(tokenizer):
    """Fall back to the EOS token wherever the tokenizer lacks a padding token.

    ``pad_token_id`` is checked first and, when it is ``None``, replaced by
    ``eos_token_id``. ``pad_token`` is then checked and, when it is ``None``,
    replaced by ``eos_token``. A warning is emitted for each replacement.
    The tokenizer is modified in place and nothing is returned.

    Args:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to be set.
    """
    # Numeric id first, then the token string; each is patched independently.
    if tokenizer.pad_token_id is None:
        fallback_id = tokenizer.eos_token_id
        tokenizer.pad_token_id = fallback_id
        warnings.warn(f"tokenizer.pad_token_id is None. Now set to {fallback_id}", stacklevel=1)

    if tokenizer.pad_token is None:
        fallback_token = tokenizer.eos_token
        tokenizer.pad_token = fallback_token
        warnings.warn(f"tokenizer.pad_token is None. Now set to {fallback_token}", stacklevel=1)
|
|
|
|
|
|
|
|
|
def hf_tokenizer(name_or_path, correct_pad_token=True, correct_gemma2=True, **kwargs):
    """Create a huggingface pretrained tokenizer which correctly handles eos and pad tokens.

    Args:
        name_or_path (str): The name or path handed to
            ``AutoTokenizer.from_pretrained``. The gemma2 correction is only
            considered when this is a ``str``.
        correct_pad_token (bool): Whether to fill in a missing pad token / pad
            token id via ``set_pad_token_id``.
        correct_gemma2 (bool): Whether to override ``eos_token`` and
            ``eos_token_id`` when ``name_or_path`` contains ``"gemma-2-2b-it"``.
        **kwargs: Extra keyword arguments forwarded to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        transformers.PreTrainedTokenizer: The pretrained tokenizer.
    """
    from transformers import AutoTokenizer

    targets_gemma2_it = isinstance(name_or_path, str) and "gemma-2-2b-it" in name_or_path
    if correct_gemma2 and targets_gemma2_it:
        warnings.warn(
            "Found gemma-2-2b-it tokenizer. Set eos_token and eos_token_id to <end_of_turn> and 107.",
            stacklevel=1,
        )
        # These overrides replace any eos settings the caller passed in kwargs.
        kwargs.update(eos_token="<end_of_turn>", eos_token_id=107)

    loaded = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
    if not correct_pad_token:
        return loaded

    set_pad_token_id(loaded)
    return loaded
|
|
|
|
|
|
|
|
|
def hf_processor(name_or_path, **kwargs):
    """Create a huggingface processor to process multimodal data.

    Loading is best-effort: if ``AutoProcessor.from_pretrained`` raises, a
    warning describing the failure is emitted and ``None`` is returned instead
    of propagating the error.

    Args:
        name_or_path (str): The name or path handed to
            ``AutoProcessor.from_pretrained``.
        **kwargs: Extra keyword arguments forwarded to
            ``AutoProcessor.from_pretrained``.

    Returns:
        transformers.ProcessorMixin: The pretrained processor, or ``None`` when
        loading fails or when the loaded object's class name does not contain
        ``"Processor"``.
    """
    from transformers import AutoProcessor

    try:
        processor = AutoProcessor.from_pretrained(name_or_path, **kwargs)
    except Exception as exc:
        # Deliberately broad: a missing processor is not fatal for callers,
        # but surface the cause so the failure is not silent.
        warnings.warn(
            f"Failed to create processor: {exc}. This may affect multimodal processing",
            stacklevel=1,
        )
        return None

    # Only keep objects whose class name marks them as a processor.
    # NOTE(review): presumably guards against AutoProcessor falling back to a
    # plain tokenizer for text-only models -- confirm against transformers.
    if processor is None or "Processor" not in processor.__class__.__name__:
        return None
    return processor
|
|
|
|