import os.path
from dataclasses import MISSING, dataclass
from typing import Dict, List, Optional

import nemo
from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer
from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer
from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer
from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer
from nemo.collections.common.tokenizers.youtokentome_tokenizer import YouTokenToMeTokenizer
from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list
from nemo.collections.nlp.modules.common.lm_utils import get_pretrained_lm_models_list
from nemo.collections.nlp.parts.nlp_overrides import HAVE_APEX
from nemo.utils import logging

# get_megatron_tokenizer sits behind the Apex-dependent Megatron utilities; if Apex is not
# installed the import fails and Megatron tokenizer support is disabled.
try:
    from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_tokenizer

    HAVE_APEX = True

except (ImportError, ModuleNotFoundError):
    HAVE_APEX = False


__all__ = ['get_tokenizer', 'get_tokenizer_list']


# Maps Megatron tokenizer class names to the corresponding pretrained Megatron model names.
megatron_tokenizer_model_map = {
    'BertWordPieceLowerCase': 'megatron-bert-345m-uncased',
    'BertWordPieceCase': 'megatron-bert-345m-cased',
    'GPT2BPETokenizer': 'megatron-gpt-345m',
}


def get_tokenizer_list() -> List[str]:
    """
    Returns all supported tokenizer names.
    """
    s = set(get_pretrained_lm_models_list())
    s.update(set(get_huggingface_pretrained_lm_models_list(include_external=True)))
    return ["sentencepiece", "char", "word"] + list(s)


@dataclass
class TokenizerConfig:
    """Structured config describing which tokenizer library to use and how to build or load it."""

    library: str = MISSING
    tokenizer_model: Optional[str] = None
    vocab_size: Optional[int] = None
    vocab_file: Optional[str] = None
    special_tokens: Optional[Dict[str, str]] = None
    bpe_dropout: Optional[float] = 0.0
    coverage: Optional[float] = 0.999
    training_sample_size: Optional[int] = None
    r2l: Optional[bool] = False
    sentencepiece_legacy: Optional[bool] = False
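
# Example (illustrative sketch): a TokenizerConfig instantiated directly in code; in practice the
# same fields are usually populated from a YAML/Hydra config. The model path is a hypothetical
# placeholder.
#
#   tokenizer_cfg = TokenizerConfig(
#       library='sentencepiece',
#       tokenizer_model='/path/to/tokenizer.model',  # placeholder path
#       vocab_size=32000,
#       sentencepiece_legacy=True,
#   )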


def get_tokenizer(
    tokenizer_name: str,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
):
    """
    Args:
        tokenizer_name: sentencepiece or a pretrained model from the HuggingFace list,
            for example: bert-base-cased.
            To see the list of all HuggingFace pretrained models, use:
            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
        merges_file: path to merges file (used by BPE-based HuggingFace tokenizers)
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use the fast HuggingFace tokenizer
        bpe_dropout: (only supported by the YTTM tokenizer) BPE dropout tries to corrupt the standard
            segmentation procedure of BPE to help the model better learn word compositionality and
            become robust to segmentation errors. It has empirically been shown to improve
            inference-time BLEU scores.
    """
|
|
if special_tokens is None: |
|
|
special_tokens_dict = {} |
|
|
else: |
|
|
special_tokens_dict = special_tokens |
|
|
|
|
|
if 'megatron' in tokenizer_name: |
|
|
if not HAVE_APEX: |
|
|
raise ImportError( |
|
|
"Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." |
|
|
) |
|
|
if vocab_file is None: |
|
|
vocab_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_vocab_file( |
|
|
tokenizer_name |
|
|
) |
|
|
merges_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_merges_file( |
|
|
tokenizer_name |
|
|
) |
|
|
tokenizer_name = get_megatron_tokenizer(tokenizer_name) |
|
|
|
|
|
if tokenizer_name == 'sentencepiece': |
|
|
return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( |
|
|
model_path=tokenizer_model, special_tokens=special_tokens, legacy=True |
|
|
) |
|
|
elif tokenizer_name == 'yttm': |
|
|
return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout) |
|
|
elif tokenizer_name == 'word': |
|
|
return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict) |
|
|
elif tokenizer_name == 'char': |
|
|
return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict) |
|
|
elif tokenizer_name == 'regex': |
|
|
return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file) |
|
|
|
|
|
logging.info( |
|
|
f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, merges_files: {merges_file}, " |
|
|
f"special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}" |
|
|
) |
|
|
return AutoTokenizer( |
|
|
pretrained_model_name=tokenizer_name, |
|
|
vocab_file=vocab_file, |
|
|
merges_file=merges_file, |
|
|
**special_tokens_dict, |
|
|
use_fast=use_fast, |
|
|
) |
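
# Example (illustrative sketch): fetching a tokenizer by name. The SentencePiece model path is a
# hypothetical placeholder; a HuggingFace model name such as 'bert-base-cased' would instead go
# through the AutoTokenizer branch above.
#
#   spm_tokenizer = get_tokenizer(
#       tokenizer_name='sentencepiece',
#       tokenizer_model='/path/to/tokenizer.model',  # placeholder path
#   )
#   token_ids = spm_tokenizer.text_to_ids('hello world')
#
#   hf_tokenizer = get_tokenizer(tokenizer_name='bert-base-cased')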


def get_nmt_tokenizer(
    library: str = 'yttm',
    model_name: Optional[str] = None,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
    r2l: Optional[bool] = False,
    legacy: Optional[bool] = False,
    delimiter: Optional[str] = None,
):
    """
    Args:
        library: tokenizer library to use: 'yttm', 'huggingface', 'sentencepiece', 'byte-level',
            'regex', 'megatron', or 'tabular'
        model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
        merges_file: path to merges file (used by BPE-based HuggingFace tokenizers)
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use the fast HuggingFace tokenizer
        bpe_dropout: (only supported by the YTTM tokenizer) BPE dropout tries to corrupt the standard
            segmentation procedure of BPE to help the model better learn word compositionality and become
            robust to segmentation errors. It has empirically been shown to improve inference-time BLEU scores.
        r2l: whether to return subword IDs from right to left
        legacy: (only for sentencepiece) passed through to SentencePieceTokenizer
        delimiter: (only for the tabular tokenizer) column delimiter
    """
|
|
if special_tokens is None: |
|
|
special_tokens_dict = {} |
|
|
else: |
|
|
special_tokens_dict = special_tokens |
|
|
|
|
|
if (library != 'byte-level') and ( |
|
|
model_name is None and (tokenizer_model is None or not os.path.isfile(tokenizer_model)) |
|
|
): |
|
|
raise ValueError("No Tokenizer path provided or file does not exist!") |
|
|
|
|
|
if library == 'yttm': |
|
|
logging.info(f'Getting YouTokenToMeTokenizer with model: {tokenizer_model} with r2l: {r2l}.') |
|
|
return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout, r2l=r2l) |
|
|
elif library == 'huggingface': |
|
|
logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}') |
|
|
return AutoTokenizer( |
|
|
pretrained_model_name=model_name, |
|
|
vocab_file=vocab_file, |
|
|
merges_file=merges_file, |
|
|
**special_tokens_dict, |
|
|
use_fast=use_fast, |
|
|
) |
|
|
elif library == 'sentencepiece': |
|
|
logging.info(f'Getting SentencePiece with model: {tokenizer_model}') |
|
|
return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( |
|
|
model_path=tokenizer_model, legacy=legacy |
|
|
) |
|
|
elif library == 'byte-level': |
|
|
logging.info(f'Using byte-level tokenization') |
|
|
return ByteLevelTokenizer(special_tokens_dict) |
|
|
elif library == 'regex': |
|
|
logging.info(f'Using regex tokenization') |
|
|
return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file) |
|
|
elif library == 'megatron': |
|
|
if model_name in megatron_tokenizer_model_map: |
|
|
model_name = megatron_tokenizer_model_map[model_name] |
|
|
logging.info( |
|
|
f'Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, and merges file: {merges_file}' |
|
|
) |
|
|
return get_tokenizer(tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file) |
|
|
elif library == 'tabular': |
|
|
return TabularTokenizer(vocab_file, delimiter=delimiter) |
|
|
else: |
|
|
raise NotImplementedError( |
|
|
'Currently we only support "yttm", "huggingface", "sentencepiece", "megatron", and "byte-level" tokenizer' |
|
|
'libraries.' |
|
|
) |
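
# Example (illustrative sketch): selecting an NMT tokenizer by library. Paths are hypothetical
# placeholders; per the check above, 'byte-level' is the only library that needs neither a model
# file nor a model name.
#
#   yttm_tokenizer = get_nmt_tokenizer(
#       library='yttm',
#       tokenizer_model='/path/to/yttm_bpe.model',  # placeholder path
#       bpe_dropout=0.1,
#       r2l=False,
#   )
#   byte_tokenizer = get_nmt_tokenizer(library='byte-level')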