Spaces:

rynmurdock
/

Babel

Runtime error

App Files Files Community

Babel / Optimus /code /pytorch_transformers /tokenization_xlnet.py

rynmurdock

init

c5ca37a almost 2 years ago

raw

history blame contribute delete

8.27 kB

	# coding=utf-8
	# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" Tokenization classes for XLNet model."""
	from __future__ import (absolute_import, division, print_function,
	unicode_literals)

	import logging
	import os
	from shutil import copyfile

	import unicodedata
	import six

	from .tokenization_utils import PreTrainedTokenizer

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# File name under which the SentencePiece model is stored
	# (save_vocabulary writes to this name inside the target directory).
	VOCAB_FILES_NAMES = {
	    'vocab_file': 'spiece.model',
	}

	# Download location of the SentencePiece model for each pretrained shortcut name.
	PRETRAINED_VOCAB_FILES_MAP = {
	    'vocab_file': {
	        'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model",
	        'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
	    },
	}

	# Per-checkpoint input size; None is recorded for both XLNet checkpoints.
	PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
	    'xlnet-base-cased': None,
	    'xlnet-large-cased': None,
	}

	# Marker character found in SentencePiece pieces; convert_tokens_to_string
	# turns it back into a plain space.
	SPIECE_UNDERLINE = u'▁'

	# Segments (not really needed): A=0, B=1, CLS=2, SEP=3, PAD=4
	SEG_ID_A, SEG_ID_B, SEG_ID_CLS, SEG_ID_SEP, SEG_ID_PAD = range(5)

	class XLNetTokenizer(PreTrainedTokenizer):
	    """
	    SentencePiece based tokenizer. Peculiarities:

	    - requires `SentencePiece <https://github.com/google/sentencepiece>`_
	    """
	    vocab_files_names = VOCAB_FILES_NAMES
	    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
	    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

	    def __init__(self, vocab_file,
	                 do_lower_case=False, remove_space=True, keep_accents=False,
	                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
	                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
	                 additional_special_tokens=None, **kwargs):
	        """ Build the tokenizer and load the SentencePiece model from `vocab_file`.

	        Args:
	            vocab_file: path of the SentencePiece model file to load.
	            do_lower_case: if True, `preprocess_text` lower-cases the text.
	            remove_space: if True, `preprocess_text` strips the text and collapses
	                runs of whitespace into single spaces.
	            keep_accents: if False, `preprocess_text` NFKD-normalizes the text and
	                drops combining characters.
	            bos_token, eos_token, unk_token, sep_token, pad_token, cls_token,
	            mask_token: special tokens, forwarded to the parent constructor.
	            additional_special_tokens: extra special tokens forwarded to the parent
	                constructor; None (the default) selects ["<eop>", "<eod>"].
	            **kwargs: forwarded unchanged to the parent constructor.

	        Raises:
	            ImportError: if the `sentencepiece` package is not installed.
	        """
	        # A fresh list per instance: a list literal in the signature would be
	        # shared (and possibly mutated) across every tokenizer created.
	        if additional_special_tokens is None:
	            additional_special_tokens = ["<eop>", "<eod>"]

	        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
	                                             unk_token=unk_token, sep_token=sep_token,
	                                             pad_token=pad_token, cls_token=cls_token,
	                                             mask_token=mask_token,
	                                             additional_special_tokens=additional_special_tokens,
	                                             **kwargs)

	        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
	        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

	        self.do_lower_case = do_lower_case
	        self.remove_space = remove_space
	        self.keep_accents = keep_accents
	        self.vocab_file = vocab_file

	        self.sp_model = self._load_sentencepiece_model(vocab_file)

	    @staticmethod
	    def _load_sentencepiece_model(vocab_file):
	        """ Import SentencePiece and return a processor loaded from `vocab_file`.

	        Raises:
	            ImportError: if the `sentencepiece` package is not installed. The
	                warning is logged first, then the original error is re-raised so
	                the caller sees the real cause rather than an unbound `spm` name.
	        """
	        try:
	            import sentencepiece as spm
	        except ImportError:
	            logger.warning("You need to install SentencePiece to use XLNetTokenizer: "
	                           "https://github.com/google/sentencepiece "
	                           "(pip install sentencepiece)")
	            raise
	        sp_model = spm.SentencePieceProcessor()
	        sp_model.Load(vocab_file)
	        return sp_model

	    @property
	    def vocab_size(self):
	        """ Number of entries in the loaded SentencePiece model (`len(self.sp_model)`). """
	        return len(self.sp_model)

	    def __getstate__(self):
	        """ Return a copy of the instance state with `sp_model` replaced by None.

	        `__setstate__` rebuilds the processor from `self.vocab_file`.
	        """
	        state = self.__dict__.copy()
	        state["sp_model"] = None
	        return state

	    def __setstate__(self, d):
	        """ Restore the instance state and reload the SentencePiece model.

	        Raises:
	            ImportError: if the `sentencepiece` package is not installed.
	        """
	        self.__dict__ = d
	        self.sp_model = self._load_sentencepiece_model(self.vocab_file)

	    def preprocess_text(self, inputs):
	        """ Normalize raw text before it is handed to SentencePiece.

	        Steps, in order: optional whitespace collapsing (`remove_space`),
	        replacement of the quote pairs `` and '' by a double quote, utf-8
	        decoding of byte strings on Python 2, optional accent stripping
	        (`keep_accents` False) and optional lower-casing (`do_lower_case`).
	        """
	        if self.remove_space:
	            outputs = ' '.join(inputs.strip().split())
	        else:
	            outputs = inputs
	        outputs = outputs.replace("``", '"').replace("''", '"')

	        if six.PY2 and isinstance(outputs, str):
	            outputs = outputs.decode('utf-8')

	        if not self.keep_accents:
	            # NFKD splits accented characters into base + combining mark,
	            # so dropping the combining marks removes the accents.
	            outputs = unicodedata.normalize('NFKD', outputs)
	            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
	        if self.do_lower_case:
	            outputs = outputs.lower()

	        return outputs

	    def _tokenize(self, text, return_unicode=True, sample=False):
	        """ Tokenize a string into a list of SentencePiece pieces.

	        Args:
	            text: the string to tokenize; it is run through `preprocess_text` first.
	            return_unicode: used only for py2, where pieces are decoded back to unicode.
	            sample: if True, use `SampleEncodeAsPieces(text, 64, 0.1)` instead of
	                the deterministic `EncodeAsPieces`.

	        Returns:
	            The list of pieces. A piece that ends in a comma directly preceded by
	            a digit is re-encoded so that the comma becomes a piece of its own.
	        """
	        text = self.preprocess_text(text)
	        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
	        if six.PY2 and isinstance(text, unicode):  # noqa: F821 -- py2-only name
	            text = text.encode('utf-8')

	        if not sample:
	            pieces = self.sp_model.EncodeAsPieces(text)
	        else:
	            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
	        new_pieces = []
	        for piece in pieces:
	            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
	                # Re-encode the part before the trailing comma, without any
	                # SentencePiece marker characters.
	                cur_pieces = self.sp_model.EncodeAsPieces(
	                    piece[:-1].replace(SPIECE_UNDERLINE, ''))
	                # The re-encoding may introduce a leading marker that the
	                # original piece did not have; remove it again.
	                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
	                    if len(cur_pieces[0]) == 1:
	                        cur_pieces = cur_pieces[1:]
	                    else:
	                        cur_pieces[0] = cur_pieces[0][1:]
	                cur_pieces.append(piece[-1])
	                new_pieces.extend(cur_pieces)
	            else:
	                new_pieces.append(piece)

	        # note(zhiliny): convert back to unicode for py2
	        if six.PY2 and return_unicode:
	            ret_pieces = []
	            for piece in new_pieces:
	                if isinstance(piece, str):
	                    piece = piece.decode('utf-8')
	                ret_pieces.append(piece)
	            new_pieces = ret_pieces

	        return new_pieces

	    def _convert_token_to_id(self, token):
	        """ Converts a token (str/unicode) in an id using the vocab. """
	        return self.sp_model.PieceToId(token)

	    def _convert_id_to_token(self, index, return_unicode=True):
	        """Converts an index (integer) in a token (string/unicode) using the vocab.

	        On Python 2, a byte-string token is decoded from utf-8 when
	        `return_unicode` is True.
	        """
	        token = self.sp_model.IdToPiece(index)
	        if six.PY2 and return_unicode and isinstance(token, str):
	            token = token.decode('utf-8')
	        return token

	    def convert_tokens_to_string(self, tokens):
	        """Converts a sequence of tokens (strings for sub-words) in a single string.

	        The tokens are concatenated, every SPIECE_UNDERLINE marker becomes a
	        space, and surrounding whitespace is stripped.
	        """
	        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
	        return out_string

	    def add_special_tokens_single_sentence(self, token_ids):
	        """
	        Adds special tokens to a sequence for sequence classification tasks.
	        An XLNet sequence has the following format: X [SEP][CLS]
	        """
	        sep = [self.sep_token_id]
	        cls = [self.cls_token_id]
	        return token_ids + sep + cls

	    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
	        """
	        Adds special tokens to a sequence pair for sequence classification tasks.
	        An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
	        """
	        sep = [self.sep_token_id]
	        cls = [self.cls_token_id]
	        return token_ids_0 + sep + token_ids_1 + sep + cls

	    def save_vocabulary(self, save_directory):
	        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
	            to a directory.

	        Returns:
	            A 1-tuple holding the path of the vocabulary file inside
	            `save_directory`, or None (after logging an error) when
	            `save_directory` is not an existing directory.
	        """
	        if not os.path.isdir(save_directory):
	            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
	            return
	        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])

	        # Skip the copy when source and destination are the same file.
	        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
	            copyfile(self.vocab_file, out_vocab_file)

	        return (out_vocab_file,)