import math
from typing import Dict, List, Tuple

import numpy as np
import torch

from fairseq.data import Dictionary, FairseqDataset, data_utils
from fairseq.data.concat_dataset import ConcatDataset
from fairseq.data.legacy.block_pair_dataset import BlockPairDataset
from fairseq.data.token_block_dataset import TokenBlockDataset


class MaskedLMDataset(FairseqDataset):
    """
    A wrapper Dataset for masked language modelling. The dataset wraps around
    TokenBlockDataset or BlockPairDataset and creates a batch where the input
    blocks are masked according to the specified masking probability.
    Additionally, the batch can also contain sentence-level targets if this
    is specified.

    Args:
        dataset: Dataset which generates blocks of data. Only BlockPairDataset
            and TokenBlockDataset are supported.
        sizes: Sentence lengths
        vocab: Dictionary with the vocabulary and special tokens.
        pad_idx: Id of padding token in dictionary
        mask_idx: Id of mask token in dictionary
        classif_token_idx: Id of classification token in dictionary. This is
            the token associated with the sentence embedding (Eg: CLS for
            BERT)
        sep_token_idx: Id of separator token in dictionary
            (Eg: SEP in BERT)
        seed: Seed for random number generator for reproducibility.
        shuffle: Shuffle the elements before batching.
        has_pairs: Specifies whether the underlying dataset generates a pair
            of blocks along with a sentence_target or not. Setting it to True
            assumes that the underlying dataset generates a label for the
            pair of sentences which is surfaced as sentence_target. The
            default value assumes a single block with no sentence target.
        segment_id: An optional segment id for filling in the segment labels
            when we are in the single block setting (Eg: XLM). Default is 0.
        masking_ratio: specifies what percentage of the blocks should be
            masked.
        masking_prob: specifies the probability of a given token being
            replaced with the "MASK" token.
        random_token_prob: specifies the probability of a given token being
            replaced by a random token from the vocabulary.
    """
    def __init__(
        self,
        dataset: FairseqDataset,
        sizes: np.ndarray,
        vocab: Dictionary,
        pad_idx: int,
        mask_idx: int,
        classif_token_idx: int,
        sep_token_idx: int,
        seed: int = 1,
        shuffle: bool = True,
        has_pairs: bool = True,
        segment_id: int = 0,
        masking_ratio: float = 0.15,
        masking_prob: float = 0.8,
        random_token_prob: float = 0.1,
    ):
        assert isinstance(
            dataset, (TokenBlockDataset, BlockPairDataset, ConcatDataset)
        ), (
            "MaskedLMDataset only wraps TokenBlockDataset, BlockPairDataset "
            "or ConcatDataset"
        )
        self.dataset = dataset
        self.sizes = np.array(sizes)
        self.vocab = vocab
        self.pad_idx = pad_idx
        self.mask_idx = mask_idx
        self.classif_token_idx = classif_token_idx
        self.sep_token_idx = sep_token_idx
        self.shuffle = shuffle
        self.seed = seed
        self.has_pairs = has_pairs
        self.segment_id = segment_id
        self.masking_ratio = masking_ratio
        self.masking_prob = masking_prob
        self.random_token_prob = random_token_prob

        # In the single-block setting, _collate prepends the classification
        # token to every example, so each size grows by 1.
        if not has_pairs:
            self.sizes = self.sizes + 1
    def __getitem__(self, index: int):
        # With pairs, the underlying dataset returns both blocks along with
        # the sentence-level target; otherwise it returns a single block.
        if self.has_pairs:
            (block_one, block_two, sentence_target) = self.dataset[index]
        else:
            block_one = self.dataset[index]

        return {
            "id": index,
            "block_one": block_one,
            "block_two": block_two if self.has_pairs else None,
            "sentence_target": sentence_target if self.has_pairs else None,
        }
    def __len__(self):
        return len(self.dataset)
    def _mask_block(
        self,
        sentence: np.ndarray,
        mask_idx: int,
        pad_idx: int,
        dictionary_token_range: Tuple,
    ):
        """
        Mask tokens for Masked Language Model training.
        Samples masking_ratio tokens that will be predicted by the LM.

        Note: this function may not be efficient enough since we have
        multiple conversions between np and torch; we can replace them with
        torch operators later.

        Args:
            sentence: 1d tensor to be masked
            mask_idx: index to use for masking the sentence
            pad_idx: index to use for masking the target for tokens we aren't
                predicting
            dictionary_token_range: range of indices in dictionary which can
                be used for random word replacement
                (e.g. without special characters)
        Return:
            masked_sent: masked sentence
            target: target with words which we are not predicting replaced
                by pad_idx
        """
        masked_sent = np.copy(sentence)
        sent_length = len(sentence)
        mask_num = math.ceil(sent_length * self.masking_ratio)
        mask = np.random.choice(sent_length, mask_num, replace=False)
        target = np.copy(sentence)

        for i in range(sent_length):
            if i in mask:
                rand = np.random.random()

                # replace with the mask token if the probability is less than
                # masking_prob (e.g. 0.8)
                if rand < self.masking_prob:
                    masked_sent[i] = mask_idx

                # replace with a random token if the probability is less than
                # masking_prob + random_token_prob (e.g. 0.9); otherwise keep
                # the original token (it is still predicted)
                elif rand < (self.masking_prob + self.random_token_prob):
                    # sample random token from dictionary
                    masked_sent[i] = np.random.randint(
                        dictionary_token_range[0], dictionary_token_range[1]
                    )
            else:
                # positions that were not sampled are not predicted, so their
                # target is replaced by pad_idx
                target[i] = pad_idx

        return masked_sent, target
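
    # Worked example (illustrative, with made-up indices): for a 10-token
    # sentence [11, 12, 13, 14, 15, 16, 17, 18, 19, 20] and
    # masking_ratio=0.15, ceil(10 * 0.15) = 2 positions are sampled, say 3
    # and 7. One possible outcome:
    #   masked_sent = [11, 12, 13, <mask>, 15, 16, 17, 42, 19, 20]
    #   target      = [pad, pad, pad, 14, pad, pad, pad, 18, pad, pad]
    # Position 3 became the mask token, position 7 a random vocabulary token
    # (42); only those two positions keep their true ids in target, and every
    # other target entry is pad_idx so it is ignored by the loss.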
    def _collate(self, samples: List[Dict], pad_idx: int, eos_idx: int):
        """
        Does the heavy lifting for creating a batch from the input list of
        examples. The logic is as follows:
        1. Mask the input blocks. If has_pairs is True then we have two
           blocks to mask.
        2. Prepend the first masked block tensor with the special token
           used as sentence embedding. Eg: CLS in BERT. This happens
           irrespective of the value of has_pairs.
        3. If has_pairs is True, then append the first masked block with the
           special separator token (Eg: SEP for BERT) and compute segment
           label accordingly. In this case, also append the second masked
           block with this special separator token and compute its segment
           label.
        4. For the targets tensor, prepend and append with padding index
           accordingly.
        5. Concatenate all tensors.
        """
        if len(samples) == 0:
            return {}

        # To ensure determinism, the masking for a batch is seeded from the
        # dataset seed plus the id of the first sample in the batch.
        with data_utils.numpy_seed(self.seed + samples[0]["id"]):
            for s in samples:
                # token range is needed for replacing with random tokens
                # during masking
                token_range = (self.vocab.nspecial, len(self.vocab))

                # mask the first block according to the specified
                # probabilities
                masked_blk_one, masked_tgt_one = self._mask_block(
                    s["block_one"],
                    self.mask_idx,
                    self.pad_idx,
                    token_range,
                )

                tokens = np.concatenate([[self.classif_token_idx], masked_blk_one])
                targets = np.concatenate([[self.pad_idx], masked_tgt_one])
                segments = np.ones(len(tokens)) * self.segment_id

                # if has_pairs is True then we need to add the SEP token to
                # both blocks after masking and re-compute the segment labels
                # based on the new lengths
                if self.has_pairs:
                    tokens_one = np.concatenate([tokens, [self.sep_token_idx]])
                    targets_one = np.concatenate([targets, [self.pad_idx]])

                    masked_blk_two, masked_tgt_two = self._mask_block(
                        s["block_two"], self.mask_idx, self.pad_idx, token_range
                    )
                    tokens_two = np.concatenate([masked_blk_two, [self.sep_token_idx]])
                    targets_two = np.concatenate([masked_tgt_two, [self.pad_idx]])

                    # CLS + block one + SEP
                    segments_one = np.zeros(len(tokens_one))
                    # block two + SEP
                    segments_two = np.ones(len(tokens_two))

                    tokens = np.concatenate([tokens_one, tokens_two])
                    targets = np.concatenate([targets_one, targets_two])
                    segments = np.concatenate([segments_one, segments_two])

                s["source"] = torch.LongTensor(tokens)
                s["segment_labels"] = torch.LongTensor(segments)
                s["lm_target"] = torch.LongTensor(targets)

        def merge(key):
            return data_utils.collate_tokens(
                [s[key] for s in samples], pad_idx, eos_idx, left_pad=False
            )

        return {
            "id": torch.LongTensor([s["id"] for s in samples]),
            "ntokens": sum(len(s["source"]) for s in samples),
            "net_input": {
                "src_tokens": merge("source"),
                "segment_labels": merge("segment_labels"),
            },
            "lm_target": merge("lm_target"),
            "sentence_target": torch.LongTensor(
                [s["sentence_target"] for s in samples]
            )
            if self.has_pairs
            else None,
            "nsentences": len(samples),
        }

    def collater(self, samples: List[Dict]):
        """Merge a list of samples to form a mini-batch.

        Args:
            samples (List[dict]): samples to collate

        Returns:
            dict: a mini-batch of data
        """
        return self._collate(samples, self.vocab.pad(), self.vocab.eos())

    def num_tokens(self, index: int):
        """
        Return the number of tokens in a sample. This value is used to
        enforce max-tokens during batching.
        """
        return self.sizes[index]

    def size(self, index: int):
        """
        Return an example's size as a float or tuple. This value is used when
        filtering a dataset with max-positions.
        """
        return self.sizes[index]

    def ordered_indices(self):
        """
        Return an ordered list of indices. Batches will be constructed based
        on this order.
        """
        if self.shuffle:
            return np.random.permutation(len(self))
        else:
            order = [np.arange(len(self))]
            order.append(self.sizes)
            return np.lexsort(order)
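
    # np.lexsort sorts by its last key first, so the unshuffled order sorts
    # examples by size, with the original index breaking ties. For example,
    # sizes [5, 3, 5, 2] yield the order [3, 1, 0, 2].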

    @property
    def supports_prefetch(self):
        return getattr(self.dataset, "supports_prefetch", False)

    def prefetch(self, indices):
        self.dataset.prefetch(indices)
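

if __name__ == "__main__":
    # Minimal smoke test for the masking logic alone; a sketch, not part of
    # the library API. It bypasses __init__ (so no real TokenBlockDataset or
    # BlockPairDataset is required) and uses made-up indices: mask_idx=4,
    # pad_idx=1 and a vocabulary range of (5, 100).
    demo = MaskedLMDataset.__new__(MaskedLMDataset)
    demo.masking_ratio = 0.15
    demo.masking_prob = 0.8
    demo.random_token_prob = 0.1

    sentence = np.arange(5, 25)  # 20 fake token ids
    masked_sent, target = demo._mask_block(
        sentence, mask_idx=4, pad_idx=1, dictionary_token_range=(5, 100)
    )
    print("masked:", masked_sent)
    print("target:", target)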