applied-ai-018
/

peacock-data-public-evaluation

Model card Files Files and versions

peacock-data-public-evaluation / Megatron-DeepSpeed /tasks /data_utils.py

applied-ai-018's picture

Add files using upload-large-folder tool

e61fdc8 verified about 1 year ago

history blame contribute delete

2.96 kB

	# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

	""" Tasks data utility."""

	import re
	import numpy as np


	def clean_text(text):
	    """Flatten whitespace in ``text`` and attach stray sentence dots.

	    Newlines become spaces, every run of whitespace collapses to a single
	    space, and the pattern ``' . '`` is rewritten to ``'. '`` in three
	    successive passes. Several passes are used because one pass can leave
	    a fresh match behind (e.g. ``'a . . b'`` -> ``'a. . b'`` -> ``'a.. b'``).
	    """
	    flattened = re.sub(r'\s+', ' ', text.replace("\n", " "))

	    remaining_passes = 3
	    while remaining_passes:
	        flattened = flattened.replace(' . ', '. ')
	        remaining_passes -= 1

	    return flattened


	def build_sample(ids, types, paddings, label, unique_id):
	    """Pack one example into the dict consumed by the batch producer.

	    ``ids``, ``types`` and ``paddings`` are each copied into a new int64
	    numpy array and stored under ``'text'``, ``'types'`` and
	    ``'padding_mask'``. ``label`` and ``unique_id`` are cast with ``int``
	    and stored under ``'label'`` and ``'uid'``.
	    """
	    def to_int64(values):
	        return np.array(values, dtype=np.int64)

	    sample = {}
	    sample['text'] = to_int64(ids)
	    sample['types'] = to_int64(types)
	    sample['padding_mask'] = to_int64(paddings)
	    sample['label'] = int(label)
	    sample['uid'] = int(unique_id)
	    return sample


	def build_tokens_types_paddings_from_text(text_a, text_b,
	                                          tokenizer, max_seq_length):
	    """Tokenize the input text(s) and build ids, types and paddings.

	    ``text_a`` is always passed through ``tokenizer.tokenize``; ``text_b``
	    only when it is not ``None``. The resulting id sequences, together
	    with ``tokenizer.cls``, ``tokenizer.sep`` and ``tokenizer.pad``, are
	    handed to ``build_tokens_types_paddings_from_ids``, whose result is
	    returned unchanged.
	    """
	    ids_a = tokenizer.tokenize(text_a)
	    ids_b = tokenizer.tokenize(text_b) if text_b is not None else None

	    return build_tokens_types_paddings_from_ids(
	        ids_a, ids_b, max_seq_length,
	        tokenizer.cls, tokenizer.sep, tokenizer.pad)


	def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
	                                         cls_id, sep_id, pad_id):
	    """Build token ids, token types and padding mask for one or two segments.

	    Layout before trimming/padding: ``[CLS] A [SEP]`` with type 0, followed
	    by ``B`` with type 1 when ``text_b_ids`` is not ``None``.

	    Args:
	        text_a_ids: token ids of the first segment.
	        text_b_ids: token ids of the second segment, or ``None``.
	        max_seq_length: length of each returned list.
	        cls_id: id placed at the start of the sequence.
	        sep_id: id used as the separator / terminator.
	        pad_id: id used to fill ``ids`` up to ``max_seq_length``.

	    Returns:
	        ``(ids, types, paddings)`` as three Python lists. ``paddings`` is 1
	        for real tokens and 0 for padded positions. Padded positions in
	        ``types`` are 0: they are type ids, not vocabulary ids, so they
	        are not filled with ``pad_id``.
	    """
	    has_b = text_b_ids is not None

	    # [CLS] A [SEP], all of type 0.
	    ids = [cls_id, *text_a_ids, sep_id]
	    types = [0] * len(ids)

	    # B, of type 1.
	    if has_b:
	        ids.extend(text_b_ids)
	        types.extend([1] * len(text_b_ids))

	    # Cap the size, leaving one slot for the closing [SEP].
	    trimmed = len(ids) >= max_seq_length
	    if trimmed:
	        keep = max_seq_length - 1
	        ids = ids[:keep]
	        types = types[:keep]

	    # Closing [SEP]: needed after B, or to restore a terminator that the
	    # trimming above may have cut off.
	    if has_b or trimmed:
	        ids.append(sep_id)
	        types.append(1 if has_b else 0)

	    # Every position produced so far is a real token.
	    paddings = [1] * len(ids)

	    # Padding.
	    padding_length = max_seq_length - len(ids)
	    if padding_length > 0:
	        ids.extend([pad_id] * padding_length)
	        # Use type 0 rather than pad_id so the types list only ever
	        # contains valid type ids, whatever the vocabulary's pad id is.
	        types.extend([0] * padding_length)
	        paddings.extend([0] * padding_length)

	    return ids, types, paddings