yuyan-10b / tools /openwebtext /cleanup_dataset.py

Upload 21 files

1101a21 over 2 years ago

4.24 kB

	# coding=utf-8
	# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	import ftfy
	import json
	from langdetect import detect
	import numpy as np
	import time
	import os
	import sys

	from tokenizer import Tokenizer

	MIN_DOCUMENT_LENGHT = 128


	def print_progress(prefix, start_time, num_docs, num_fixed_text,
	num_non_english_docs, chars_non_english_docs,
	num_small_docs, chars_small_docs):

	string = prefix + ' \| '
	string += 'elapsed time: {:.2f} \| '.format(time.time() - start_time)
	string += 'documents: {} \| '.format(num_docs)
	string += 'fixed text: {} \| '.format(num_fixed_text)
	string += 'non-english: {} \| '.format(num_non_english_docs)
	string += 'non-english chars: {} \| '.format(chars_non_english_docs)
	string += 'small docs: {} \| '.format(num_small_docs)
	string += 'small docs chars: {}'.format(chars_small_docs)
	print(string, flush=True)


	def filter_corpus(filename, out_filename, print_interval=10000):

	print(' > filtering {}'.format(filename))

	tokenizer = Tokenizer(cache_dir='./cache')

	num_docs = 0
	num_written_docs = 0
	num_small_docs = 0
	num_fixed_text = 0
	num_non_english_docs = 0
	chars_non_english_docs = 0
	chars_small_docs = 0
	start_time = time.time()
	with open(out_filename, 'wb') as f:
	with open(filename, 'r') as fin:
	for line in fin:
	try:
	num_docs += 1
	myjson = json.loads(line)
	# Fix text
	text = ftfy.fix_text(myjson['text'])
	if text != myjson['text']:
	num_fixed_text += 1
	myjson['text'] = text
	# Detect language.
	if detect(text) != 'en':
	print('[non-english text]', myjson)
	num_non_english_docs += 1
	chars_non_english_docs += len(text)
	continue
	# On average each token is 5 characters so 8 is an
	# upper bound.
	if len(text) < (8 * MIN_DOCUMENT_LENGHT):
	tokens = tokenizer.tokenize_document(text)
	if len(tokens) < MIN_DOCUMENT_LENGHT:
	print('[small document, skipping]:', myjson)
	num_small_docs += 1
	chars_small_docs += len(text)
	continue
	myjson = json.dumps(myjson, ensure_ascii=False)
	f.write(myjson.encode('utf-8'))
	f.write('\n'.encode('utf-8'))
	num_written_docs += 1
	if num_docs % print_interval == 0:
	print_progress('[PROGRESS]', start_time, num_docs,
	num_fixed_text, num_non_english_docs,
	chars_non_english_docs,
	num_small_docs, chars_small_docs)
	except Exception as e:
	print(' skipping ', line, e)

	print_progress('[FINAL]', start_time, num_docs,
	num_fixed_text, num_non_english_docs,
	chars_non_english_docs,
	num_small_docs, chars_small_docs)


	if __name__ == '__main__':

	print('building gpt2 dataset ...')

	input_filename = sys.argv[1]
	output_filename = sys.argv[2]

	print('will be reading {}'.format(input_filename))
	print('and will write the results to {}'.format(output_filename))

	filter_corpus(input_filename, output_filename)