Spaces:
Running on Zero
Running on Zero
| # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. | |
| """Processing nmt data for finetuning.""" | |
| import argparse | |
| import json | |
| import multiprocessing | |
| import os | |
| import sys | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), | |
| os.path.pardir))) | |
| import time | |
| import torch | |
| from megatron.training.tokenizer import build_tokenizer | |
| from megatron.core.datasets import indexed_dataset | |
class Encoder(object):
    """Tokenization helper designed to be used with ``multiprocessing.Pool``.

    An instance only carries the parsed arguments. The tokenizer itself is
    built by :meth:`initializer` and stored on the *class*, so that
    :meth:`encode` can reach it without the tokenizer being part of the
    instance state.
    """

    def __init__(self, args):
        """Remember ``args``; they are handed to ``build_tokenizer`` later."""
        self.args = args

    def initializer(self):
        """Build the tokenizer and attach it to the ``Encoder`` class.

        Passed as the ``initializer`` of the worker pool in ``main``.
        """
        # Use Encoder class as a container for global data
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        """Tokenize one piece of text.

        Args:
            text: The string to tokenize (``main`` feeds one input line per
                call).

        Returns:
            A tuple ``(ids, n_chars)`` where ``ids`` is whatever
            ``Encoder.tokenizer.tokenize(text)`` returned and ``n_chars`` is
            ``len(text)``. Note that this is a character count, not an
            encoded byte count, even though ``main`` reports it as bytes.

        Raises:
            AssertionError: If tokenization produced no ids.
        """
        ids = Encoder.tokenizer.tokenize(text)
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is kept unchanged
        # for backward compatibility.
        if len(ids) == 0:
            raise AssertionError(
                "tokenizer produced no ids for input: {!r}".format(text))
        return ids, len(text)
def get_args(argv=None):
    """Parse the command-line options of the NMT preprocessing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads ``sys.argv[1:]``, which is the
            behavior existing callers rely on.

    Returns:
        An ``argparse.Namespace`` holding the parsed options plus a few fixed
        attributes (``keep_empty``, ``rank``,
        ``make_vocab_size_divisible_by``, ``tensor_model_parallel_size``,
        ``vocab_extra_ids``) that are set unconditionally below.
    """
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title='input data')
    group.add_argument('--input', type=str, required=True,
                       help='Path to input JSON')

    group = parser.add_argument_group(title='tokenizer')
    # NOTE(review): the default 'YTTMTokenizer' is not listed in `choices`.
    # argparse does not validate defaults against `choices`, so omitting the
    # flag yields a value that cannot be passed explicitly. Confirm whether
    # build_tokenizer supports it before relying on the default.
    group.add_argument('--tokenizer-type', type=str, default='YTTMTokenizer',
                       choices=['BertWordPieceLowerCase', 'BertWordPieceCase',
                                'GPT2BPETokenizer', 'SentencePieceTokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')

    group = parser.add_argument_group(title='output data')
    group.add_argument('--output-prefix', type=str, required=True,
                       help='Path to binary output file without suffix')

    group = parser.add_argument_group(title='runtime')
    group.add_argument('--workers', type=int, default=1,
                       help='Number of worker processes to launch')
    group.add_argument('--log-interval', type=int, default=100,
                       help='Interval between progress updates')

    args = parser.parse_args(argv)
    args.keep_empty = False

    # some default/dummy values for the tokenizer
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1
    args.vocab_extra_ids = 0

    return args
def main():
    """Tokenize every line of the input file and write an indexed dataset.

    Each line of ``args.input`` is tokenized in a worker pool and appended to
    an ``IndexedDatasetBuilder`` as its own document. The output is written
    to ``<output-prefix>.bin`` and ``<output-prefix>.idx``. Progress is
    reported on stderr every ``args.log_interval`` sentences; an interval of
    0 disables progress reporting.
    """
    args = get_args()
    startup_start = time.time()

    print("Opening", args.input)
    # Both the input file and the worker pool are released on exit from the
    # ``with`` block, including when tokenization raises part-way through.
    with open(args.input, 'r', encoding='utf-8') as fin:
        encoder = Encoder(args)
        # The parent process needs its own tokenizer only for `vocab_size`;
        # workers build theirs in `encoder.initializer`.
        tokenizer = build_tokenizer(args)

        with multiprocessing.Pool(args.workers,
                                  initializer=encoder.initializer) as pool:
            encoded_sentences = pool.imap(encoder.encode, fin, 25)

            print(f"Vocab size: {tokenizer.vocab_size}")
            print(f"Output prefix: {args.output_prefix}")
            output_bin_file = "{}.bin".format(args.output_prefix)
            output_idx_file = "{}.idx".format(args.output_prefix)
            builder = indexed_dataset.IndexedDatasetBuilder(
                output_bin_file,
                dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
            )

            startup_end = time.time()
            proc_start = time.time()
            total_bytes_processed = 0
            print("Time to startup:", startup_end - startup_start)

            # The pool must stay alive until every result has been consumed,
            # so the loop lives inside the `with` block.
            for i, (sentence, bytes_processed) in enumerate(encoded_sentences,
                                                            start=1):
                total_bytes_processed += bytes_processed
                builder.add_item(torch.IntTensor(sentence))
                # documents contain only one sentence.
                builder.end_document()
                # A zero interval would make the modulo raise
                # ZeroDivisionError; treat it as "no progress output".
                if args.log_interval and i % args.log_interval == 0:
                    current = time.time()
                    elapsed = current - proc_start
                    mbs = total_bytes_processed/elapsed/1024/1024
                    print(f"Processed {i} sentences",
                          f"({i/elapsed} sentences/s, {mbs} MB/s).",
                          file=sys.stderr)

    builder.finalize(output_idx_file)
# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()