| | """Processing nmt data for finetuning.""" |

import argparse
import multiprocessing
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             os.path.pardir)))
import time

import torch

from megatron.tokenizer import build_tokenizer
from megatron.data import indexed_dataset


class Encoder(object):
    def __init__(self, args):
        self.args = args

    def initializer(self):
        # Build the tokenizer once per worker process and store it on the
        # class, so it does not have to be pickled for every sentence.
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        # Tokenize one line of raw text into token ids.
        ids = Encoder.tokenizer.tokenize(text)
        assert len(ids) > 0
        return ids, len(text)


def get_args():
    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title='input data')
    group.add_argument('--input', type=str, required=True,
                       help='Path to the input text file, one sentence per line')

    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, default='YTTMTokenizer',
                       choices=['BertWordPieceLowerCase', 'BertWordPieceCase',
                                'GPT2BPETokenizer', 'SentencePieceTokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')

    group = parser.add_argument_group(title='output data')
    group.add_argument('--output-prefix', type=str, required=True,
                       help='Path to binary output file without suffix')
    group.add_argument('--dataset-impl', type=str, default='mmap',
                       choices=['lazy', 'cached', 'mmap'])

    group = parser.add_argument_group(title='runtime')
    group.add_argument('--workers', type=int, default=1,
                       help='Number of worker processes to launch')
    group.add_argument('--log-interval', type=int, default=100,
                       help='Interval between progress updates')
    args = parser.parse_args()
    args.keep_empty = False

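    # Placeholder values that Megatron's full argument parser would normally
    # provide; build_tokenizer needs them when this script runs standalone.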
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1
    args.vocab_extra_ids = 0

    return args


def main():
    args = get_args()
    startup_start = time.time()

    print("Opening", args.input)
    fin = open(args.input, 'r', encoding='utf-8')
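
    # Each worker process builds its own tokenizer in encoder.initializer();
    # imap then streams lines from the input file to the workers lazily, in
    # chunks of 25 lines per task.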
    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
    encoded_sentences = pool.imap(encoder.encode, fin, 25)

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_file = "{}.bin".format(args.output_prefix)
    output_idx_file = "{}.idx".format(args.output_prefix)
    builder = indexed_dataset.make_builder(output_bin_file,
                                           impl=args.dataset_impl,
                                           vocab_size=tokenizer.vocab_size)

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    for i, (sentence, bytes_processed) in enumerate(encoded_sentences, start=1):
        total_bytes_processed += bytes_processed
        builder.add_item(torch.IntTensor(sentence))
        # Each input line is treated as a document containing one sentence.
        builder.end_document()
        if i % args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed / elapsed / 1024 / 1024
            print(f"Processed {i} sentences",
                  f"({i/elapsed} sentences/s, {mbs} MB/s).",
                  file=sys.stderr)
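
    # All token data has been streamed to the .bin file; finalize() writes the
    # accompanying index file.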
    builder.finalize(output_idx_file)


if __name__ == '__main__':
    main()