import sys

from transformers import AutoTokenizer
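
# Example invocation (a sketch; the script, data file and model names below
# are illustrative, any CoNLL-style file and tokenizer checkpoint work):
#   python3 preprocess.py train.txt.tmp bert-base-multilingual-cased 128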
dataset = sys.argv[1]
model_name_or_path = sys.argv[2]
max_len = int(sys.argv[3])

subword_len_counter = 0
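
# Load the tokenizer and reserve room for the special tokens
# (e.g. [CLS] and [SEP]) that it adds to every sequence.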
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
max_len -= tokenizer.num_special_tokens_to_add()

with open(dataset, "rt") as f_p:
    for line in f_p:
        line = line.rstrip()
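
        # An empty line separates two sentences: echo it and reset the
        # running subword count.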
        if not line:
            print(line)
            subword_len_counter = 0
            continue

        token = line.split()[0]

        current_subwords_len = len(tokenizer.tokenize(token))

        # Token contains strange control characters like \x96 or \x95;
        # filter out the complete line.
        if current_subwords_len == 0:
            continue
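
        # Adding this token would exceed the subword budget: emit a blank
        # line to start a new example and carry the token over to it.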
        if (subword_len_counter + current_subwords_len) > max_len:
            print("")
            print(line)
            subword_len_counter = current_subwords_len
            continue

        subword_len_counter += current_subwords_len

        print(line)