import argparse
import os
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
def main():
    """Train a byte-pair-encoding (BPE) tokenizer on a raw text corpus.

    Command-line arguments:
        --input          path to the raw-text training corpus (required)
        --output         directory where tokenizer.json is written (required)
        --vocab_size     target vocabulary size (default 8000)
        --min_frequency  minimum frequency for tokens to be included (default 2)

    Writes ``tokenizer.json`` into the output directory.
    """
    parser = argparse.ArgumentParser(description="Train a BPE tokenizer on a text corpus.")
    parser.add_argument("--input", type=str, required=True, help="Path to input text file (raw corpus).")
    parser.add_argument("--output", type=str, required=True, help="Directory to save the trained tokenizer files.")
    parser.add_argument("--vocab_size", type=int, default=8000, help="Vocabulary size for the tokenizer.")
    parser.add_argument("--min_frequency", type=int, default=2, help="Minimum frequency for tokens to be included.")
    args = parser.parse_args()

    # Fail fast with a clear CLI error (exit code 2) instead of an opaque
    # traceback from the tokenizers library when the corpus path is wrong.
    if not os.path.isfile(args.input):
        parser.error(f"input file not found: {args.input}")

    # Ensure the output directory exists before saving.
    os.makedirs(args.output, exist_ok=True)

    # BPE model with an explicit unknown token so out-of-vocabulary input
    # maps to [UNK] rather than failing at encode time.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    # Split on whitespace/punctuation before BPE merges are learned.
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        min_frequency=args.min_frequency,
        # [UNK] must be listed here so it is assigned a vocabulary id.
        special_tokens=["[PAD]", "[UNK]"],
    )
    # Train directly from the corpus file.
    tokenizer.train([args.input], trainer)

    # Persist the full tokenizer (model + pre-tokenizer config) as one JSON file.
    tokenizer_path = os.path.join(args.output, "tokenizer.json")
    tokenizer.save(tokenizer_path)
    print(f"Tokenizer trained and saved to {tokenizer_path}")


if __name__ == "__main__":
    main()