from __future__ import annotations

import os

import simdjson as json
from tokenizers import Tokenizer, pre_tokenizers, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel, Split, Digits
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.trainers import BpeTrainer
|
|
|
|
def ensure_dir(d):
    """Create directory ``d`` (and any parents) if it does not already exist."""
    os.makedirs(d, exist_ok=True)
|
|
|
|
def read_json(file):
    """Load and return the contents of a JSON file."""
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
def jsonl_batch_generator(jsonl_file: str, batch_size: int):
    """
    Yield batches of up to ``batch_size`` texts from a JSONL file.
    Assumes each line is a JSON object with a 'text' field.
    """
    batch = []
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            batch.append(data["text"])
            if len(batch) >= batch_size:
                yield batch
                batch = []
    # Flush the final partial batch, if any.
    if batch:
        yield batch
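
# A minimal usage sketch for the generator above, assuming a hypothetical
# file "corpus.jsonl" whose lines look like {"text": "..."}:
#
#     for batch in jsonl_batch_generator("corpus.jsonl", batch_size=1024):
#         ...  # each batch is a list of up to 1024 strings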
|
|
def train_or_extend_tokenizer(
    jsonl_file: str,
    vocab_size: int = 100000,
    do_whitespace_pretokenization: bool = True,
):
    """Train a byte-level BPE tokenizer on the texts in a JSONL file."""
    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(
        show_progress=True,
        vocab_size=vocab_size,
        # Seed the vocabulary with all 256 byte-level symbols so any input
        # stays representable even if some bytes never occur in the corpus.
        initial_alphabet=ByteLevel.alphabet(),
    )
|
|
    # Zero-width lookahead that splits runs of digits into groups of three
    # from the right (e.g. "1234567" -> "1", "234", "567").
    regex_string = r"(?=(\d{3})+(?!\d))"
|
|
    if do_whitespace_pretokenization:
        # GPT-style split on letter runs, punctuation runs, and whitespace.
        regex_string += r"| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
|
|
    pretokenizers = [
        # Isolate runs of digits from surrounding text.
        Digits(individual_digits=False),
        # Apply the regex assembled above; "isolated" keeps each match
        # as its own piece.
        Split(
            pattern=Regex(regex_string),
            behavior="isolated",
            invert=False,
        ),
        # Map raw bytes onto the printable byte-level alphabet; regex
        # splitting is already handled above, hence use_regex=False.
        ByteLevel(
            add_prefix_space=False,
            trim_offsets=True,
            use_regex=False,
        ),
    ]
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(pretokenizers)
    # The decoder mirrors the ByteLevel pre-tokenizer so that decoding
    # round-trips the original text.
    tokenizer.decoder = ByteLevelDecoder(
        add_prefix_space=False, trim_offsets=True, use_regex=False
    )
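
    # For a quick sanity check of the splitting alone, the composed
    # pre-tokenizer can be probed before any training, e.g.:
    #
    #     tokenizer.pre_tokenizer.pre_tokenize_str("price: 1234567")
    #
    # which returns (piece, offsets) pairs reflecting the digit grouping
    # and whitespace splits configured above.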
|
|
    generator = jsonl_batch_generator(jsonl_file, 1024)
    tokenizer.train_from_iterator(generator, trainer=trainer)

    return tokenizer
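

# A minimal end-to-end sketch; "data/corpus.jsonl" and "tokenizer_out" are
# placeholder paths, not part of the original script.
if __name__ == "__main__":
    ensure_dir("tokenizer_out")
    tok = train_or_extend_tokenizer(
        "data/corpus.jsonl",
        vocab_size=100000,
        do_whitespace_pretokenization=True,
    )
    # Tokenizer.save writes the full tokenizer definition to a JSON file.
    tok.save(os.path.join("tokenizer_out", "tokenizer.json"))

    # Quick round-trip check.
    enc = tok.encode("Testing 1234567 tokens!")
    print(enc.tokens)
    print(tok.decode(enc.ids))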
|
|