from __future__ import annotations
import os
import random
from pathlib import Path
from filelock import FileLock
import simdjson as json
from tqdm import tqdm
from tokenizers.models import BPE
from tokenizers import Tokenizer, pre_tokenizers, Regex
from tokenizers.pre_tokenizers import ByteLevel, Split, Digits
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.trainers import BpeTrainer
def ensure_dir(d):
    if not os.path.exists(d):
        os.makedirs(d, exist_ok=True)


def read_json(file):
    return json.load(open(file))
def jsonl_batch_generator(jsonl_file: str, batch_size: int):
    """
    A generator that yields batches of texts from a JSONL file.
    Assumes each line is a JSON object with a 'text' field.
    """
    batch = []
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            batch.append(data["text"])
            if len(batch) >= batch_size:
                yield batch
                batch = []
    if batch:
        yield batch  # flush the final, possibly smaller batch
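
# Example usage of the batch generator (illustrative only; the path below is a
# hypothetical placeholder, not part of the original module):
#
#     for batch in jsonl_batch_generator("data/corpus.jsonl", batch_size=1024):
#         ...  # each `batch` is a list of up to 1024 text strings
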
def train_or_extend_tokenizer(
    text_files: str,
    vocab_size: int = 100000,
    do_whitespace_pretokenization: bool = True,
):
    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(show_progress=True, vocab_size=vocab_size)
    regex_string = r"(?=(\d{3})+(?!\d))"  # pretokenize digits in groups of 3 from right to left (from Luca)
    if do_whitespace_pretokenization:
        regex_string += (
            r"| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"  # GPT-2 pretokenization
        )
    pretokenizers = [
        Digits(individual_digits=False),
        Split(
            pattern=Regex(regex_string),
            behavior="isolated",
            invert=False,
        ),
        ByteLevel(
            add_prefix_space=False,
            trim_offsets=True,
            use_regex=False,
        ),
    ]
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(pretokenizers)
    tokenizer.decoder = ByteLevelDecoder(add_prefix_space=False, trim_offsets=True, use_regex=False)
    generator = jsonl_batch_generator(text_files, 1024)
    tokenizer.train_from_iterator(generator, trainer=trainer)
    # tokenizer.train(text_files, trainer)
    return tokenizer
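
# Illustrative usage sketch (assumption: not part of the original module). The
# corpus path, output path, and the smaller vocab size below are hypothetical
# placeholders for demonstration.
if __name__ == "__main__":
    tok = train_or_extend_tokenizer(
        "data/corpus.jsonl",  # hypothetical JSONL file; each line needs a "text" field
        vocab_size=32000,
        do_whitespace_pretokenization=True,
    )
    tok.save("tokenizer.json")  # Tokenizer.save persists the vocab, merges, and pipeline config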