from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors, Regex
# --- CONFIGURATION ---
DATASET_NAME = "sedthh/gutenberg_english"
VOCAB_SIZE = 32000
SAMPLE_SIZE = 3000
BATCH_SIZE = 100
# 1. Connect
print(f"1. Connecting to {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
# 2. The Generator
def batch_iterator():
    """Stream the dataset and yield lists of raw texts, BATCH_SIZE at a time."""
    batch = []
    print("2. Collecting data...")
    for i, item in enumerate(dataset):
        if i >= SAMPLE_SIZE:
            break
        batch.append(item['TEXT'])
        if len(batch) == BATCH_SIZE:
            print(f" > Processing batch {(i+1)//BATCH_SIZE}...", end='\r')
            yield batch
            batch = []
    if batch:  # flush the final partial batch
        yield batch
# 3. Tokenizer
print("\n3. Initializing Tokenizer...")
tokenizer = Tokenizer(models.BPE())
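# The split pattern below mirrors the Qwen/GPT-4-style pre-tokenization rule:
# contractions, runs of letters, individual digits, punctuation blocks, and
# whitespace (including newlines) are isolated before byte-level BPE is applied.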
qwen_pattern = Regex(r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""")
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Split(pattern=qwen_pattern, behavior="isolated"),
    pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
])
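
# A minimal sanity check (an addition, not part of the original script): inspect
# how the Split + ByteLevel pre-tokenizer chunks a sample string before any BPE
# merges are learned. pre_tokenize_str() returns (piece, (start, end)) pairs.
sample_splits = tokenizer.pre_tokenizer.pre_tokenize_str("Hello, world! It's 1815.")
print(" > Pre-tokenized sample:", [piece for piece, _ in sample_splits])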
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=["<|endoftext|>", "<|padding|>"],
    show_progress=True,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)
# 4. Train
print("4. Training Qwen-style tokenizer...")
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
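# Quick post-training check (an addition): confirm how many tokens were actually learned.
print(f" > Trained vocab size: {tokenizer.get_vocab_size()}")
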
# 5. Save
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.save("qwen_style_tokenizer.json")
print(f"\nSUCCESS! Saved 'qwen_style_tokenizer.json'") |