File size: 1,361 Bytes
80603a0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | import os
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm
# Output directory (a relative path, so resolved against the current working
# directory). prepare() creates it if needed and writes one "<split>.bin"
# token file per dataset split into it.
DATA_DIR = "data/wikitext-103"
def _write_token_file(lines, tokenizer, output_file, desc, flush_every=1_000_000):
    """Tokenize *lines* and stream the ids to *output_file* as raw uint32.

    Lines that are empty or whitespace-only are skipped. Every kept line is
    encoded with ``tokenizer.encode`` and followed by ``tokenizer.eos_token_id``.

    The ids are written in chunks of roughly ``flush_every`` tokens so that the
    whole split never has to sit in a single Python list. The data goes to
    ``output_file + ".tmp"`` first and is moved into place with ``os.replace``
    only after everything was written, so an interrupted run never leaves a
    truncated ``output_file`` behind.

    Args:
        lines: Iterable of text lines to tokenize.
        tokenizer: Object exposing ``encode(text)`` and ``eos_token_id``.
        output_file: Destination path of the binary token file.
        desc: Label shown on the tqdm progress bar.
        flush_every: Number of buffered token ids that triggers a write.

    Returns:
        The total number of token ids written.
    """
    eos_id = tokenizer.eos_token_id
    tmp_file = output_file + ".tmp"
    pending = []
    total = 0

    try:
        with open(tmp_file, "wb") as sink:
            for text in tqdm(lines, desc=desc):
                if not text.strip():
                    continue
                # An EOS id after every line gives clean boundaries between
                # lines in the flat token stream.
                pending.extend(tokenizer.encode(text))
                pending.append(eos_id)
                if len(pending) >= flush_every:
                    np.array(pending, dtype=np.uint32).tofile(sink)
                    total += len(pending)
                    pending.clear()
            if pending:
                np.array(pending, dtype=np.uint32).tofile(sink)
                total += len(pending)
        # Publish the finished file in one step.
        os.replace(tmp_file, output_file)
    finally:
        # After a successful replace the temp file is gone; this only fires
        # when tokenizing or writing was interrupted.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    return total


def prepare():
    """Tokenize wikitext-103 with the GPT-2 tokenizer into per-split .bin files.

    Creates ``DATA_DIR`` if needed, loads the ``wikitext-103-v1`` dataset and
    the ``gpt2`` tokenizer, then writes ``train.bin``, ``validation.bin`` and
    ``test.bin`` into ``DATA_DIR``. Each file is a headerless sequence of
    uint32 token ids with an EOS id after every non-blank line. A split whose
    output file already exists is skipped.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    print("📥 Loading wikitext-103-v1 from HuggingFace datasets...")
    dataset = load_dataset("wikitext", "wikitext-103-v1")

    print("📥 Loading GPT-2 tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    for split in ['train', 'validation', 'test']:
        output_file = os.path.join(DATA_DIR, f"{split}.bin")
        if os.path.exists(output_file):
            print(f"⏩ {output_file} already exists.")
            continue

        print(f"⚗️ Tokenizing {split} split...")
        token_count = _write_token_file(
            dataset[split]['text'], tokenizer, output_file, desc=split
        )
        print(f"✅ {split}.bin created: {token_count} tokens.")
# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    prepare()
|