File size: 1,361 Bytes

80603a0

import os
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm

DATA_DIR = "data/wikitext-103"

def prepare():
    os.makedirs(DATA_DIR, exist_ok=True)
    print("📥 Loading wikitext-103-v1 from HuggingFace datasets...")
    dataset = load_dataset("wikitext", "wikitext-103-v1")
    
    print("📥 Loading GPT-2 tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    
    for split in ['train', 'validation', 'test']:
        output_file = os.path.join(DATA_DIR, f"{split}.bin")
        if os.path.exists(output_file):
            print(f"⏩ {output_file} already exists.")
            continue
            
        print(f"⚗️ Tokenizing {split} split...")
        all_ids = []
        # Process in chunks to give progress bar
        for text in tqdm(dataset[split]['text'], desc=split):
            if not text.strip(): 
                continue
            # Adding eos_token_id to separate lines as GPT2 generally does not encode \n perfectly or we want clean boundaries
            ids = tokenizer.encode(text) + [tokenizer.eos_token_id]
            all_ids.extend(ids)
            
        arr = np.array(all_ids, dtype=np.uint32)
        arr.tofile(output_file)
        print(f"✅ {split}.bin created: {len(arr)} tokens.")

if __name__ == '__main__':
    prepare()