# pentanet-124m / prepare_data.py
import os
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm

DATA_DIR = "data/wikitext-103"


def prepare():
    os.makedirs(DATA_DIR, exist_ok=True)

    print("📥 Loading wikitext-103-v1 from HuggingFace datasets...")
    dataset = load_dataset("wikitext", "wikitext-103-v1")

    print("📥 Loading GPT-2 tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    for split in ['train', 'validation', 'test']:
        output_file = os.path.join(DATA_DIR, f"{split}.bin")
        if os.path.exists(output_file):
            print(f"⏩ {output_file} already exists, skipping.")
            continue

        print(f"⚗️ Tokenizing {split} split...")
        all_ids = []
        # Iterate line by line so tqdm can report progress; skip empty lines.
        for text in tqdm(dataset[split]['text'], desc=split):
            if not text.strip():
                continue
            # Append eos_token_id after every line so sequences get clean
            # boundaries; the GPT-2 tokenizer does not add separators itself.
            ids = tokenizer.encode(text) + [tokenizer.eos_token_id]
            all_ids.extend(ids)

        # uint32 comfortably holds GPT-2's 50257-token vocabulary.
        arr = np.array(all_ids, dtype=np.uint32)
        arr.tofile(output_file)
        print(f"✅ {split}.bin created: {len(arr)} tokens.")
if __name__ == '__main__':
    prepare()
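
# Example usage of the sketch helpers above (illustrative; assumes this script
# has already been run so data/wikitext-103/train.bin exists):
#   data = load_tokens('train')
#   x, y = get_batch(data)  # x, y: int64 tensors of shape [batch_size, block_size]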