import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset

# GPT-2 BPE tokenizer; every token id fits in uint16 (vocab size 50257)
enc = tiktoken.get_encoding("gpt2")

if __name__ == '__main__':
    # download the 10B-token sample of FineWeb-Edu
    dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train")

    # hold out a tiny validation set and rename the 'test' split to 'val'
    split_dataset = dataset.train_test_split(test_size=0.0005, seed=2357, shuffle=True)
    split_dataset['val'] = split_dataset.pop('test')
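
    # The result is a DatasetDict with two splits (row counts depend on the
    # download and are elided here):
    # DatasetDict({
    #     'train': Dataset({features: ['text', ...], num_rows: ...}),
    #     'val':   Dataset({features: ['text', ...], num_rows: ...}),
    # })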

    def process(example):
        # encode_ordinary ignores any special tokens while encoding
        ids = enc.encode_ordinary(example['text'])
        ids.append(enc.eot_token)  # end-of-text token marks document boundaries
        return {'ids': ids, 'len': len(ids)}
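
    # Illustrative only (token ids assume the GPT-2 BPE loaded above):
    #   process({'text': 'hello world'})
    #   -> {'ids': [31373, 995, 50256], 'len': 3}
    # i.e. "hello" -> 31373, " world" -> 995, then the end-of-text id 50256.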

    # tokenize both splits, parallelized over 8 worker processes
    tokenized = split_dataset.map(process, remove_columns=['text'], desc="tokenizing", num_proc=8)

    # concatenate each split's token ids into one flat binary file
    for split, dset in tokenized.items():
        arr_len = np.sum(dset['len'], dtype=np.int64)
        filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
        # uint16 suffices: every GPT-2 token id (including eot, 50256) is < 2**16
        arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(arr_len,))
        idx = 0
        for example in tqdm(dset, desc=f"writing {filename}"):
            arr[idx : idx + example['len']] = example['ids']
            idx += example['len']
        arr.flush()
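
    # To consume the files later, memmap them read-only and slice fixed-length
    # blocks of token ids; a minimal sketch (names here are illustrative):
    #   data = np.memmap('train.bin', dtype=np.uint16, mode='r')
    #   block = data[:1024].astype(np.int64)  # one context window of token ids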