Create prepare.py
The data prepare script for FineWeb-Edu-10BT. This script prepares the pretraining data binary files.
- prepare.py +29 -0
prepare.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset

# GPT-2 BPE tokenizer. Its vocab size (50257) fits in an unsigned 16-bit
# integer, which is why the token streams below are written as np.uint16.
enc = tiktoken.get_encoding("gpt2")

if __name__ == '__main__':
    # Download the 10BT sample of FineWeb-Edu and carve off a tiny,
    # deterministically-seeded validation split.
    dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train")
    split_dataset = dataset.train_test_split(test_size=0.0005, seed=2357, shuffle=True)
    split_dataset['val'] = split_dataset.pop('test')  # rename 'test' split to 'val'

    def process(example):
        """Tokenize one document; append EOT so document boundaries survive concatenation."""
        # encode_ordinary ignores special tokens in the raw text
        ids = enc.encode_ordinary(example['text'])
        ids.append(enc.eot_token)
        return {'ids': ids, 'len': len(ids)}

    tokenized = split_dataset.map(process, remove_columns=['text'], desc="tokenizing", num_proc=8)

    # Concatenate every document's tokens per split into one flat binary file
    # (train.bin / val.bin) next to this script.
    for split, dset in tokenized.items():
        arr_len = np.sum(dset['len'], dtype=np.int64)
        filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
        # memmap streams tokens to disk without holding the whole split in RAM
        arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(arr_len,))
        idx = 0
        # was: desc=f"writing (unknown)" — a placeholder-less f-string;
        # interpolate the output path so progress bars identify the file
        for example in tqdm(dset, desc=f"writing {filename}"):
            arr[idx : idx + example['len']] = example['ids']
            idx += example['len']
        arr.flush()
|