LH-Tech-AI committed on
Commit
d9350d1
·
verified ·
1 Parent(s): 80bd48b

Create prepare.py

Browse files

The data preparation script for Fineweb-Edu-10BT. This script prepares the pretraining data binary files.

Files changed (1) hide show
  1. prepare.py +29 -0
prepare.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library.
import os

# Third-party (grouped after stdlib, alphabetized).
import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm

# GPT-2 byte-pair tokenizer used to encode every document; token ids from
# this encoding are later stored as uint16 in the output binaries.
enc = tiktoken.get_encoding("gpt2")
9
if __name__ == '__main__':
    # Download the Fineweb-Edu 10BT sample and carve off a tiny, reproducible
    # validation split (0.05%, fixed seed so re-runs produce identical splits).
    dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train")
    split_dataset = dataset.train_test_split(test_size=0.0005, seed=2357, shuffle=True)
    # Rename "test" -> "val": the output files become train.bin / val.bin.
    split_dataset['val'] = split_dataset.pop('test')

    def process(example):
        """Tokenize one document and append the end-of-text delimiter.

        Returns {'ids': token ids, 'len': count}; the 'len' column lets the
        total output size be summed without re-walking the ids.
        """
        ids = enc.encode_ordinary(example['text'])  # plain BPE, no special-token handling
        ids.append(enc.eot_token)  # <|endoftext|> separates documents in the flat stream
        return {'ids': ids, 'len': len(ids)}

    # Tokenize both splits with 8 worker processes; drop the raw text column
    # so the cached arrow files only hold token ids.
    tokenized = split_dataset.map(process, remove_columns=['text'], desc="tokenizing", num_proc=8)

    # Concatenate each split's token ids into one flat binary file next to
    # this script, using a memmap so the multi-GB array never lives in RAM.
    for split, dset in tokenized.items():
        # int64 accumulator: the 10BT sample overflows int32 token counts.
        arr_len = np.sum(dset['len'], dtype=np.int64)
        filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
        # uint16 per token — assumes all GPT-2 token ids fit below 2**16; TODO
        # confirm before reusing with a larger-vocabulary tokenizer.
        arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(arr_len,))
        idx = 0
        # Fix: the progress label was the placeholder-less f-string
        # "writing (unknown)"; interpolate the actual output path, and give
        # tqdm the total so the bar shows real progress.
        for example in tqdm(dset, total=len(dset), desc=f"writing {filename}"):
            arr[idx : idx + example['len']] = example['ids']
            idx += example['len']
        arr.flush()  # force the memmap pages to disk before moving on