# Snapshot of this script taken at commit c96ac34 (file size: 2,319 bytes).
# Tokenizes the "sample-10BT" configuration of HuggingFaceFW/fineweb-edu with
# the GPT-2 tiktoken encoding and saves uint16 token shards to "edu_fineweb10B".
import os
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
# ------------------------------------------
# Output directory name (resolved relative to this script's directory) and the
# fineweb-edu configuration that is loaded below.
local_dir = "edu_fineweb10B"
remote_name = "sample-10BT"
# Number of tokens stored in each output shard (100 million).
shard_size = int(1e8)

# Create the output directory beside this script if it does not exist yet.
DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), local_dir)
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

# Load the "train" split of the selected fineweb-edu configuration.
fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train")

# GPT-2 tokenizer. `eot` is the id of its <|endoftext|> special token; read it
# through the public `eot_token` property instead of the private
# `_special_tokens` mapping.
enc = tiktoken.get_encoding("gpt2")
eot = enc.eot_token
def tokenize(doc):
    """Tokenize one document and return its token ids as a uint16 array.

    The ``eot`` id is placed first, followed by the ids produced by
    ``enc.encode_ordinary`` for ``doc["text"]``.

    Raises:
        AssertionError: if any id falls outside the range ``[0, 2**16)`` and
            therefore cannot be stored as uint16.
    """
    ids = np.array([eot, *enc.encode_ordinary(doc["text"])])
    # Guard the narrowing cast below: uint16 can only hold 0..65535.
    fits_uint16 = (0 <= ids).all() and (ids < 2**16).all()
    assert fits_uint16, "token dictionary too large for uint16"
    return ids.astype(np.uint16)
def write_datafile(filename, tokens_np):
    """Save the array ``tokens_np`` to ``filename`` using ``np.save``."""
    np.save(file=filename, arr=tokens_np)
#
def _shard_path(shard_index):
    """Return the output path (without extension) for shard ``shard_index``."""
    # Shard 0 is labelled "val"; every later shard is labelled "train".
    split = "val" if shard_index == 0 else "train"
    return os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")


def main():
    """Tokenize every document in ``fw`` and write the tokens out in shards.

    Documents are tokenized in a process pool and their tokens are packed
    back to back into a buffer of ``shard_size`` uint16 values. Each time the
    buffer fills up it is written with ``write_datafile`` and the rest of the
    current document carries over into the next shard. Whatever is left in
    the buffer at the end is written as a final, shorter shard.
    """
    # os.cpu_count() may return None when the count cannot be determined.
    nprocs = max(1, (os.cpu_count() or 1) // 2)
    with mp.Pool(nprocs) as pool:
        shard_index = 0
        # Preallocated buffer holding the shard that is currently being filled.
        all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
        token_count = 0
        progress_bar = None
        for tokens in pool.imap(tokenize, fw, chunksize=16):
            # Flush one shard for every buffer-full this document completes.
            # A loop (rather than a single branch) also copes with a document
            # that is longer than a whole shard.
            while token_count + len(tokens) >= shard_size:
                remainder = shard_size - token_count
                all_tokens_np[token_count:] = tokens[:remainder]
                # The bar only exists once something was appended to this
                # shard, so it can still be None here.
                if progress_bar is not None:
                    progress_bar.update(remainder)
                    progress_bar.close()
                    progress_bar = None
                write_datafile(_shard_path(shard_index), all_tokens_np)
                shard_index += 1
                tokens = tokens[remainder:]
                token_count = 0
            if len(tokens) == 0:
                # The document ended exactly on a shard boundary.
                continue
            # Append the (remaining) tokens to the current shard.
            all_tokens_np[token_count:token_count + len(tokens)] = tokens
            token_count += len(tokens)
            if progress_bar is None:
                progress_bar = tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}")
            progress_bar.update(len(tokens))
        if progress_bar is not None:
            progress_bar.close()
        # Write whatever is left as the last, partially filled shard.
        if token_count != 0:
            write_datafile(_shard_path(shard_index), all_tokens_np[:token_count])


# Creating a process pool at import time breaks under the "spawn" and
# "forkserver" start methods, where workers re-import this module; only run
# the pipeline when executed as a script.
if __name__ == "__main__":
    main()