"""Tokenize a streamed sample of a Hugging Face dataset into token files.

The script streams ``DATASET_NAME`` / ``SAMPLE_NAME``, encodes each document's
``text`` field with the GPT-2 ``tiktoken`` encoding (appending the
end-of-text token after every document), stops once at least
``TARGET_TOKENS`` tokens have been collected, and writes the first 95% of the
tokens to ``train.bin`` and the remainder to ``val.bin`` as raw ``uint16``
values in the current working directory.
"""

import os

import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm

DATASET_NAME = "HuggingFaceFW/fineweb-edu"
SAMPLE_NAME = "sample-10BT"
TARGET_TOKENS = 100_000_000
NUM_PROC = 8  # Not referenced anywhere in this script.

enc = tiktoken.get_encoding("gpt2")


def process(example: dict) -> dict:
    """Encode one dataset record into token ids.

    Args:
        example: A record with a ``'text'`` entry holding the document text.

    Returns:
        A dict with ``'ids'`` (the encoded tokens followed by the encoder's
        end-of-text token) and ``'len'`` (the number of ids).
    """
    ids = enc.encode_ordinary(example['text'])
    ids.append(enc.eot_token)
    return {'ids': ids, 'len': len(ids)}


def _collect_tokens(dataset, target_tokens: int) -> np.ndarray:
    """Encode records from *dataset* until *target_tokens* is reached.

    Documents are never truncated: the document that crosses the target is
    kept whole, so the result may be slightly longer than *target_tokens*.
    Returns an empty array if *dataset* yields no records.
    """
    # Keep each document as a compact uint16 array instead of extending one
    # huge Python list of boxed ints; this keeps peak memory close to the
    # size of the final output. GPT-2 token ids (vocabulary of 50,257) fit
    # in uint16.
    chunks = []
    collected = 0
    # The context manager closes the bar even if the stream raises midway.
    with tqdm(total=target_tokens, desc="Collecting tokens") as pbar:
        for example in dataset:
            ids = process(example)['ids']
            chunks.append(np.asarray(ids, dtype=np.uint16))
            collected += len(ids)
            pbar.update(len(ids))
            if collected >= target_tokens:
                break

    if not chunks:
        # np.concatenate rejects an empty sequence of arrays.
        return np.empty(0, dtype=np.uint16)
    return np.concatenate(chunks)


def _write_splits(tokens: np.ndarray) -> None:
    """Write the first 95% of *tokens* to train.bin and the rest to val.bin."""
    n = len(tokens)
    split = int(n*0.95)
    for name, part in (('train', tokens[:split]), ('val', tokens[split:])):
        filename = f"{name}.bin"
        part.tofile(filename)
        print(f"Saved {filename} with {len(part):,} tokens.")


def main() -> None:
    """Stream the dataset, collect tokens, and save the train/val files."""
    print(f"Loading streaming dataset {DATASET_NAME}...")
    dataset = load_dataset(
        DATASET_NAME, name=SAMPLE_NAME, split='train', streaming=True
    )
    _write_splits(_collect_tokens(dataset, TARGET_TOKENS))
    print("\nDone!")


if __name__ == "__main__":
    main()