Upload prepare_data.py with huggingface_hub
Browse files- prepare_data.py +137 -0
prepare_data.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.
|
| 3 |
+
|
| 4 |
+
Output is a flat uint16 memmap (vocab 16384 < 65536, so uint16 is exact). We
|
| 5 |
+
write documents in ASCENDING quality order so a sequential read during training
|
| 6 |
+
acts as a curriculum — the model sees noisier web text first and the densest
|
| 7 |
+
material (textbooks, then Wikipedia) last. Research shows this ordering plus a
|
| 8 |
+
moderate LR decay beats random shuffling for free.
|
| 9 |
+
|
| 10 |
+
The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
|
| 11 |
+
little denser (more math, stricter web filter):
|
| 12 |
+
|
| 13 |
+
FineWeb-HQ (score-gated web) 45% ~710M tokens [first / lowest density]
|
| 14 |
+
Python stack (filtered) 10% ~160M tokens
|
| 15 |
+
FineMath-4+ 15% ~235M tokens
|
| 16 |
+
Cosmopedia (stanford+wikihow) 25% ~395M tokens
|
| 17 |
+
Wikipedia EN 5% ~80M tokens [last / highest density]
|
| 18 |
+
---- -----------
|
| 19 |
+
100% ~1.58B tokens (Chinchilla-optimal)
|
| 20 |
+
|
| 21 |
+
Usage:
|
| 22 |
+
python prepare_data.py # full ~1.58B token build
|
| 23 |
+
python prepare_data.py --smoke # tiny build to test the pipeline
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import argparse
|
| 29 |
+
import os
|
| 30 |
+
|
| 31 |
+
import numpy as np
|
| 32 |
+
|
| 33 |
+
from huggingface_hub import login
|
| 34 |
+
|
| 35 |
+
# Authenticate with the Hugging Face Hub using a token supplied through the
# HF_TOKEN environment variable. Access tokens must never be written into
# source: anything committed is effectively public, so a token that has
# appeared in this file should be revoked and replaced.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
|
| 36 |
+
|
| 37 |
+
# Tokenizer definition loaded by build() via Tokenizer.from_file.
TOKENIZER_PATH = "ivme_tokenizer.json"
# Directory that receives train.bin and val.bin (created if missing).
OUT_DIR = "data"
# On-disk token type; the module docstring states a 16384-entry vocab, which
# fits in uint16.
DTYPE = np.uint16

# (source_key, target_tokens) in ASCENDING quality order — written in this order.
# Each source_key must be one that make_stream() recognises.
TOKEN_BUDGET = [
    ("fineweb_hq", 710_000_000),
    ("python", 160_000_000),
    ("finemath", 235_000_000),
    ("cosmopedia", 395_000_000),
    ("wikipedia", 80_000_000),
]
# Same sources in the same order, 200k tokens each, for a quick pipeline test.
SMOKE_BUDGET = [(k, 200_000) for k, _ in TOKEN_BUDGET]

# Overall validation-token budget. build() derives each source's validation
# quota from this in proportion to the source's token target and fills it from
# the FIRST documents streamed from that source (not from its tail).
VAL_TOKENS = 2_000_000
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def make_stream(source_key):
    """Open the streaming dataset(s) registered under ``source_key``.

    Returns a ``(rows, text_field)`` pair, where ``rows`` is an iterable of
    dataset rows and ``text_field`` names the column holding each row's text.
    Raises ``ValueError`` carrying ``source_key`` when the key is not known.
    """
    from datasets import load_dataset

    # source key -> (positional load_dataset args per subset, text column)
    catalog = {
        "fineweb_hq": ([("epfml/FineWeb-HQ",)], "text"),
        "cosmopedia": (
            # Two dense subsets, read back to back.
            [
                ("HuggingFaceTB/cosmopedia", "stanford"),
                ("HuggingFaceTB/cosmopedia", "wikihow"),
            ],
            "text",
        ),
        "finemath": ([("HuggingFaceTB/finemath", "finemath-4plus")], "text"),
        "python": ([("bigcode/python-stack-v1-functions-filtered",)], "content"),
        "wikipedia": ([("wikimedia/wikipedia", "20231101.en")], "text"),
    }

    try:
        subsets, text_field = catalog[source_key]
    except (KeyError, TypeError):
        raise ValueError(source_key) from None

    opened = [
        load_dataset(*positional, split="train", streaming=True)
        for positional in subsets
    ]
    if len(opened) == 1:
        # Single-subset sources hand back the dataset object itself.
        return opened[0], text_field

    from itertools import chain

    return chain(*opened), text_field
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def build(budget, *, max_val_fraction=0.1):
    """Stream, tokenize and pack every source in ``budget`` to disk.

    Sources are processed in the order given. Each non-empty document is
    encoded, terminated with the ``<|eos|>`` id and appended as raw ``DTYPE``
    values to ``train.bin`` in ``OUT_DIR``. The first tokens streamed from each
    source are diverted to ``val.bin`` instead, up to that source's validation
    quota. Documents are never split, so a source (and its validation slice)
    may overshoot its target by part of one document.

    Args:
        budget: Sequence of ``(source_key, target_tokens)`` pairs; it is
            traversed more than once, so it must not be a one-shot iterator.
        max_val_fraction: Upper bound on the share of a single source's
            target that may go to validation. Without it, a small budget
            (such as the smoke build) gets a quota derived from
            ``VAL_TOKENS`` that exceeds the source's whole target, leaving
            ``train.bin`` empty.

    Raises:
        ValueError: If the tokenizer has no ``<|eos|>`` token, or reports a
            vocabulary too large to store in ``DTYPE``.
    """
    from tokenizers import Tokenizer

    os.makedirs(OUT_DIR, exist_ok=True)
    tok = Tokenizer.from_file(TOKENIZER_PATH)

    eos_id = tok.token_to_id("<|eos|>")
    if eos_id is None:
        # token_to_id returns None for unknown tokens; fail here with a clear
        # message rather than later inside np.array.
        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no '<|eos|>' token")

    id_limit = int(np.iinfo(DTYPE).max) + 1
    vocab_size = tok.get_vocab_size()
    if vocab_size > id_limit:
        raise ValueError(
            f"vocab size {vocab_size} does not fit in {np.dtype(DTYPE).name} "
            f"(max {id_limit} ids)"
        )

    train_path = os.path.join(OUT_DIR, "train.bin")
    val_path = os.path.join(OUT_DIR, "val.bin")

    total_target = sum(n for _, n in budget)
    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

    val_buf = []  # small, held in memory
    written_train = 0

    # The context manager guarantees train.bin is closed even if a stream or
    # the tokenizer raises part-way through.
    with open(train_path, "wb") as train_f:
        for source_key, target in budget:
            stream, field = make_stream(source_key)
            src_written = 0

            # Validation quota: this source's proportional share of VAL_TOKENS,
            # capped so it can never swallow the whole source.
            if total_target > 0:
                val_target = int(VAL_TOKENS * (target / total_target))
            else:
                val_target = 0
            val_target = min(val_target, int(target * max_val_fraction))
            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")

            for row in stream:
                # Stop before tokenizing once the source budget is met.
                if src_written >= target:
                    break

                text = row.get(field)
                if not text:
                    continue
                ids = tok.encode(text).ids
                ids.append(eos_id)  # document boundary
                arr = np.array(ids, dtype=DTYPE)

                # The first val_target tokens of this source go to val, the
                # rest to train.
                if src_written < val_target:
                    val_buf.append(arr)
                else:
                    arr.tofile(train_f)
                    written_train += len(arr)
                src_written += len(arr)

                # Report whenever this document crossed a 5M-token boundary.
                if src_written % 5_000_000 < len(arr):
                    print(f"  [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
    val_arr.tofile(val_path)

    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
    print(f"[data] val.bin : {len(val_arr):,} tokens -> {val_path}")
    print("[data] curriculum order preserved (sequential read = ascending quality)")
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
    # Command-line entry point: --smoke swaps in the tiny per-source budget.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke",
        action="store_true",
        help="tiny build to test the pipeline",
    )
    cli = parser.parse_args()
    if cli.smoke:
        build(SMOKE_BUDGET)
    else:
        build(TOKEN_BUDGET)
|