ereniko committed on
Commit
e82a88e
·
verified ·
1 Parent(s): f0169be

Upload prepare_data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. prepare_data.py +137 -0
prepare_data.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.
3
+
4
+ Output is a flat uint16 memmap (vocab 16384 < 65536, so uint16 is exact). We
5
+ write documents in ASCENDING quality order so a sequential read during training
6
+ acts as a curriculum — the model sees noisier web text first and the densest
7
+ material (textbooks, then Wikipedia) last. Research shows this ordering plus a
8
+ moderate LR decay beats random shuffling for free.
9
+
10
+ The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
11
+ little denser (more math, stricter web filter):
12
+
13
+ FineWeb-HQ (score-gated web) 45% ~710M tokens [first / lowest density]
14
+ Python stack (filtered) 10% ~160M tokens
15
+ FineMath-4+ 15% ~235M tokens
16
+ Cosmopedia (stanford+wikihow) 25% ~395M tokens
17
+ Wikipedia EN 5% ~80M tokens [last / highest density]
18
+ ---- -----------
19
+ 100% ~1.58B tokens (Chinchilla-optimal)
20
+
21
+ Usage:
22
+ python prepare_data.py # full ~1.58B token build
23
+ python prepare_data.py --smoke # tiny build to test the pipeline
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import argparse
29
+ import os
30
+
31
+ import numpy as np
32
+
33
+ from huggingface_hub import login
34
+
35
# SECURITY: never embed an access token in source control. The token is read
# from the HF_TOKEN environment variable instead; any token that was ever
# committed to this file must be treated as leaked and revoked in the
# Hugging Face account settings.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    # Only authenticate when a token is supplied; without one the script
    # proceeds unauthenticated instead of blocking on an interactive prompt.
    login(token=_hf_token)
36
+
37
# Tokenizer definition to load, and the directory the packed .bin files go to.
TOKENIZER_PATH = "ivme_tokenizer.json"
OUT_DIR = "data"

# On-disk element type for token ids.
DTYPE = np.uint16

# Per-source token targets as (source_key, target_tokens) pairs, listed in
# ASCENDING quality order. build() processes them in exactly this order.
TOKEN_BUDGET = [
    ("fineweb_hq", 710_000_000),
    ("python", 160_000_000),
    ("finemath", 235_000_000),
    ("cosmopedia", 395_000_000),
    ("wikipedia", 80_000_000),
]

# Same sources, but only 200k tokens each, for a quick end-to-end pipeline test.
SMOKE_BUDGET = [(name, 200_000) for name, _target in TOKEN_BUDGET]

# Total validation tokens, split across sources in proportion to their targets.
# build() takes each source's share from the FIRST tokens it reads from that
# source; everything after that goes to the training file.
VAL_TOKENS = 2_000_000
52
+
53
+
54
def make_stream(source_key):
    """Open the streaming dataset(s) registered under ``source_key``.

    Returns a ``(rows, text_field)`` pair, where ``rows`` is an iterable over
    the dataset rows (``split="train"``, ``streaming=True``) and ``text_field``
    names the row field that holds the document text. A source made of several
    subsets is returned as those subsets chained back to back.

    Raises ``ValueError`` carrying ``source_key`` when the key is unknown.
    """
    from datasets import load_dataset

    # (source key, text field, positional load_dataset args for each subset)
    catalog = (
        ("fineweb_hq", "text", [("epfml/FineWeb-HQ",)]),
        (
            "cosmopedia",
            "text",
            # Two dense subsets, read one after the other.
            [
                ("HuggingFaceTB/cosmopedia", "stanford"),
                ("HuggingFaceTB/cosmopedia", "wikihow"),
            ],
        ),
        ("finemath", "text", [("HuggingFaceTB/finemath", "finemath-4plus")]),
        ("python", "content", [("bigcode/python-stack-v1-functions-filtered",)]),
        ("wikipedia", "text", [("wikimedia/wikipedia", "20231101.en")]),
    )

    for key, text_field, subsets in catalog:
        if source_key != key:
            continue
        opened = [
            load_dataset(*spec, split="train", streaming=True) for spec in subsets
        ]
        if len(opened) == 1:
            return opened[0], text_field
        from itertools import chain

        return chain(*opened), text_field

    raise ValueError(source_key)
77
+
78
+
79
def build(budget):
    """Stream, tokenize and pack every source in ``budget`` into ``OUT_DIR``.

    Sources are processed in the order given, so ``train.bin`` keeps the
    caller's ordering. For each source the FIRST ``val_target`` tokens are
    held in memory for ``val.bin`` and everything after that is appended to
    ``train.bin``. Every document is followed by the ``<|eos|>`` id. Documents
    are written whole, so a source can overshoot its target by up to one
    document.

    Args:
        budget: sequence of ``(source_key, target_tokens)`` pairs; each
            ``source_key`` must be accepted by ``make_stream``.

    Raises:
        ValueError: if the tokenizer has no ``<|eos|>`` token or its vocabulary
            does not fit in ``DTYPE``.
    """
    from tokenizers import Tokenizer

    os.makedirs(OUT_DIR, exist_ok=True)
    tok = Tokenizer.from_file(TOKENIZER_PATH)

    eos_id = tok.token_to_id("<|eos|>")
    if eos_id is None:
        # Without this check None would be appended to every id list and the
        # failure would surface later as an opaque array-conversion error.
        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no '<|eos|>' token")

    max_id = int(np.iinfo(DTYPE).max)
    vocab_size = tok.get_vocab_size()
    if vocab_size > max_id + 1:
        # Ids above the dtype's range cannot be stored losslessly.
        raise ValueError(
            f"vocab size {vocab_size} does not fit in {np.dtype(DTYPE).name}"
        )

    train_path = os.path.join(OUT_DIR, "train.bin")
    val_path = os.path.join(OUT_DIR, "val.bin")

    total_target = sum(n for _, n in budget)
    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

    val_buf = []  # small, held in memory
    written_train = 0

    # The context manager closes train.bin even if a stream or the tokenizer
    # raises part-way through the build.
    with open(train_path, "wb") as train_f:
        for source_key, target in budget:
            stream, field = make_stream(source_key)
            src_written = 0

            # Validation share is proportional to this source's slice of the
            # budget, but capped at a tenth of its own target: with a small
            # budget the uncapped share can exceed the target itself, which
            # would leave train.bin empty.
            share = target / total_target if total_target else 0.0
            val_target = min(int(VAL_TOKENS * share), target // 10)
            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")

            for row in stream:
                # Check the budget before tokenizing so no document is encoded
                # only to be thrown away.
                if src_written >= target:
                    break
                text = row.get(field)
                if not text:
                    continue
                ids = tok.encode(text).ids
                ids.append(eos_id)  # document boundary
                arr = np.array(ids, dtype=DTYPE)

                # First val_target tokens of this source go to val, the rest
                # to train.
                if src_written < val_target:
                    val_buf.append(arr)
                else:
                    arr.tofile(train_f)
                    written_train += len(arr)
                src_written += len(arr)

                # Report whenever this document crossed a 5M-token boundary.
                if src_written % 5_000_000 < len(arr):
                    print(f"  [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

            if src_written < target:
                # The stream ran dry before the budget was met; say so instead
                # of silently producing a smaller dataset.
                print(
                    f"[data] WARNING {source_key}: stream exhausted at "
                    f"{src_written:,} of {target:,} tokens"
                )

    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
    val_arr.tofile(val_path)

    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
    print(f"[data] val.bin : {len(val_arr):,} tokens -> {val_path}")
    print("[data] curriculum order preserved (sequential read = ascending quality)")
131
+
132
+
133
if __name__ == "__main__":
    # Command-line entry point: --smoke selects the tiny per-source budget,
    # otherwise the full token budget is built.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke",
        action="store_true",
        help="tiny build to test the pipeline",
    )
    cli = parser.parse_args()
    if cli.smoke:
        build(SMOKE_BUDGET)
    else:
        build(TOKEN_BUDGET)