IvmeLabs
/

Ivme-Conversate-22M-Base

"""
Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.

Output is a flat uint16 binary file that can be read back as a memmap (vocab
16384 < 65536, so uint16 is exact). We write documents in ASCENDING quality
order so a sequential read during training acts as a curriculum — the model
sees noisier web text first and the densest material (textbooks, then
Wikipedia) last. Research shows this ordering plus a moderate LR decay beats
random shuffling for free.

The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
little denser (more math, stricter web filter):

    FineWeb-HQ (score-gated web)   45%   ~710M tokens   [first / lowest density]
    Python stack (filtered)        10%   ~160M tokens
    FineMath-4+                    15%   ~235M tokens
    Cosmopedia (stanford+wikihow)  25%   ~395M tokens
    Wikipedia EN                    5%    ~80M tokens   [last / highest density]
                                   ----  -----------
                                   100%  ~1.58B tokens  (Chinchilla-optimal)

Usage:
    python prepare_data.py                 # full ~1.58B token build
    python prepare_data.py --smoke         # tiny build to test the pipeline
"""
from __future__ import annotations

import argparse
import os
from itertools import chain

import numpy as np
from huggingface_hub import login
# Authenticate with the Hugging Face Hub only when a token is supplied through
# the HF_TOKEN environment variable. Credentials must never be committed to
# source control: any token that was ever hard-coded in this file should be
# treated as leaked and revoked.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

TOKENIZER_PATH = "ivme_tokenizer.json"  # tokenizer file loaded by build()
OUT_DIR = "data"  # directory that receives train.bin and val.bin
DTYPE = np.uint16  # on-disk dtype of every token id

# (source_key, target_tokens) in ASCENDING quality order — written in this order.
TOKEN_BUDGET = [
    ("fineweb_hq", 710_000_000),
    ("python",     160_000_000),
    ("finemath",   235_000_000),
    ("cosmopedia", 395_000_000),
    ("wikipedia",   80_000_000),
]

# Same sources, same order, but only 200k tokens each: used by --smoke.
SMOKE_BUDGET = [(k, 200_000) for k, _ in TOKEN_BUDGET]

# Total validation budget. build() splits it across sources in proportion to
# each source's share of the token budget and fills each share from the FIRST
# documents of that source's stream.
VAL_TOKENS = 2_000_000
def make_stream(source_key):
    """Open the streaming dataset(s) behind one pretraining source.

    Args:
        source_key: One of "fineweb_hq", "cosmopedia", "finemath", "python"
            or "wikipedia".

    Returns:
        A ``(rows, text_field)`` tuple. ``rows`` is an iterable of row
        mappings (a single streaming dataset, or several chained back to
        back) and ``text_field`` is the key under which each row stores its
        text.

    Raises:
        ValueError: If ``source_key`` is not a known source. The check runs
            before ``datasets`` is imported, so a typo fails immediately.
    """
    # source_key -> (text field, [(hub path, config name or None), ...]).
    # Several entries in the list are streamed one after another.
    sources = {
        "fineweb_hq": ("text", [("epfml/FineWeb-HQ", None)]),
        "cosmopedia": (
            "text",
            [
                ("HuggingFaceTB/cosmopedia", "stanford"),
                ("HuggingFaceTB/cosmopedia", "wikihow"),
            ],
        ),
        "finemath": ("text", [("HuggingFaceTB/finemath", "finemath-4plus")]),
        "python": ("content", [("bigcode/python-stack-v1-functions-filtered", None)]),
        "wikipedia": ("text", [("wikimedia/wikipedia", "20231101.en")]),
    }

    # isinstance guard keeps unhashable keys on the ValueError path.
    spec = sources.get(source_key) if isinstance(source_key, str) else None
    if spec is None:
        raise ValueError(
            f"unknown source_key {source_key!r}; expected one of {sorted(sources)}"
        )
    field, parts = spec

    # Imported lazily: only needed once a valid source is actually opened.
    from datasets import load_dataset

    streams = [
        load_dataset(path, name, split="train", streaming=True)
        for path, name in parts
    ]
    if len(streams) == 1:
        return streams[0], field
    return chain(*streams), field
def build(budget, *, max_val_fraction=0.1):
    """Tokenize every source in *budget* and pack the tokens into flat files.

    Sources are processed in the order given, and each source's training
    tokens are appended to ``OUT_DIR/train.bin`` in that order. The first
    documents of every source go to an in-memory validation buffer that is
    written to ``OUT_DIR/val.bin`` at the end. Every document is terminated
    with the tokenizer's ``<|eos|>`` id.

    Args:
        budget: Sequence of ``(source_key, target_tokens)`` pairs. Whole
            documents are written, so a source may overshoot its target by
            at most one document.
        max_val_fraction: Upper bound on the validation set as a fraction of
            the total token target. The validation budget is
            ``min(VAL_TOKENS, total * max_val_fraction)``, so a tiny budget
            (e.g. ``--smoke``) still leaves tokens for ``train.bin``.

    Raises:
        ValueError: If ``max_val_fraction`` is outside ``[0, 1)``, the
            tokenizer has no ``<|eos|>`` token, or its vocabulary does not
            fit in ``DTYPE``.
    """
    from tokenizers import Tokenizer

    if not 0.0 <= max_val_fraction < 1.0:
        raise ValueError(f"max_val_fraction must be in [0, 1), got {max_val_fraction!r}")

    os.makedirs(OUT_DIR, exist_ok=True)
    tok = Tokenizer.from_file(TOKENIZER_PATH)

    # Fail early instead of crashing (or silently wrapping) on the first document.
    eos_id = tok.token_to_id("<|eos|>")
    if eos_id is None:
        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no '<|eos|>' token")
    max_id = int(np.iinfo(DTYPE).max)
    vocab_size = tok.get_vocab_size()
    if vocab_size - 1 > max_id:
        raise ValueError(
            f"vocab size {vocab_size} does not fit in {np.dtype(DTYPE).name} (max id {max_id})"
        )

    train_path = os.path.join(OUT_DIR, "train.bin")
    val_path = os.path.join(OUT_DIR, "val.bin")

    total_target = sum(n for _, n in budget)
    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

    # Cap validation so a small budget is not swallowed whole by val.bin.
    val_total = min(VAL_TOKENS, int(total_target * max_val_fraction))

    val_buf = []  # small, held in memory
    written_train = 0
    # Context manager guarantees train.bin is closed even if a stream fails.
    with open(train_path, "wb") as train_f:
        for source_key, target in budget:
            stream, field = make_stream(source_key)
            src_written = 0
            # This source's share of the validation budget, proportional to
            # its share of the total target (0 when the total itself is 0).
            val_target = int(val_total * (target / total_target)) if total_target else 0
            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")

            for row in stream:
                # Stop before tokenizing a document that would be discarded.
                if src_written >= target:
                    break
                text = row.get(field)
                if not text:
                    continue
                ids = tok.encode(text).ids
                ids.append(eos_id)  # document boundary
                arr = np.array(ids, dtype=DTYPE)

                # The first val_target tokens of this source go to val, the rest to train.
                if src_written < val_target:
                    val_buf.append(arr)
                else:
                    arr.tofile(train_f)
                    written_train += len(arr)
                src_written += len(arr)

                # Log whenever this document crossed a 5M-token boundary.
                if src_written % 5_000_000 < len(arr):
                    print(f"  [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

            if src_written < target:
                print(
                    f"[data] WARNING: {source_key} exhausted after {src_written:,} tokens "
                    f"(target {target:,})"
                )

    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
    val_arr.tofile(val_path)
    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
    print(f"[data] val.bin   : {len(val_arr):,} tokens -> {val_path}")
    print("[data] curriculum order preserved (sequential read = ascending quality)")
def parse_args(argv=None):
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace, with a boolean ``smoke`` attribute.
    """
    ap = argparse.ArgumentParser(
        description="Stream, tokenize and pack the pretraining data to disk."
    )
    ap.add_argument("--smoke", action="store_true", help="tiny build to test the pipeline")
    return ap.parse_args(argv)


def main(argv=None):
    """Run the build: the smoke budget with ``--smoke``, otherwise the full one."""
    args = parse_args(argv)
    build(SMOKE_BUDGET if args.smoke else TOKEN_BUDGET)


if __name__ == "__main__":
    main()