| """Download docs_selected.jsonl from Hugging Face and tokenize it locally. |
| |
| This script is standalone. It does not import any local exporter or tokenizer |
| helpers. Tokenizer configs are JSON only and currently support the built-in |
| pure-byte and SentencePiece tokenizer definitions in `data/tokenizer_specs.json`. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import shutil |
| from dataclasses import asdict, dataclass |
| from pathlib import Path |
| from typing import Any |
|
|
| import numpy as np |
| from huggingface_hub import hf_hub_download |
| from huggingface_hub.utils import EntryNotFoundError |
|
|
|
|
# Remote artifact names inside the Hugging Face dataset repo.
DOCS_FILENAME = "docs_selected.jsonl"
SIDECAR_FILENAME = "docs_selected.source_manifest.json"
# Dataset version tag embedded in exported dataset directory names (fineweb<VERSION>_<suffix>).
VERSION = "10B"
# Default number of leading documents reserved for the validation split.
NUM_VAL_DOCS = 50_000
# Default shard size in tokens (100M tokens per .bin file).
SHARD_SIZE = 10**8
# Whether to append an EOS token after each document (BOS is always prepended).
APPEND_EOS = False
# Magic number and version written into the 256-int32 header of each shard.
DATAFILE_MAGIC = 20240520
DATAFILE_VERSION = 1
# Remote repo id / root prefix; overridable via environment variables.
DEFAULT_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
DEFAULT_REMOTE_ROOT = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
# Tokenizer spec config shipped next to this script.
DEFAULT_CONFIG = Path(__file__).with_name("tokenizer_specs.json")
# Worker thread count for SentencePiece batch encoding; floor of 1, defaults to cpu_count (or 8).
TOKENIZER_THREADS = max(1, int(os.environ.get("MATCHED_FINEWEB_TOKENIZER_THREADS", str(os.cpu_count() or 8))))
# Number of documents encoded per batch when the tokenizer supports batch encoding; floor of 1.
SP_BATCH_SIZE = max(1, int(os.environ.get("MATCHED_FINEWEB_SP_BATCH_SIZE", "1024")))
|
|
|
|
@dataclass(frozen=True)
class PureByteTokenizer:
    """Byte-level tokenizer: each UTF-8 byte maps to id `byte_offset + byte_value`.

    Ids 0-3 are reserved for pad/bos/eos/unk specials; the 256 raw byte
    values follow, giving a 260-entry vocabulary by default.
    """

    pad_id: int = 0
    bos_id: int = 1
    eos_id: int = 2
    unk_id: int = 3
    byte_offset: int = 4
    byte_count: int = 256

    @property
    def vocab_size(self) -> int:
        """Total vocabulary size: special tokens plus the byte range."""
        return self.byte_count + self.byte_offset

    def encode(self, text: str) -> np.ndarray:
        """Encode *text* to a uint16 id array (UTF-8 bytes shifted by byte_offset)."""
        raw = text.encode("utf-8", errors="replace")
        byte_ids = np.frombuffer(raw, dtype=np.uint8).astype(np.uint16, copy=False)
        return byte_ids + self.byte_offset

    def encode_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Encode each text independently; there is no cross-document state."""
        return [self.encode(item) for item in texts]

    def save_json(self, path: str | Path) -> None:
        """Persist this tokenizer's definition as a small JSON manifest at *path*."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "tokenizer_type": "pure_byte",
            "config": asdict(self),
            "vocab_size": self.vocab_size,
        }
        target.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
|
|
|
|
def default_pure_byte_tokenizer() -> PureByteTokenizer:
    """Return a pure-byte tokenizer with the standard special-token layout."""
    tokenizer = PureByteTokenizer()
    return tokenizer
|
|
|
|
def docs_sidecar_path(docs_jsonl: Path) -> Path:
    """Return the sibling `<stem>.source_manifest.json` path for a docs JSONL file."""
    sidecar_name = f"{docs_jsonl.stem}.source_manifest.json"
    return docs_jsonl.with_name(sidecar_name)
|
|
|
|
def maybe_load_docs_sidecar_meta(docs_jsonl: Path) -> dict[str, Any] | None:
    """Load the sidecar JSON next to *docs_jsonl*, or None when it is absent.

    Raises ValueError if the sidecar exists but is not a JSON object.
    """
    sidecar = docs_sidecar_path(docs_jsonl)
    if not sidecar.is_file():
        return None
    meta = json.loads(sidecar.read_text(encoding="utf-8"))
    if isinstance(meta, dict):
        return meta
    raise ValueError(f"docs sidecar must be a JSON object: {sidecar}")
|
|
|
|
def copy_from_hf_cache(*, repo_id: str, remote_root: str, filename: str, destination: Path) -> bool:
    """Fetch *filename* from the HF dataset repo into *destination*.

    Returns False when the remote entry does not exist. The file is
    materialized via a hard link from the local HF cache when possible,
    falling back to a full copy.
    """
    remote_path = Path(remote_root) / filename if remote_root else Path(filename)
    subfolder = None if remote_path.parent == Path(".") else remote_path.parent.as_posix()
    try:
        cached = hf_hub_download(
            repo_id=repo_id,
            filename=remote_path.name,
            subfolder=subfolder,
            repo_type="dataset",
        )
    except EntryNotFoundError:
        return False

    source = Path(cached).resolve(strict=True)
    destination.parent.mkdir(parents=True, exist_ok=True)
    if destination.exists():
        destination.unlink()
    try:
        # Hard link avoids duplicating a multi-GB file on the same filesystem.
        os.link(source, destination)
    except OSError:
        shutil.copy2(source, destination)
    return True
|
|
|
|
def iter_docs(path: Path):
    """Yield the "text" field of every JSONL record in *path*, in file order."""
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            yield record["text"]
|
|
|
|
def count_docs(path: Path) -> int:
    """Count the newline-delimited records in *path*."""
    total = 0
    with path.open("r", encoding="utf-8") as handle:
        for _ in handle:
            total += 1
    return total
|
|
|
|
def batched_docs_jsonl(path: Path, batch_size: int):
    """Yield lists of up to *batch_size* document texts read from a JSONL file.

    The final batch may be shorter; an empty file yields nothing.
    """
    pending: list[str] = []
    for doc in iter_docs(path):
        pending.append(doc)
        if len(pending) >= batch_size:
            yield pending
            pending = []
    if pending:
        yield pending
|
|
|
|
def write_datafile(path: Path, toks: Any) -> None:
    """Write tokens to *path* as a shard: 256-int32 LE header + uint16 LE payload.

    Header layout: [DATAFILE_MAGIC, DATAFILE_VERSION, token_count, 0, ...].
    Raises ValueError when the token count exceeds int32 range or any token
    id does not fit in uint16.
    """
    count = len(toks)
    if count >= 2**31:
        raise ValueError("token count too large")
    header = np.zeros(256, dtype="<i4")
    header[0] = DATAFILE_MAGIC
    header[1] = DATAFILE_VERSION
    header[2] = count
    arr = np.asarray(toks)
    # Validation is only needed when the input is not already uint16.
    if arr.dtype != np.uint16 and not ((0 <= arr).all() and (arr < 2**16).all()):
        raise ValueError("token dictionary too large for uint16")
    payload = arr.astype("<u2", copy=False)
    with path.open("wb") as out:
        out.write(header.tobytes())
        out.write(payload.tobytes())
|
|
|
|
def relativize_manifest_paths(value: Any, root: Path) -> Any:
    """Recursively rewrite absolute path strings under *root* as POSIX-relative.

    Dicts and lists are walked; a string that is an absolute path inside
    *root* is replaced by its relative POSIX form, and everything else is
    returned unchanged.
    """
    if isinstance(value, dict):
        return {key: relativize_manifest_paths(item, root) for key, item in value.items()}
    if isinstance(value, list):
        return [relativize_manifest_paths(item, root) for item in value]
    if not isinstance(value, str):
        return value
    candidate = Path(value)
    if not candidate.is_absolute():
        return value
    try:
        return candidate.relative_to(root).as_posix()
    except ValueError:
        # Absolute path outside *root*: leave it as-is.
        return value
|
|
|
|
def parse_reuse_sp_models(values: list[str]) -> dict[int, Path]:
    """Parse repeated "VOCAB=MODEL" CLI options into {vocab_size: resolved path}.

    Raises ValueError when the same vocab size appears twice or an entry
    has no "=" separator.
    """
    parsed: dict[int, Path] = {}
    for entry in values:
        size_text, model_text = entry.split("=", 1)
        size = int(size_text)
        if size in parsed:
            raise ValueError(f"duplicate --reuse_sp_model for vocab_size={size}")
        parsed[size] = Path(model_text).expanduser().resolve()
    return parsed
|
|
|
|
def load_specs(config_path: Path) -> list[dict[str, Any]]:
    """Load tokenizer specs from a JSON config file.

    Accepts either a bare list of spec objects or an object holding one
    under "tokenizer_specs" (or the legacy "tokenizers" key). Raises
    ValueError when no non-empty list of objects is found. Returns shallow
    copies so callers may mutate specs freely.
    """
    payload = json.loads(config_path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        specs = payload.get("tokenizer_specs", payload.get("tokenizers"))
    else:
        specs = payload
    if not (isinstance(specs, list) and specs):
        raise ValueError("tokenizer_config must define a non-empty list")
    for spec in specs:
        if not isinstance(spec, dict):
            raise ValueError("each tokenizer spec must be a JSON object")
    return [dict(spec) for spec in specs]
|
|
|
|
def tokenizer_kind(spec: dict[str, Any]) -> str:
    """Classify a tokenizer spec as "byte" or "sentencepiece_bpe".

    Recognition order: an explicit "kind" value, a known builder function
    name, the "byte260" dataset suffix, and finally any spec that declares
    a "vocab_size". Raises ValueError for anything else.
    """
    explicit = spec.get("kind")
    if explicit in ("byte", "pure_byte"):
        return "byte"
    if explicit in ("sentencepiece_bpe", "sentencepiece"):
        return "sentencepiece_bpe"
    # Builders are referenced as "module:function"; only the function matters.
    builder_name = str(spec.get("builder", "")).rsplit(":", 1)[-1]
    if builder_name == "build_pure_byte_tokenizer":
        return "byte"
    if builder_name == "build_sentencepiece_tokenizer":
        return "sentencepiece_bpe"
    if spec.get("dataset_suffix") == "byte260":
        return "byte"
    if "vocab_size" in spec:
        return "sentencepiece_bpe"
    raise ValueError(
        f"unsupported tokenizer spec {spec.get('name', '<unnamed>')!r}: "
        "expected a built-in pure-byte or sentencepiece builder"
    )
|
|
|
|
def write_tokenizer_config_export(output_root: Path, selected_specs: list[dict[str, Any]]) -> Path:
    """Write the selected specs to tokenizer_config.export.json and return its path."""
    export_path = output_root / "tokenizer_config.export.json"
    body = json.dumps({"tokenizers": selected_specs}, indent=2)
    export_path.write_text(body + "\n", encoding="utf-8")
    return export_path
|
|
|
|
def _iter_sentencepiece_text(docs_jsonl: Path, *, max_docs: int | None = None):
    """Yield cleaned, non-empty document texts for SentencePiece training.

    NUL characters are replaced with spaces and whitespace-only docs are
    skipped; iteration stops after reading *max_docs* lines when given.
    """
    with docs_jsonl.open("r", encoding="utf-8") as handle:
        for index, raw_line in enumerate(handle):
            if max_docs is not None and index >= max_docs:
                break
            cleaned = json.loads(raw_line)["text"].replace("\x00", " ").strip()
            if cleaned:
                yield cleaned
|
|
|
|
def build_pure_byte_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
    """Materialize the built-in byte tokenizer and return its export record.

    *docs_jsonl* is accepted only for interface parity with the
    SentencePiece builder and is never read.
    """
    del docs_jsonl
    tokenizer = default_pure_byte_tokenizer()
    manifest_path = tokenizers_dir / spec.get("filename", "fineweb_pure_byte_260.json")
    tokenizer.save_json(manifest_path)
    record = {
        "name": spec.get("name", "pure_byte_260"),
        "kind": "byte",
        "dataset_suffix": spec.get("dataset_suffix", "byte260"),
        "vocab_size": tokenizer.vocab_size,
        "bos_id": tokenizer.bos_id,
        "eos_id": tokenizer.eos_id,
        "encode": tokenizer.encode,
        "encode_batch": tokenizer.encode_batch,
        "manifest": {"path": str(manifest_path), "pad_id": tokenizer.pad_id, "unk_id": tokenizer.unk_id},
    }
    return record
|
|
|
|
def build_sentencepiece_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
    """Train (or reuse) a SentencePiece BPE model and return its export record.

    When spec["reuse_model_path"] is set, the existing .model (and .vocab,
    if present) files are copied instead of retraining. Raises RuntimeError
    when sentencepiece is not installed and FileNotFoundError when a reuse
    model path does not exist.
    """
    try:
        import sentencepiece as spm
    except ImportError as exc:
        raise RuntimeError("sentencepiece is required for SentencePiece tokenizer exports") from exc

    vocab_size = int(spec["vocab_size"])
    prefix = tokenizers_dir / spec.get("model_prefix", f"fineweb_{vocab_size}_bpe")
    # SentencePiece writes "<prefix>.model"/"<prefix>.vocab" by APPENDING the
    # extension. Path.with_suffix would instead replace anything after the last
    # dot in the prefix name (e.g. "sp.32k" -> "sp.model"), pointing at files
    # the trainer never writes, so append the extension explicitly.
    model_path = prefix.parent / f"{prefix.name}.model"
    vocab_path = prefix.parent / f"{prefix.name}.vocab"
    prefix.parent.mkdir(parents=True, exist_ok=True)
    # Remove artifacts from a previous run so stale files cannot be picked up.
    for artifact in (model_path, vocab_path):
        if artifact.exists():
            artifact.unlink()

    reuse_model_path = spec.get("reuse_model_path")
    if reuse_model_path is not None:
        reuse_model_path = Path(reuse_model_path).expanduser().resolve()
        if not reuse_model_path.is_file():
            raise FileNotFoundError(reuse_model_path)
        shutil.copy2(reuse_model_path, model_path)
        # The sibling .vocab file is optional; copy it only when present.
        reuse_vocab_path = reuse_model_path.with_suffix(".vocab")
        if reuse_vocab_path.is_file():
            shutil.copy2(reuse_vocab_path, vocab_path)
    else:
        kwargs = {
            "sentence_iterator": _iter_sentencepiece_text(
                docs_jsonl,
                max_docs=None if spec.get("tokenizer_train_docs") is None else int(spec["tokenizer_train_docs"]),
            ),
            "model_prefix": str(prefix),
            "model_type": "bpe",
            "vocab_size": vocab_size,
            "character_coverage": 0.999,
            "byte_fallback": True,
            "split_digits": True,
            "normalization_rule_name": "nmt_nfkc",
            "add_dummy_prefix": False,
            # Special-token ids match PureByteTokenizer's layout.
            "pad_id": 0,
            "bos_id": 1,
            "eos_id": 2,
            "unk_id": 3,
            "hard_vocab_limit": False,
        }
        # Spec-level overrides win over the defaults above.
        kwargs.update(spec.get("trainer_overrides") or {})
        spm.SentencePieceTrainer.train(**kwargs)

    tok = spm.SentencePieceProcessor(model_file=str(model_path))
    return {
        "name": spec.get("name", f"sp_bpe_{vocab_size}"),
        "kind": "sentencepiece_bpe",
        "dataset_suffix": spec.get("dataset_suffix", f"sp{vocab_size}"),
        # Report the trained size (hard_vocab_limit=False may shrink it).
        "vocab_size": int(tok.vocab_size()),
        "bos_id": int(tok.bos_id()),
        "eos_id": int(tok.eos_id()),
        "encode": lambda text, tok=tok: tok.encode(text, out_type=int),
        "encode_batch": lambda texts, tok=tok: tok.encode(texts, out_type=int, num_threads=TOKENIZER_THREADS),
        "manifest": {"model_path": str(model_path), "vocab_path": str(vocab_path)},
    }
|
|
|
|
def export_shards(
    docs_jsonl: Path,
    tok: dict[str, Any],
    output_dir: Path,
    *,
    num_val_docs: int,
    shard_size: int,
    docs_total: int,
    max_train_tokens: int | None,
) -> dict[str, int]:
    """Tokenize every document in *docs_jsonl* and write fixed-size .bin shards.

    The first *num_val_docs* documents form the "val" split; the remainder is
    "train". Each document is encoded with tok["encode"]/tok["encode_batch"],
    prefixed with tok["bos_id"] (plus an EOS suffix when APPEND_EOS is set),
    then streamed into uint16 shard buffers of *shard_size* tokens. When
    *max_train_tokens* is set, the train split is truncated mid-document at
    exactly that many tokens. Returns per-split doc/file/token counts.

    Raises ValueError when the vocab does not fit uint16, when a token id
    falls outside the declared vocab, or when the final counts disagree with
    *docs_total* / *max_train_tokens*.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Remove shards from any previous export so stale files cannot mix in.
    for pattern in ("fineweb_train_*.bin", "fineweb_val_*.bin"):
        for stale in output_dir.glob(pattern):
            stale.unlink()

    stats = {
        "docs_total": 0,
        "docs_val": 0,
        "docs_train": 0,
        "files_total": 0,
        "files_val": 0,
        "files_train": 0,
        "tokens_total": 0,
        "tokens_val": 0,
        "tokens_train": 0,
    }
    # Reusable shard buffer; `fill` tracks how many slots are occupied.
    buf = np.empty((shard_size,), dtype=np.uint16)
    fill = 0
    # Docs are read in order, so the export starts in the "val" split.
    split = "val"
    shards = {"val": 0, "train": 0}

    def flush() -> None:
        # Write the buffered tokens as the next shard of the current split.
        nonlocal fill
        if fill == 0:
            return
        write_datafile(output_dir / f"fineweb_{split}_{shards[split]:06d}.bin", buf[:fill])
        stats["files_total"] += 1
        stats[f"files_{split}"] += 1
        shards[split] += 1
        fill = 0

    vocab_size = int(tok["vocab_size"])
    if vocab_size > 2**16:
        raise ValueError(f"vocab_size={vocab_size} is too large for uint16 shard storage")

    # Batch-encode when the tokenizer supports it; otherwise one doc at a time.
    batch_encode = tok.get("encode_batch")
    batch_size = SP_BATCH_SIZE if callable(batch_encode) else 1
    for texts in batched_docs_jsonl(docs_jsonl, batch_size):
        encoded_docs = batch_encode(texts) if callable(batch_encode) else [tok["encode"](text) for text in texts]
        for text, encoded in zip(texts, encoded_docs, strict=True):
            del text
            split_for_doc = "val" if stats["docs_total"] < num_val_docs else "train"
            # Train-token budget already exhausted: stop before this doc.
            if split_for_doc == "train" and max_train_tokens is not None and stats["tokens_train"] >= max_train_tokens:
                flush()
                return stats
            # Crossing the val->train boundary closes the current shard so
            # shards never mix splits.
            if split_for_doc != split:
                flush()
                split = split_for_doc

            # Assemble [BOS] + doc tokens (+ [EOS]) in int32 for range checks.
            encoded_arr = np.asarray(encoded, dtype=np.int32)
            toks = np.empty((encoded_arr.size + 1 + int(APPEND_EOS),), dtype=np.int32)
            toks[0] = tok["bos_id"]
            toks[1 : 1 + encoded_arr.size] = encoded_arr
            if APPEND_EOS:
                toks[-1] = tok["eos_id"]
            if not ((0 <= toks).all() and (toks < vocab_size).all()):
                bad = int(toks[(toks < 0) | (toks >= vocab_size)][0])
                raise ValueError(f"token id {bad} outside declared vocab_size={vocab_size}")
            # Truncate mid-document to hit max_train_tokens exactly.
            if split == "train" and max_train_tokens is not None:
                remaining_train_tokens = max_train_tokens - stats["tokens_train"]
                if remaining_train_tokens <= 0:
                    flush()
                    return stats
                if len(toks) > remaining_train_tokens:
                    toks = toks[:remaining_train_tokens]
            toks = toks.astype("<u2", copy=False)

            stats["docs_total"] += 1
            stats[f"docs_{split}"] += 1
            stats["tokens_total"] += len(toks)
            stats[f"tokens_{split}"] += len(toks)

            # Copy the doc into the shard buffer, flushing each time it fills;
            # a single doc may span multiple shards.
            pos = 0
            while pos < len(toks):
                take = min(shard_size - fill, len(toks) - pos)
                buf[fill : fill + take] = toks[pos : pos + take]
                fill += take
                pos += take
                if fill == shard_size:
                    flush()

        if stats["docs_total"] and stats["docs_total"] % 100_000 == 0:
            print(f"{output_dir.name}: {stats['docs_total']}/{docs_total} docs", flush=True)

    flush()
    if max_train_tokens is None and stats["docs_total"] != docs_total:
        raise ValueError(f"expected {docs_total} docs, exported {stats['docs_total']}")
    if max_train_tokens is not None and stats["tokens_train"] < max_train_tokens:
        raise ValueError(
            f"requested max_train_tokens={max_train_tokens}, but only exported {stats['tokens_train']} train tokens"
        )
    return stats
|
|
|
|
def build_tokenizers(
    *,
    specs: list[dict[str, Any]],
    docs_jsonl: Path,
    tokenizers_dir: Path,
    tokenizer_train_docs: int | None,
    skip_byte: bool,
    reuse_sp_models: dict[int, Path],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Build every tokenizer requested by *specs*.

    Applies CLI overrides (training-doc cap, reusable SentencePiece models)
    to each SentencePiece spec, skips byte tokenizers when *skip_byte* is
    set, and delegates construction to the per-kind builders. Returns
    (tokenizer records ready for export_shards, the effective specs that
    were built). Raises ValueError on duplicate tokenizer or dataset names,
    or when filtering leaves nothing to build.
    """
    tokenizers: list[dict[str, Any]] = []
    selected_specs: list[dict[str, Any]] = []
    seen_names: set[str] = set()
    seen_datasets: set[str] = set()

    for raw_spec in specs:
        # Copy so CLI-driven overrides never mutate the caller's spec list.
        spec = dict(raw_spec)
        kind = tokenizer_kind(spec)
        if skip_byte and kind == "byte":
            continue
        if kind == "sentencepiece_bpe":
            if tokenizer_train_docs is not None:
                spec["tokenizer_train_docs"] = int(tokenizer_train_docs)
            vocab_size = int(spec["vocab_size"])
            if vocab_size in reuse_sp_models:
                spec["reuse_model_path"] = str(reuse_sp_models[vocab_size])

        # Record the effective spec (with overrides applied) for the manifest.
        selected_specs.append(spec)
        built = (
            build_pure_byte_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
            if kind == "byte"
            else build_sentencepiece_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
        )
        name = str(built["name"])
        dataset_suffix = built.get("dataset_suffix")
        dataset_name = str(built.get("dataset_name", f"fineweb{VERSION}_{dataset_suffix}"))
        if name in seen_names:
            raise ValueError(f"duplicate tokenizer name: {name}")
        if dataset_name in seen_datasets:
            raise ValueError(f"duplicate dataset name: {dataset_name}")
        seen_names.add(name)
        seen_datasets.add(dataset_name)
        vocab_size = int(built["vocab_size"])
        # Default: round vocab up to a multiple of 128, then scale by 5.
        recommended_bigram_vocab_size = int(
            built.get("recommended_bigram_vocab_size", ((vocab_size + 127) // 128) * 128 * 5)
        )
        tokenizers.append(
            {
                "name": name,
                "kind": str(built["kind"]),
                "dataset_name": dataset_name,
                "vocab_size": vocab_size,
                "bos_id": int(built["bos_id"]),
                "eos_id": int(built["eos_id"]),
                "encode": built["encode"],
                "encode_batch": built.get("encode_batch"),
                "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
                "manifest": {
                    "name": name,
                    "kind": str(built["kind"]),
                    "vocab_size": vocab_size,
                    "bos_id": int(built["bos_id"]),
                    "eos_id": int(built["eos_id"]),
                    "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
                    "source_spec": spec,
                    **(built.get("manifest") or {}),
                },
            }
        )
    if not tokenizers:
        raise ValueError("tokenizer_config produced no tokenizers after filtering")
    return tokenizers, selected_specs
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the download-and-tokenize script."""
    p = argparse.ArgumentParser(
        description="Download docs_selected.jsonl from a Hugging Face dataset repo and tokenize it locally"
    )
    p.add_argument("--repo-id", default=DEFAULT_REPO_ID, help="Hugging Face dataset repo id, for example user/dataset")
    p.add_argument(
        "--remote-root",
        default=DEFAULT_REMOTE_ROOT,
        help="Optional subdirectory inside the dataset repo that contains docs_selected.jsonl",
    )
    p.add_argument(
        "--output-root",
        required=True,
        help="Directory where docs, tokenizers, shards, and manifest are written",
    )
    p.add_argument(
        "--tokenizer-config",
        default=str(DEFAULT_CONFIG),
        help="Local tokenizer config JSON. Defaults to data/tokenizer_specs.json.",
    )
    p.add_argument(
        "--num-val-docs",
        type=int,
        default=None,
        help="Validation document count. Defaults to the downloaded sidecar when present, otherwise 50000.",
    )
    p.add_argument("--chunk-tokens", type=int, default=SHARD_SIZE, help="Shard size in tokens.")
    p.add_argument(
        "--max-train-tokens",
        type=int,
        default=None,
        help="Optional cap on exported training tokens after the full val split. "
        "For example, 8000000000 writes 8B train tokens (80 shards at the default shard size).",
    )
    p.add_argument(
        "--tokenizer-train-docs",
        type=int,
        default=None,
        help="Limit the number of docs used for tokenizer training.",
    )
    p.add_argument("--skip-byte", action="store_true", help="Skip byte-tokenizer export.")
    p.add_argument(
        "--reuse-sp-model",
        action="append",
        default=[],
        metavar="VOCAB=MODEL",
        help="Reuse an existing SentencePiece model for the given vocab size instead of retraining it.",
    )
    return p
|
|
|
|
def main() -> None:
    """Entry point: download docs, build tokenizers, export shards, write manifest."""
    args = build_parser().parse_args()
    if args.chunk_tokens <= 0:
        # Message fixed to match the actual CLI spelling (--chunk-tokens).
        raise ValueError(f"--chunk-tokens must be positive, got {args.chunk_tokens}")
    if args.max_train_tokens is not None and args.max_train_tokens <= 0:
        raise ValueError(f"--max-train-tokens must be positive, got {args.max_train_tokens}")

    output_root = Path(args.output_root).expanduser().resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    tokenizers_dir = output_root / "tokenizers"
    datasets_dir = output_root / "datasets"
    tokenizers_dir.mkdir(parents=True, exist_ok=True)
    datasets_dir.mkdir(parents=True, exist_ok=True)

    # Fetch the selected docs (required) and their sidecar metadata (optional).
    docs_jsonl = output_root / DOCS_FILENAME
    sidecar = output_root / SIDECAR_FILENAME
    if not copy_from_hf_cache(
        repo_id=args.repo_id,
        remote_root=args.remote_root,
        filename=DOCS_FILENAME,
        destination=docs_jsonl,
    ):
        remote = f"{args.remote_root}/{DOCS_FILENAME}" if args.remote_root else DOCS_FILENAME
        raise FileNotFoundError(f"{remote} not found in Hugging Face dataset repo {args.repo_id}")
    if not copy_from_hf_cache(
        repo_id=args.repo_id,
        remote_root=args.remote_root,
        filename=SIDECAR_FILENAME,
        destination=sidecar,
    ):
        # No remote sidecar: drop any stale local copy so it is not misread.
        sidecar.unlink(missing_ok=True)

    # Prefer sidecar-provided counts; fall back to scanning the JSONL.
    docs_sidecar = maybe_load_docs_sidecar_meta(docs_jsonl)
    docs_total = int(docs_sidecar["num_docs"]) if docs_sidecar is not None and docs_sidecar.get("num_docs") is not None else count_docs(docs_jsonl)
    if args.num_val_docs is not None:
        num_val_docs = int(args.num_val_docs)
    elif docs_sidecar is not None and docs_sidecar.get("docs_val") is not None:
        num_val_docs = int(docs_sidecar["docs_val"])
    else:
        num_val_docs = NUM_VAL_DOCS
    if not (0 <= num_val_docs <= docs_total):
        raise ValueError(f"num_val_docs must be in [0, {docs_total}], got {num_val_docs}")

    # Build (or reuse) every tokenizer requested by the config.
    specs = load_specs(Path(args.tokenizer_config).expanduser().resolve())
    reuse_sp_models = parse_reuse_sp_models(args.reuse_sp_model)
    tokenizers, selected_specs = build_tokenizers(
        specs=specs,
        docs_jsonl=docs_jsonl,
        tokenizers_dir=tokenizers_dir,
        tokenizer_train_docs=args.tokenizer_train_docs,
        skip_byte=args.skip_byte,
        reuse_sp_models=reuse_sp_models,
    )
    write_tokenizer_config_export(output_root, selected_specs)

    docs_meta = {
        "remote_repo_id": args.repo_id,
        "remote_root": args.remote_root,
        "num_docs": docs_total,
        "docs_sha256": None if docs_sidecar is None else docs_sidecar.get("docs_sha256"),
        "source_manifest": str(docs_sidecar_path(docs_jsonl)) if docs_sidecar is not None else None,
    }
    if docs_sidecar is not None:
        docs_meta["source_sidecar"] = docs_sidecar

    manifest = {
        "version": VERSION,
        "num_docs": docs_total,
        "num_val_docs": num_val_docs,
        "max_train_tokens": args.max_train_tokens,
        "shuffle_seed": None if docs_sidecar is None else docs_sidecar.get("shuffle_seed"),
        "shard_size": int(args.chunk_tokens),
        "append_eos": APPEND_EOS,
        "docs_jsonl": str(docs_jsonl),
        "docs_meta": docs_meta,
        "tokenizer_specs": selected_specs,
        "tokenizers": [],
        "datasets": [],
    }

    # Export one shard directory per tokenizer and record it in the manifest.
    for tok in tokenizers:
        output_dir = datasets_dir / tok["dataset_name"]
        print(f"Exporting dataset: {tok['dataset_name']}", flush=True)
        stats = export_shards(
            docs_jsonl,
            tok,
            output_dir,
            num_val_docs=num_val_docs,
            shard_size=int(args.chunk_tokens),
            docs_total=docs_total,
            max_train_tokens=args.max_train_tokens,
        )
        manifest["tokenizers"].append(tok["manifest"])
        manifest["datasets"].append(
            {
                "name": tok["dataset_name"],
                "tokenizer_name": tok["name"],
                "tokenizer_kind": tok["kind"],
                "path": str(output_dir),
                "train_glob": str(output_dir / "fineweb_train_*.bin"),
                "val_glob": str(output_dir / "fineweb_val_*.bin"),
                "vocab_size": tok["vocab_size"],
                "bos_id": tok["bos_id"],
                "eos_id": tok["eos_id"],
                "recommended_bigram_vocab_size": tok["recommended_bigram_vocab_size"],
                "stats": stats,
            }
        )

    # Store paths relative to output_root so the export tree is relocatable.
    manifest = relativize_manifest_paths(manifest, output_root)
    manifest_path = output_root / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")
    print(f"Done. Manifest: {manifest_path}", flush=True)
|
|
|
|
# Run the CLI entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|