Add reproducible WikiText corpus prep

Files changed (3) hide show

.gitignore +1 -0
scripts/prepare_wikitext103.py +76 -0
src/dropout_decay/data.py +6 -2

.gitignore CHANGED Viewed

@@ -2,6 +2,7 @@
 __pycache__/
 *.py[cod]
 .cache/
 *.npy
 *.pdf
 .venv/

 __pycache__/
 *.py[cod]
 .cache/
+data/
 *.npy
 *.pdf
 .venv/

scripts/prepare_wikitext103.py ADDED Viewed

	@@ -0,0 +1,76 @@

+#!/usr/bin/env python3
+"""Download the public WikiText-103 raw parquet used for corpus holdouts."""
+from __future__ import annotations
+import argparse
+import hashlib
+from pathlib import Path
+from urllib.request import urlretrieve
+# Direct-download URL of the single parquet file this script fetches.
+# NOTE(review): the 40-character hex path segment after "resolve/" looks like
+# a pinned dataset revision, which would keep the download reproducible --
+# confirm against the hosting service's URL scheme.
+WIKITEXT103_RAW_TRAIN_URL = (
+    "https://huggingface.co/datasets/Salesforce/wikitext/resolve/"
+    "6231e49f19a707241d6b84d9cff60a3a86b85a85/"
+    "wikitext-103-raw-v1/train-00001-of-00002.parquet?download=true"
+)
+# Exact byte size and SHA-256 hex digest a file must have to be accepted;
+# verify_file compares against both.
+EXPECTED_BYTES = 156_700_942
+EXPECTED_SHA256 = "75aa65dee9de2a7c10ba1808efd2408c3f4eb008104c3ccac47f8ed19300ebdd"
+def sha256(path: Path) -> str:
+    """Return the hexadecimal SHA-256 digest of the file at ``path``.
+
+    The file is consumed in 1 MiB reads, so its full contents are never
+    held in memory at once.
+    """
+    block_size = 1024 * 1024
+    hasher = hashlib.sha256()
+    with path.open("rb") as stream:
+        while True:
+            piece = stream.read(block_size)
+            if not piece:
+                # An empty read marks end of file.
+                break
+            hasher.update(piece)
+    return hasher.hexdigest()
+def verify_file(path: Path) -> None:
+    """Exit with an error unless ``path`` has the expected size and digest.
+
+    The byte count is compared first, so a file of the wrong size is
+    rejected without being hashed.
+
+    Raises:
+        SystemExit: If the size differs from ``EXPECTED_BYTES`` or the
+            SHA-256 digest differs from ``EXPECTED_SHA256``.
+    """
+    byte_count = path.stat().st_size
+    if byte_count != EXPECTED_BYTES:
+        message = f"{path} has {byte_count:,} bytes; expected {EXPECTED_BYTES:,}."
+        raise SystemExit(message)
+    digest = sha256(path)
+    if digest == EXPECTED_SHA256:
+        return
+    raise SystemExit(f"{path} has sha256 {digest}; expected {EXPECTED_SHA256}.")
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the corpus preparation script.
+
+    Returns:
+        A namespace with ``output_dir`` (a ``Path``, defaulting to
+        ``data/wikitext103_raw``) and ``force`` (a bool, false unless
+        ``--force`` is passed).
+    """
+    cli = argparse.ArgumentParser(
+        description="Prepare the WikiText-103 raw parquet corpus holdout."
+    )
+    # Each entry is (flag, keyword arguments for add_argument).
+    options = (
+        (
+            "--output-dir",
+            {
+                "type": Path,
+                "default": Path("data/wikitext103_raw"),
+                "help": "Directory where the parquet file should be stored.",
+            },
+        ),
+        (
+            "--force",
+            {
+                "action": "store_true",
+                "help": "Download again even if the target file already exists.",
+            },
+        ),
+    )
+    for flag, settings in options:
+        cli.add_argument(flag, **settings)
+    return cli.parse_args()
+def main() -> None:
+    """Ensure a verified copy of the parquet file exists in the output directory.
+
+    If the target file already exists and ``--force`` was not given, it is
+    only verified. Otherwise the file is downloaded to a temporary ``.part``
+    sibling, verified there, and moved onto the target path only once it has
+    passed verification. A failed or interrupted download therefore never
+    leaves a bad file at the target path, and never destroys an existing
+    good copy when ``--force`` is used.
+
+    The target path is printed on success.
+
+    Raises:
+        SystemExit: Propagated from ``verify_file`` when the existing or the
+            freshly downloaded file has the wrong size or digest. For a
+            fresh download the message names the ``.part`` path.
+    """
+    args = parse_args()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    target = args.output_dir / "train-00001-of-00002.parquet"
+    if target.exists() and not args.force:
+        verify_file(target)
+        print(target)
+        return
+    print(f"Downloading WikiText-103 raw train parquet to {target}")
+    # Stage the download next to the target so the final move stays within
+    # one directory (and thus one filesystem).
+    partial = target.with_name(target.name + ".part")
+    try:
+        urlretrieve(WIKITEXT103_RAW_TRAIN_URL, partial)
+        verify_file(partial)
+        partial.replace(target)
+    finally:
+        # After a successful move the staging file no longer exists; it is
+        # only still present when the download or verification failed.
+        if partial.exists():
+            partial.unlink()
+    print(target)
+if __name__ == "__main__":
+    main()

src/dropout_decay/data.py CHANGED Viewed

@@ -41,6 +41,10 @@ class CachedTokenizer:
     vocab_size: int
 def resolve_paths(corpus: str | None, corpus_glob: str | None) -> list[Path]:
     paths: list[Path] = []
     if corpus:
@@ -83,7 +87,7 @@ def load_cached_splits(
     tokenizer = CachedTokenizer(vocab_size=vocab_size)
     tokens = np.load(encoded_path, mmap_mode="r")
-    need_total = max_required_train_tokens + val_tokens
     if len(tokens) < need_total and not allow_short_corpus:
         raise ValueError(
             f"cached token file has {len(tokens):,} tokens, but {need_total:,} "
@@ -173,7 +177,7 @@ def encode_corpus(
     dtype = np.uint16 if tokenizer.vocab_size <= np.iinfo(np.uint16).max else np.uint32
     encoded_path = output_dir / f"tokens-v{tokenizer.vocab_size}-{dtype.__name__}.npy"
     tokenizer_path = output_dir / f"tokenizer-v{tokenizer.vocab_size}.json"
-    need_total = max_required_train_tokens + val_tokens
     encode_needed = force_reencode or not encoded_path.exists()
     if not encode_needed:
         cached_tokens = np.load(encoded_path, mmap_mode="r")

     vocab_size: int
+def required_token_count(
+    max_required_train_tokens: int,
+    val_tokens: int,
+    *,
+    min_val_multiple: int = 10,
+) -> int:
+    """Return the minimum number of tokens a corpus must provide.
+
+    The result is the larger of two requirements: enough tokens for the
+    largest training request plus the validation split, and a floor of
+    ``min_val_multiple`` times the validation split.
+
+    Args:
+        max_required_train_tokens: Largest number of training tokens needed.
+        val_tokens: Number of tokens reserved for validation.
+        min_val_multiple: Floor on the total, expressed as a multiple of
+            ``val_tokens``. Defaults to 10.
+
+    Returns:
+        The required total token count.
+    """
+    train_plus_val = max_required_train_tokens + val_tokens
+    val_floor = val_tokens * min_val_multiple
+    return max(train_plus_val, val_floor)
 def resolve_paths(corpus: str | None, corpus_glob: str | None) -> list[Path]:
     paths: list[Path] = []
     if corpus:
     tokenizer = CachedTokenizer(vocab_size=vocab_size)
     tokens = np.load(encoded_path, mmap_mode="r")
+    need_total = required_token_count(max_required_train_tokens, val_tokens)
     if len(tokens) < need_total and not allow_short_corpus:
         raise ValueError(
             f"cached token file has {len(tokens):,} tokens, but {need_total:,} "
     dtype = np.uint16 if tokenizer.vocab_size <= np.iinfo(np.uint16).max else np.uint32
     encoded_path = output_dir / f"tokens-v{tokenizer.vocab_size}-{dtype.__name__}.npy"
     tokenizer_path = output_dir / f"tokenizer-v{tokenizer.vocab_size}.json"
+    need_total = required_token_count(max_required_train_tokens, val_tokens)
     encode_needed = force_reencode or not encoded_path.exists()
     if not encode_needed:
         cached_tokens = np.load(encoded_path, mmap_mode="r")