dropout-decay / scripts /prepare_tinystories.py
Mandeep Sidhu
Add TinyStories corpus holdout results
20c4ec9
#!/usr/bin/env python3
"""Download the public TinyStories parquet shard used for corpus holdouts."""
from __future__ import annotations
import argparse
import hashlib
from pathlib import Path
from urllib.request import urlretrieve
# Direct download URL for TinyStories train shard 00000 of 00004 on Hugging
# Face. The 40-character hex path segment after "resolve/" appears to pin a
# specific dataset revision, so the served bytes should stay stable -- TODO
# confirm against the dataset repository.
TINYSTORIES_TRAIN_URL = (
"https://huggingface.co/datasets/roneneldan/TinyStories/resolve/"
"f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/"
"data/train-00000-of-00004-2d5a1467fff1081b.parquet?download=true"
)
# Exact size in bytes and SHA-256 hex digest that verify_file() requires of
# the shard; any mismatch aborts the script.
EXPECTED_BYTES = 248_731_111
EXPECTED_SHA256 = "77cf780cebe52b6e83e3a2ac84bc56d8059363113e41d17a023f1d8b2ed0fc0b"
def sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is read in 1 MiB pieces so it is never loaded into memory whole.
    """
    piece_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:
                # An empty read signals end of file.
                break
            hasher.update(piece)
    return hasher.hexdigest()
def verify_file(path: Path) -> None:
    """Check the file at *path* against the pinned size and SHA-256 digest.

    The size is compared first; the file is hashed only when the size matches.

    Raises:
        SystemExit: If the byte count differs from ``EXPECTED_BYTES`` or the
            digest differs from ``EXPECTED_SHA256``.
    """
    size = path.stat().st_size
    if size != EXPECTED_BYTES:
        size_message = f"{path} has {size:,} bytes; expected {EXPECTED_BYTES:,}."
        raise SystemExit(size_message)

    actual = sha256(path)
    if actual == EXPECTED_SHA256:
        return
    raise SystemExit(f"{path} has sha256 {actual}; expected {EXPECTED_SHA256}.")
def parse_args() -> argparse.Namespace:
    """Parse this script's command-line options from ``sys.argv``.

    Returns:
        A namespace with ``output_dir`` (a ``Path``, defaulting to
        ``data/tinystories``) and ``force`` (a bool, ``False`` unless
        ``--force`` is passed).
    """
    cli = argparse.ArgumentParser(
        description="Prepare the TinyStories public corpus holdout."
    )
    # Each entry is (flag, keyword arguments for add_argument), registered in
    # order so the generated help text lists the options in this sequence.
    option_specs = (
        (
            "--output-dir",
            {
                "type": Path,
                "default": Path("data/tinystories"),
                "help": "Directory where the parquet file should be stored.",
            },
        ),
        (
            "--force",
            {
                "action": "store_true",
                "help": "Download again even if the target file already exists.",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()
    return options
def main() -> None:
    """Ensure a verified copy of the TinyStories shard exists on disk.

    An existing target file is verified and reused unless ``--force`` is
    given. Otherwise the shard is downloaded to a sibling ``.part`` file,
    verified there, and only then moved onto the target path. An interrupted
    or corrupt download therefore never leaves a bad file at the target and
    never clobbers a previously verified copy. The target path is printed on
    success.

    Raises:
        SystemExit: If the existing or freshly downloaded file fails
            verification. For a fresh download the message names the
            temporary ``.part`` path, which is removed afterwards.
    """
    args = parse_args()
    args.output_dir.mkdir(parents=True, exist_ok=True)
    target = args.output_dir / "train-00000-of-00004.parquet"
    if target.exists() and not args.force:
        verify_file(target)
        print(target)
        return

    print(f"Downloading TinyStories train parquet shard to {target}")
    # Stage the download in the same directory so the final rename stays on
    # one filesystem and replaces the target in a single step.
    partial = target.with_name(target.name + ".part")
    try:
        urlretrieve(TINYSTORIES_TRAIN_URL, partial)
        verify_file(partial)
        partial.replace(target)
    finally:
        # After a successful replace the staging file no longer exists; it is
        # only still present when the download or verification failed.
        if partial.exists():
            partial.unlink()
    print(target)


if __name__ == "__main__":
    main()