phdm-21d-embedding / scripts /push_jsonl_dataset.py
issdandavis
feat: add hugging face datasets setup tooling
f47473f
#!/usr/bin/env python
"""Build and push a DatasetDict from local JSONL files."""
from __future__ import annotations
import argparse
import os
from pathlib import Path
from datasets import DatasetDict, load_dataset
from huggingface_hub import create_repo
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Push train/validation/test JSONL files to a Hugging Face dataset repo."
)
parser.add_argument(
"--dataset-id",
required=True,
help="Dataset repo id (for example: username/dataset-name).",
)
parser.add_argument("--train", help="Path to train JSONL file.")
parser.add_argument("--validation", help="Path to validation JSONL file.")
parser.add_argument("--test", help="Path to test JSONL file.")
parser.add_argument(
"--private",
action="store_true",
help="Create/update the dataset as private.",
)
parser.add_argument(
"--token",
default=os.environ.get("HF_TOKEN"),
help="HF access token. Defaults to HF_TOKEN env var.",
)
return parser.parse_args()
def validate_split_path(name: str, path: str | None) -> Path | None:
if not path:
return None
file_path = Path(path).expanduser().resolve()
if not file_path.exists():
raise FileNotFoundError(f"{name} file not found: {file_path}")
if file_path.suffix.lower() != ".jsonl":
raise ValueError(f"{name} file must be a .jsonl file: {file_path}")
return file_path
def main() -> None:
args = parse_args()
split_paths = {
"train": validate_split_path("train", args.train),
"validation": validate_split_path("validation", args.validation),
"test": validate_split_path("test", args.test),
}
split_paths = {k: str(v) for k, v in split_paths.items() if v is not None}
if not split_paths:
raise ValueError("Provide at least one split: --train, --validation, or --test.")
if not args.token:
raise ValueError("Set HF_TOKEN or pass --token to push a dataset.")
try:
create_repo(
repo_id=args.dataset_id,
repo_type="dataset",
private=args.private,
exist_ok=True,
token=args.token,
)
except Exception as exc: # pragma: no cover - network/hub failure path
raise SystemExit(f"Failed to create/access dataset repo '{args.dataset_id}': {exc}") from exc
split_datasets = {}
for split_name, path in split_paths.items():
split_datasets[split_name] = load_dataset(
"json",
data_files={split_name: path},
split=split_name,
)
print(f"Loaded {split_name}: {len(split_datasets[split_name])} rows from {path}")
dataset_dict = DatasetDict(split_datasets)
try:
dataset_dict.push_to_hub(
repo_id=args.dataset_id,
private=args.private,
token=args.token,
)
except Exception as exc: # pragma: no cover - network/hub failure path
raise SystemExit(f"Failed to push dataset '{args.dataset_id}': {exc}") from exc
print(f"Pushed dataset to: https://huggingface.co/datasets/{args.dataset_id}")
if __name__ == "__main__":
main()