#!/usr/bin/env python
"""Build and push a DatasetDict from local JSONL files."""
from __future__ import annotations
import argparse
import os
from pathlib import Path
from datasets import DatasetDict, load_dataset
from huggingface_hub import create_repo
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments for the dataset-push script.

    Args:
        argv: Argument list to parse. Defaults to ``None``, in which case
            argparse reads ``sys.argv[1:]`` as before; accepting an explicit
            list keeps existing callers working while making the parser
            unit-testable.

    Returns:
        Namespace with ``dataset_id``, ``train``, ``validation``, ``test``,
        ``private`` (bool flag), and ``token`` (falls back to the HF_TOKEN
        environment variable).
    """
    parser = argparse.ArgumentParser(
        description="Push train/validation/test JSONL files to a Hugging Face dataset repo."
    )
    parser.add_argument(
        "--dataset-id",
        required=True,
        help="Dataset repo id (for example: username/dataset-name).",
    )
    # Each split is optional individually; main() enforces "at least one".
    parser.add_argument("--train", help="Path to train JSONL file.")
    parser.add_argument("--validation", help="Path to validation JSONL file.")
    parser.add_argument("--test", help="Path to test JSONL file.")
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create/update the dataset as private.",
    )
    parser.add_argument(
        "--token",
        default=os.environ.get("HF_TOKEN"),
        help="HF access token. Defaults to HF_TOKEN env var.",
    )
    return parser.parse_args(argv)
def validate_split_path(name: str, path: str | None) -> Path | None:
    """Validate that a user-supplied split path is an existing .jsonl file.

    Args:
        name: Split name used in error messages (e.g. "train").
        path: User-supplied path string, or None/empty when the split was
            not provided on the command line.

    Returns:
        The resolved absolute Path, or None when *path* is falsy.

    Raises:
        FileNotFoundError: If the path does not refer to an existing regular file.
        ValueError: If the file does not have a .jsonl extension.
    """
    if not path:
        return None
    file_path = Path(path).expanduser().resolve()
    # is_file() (not exists()) also rejects directories named like "*.jsonl",
    # which would otherwise slip through and fail later inside load_dataset.
    if not file_path.is_file():
        raise FileNotFoundError(f"{name} file not found: {file_path}")
    if file_path.suffix.lower() != ".jsonl":
        raise ValueError(f"{name} file must be a .jsonl file: {file_path}")
    return file_path
def main() -> None:
    """Validate CLI inputs, load the requested JSONL splits, and push them to the Hub."""
    args = parse_args()

    # Collect only the splits the user actually provided, keyed by split name,
    # as validated string paths.
    candidates = (
        ("train", args.train),
        ("validation", args.validation),
        ("test", args.test),
    )
    split_paths: dict[str, str] = {}
    for split_name, raw_path in candidates:
        checked = validate_split_path(split_name, raw_path)
        if checked is not None:
            split_paths[split_name] = str(checked)

    # Fail fast before touching the network.
    if not split_paths:
        raise ValueError("Provide at least one split: --train, --validation, or --test.")
    if not args.token:
        raise ValueError("Set HF_TOKEN or pass --token to push a dataset.")

    # Ensure the target repo exists (idempotent thanks to exist_ok=True).
    try:
        create_repo(
            repo_id=args.dataset_id,
            repo_type="dataset",
            private=args.private,
            exist_ok=True,
            token=args.token,
        )
    except Exception as exc:  # pragma: no cover - network/hub failure path
        raise SystemExit(f"Failed to create/access dataset repo '{args.dataset_id}': {exc}") from exc

    # Load each split individually so per-split row counts can be reported.
    split_datasets = {}
    for split_name, path in split_paths.items():
        loaded = load_dataset(
            "json",
            data_files={split_name: path},
            split=split_name,
        )
        split_datasets[split_name] = loaded
        print(f"Loaded {split_name}: {len(loaded)} rows from {path}")

    dataset_dict = DatasetDict(split_datasets)
    try:
        dataset_dict.push_to_hub(
            repo_id=args.dataset_id,
            private=args.private,
            token=args.token,
        )
    except Exception as exc:  # pragma: no cover - network/hub failure path
        raise SystemExit(f"Failed to push dataset '{args.dataset_id}': {exc}") from exc

    print(f"Pushed dataset to: https://huggingface.co/datasets/{args.dataset_id}")
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()