|
|
|
|
|
"""Build and push a DatasetDict from local JSONL files.""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import os |
|
|
from pathlib import Path |
|
|
|
|
|
from datasets import DatasetDict, load_dataset |
|
|
from huggingface_hub import create_repo |
|
|
|
|
|
|
|
|
def parse_args() -> argparse.Namespace: |
|
|
parser = argparse.ArgumentParser( |
|
|
description="Push train/validation/test JSONL files to a Hugging Face dataset repo." |
|
|
) |
|
|
parser.add_argument( |
|
|
"--dataset-id", |
|
|
required=True, |
|
|
help="Dataset repo id (for example: username/dataset-name).", |
|
|
) |
|
|
parser.add_argument("--train", help="Path to train JSONL file.") |
|
|
parser.add_argument("--validation", help="Path to validation JSONL file.") |
|
|
parser.add_argument("--test", help="Path to test JSONL file.") |
|
|
parser.add_argument( |
|
|
"--private", |
|
|
action="store_true", |
|
|
help="Create/update the dataset as private.", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--token", |
|
|
default=os.environ.get("HF_TOKEN"), |
|
|
help="HF access token. Defaults to HF_TOKEN env var.", |
|
|
) |
|
|
return parser.parse_args() |
|
|
|
|
|
|
|
|
def validate_split_path(name: str, path: str | None) -> Path | None: |
|
|
if not path: |
|
|
return None |
|
|
|
|
|
file_path = Path(path).expanduser().resolve() |
|
|
if not file_path.exists(): |
|
|
raise FileNotFoundError(f"{name} file not found: {file_path}") |
|
|
if file_path.suffix.lower() != ".jsonl": |
|
|
raise ValueError(f"{name} file must be a .jsonl file: {file_path}") |
|
|
return file_path |
|
|
|
|
|
|
|
|
def main() -> None: |
|
|
args = parse_args() |
|
|
|
|
|
split_paths = { |
|
|
"train": validate_split_path("train", args.train), |
|
|
"validation": validate_split_path("validation", args.validation), |
|
|
"test": validate_split_path("test", args.test), |
|
|
} |
|
|
split_paths = {k: str(v) for k, v in split_paths.items() if v is not None} |
|
|
|
|
|
if not split_paths: |
|
|
raise ValueError("Provide at least one split: --train, --validation, or --test.") |
|
|
|
|
|
if not args.token: |
|
|
raise ValueError("Set HF_TOKEN or pass --token to push a dataset.") |
|
|
|
|
|
try: |
|
|
create_repo( |
|
|
repo_id=args.dataset_id, |
|
|
repo_type="dataset", |
|
|
private=args.private, |
|
|
exist_ok=True, |
|
|
token=args.token, |
|
|
) |
|
|
except Exception as exc: |
|
|
raise SystemExit(f"Failed to create/access dataset repo '{args.dataset_id}': {exc}") from exc |
|
|
|
|
|
split_datasets = {} |
|
|
for split_name, path in split_paths.items(): |
|
|
split_datasets[split_name] = load_dataset( |
|
|
"json", |
|
|
data_files={split_name: path}, |
|
|
split=split_name, |
|
|
) |
|
|
print(f"Loaded {split_name}: {len(split_datasets[split_name])} rows from {path}") |
|
|
|
|
|
dataset_dict = DatasetDict(split_datasets) |
|
|
try: |
|
|
dataset_dict.push_to_hub( |
|
|
repo_id=args.dataset_id, |
|
|
private=args.private, |
|
|
token=args.token, |
|
|
) |
|
|
except Exception as exc: |
|
|
raise SystemExit(f"Failed to push dataset '{args.dataset_id}': {exc}") from exc |
|
|
|
|
|
print(f"Pushed dataset to: https://huggingface.co/datasets/{args.dataset_id}") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|