#!/usr/bin/env python
"""Quick dataset loader for Hugging Face Hub datasets."""
from __future__ import annotations
import argparse
import os
from datasets import load_dataset
from huggingface_hub import HfApi
def parse_args() -> argparse.Namespace:
    """Parse command-line options for previewing a Hub dataset split.

    Returns:
        argparse.Namespace with dataset_id, split, limit, streaming, and token.
    """
    parser = argparse.ArgumentParser(
        description="Load and preview a dataset split from Hugging Face Hub."
    )
    # Alias keeps the option declarations compact and uniform below.
    opt = parser.add_argument
    opt(
        "--dataset-id",
        default="issdandavis/scbe-aethermoore-knowledge-base",
        help="Dataset repo id on Hugging Face Hub (for example: username/dataset-name).",
    )
    opt("--split", default="train", help="Split to load.")
    opt(
        "--limit",
        type=int,
        default=3,
        help="How many examples to print from the split.",
    )
    opt(
        "--streaming",
        action="store_true",
        help="Stream examples without downloading the full dataset.",
    )
    # Token default is resolved at call time, so the environment is read
    # when parse_args() runs rather than at import.
    opt(
        "--token",
        default=os.environ.get("HF_TOKEN"),
        help="HF access token. Defaults to HF_TOKEN env var.",
    )
    return parser.parse_args()
def main() -> None:
    """Authenticate (optionally), load the requested split, and preview rows.

    Exits via SystemExit with a descriptive message if the dataset cannot
    be loaded.
    """
    args = parse_args()
    token = args.token
    if token:
        try:
            # whoami() both validates the token and reports the account name.
            user = HfApi(token=token).whoami()["name"]
            print(f"Authenticated as: {user}")
        except Exception as exc:  # pragma: no cover - network/auth failure path
            # Best-effort: a bad token should not block access to public data.
            print(f"Token check failed ({exc}). Retrying without token for public access.")
            token = None
    else:
        print("No HF token provided. Public datasets only.")
    try:
        ds = load_dataset(
            path=args.dataset_id,
            split=args.split,
            token=token,
            streaming=args.streaming,
        )
    except Exception as exc:  # pragma: no cover - network/hub failure path
        raise SystemExit(
            f"Failed to load dataset '{args.dataset_id}' split '{args.split}': {exc}"
        ) from exc
    if args.streaming:
        print(f"Loaded streaming split '{args.split}' from '{args.dataset_id}'.")
        # Check the bound BEFORE printing so --limit 0 (or negative) prints
        # nothing, matching the non-streaming branch. The previous post-print
        # check always emitted the first row regardless of the limit.
        for idx, row in enumerate(ds):
            if idx >= args.limit:
                break
            print(f"[{idx}] {row}")
        return
    print(f"Loaded split '{args.split}' from '{args.dataset_id}'.")
    print(f"Rows: {len(ds)}")
    print(f"Columns: {ds.column_names}")
    print(f"Features: {ds.features}")
    for idx in range(min(args.limit, len(ds))):
        print(f"[{idx}] {ds[idx]}")
# Script entry point: run the preview only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()