Dataset-Creator / hf_dataset_loader.py
TitleOS's picture
Upload 9 files
390cebe verified
Raw
History Blame Contribute Delete
1.05 kB
"""Service layer for pulling up to N rows out of a HF dataset, for real
(not just a peek). Streams where the dataset supports it so we never
download more than was asked for.
"""
from __future__ import annotations

from itertools import islice
from typing import Optional

from datasets import load_dataset
def load_limited(
    repo_id: str,
    subset: str,
    split: str,
    limit: int,
    token: Optional[str] = None,
) -> list[dict]:
    """Return at most ``limit`` rows of a dataset split as plain dicts.

    Streaming is tried first so that only the requested rows are pulled.
    If the streaming attempt raises for any reason, the split is loaded
    in full and the first ``limit`` rows are taken from it.

    Args:
        repo_id: Dataset identifier, passed to ``load_dataset`` unchanged.
        subset: Configuration name; an empty string is passed on as
            ``None``.
        split: Split name, passed to ``load_dataset`` unchanged.
        limit: Maximum number of rows to return. Zero or a negative value
            yields an empty list without calling ``load_dataset`` at all.
        token: Optional token forwarded to ``load_dataset``.

    Returns:
        A list of at most ``limit`` rows, each converted with ``dict()``.

    Raises:
        Exception: Whatever the non-streaming ``load_dataset`` call (or
            reading rows from its result) raises when the fallback fails
            as well.
    """
    # Nothing was asked for: skip the network round-trip entirely, and in
    # particular never reach the full-download fallback for an empty result.
    if limit <= 0:
        return []

    config = subset or None

    try:
        stream = load_dataset(
            repo_id, config, split=split, streaming=True, token=token
        )
        # islice stops after `limit` rows without requesting the next one,
        # unlike an enumerate/break loop, which fetches row limit + 1 first.
        return [dict(row) for row in islice(stream, limit)]
    except Exception:
        # Some datasets (mostly older script-based ones) don't support
        # streaming. Fall back to a full load and slice - slower, but
        # correct, and rare enough not to special-case earlier.
        # NOTE(review): this also catches transient streaming errors (for
        # example network failures mid-iteration), which then trigger a
        # full download; narrow the except if that becomes a problem.
        full = load_dataset(repo_id, config, split=split, token=token)
        return [dict(row) for row in islice(full, limit)]