Spaces:
Running
Running
| """Service layer for pulling up to N rows out of a HF dataset, for real | |
| (not just a peek). Streams where the dataset supports it so we never | |
| download more than was asked for. | |
| """ | |
| from __future__ import annotations | |
| from typing import Optional | |
| from datasets import load_dataset | |
def load_limited(
    repo_id: str,
    subset: str,
    split: str,
    limit: int,
    token: Optional[str] = None,
) -> list:
    """Return at most ``limit`` rows of a dataset split as plain dicts.

    The dataset is opened in streaming mode first so that only the requested
    rows are pulled. If anything goes wrong on that path, the dataset is
    loaded without streaming and the first ``limit`` rows are taken from it.

    Args:
        repo_id: Dataset identifier, passed straight to ``load_dataset``.
        subset: Configuration name. A falsy value (e.g. ``""``) is passed to
            ``load_dataset`` as ``None``.
        split: Split name, passed as ``split=`` to ``load_dataset``.
        limit: Maximum number of rows to return. Zero or a negative value
            yields an empty list without touching the dataset at all.
        token: Optional access token, forwarded to ``load_dataset``.

    Returns:
        A list of ``dict`` rows, in dataset order, with length
        ``min(limit, number of rows available)``.

    Raises:
        Whatever the non-streaming ``load_dataset`` call (or iterating its
        result) raises; errors on the streaming path only trigger the
        fallback and are not propagated.
    """
    # Imported locally so this function stays self-contained with respect to
    # the module's import block.
    from itertools import islice

    # Nothing was asked for: skip the (potentially remote) dataset open.
    # This also keeps islice() from rejecting a negative stop value.
    if limit <= 0:
        return []

    config = subset or None

    try:
        stream = load_dataset(
            repo_id, config, split=split, streaming=True, token=token
        )
        # islice stops after exactly `limit` rows; an enumerate/break loop
        # would fetch one extra row from the stream before noticing.
        return [dict(row) for row in islice(stream, limit)]
    except Exception:
        # Some datasets (mostly older script-based ones) don't support
        # streaming. Fall back to a full load and slice - slower, but
        # correct, and rare enough not to special-case earlier.
        # NOTE: this also catches failures that happen mid-stream; any rows
        # already read are discarded and the full load starts from scratch.
        full = load_dataset(repo_id, config, split=split, token=token)
        return [dict(row) for row in islice(full, limit)]