"""Service layer for pulling up to N rows out of a HF dataset, for real (not just a peek). Streams where the dataset supports it so we never download more than was asked for. """ from __future__ import annotations from typing import Optional from datasets import load_dataset def load_limited( repo_id: str, subset: str, split: str, limit: int, token: Optional[str] = None, ) -> list: try: ds = load_dataset(repo_id, subset or None, split=split, streaming=True, token=token) rows = [] for i, row in enumerate(ds): if i >= limit: break rows.append(dict(row)) return rows except Exception: # Some datasets (mostly older script-based ones) don't support # streaming. Fall back to a full load and slice - slower, but # correct, and rare enough not to special-case earlier. ds = load_dataset(repo_id, subset or None, split=split, token=token) n = min(limit, len(ds)) return [dict(ds[i]) for i in range(n)]