Spaces:

TitleOS
/

Dataset-Creator

Running

Dataset-Creator / hf_dataset_loader.py

Upload 9 files

390cebe verified 10 days ago

1.05 kB

	"""Service layer for pulling up to N rows out of a HF dataset, for real
	(not just a peek). Streams where the dataset supports it so we never
	download more than was asked for.
	"""
	from __future__ import annotations

	import logging
	from itertools import islice
	from typing import Optional

	from datasets import load_dataset


	def load_limited(
	    repo_id: str,
	    subset: str,
	    split: str,
	    limit: int,
	    token: Optional[str] = None,
	) -> list:
	    """Return at most ``limit`` rows of a dataset split as plain dicts.

	    The dataset is first opened with ``streaming=True`` and only the first
	    ``limit`` rows are consumed. If that attempt raises for any reason, the
	    split is loaded in full and the first ``limit`` rows are taken from it.

	    Args:
	        repo_id: Dataset identifier passed straight to ``load_dataset``.
	        subset: Configuration name; an empty string is forwarded as ``None``.
	        split: Split name passed to ``load_dataset``.
	        limit: Maximum number of rows to return. Values ``<= 0`` yield an
	            empty list without touching the dataset at all.
	        token: Optional access token forwarded to ``load_dataset``.

	    Returns:
	        A list of ``dict`` rows, possibly shorter than ``limit`` when the
	        split has fewer rows.

	    Raises:
	        Exception: Whatever the non-streaming ``load_dataset`` call raises
	            when the fallback also fails.
	    """
	    # Nothing was asked for, so skip the network round-trip entirely.
	    if limit <= 0:
	        return []

	    config = subset or None
	    try:
	        stream = load_dataset(
	            repo_id, config, split=split, streaming=True, token=token
	        )
	        # islice stops after exactly ``limit`` rows; an enumerate/break loop
	        # would pull one extra row from the stream before noticing the cap.
	        return [dict(row) for row in islice(stream, limit)]
	    except Exception:
	        # Some datasets (mostly older script-based ones) don't support
	        # streaming. Fall back to a full load and slice - slower, but
	        # correct, and rare enough not to special-case earlier.
	        # Log the cause so a silent full download is at least traceable.
	        logging.getLogger(__name__).warning(
	            "Streaming load of %r failed; falling back to a full load",
	            repo_id,
	            exc_info=True,
	        )
	        full = load_dataset(repo_id, config, split=split, token=token)
	        count = min(limit, len(full))
	        return [dict(full[i]) for i in range(count)]