Spaces:
Running
Running
File size: 1,817 Bytes
b67578f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
"""Sampling tools for loading actual data from datasets."""
from typing import Optional
from utils.hf_client import get_client
from utils.formatting import format_sample
def sample_rows(
    dataset_id: str,
    n_rows: int = 5,
    config: Optional[str] = None,
    split: str = "train",
    random_seed: Optional[int] = None
) -> str:
    """
    Fetch a handful of real rows from a dataset and return them as text.

    Use this tool to look at concrete examples from a dataset: what each
    column contains, how values are formatted, and what typical rows look like.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb").
        n_rows: Number of rows requested. Values outside 1-20 are clamped
            into that range before the request is made (default: 5).
        config: Optional dataset configuration name. Passed through as-is;
            None is forwarded when no configuration is given.
        split: The dataset split to sample from ("train", "test",
            "validation"). Default: "train".
        random_seed: Optional seed intended for reproducible sampling.
            NOTE(review): this function does not forward the seed to
            `client.load_sample`, so it currently has no effect here.
            Which rows come back (presumably the first N) is decided by
            the client - confirm against `utils.hf_client`.

    Returns:
        The string produced by `format_sample` for the loaded rows and
        `dataset_id`.

    Notes:
        - Loading is delegated to the shared client's `load_sample`, and
          rendering to `format_sample`; neither is visible from here.
        - The earlier documentation states that large binary data (images,
          audio) is shown as placeholders and very long text is truncated;
          that behavior would live in those helpers - TODO confirm.
        - Use get_schema first to understand column types before sampling.

    Example usage:
        - sample_rows("imdb", 3) - Get 3 movie reviews
        - sample_rows("squad", 5, split="validation") - Get 5 QA pairs from validation
    """
    # Clamp the requested count into [1, 20]: apply the upper bound first,
    # then the lower bound.
    min_rows, max_rows = 1, 20
    capped = min(max_rows, n_rows)
    row_count = max(min_rows, capped)

    # random_seed is intentionally not part of this call; see the docstring.
    rows = get_client().load_sample(
        dataset_id=dataset_id,
        config=config,
        split=split,
        n_rows=row_count,
    )
    return format_sample(rows, dataset_id)
|